001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: /**
036: * Wildcard pattern. Wildcards are similar to sh-style file globbing.
037: * A wildcard pattern is implicitly anchored, meaning that it must match the entire string.
038: * The wildcard operators are:
039: * <PRE>
040: * ? matches one arbitrary character
041: * * matches zero or more arbitrary characters
042: * [xyz] matches characters x or y or z
043: * {foo,bar,baz} matches expressions foo or bar or baz
044: * () grouping to extract fields
045: * \ escape one of these special characters
046: * </PRE>
047: * Escape codes (like \n and \t) and Perl5 character classes (like \w and \s) may also be used.
048: */
049: public class Wildcard extends Regexp {
050: String stringRep;
051:
052: public Wildcard(String pattern) {
053: super ("^" + toRegexp(pattern) + "$");
054: stringRep = pattern;
055: }
056:
057: public boolean equals(Object object) {
058: if (!(object instanceof Wildcard))
059: return false;
060: Wildcard p = (Wildcard) object;
061: return p.stringRep.equals(stringRep);
062: }
063:
064: public static String toRegexp(String wildcard) {
065: String s = wildcard;
066:
067: int inAlternative = 0;
068: int inSet = 0;
069: boolean inEscape = false;
070:
071: StringBuffer output = new StringBuffer();
072:
073: int len = s.length();
074: for (int i = 0; i < len; ++i) {
075: char c = s.charAt(i);
076: if (inEscape) {
077: output.append(c);
078: inEscape = false;
079: } else {
080: switch (c) {
081: case '\\':
082: output.append(c);
083: inEscape = true;
084: break;
085: case '?':
086: output.append('.');
087: break;
088: case '*':
089: output.append(".*");
090: break;
091: case '[':
092: output.append(c);
093: ++inSet;
094: break;
095: case ']':
096: // FIX: handle [] case properly
097: output.append(c);
098: --inSet;
099: break;
100: case '{':
101: output.append("(?:");
102: ++inAlternative;
103: break;
104: case ',':
105: if (inAlternative > 0)
106: output.append("|");
107: else
108: output.append(c);
109: break;
110: case '}':
111: output.append(")");
112: --inAlternative;
113: break;
114: case '^':
115: if (inSet > 0) {
116: output.append(c);
117: } else {
118: output.append('\\');
119: output.append(c);
120: }
121: break;
122: case '$':
123: case '.':
124: case '|':
125: case '+':
126: output.append('\\');
127: output.append(c);
128: break;
129: default:
130: output.append(c);
131: break;
132: }
133: }
134: }
135: if (inEscape)
136: output.append('\\');
137:
138: return output.toString();
139: }
140:
141: public static String escape(String s) {
142: return rcm.util.Str.escape(s, '\\', "\\?*{}()[]");
143: }
144:
145: public String toString() {
146: return stringRep;
147: }
148:
149: public static void main(String[] args) throws Exception {
150: if (args.length < 2) {
151: System.err.println("usage: Wildcard <pattern> <string>*");
152: return;
153: }
154:
155: Pattern p = new Wildcard(args[0].replace('_', ' '));
156: for (int i = 1; i < args.length; ++i) {
157: Region r = p.oneMatch(args[i]);
158: System.out.println(args[i] + ": " + (r != null));
159: if (r != null) {
160: System.out.println(" [" + r.getStart() + ","
161: + r.getEnd() + "]" + r);
162: Region[] groups = r.getFields("websphinx.groups");
163: if (groups != null)
164: for (int j = 0; j < groups.length; ++j) {
165: Region s = groups[j];
166: System.out.println(" " + "[" + s.getStart()
167: + "," + s.getEnd() + "]" + s);
168: }
169: }
170: }
171: }
172: }
|