001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.util.Vector;
036: import java.util.Enumeration;
037: import java.io.IOException; //#ifdef JDK1.1
038: import java.io.ObjectInputStream;
039:
040: //#endif JDK1.1
041:
042: public class Regexp extends Pattern {
043:
044: String stringRep;
045: transient org.apache.regexp.REProgram pattern;
046: transient String[] fields;
047:
048: public Regexp(String pattern) {
049: stringRep = pattern;
050: init();
051: }
052:
053: public boolean equals(Object object) {
054: if (!(object instanceof Regexp))
055: return false;
056: Regexp p = (Regexp) object;
057: return p.stringRep.equals(stringRep);
058: }
059:
060: //#ifdef JDK1.1
061: private void readObject(ObjectInputStream in) throws IOException,
062: ClassNotFoundException {
063: in.defaultReadObject();
064: init();
065: }
066:
067: //#endif JDK1.1
068:
069: private void init() {
070: try {
071: this .pattern = new org.apache.regexp.RECompiler()
072: .compile(translateFields(stringRep));
073: } catch (org.apache.regexp.RESyntaxException e) {
074: throw new RuntimeException("syntax error in pattern: "
075: + e.getMessage());
076: }
077: }
078:
079: public String[] getFieldNames() {
080: return fields;
081: }
082:
083: public String toString() {
084: return stringRep;
085: }
086:
087: public PatternMatcher match(Region region) {
088: return new RegexpMatcher(this , region);
089: }
090:
091: public static String escape(String s) {
092: return rcm.util.Str.escape(s, '\\', "\\^.$|()[]*+?{}");
093: }
094:
095: String translateFields(String s) {
096: Vector vfields = new Vector();
097: boolean inEscape = false;
098:
099: StringBuffer output = new StringBuffer();
100:
101: int len = s.length();
102: for (int i = 0; i < len; ++i) {
103: char c = s.charAt(i);
104: if (inEscape) {
105: output.append(c);
106: inEscape = false;
107: } else {
108: switch (c) {
109: case '\\':
110: output.append(c);
111: inEscape = true;
112: break;
113:
114: case '(':
115: output.append(c);
116: if (s.startsWith("?{", i + 1)) {
117: int start = i + 3;
118: int end = s.indexOf('}', start);
119: vfields.addElement(s.substring(start, end));
120: i = end;
121: } else if (!s.startsWith("?", i + 1))
122: vfields.addElement(String.valueOf(vfields
123: .size()));
124: break;
125:
126: default:
127: output.append(c);
128: break;
129: }
130: }
131: }
132:
133: fields = new String[vfields.size()];
134: vfields.copyInto(fields);
135: return output.toString();
136: }
137:
138: public static void main (String[] args) throws Exception {
139: if (args.length < 2) {
140: System.err.println ("usage: Regexp <pattern> <source URL>*");
141: return;
142: }
143:
144: Pattern p = new Regexp (args[0].replace ('_', ' ') );
145: for (int i=1; i<args.length; ++i) {
146: Page page = new Page (new Link (args[i]));
147: System.out.println ("--------------------" + args[i]);
148: PatternMatcher m = p.match (page);
149: for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
150: System.out.println ("[" + r.getStart() + "," + r.getEnd() + "]" + r);
151: Enumeration enum = r.enumerateObjectLabels ();
152: while (enum.hasMoreElements ()) {
153: String lbl = (String)enum.nextElement ();
154: Object object = r.getObjectLabel (lbl);
155: if (object instanceof Region) {
156: Region s = (Region)object;
157: System.out.println (" "+lbl+"=[" + s.getStart() + "," + s.getEnd() + "]" + s);
158: }
159: }
160: }
161: }
162: }
163: }
164:
165: class RegexpMatcher extends PatternMatcher {
166: Regexp regexp;
167: Region source;
168: org.apache.regexp.RE re;
169: String content;
170: int pos;
171:
172: public RegexpMatcher(Regexp regexp, Region source) {
173: this .regexp = regexp;
174: this .source = source;
175: this .re = new org.apache.regexp.RE(regexp.pattern, 0);
176: this .content = source.toString();
177: this .pos = 0;
178: }
179:
180: protected Region findNext() {
181: if (pos < content.length() && re.match(content, pos)) {
182: pos = Math.max(pos + 1, re.getParenEnd(0));
183:
184: Page page = source.getSource();
185: int base = source.getStart();
186: Region match = new Region(page, base + re.getParenStart(0),
187: base + re.getParenEnd(0));
188:
189: int n = re.getParenCount() - 1;
190: Region[] groups = new Region[n];
191: for (int i = 0; i < n; ++i) {
192: Region r = new Region(page, base
193: + re.getParenStart(i + 1), base
194: + re.getParenEnd(i + 1));
195: groups[i] = r;
196: match.setField(regexp.fields[i], r);
197: }
198: match.setFields(Pattern.groups, groups);
199: return match;
200: } else {
201: pos = content.length();
202: return null;
203: }
204: }
205: }
|