001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import rcm.util.Str;
036: import java.util.Enumeration;
037:
038: /**
039: * Tag pattern. Tag patterns are regular expressions over
040: * the alphabet of HTML tags.
041: */
042: public class Tagexp extends Regexp {
043: String stringRep;
044:
045: public Tagexp(String pattern) {
046: super (toRegexp(pattern));
047: stringRep = pattern;
048: }
049:
050: public boolean equals(Object object) {
051: if (!(object instanceof Tagexp))
052: return false;
053: Tagexp p = (Tagexp) object;
054: return p.stringRep.equals(stringRep);
055: }
056:
057: public String toString() {
058: return stringRep;
059: }
060:
061: public PatternMatcher match(Region region) {
062: return new TagexpMatcher(this , region);
063: }
064:
065: static HTMLParser parser = new HTMLParser();
066:
067: public static String toRegexp(String tagexp) {
068: StringBuffer output = new StringBuffer();
069:
070: // parse the tagexp as HTML
071: Page page;
072: //System.err.println ("Parsing: " + tagexp);
073: synchronized (parser) {
074: page = new Page(null, tagexp, parser);
075: }
076:
077: // canonicalize the tags
078: Region[] tokens = page.getTokens();
079: for (int i = 0; i < tokens.length; ++i) {
080: //System.err.println ("tok=" + tokens[i].toHTML());
081: if (tokens[i] instanceof Tag)
082: canonicalizeTagPattern(output, (Tag) tokens[i]);
083: else
084: translateText(output, tokens[i].toString());
085: }
086:
087: //System.err.println ("regexp=" + output);
088: return output.toString();
089: }
090:
091: static void canonicalizeTag(StringBuffer output, Tag tag, int j) {
092: String tagName = tag.getTagName();
093: if (tagName == Tag.COMMENT)
094: return; // don't put comments or decls in the canonicalization
095:
096: output.append('<');
097: if (tag.isEndTag())
098: output.append('/');
099: output.append(tagName);
100: output.append('#');
101: output.append(String.valueOf(j));
102: output.append('#');
103:
104: if (tag.countHTMLAttributes() > 0) {
105: String[] attrs = tag.getHTMLAttributes();
106: sortAttrs(attrs);
107:
108: for (int i = 0; i < attrs.length;) {
109: String name = attrs[i++];
110: String value = attrs[i++];
111:
112: output.append(' ');
113: output.append(name);
114:
115: if (value != Region.TRUE) {
116: output.append('=');
117: value = encodeAttrValue(value);
118: output.append(value);
119: }
120:
121: output.append(' ');
122: }
123: }
124:
125: output.append('>');
126: }
127:
128: static void canonicalizeTagPattern(StringBuffer output, Tag tag) {
129: String tagName = tag.getTagName();
130: if (tagName == Tag.COMMENT)
131: return; // don't put comments or decls in the canonicalization
132:
133: output.append('<');
134: if (tag.isEndTag())
135: output.append('/');
136: translatePattern(output, tagName, "#");
137: output.append('#');
138: output.append("\\d+");
139: output.append('#');
140:
141: output.append("[^>]*");
142:
143: if (tag.countHTMLAttributes() > 0) {
144: String[] attrs = tag.getHTMLAttributes();
145: sortAttrs(attrs);
146:
147: for (int i = 0; i < attrs.length;) {
148: String name = attrs[i++];
149: String value = attrs[i++];
150:
151: output.append(' ');
152: translatePattern(output, name, "= >");
153:
154: if (value != Region.TRUE) {
155: output.append('=');
156: value = encodeAttrValue(value);
157: translatePattern(output, value, " >");
158: }
159:
160: output.append(' ');
161: output.append("[^>]*");
162: }
163: }
164:
165: output.append('>');
166: }
167:
168: static void sortAttrs(String[] attrs) {
169: // simple insertion sort suffices (since attrs.length is
170: // almost always less than 5
171: for (int i = 2; i < attrs.length; i += 2) {
172: String name = attrs[i];
173: String value = attrs[i + 1];
174:
175: int j;
176: for (j = i; j > 0 && attrs[j - 2].compareTo(name) > 0; j -= 2) {
177: attrs[j] = attrs[j - 2];
178: attrs[j + 1] = attrs[j - 1];
179: }
180:
181: attrs[j] = name;
182: attrs[j + 1] = value;
183: }
184: }
185:
186: static String encodeAttrValue(String value) {
187: if (value.indexOf('%') != -1)
188: value = Str.replace(value, "%", "%25");
189: if (value.indexOf(' ') != -1)
190: value = Str.replace(value, " ", "%20");
191: if (value.indexOf('<') != -1)
192: value = Str.replace(value, "<", "%3C");
193: if (value.indexOf('>') != -1)
194: value = Str.replace(value, ">", "%3E");
195: return value;
196: }
197:
198: static String translatePattern(StringBuffer output, String s,
199: String delimiters) {
200: s = Wildcard.toRegexp(s);
201:
202: boolean inEscape = false;
203:
204: int len = s.length();
205: for (int i = 0; i < len; ++i) {
206: char c = s.charAt(i);
207: if (inEscape) {
208: output.append(c);
209: inEscape = false;
210: } else if (c == '\\') {
211: output.append(c);
212: inEscape = true;
213: } else if (c == '.') {
214: output.append("[^");
215: output.append(delimiters);
216: output.append(']');
217: } else {
218: output.append(c);
219: }
220: }
221:
222: return output.toString();
223: }
224:
225: static void translateText(StringBuffer output, String s) {
226: // NIY: (@<tag>) and (<tag>@)
227: s = Str.replace(s, ".", "(?:<[^>]*>)");
228: output.append(s);
229: }
230:
231: public static void main (String[] args) throws Exception {
232: if (args.length < 2) {
233: System.err.println ("usage: Tagexp <pattern> <source URL>*");
234: return;
235: }
236:
237: Pattern p = new Tagexp (args[0].replace ('_', ' ') );
238: for (int i=1; i<args.length; ++i) {
239: Page page = new Page (new Link (args[i]));
240: //System.out.println (page.substringCanonicalTags (0, page.getEnd()));
241:
242: System.out.println ("-----------" + args[i]);
243: PatternMatcher m = p.match (page);
244: for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
245: System.out.println ("[" + r.getStart() + "," + r.getEnd() + "]" + r);
246: Enumeration enum = r.enumerateObjectLabels ();
247: while (enum.hasMoreElements ()) {
248: String lbl = (String)enum.nextElement ();
249: Object object = r.getObjectLabel (lbl);
250: if (object instanceof Region) {
251: Region s = (Region)object;
252: System.out.println (" "+lbl+"=[" + s.getStart() + "," + s.getEnd() + "]" + s);
253: }
254: }
255: }
256: }
257: }
258: }
259:
260: class TagexpMatcher extends PatternMatcher {
261: Tagexp tagexp;
262: Region source;
263: org.apache.regexp.RE re;
264: String canon;
265: int pos;
266:
267: public TagexpMatcher(Tagexp tagexp, Region source) {
268: this .tagexp = tagexp;
269: this .source = source;
270: this .re = new org.apache.regexp.RE(tagexp.pattern, 0);
271: this .canon = source.getSource().substringCanonicalTags(
272: source.getStart(), source.getLength());
273: this .pos = 0;
274: }
275:
276: protected Region findNext() {
277: if (pos < canon.length() && re.match(canon, pos)) {
278: pos = Math.max(pos + 1, re.getParenEnd(0));
279:
280: Page page = source.getSource();
281:
282: Region match = mapCanonical2Region(page, canon, re
283: .getParenStart(0), re.getParenEnd(0));
284:
285: int n = re.getParenCount() - 1;
286: Region[] groups = new Region[n];
287: for (int i = 0; i < n; ++i) {
288: Region r = mapCanonical2Region(page, canon, re
289: .getParenStart(i + 1), re.getParenEnd(i + 1));
290: groups[i] = r;
291: match.setField(
292: tagexp.fields[i] != null ? tagexp.fields[i]
293: : String.valueOf(i), r);
294: }
295: match.setFields(Pattern.groups, groups);
296: return match;
297: } else
298: return null;
299: }
300:
301: final static Region mapCanonical2Region(Page page, String canon,
302: int start, int end) {
303: // NIY: (@ and @)
304: Region[] tokens = page.getTokens();
305: int ft, lt;
306:
307: if (start == end) {
308: ft = prevTag(canon, start);
309: lt = nextTag(canon, end);
310:
311: if (ft != -1)
312: if (lt != -1)
313: return new Region(page, tokens[ft].getEnd(),
314: tokens[lt].getStart());
315: else
316: return new Region(page, tokens[ft].getEnd(), page
317: .getEnd());
318: else if (lt != -1)
319: return new Region(page, page.getStart(), tokens[lt]
320: .getStart());
321: else
322: return page;
323: } else {
324: ft = nextTag(canon, start);
325: lt = prevTag(canon, end);
326:
327: Tag f = (Tag) tokens[ft];
328: Tag l = (Tag) tokens[lt];
329: Element e = f.getElement();
330: if (e != null && e.getStart() == f.getStart()
331: && e.getEnd() == l.getEnd())
332: return e;
333: else if (ft == lt)
334: return tokens[ft];
335: else
336: return tokens[ft].span(tokens[lt]);
337: }
338: }
339:
340: final static int nextTag(String canon, int p) {
341: return indexOfTag(canon, canon.indexOf('<', p));
342: }
343:
344: final static int prevTag(String canon, int p) {
345: if (p == 0)
346: return -1;
347: return indexOfTag(canon, canon.lastIndexOf('<', p - 1));
348: }
349:
350: final static int indexOfTag(String canon, int p) {
351: if (p == -1)
352: return -1;
353: int s = canon.indexOf('#', p);
354: if (s == -1)
355: return -1;
356: int e = canon.indexOf('#', s + 1);
357: if (e == -1)
358: return -1;
359: return Integer.parseInt(canon.substring(s + 1, e));
360: }
361: }
|