001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * RegexpHTMLLinkExtractor.java
020: * Created on Jun 5, 2003
021: *
022: * $Header$
023: */
024: package org.archive.extractor;
025:
026: import java.util.ArrayList;
027: import java.util.Iterator;
028: import java.util.LinkedList;
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031: import java.util.regex.Matcher;
032:
033: import org.apache.commons.httpclient.URIException;
034: import org.archive.crawler.extractor.Link;
035: import org.archive.net.UURI;
036: import org.archive.net.UURIFactory;
037: import org.archive.util.DevUtils;
038: import org.archive.util.TextUtils;
039:
040: /**
041: * Basic link-extraction, from an HTML content-body,
042: * using regular expressions.
043: *
044: * ROUGH DRAFT IN PROGRESS / incomplete... untested...
045: *
046: * @author gojomo
047: */
048: public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
049: private static Logger logger = Logger
050: .getLogger(RegexpHTMLLinkExtractor.class.getName());
051:
052: boolean honorRobots = true;
053: boolean extractInlineCss = true;
054: boolean extractInlineJs = true;
055:
056: protected LinkedList<Link> next = new LinkedList<Link>();
057: protected Matcher tags;
058:
059: /* (non-Javadoc)
060: * @see org.archive.extractor.CharSequenceLinkExtractor#findNextLink()
061: */
062: protected boolean findNextLink() {
063: if (tags == null) {
064: tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR,
065: sourceContent);
066: }
067: while (tags.find()) {
068: if (Thread.interrupted()) {
069: // TODO: throw an exception, perhaps, rather than just clear & break?
070: break;
071: }
072: if (tags.start(8) > 0) {
073: // comment match
074: // for now do nothing
075: } else if (tags.start(7) > 0) {
076: // <meta> match
077: int start = tags.start(5);
078: int end = tags.end(5);
079: processMeta(sourceContent.subSequence(start, end));
080: } else if (tags.start(5) > 0) {
081: // generic <whatever> match
082: int start5 = tags.start(5);
083: int end5 = tags.end(5);
084: int start6 = tags.start(6);
085: int end6 = tags.end(6);
086: processGeneralTag(sourceContent.subSequence(start6,
087: end6), sourceContent.subSequence(start5, end5));
088: } else if (tags.start(1) > 0) {
089: // <script> match
090: int start = tags.start(1);
091: int end = tags.end(1);
092: processScript(sourceContent.subSequence(start, end),
093: tags.end(2) - start);
094: } else if (tags.start(3) > 0) {
095: // <style... match
096: int start = tags.start(3);
097: int end = tags.end(3);
098: processStyle(sourceContent.subSequence(start, end),
099: tags.end(4) - start);
100: }
101: if (!next.isEmpty()) {
102: // at least one link found
103: return true;
104: }
105: }
106: // no relevant tags found
107: return false;
108: }
109:
110: /**
111: * Compiled relevant tag extractor.
112: *
113: * <p>
114: * This pattern extracts either:
115: * <li> (1) whole <script>...</script> or
116: * <li> (2) <style>...</style> or
117: * <li> (3) <meta ...> or
118: * <li> (4) any other open-tag with at least one attribute
119: * (eg matches "<a href='boo'>" but not "</a>" or "<br>")
120: * <p>
121: * groups:
122: * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
123: * <li> 2: just script open tag
124: * <li> 3: STYLE TYPE=moo>zoo</STYLE
125: * <li> 4: just style open tag
126: * <li> 5: entire other tag, without '<' '>'
127: * <li> 6: element
128: * <li> 7: META
129: * <li> 8: !-- comment --
130: */
131: static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";
132:
133: // this pattern extracts attributes from any open-tag innards
134: // matched by the above. attributes known to be URIs of various
135: // sorts are matched specially
136: static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)"
137: + "|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
138: + "|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"
139: + "|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
140: + "|(value)|([-\\w]+))"
141: + "\\s*=\\s*"
142: + "(?:(?:\"(.*?)(?:\"|$))"
143: + "|(?:'(.*?)(?:'|$))"
144: + "|(\\S+))";
145: // groups:
146: // 1: attribute name
147: // 2: HREF - single URI relative to doc base, or occasionally javascript:
148: // 3: ACTION - single URI relative to doc base, or occasionally javascript:
149: // 4: ON[WHATEVER] - script handler
150: // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR
151: // single URI relative to doc base
152: // 6: CODEBASE - a single URI relative to doc base, affecting other
153: // attributes
154: // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
155: // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
156: // (if supplied)
157: // 9: CODE - a single URI relative to the CODEBASE (is specified).
158: // 10: VALUE - often includes a uri path on forms
159: // 11: any other attribute
160: // 12: double-quote delimited attr value
161: // 13: single-quote delimited attr value
162: // 14: space-delimited attr value
163:
164: // much like the javascript likely-URI extractor, but
165: // without requiring quotes -- this can indicate whether
166: // an HTML tag attribute that isn't definitionally a
167: // URI might be one anyway, as in form-tag VALUE attributes
168: static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
169: static final String ESCAPED_AMP = "&";
170: static final String AMP = "&";
171: static final String WHITESPACE = "\\s";
172: static final String CLASSEXT = ".class";
173: static final String APPLET = "applet";
174: static final String BASE = "base";
175: static final String LINK = "link";
176:
177: protected boolean processGeneralTag(CharSequence element,
178: CharSequence cs) {
179:
180: Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
181: cs);
182:
183: // Just in case it's an OBJECT or APPLET tag
184: String codebase = null;
185: ArrayList<String> resources = null;
186: long tally = next.size();
187:
188: while (attr.find()) {
189: int valueGroup = (attr.start(12) > -1) ? 12 : (attr
190: .start(13) > -1) ? 13 : 14;
191: int start = attr.start(valueGroup);
192: int end = attr.end(valueGroup);
193: CharSequence value = cs.subSequence(start, end);
194: if (attr.start(2) > -1) {
195: // HREF
196: CharSequence context = Link.elementContext(element,
197: attr.group(2));
198: if (element.toString().equalsIgnoreCase(LINK)) {
199: // <LINK> elements treated as embeds (css, ico, etc)
200: processEmbed(value, context);
201: } else {
202: if (element.toString().equalsIgnoreCase(BASE)) {
203: try {
204: base = UURIFactory.getInstance(value
205: .toString());
206: } catch (URIException e) {
207: extractErrorListener.noteExtractError(e,
208: source, value);
209: }
210: }
211: // other HREFs treated as links
212: processLink(value, context);
213: }
214: } else if (attr.start(3) > -1) {
215: // ACTION
216: CharSequence context = Link.elementContext(element,
217: attr.group(3));
218: processLink(value, context);
219: } else if (attr.start(4) > -1) {
220: // ON____
221: processScriptCode(value); // TODO: context?
222: } else if (attr.start(5) > -1) {
223: // SRC etc.
224: CharSequence context = Link.elementContext(element,
225: attr.group(5));
226: processEmbed(value, context);
227: } else if (attr.start(6) > -1) {
228: // CODEBASE
229: // TODO: more HTML deescaping?
230: codebase = TextUtils
231: .replaceAll(ESCAPED_AMP, value, AMP);
232: CharSequence context = Link.elementContext(element,
233: attr.group(6));
234: processEmbed(codebase, context);
235: } else if (attr.start(7) > -1) {
236: // CLASSID, DATA
237: if (resources == null) {
238: resources = new ArrayList<String>();
239: }
240: resources.add(value.toString());
241: } else if (attr.start(8) > -1) {
242: // ARCHIVE
243: if (resources == null) {
244: resources = new ArrayList<String>();
245: }
246: String[] multi = TextUtils.split(WHITESPACE, value);
247: for (int i = 0; i < multi.length; i++) {
248: resources.add(multi[i]);
249: }
250: } else if (attr.start(9) > -1) {
251: // CODE
252: if (resources == null) {
253: resources = new ArrayList<String>();
254: }
255: // If element is applet and code value does not end with
256: // '.class' then append '.class' to the code value.
257: if (element.toString().toLowerCase().equals(APPLET)
258: && !value.toString().toLowerCase().endsWith(
259: CLASSEXT)) {
260: resources.add(value.toString() + CLASSEXT);
261: } else {
262: resources.add(value.toString());
263: }
264:
265: } else if (attr.start(10) > -1) {
266: // VALUE
267: if (TextUtils.matches(LIKELY_URI_PATH, value)) {
268: CharSequence context = Link.elementContext(element,
269: attr.group(10));
270: processLink(value, context);
271: }
272:
273: } else if (attr.start(11) > -1) {
274: // any other attribute
275: // ignore for now
276: // could probe for path- or script-looking strings, but
277: // those should be vanishingly rare in other attributes,
278: // and/or symptomatic of page bugs
279: }
280: }
281: TextUtils.recycleMatcher(attr);
282:
283: // handle codebase/resources
284: if (resources == null) {
285: return (tally - next.size()) > 0;
286: }
287: Iterator iter = resources.iterator();
288: UURI codebaseURI = null;
289: String res = null;
290: try {
291: if (codebase != null) {
292: // TODO: Pass in the charset.
293: codebaseURI = UURIFactory.getInstance(base, codebase);
294: }
295: while (iter.hasNext()) {
296: res = iter.next().toString();
297: // TODO: more HTML deescaping?
298: res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
299: if (codebaseURI != null) {
300: res = codebaseURI.resolve(res).toString();
301: }
302: processEmbed(res, element); // TODO: include attribute too
303: }
304: } catch (URIException e) {
305: extractErrorListener.noteExtractError(e, source, codebase);
306: } catch (IllegalArgumentException e) {
307: DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
308: + "codebase=" + codebase + " res=" + res + "\n"
309: + DevUtils.extraInfo(), e);
310: }
311: return (tally - next.size()) > 0;
312: }
313:
314: /**
315: * @param cs
316: */
317: protected void processScriptCode(CharSequence cs) {
318: RegexpJSLinkExtractor.extract(cs, source, base, next,
319: extractErrorListener);
320: }
321:
322: static final String JAVASCRIPT = "(?i)^javascript:.*";
323:
324: /**
325: * @param value
326: * @param context
327: */
328: protected void processLink(CharSequence value, CharSequence context) {
329: String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
330:
331: if (TextUtils.matches(JAVASCRIPT, link)) {
332: processScriptCode(value.subSequence(11, value.length()));
333: } else {
334: addLinkFromString(link, context, Link.NAVLINK_HOP);
335: }
336: }
337:
338: /**
339: * @param uri
340: * @param context
341: */
342: private void addLinkFromString(String uri, CharSequence context,
343: char hopType) {
344: try {
345: Link link = new Link(source, UURIFactory.getInstance(base,
346: uri), context, hopType);
347: next.addLast(link);
348: } catch (URIException e) {
349: extractErrorListener.noteExtractError(e, source, uri);
350: }
351: }
352:
353: protected long processEmbed(CharSequence value, CharSequence context) {
354: String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
355: addLinkFromString(embed, context, Link.EMBED_HOP);
356: return 1;
357: }
358:
359: static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"
360: + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
361:
362: protected void processScript(CharSequence sequence, int endOfOpenTag) {
363: // first, get attributes of script-open tag
364: // as per any other tag
365: processGeneralTag(sequence.subSequence(0, 6), sequence
366: .subSequence(0, endOfOpenTag));
367:
368: // then, apply best-effort string-analysis heuristics
369: // against any code present (false positives are OK)
370: processScriptCode(sequence.subSequence(endOfOpenTag, sequence
371: .length()));
372: }
373:
374: protected void processMeta(CharSequence cs) {
375: Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
376: cs);
377:
378: String name = null;
379: String httpEquiv = null;
380: String content = null;
381:
382: while (attr.find()) {
383: int valueGroup = (attr.start(12) > -1) ? 12 : (attr
384: .start(13) > -1) ? 13 : 14;
385: CharSequence value = cs.subSequence(attr.start(valueGroup),
386: attr.end(valueGroup));
387: if (attr.group(1).equalsIgnoreCase("name")) {
388: name = value.toString();
389: } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
390: httpEquiv = value.toString();
391: } else if (attr.group(1).equalsIgnoreCase("content")) {
392: content = value.toString();
393: }
394: // TODO: handle other stuff
395: }
396: TextUtils.recycleMatcher(attr);
397:
398: // Look for the 'robots' meta-tag
399: if ("robots".equalsIgnoreCase(name) && content != null) {
400: if (getHonorRobots()) {
401: String contentLower = content.toLowerCase();
402: if ((contentLower.indexOf("nofollow") >= 0 || contentLower
403: .indexOf("none") >= 0)) {
404: // if 'nofollow' or 'none' is specified and we
405: // are honoring robots, end html extraction
406: logger
407: .fine("HTML extraction skipped due to robots meta-tag for: "
408: + source);
409: cancelFurtherExtraction();
410: return;
411: }
412: }
413: } else if ("refresh".equalsIgnoreCase(httpEquiv)
414: && content != null) {
415: String refreshUri = content
416: .substring(content.indexOf("=") + 1);
417: try {
418: Link refreshLink = new Link(source, UURIFactory
419: .getInstance(base, refreshUri), Link
420: .elementContext("meta", httpEquiv),
421: Link.REFER_HOP);
422: next.addLast(refreshLink);
423: } catch (URIException e) {
424: extractErrorListener.noteExtractError(e, source,
425: refreshUri);
426: }
427: }
428: }
429:
430: /**
431: * @return whether to honor internal robots directives (eg meta robots)
432: */
433: private boolean getHonorRobots() {
434: return honorRobots;
435: }
436:
437: /**
438: * Ensure no further Links are extracted (by setting matcher up to fail)
439: */
440: private void cancelFurtherExtraction() {
441: // java 1.5 only:
442: // tags.region(tags.regionEnd(),tags.regionEnd());
443: tags.reset("");
444: }
445:
446: /**
447: * @param sequence
448: * @param endOfOpenTag
449: */
450: protected void processStyle(CharSequence sequence, int endOfOpenTag) {
451: // First, get attributes of script-open tag as per any other tag.
452: processGeneralTag(sequence.subSequence(0, 6), sequence
453: .subSequence(0, endOfOpenTag));
454:
455: // then, parse for URIs
456: RegexpCSSLinkExtractor.extract(sequence.subSequence(
457: endOfOpenTag, sequence.length()), source, base, next,
458: extractErrorListener);
459: }
460:
461: /**
462: * Discard all state. Another setup() is required to use again.
463: */
464: public void reset() {
465: super .reset();
466: TextUtils.recycleMatcher(tags);
467: tags = null;
468: }
469:
470: protected static CharSequenceLinkExtractor newDefaultInstance() {
471: return new RegexpHTMLLinkExtractor();
472: }
473: }
|