001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * SimpleHTMLExtractor.java
020: * Created on Jun 5, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.extractor;
025:
026: import java.io.IOException;
027: import java.util.ArrayList;
028: import java.util.Iterator;
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031: import java.util.regex.Matcher;
032:
033: import org.apache.commons.httpclient.URIException;
034: import org.archive.crawler.datamodel.CoreAttributeConstants;
035: import org.archive.crawler.datamodel.CrawlURI;
036: import org.archive.crawler.datamodel.RobotsHonoringPolicy;
037: import org.archive.crawler.settings.SimpleType;
038: import org.archive.crawler.settings.Type;
039: import org.archive.io.ReplayCharSequence;
040: import org.archive.net.UURI;
041: import org.archive.net.UURIFactory;
042: import org.archive.util.DevUtils;
043: import org.archive.util.HttpRecorder;
044: import org.archive.util.TextUtils;
045:
046: /**
047: * Basic link-extraction, from an HTML content-body,
048: * using regular expressions.
049: *
050: * @author gojomo
051: *
052: */
053: public class ExtractorHTML extends Extractor implements
054: CoreAttributeConstants {
055:
/** Version identifier used if instances of this class are serialized. */
private static final long serialVersionUID = 5855731422080471017L;

/** java.util.logging logger shared by all instances, named after this class. */
private static Logger logger = Logger.getLogger(ExtractorHTML.class
        .getName());
060:
/**
 * Upper bound on the length of an element name accepted by
 * RELEVANT_TAG_EXTRACTOR. Read once, when the class is initialized,
 * from the system property named after this class plus
 * {@code ".maxElementNameLength"}; defaults to 1024.
 */
private static final int MAX_ELEMENT_LENGTH = Integer
        .parseInt(System.getProperty(ExtractorHTML.class.getName()
                + ".maxElementNameLength", "1024"));

/**
 * Regular expression selecting the tags relevant to link extraction
 * (the variant with less unnecessary backtracking).
 *
 * <p>
 * A match is one of:
 * <ul>
 * <li>(1) a whole {@code <script>...</script>} element, or</li>
 * <li>(2) a whole {@code <style>...</style>} element, or</li>
 * <li>(3) a {@code <meta ...>} tag, or</li>
 * <li>(4) any other open-tag with at least one attribute
 * (eg matches {@code <a href='boo'>} but not {@code </a>} or
 * {@code <br>}), or</li>
 * <li>(5) an HTML comment</li>
 * </ul>
 * <p>
 * Capturing groups:
 * <ul>
 * <li>1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT</li>
 * <li>2: just the script open tag</li>
 * <li>3: STYLE TYPE=moo&gt;zoo&lt;/STYLE</li>
 * <li>4: just the style open tag</li>
 * <li>5: entire other tag, without '&lt;' '&gt;'</li>
 * <li>6: element name</li>
 * <li>7: META</li>
 * <li>8: !-- comment --</li>
 * </ul>
 */
static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)"
        + // 1, 2
        "|((style[^>]*+)>.*?</style)"
        + // 3, 4
        "|(((meta)|(?:\\w{1,"
        + MAX_ELEMENT_LENGTH
        + "}))\\s+[^>]*+)" + // 5, 6, 7
        "|(!--.*?--))>"; // 8
095:
096: // version w/ problems with unclosed script tags
097: // static final String RELEVANT_TAG_EXTRACTOR =
098: // "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";
099:
100: // // this pattern extracts 'href' or 'src' attributes from
101: // // any open-tag innards matched by the above
102: // static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
103: // "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
104: //
105: // // this pattern extracts 'robots' attributes
106: // static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
107: // "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");
108:
/**
 * Longest attribute name matched by the catch-all alternative of
 * EACH_ATTRIBUTE_EXTRACTOR. Taken from the system property named after
 * this class plus {@code ".maxAttributeNameLength"}; defaults to 1024.
 */
private static final int MAX_ATTR_NAME_LENGTH = Integer
        .parseInt(System.getProperty(ExtractorHTML.class.getName()
                + ".maxAttributeNameLength", "1024")); // 1K;

/**
 * Longest attribute value matched by EACH_ATTRIBUTE_EXTRACTOR. Taken
 * from the system property named after this class plus
 * {@code ".maxAttributeValueLength"}; defaults to 16384.
 */
static final int MAX_ATTR_VAL_LENGTH = Integer.parseInt(System
        .getProperty(ExtractorHTML.class.getName()
                + ".maxAttributeValueLength", "16384")); // 16K;

// TODO: perhaps cut to near MAX_URI_LENGTH

// This pattern extracts attributes from any open-tag innards
// matched by RELEVANT_TAG_EXTRACTOR. Attributes known to be URIs of
// various sorts are matched specially (each gets its own group).
// An attribute must be preceded by whitespace to match.
static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4
        + "|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
        + "|(?:usemap)|(?:profile)|(?:datasrc))" // 5
        + "|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
        + "|(value)|(style)|([-\\w]{1,"
        + MAX_ATTR_NAME_LENGTH
        + "}))" // 10, 11, 12
        + "\\s*=\\s*"
        + "(?:(?:\"(.{0,"
        + MAX_ATTR_VAL_LENGTH
        + "}?)(?:\"|$))" // 13
        + "|(?:'(.{0," + MAX_ATTR_VAL_LENGTH + "}?)(?:'|$))" // 14
        + "|(\\S{1," + MAX_ATTR_VAL_LENGTH + "}))"; // 15
// groups:
// 1: attribute name
// 2: HREF - single URI relative to doc base, or occasionally javascript:
// 3: ACTION - single URI relative to doc base, or occasionally javascript:
// 4: ON[WHATEVER] - script handler
// 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
// single URI relative to doc base
// 6: CODEBASE - a single URI relative to doc base, affecting other
// attributes
// 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
// 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
// (if supplied)
// 9: CODE - a single URI relative to the CODEBASE (if specified).
// 10: VALUE - often includes a uri path on forms
// 11: STYLE - inline attribute style info
// 12: any other attribute
// 13: double-quote delimited attr value (closing quote or end of input)
// 14: single-quote delimited attr value (closing quote or end of input)
// 15: space-delimited attr value
154:
// Much like the javascript likely-URI extractor, but
// without requiring quotes -- this can indicate whether
// an HTML tag attribute that isn't definitionally a
// URI might be one anyway, as in form-tag VALUE attributes.
static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
// Regex used to split the space-delimited ARCHIVE attribute value.
static final String WHITESPACE = "\\s";
// Suffix appended to an APPLET CODE value that lacks it.
static final String CLASSEXT = ".class";
// Element names that processGeneralTag compares against, ignoring case.
static final String APPLET = "applet";
static final String BASE = "base";
static final String LINK = "link";
static final String FRAME = "frame";
static final String IFRAME = "iframe";

/** Setting name: whether FRAME/IFRAME SRC links count as embeds; default true. */
public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS = "treat-frames-as-embed-links";

/** Setting name: whether FORM ACTION URIs are ignored; default false. */
public static final String ATTR_IGNORE_FORM_ACTION_URLS = "ignore-form-action-urls";

/** Setting name: whether to try finding links in Javascript; default true. */
public static final String ATTR_EXTRACT_JAVASCRIPT = "extract-javascript";

/** Setting name: whether URI-looking strings in unusual places are extracted; default true. */
public static final String ATTR_OVERLY_EAGER_LINK_DETECTION = "overly-eager-link-detection";

/** Setting name: whether HTML found at typical non-HTML extensions is skipped; default true. */
public static final String ATTR_IGNORE_UNEXPECTED_HTML = "ignore-unexpected-html";

/** Count of CrawlURIs this extractor has gone on to scan; shown by report(). */
protected long numberOfCURIsHandled = 0;
/** Running count of links extracted; shown by report(). */
protected long numberOfLinksExtracted = 0;
181:
/**
 * Creates the extractor with the stock description.
 *
 * @param name name passed through to the two-argument constructor
 */
public ExtractorHTML(String name) {
    this(name, "HTML extractor. Extracts links from HTML documents");
}

/**
 * Creates the extractor and registers its five boolean expert settings,
 * in this order: extract-javascript, treat-frames-as-embed-links,
 * ignore-form-action-urls, overly-eager-link-detection and
 * ignore-unexpected-html.
 *
 * @param name name passed to the superclass
 * @param description description passed to the superclass
 */
public ExtractorHTML(String name, String description) {
    super(name, description);

    addExpertBooleanSetting(
            ATTR_EXTRACT_JAVASCRIPT,
            "If true, in-page Javascript is scanned for strings that "
                    + "appear likely to be URIs. This typically finds both valid "
                    + "and invalid URIs, and attempts to fetch the invalid URIs "
                    + "sometimes generates webmaster concerns over odd crawler "
                    + "behavior. Default is true.",
            Boolean.TRUE);

    addExpertBooleanSetting(
            ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
            "If true, FRAME/IFRAME SRC-links are treated as embedded "
                    + "resources (like IMG, 'E' hop-type), otherwise they are "
                    + "treated as navigational links. Default is true.",
            Boolean.TRUE);

    addExpertBooleanSetting(
            ATTR_IGNORE_FORM_ACTION_URLS,
            "If true, URIs appearing as the ACTION attribute in "
                    + "HTML FORMs are ignored. Default is false.",
            Boolean.FALSE);

    addExpertBooleanSetting(
            ATTR_OVERLY_EAGER_LINK_DETECTION,
            "If true, strings that look like URIs found in unusual "
                    + "places (such as form VALUE attributes) will be extracted. "
                    + "This typically finds both valid and invalid URIs, and "
                    + "attempts to fetch the invalid URIs sometimes generate "
                    + "webmaster concerns over odd crawler behavior. Default "
                    + "is true.",
            Boolean.TRUE);

    addExpertBooleanSetting(
            ATTR_IGNORE_UNEXPECTED_HTML,
            "If true, URIs which end in typical non-HTML extensions "
                    + "(such as .gif) will not be scanned as if it were HTML. "
                    + "Default is true.",
            Boolean.TRUE);
}

/**
 * Adds one boolean setting to this module's definition and flags it
 * as an expert setting.
 */
private void addExpertBooleanSetting(String settingName, String help,
        Boolean defaultValue) {
    Type setting = addElementToDefinition(new SimpleType(settingName,
            help, defaultValue));
    setting.setExpertSetting(true);
}
225:
226: protected void processGeneralTag(CrawlURI curi,
227: CharSequence element, CharSequence cs) {
228:
229: Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
230: cs);
231:
232: // Just in case it's an OBJECT or APPLET tag
233: String codebase = null;
234: ArrayList<String> resources = null;
235:
236: final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(
237: curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
238:
239: final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
240: curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
241:
242: final boolean overlyEagerLinkDetection = ((Boolean) getUncheckedAttribute(
243: curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
244:
245: final String elementStr = element.toString();
246:
247: while (attr.find()) {
248: int valueGroup = (attr.start(13) > -1) ? 13 : (attr
249: .start(14) > -1) ? 14 : 15;
250: int start = attr.start(valueGroup);
251: int end = attr.end(valueGroup);
252: assert start >= 0 : "Start is: " + start + ", " + curi;
253: assert end >= 0 : "End is :" + end + ", " + curi;
254: CharSequence value = cs.subSequence(start, end);
255: value = TextUtils.unescapeHtml(value);
256: if (attr.start(2) > -1) {
257: // HREF
258: CharSequence context = Link.elementContext(element,
259: attr.group(2));
260: if (elementStr.equalsIgnoreCase(LINK)) {
261: // <LINK> elements treated as embeds (css, ico, etc)
262: processEmbed(curi, value, context);
263: } else {
264: // other HREFs treated as links
265: processLink(curi, value, context);
266: }
267: if (elementStr.equalsIgnoreCase(BASE)) {
268: try {
269: curi.setBaseURI(value.toString());
270: } catch (URIException e) {
271: if (getController() != null) {
272: // Controller can be null: e.g. when running
273: // ExtractorTool.
274: getController().logUriError(e,
275: curi.getUURI(), value.toString());
276: } else {
277: logger.info("Failed set base uri: " + curi
278: + ", " + value.toString() + ": "
279: + e.getMessage());
280: }
281: }
282: }
283: } else if (attr.start(3) > -1) {
284: // ACTION
285: if (!ignoreFormActions) {
286: CharSequence context = Link.elementContext(element,
287: attr.group(3));
288: processLink(curi, value, context);
289: }
290: } else if (attr.start(4) > -1) {
291: // ON____
292: processScriptCode(curi, value); // TODO: context?
293: } else if (attr.start(5) > -1) {
294: // SRC etc.
295: CharSequence context = Link.elementContext(element,
296: attr.group(5));
297:
298: // true, if we expect another HTML page instead of an image etc.
299: final char hopType;
300:
301: if (!framesAsEmbeds
302: && (elementStr.equalsIgnoreCase(FRAME) || elementStr
303: .equalsIgnoreCase(IFRAME))) {
304: hopType = Link.NAVLINK_HOP;
305: } else {
306: hopType = Link.EMBED_HOP;
307: }
308: processEmbed(curi, value, context, hopType);
309: } else if (attr.start(6) > -1) {
310: // CODEBASE
311: codebase = (value instanceof String) ? (String) value
312: : value.toString();
313: CharSequence context = Link.elementContext(element,
314: attr.group(6));
315: processEmbed(curi, codebase, context);
316: } else if (attr.start(7) > -1) {
317: // CLASSID, DATA
318: if (resources == null) {
319: resources = new ArrayList<String>();
320: }
321: resources.add(value.toString());
322: } else if (attr.start(8) > -1) {
323: // ARCHIVE
324: if (resources == null) {
325: resources = new ArrayList<String>();
326: }
327: String[] multi = TextUtils.split(WHITESPACE, value);
328: for (int i = 0; i < multi.length; i++) {
329: resources.add(multi[i]);
330: }
331: } else if (attr.start(9) > -1) {
332: // CODE
333: if (resources == null) {
334: resources = new ArrayList<String>();
335: }
336: // If element is applet and code value does not end with
337: // '.class' then append '.class' to the code value.
338: if (elementStr.equalsIgnoreCase(APPLET)
339: && !value.toString().toLowerCase().endsWith(
340: CLASSEXT)) {
341: resources.add(value.toString() + CLASSEXT);
342: } else {
343: resources.add(value.toString());
344: }
345: } else if (attr.start(10) > -1) {
346: // VALUE, with possibility of URI
347: if (overlyEagerLinkDetection
348: && TextUtils.matches(LIKELY_URI_PATH, value)) {
349: CharSequence context = Link.elementContext(element,
350: attr.group(10));
351: processLink(curi, value, context);
352: }
353:
354: } else if (attr.start(11) > -1) {
355: // STYLE inline attribute
356: // then, parse for URIs
357: this .numberOfLinksExtracted += ExtractorCSS
358: .processStyleCode(curi, value, getController());
359:
360: } else if (attr.start(12) > -1) {
361: // any other attribute
362: // ignore for now
363: // could probe for path- or script-looking strings, but
364: // those should be vanishingly rare in other attributes,
365: // and/or symptomatic of page bugs
366: }
367: }
368: TextUtils.recycleMatcher(attr);
369:
370: // handle codebase/resources
371: if (resources == null) {
372: return;
373: }
374: Iterator iter = resources.iterator();
375: UURI codebaseURI = null;
376: String res = null;
377: try {
378: if (codebase != null) {
379: // TODO: Pass in the charset.
380: codebaseURI = UURIFactory.getInstance(curi.getUURI(),
381: codebase);
382: }
383: while (iter.hasNext()) {
384: res = iter.next().toString();
385: res = (String) TextUtils.unescapeHtml(res);
386: if (codebaseURI != null) {
387: res = codebaseURI.resolve(res).toString();
388: }
389: processEmbed(curi, res, element); // TODO: include attribute too
390: }
391: } catch (URIException e) {
392: curi.addLocalizedError(getName(), e, "BAD CODEBASE "
393: + codebase);
394: } catch (IllegalArgumentException e) {
395: DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
396: + "codebase=" + codebase + " res=" + res + "\n"
397: + DevUtils.extraInfo(), e);
398: }
399: }
400:
401: /**
402: * Extract the (java)script source in the given CharSequence.
403: *
404: * @param curi source CrawlURI
405: * @param cs CharSequence of javascript code
406: */
407: protected void processScriptCode(CrawlURI curi, CharSequence cs) {
408: if ((Boolean) getUncheckedAttribute(curi,
409: ATTR_EXTRACT_JAVASCRIPT)) {
410: this .numberOfLinksExtracted += ExtractorJS.considerStrings(
411: curi, cs, getController(), false);
412: } // else do nothing
413: }
414:
415: static final String JAVASCRIPT = "(?i)^javascript:.*";
416:
417: /**
418: * Handle generic HREF cases.
419: *
420: * @param curi
421: * @param value
422: * @param context
423: */
424: protected void processLink(CrawlURI curi, final CharSequence value,
425: CharSequence context) {
426: if (TextUtils.matches(JAVASCRIPT, value)) {
427: processScriptCode(curi, value.subSequence(11, value
428: .length()));
429: } else {
430: if (logger.isLoggable(Level.FINEST)) {
431: logger.finest("link: " + value.toString() + " from "
432: + curi);
433: }
434: addLinkFromString(curi,
435: (value instanceof String) ? (String) value : value
436: .toString(), context, Link.NAVLINK_HOP);
437: this .numberOfLinksExtracted++;
438: }
439: }
440:
441: private void addLinkFromString(CrawlURI curi, String uri,
442: CharSequence context, char hopType) {
443: try {
444: // We do a 'toString' on context because its a sequence from
445: // the underlying ReplayCharSequence and the link its about
446: // to become a part of is expected to outlive the current
447: // ReplayCharSequence.
448: curi.createAndAddLinkRelativeToBase(uri,
449: context.toString(), hopType);
450: } catch (URIException e) {
451: if (getController() != null) {
452: getController().logUriError(e, curi.getUURI(), uri);
453: } else {
454: logger.info("Failed createAndAddLinkRelativeToBase "
455: + curi + ", " + uri + ", " + context + ", "
456: + hopType + ": " + e);
457: }
458: }
459: }
460:
461: protected final void processEmbed(CrawlURI curi,
462: CharSequence value, CharSequence context) {
463: processEmbed(curi, value, context, Link.EMBED_HOP);
464: }
465:
466: protected void processEmbed(CrawlURI curi,
467: final CharSequence value, CharSequence context, char hopType) {
468: if (logger.isLoggable(Level.FINEST)) {
469: logger.finest("embed (" + hopType + "): "
470: + value.toString() + " from " + curi);
471: }
472: addLinkFromString(curi,
473: (value instanceof String) ? (String) value : value
474: .toString(), context, hopType);
475: this .numberOfLinksExtracted++;
476: }
477:
478: public void extract(CrawlURI curi) {
479: if (!isHttpTransactionContentToProcess(curi)
480: || !(isExpectedMimeType(curi.getContentType(),
481: "text/html") || isExpectedMimeType(curi
482: .getContentType(), "application/xhtml"))) {
483: return;
484: }
485:
486: final boolean ignoreUnexpectedHTML = ((Boolean) getUncheckedAttribute(
487: curi, ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
488:
489: if (ignoreUnexpectedHTML) {
490: try {
491: if (!isHtmlExpectedHere(curi)) {
492: // HTML was not expected (eg a GIF was expected) so ignore
493: // (as if a soft 404)
494: return;
495: }
496: } catch (URIException e) {
497: logger.severe("Failed expectedHTML test: "
498: + e.getMessage());
499: }
500: }
501:
502: this .numberOfCURIsHandled++;
503:
504: ReplayCharSequence cs = null;
505:
506: try {
507: HttpRecorder hr = curi.getHttpRecorder();
508: if (hr == null) {
509: throw new IOException("Why is recorder null here?");
510: }
511: cs = hr.getReplayCharSequence();
512: } catch (IOException e) {
513: curi.addLocalizedError(this .getName(), e,
514: "Failed get of replay char sequence "
515: + curi.toString() + " " + e.getMessage());
516: logger.log(Level.SEVERE,
517: "Failed get of replay char sequence in "
518: + Thread.currentThread().getName(), e);
519: }
520:
521: if (cs == null) {
522: return;
523: }
524:
525: // We have a ReplayCharSequence open. Wrap all in finally so we
526: // for sure close it before we leave.
527: try {
528: // Extract all links from the charsequence
529: extract(curi, cs);
530: // Set flag to indicate that link extraction is completed.
531: curi.linkExtractorFinished();
532: } finally {
533: if (cs != null) {
534: try {
535: cs.close();
536: } catch (IOException ioe) {
537: logger
538: .warning(TextUtils
539: .exceptionToString(
540: "Failed close of ReplayCharSequence.",
541: ioe));
542: }
543: }
544: }
545: }
546:
547: /**
548: * Run extractor.
549: * This method is package visible to ease testing.
550: * @param curi CrawlURI we're processing.
551: * @param cs Sequence from underlying ReplayCharSequence. This
552: * is TRANSIENT data. Make a copy if you want the data to live outside
553: * of this extractors' lifetime.
554: */
555: void extract(CrawlURI curi, CharSequence cs) {
556: Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
557: while (tags.find()) {
558: if (Thread.interrupted()) {
559: break;
560: }
561: if (tags.start(8) > 0) {
562: // comment match
563: // for now do nothing
564: } else if (tags.start(7) > 0) {
565: // <meta> match
566: int start = tags.start(5);
567: int end = tags.end(5);
568: assert start >= 0 : "Start is: " + start + ", " + curi;
569: assert end >= 0 : "End is :" + end + ", " + curi;
570: if (processMeta(curi, cs.subSequence(start, end))) {
571:
572: // meta tag included NOFOLLOW; abort processing
573: break;
574: }
575: } else if (tags.start(5) > 0) {
576: // generic <whatever> match
577: int start5 = tags.start(5);
578: int end5 = tags.end(5);
579: assert start5 >= 0 : "Start is: " + start5 + ", "
580: + curi;
581: assert end5 >= 0 : "End is :" + end5 + ", " + curi;
582: int start6 = tags.start(6);
583: int end6 = tags.end(6);
584: assert start6 >= 0 : "Start is: " + start6 + ", "
585: + curi;
586: assert end6 >= 0 : "End is :" + end6 + ", " + curi;
587: processGeneralTag(curi, cs.subSequence(start6, end6),
588: cs.subSequence(start5, end5));
589:
590: } else if (tags.start(1) > 0) {
591: // <script> match
592: int start = tags.start(1);
593: int end = tags.end(1);
594: assert start >= 0 : "Start is: " + start + ", " + curi;
595: assert end >= 0 : "End is :" + end + ", " + curi;
596: assert tags.end(2) >= 0 : "Tags.end(2) illegal "
597: + tags.end(2) + ", " + curi;
598: processScript(curi, cs.subSequence(start, end), tags
599: .end(2)
600: - start);
601:
602: } else if (tags.start(3) > 0) {
603: // <style... match
604: int start = tags.start(3);
605: int end = tags.end(3);
606: assert start >= 0 : "Start is: " + start + ", " + curi;
607: assert end >= 0 : "End is :" + end + ", " + curi;
608: assert tags.end(4) >= 0 : "Tags.end(4) illegal "
609: + tags.end(4) + ", " + curi;
610: processStyle(curi, cs.subSequence(start, end), tags
611: .end(4)
612: - start);
613: }
614: }
615: TextUtils.recycleMatcher(tags);
616: }
617:
618: static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"
619: + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
620:
621: /**
622: * Test whether this HTML is so unexpected (eg in place of a GIF URI)
623: * that it shouldn't be scanned for links.
624: *
625: * @param curi CrawlURI to examine.
626: * @return True if HTML is acceptable/expected here
627: * @throws URIException
628: */
629: protected boolean isHtmlExpectedHere(CrawlURI curi)
630: throws URIException {
631: String path = curi.getUURI().getPath();
632: if (path == null) {
633: // no path extension, HTML is fine
634: return true;
635: }
636: int dot = path.lastIndexOf('.');
637: if (dot < 0) {
638: // no path extension, HTML is fine
639: return true;
640: }
641: if (dot < (path.length() - 5)) {
642: // extension too long to recognize, HTML is fine
643: return true;
644: }
645: String ext = path.substring(dot + 1);
646: return !TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
647: }
648:
649: protected void processScript(CrawlURI curi, CharSequence sequence,
650: int endOfOpenTag) {
651: // first, get attributes of script-open tag
652: // as per any other tag
653: processGeneralTag(curi, sequence.subSequence(0, 6), sequence
654: .subSequence(0, endOfOpenTag));
655:
656: // then, apply best-effort string-analysis heuristics
657: // against any code present (false positives are OK)
658: processScriptCode(curi, sequence.subSequence(endOfOpenTag,
659: sequence.length()));
660: }
661:
662: /**
663: * Process metadata tags.
664: * @param curi CrawlURI we're processing.
665: * @param cs Sequence from underlying ReplayCharSequence. This
666: * is TRANSIENT data. Make a copy if you want the data to live outside
667: * of this extractors' lifetime.
668: * @return True robots exclusion metatag.
669: */
670: protected boolean processMeta(CrawlURI curi, CharSequence cs) {
671: Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
672: cs);
673: String name = null;
674: String httpEquiv = null;
675: String content = null;
676: while (attr.find()) {
677: int valueGroup = (attr.start(13) > -1) ? 13 : (attr
678: .start(14) > -1) ? 14 : 15;
679: CharSequence value = cs.subSequence(attr.start(valueGroup),
680: attr.end(valueGroup));
681: if (attr.group(1).equalsIgnoreCase("name")) {
682: name = value.toString();
683: } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
684: httpEquiv = value.toString();
685: } else if (attr.group(1).equalsIgnoreCase("content")) {
686: content = value.toString();
687: }
688: // TODO: handle other stuff
689: }
690: TextUtils.recycleMatcher(attr);
691:
692: // Look for the 'robots' meta-tag
693: if ("robots".equalsIgnoreCase(name) && content != null) {
694: curi.putString(A_META_ROBOTS, content);
695: RobotsHonoringPolicy policy = getSettingsHandler()
696: .getOrder().getRobotsHonoringPolicy();
697: String contentLower = content.toLowerCase();
698: if ((policy == null || (!policy.isType(curi,
699: RobotsHonoringPolicy.IGNORE) && !policy.isType(
700: curi, RobotsHonoringPolicy.CUSTOM)))
701: && (contentLower.indexOf("nofollow") >= 0 || contentLower
702: .indexOf("none") >= 0)) {
703: // if 'nofollow' or 'none' is specified and the
704: // honoring policy is not IGNORE or CUSTOM, end html extraction
705: logger
706: .fine("HTML extraction skipped due to robots meta-tag for: "
707: + curi.toString());
708: return true;
709: }
710: } else if ("refresh".equalsIgnoreCase(httpEquiv)
711: && content != null) {
712: String refreshUri = content
713: .substring(content.indexOf("=") + 1);
714: try {
715: curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
716: Link.REFER_HOP);
717: } catch (URIException e) {
718: if (getController() != null) {
719: getController().logUriError(e, curi.getUURI(),
720: refreshUri);
721: } else {
722: logger
723: .info("Failed createAndAddLinkRelativeToBase "
724: + curi
725: + ", "
726: + cs
727: + ", "
728: + refreshUri + ": " + e);
729: }
730: }
731: }
732: return false;
733: }
734:
735: /**
736: * Process style text.
737: * @param curi CrawlURI we're processing.
738: * @param sequence Sequence from underlying ReplayCharSequence. This
739: * is TRANSIENT data. Make a copy if you want the data to live outside
740: * of this extractors' lifetime.
741: * @param endOfOpenTag
742: */
743: protected void processStyle(CrawlURI curi, CharSequence sequence,
744: int endOfOpenTag) {
745: // First, get attributes of script-open tag as per any other tag.
746: processGeneralTag(curi, sequence.subSequence(0, 6), sequence
747: .subSequence(0, endOfOpenTag));
748:
749: // then, parse for URIs
750: this .numberOfLinksExtracted += ExtractorCSS.processStyleCode(
751: curi, sequence.subSequence(endOfOpenTag, sequence
752: .length()), getController());
753: }
754:
755: /* (non-Javadoc)
756: * @see org.archive.crawler.framework.Processor#report()
757: */
758: public String report() {
759: StringBuffer ret = new StringBuffer();
760: ret
761: .append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
762: ret
763: .append(" Function: Link extraction on HTML documents\n");
764: ret.append(" CrawlURIs handled: " + this .numberOfCURIsHandled
765: + "\n");
766: ret.append(" Links extracted: "
767: + this .numberOfLinksExtracted + "\n\n");
768: return ret.toString();
769: }
770: }
|