/* JerichoExtractorHTML
 *
 * Copyright (C) 2006 Olaf Freyer
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id: JerichoExtractorHTML.java 4726 2006-11-15 17:57:11Z stack-sf $
 */
package org.archive.crawler.extractor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.RobotsHonoringPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.FormControl;
import au.id.jericho.lib.html.FormControlType;
import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.FormFields;
import au.id.jericho.lib.html.HTMLElementName;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;

/**
 * Improved link extraction from an HTML content body using the Jericho HTML
 * parser. This extractor extends ExtractorHTML and mimics its workflow, but
 * differs substantially in its internal implementation: instead of relying
 * heavily on Java regular expressions, it uses a real HTML parser library,
 * namely the Jericho HTML Parser (http://jerichohtml.sourceforge.net).
 * Using this parser it handles broken HTML (e.g. missing quotes) better and
 * offers improved extraction of HTML form URLs (it extracts not only the
 * action of a form, but also its default values).
 * Unfortunately the parser has one major drawback: it has to read the whole
 * document into memory for parsing, and thus carries an inherent OOME risk.
 * This OOME risk can be reduced or eliminated by limiting the size of
 * documents to be parsed (e.g. using
 * NotExceedsDocumentLengthTresholdDecideRule). Also note that this extractor
 * appears to have lower overall memory consumption than ExtractorHTML,
 * though this is still to be confirmed on a larger-scale crawl.
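 * <p>
 * For example, a form such as
 * {@code <form action="search"><input name="q" value="heritrix"></form>}
 * is (when form-action extraction is enabled) turned into a candidate URI
 * along the lines of {@code search?q=heritrix}, i.e. the form's action plus
 * its default field values; see {@link #processForm(CrawlURI, Element)}.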
 *
 * @author Olaf Freyer
 * @version $Date: 2006-11-15 17:57:11 +0000 (Wed, 15 Nov 2006) $ $Revision: 4726 $
 */
public class JerichoExtractorHTML extends ExtractorHTML
        implements CoreAttributeConstants {

    private static final long serialVersionUID = 1684681316546343615L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

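    /** Number of FORM elements processed; reported by {@link #report()}. */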
    protected long numberOfFormsProcessed = 0;

    public JerichoExtractorHTML(String name) {
        this(name,
            "Jericho-HTML extractor. Extracts links from HTML documents "
            + "using the Jericho HTML Parser. Offers the same basic "
            + "functionality as ExtractorHTML but handles broken HTML and "
            + "the extraction of default values from HTML forms better. "
            + "A word of warning: the underlying parser, the Jericho HTML "
            + "Parser, reads the whole document into memory for parsing; "
            + "thus this extractor has an inherent OOME risk. This OOME "
            + "risk can be reduced or eliminated by limiting the size of "
            + "documents to be parsed (e.g. using "
            + "NotExceedsDocumentLengthTresholdDecideRule).");
    }

    public JerichoExtractorHTML(String name, String description) {
        super(name, description);
    }

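    /**
     * Collects all attributes whose name starts with "on" (onclick, onload,
     * etc.), i.e. inline event handlers that may contain script code.
     *
     * @param attributes attributes of the current element.
     * @return list of matching attributes, possibly empty.
     */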
    private static List<Attribute> findOnAttributes(Attributes attributes) {
        List<Attribute> result = new LinkedList<Attribute>();
        for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {
            Attribute attr = (Attribute) attrIter.next();
            if (attr.getKey().startsWith("on")) {
                result.add(attr);
            }
        }
        return result;
    }

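    /**
     * Extracts URIs from the attributes of a single element: HREF, ACTION,
     * ON* event handlers, SRC-style embeds (src, lowsrc, background, cite,
     * longdesc, usemap, profile, datasrc), OBJECT/APPLET codebase, classid,
     * data, archive and code resources, likely-URI VALUEs and inline STYLE
     * attributes.
     *
     * @param curi CrawlURI we're processing.
     * @param element the element whose attributes are examined.
     * @param attributes the element's attributes.
     */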
    protected void processGeneralTag(CrawlURI curi, Element element,
            Attributes attributes) {
        Attribute attr;
        String attrValue;
        List<Attribute> attrList;
        String elementName = element.getName();

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        ArrayList<String> resources = null;

        final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,
                ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();

        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(curi,
                ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();

        final boolean overlyEagerLinkDetection = ((Boolean) getUncheckedAttribute(
                curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();

        // HREF
        if (((attr = attributes.get("href")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            CharSequence context = Link.elementContext(elementName,
                    attr.getKey());
            if ("link".equals(elementName)) {
                // <LINK> elements treated as embeds (css, ico, etc.)
                processEmbed(curi, attrValue, context);
            } else {
                // other HREFs treated as links
                processLink(curi, attrValue, context);
            }
            if ("base".equals(elementName)) {
                try {
                    curi.setBaseURI(attrValue);
                } catch (URIException e) {
                    if (getController() != null) {
                        // Controller can be null: e.g. when running
                        // ExtractorTool.
                        getController().logUriError(e, curi.getUURI(),
                                attrValue);
                    } else {
                        logger.info("Failed to set base URI: " + curi + ", "
                                + attrValue + ": " + e.getMessage());
                    }
                }
            }
        }
        // ACTION
        else if (((attr = attributes.get("action")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            if (!ignoreFormActions) {
                CharSequence context = Link.elementContext(elementName,
                        attr.getKey());
                processLink(curi, attrValue, context);
            }
        }
        // ON* event handlers
        else if ((attrList = findOnAttributes(attributes)).size() != 0) {
            for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
                attr = (Attribute) attrIter.next();
                CharSequence valueSegment = attr.getValueSegment();
                if (valueSegment != null) {
                    processScriptCode(curi, valueSegment);
                }
            }
        }
        // SRC etc.
        else if ((((attr = attributes.get("src")) != null)
                || ((attr = attributes.get("lowsrc")) != null)
                || ((attr = attributes.get("background")) != null)
                || ((attr = attributes.get("cite")) != null)
                || ((attr = attributes.get("longdesc")) != null)
                || ((attr = attributes.get("usemap")) != null)
                || ((attr = attributes.get("profile")) != null)
                || ((attr = attributes.get("datasrc")) != null))
                && ((attrValue = attr.getValue()) != null)) {

            final char hopType;
            CharSequence context = Link.elementContext(elementName,
                    attr.getKey());

            if (!framesAsEmbeds
                    && ("frame".equals(elementName)
                            || "iframe".equals(elementName))) {
                hopType = Link.NAVLINK_HOP;
            } else {
                hopType = Link.EMBED_HOP;
            }

            processEmbed(curi, attrValue, context, hopType);
        }
        // CODEBASE
        else if (((attr = attributes.get("codebase")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            codebase = StringEscapeUtils.unescapeHtml(attrValue);
            CharSequence context = Link.elementContext(elementName,
                    attr.getKey());
            processEmbed(curi, codebase, context);
        }
        // CLASSID, DATA
        else if ((((attr = attributes.get("classid")) != null)
                || ((attr = attributes.get("data")) != null))
                && ((attrValue = attr.getValue()) != null)) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(attrValue);
        }
        // ARCHIVE
        else if (((attr = attributes.get("archive")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            String[] multi = TextUtils.split(WHITESPACE, attrValue);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        }
        // CODE
        else if (((attr = attributes.get("code")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // If the element is an applet and the code value does not end
            // with '.class', append '.class' to the code value.
            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
                resources.add(attrValue + CLASSEXT);
            } else {
                resources.add(attrValue);
            }
        }
        // VALUE
        else if (((attr = attributes.get("value")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            if (TextUtils.matches(LIKELY_URI_PATH, attrValue)
                    && overlyEagerLinkDetection) {
                CharSequence context = Link.elementContext(elementName,
                        attr.getKey());
                processLink(curi, attrValue, context);
            }
        }
        // STYLE inline attribute: parse for URIs
        else if (((attr = attributes.get("style")) != null)
                && ((attrValue = attr.getValue()) != null)) {
            this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
                    attrValue, getController());
        }

        // handle codebase/resources
        if (resources == null) {
            return;
        }

        Iterator<String> iter = resources.iterator();
        UURI codebaseURI = null;
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
            }
            while (iter.hasNext()) {
                res = iter.next();
                res = StringEscapeUtils.unescapeHtml(res);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                // TODO: include the attribute too
                processEmbed(curi, res, element);
            }
        } catch (URIException e) {
            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
                    + "codebase=" + codebase + " res=" + res + "\n"
                    + DevUtils.extraInfo(), e);
        }
    }

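    /**
     * Handles a META element: records a robots meta-tag on the CrawlURI,
     * aborts further HTML extraction if it specifies 'nofollow' or 'none'
     * (unless the robots honoring policy is IGNORE or CUSTOM), and extracts
     * the target URI of a meta refresh.
     *
     * @param curi CrawlURI we're processing.
     * @param element the META element.
     * @return true if HTML extraction should be aborted for this document.
     */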
    protected boolean processMeta(CrawlURI curi, Element element) {
        String name = element.getAttributeValue("name");
        String httpEquiv = element.getAttributeValue("http-equiv");
        String content = element.getAttributeValue("content");

        if ("robots".equals(name) && content != null) {
            curi.putString(A_META_ROBOTS, content);
            RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
                    .getRobotsHonoringPolicy();
            String contentLower = content.toLowerCase();
            if ((policy == null
                    || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
                        && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
                    && (contentLower.indexOf("nofollow") >= 0
                        || contentLower.indexOf("none") >= 0)) {
                // If 'nofollow' or 'none' is specified and the honoring
                // policy is not IGNORE or CUSTOM, end HTML extraction.
                logger.fine("HTML extraction skipped due to robots meta-tag "
                        + "for: " + curi.toString());
                return true;
            }
        }
        if ("refresh".equals(httpEquiv) && content != null) {
            String refreshUri = content.substring(content.indexOf("=") + 1);
            try {
                curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
                        Link.REFER_HOP);
            } catch (URIException e) {
                if (getController() != null) {
                    getController().logUriError(e, curi.getUURI(), refreshUri);
                } else {
                    logger.info("Failed createAndAddLinkRelativeToBase "
                            + curi + ", " + element.toString() + ", "
                            + refreshUri + ": " + e);
                }
            }
        }
        return false;
    }

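    /**
     * Handles a SCRIPT element: processes the attributes of the opening tag
     * like any other tag, then runs the script-code heuristics over the
     * element's content.
     *
     * @param curi CrawlURI we're processing.
     * @param element the SCRIPT element.
     */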
    protected void processScript(CrawlURI curi, Element element) {
        // First, get attributes of the script-open tag as per any other tag.
        processGeneralTag(curi, element, element.getAttributes());

        // Then, apply best-effort string-analysis heuristics against any
        // code present (false positives are OK).
        processScriptCode(curi, element.getContent());
    }

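    /**
     * Handles a STYLE element: processes the attributes of the opening tag
     * like any other tag, then parses the element's content for URIs via
     * ExtractorCSS.
     *
     * @param curi CrawlURI we're processing.
     * @param element the STYLE element.
     */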
    protected void processStyle(CrawlURI curi, Element element) {
        // First, get attributes of the style-open tag as per any other tag.
        processGeneralTag(curi, element, element.getAttributes());

        // Then, parse the content for URIs.
        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
                element.getContent(), getController());
    }

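    /**
     * Handles a FORM element: unless form-action URLs are ignored,
     * synthesizes a GET-style query URI from the form's action and the
     * default values of its controls, and processes it as a link.
     *
     * @param curi CrawlURI we're processing.
     * @param element the FORM element.
     */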
    protected void processForm(CrawlURI curi, Element element) {
        String action = element.getAttributeValue("action");
        String name = element.getAttributeValue("name");
        String queryURL = "";

        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
                curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();

        if (ignoreFormActions) {
            return;
        }

        numberOfFormsProcessed++;

        // Get all form fields.
        FormFields formFields = element.findFormFields();
        for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
            // For each form field...
            FormField formField = (FormField) fieldsIter.next();

            // ...and for each of its form controls:
            for (Iterator controlIter = formField.getFormControls().iterator();
                    controlIter.hasNext();) {
                FormControl formControl = (FormControl) controlIter.next();

                // Get the name of the control element.
                String controlName = formControl.getName();

                // Retrieve the list of values - submit controls need special
                // handling (only their predefined values are used).
                Collection controlValues;
                if (!(formControl.getFormControlType() == FormControlType.SUBMIT)) {
                    controlValues = formControl.getValues();
                } else {
                    controlValues = formControl.getPredefinedValues();
                }

                // Append one name=value pair per value, or an empty value
                // if the control has none.
                if (controlValues.size() > 0) {
                    for (Iterator valueIter = controlValues.iterator();
                            valueIter.hasNext();) {
                        String value = (String) valueIter.next();
                        queryURL += "&" + controlName + "=" + value;
                    }
                } else {
                    queryURL += "&" + controlName + "=";
                }
            }
        }

        // Turn the leading '&' into '?' to start the query string and
        // prepend the form's action, if any.
        if (action == null) {
            queryURL = queryURL.replaceFirst("&", "?");
        } else {
            if (!action.contains("?")) {
                queryURL = queryURL.replaceFirst("&", "?");
            }
            queryURL = action + queryURL;
        }

        CharSequence context = Link.elementContext(element.getName(),
                "name=" + name);
        processLink(curi, queryURL, context);
    }

    /**
     * Run the extractor. This method is package visible to ease testing.
     *
     * @param curi CrawlURI we're processing.
     * @param cs Sequence from the underlying ReplayCharSequence.
     */
    void extract(CrawlURI curi, CharSequence cs) {
        Source source = new Source(cs);
        List elements = source.findAllElements(StartTagType.NORMAL);
        for (Iterator elementIter = elements.iterator(); elementIter.hasNext();) {
            Element element = (Element) elementIter.next();
            String elementName = element.getName();
            Attributes attributes;
            if (elementName.equals(HTMLElementName.META)) {
                if (processMeta(curi, element)) {
                    // Meta tag requested nofollow/none; abort processing.
                    break;
                }
            } else if (elementName.equals(HTMLElementName.SCRIPT)) {
                processScript(curi, element);
            } else if (elementName.equals(HTMLElementName.STYLE)) {
                processStyle(curi, element);
            } else if (elementName.equals(HTMLElementName.FORM)) {
                processForm(curi, element);
            } else if (!(attributes = element.getAttributes()).isEmpty()) {
                processGeneralTag(curi, element, attributes);
            }
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
        ret.append("  Function: Link extraction on HTML documents\n");
        ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append("  Forms processed: " + this.numberOfFormsProcessed + "\n");
        ret.append("  Links extracted: " + this.numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}