01: /* Extractor
02: *
03: * $Id: Extractor.java 4497 2006-08-15 01:31:35Z stack-sf $
04: *
05: * Created on Sep 22, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.extractor;
26:
27: import java.util.logging.Level;
28: import java.util.logging.Logger;
29:
30: import org.archive.crawler.datamodel.CrawlURI;
31: import org.archive.crawler.framework.Processor;
32:
33: /**
34: * Convenience shared superclass for Extractor Processors.
35: *
36: * Currently only wraps Extractor-specific extract() action with
37: * a StackOverflowError catch/log/proceed handler, so that any
38: * extractors that recurse too deep on problematic input will
39: * only suffer a local error, and other normal CrawlURI processing
40: * can continue. See:
41: * [ 1122836 ] Localize StackOverflowError in Extractors
42: * http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
43: *
44: * This class could also become home to common utility features
45: * of extractors, like a running tally of the URIs examined/discovered,
46: * etc.
47: *
48: * @author gojomo
49: */
50: public abstract class Extractor extends Processor {
51: private static final Logger logger = Logger
52: .getLogger(Extractor.class.getName());
53:
54: /**
55: * Passthrough constructor.
56: *
57: * @param name
58: * @param description
59: */
60: public Extractor(String name, String description) {
61: super (name, description);
62: // TODO Auto-generated constructor stub
63: }
64:
65: public void innerProcess(CrawlURI curi) {
66: try {
67: extract(curi);
68: } catch (NullPointerException npe) {
69: // both annotate (to highlight in crawl log) & add as local-error
70: curi.addAnnotation("err=" + npe.getClass().getName());
71: curi.addLocalizedError(getName(), npe, "");
72: // also log as warning
73: logger.log(Level.WARNING, getName()
74: + ": NullPointerException", npe);
75: } catch (StackOverflowError soe) {
76: // both annotate (to highlight in crawl log) & add as local-error
77: curi.addAnnotation("err=" + soe.getClass().getName());
78: curi.addLocalizedError(getName(), soe, "");
79: // also log as warning
80: logger.log(Level.WARNING, getName()
81: + ": StackOverflowError", soe);
82: } catch (java.nio.charset.CoderMalfunctionError cme) {
83: // See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
84: // Both annotate (to highlight in crawl log) & add as local-error
85: curi.addAnnotation("err=" + cme.getClass().getName());
86: curi.addLocalizedError(getName(), cme, ""); // <-- Message field ignored when logging.
87: logger.log(Level.WARNING, getName()
88: + ": CoderMalfunctionError", cme);
89: }
90: }
91:
92: protected abstract void extract(CrawlURI curi);
93: }
|