/* ExtractorTool
 *
 * Created on Mar 14, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Logger;

import javax.management.Attribute;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.OneLineSimpleLogger;

/**
 * Run named extractors against the passed ARC file.
 * This extractor tool runs suboptimally: it takes each ARC record,
 * writes it to a new scratch file, and then runs each listed
 * extractor against the scratch copy. It works this way because
 * extractors want a CharSequence (they address characters by absolute
 * position), but ARCs are compressed streams, and the work to expose
 * a CharSequence over an underlying compressed stream has not been
 * done. A CrawlerSettings environment also has to be set up before
 * the extractors can run.
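 * <p>
 * A typical command-line invocation might look like the following
 * (the ARC path is illustrative only):
 * <pre>
 * java org.archive.crawler.extractor.ExtractorTool \
 *     --scratch=/tmp/extractor-scratch \
 *     --extractor=org.archive.crawler.extractor.ExtractorHTML \
 *     /path/to/sample.arc.gz
 * </pre>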
 * @author stack
 * @version $Date: 2006-09-26 23:47:15 +0000 (Tue, 26 Sep 2006) $, $Revision: 4671 $
 */
public class ExtractorTool {
//    private static final Logger logger =
//        Logger.getLogger(ExtractorTool.class.getName());
    static {
        // Set the one-line formatter on any console log handlers.
        Handler[] hs = Logger.getLogger("").getHandlers();
        for (int i = 0; i < hs.length; i++) {
            Handler h = hs[i];
            if (h instanceof ConsoleHandler) {
                h.setFormatter(new OneLineSimpleLogger());
            }
        }
    }

    private static final String[] DEFAULT_EXTRACTORS = {
        "org.archive.crawler.extractor.ExtractorHTTP",
        "org.archive.crawler.extractor.ExtractorHTML"};
    private final List<Processor> extractors;
    private final File scratchDir;
    private static final String DEFAULT_SCRATCH = "/tmp";

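    /**
     * Constructor. Uses the default extractor set and the default
     * scratch directory.
     * @throws Exception
     */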
    public ExtractorTool() throws Exception {
        this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH);
    }

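    /**
     * Constructor.
     * @param e Fully-qualified class names of extractors to run, in the
     * order they should be applied.
     * @param scratch Directory to write scratch files to. If null, the
     * default scratch directory is used.
     * @throws Exception
     */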
    public ExtractorTool(String[] e, String scratch) throws Exception {
        super();
        // Set up the scratch directory.
        this.scratchDir = (scratch == null) ?
            new File(DEFAULT_SCRATCH) : new File(scratch);
        if (!this.scratchDir.exists()) {
            this.scratchDir.mkdirs();
        }
        // Set up the settings system. Needed by extractors.
        File orderFile = new File(this.scratchDir.getAbsolutePath(),
            ExtractorTool.class.getName() + "_order.xml");
        SettingsHandler settingsHandler = new XMLSettingsHandler(orderFile);
        settingsHandler.initialize();
        settingsHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY,
                this.scratchDir.getAbsolutePath()));
        CrawlerSettings globalSettings =
            settingsHandler.getSettingsObject(null);
        MapType extractorsSettings = (MapType)settingsHandler.getOrder().
            getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS);
        this.extractors = new ArrayList<Processor>();
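        // Instantiate each extractor by reflection: Heritrix processors
        // take a String name in their constructor. Each instance is added
        // to the extract-processors map and explicitly enabled before use.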
        for (int i = 0; i < e.length; i++) {
            Constructor c = Class.forName(e[i]).
                getConstructor(new Class[] {String.class});
            String name = Integer.toString(i);
            Processor p = (Processor)c.newInstance(new Object[] {name});
            extractorsSettings.addElement(globalSettings, p);
            p.setAttribute(
                new Attribute(Processor.ATTR_ENABLED, Boolean.TRUE));
            this.extractors.add(p);
        }
    }

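    /**
     * Run the configured extractors against each record of the passed
     * ARC file, printing discovered out-links as each record is processed.
     * @param resource Path to the ARC file to extract from.
     * @throws IOException
     * @throws URIException
     * @throws InterruptedException
     */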
    public void extract(String resource)
    throws IOException, URIException, InterruptedException {
        ARCReader reader = ARCReaderFactory.get(new File(resource));
        for (Iterator i = reader.iterator(); i.hasNext();) {
            ARCRecord ar = (ARCRecord)i.next();
            HttpRecorder hr = HttpRecorder.wrapInputStreamWithHttpRecord(
                this.scratchDir, this.getClass().getName(), ar, null);
            CrawlURI curi = getCrawlURI(ar, hr);
            for (Iterator ii = this.extractors.iterator(); ii.hasNext();) {
                ((Processor)ii.next()).process(curi);
            }
            outlinks(curi);
        }
    }

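    /**
     * Print the processed URI followed by each out-link it yielded,
     * one per indented line with destination, hop type, and link context.
     * @param curi CrawlURI whose out-links are printed.
     */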
    protected void outlinks(CrawlURI curi) {
        System.out.println(curi.getUURI().toString());
        for (Link l : curi.getOutLinks()) {
            System.out.println(" " + l.getDestination() + " " +
                l.getHopType() + " " + l.getContext());
        }
    }

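    /**
     * Build a CrawlURI from an ARC record so extractors get the context
     * they expect: content size, content type, the recorder holding the
     * record body and, for non-filedesc records, a faked HTTP transaction
     * plus fetch status.
     * @param record ARC record to build from.
     * @param hr HttpRecorder wrapping the record content.
     * @return A populated CrawlURI.
     * @throws URIException
     */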
    protected CrawlURI getCrawlURI(final ARCRecord record,
            final HttpRecorder hr)
    throws URIException {
        CrawlURI curi = new CrawlURI(
            UURIFactory.getInstance(record.getMetaData().getUrl()));
        curi.setContentSize(record.getMetaData().getLength());
        curi.setContentType(record.getMetaData().getMimetype());
        curi.setHttpRecorder(hr);
        // Trick the extractors into treating this as a legitimate HTTP
        // transaction (filedesc records carry no HTTP response).
        if (!curi.getUURI().getScheme().equals("filedesc")) {
            curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
                new HttpMethodBase() {
                    public String getName() {
                        return this.getClass().getName() + "_method";
                    }

                    public Header getResponseHeader(String headerName) {
                        String value = (String)record.getMetaData().
                            getHeaderValue(headerName);
                        return (value == null || value.length() == 0) ?
                            null : new Header(headerName, value);
                    }
            });
            String statusCode = record.getMetaData().getStatusCode();
            curi.setFetchStatus((statusCode == null) ?
                200 : Integer.parseInt(statusCode));
        }
        return curi;
    }

    /**
     * Format and print a usage message, then exit.
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java " + ExtractorTool.class.getName() +
            " \\\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC",
            options);
        System.exit(exitCode);
    }

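    /**
     * Command-line entry point. Parses options, builds an ExtractorTool,
     * and runs it against each ARC file named on the command line.
     * @param args Command-line arguments.
     * @throws Exception
     */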
    public static void main(String[] args) throws Exception {
        Options options = new Options();
        options.addOption(new Option("h", "help", false,
            "Prints this message and exits."));
        StringBuffer defaultExtractors = new StringBuffer();
        for (int i = 0; i < DEFAULT_EXTRACTORS.length; i++) {
            if (i > 0) {
                defaultExtractors.append(", ");
            }
            defaultExtractors.append(DEFAULT_EXTRACTORS[i]);
        }
        options.addOption(new Option("e", "extractor", true,
            "List of comma-separated extractor class names. " +
            "Run in order listed. " +
            "If no extractors listed, runs following: " +
            defaultExtractors.toString() + "."));
        options.addOption(new Option("s", "scratch", true,
            "Directory to write scratch files to. Default: '/tmp'."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();
        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        String[] extractors = DEFAULT_EXTRACTORS;
        String scratch = null;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;

            case 'e':
                String value = cmdlineOptions[i].getValue();
                if (value == null || value.length() <= 0) {
                    // Allow saying NO extractors so we can see
                    // how much it costs just reading through ARCs.
                    extractors = new String[0];
                } else {
                    extractors = value.split(",");
                }
                break;

            case 's':
                scratch = cmdlineOptions[i].getValue();
                break;

            default:
                throw new RuntimeException("Unexpected option: " +
                    cmdlineOptions[i].getId());
            }
        }

        ExtractorTool tool = new ExtractorTool(extractors, scratch);
        for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
            tool.extract((String)i.next());
        }
    }
}