001: /* $Id: WARCReader.java 4754 2006-11-28 02:03:03Z stack-sf $
002: *
003: * Created Aug 23, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc.v10;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.io.InputStream;
028: import java.util.Iterator;
029: import java.util.List;
030:
031: import org.apache.commons.cli.CommandLine;
032: import org.apache.commons.cli.HelpFormatter;
033: import org.apache.commons.cli.Option;
034: import org.apache.commons.cli.Options;
035: import org.apache.commons.cli.ParseException;
036: import org.apache.commons.cli.PosixParser;
037: import org.apache.commons.lang.NotImplementedException;
038: import org.archive.io.ArchiveReader;
039: import org.archive.io.ArchiveRecord;
040: import org.archive.io.warc.WARCConstants;
041:
042: /**
043: * WARCReader.
044: * Go via {@link WARCReaderFactory} to get instance.
045: * @author stack
046: * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
047: */
048: public class WARCReader extends ArchiveReader implements WARCConstants {
049: WARCReader() {
050: super ();
051: }
052:
053: @Override
054: protected void initialize(String i) {
055: super .initialize(i);
056: setVersion("0.10");
057: }
058:
059: /**
060: * Skip over any trailing new lines at end of the record so we're lined up
061: * ready to read the next.
062: * @param record
063: * @throws IOException
064: */
065: protected void gotoEOR(ArchiveRecord record) throws IOException {
066: if (record.available() != 0) {
067: throw new IOException(
068: "Record should be exhausted before coming "
069: + "in here");
070: }
071:
072: // Records end in 2*CRLF. Such it up.
073: readExpectedChar(getIn(), CRLF.charAt(0));
074: readExpectedChar(getIn(), CRLF.charAt(1));
075: readExpectedChar(getIn(), CRLF.charAt(0));
076: readExpectedChar(getIn(), CRLF.charAt(1));
077: }
078:
079: protected void readExpectedChar(final InputStream is,
080: final int expected) throws IOException {
081: int c = is.read();
082: if (c != expected) {
083: throw new IOException("Unexpected character "
084: + Integer.toHexString(c) + "(Expecting "
085: + Integer.toHexString(expected) + ")");
086: }
087: }
088:
089: /**
090: * Create new WARC record.
091: * Encapsulate housekeeping that has to do w/ creating new Record.
092: * @param is InputStream to use.
093: * @param offset Absolute offset into WARC file.
094: * @return A WARCRecord.
095: * @throws IOException
096: */
097: protected WARCRecord createArchiveRecord(InputStream is, long offset)
098: throws IOException {
099: return (WARCRecord) currentRecord(new WARCRecord(is,
100: getReaderIdentifier(), offset, isDigest(), isStrict()));
101: }
102:
103: @Override
104: public void dump(boolean compress) throws IOException,
105: java.text.ParseException {
106: for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
107: ArchiveRecord r = i.next();
108: System.out.println(r.getHeader().toString());
109: r.dump();
110: System.out.println();
111: }
112: }
113:
114: @Override
115: public ArchiveReader getDeleteFileOnCloseReader(final File f) {
116: throw new NotImplementedException("TODO");
117: }
118:
119: @Override
120: public String getDotFileExtension() {
121: return DOT_WARC_FILE_EXTENSION;
122: }
123:
124: @Override
125: public String getFileExtension() {
126: return WARC_FILE_EXTENSION;
127: }
128:
129: // Static methods follow. Mostly for command-line processing.
130:
131: /**
132: *
133: * @param formatter Help formatter instance.
134: * @param options Usage options.
135: * @param exitCode Exit code.
136: */
137: private static void usage(HelpFormatter formatter, Options options,
138: int exitCode) {
139: formatter
140: .printHelp(
141: "java org.archive.io.arc.WARCReader"
142: + " [--digest=true|false] \\\n"
143: + " [--format=cdx|cdxfile|dump|gzipdump]"
144: + " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL",
145: options);
146: System.exit(exitCode);
147: }
148:
149: /**
150: * Write out the arcfile.
151: *
152: * @param reader
153: * @param format Format to use outputting.
154: * @throws IOException
155: * @throws java.text.ParseException
156: */
157: protected static void output(WARCReader reader, String format)
158: throws IOException, java.text.ParseException {
159: if (!reader.output(format)) {
160: throw new IOException("Unsupported format: " + format);
161: }
162: }
163:
164: /**
165: * Output passed record using passed format specifier.
166: * @param r ARCReader instance to output.
167: * @param format What format to use outputting.
168: * @throws IOException
169: */
170: protected static void outputRecord(final WARCReader r,
171: final String format) throws IOException {
172: if (!r.outputRecord(format)) {
173: throw new IOException("Unsupported format"
174: + " (or unsupported on a single record): " + format);
175: }
176: }
177:
178: /**
179: * Generate a CDX index file for an ARC file.
180: *
181: * @param urlOrPath The ARC file to generate a CDX index for
182: * @throws IOException
183: * @throws java.text.ParseException
184: */
185: public static void createCDXIndexFile(String urlOrPath)
186: throws IOException, java.text.ParseException {
187: WARCReader r = WARCReaderFactory.get(urlOrPath);
188: r.setStrict(false);
189: r.setDigest(true);
190: output(r, CDX_FILE);
191: }
192:
193: /**
194: * Command-line interface to WARCReader.
195: *
196: * Here is the command-line interface:
197: * <pre>
198: * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
199: * -h,--help Prints this message and exits.
200: * -o,--offset Outputs record at this offset into arc file.</pre>
201: *
202: * <p>Outputs using a pseudo-CDX format as described here:
203: * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
204: * Legent</a> and here
205: * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
206: * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
207: * Hash is hard-coded straight SHA-1 hash of content.
208: *
209: * @param args Command-line arguments.
210: * @throws ParseException Failed parse of the command line.
211: * @throws IOException
212: * @throws java.text.ParseException
213: */
214: public static void main(String[] args) throws ParseException,
215: IOException, java.text.ParseException {
216: Options options = new Options();
217: options.addOption(new Option("h", "help", false,
218: "Prints this message and exits."));
219: options.addOption(new Option("o", "offset", true,
220: "Outputs record at this offset into arc file."));
221: options.addOption(new Option("d", "digest", true,
222: "Pass true|false. Expensive. Default: true (SHA-1)."));
223: options
224: .addOption(new Option("s", "strict", false,
225: "Strict mode. Fails parse if incorrectly formatted WARC."));
226: options.addOption(new Option("f", "format", true,
227: "Output options: 'cdx', cdxfile', 'dump', 'gzipdump',"
228: + "'or 'nohead'. Default: 'cdx'."));
229: PosixParser parser = new PosixParser();
230: CommandLine cmdline = parser.parse(options, args, false);
231: List cmdlineArgs = cmdline.getArgList();
232: Option[] cmdlineOptions = cmdline.getOptions();
233: HelpFormatter formatter = new HelpFormatter();
234:
235: // If no args, print help.
236: if (cmdlineArgs.size() <= 0) {
237: usage(formatter, options, 0);
238: }
239:
240: // Now look at options passed.
241: long offset = -1;
242: boolean digest = false;
243: boolean strict = false;
244: String format = CDX;
245: for (int i = 0; i < cmdlineOptions.length; i++) {
246: switch (cmdlineOptions[i].getId()) {
247: case 'h':
248: usage(formatter, options, 0);
249: break;
250:
251: case 'o':
252: offset = Long.parseLong(cmdlineOptions[i].getValue());
253: break;
254:
255: case 's':
256: strict = true;
257: break;
258:
259: case 'd':
260: digest = getTrueOrFalse(cmdlineOptions[i].getValue());
261: break;
262:
263: case 'f':
264: format = cmdlineOptions[i].getValue().toLowerCase();
265: boolean match = false;
266: // List of supported formats.
267: final String[] supportedFormats = { CDX, DUMP,
268: GZIP_DUMP, CDX_FILE };
269: for (int ii = 0; ii < supportedFormats.length; ii++) {
270: if (supportedFormats[ii].equals(format)) {
271: match = true;
272: break;
273: }
274: }
275: if (!match) {
276: usage(formatter, options, 1);
277: }
278: break;
279:
280: default:
281: throw new RuntimeException("Unexpected option: "
282: + +cmdlineOptions[i].getId());
283: }
284: }
285:
286: if (offset >= 0) {
287: if (cmdlineArgs.size() != 1) {
288: System.out.println("Error: Pass one arcfile only.");
289: usage(formatter, options, 1);
290: }
291: WARCReader r = WARCReaderFactory.get(new File(
292: (String) cmdlineArgs.get(0)), offset);
293: r.setStrict(strict);
294: outputRecord(r, format);
295: } else {
296: for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
297: String urlOrPath = (String) i.next();
298: try {
299: WARCReader r = WARCReaderFactory.get(urlOrPath);
300: r.setStrict(strict);
301: r.setDigest(digest);
302: output(r, format);
303: } catch (RuntimeException e) {
304: // Write out name of file we failed on to help with
305: // debugging. Then print stack trace and try to keep
306: // going. We do this for case where we're being fed
307: // a bunch of ARCs; just note the bad one and move
308: // on to the next.
309: System.err.println("Exception processing "
310: + urlOrPath + ": " + e.getMessage());
311: e.printStackTrace(System.err);
312: System.exit(1);
313: }
314: }
315: }
316: }
317: }
|