001: /* $Id: WARCReader.java 4754 2006-11-28 02:03:03Z stack-sf $
002: *
003: * Created Aug 23, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.io.InputStream;
028: import java.util.Iterator;
029: import java.util.List;
030:
031: import org.apache.commons.cli.CommandLine;
032: import org.apache.commons.cli.HelpFormatter;
033: import org.apache.commons.cli.Option;
034: import org.apache.commons.cli.Options;
035: import org.apache.commons.cli.ParseException;
036: import org.apache.commons.cli.PosixParser;
037: import org.apache.commons.lang.NotImplementedException;
038: import org.archive.io.ArchiveReader;
039: import org.archive.io.ArchiveRecord;
040: import org.archive.io.warc.WARCConstants;
041:
042: /**
043: * WARCReader.
044: * Go via {@link WARCReaderFactory} to get instance.
045: * @author stack
046: * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
047: */
048: public class WARCReader extends ArchiveReader implements WARCConstants {
049: WARCReader() {
050: super ();
051: }
052:
053: @Override
054: protected void initialize(String i) {
055: super .initialize(i);
056: setVersion(WARC_VERSION);
057: }
058:
059: /**
060: * Skip over any trailing new lines at end of the record so we're lined up
061: * ready to read the next.
062: * @param record
063: * @throws IOException
064: */
065: protected void gotoEOR(ArchiveRecord record) throws IOException {
066: if (record.available() != 0) {
067: throw new IOException(
068: "Record should be exhausted before coming "
069: + "in here");
070: }
071:
072: // Records end in 2*CRLF. Suck it up.
073: readExpectedChar(getIn(), CRLF.charAt(0));
074: readExpectedChar(getIn(), CRLF.charAt(1));
075: readExpectedChar(getIn(), CRLF.charAt(0));
076: readExpectedChar(getIn(), CRLF.charAt(1));
077: }
078:
079: protected void readExpectedChar(final InputStream is,
080: final int expected) throws IOException {
081: int c = is.read();
082: if (c != expected) {
083: throw new IOException("Unexpected character "
084: + Integer.toHexString(c) + "(Expecting "
085: + Integer.toHexString(expected) + ")");
086: }
087: }
088:
089: /**
090: * Create new WARC record.
091: * Encapsulate housekeeping that has to do w/ creating new Record.
092: * @param is InputStream to use.
093: * @param offset Absolute offset into WARC file.
094: * @return A WARCRecord.
095: * @throws IOException
096: */
097: protected WARCRecord createArchiveRecord(InputStream is, long offset)
098: throws IOException {
099: return (WARCRecord) currentRecord(new WARCRecord(is,
100: getReaderIdentifier(), offset, isDigest(), isStrict()));
101: }
102:
103: @Override
104: public void dump(boolean compress) throws IOException,
105: java.text.ParseException {
106: for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
107: ArchiveRecord r = i.next();
108: System.out.println(r.getHeader().toString());
109: r.dump();
110: System.out.println();
111: }
112: }
113:
114: @Override
115: public ArchiveReader getDeleteFileOnCloseReader(final File f) {
116: throw new NotImplementedException("TODO");
117: }
118:
119: @Override
120: public String getDotFileExtension() {
121: return DOT_WARC_FILE_EXTENSION;
122: }
123:
124: @Override
125: public String getFileExtension() {
126: return WARC_FILE_EXTENSION;
127: }
128:
129: // Static methods follow. Mostly for command-line processing.
130:
131: /**
132: *
133: * @param formatter Help formatter instance.
134: * @param options Usage options.
135: * @param exitCode Exit code.
136: */
137: private static void usage(HelpFormatter formatter, Options options,
138: int exitCode) {
139: formatter
140: .printHelp(
141: "java org.archive.io.arc.WARCReader"
142: + " [--digest=true|false] \\\n"
143: + " [--format=cdx|cdxfile|dump|gzipdump]"
144: + " [--offset=#] \\\n[--strict] [--parse] WARC_FILE|WARC_URL",
145: options);
146: System.exit(exitCode);
147: }
148:
149: /**
150: * Write out the arcfile.
151: *
152: * @param reader
153: * @param format Format to use outputting.
154: * @throws IOException
155: * @throws java.text.ParseException
156: */
157: protected static void output(WARCReader reader, String format)
158: throws IOException, java.text.ParseException {
159: if (!reader.output(format)) {
160: throw new IOException("Unsupported format: " + format);
161: }
162: }
163:
164: /**
165: * Generate a CDX index file for an ARC file.
166: *
167: * @param urlOrPath The ARC file to generate a CDX index for
168: * @throws IOException
169: * @throws java.text.ParseException
170: */
171: public static void createCDXIndexFile(String urlOrPath)
172: throws IOException, java.text.ParseException {
173: WARCReader r = WARCReaderFactory.get(urlOrPath);
174: r.setStrict(false);
175: r.setDigest(true);
176: output(r, CDX_FILE);
177: }
178:
179: /**
180: * Command-line interface to WARCReader.
181: *
182: * Here is the command-line interface:
183: * <pre>
184: * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
185: * -h,--help Prints this message and exits.
186: * -o,--offset Outputs record at this offset into arc file.</pre>
187: *
188: * <p>Outputs using a pseudo-CDX format as described here:
189: * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
190: * Legent</a> and here
191: * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
192: * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
193: * Hash is hard-coded straight SHA-1 hash of content.
194: *
195: * @param args Command-line arguments.
196: * @throws ParseException Failed parse of the command line.
197: * @throws IOException
198: * @throws java.text.ParseException
199: */
200: public static void main(String[] args) throws ParseException,
201: IOException, java.text.ParseException {
202: Options options = getOptions();
203: PosixParser parser = new PosixParser();
204: CommandLine cmdline = parser.parse(options, args, false);
205: List cmdlineArgs = cmdline.getArgList();
206: Option[] cmdlineOptions = cmdline.getOptions();
207: HelpFormatter formatter = new HelpFormatter();
208:
209: // If no args, print help.
210: if (cmdlineArgs.size() <= 0) {
211: usage(formatter, options, 0);
212: }
213:
214: // Now look at options passed.
215: long offset = -1;
216: boolean digest = false;
217: boolean strict = false;
218: String format = CDX;
219: for (int i = 0; i < cmdlineOptions.length; i++) {
220: switch (cmdlineOptions[i].getId()) {
221: case 'h':
222: usage(formatter, options, 0);
223: break;
224:
225: case 'o':
226: offset = Long.parseLong(cmdlineOptions[i].getValue());
227: break;
228:
229: case 's':
230: strict = true;
231: break;
232:
233: case 'd':
234: digest = getTrueOrFalse(cmdlineOptions[i].getValue());
235: break;
236:
237: case 'f':
238: format = cmdlineOptions[i].getValue().toLowerCase();
239: boolean match = false;
240: // List of supported formats.
241: final String[] supportedFormats = { CDX, DUMP,
242: GZIP_DUMP, CDX_FILE };
243: for (int ii = 0; ii < supportedFormats.length; ii++) {
244: if (supportedFormats[ii].equals(format)) {
245: match = true;
246: break;
247: }
248: }
249: if (!match) {
250: usage(formatter, options, 1);
251: }
252: break;
253:
254: default:
255: throw new RuntimeException("Unexpected option: "
256: + +cmdlineOptions[i].getId());
257: }
258: }
259:
260: if (offset >= 0) {
261: if (cmdlineArgs.size() != 1) {
262: System.out.println("Error: Pass one arcfile only.");
263: usage(formatter, options, 1);
264: }
265: WARCReader r = WARCReaderFactory.get(new File(
266: (String) cmdlineArgs.get(0)), offset);
267: r.setStrict(strict);
268: outputRecord(r, format);
269: } else {
270: for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
271: String urlOrPath = (String) i.next();
272: try {
273: WARCReader r = WARCReaderFactory.get(urlOrPath);
274: r.setStrict(strict);
275: r.setDigest(digest);
276: output(r, format);
277: } catch (RuntimeException e) {
278: // Write out name of file we failed on to help with
279: // debugging. Then print stack trace and try to keep
280: // going. We do this for case where we're being fed
281: // a bunch of ARCs; just note the bad one and move
282: // on to the next.
283: System.err.println("Exception processing "
284: + urlOrPath + ": " + e.getMessage());
285: e.printStackTrace(System.err);
286: System.exit(1);
287: }
288: }
289: }
290: }
291: }
|