/* $Id: ARCReader.java 5039 2007-04-06 00:29:39Z gojomo $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;

/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file. Pass it a file path
 * or a URL to an ARC. It can parse ARC versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code>s, though
 * {@link Iterator#next()} is typed as returning {@link ArchiveRecord},
 * so cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much. TODO: Test more. Just
 * change {@link #getInputStream(File, long)}.
 *
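 * <p>A minimal usage sketch (the file path below is illustrative; it assumes
 * a surrounding method that declares IOException):
 * <pre>
 * ARCReader reader = ARCReaderFactory.get("/tmp/sample.arc.gz");
 * for (Iterator&lt;ArchiveRecord&gt; i = reader.iterator(); i.hasNext();) {
 *     ARCRecord record = (ARCRecord) i.next();
 *     System.out.println(record.getMetaData().getUrl());
 * }
 * reader.close();
 * </pre>
 *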
 * @author stack
 * @version $Date: 2007-04-06 00:29:39 +0000 (Fri, 06 Apr 2007) $ $Revision: 5039 $
 */
public abstract class ARCReader extends ArchiveReader
        implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on the first record of the Archive file.
     * We used to depend on the offset: if the offset was zero, then we were
     * aligned on the first record. This is no longer necessarily the case
     * when a Reader is created at an offset into an Archive file: the offset
     * is zero, but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100K, which seems massive, but it's the same as LINE_LENGTH
     * from <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String[] headerFieldNameKeysArray = { URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY, DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY };

    /**
     * A list of the header field names found on the 3rd line of the ARC
     * file header.
     *
     * We used to read these from the 3rd line of the arc file's first
     * record, but now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

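    /**
     * True if we are to parse the HTTP headers of records as we read them.
     */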
    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at the end of the record so we're
     * lined up ready to read the next.
     * @param record
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread. We're probably in the next record.
                    // There is no way of telling for sure. It may be dross
                    // at the end of the current record. Back up.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char) c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * Encapsulate housekeeping that has to do w/ creating a new record.
     *
     * <p>Call this method at the end of the constructor to read in the
     * arcfile header. There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     *
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
            throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at the first record on
            // creation (see #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one,
            // the record of ARC file meta info. It's special. In ARC
            // versions 1.x, the first record has three lines of meta info.
            // We've just read the first line. There are two more. The
            // second line has misc. info. We're only interested in the
            // first field, the version number. The third line is the list
            // of field names. Here's what ARC file version 1.x meta content
            // looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion(secondLineValues.get(0) + "." +
                secondLineValues.get(1));
            // Just read over the 3rd line. We used to parse it and use
            // values found here, but now we hardcode them to avoid having
            // to read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader) computeMetaData(this.headerFieldNameKeys,
                    firstLineValues, getVersion(), offset),
                bodyOffset, isDigest(), isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions. Let them out.
                throw e;
            }
            IOException newE = new IOException(e.getMessage() +
                " (Offset " + offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord) getCurrentRecord();
    }

    /**
     * Returns the version of this ARC file, usually read from the first
     * record of the ARC. If we're reading without having first read the
     * first record -- e.g. random access into the middle of an ARC -- then
     * the version will not have been set. For now, we return a default,
     * version 1.1. Later, if there is more than one version of ARC in use,
     * we could look at something like the meta line to see which version of
     * ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as a list of tokens.
     *
     * We keep reading until we find a LINE_SEPARATOR, we reach end of file
     * without finding one, or the line length exceeds the maximum allowed.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException(
                    "Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    "-- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + "). Here's some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer. Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // Reset to empty.
                buffer.setLength(0);
            } else {
                buffer.append((char) c);
            }
        }

        // List must have at least 3 elements in it and no more than 100. If
        // it has other than this, then it's a bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }

    /**
     * Compute metadata fields.
     *
     * Here we check that the metadata line has the right number of fields
     * in it.
     *
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the no. of keys doesn't match the no. of
     * values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
            throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInURL(values, keys.size());
                // If values still doesn't match the key size, try and do
                // further repair.
                if (keys.size() != values.size()) {
                    // Early ARCs had a space in the mimetype.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase().
                                startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    } else if ((values.size() + 1) == keys.size() &&
                            isLegitimateIPValue(values.get(1)) &&
                            isDate(values.get(2)) && isNumber(values.get(3))) {
                        // Mimetype is empty.
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, "-");
                        nuvalues.add(4, values.get(3));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note on stderr that the field was fixed.
            logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
                "offset " + offset + " Original: " + originalValues +
                ", New: " + values);
        }

        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Check for tabs in URLs. If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String) headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }

    protected boolean isDate(final String date) {
        if (date.length() != 14) {
            return false;
        }
        return isNumber(date);
    }

    protected boolean isNumber(final String n) {
        for (int i = 0; i < n.length(); i++) {
            if (!Character.isDigit(n.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    protected boolean isLegitimateIPValue(final String ip) {
        if ("-".equals(ip)) {
            return true;
        }
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        return m != null && m.matches();
    }

    /**
     * Fix spaces in URLs.
     * The ARCWriter used to write URLs with spaces in them into the ARC.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method fixes up such metadata lines, converting all spaces found
     * to '%20'.
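     *
     * <p>A hypothetical illustration (field values invented for the example):
     * the tokenized metadata line
     * <pre>
     * [http://example.com/a, b.html, 127.0.0.1, 20040101000000, text/html, 10]
     * </pre>
     * has six values against five expected keys and would be repaired to
     * <pre>
     * [http://example.com/a%20b.html, 127.0.0.1, 20040101000000, text/html, 10]
     * </pre>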
     * @param values List of metadata values.
     * @param requiredSize Expected size of resultant values list.
     * @return New list if we successfully fixed up values, or the original
     * if fixup failed.
     */
    protected List<String> fixSpaceInURL(List<String> values,
            int requiredSize) {
        // Do a validity check. The 3rd field from the end is a date of 14
        // numeric characters. The 4th from the end is the IP; everything
        // before the IP should be concatenated together with a '%20' joiner.
        // In the below, '4' is the 4th field from the end, which has the IP.
        if (!(values.size() > requiredSize) || values.size() < 4) {
            return values;
        }
        // Test the 3rd field from the end is a valid date.
        if (!isDate(values.get(values.size() - 3))) {
            return values;
        }

        // Test the 4th field from the end is a valid IP.
        if (!isLegitimateIPValue(values.get(values.size() - 4))) {
            return values;
        }

        List<String> newValues = new ArrayList<String>(requiredSize);
        StringBuffer url = new StringBuffer();
        for (int i = 0; i < (values.size() - 4); i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }
        newValues.add(url.toString());
        for (int i = values.size() - 4; i < values.size(); i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }

    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

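    /**
     * Output this ARC in the passed format. The NOHEAD and HEADER formats
     * are only supported for single records, so they are rejected here.
     * @param format Format to output in.
     * @return True if the format was handled.
     * @throws IOException
     * @throws java.text.ParseException
     */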
    protected boolean output(final String format)
            throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

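    /**
     * Output the current record in the passed format, adding handling of
     * the ARC-specific NOHEAD and HEADER formats over what the superclass
     * supports.
     * @param format Format to output in.
     * @return True if the record was output.
     * @throws IOException
     */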
    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }

        return result;
    }

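    /**
     * Dump this ARC on stdout, rewriting each record through an ARCWriter.
     * @param compress True if the rewritten ARC should be gzip-compressed.
     * @throws IOException
     * @throws java.text.ParseException
     */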
    public void dump(final boolean compress)
            throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord) ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at the top of the ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file. ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()), compress, meta.getDate(),
                    listOfMetadata);
                continue;
            }

            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int) meta.getLength(), r);
        }
        // System.out.println(System.currentTimeMillis() - start);
    }

    /**
     * Get an ARCReader that will delete a local file on close. Used when we
     * bring Archive files local and need to clean up afterward.
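     *
     * <p>A minimal sketch of the intended use; <code>reader</code> and the
     * local file path are illustrative:
     * <pre>
     * File local = new File("/tmp/downloaded.arc.gz");
     * ARCReader cleanup = reader.getDeleteFileOnCloseReader(local);
     * try {
     *     // ... iterate cleanup.iterator() ...
     * } finally {
     *     cleanup.close(); // Closes the reader and deletes the local file.
     * }
     * </pre>
     *
     * @param f Local file to delete when the returned reader is closed.
     * @return An ARCReader that deletes <code>f</code> on close.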
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;

            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    this.archiveFile = null;
                }
            }

            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }

            public boolean isDigest() {
                return this.delegate.isDigest();
            }

            public boolean isStrict() {
                return this.delegate.isStrict();
            }

            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }

            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }

            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }

            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset) throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
                    throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }

    // Static methods follow.

    /**
     * Print usage and exit.
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
            options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     *
     * @param reader ARCReader to output.
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format)
            throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
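     * <p>For example (the path below is illustrative):
     * <pre>
     * ARCReader.createCDXIndexFile("/tmp/sample.arc.gz");
     * </pre>
     *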
     * @param urlOrPath The ARC file to generate a CDX index for
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath)
            throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.</pre>
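     *
     * <p>For example, to emit CDX lines for a single ARC (the path below is
     * illustrative):
     * <pre>
     * java org.archive.io.arc.ARCReader --format=cdx /tmp/sample.arc.gz
     * </pre>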
     *
     * <p>See <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legend</a> and here:
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * The legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * The hash is a hard-coded straight SHA-1 hash of the content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String[] args)
            throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;

            case 'o':
                offset = Long.parseLong(cmdlineOptions[i].getValue());
                break;

            case 's':
                strict = true;
                break;

            case 'p':
                parse = true;
                break;

            case 'd':
                digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                break;

            case 'f':
                format = cmdlineOptions[i].getValue().toLowerCase();
                boolean match = false;
                // List of supported formats.
                final String[] supportedFormats =
                    { CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE };
                for (int ii = 0; ii < supportedFormats.length; ii++) {
                    if (supportedFormats[ii].equals(format)) {
                        match = true;
                        break;
                    }
                }
                if (!match) {
                    usage(formatter, options, 1);
                }
                break;

            default:
                throw new RuntimeException("Unexpected option: " +
                    cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc =
                ARCReaderFactory.get((String) cmdlineArgs.get(0), offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String) i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out the name of the file we failed on to help
                    // with debugging. Then print the stack trace and try to
                    // keep going. We do this for the case where we're being
                    // fed a bunch of ARCs; just note the bad one and move
                    // on to the next.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}
|