001: /* $Id: Warc2Arc.java 4977 2007-03-09 23:57:28Z stack-sf $
002: *
003: * Created Aug 29, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.util.ArrayList;
028: import java.util.Arrays;
029: import java.util.Iterator;
030: import java.util.List;
031: import java.util.concurrent.atomic.AtomicInteger;
032: import java.util.logging.Level;
033: import java.util.logging.Logger;
034:
035: import org.apache.commons.cli.CommandLine;
036: import org.apache.commons.cli.HelpFormatter;
037: import org.apache.commons.cli.Option;
038: import org.apache.commons.cli.Options;
039: import org.apache.commons.cli.ParseException;
040: import org.apache.commons.cli.PosixParser;
041: import org.archive.io.arc.ARCWriter;
042: import org.archive.io.warc.WARCConstants;
043: import org.archive.io.warc.WARCReader;
044: import org.archive.io.warc.WARCReaderFactory;
045: import org.archive.io.warc.WARCRecord;
046: import org.archive.util.ArchiveUtils;
047: import org.archive.util.FileUtils;
048:
049: /**
050: * Convert WARCs to (sortof) ARCs.
051: * WARCs can be 1Gig in size, that is, 10x default ARC size. Script takes
052: * directory as output and will write multiple ARCs for a single large WARC.
053: * Only writes resource records of type <code>text/dns</code> or
054: * <code>application/http; msgtype=response</code>. All others -- metadata,
055: * request -- are skipped.
056: * @author stack
057: * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
058: */
059: public class Warc2Arc {
060: private static void usage(HelpFormatter formatter, Options options,
061: int exitCode) {
062: formatter
063: .printHelp(
064: "java org.archive.io.arc.Warc2Arc "
065: + "[--force] [--prefix=PREFIX] [--suffix=SUFFIX] WARC_INPUT "
066: + "OUTPUT_DIR", options);
067: System.exit(exitCode);
068: }
069:
070: static String parseRevision(final String version) {
071: final String ID = "$Revision: ";
072: int index = version.indexOf(ID);
073: return (index < 0) ? version : version.substring(
074: index + ID.length(), version.length() - 1).trim();
075: }
076:
077: private static String getRevision() {
078: return parseRevision("$Revision: 4977 $");
079: }
080:
081: public void transform(final File warc, final File dir,
082: final String prefix, final String suffix,
083: final boolean force) throws IOException,
084: java.text.ParseException {
085: FileUtils.isReadable(warc);
086: FileUtils.isReadable(dir);
087: WARCReader reader = WARCReaderFactory.get(warc);
088: List<String> metadata = new ArrayList<String>();
089: metadata.add("Made from " + reader.getReaderIdentifier()
090: + " by " + this .getClass().getName() + "/"
091: + getRevision());
092: ARCWriter writer = new ARCWriter(new AtomicInteger(), Arrays
093: .asList(new File[] { dir }), prefix, suffix, reader
094: .isCompressed(), -1, metadata);
095: transform(reader, writer);
096: }
097:
098: protected void transform(final WARCReader reader,
099: final ARCWriter writer) throws IOException,
100: java.text.ParseException {
101: // No point digesting. Digest is available after reading of ARC which
102: // is too late for inclusion in WARC.
103: reader.setDigest(false);
104: // I don't want the close being logged -- least, not w/o log of
105: // an opening (and that'd be a little silly for simple script
106: // like this). Currently, it logs at level INFO so that close
107: // of files gets written to log files. Up the log level just
108: // for the close.
109: Logger l = Logger.getLogger(writer.getClass().getName());
110: Level oldLevel = l.getLevel();
111: try {
112: l.setLevel(Level.WARNING);
113: for (final Iterator i = reader.iterator(); i.hasNext();) {
114: WARCRecord r = (WARCRecord) i.next();
115: if (!isARCType(r.getHeader().getMimetype())) {
116: continue;
117: }
118: if (r.getHeader().getContentBegin() <= 0) {
119: // Otherwise, because length include Header-Line and
120: // Named Fields, these will end up in the ARC unless there
121: // is a non-zero content begin.
122: continue;
123: }
124: String ip = (String) r.getHeader().getHeaderValue(
125: (WARCConstants.HEADER_KEY_IP));
126: long length = r.getHeader().getLength();
127: int offset = r.getHeader().getContentBegin();
128: // This mimetype is not exactly what you'd expect to find in
129: // an ARC though technically its 'correct'. To get right one,
130: // need to parse the HTTP Headers. Thats messy. Not doing for
131: // now.
132: String mimetype = r.getHeader().getMimetype();
133: // Clean out ISO time string '-', 'T', ':', and 'Z' characters.
134: String t = r.getHeader().getDate().replaceAll("[-T:Z]",
135: "");
136: long time = ArchiveUtils.getSecondsSinceEpoch(t)
137: .getTime();
138: writer.write(r.getHeader().getUrl(), mimetype, ip,
139: time, (int) (length - offset), r);
140: }
141: } finally {
142: if (reader != null) {
143: reader.close();
144: }
145: if (writer != null) {
146: try {
147: writer.close();
148: } finally {
149: l.setLevel(oldLevel);
150: }
151: }
152: }
153: }
154:
155: protected boolean isARCType(final String mimetype) {
156: // Comparing mimetypes, especially WARC types can be problematic since
157: // they have whitespace. For now, ignore.
158: if (mimetype == null || mimetype.length() <= 0) {
159: return false;
160: }
161: String cleaned = mimetype.toLowerCase().trim();
162: if (cleaned.equals(WARCConstants.HTTP_RESPONSE_MIMETYPE)
163: || cleaned.equals("text/dns")) {
164: return true;
165: }
166: return false;
167: }
168:
169: /**
170: * Command-line interface to Arc2Warc.
171: *
172: * @param args Command-line arguments.
173: * @throws ParseException Failed parse of the command line.
174: * @throws IOException
175: * @throws java.text.ParseException
176: */
177: public static void main(String[] args) throws ParseException,
178: IOException, java.text.ParseException {
179: Options options = new Options();
180: options.addOption(new Option("h", "help", false,
181: "Prints this message and exits."));
182: options.addOption(new Option("f", "force", false,
183: "Force overwrite of target file."));
184: options
185: .addOption(new Option("p", "prefix", true,
186: "Prefix to use on created ARC files, else uses default."));
187: options
188: .addOption(new Option("s", "suffix", true,
189: "Suffix to use on created ARC files, else uses default."));
190: PosixParser parser = new PosixParser();
191: CommandLine cmdline = parser.parse(options, args, false);
192: List cmdlineArgs = cmdline.getArgList();
193: Option[] cmdlineOptions = cmdline.getOptions();
194: HelpFormatter formatter = new HelpFormatter();
195:
196: // If no args, print help.
197: if (cmdlineArgs.size() < 0) {
198: usage(formatter, options, 0);
199: }
200:
201: // Now look at options passed.
202: boolean force = false;
203: String prefix = "WARC2ARC";
204: String suffix = null;
205: for (int i = 0; i < cmdlineOptions.length; i++) {
206: switch (cmdlineOptions[i].getId()) {
207: case 'h':
208: usage(formatter, options, 0);
209: break;
210:
211: case 'f':
212: force = true;
213: break;
214:
215: case 'p':
216: prefix = cmdlineOptions[i].getValue();
217: break;
218:
219: case 's':
220: suffix = cmdlineOptions[i].getValue();
221: break;
222:
223: default:
224: throw new RuntimeException("Unexpected option: "
225: + +cmdlineOptions[i].getId());
226: }
227: }
228:
229: // If no args, print help.
230: if (cmdlineArgs.size() != 2) {
231: usage(formatter, options, 0);
232: }
233: (new Warc2Arc()).transform(new File(cmdlineArgs.get(0)
234: .toString()), new File(cmdlineArgs.get(1).toString()),
235: prefix, suffix, force);
236: }
237: }
|