001: /* $Id: Arc2Warc.java 4977 2007-03-09 23:57:28Z stack-sf $
002: *
003: * Created Aug 29, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import java.io.BufferedOutputStream;
026: import java.io.ByteArrayOutputStream;
027: import java.io.File;
028: import java.io.FileOutputStream;
029: import java.io.IOException;
030: import java.util.ArrayList;
031: import java.util.Iterator;
032: import java.util.List;
033: import java.util.logging.Level;
034: import java.util.logging.Logger;
035:
036: import org.apache.commons.cli.CommandLine;
037: import org.apache.commons.cli.HelpFormatter;
038: import org.apache.commons.cli.Option;
039: import org.apache.commons.cli.Options;
040: import org.apache.commons.cli.ParseException;
041: import org.apache.commons.cli.PosixParser;
042: import org.archive.io.arc.ARCConstants;
043: import org.archive.io.arc.ARCReader;
044: import org.archive.io.arc.ARCReaderFactory;
045: import org.archive.io.arc.ARCRecord;
046: import org.archive.io.warc.WARCConstants;
047: import org.archive.io.warc.ExperimentalWARCWriter;
048: import org.archive.util.FileUtils;
049: import org.archive.util.anvl.ANVLRecord;
050:
051: /**
052: * Convert ARCs to (sortof) WARCs.
053: * @author stack
054: * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
055: */
056: public class Arc2Warc {
057: private static void usage(HelpFormatter formatter, Options options,
058: int exitCode) {
059: formatter.printHelp("java org.archive.io.arc.Arc2Warc "
060: + "[--force] ARC_INPUT WARC_OUTPUT", options);
061: System.exit(exitCode);
062: }
063:
064: private static String getRevision() {
065: return Warc2Arc.parseRevision("$Revision: 4977 $");
066: }
067:
068: public void transform(final File arc, final File warc,
069: final boolean force) throws IOException {
070: FileUtils.isReadable(arc);
071: if (warc.exists() && !force) {
072: throw new IOException("Target WARC already exists. "
073: + "Will not overwrite.");
074: }
075:
076: ARCReader reader = ARCReaderFactory.get(arc, false, 0);
077: transform(reader, warc);
078: }
079:
080: protected void transform(final ARCReader reader, final File warc)
081: throws IOException {
082: ExperimentalWARCWriter writer = null;
083: // No point digesting. Digest is available after reading of ARC which
084: // is too late for inclusion in WARC.
085: reader.setDigest(false);
086: try {
087: BufferedOutputStream bos = new BufferedOutputStream(
088: new FileOutputStream(warc));
089: // Get the body of the first ARC record as a String so can dump it
090: // into first record of WARC.
091: final Iterator<ArchiveRecord> i = reader.iterator();
092: ARCRecord firstRecord = (ARCRecord) i.next();
093: ByteArrayOutputStream baos = new ByteArrayOutputStream(
094: (int) firstRecord.getHeader().getLength());
095: firstRecord.dump(baos);
096: // Add ARC first record content as an ANVLRecord.
097: ANVLRecord ar = new ANVLRecord(1);
098: ar.addLabelValue("Filedesc", baos.toString());
099: List<String> metadata = new ArrayList<String>(1);
100: metadata.add(ar.toString());
101: // Now create the writer. If reader was compressed, lets write
102: // a compressed WARC.
103: writer = new ExperimentalWARCWriter(null, bos, warc, reader
104: .isCompressed(), null, metadata);
105: // Write a warcinfo record with description about how this WARC
106: // was made.
107: writer.writeWarcinfoRecord(warc.getName(), "Made from "
108: + reader.getReaderIdentifier() + " by "
109: + this .getClass().getName() + "/" + getRevision());
110: for (; i.hasNext();) {
111: write(writer, (ARCRecord) i.next());
112: }
113: } finally {
114: if (reader != null) {
115: reader.close();
116: }
117: if (writer != null) {
118: // I don't want the close being logged -- least, not w/o log of
119: // an opening (and that'd be a little silly for simple script
120: // like this). Currently, it logs at level INFO so that close
121: // of files gets written to log files. Up the log level just
122: // for the close.
123: Logger l = Logger
124: .getLogger(writer.getClass().getName());
125: Level oldLevel = l.getLevel();
126: l.setLevel(Level.WARNING);
127: try {
128: writer.close();
129: } finally {
130: l.setLevel(oldLevel);
131: }
132: }
133: }
134: }
135:
136: protected void write(final ExperimentalWARCWriter writer,
137: final ARCRecord r) throws IOException {
138: ANVLRecord ar = new ANVLRecord();
139: String ip = (String) r.getHeader().getHeaderValue(
140: (ARCConstants.IP_HEADER_FIELD_KEY));
141: if (ip != null && ip.length() > 0) {
142: ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
143: }
144: // If contentBody > 0, assume http headers. Make the mimetype
145: // be application/http. Otherwise, give it ARC mimetype.
146: writer
147: .writeResourceRecord(
148: r.getHeader().getUrl(),
149: r.getHeader().getDate(),
150: (r.getHeader().getContentBegin() > 0) ? WARCConstants.HTTP_RESPONSE_MIMETYPE
151: : r.getHeader().getMimetype(), ar, r, r
152: .getHeader().getLength());
153: }
154:
155: /**
156: * Command-line interface to Arc2Warc.
157: *
158: * @param args Command-line arguments.
159: * @throws ParseException Failed parse of the command line.
160: * @throws IOException
161: * @throws java.text.ParseException
162: */
163: public static void main(String[] args) throws ParseException,
164: IOException, java.text.ParseException {
165: Options options = new Options();
166: options.addOption(new Option("h", "help", false,
167: "Prints this message and exits."));
168: options.addOption(new Option("f", "force", false,
169: "Force overwrite of target file."));
170: PosixParser parser = new PosixParser();
171: CommandLine cmdline = parser.parse(options, args, false);
172: List cmdlineArgs = cmdline.getArgList();
173: Option[] cmdlineOptions = cmdline.getOptions();
174: HelpFormatter formatter = new HelpFormatter();
175:
176: // If no args, print help.
177: if (cmdlineArgs.size() <= 0) {
178: usage(formatter, options, 0);
179: }
180:
181: // Now look at options passed.
182: boolean force = false;
183: for (int i = 0; i < cmdlineOptions.length; i++) {
184: switch (cmdlineOptions[i].getId()) {
185: case 'h':
186: usage(formatter, options, 0);
187: break;
188:
189: case 'f':
190: force = true;
191: break;
192:
193: default:
194: throw new RuntimeException("Unexpected option: "
195: + +cmdlineOptions[i].getId());
196: }
197: }
198:
199: // If no args, print help.
200: if (cmdlineArgs.size() != 2) {
201: usage(formatter, options, 0);
202: }
203: (new Arc2Warc()).transform(new File(cmdlineArgs.get(0)
204: .toString()), new File(cmdlineArgs.get(1).toString()),
205: force);
206: }
207: }
|