001: /*
002: * ARC2WCDX.java
003: *
004: * $Id: ARC2WCDX.java 4903 2007-02-16 01:45:10Z gojomo $
005: *
006: * Created on Nov 13, 2006
007: *
008: * Copyright (C) 2006 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.io.arc;
027:
028: import java.io.File;
029: import java.io.FileOutputStream;
030: import java.io.IOException;
031: import java.io.PrintStream;
032: import java.util.Date;
033: import java.util.Iterator;
034: import java.util.zip.GZIPOutputStream;
035:
036: import org.apache.commons.httpclient.Header;
037: import org.apache.commons.httpclient.HeaderGroup;
038: import org.apache.commons.httpclient.util.DateParseException;
039: import org.apache.commons.httpclient.util.DateUtil;
040: import org.archive.util.ArchiveUtils;
041: import org.archive.util.SURT;
042:
043: /**
044: * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
045: * Writes .wcdx.gz in same directory.
046: *
047: * @author gojomo
048: */
049: public class ARC2WCDX {
050: final public static String WCDX_VERSION = "0.1";
051:
052: public static void main(String[] args) throws IOException {
053: String arcFilename = args[0];
054: createWcdx(arcFilename);
055: }
056:
057: public static Object[] createWcdx(String arcFilename)
058: throws IOException {
059: ARCReader reader = ARCReaderFactory.get(arcFilename);
060: Object[] retVal = createWcdx(reader);
061: reader.close();
062: return retVal;
063: }
064:
065: public static Object[] createWcdx(ARCReader reader) {
066: reader.setDigest(true);
067:
068: String wcdxPath = reader.getReaderIdentifier().replaceAll(
069: "\\.arc(\\.gz)?$", ".wcdx.gz");
070: File wcdxFile = new File(wcdxPath + ".open");
071: PrintStream writer = null;
072: long count = 0;
073: try {
074: writer = new PrintStream(new GZIPOutputStream(
075: new FileOutputStream(wcdxFile)));
076:
077: // write header: legend + timestamp
078: StringBuilder legend = new StringBuilder();
079: appendField(legend, "CDX");
080: appendField(legend, "surt-uri");
081: appendField(legend, "b"); // ARC timestamp
082: appendField(legend, "http-date");
083: appendField(legend, "s"); // status code
084: appendField(legend, "m"); // media type
085: appendField(legend, "sha1"); // content sha1
086: appendField(legend, "g"); // ARC name
087: appendField(legend, "V"); // start offset
088: appendField(legend, "end-offset"); // TODO: implement
089: appendField(legend, "n"); // ARC record length TODO: verify
090: appendField(legend, "http-content-length");
091: appendField(legend, "http-last-modified");
092: appendField(legend, "http-expires");
093: appendField(legend, "http-etag");
094: appendField(legend, "http-location");
095: appendField(legend, "e"); // IP
096: appendField(legend, "a"); // original URL
097: // WCDX version+creation time: crude version control
098: appendField(legend, WCDX_VERSION + "@"
099: + ArchiveUtils.get14DigitDate());
100: writer.println(legend.toString());
101:
102: Iterator iter = reader.iterator();
103: count = 0;
104: while (iter.hasNext()) {
105: ARCRecord record = (ARCRecord) iter.next();
106: record.close();
107: ARCRecordMetaData h = (ARCRecordMetaData) record
108: .getHeader();
109: Header[] httpHeaders = record.getHttpHeaders();
110: if (httpHeaders == null) {
111: httpHeaders = new Header[0];
112: }
113: HeaderGroup hg = new HeaderGroup();
114: hg.setHeaders(httpHeaders);
115: StringBuilder builder = new StringBuilder();
116:
117: // SURT-form URI
118: appendField(builder, SURT.fromURI(h.getUrl()));
119: // record timestamp ('b')
120: appendField(builder, h.getDate());
121: // http header date
122: appendTimeField(builder, hg.getFirstHeader("Date"));
123: // response code ('s')
124: appendField(builder, h.getStatusCode());
125: // media type ('m')
126: appendField(builder, h.getMimetype());
127: // content checksum (like 'c', but here Base32 SHA1)
128: appendField(builder, record.getDigestStr());
129: // arc name ('g')
130: appendField(builder, reader.getFileName());
131: // compressed start offset ('V')
132: appendField(builder, h.getOffset());
133:
134: // compressed end offset (?)
135: // appendField(builder,
136: // reader.getInputStream() instanceof RepositionableStream
137: // ? ((GzippedInputStream)reader.getInputStream()).vPosition()
138: // : "-");
139: // TODO; leave unavail for now
140: appendField(builder, "-");
141:
142: // uncompressed (declared in ARC headerline) record length
143: appendField(builder, h.getLength());
144: // http header content-length
145: appendField(builder, hg
146: .getFirstHeader("Content-Length"));
147:
148: // http header mod-date
149: appendTimeField(builder, hg
150: .getFirstHeader("Last-Modified"));
151: // http header expires
152: appendTimeField(builder, hg.getFirstHeader("Expires"));
153:
154: // http header etag
155: appendField(builder, hg.getFirstHeader("ETag"));
156: // http header redirect ('Location' header?)
157: appendField(builder, hg.getFirstHeader("Location"));
158: // ip ('e')
159: appendField(builder, h.getIp());
160: // original URI
161: appendField(builder, h.getUrl());
162: // TODO MAYBE - a title from inside content?
163:
164: writer.println(builder.toString());
165: count++;
166: }
167: wcdxFile.renameTo(new File(wcdxPath));
168: } catch (IOException e) {
169: // soldier on: but leave '.open' wcdx file as indicator of error
170: if (!wcdxFile.exists()) {
171: try {
172: wcdxFile.createNewFile();
173: } catch (IOException e1) {
174: // TODO Auto-generated catch block
175: throw new RuntimeException(e1);
176: }
177: }
178: } catch (RuntimeException e) {
179: // soldier on: but leave '.open' wcdx file as indicator of error
180: if (!wcdxFile.exists()) {
181: try {
182: wcdxFile.createNewFile();
183: } catch (IOException e1) {
184: // TODO Auto-generated catch block
185: throw new RuntimeException(e1);
186: }
187: }
188: } finally {
189: if (writer != null) {
190: writer.close();
191: }
192: }
193:
194: return new Object[] { wcdxPath, count };
195: }
196:
197: protected static void appendField(StringBuilder builder, Object obj) {
198: if (builder.length() > 0) {
199: // prepend with delimiter
200: builder.append(' ');
201: }
202: if (obj instanceof Header) {
203: obj = ((Header) obj).getValue().trim();
204: }
205:
206: builder
207: .append((obj == null || obj.toString().length() == 0) ? "-"
208: : obj);
209: }
210:
211: protected static void appendTimeField(StringBuilder builder,
212: Object obj) {
213: if (builder.length() > 0) {
214: // prepend with delimiter
215: builder.append(' ');
216: }
217: if (obj == null) {
218: builder.append("-");
219: return;
220: }
221: if (obj instanceof Header) {
222: String s = ((Header) obj).getValue().trim();
223: try {
224: Date date = DateUtil.parseDate(s);
225: String d = ArchiveUtils.get14DigitDate(date);
226: if (d.startsWith("209")) {
227: d = "199" + d.substring(3);
228: }
229: obj = d;
230: } catch (DateParseException e) {
231: builder.append('e');
232: return;
233: }
234:
235: }
236: builder.append(obj);
237: }
238: }
239:
240: //'wide' CDX
241: //a original url
242: //b timestamp
243: //s resp code
244: //m type
245: //? content md5 (full 'k'? 'c'?
246: //g arc name
247: //V compressed start offset
248: //? compressed length
249: //n? uncompressed length
250: //? mod date
251: //? expires
252: //? server 'date' hdr
253: //? etag
254: //r redirect ('Location'?)
255: //e ip
256: //MAYBE:
257: //? TITLE from HTML or other format?
|