Source Code Cross Referenced for ARCReader.java in  » Web-Crawler » heritrix » org » archive » io » arc » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.arc 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* $Id: ARCReader.java 5039 2007-04-06 00:29:39Z gojomo $
002:         *
003:         * Created on May 1, 2004
004:         *
005:         * Copyright (C) 2004 Internet Archive.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.io.arc;
024:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;
054:
055:        /**
056:         * Get an iterator on an ARC file or get a record by absolute position.
057:         *
058:         * ARC files are described here:
059:         * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
060:         * File Format</a>.
061:         *
062:         * <p>This class knows how to parse an ARC file.  Pass it a file path
063:         * or an URL to an ARC. It can parse ARC Version 1 and 2.
064:         *
065:         * <p>Iterator returns <code>ARCRecord</code>
066:         * though {@link Iterator#next()} is returning
067:         * java.lang.Object.  Cast the return.
068:         *
069:         * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
070:         * latter slightly slower -- but not by much.  TODO: Test more.  Just
071:         * change {@link #getInputStream(File, long)}.
072:         *
073:         * @author stack
074:         * @version $Date: 2007-04-06 00:29:39 +0000 (Fri, 06 Apr 2007) $ $Revision: 5039 $
075:         */
076:        public abstract class ARCReader extends ArchiveReader implements 
077:                ARCConstants {
    // Per-instance logger; package-private visibility kept as in the original.
    Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on first record of Archive file.
     * We used depend on offset. If offset was zero, then we were
     * aligned on first record.  This is no longer necessarily the case when
     * Reader is created at an offset into an Archive file: The offset is zero
     * but its relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This 100k which seems massive but its the same as the LINE_LENGTH from
     * <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH     (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String[] headerFieldNameKeysArray = { URL_FIELD_KEY,
            IP_HEADER_FIELD_KEY, DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
            LENGTH_FIELD_KEY };

    /**
     * An array of the header field names found in the ARC file header on
     * the 3rd line.
     *
     * We used to read these in from the arc file first record 3rd line but
     * now we hardcode them for sake of improved performance.
     */
    private final List<String> headerFieldNameKeys = Arrays
            .asList(this.headerFieldNameKeysArray);

    // Whether ARCRecords created by this reader parse any leading HTTP
    // response headers (see isParseHttpHeaders/setParseHttpHeaders).
    private boolean parseHttpHeaders = true;

    /** Package-private no-arg constructor; delegates to super(). */
    ARCReader() {
        super();
    }
124:
125:            /**
126:             * Skip over any trailing new lines at end of the record so we're lined up
127:             * ready to read the next.
128:             * @param record
129:             * @throws IOException
130:             */
131:            protected void gotoEOR(ArchiveRecord record) throws IOException {
132:                if (getIn().available() <= 0) {
133:                    return;
134:                }
135:
136:                // Remove any trailing LINE_SEPARATOR
137:                int c = -1;
138:                while (getIn().available() > 0) {
139:                    if (getIn().markSupported()) {
140:                        getIn().mark(1);
141:                    }
142:                    c = getIn().read();
143:                    if (c != -1) {
144:                        if (c == LINE_SEPARATOR) {
145:                            continue;
146:                        }
147:                        if (getIn().markSupported()) {
148:                            // We've overread.  We're probably in next record.  There is
149:                            // no way of telling for sure. It may be dross at end of
150:                            // current record. Backup.
151:                            getIn().reset();
152:                            break;
153:                        }
154:                        ArchiveRecordHeader h = (getCurrentRecord() != null) ? record
155:                                .getHeader()
156:                                : null;
157:                        throw new IOException("Read "
158:                                + (char) c
159:                                + " when only "
160:                                + LINE_SEPARATOR
161:                                + " expected. "
162:                                + getReaderIdentifier()
163:                                + ((h != null) ? h.getHeaderFields().toString()
164:                                        : ""));
165:                    }
166:                }
167:            }
168:
169:            /**
170:             * Create new arc record.
171:             *
172:             * Encapsulate housekeeping that has to do w/ creating a new record.
173:             *
174:             * <p>Call this method at end of constructor to read in the
175:             * arcfile header.  Will be problems reading subsequent arc records
176:             * if you don't since arcfile header has the list of metadata fields for
177:             * all records that follow.
178:             * 
179:             * <p>When parsing through ARCs writing out CDX info, we spend about
180:             * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
181:             * -- of which 16% is reading.
182:             *
183:             * @param is InputStream to use.
184:             * @param offset Absolute offset into arc file.
185:             * @return An arc record.
186:             * @throws IOException
187:             */
188:            protected ARCRecord createArchiveRecord(InputStream is, long offset)
189:                    throws IOException {
190:                ArrayList<String> firstLineValues = new ArrayList<String>(20);
191:                getTokenizedHeaderLine(is, firstLineValues);
192:                int bodyOffset = 0;
193:                if (offset == 0 && isAlignedOnFirstRecord()) {
194:                    // If offset is zero and we were aligned at first record on
195:                    // creation (See #alignedOnFirstRecord for more on this), then no
196:                    // records have been read yet and we're reading our first one, the
197:                    // record of ARC file meta info.  Its special.  In ARC versions
198:                    // 1.x, first record has three lines of meta info. We've just read
199:                    // the first line. There are two more.  The second line has misc.
200:                    // info.  We're only interested in the first field, the version
201:                    // number.  The third line is the list of field names. Here's what
202:                    // ARC file version 1.x meta content looks like:
203:                    //
204:                    // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
205:                    //      20040107015752 text/plain 77
206:                    // 1 0 InternetArchive
207:                    // URL IP-address Archive-date Content-type Archive-length
208:                    //
209:                    ArrayList<String> secondLineValues = new ArrayList<String>(
210:                            20);
211:                    bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
212:                    setVersion((String) secondLineValues.get(0) + "."
213:                            + (String) secondLineValues.get(1));
214:                    // Just read over the 3rd line.  We used to parse it and use
215:                    // values found here but now we just hardcode them to avoid
216:                    // having to read this 3rd line even for random arc file accesses.
217:                    bodyOffset += getTokenizedHeaderLine(is, null);
218:                }
219:
220:                try {
221:                    currentRecord(new ARCRecord(is,
222:                            (ArchiveRecordHeader) computeMetaData(
223:                                    this .headerFieldNameKeys, firstLineValues,
224:                                    getVersion(), offset), bodyOffset,
225:                            isDigest(), isStrict(), isParseHttpHeaders()));
226:                } catch (IOException e) {
227:                    if (e instanceof  RecoverableIOException) {
228:                        // Don't mess with RecoverableIOExceptions.  Let them out.
229:                        throw e;
230:                    }
231:                    IOException newE = new IOException(e.getMessage()
232:                            + " (Offset " + offset + ").");
233:                    newE.setStackTrace(e.getStackTrace());
234:                    throw newE;
235:                }
236:                return (ARCRecord) getCurrentRecord();
237:            }
238:
239:            /**
240:             * Returns version of this ARC file.  Usually read from first record of ARC.
241:             * If we're reading without having first read the first record -- e.g.
242:             * random access into middle of an ARC -- then version will not have been
243:             * set.  For now, we return a default, version 1.1.  Later, if more than
244:             * just one version of ARC, we could look at such as the meta line to see
245:             * what version of ARC this is.
246:             * @return Version of this ARC file.
247:             */
248:            public String getVersion() {
249:                return (super .getVersion() == null) ? "1.1" : super 
250:                        .getVersion();
251:            }
252:
    /**
     * Get a record header line as list of tokens.
     *
     * We keep reading till we find a LINE_SEPARATOR or we reach the end
     * of file w/o finding a LINE_SEPARATOR or the line length is crazy.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens; may be null,
     * in which case tokens are discarded and only the byte count is returned.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                // EOF mid-header is flagged recoverable so callers can
                // choose to skip past a truncated record.
                throw new RecoverableIOException(
                        "Hit EOF before header EOL.");
            }
            // Defensive mask to a byte value (read() already returns 0-255
            // once -1 has been ruled out above).
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException(
                        "Header line longer than max allowed "
                                + " -- "
                                + String
                                        .valueOf(MAX_HEADER_LINE_LENGTH)
                                + " -- or passed buffer doesn't contain a line (Read: "
                                + buffer.length()
                                + ").  Here's"
                                + " some of what was read: "
                                + buffer.substring(0, Math.min(buffer
                                        .length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer.  Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // reset to empty
                buffer.setLength(0);
            } else {
                buffer.append((char) c);
            }
        }

        // Sanity-check the parse: a valid header line yields at least 3
        // tokens.  NOTE(review): the old comment here claimed "no more than
        // 10" but the code has always allowed up to 100; the code is kept
        // as-is and the comment corrected.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }
328:
329:            /**
330:             * Compute metadata fields.
331:             *
332:             * Here we check the meta field has right number of items in it.
333:             *
334:             * @param keys Keys to use composing headerFields map.
335:             * @param values Values to set into the headerFields map.
336:             * @param v The version of this ARC file.
337:             * @param offset Offset into arc file.
338:             *
339:             * @return Metadata structure for this record.
340:             *
341:             * @exception IOException  If no. of keys doesn't match no. of values.
342:             */
343:            private ARCRecordMetaData computeMetaData(List<String> keys,
344:                    List<String> values, String v, long offset)
345:                    throws IOException {
346:                if (keys.size() != values.size()) {
347:                    List<String> originalValues = values;
348:                    if (!isStrict()) {
349:                        values = fixSpaceInURL(values, keys.size());
350:                        // If values still doesn't match key size, try and do
351:                        // further repair.
352:                        if (keys.size() != values.size()) {
353:                            // Early ARCs had a space in mimetype.
354:                            if (values.size() == (keys.size() + 1)
355:                                    && values.get(4).toLowerCase().startsWith(
356:                                            "charset=")) {
357:                                List<String> nuvalues = new ArrayList<String>(
358:                                        keys.size());
359:                                nuvalues.add(0, values.get(0));
360:                                nuvalues.add(1, values.get(1));
361:                                nuvalues.add(2, values.get(2));
362:                                nuvalues.add(3, values.get(3) + values.get(4));
363:                                nuvalues.add(4, values.get(5));
364:                                values = nuvalues;
365:                            } else if ((values.size() + 1) == keys.size()
366:                                    && isLegitimateIPValue(values.get(1))
367:                                    && isDate(values.get(2))
368:                                    && isNumber(values.get(3))) {
369:                                // Mimetype is empty.
370:                                List<String> nuvalues = new ArrayList<String>(
371:                                        keys.size());
372:                                nuvalues.add(0, values.get(0));
373:                                nuvalues.add(1, values.get(1));
374:                                nuvalues.add(2, values.get(2));
375:                                nuvalues.add(3, "-");
376:                                nuvalues.add(4, values.get(3));
377:                                values = nuvalues;
378:                            }
379:                        }
380:                    }
381:                    if (keys.size() != values.size()) {
382:                        throw new IOException("Size of field name keys does"
383:                                + " not match count of field values: " + values);
384:                    }
385:                    // Note that field was fixed on stderr.
386:                    logStdErr(Level.WARNING,
387:                            "Fixed spaces in metadata line at " + "offset "
388:                                    + offset + " Original: " + originalValues
389:                                    + ", New: " + values);
390:                }
391:
392:                Map<Object, Object> headerFields = new HashMap<Object, Object>(
393:                        keys.size() + 2);
394:                for (int i = 0; i < keys.size(); i++) {
395:                    headerFields.put(keys.get(i), values.get(i));
396:                }
397:
398:                // Add a check for tabs in URLs.  If any, replace with '%09'.
399:                // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
400:                // [ 1010966 ] crawl.log has URIs with spaces in them.
401:                String url = (String) headerFields.get(URL_FIELD_KEY);
402:                if (url != null && url.indexOf('\t') >= 0) {
403:                    headerFields.put(URL_FIELD_KEY, TextUtils.replaceAll("\t",
404:                            url, "%09"));
405:                }
406:
407:                headerFields.put(VERSION_FIELD_KEY, v);
408:                headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
409:
410:                return new ARCRecordMetaData(getReaderIdentifier(),
411:                        headerFields);
412:            }
413:
414:            protected boolean isDate(final String date) {
415:                if (date.length() != 14) {
416:                    return false;
417:                }
418:                return isNumber(date);
419:            }
420:
421:            protected boolean isNumber(final String n) {
422:                for (int i = 0; i < n.length(); i++) {
423:                    if (!Character.isDigit(n.charAt(i))) {
424:                        return false;
425:                    }
426:                }
427:                return true;
428:            }
429:
430:            protected boolean isLegitimateIPValue(final String ip) {
431:                if ("-".equals(ip)) {
432:                    return true;
433:                }
434:                Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
435:                return m != null && m.matches();
436:            }
437:
438:            /**
439:             * Fix space in URLs.
440:             * The ARCWriter used to write into the ARC URLs with spaces in them.
441:             * See <a
442:             * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
443:             * crawl.log has URIs with spaces in them</a>.
444:             * This method does fix up on such headers converting all spaces found
445:             * to '%20'.
446:             * @param values List of metadata values.
447:             * @param requiredSize Expected size of resultant values list.
448:             * @return New list if we successfully fixed up values or original if
449:             * fixup failed.
450:             */
451:            protected List<String> fixSpaceInURL(List<String> values,
452:                    int requiredSize) {
453:                // Do validity check. 3rd from last is a date of 14 numeric
454:                // characters. The 4th from last is IP, all before the IP
455:                // should be concatenated together with a '%20' joiner.
456:                // In the below, '4' is 4th field from end which has the IP.
457:                if (!(values.size() > requiredSize) || values.size() < 4) {
458:                    return values;
459:                }
460:                // Test 3rd field is valid date.
461:                if (!isDate((String) values.get(values.size() - 3))) {
462:                    return values;
463:                }
464:
465:                // Test 4th field is valid IP.
466:                if (!isLegitimateIPValue((String) values.get(values.size() - 4))) {
467:                    return values;
468:                }
469:
470:                List<String> newValues = new ArrayList<String>(requiredSize);
471:                StringBuffer url = new StringBuffer();
472:                for (int i = 0; i < (values.size() - 4); i++) {
473:                    if (i > 0) {
474:                        url.append("%20");
475:                    }
476:                    url.append(values.get(i));
477:                }
478:                newValues.add(url.toString());
479:                for (int i = values.size() - 4; i < values.size(); i++) {
480:                    newValues.add(values.get(i));
481:                }
482:                return newValues;
483:            }
484:
485:            protected boolean isAlignedOnFirstRecord() {
486:                return alignedOnFirstRecord;
487:            }
488:
489:            protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
490:                this .alignedOnFirstRecord = alignedOnFirstRecord;
491:            }
492:
493:            /**
494:             * @return Returns the parseHttpHeaders.
495:             */
496:            public boolean isParseHttpHeaders() {
497:                return this .parseHttpHeaders;
498:            }
499:
500:            /**
501:             * @param parse The parseHttpHeaders to set.
502:             */
503:            public void setParseHttpHeaders(boolean parse) {
504:                this .parseHttpHeaders = parse;
505:            }
506:
507:            public String getFileExtension() {
508:                return ARC_FILE_EXTENSION;
509:            }
510:
511:            public String getDotFileExtension() {
512:                return DOT_ARC_FILE_EXTENSION;
513:            }
514:
515:            protected boolean output(final String format) throws IOException,
516:                    java.text.ParseException {
517:                boolean result = super .output(format);
518:                if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
519:                    throw new IOException(format
520:                            + " format only supported for single Records");
521:                }
522:                return result;
523:            }
524:
525:            public boolean outputRecord(final String format) throws IOException {
526:                boolean result = super .outputRecord(format);
527:                if (result) {
528:                    return result;
529:                }
530:                if (format.equals(NOHEAD)) {
531:                    // No point digesting if dumping content.
532:                    setDigest(false);
533:                    ARCRecord r = (ARCRecord) get();
534:                    r.skipHttpHeader();
535:                    r.dump();
536:                    result = true;
537:                } else if (format.equals(HEADER)) {
538:                    // No point digesting if dumping content.
539:                    setDigest(false);
540:                    ARCRecord r = (ARCRecord) get();
541:                    r.dumpHttpHeader();
542:                    result = true;
543:                }
544:
545:                return result;
546:            }
547:
    /**
     * Dump this ARC to stdout as a new ARC, optionally compressed.
     *
     * The first record read supplies the ARC-file metadata used to open an
     * ARCWriter on System.out; every subsequent record is copied through it.
     *
     * @param compress True to gzip-compress the output ARC.
     * @throws IOException
     * @throws java.text.ParseException If a record's 14-digit date fails to
     * parse.
     */
    public void dump(final boolean compress) throws IOException,
            java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord) ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.  Buffer the whole filedesc record body
                // into memory so it can be passed as writer metadata.
                ByteArrayOutputStream baos = new ByteArrayOutputStream(
                        r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata
                        .add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                        new File(meta.getArc()), compress, meta
                                .getDate(), listOfMetadata);
                continue;
            }

            // Copy the record through: url, mimetype, ip, date as epoch
            // millis, declared length, and the record stream itself.
            writer.write(meta.getUrl(), meta.getMimetype(), meta
                    .getIp(), ArchiveUtils.parse14DigitDate(
                    meta.getDate()).getTime(), (int) meta.getLength(),
                    r);
        }
        // System.out.println(System.currentTimeMillis() - start);
    }
586:
            /**
             * Returns a reader that deletes a local file when closed.
             *
             * <p>The returned reader forwards every operation to this reader;
             * the only added behavior is in {@link #close()}, which first
             * closes this reader and then deletes <code>f</code> (if it still
             * exists).  Used when we bring Archive files local and need to
             * clean up the temporary copy afterward.
             *
             * @param f Local file to delete once the returned reader is
             * closed.  Deletion is attempted at most once; failures of
             * {@link File#delete()} are ignored.
             * @return an ArchiveReader that will delete a local file on close.
             */
            public ARCReader getDeleteFileOnCloseReader(final File f) {
                // Capture the enclosing reader so the anonymous subclass can
                // delegate to it (it is a sibling ARCReader, not a super).
                final ARCReader d = this ;
                return new ARCReader() {
                    private final ARCReader delegate = d;
                    // Nulled after the first close() so deletion happens once.
                    private File archiveFile = f;

                    public void close() throws IOException {
                        // Close the underlying reader before removing its
                        // backing file.
                        this .delegate.close();
                        if (this .archiveFile != null) {
                            if (archiveFile.exists()) {
                                archiveFile.delete();
                            }
                            this .archiveFile = null;
                        }
                    }

                    public ArchiveRecord get(long o) throws IOException {
                        return this .delegate.get(o);
                    }

                    public boolean isDigest() {
                        return this .delegate.isDigest();
                    }

                    public boolean isStrict() {
                        return this .delegate.isStrict();
                    }

                    public Iterator<ArchiveRecord> iterator() {
                        return this .delegate.iterator();
                    }

                    public void setDigest(boolean d) {
                        this .delegate.setDigest(d);
                    }

                    public void setStrict(boolean s) {
                        this .delegate.setStrict(s);
                    }

                    public List validate() throws IOException {
                        return this .delegate.validate();
                    }

                    @Override
                    public ArchiveRecord get() throws IOException {
                        return this .delegate.get();
                    }

                    @Override
                    public String getVersion() {
                        return this .delegate.getVersion();
                    }

                    @Override
                    public List validate(int noRecords) throws IOException {
                        return this .delegate.validate(noRecords);
                    }

                    @Override
                    protected ARCRecord createArchiveRecord(InputStream is,
                            long offset) throws IOException {
                        return this .delegate.createArchiveRecord(is, offset);
                    }

                    @Override
                    protected void gotoEOR(ArchiveRecord record)
                            throws IOException {
                        this .delegate.gotoEOR(record);
                    }

                    @Override
                    public void dump(boolean compress) throws IOException,
                            java.text.ParseException {
                        this .delegate.dump(compress);
                    }

                    @Override
                    public String getDotFileExtension() {
                        return this .delegate.getDotFileExtension();
                    }

                    @Override
                    public String getFileExtension() {
                        return this .delegate.getFileExtension();
                    }
                };
            }
679:
680:            // Static methods follow.
681:
682:            /**
683:             *
684:             * @param formatter Help formatter instance.
685:             * @param options Usage options.
686:             * @param exitCode Exit code.
687:             */
688:            private static void usage(HelpFormatter formatter, Options options,
689:                    int exitCode) {
690:                formatter
691:                        .printHelp(
692:                                "java org.archive.io.arc.ARCReader"
693:                                        + " [--digest=true|false] \\\n"
694:                                        + " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]"
695:                                        + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
696:                                options);
697:                System.exit(exitCode);
698:            }
699:
700:            /**
701:             * Write out the arcfile.
702:             * 
703:             * @param reader
704:             * @param format Format to use outputting.
705:             * @throws IOException
706:             * @throws java.text.ParseException
707:             */
708:            protected static void output(ARCReader reader, String format)
709:                    throws IOException, java.text.ParseException {
710:                if (!reader.output(format)) {
711:                    throw new IOException("Unsupported format: " + format);
712:                }
713:            }
714:
715:            /**
716:             * Generate a CDX index file for an ARC file.
717:             *
718:             * @param urlOrPath The ARC file to generate a CDX index for
719:             * @throws IOException
720:             * @throws java.text.ParseException
721:             */
722:            public static void createCDXIndexFile(String urlOrPath)
723:                    throws IOException, java.text.ParseException {
724:                ARCReader r = ARCReaderFactory.get(urlOrPath);
725:                r.setStrict(false);
726:                r.setParseHttpHeaders(true);
727:                r.setDigest(true);
728:                output(r, CDX_FILE);
729:            }
730:
731:            /**
732:             * Command-line interface to ARCReader.
733:             *
734:             * Here is the command-line interface:
735:             * <pre>
736:             * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
737:             *  -h,--help      Prints this message and exits.
738:             *  -o,--offset    Outputs record at this offset into arc file.</pre>
739:             *
740:             * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
741:             * take care of classpaths and the calling of ARCReader.
742:             *
743:             * <p>Outputs using a pseudo-CDX format as described here:
744:             * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
745:             * Legent</a> and here
746:             * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
747:             * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
748:             * Hash is hard-coded straight SHA-1 hash of content.
749:             *
750:             * @param args Command-line arguments.
751:             * @throws ParseException Failed parse of the command line.
752:             * @throws IOException
753:             * @throws java.text.ParseException
754:             */
755:            public static void main(String[] args) throws ParseException,
756:                    IOException, java.text.ParseException {
757:                Options options = getOptions();
758:                options.addOption(new Option("p", "parse", false,
759:                        "Parse headers."));
760:                PosixParser parser = new PosixParser();
761:                CommandLine cmdline = parser.parse(options, args, false);
762:                List cmdlineArgs = cmdline.getArgList();
763:                Option[] cmdlineOptions = cmdline.getOptions();
764:                HelpFormatter formatter = new HelpFormatter();
765:
766:                // If no args, print help.
767:                if (cmdlineArgs.size() <= 0) {
768:                    usage(formatter, options, 0);
769:                }
770:
771:                // Now look at options passed.
772:                long offset = -1;
773:                boolean digest = false;
774:                boolean strict = false;
775:                boolean parse = false;
776:                String format = CDX;
777:                for (int i = 0; i < cmdlineOptions.length; i++) {
778:                    switch (cmdlineOptions[i].getId()) {
779:                    case 'h':
780:                        usage(formatter, options, 0);
781:                        break;
782:
783:                    case 'o':
784:                        offset = Long.parseLong(cmdlineOptions[i].getValue());
785:                        break;
786:
787:                    case 's':
788:                        strict = true;
789:                        break;
790:
791:                    case 'p':
792:                        parse = true;
793:                        break;
794:
795:                    case 'd':
796:                        digest = getTrueOrFalse(cmdlineOptions[i].getValue());
797:                        break;
798:
799:                    case 'f':
800:                        format = cmdlineOptions[i].getValue().toLowerCase();
801:                        boolean match = false;
802:                        // List of supported formats.
803:                        final String[] supportedFormats = { CDX, DUMP,
804:                                GZIP_DUMP, HEADER, NOHEAD, CDX_FILE };
805:                        for (int ii = 0; ii < supportedFormats.length; ii++) {
806:                            if (supportedFormats[ii].equals(format)) {
807:                                match = true;
808:                                break;
809:                            }
810:                        }
811:                        if (!match) {
812:                            usage(formatter, options, 1);
813:                        }
814:                        break;
815:
816:                    default:
817:                        throw new RuntimeException("Unexpected option: "
818:                                + +cmdlineOptions[i].getId());
819:                    }
820:                }
821:
822:                if (offset >= 0) {
823:                    if (cmdlineArgs.size() != 1) {
824:                        System.out.println("Error: Pass one arcfile only.");
825:                        usage(formatter, options, 1);
826:                    }
827:                    ARCReader arc = ARCReaderFactory.get((String) cmdlineArgs
828:                            .get(0), offset);
829:                    arc.setStrict(strict);
830:                    // We must parse headers if we need to skip them.
831:                    if (format.equals(NOHEAD) || format.equals(HEADER)) {
832:                        parse = true;
833:                    }
834:                    arc.setParseHttpHeaders(parse);
835:                    outputRecord(arc, format);
836:                } else {
837:                    for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
838:                        String urlOrPath = (String) i.next();
839:                        try {
840:                            ARCReader r = ARCReaderFactory.get(urlOrPath);
841:                            r.setStrict(strict);
842:                            r.setParseHttpHeaders(parse);
843:                            r.setDigest(digest);
844:                            output(r, format);
845:                        } catch (RuntimeException e) {
846:                            // Write out name of file we failed on to help with
847:                            // debugging.  Then print stack trace and try to keep
848:                            // going.  We do this for case where we're being fed
849:                            // a bunch of ARCs; just note the bad one and move
850:                            // on to the next.
851:                            System.err.println("Exception processing "
852:                                    + urlOrPath + ": " + e.getMessage());
853:                            e.printStackTrace(System.err);
854:                            System.exit(1);
855:                        }
856:                    }
857:                }
858:            }
859:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.