001: /*
002: * ARCWriter
003: *
004: * $Id: ARCWriterProcessor.java 5031 2007-04-02 19:03:04Z gojomo $
005: *
006: * Created on Jun 5, 2003
007: *
008: * Copyright (C) 2003 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.crawler.writer;
027:
028: import java.io.File;
029: import java.io.FileInputStream;
030: import java.io.FileNotFoundException;
031: import java.io.IOException;
032: import java.io.InputStream;
033: import java.io.StringWriter;
034: import java.net.InetAddress;
035: import java.net.UnknownHostException;
036: import java.util.ArrayList;
037: import java.util.List;
038: import java.util.concurrent.atomic.AtomicInteger;
039: import java.util.logging.Level;
040: import java.util.logging.Logger;
041:
042: import javax.xml.transform.SourceLocator;
043: import javax.xml.transform.Templates;
044: import javax.xml.transform.Transformer;
045: import javax.xml.transform.TransformerConfigurationException;
046: import javax.xml.transform.TransformerException;
047: import javax.xml.transform.TransformerFactory;
048: import javax.xml.transform.stream.StreamResult;
049: import javax.xml.transform.stream.StreamSource;
050:
051: import org.archive.crawler.Heritrix;
052: import org.archive.crawler.datamodel.CoreAttributeConstants;
053: import org.archive.crawler.datamodel.CrawlURI;
054: import org.archive.crawler.datamodel.FetchStatusCodes;
055: import org.archive.crawler.event.CrawlStatusListener;
056: import org.archive.crawler.framework.WriterPoolProcessor;
057: import org.archive.crawler.settings.XMLSettingsHandler;
058: import org.archive.io.ReplayInputStream;
059: import org.archive.io.WriterPoolMember;
060: import org.archive.io.WriterPoolSettings;
061: import org.archive.io.arc.ARCConstants;
062: import org.archive.io.arc.ARCWriter;
063: import org.archive.io.arc.ARCWriterPool;
064:
065: /**
066: * Processor module for writing the results of successful fetches (and
067: * perhaps someday, certain kinds of network failures) to the Internet Archive
068: * ARC file format.
069: *
070: * Assumption is that there is only one of these ARCWriterProcessors per
071: * Heritrix instance.
072: *
073: * @author Parker Thompson
074: */
075: public class ARCWriterProcessor extends WriterPoolProcessor implements
076: CoreAttributeConstants, ARCConstants, CrawlStatusListener,
077: WriterPoolSettings, FetchStatusCodes {
078: private static final long serialVersionUID = 1957518408532644531L;
079:
080: private final Logger logger = Logger.getLogger(this .getClass()
081: .getName());
082:
083: /**
084: * Default path list.
085: */
086: private static final String[] DEFAULT_PATH = { "arcs" };
087:
088: /**
089: * @param name Name of this writer.
090: */
091: public ARCWriterProcessor(String name) {
092: super (name, "ARCWriter processor");
093: }
094:
095: protected String[] getDefaultPath() {
096: return DEFAULT_PATH;
097: }
098:
099: protected void setupPool(final AtomicInteger serialNo) {
100: setPool(new ARCWriterPool(serialNo, this ,
101: getPoolMaximumActive(), getPoolMaximumWait()));
102: }
103:
104: /**
105: * Writes a CrawlURI and its associated data to store file.
106: *
107: * Currently this method understands the following uri types: dns, http,
108: * and https.
109: *
110: * @param curi CrawlURI to process.
111: */
112: protected void innerProcess(CrawlURI curi) {
113: // If failure, or we haven't fetched the resource yet, return
114: if (curi.getFetchStatus() <= 0) {
115: return;
116: }
117:
118: // If no recorded content at all, don't write record.
119: long recordLength = curi.getContentSize();
120: if (recordLength <= 0) {
121: // getContentSize() should be > 0 if any material (even just
122: // HTTP headers with zero-length body) is available.
123: return;
124: }
125:
126: try {
127: if (shouldWrite(curi)) {
128: InputStream is = curi.getHttpRecorder()
129: .getRecordedInput().getReplayInputStream();
130: write(curi, recordLength, is, getHostAddress(curi));
131: } else {
132: logger.info("does not write " + curi.toString());
133: }
134: } catch (IOException e) {
135: curi.addLocalizedError(this .getName(), e, "WriteRecord: "
136: + curi.toString());
137: logger.log(Level.SEVERE, "Failed write of Record: "
138: + curi.toString(), e);
139: }
140: }
141:
142: protected void write(CrawlURI curi, long recordLength,
143: InputStream in, String ip) throws IOException {
144: WriterPoolMember writer = getPool().borrowFile();
145: long position = writer.getPosition();
146: // See if we need to open a new file because we've exceeed maxBytes.
147: // Call to checkFileSize will open new file if we're at maximum for
148: // current file.
149: writer.checkSize();
150: if (writer.getPosition() != position) {
151: // We just closed the file because it was larger than maxBytes.
152: // Add to the totalBytesWritten the size of the first record
153: // in the file, if any.
154: setTotalBytesWritten(getTotalBytesWritten()
155: + (writer.getPosition() - position));
156: position = writer.getPosition();
157: }
158:
159: ARCWriter w = (ARCWriter) writer;
160: try {
161: if (in instanceof ReplayInputStream) {
162: w.write(curi.toString(), curi.getContentType(), ip,
163: curi.getLong(A_FETCH_BEGAN_TIME), recordLength,
164: (ReplayInputStream) in);
165: } else {
166: w.write(curi.toString(), curi.getContentType(), ip,
167: curi.getLong(A_FETCH_BEGAN_TIME), recordLength,
168: in);
169: }
170: } catch (IOException e) {
171: // Invalidate this file (It gets a '.invalid' suffix).
172: getPool().invalidateFile(writer);
173: // Set the writer to null otherwise the pool accounting
174: // of how many active writers gets skewed if we subsequently
175: // do a returnWriter call on this object in the finally block.
176: writer = null;
177: throw e;
178: } finally {
179: if (writer != null) {
180: setTotalBytesWritten(getTotalBytesWritten()
181: + (writer.getPosition() - position));
182: getPool().returnFile(writer);
183: }
184: }
185: checkBytesWritten();
186: }
187:
188: @Override
189: protected String getFirstrecordStylesheet() {
190: return "/arcMetaheaderBody.xsl";
191: }
192: }
|