001: /* PersistLogProcessor.java
002: *
003: * Created on Feb 18, 2005
004: *
005: * Copyright (C) 2007 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.processor.recrawl;
024:
025: import java.io.File;
026: import java.io.FileNotFoundException;
027: import java.io.IOException;
028: import java.io.PrintStream;
029: import java.util.concurrent.atomic.AtomicInteger;
030:
031: import org.apache.commons.codec.binary.Base64;
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.event.CrawlStatusListener;
034: import org.archive.crawler.io.CrawlerJournal;
035: import org.archive.crawler.settings.SimpleType;
036: import org.archive.util.FileUtils;
037: import org.archive.util.IoUtils;
038:
039: /**
040: * Log CrawlURI attributes from latest fetch for consultation by a later
041: * recrawl. Log must be imported into alternate data structure in order
042: * to be consulted.
043: *
044: * @author gojomo
045: * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
046: */
047: public class PersistLogProcessor extends PersistProcessor implements
048: CrawlStatusListener {
049: private static final long serialVersionUID = 1678691994065439346L;
050:
051: protected CrawlerJournal log;
052:
053: /** setting for log filename */
054: public static final String ATTR_LOG_FILENAME = "log-filename";
055: /** default log filename */
056: public static final String DEFAULT_LOG_FILENAME = "persistlog.txtser.gz";
057:
058: /**
059: * Usual constructor
060: *
061: * @param name
062: */
063: public PersistLogProcessor(String name) {
064: super (
065: name,
066: "PersistLogProcessor. Logs CrawlURI attributes "
067: + "from latest fetch for consultation by a later recrawl.");
068:
069: addElementToDefinition(new SimpleType(
070: ATTR_LOG_FILENAME,
071: "Filename to which to log URI persistence information. "
072: + "Interpreted relative to job logs directory. "
073: + "Default is 'persistlog.txtser.gz'. ",
074: DEFAULT_LOG_FILENAME));
075: }
076:
077: @Override
078: protected void initialTasks() {
079: // Add this class to crawl state listeners to note checkpoints
080: getController().addCrawlStatusListener(this );
081: try {
082: File logFile = FileUtils.maybeRelative(getController()
083: .getLogsDir(), (String) getUncheckedAttribute(null,
084: ATTR_LOG_FILENAME));
085: log = new CrawlerJournal(logFile);
086: } catch (IOException e) {
087: // TODO Auto-generated catch block
088: throw new RuntimeException(e);
089: }
090: }
091:
092: @Override
093: protected void finalTasks() {
094: log.close();
095: }
096:
097: @Override
098: protected void innerProcess(CrawlURI curi) {
099: if (shouldStore(curi)) {
100: log.writeLine(persistKeyFor(curi), " ", new String(Base64
101: .encodeBase64(IoUtils.serializeToByteArray(curi
102: .getPersistentAList()))));
103: }
104: }
105:
106: public void crawlCheckpoint(File checkpointDir) throws Exception {
107: // rotate log
108: log.checkpoint(checkpointDir);
109: }
110:
111: public void crawlEnded(String sExitMessage) {
112: // ignored
113:
114: }
115:
116: public void crawlEnding(String sExitMessage) {
117: // ignored
118:
119: }
120:
121: public void crawlPaused(String statusMessage) {
122: // ignored
123:
124: }
125:
126: public void crawlPausing(String statusMessage) {
127: // ignored
128:
129: }
130:
131: public void crawlResuming(String statusMessage) {
132: // ignored
133:
134: }
135:
136: public void crawlStarted(String message) {
137: // ignored
138: }
139: }
|