001: /* CrawlerJournal.java
002: *
003: * Created on Mar 6, 2007
004: *
005: * Copyright (C) 2007 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.io;
024:
025: import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
026: import it.unimi.dsi.mg4j.util.MutableString;
027:
028: import java.io.BufferedInputStream;
029: import java.io.BufferedReader;
030: import java.io.File;
031: import java.io.FileInputStream;
032: import java.io.FileNotFoundException;
033: import java.io.FileOutputStream;
034: import java.io.IOException;
035: import java.io.InputStreamReader;
036: import java.io.OutputStreamWriter;
037: import java.io.Writer;
038: import java.util.zip.GZIPInputStream;
039: import java.util.zip.GZIPOutputStream;
040:
041: import org.archive.util.ArchiveUtils;
042:
043: /**
044: * Utility class for a crawler journal/log that is compressed and
045: * rotates by serial number at checkpoints.
046: *
047: * @author gojomo
048: */
049: public class CrawlerJournal {
050:
051: /** prefix for error lines*/
052: public static final String LOG_ERROR = "E ";
053: /** prefix for timestamp lines */
054: public static final String LOG_TIMESTAMP = "T ";
055:
056: /**
057: * Get a BufferedReader on the crawler journal given
058: *
059: * @param source File journal
060: * @return journal buffered reader.
061: * @throws IOException
062: */
063: public static BufferedReader getBufferedReader(File source)
064: throws IOException {
065: boolean isGzipped = source.getName().toLowerCase().endsWith(
066: GZIP_SUFFIX);
067: FileInputStream fis = new FileInputStream(source);
068: return new BufferedReader(isGzipped ? new InputStreamReader(
069: new GZIPInputStream(fis)) : new InputStreamReader(fis));
070: }
071:
072: /**
073: * Get a BufferedInputStream on the recovery file given.
074: *
075: * @param source file to open
076: * @return journal buffered input stream.
077: * @throws IOException
078: */
079: public static BufferedInputStream getBufferedInput(File source)
080: throws IOException {
081: boolean isGzipped = source.getName().toLowerCase().endsWith(
082: GZIP_SUFFIX);
083: FileInputStream fis = new FileInputStream(source);
084: return isGzipped ? new BufferedInputStream(new GZIPInputStream(
085: fis)) : new BufferedInputStream(fis);
086: }
087:
088: /**
089: * Stream on which we record frontier events.
090: */
091: protected Writer out = null;
092:
093: /** line count */
094: protected long lines = 0;
095: /** number of lines between timestamps */
096: protected int timestamp_interval = 0; // 0 means no timestamps
097:
098: /** suffix to recognize gzipped files */
099: public static final String GZIP_SUFFIX = ".gz";
100:
101: /**
102: * File we're writing journal to.
103: * Keep a reference in case we want to rotate it off.
104: */
105: protected File gzipFile = null;
106:
107: /**
108: * Create a new crawler journal at the given location
109: *
110: * @param path Directory to make thejournal in.
111: * @param filename Name to use for journal file.
112: * @throws IOException
113: */
114: public CrawlerJournal(String path, String filename)
115: throws IOException {
116: this .gzipFile = new File(path, filename);
117: this .out = initialize(gzipFile);
118: }
119:
120: /**
121: * Create a new crawler journal at the given location
122: *
123: * @param file path at which to make journal
124: * @throws IOException
125: */
126: public CrawlerJournal(File file) throws IOException {
127: this .gzipFile = file;
128: this .out = initialize(gzipFile);
129: }
130:
131: /**
132: * Allocate a buffer for accumulating lines to write and reuse it.
133: */
134: protected MutableString accumulatingBuffer = new MutableString(1024);
135:
136: protected Writer initialize(final File f)
137: throws FileNotFoundException, IOException {
138: return new OutputStreamWriter(new GZIPOutputStream(
139: new FastBufferedOutputStream(new FileOutputStream(f))));
140: }
141:
142: /**
143: * Write a line
144: *
145: * @param string String
146: */
147: public synchronized void writeLine(String string) {
148: try {
149: this .out.write("\n");
150: this .out.write(string);
151: noteLine();
152: } catch (IOException e) {
153: e.printStackTrace();
154: }
155: }
156:
157: /**
158: * Write a line of two strings
159: *
160: * @param s1 String
161: * @param s2 String
162: */
163: public synchronized void writeLine(String s1, String s2) {
164: try {
165: this .out.write("\n");
166: this .out.write(s1);
167: this .out.write(s2);
168: noteLine();
169: } catch (IOException e) {
170: e.printStackTrace();
171: }
172: }
173:
174: /**
175: * Write a line of three strings
176: *
177: * @param s1 String
178: * @param s2 String
179: * @param s3 String
180: */
181: public synchronized void writeLine(String s1, String s2, String s3) {
182: try {
183: this .out.write("\n");
184: this .out.write(s1);
185: this .out.write(s2);
186: this .out.write(s3);
187: noteLine();
188: } catch (IOException e) {
189: e.printStackTrace();
190: }
191: }
192:
193: /**
194: * Write a line.
195: *
196: * @param mstring MutableString to write
197: */
198: public synchronized void writeLine(MutableString mstring) {
199: if (this .out == null) {
200: return;
201: }
202: try {
203: this .out.write("\n");
204: mstring.write(out);
205: noteLine();
206: } catch (IOException e) {
207: e.printStackTrace();
208: }
209: }
210:
211: /**
212: * Count and note a line
213: *
214: * @throws IOException
215: */
216: protected void noteLine() throws IOException {
217: lines++;
218: considerTimestamp();
219: }
220:
221: /**
222: * Write a timestamp line if appropriate
223: *
224: * @throws IOException
225: */
226: protected void considerTimestamp() throws IOException {
227: if (timestamp_interval > 0 && lines % timestamp_interval == 0) {
228: out.write("\n");
229: out.write(LOG_TIMESTAMP);
230: out.write(ArchiveUtils.getLog14Date());
231: }
232: }
233:
234: /**
235: * Flush and close the underlying IO objects.
236: */
237: public void close() {
238: if (this .out == null) {
239: return;
240: }
241: try {
242: this .out.flush();
243: this .out.close();
244: this .out = null;
245: } catch (IOException e) {
246: e.printStackTrace();
247: }
248: }
249:
250: /**
251: * Note a serious error vioa a special log line
252: *
253: * @param err
254: */
255: public void seriousError(String err) {
256: writeLine("\n" + LOG_ERROR + ArchiveUtils.getLog14Date() + " "
257: + err);
258: }
259:
260: /**
261: * Handle a checkpoint by rotating the current log to a checkpoint-named
262: * file and starting a new log.
263: *
264: * @param checkpointDir
265: * @throws IOException
266: */
267: public synchronized void checkpoint(final File checkpointDir)
268: throws IOException {
269: if (this .out == null || !this .gzipFile.exists()) {
270: return;
271: }
272: close();
273: // Rename gzipFile with the checkpoint name as suffix.
274: this .gzipFile
275: .renameTo(new File(this .gzipFile.getParentFile(),
276: this .gzipFile.getName() + "."
277: + checkpointDir.getName()));
278: // Open new gzip file.
279: this.out = initialize(this.gzipFile);
280: }
281:
282: }
|