001: /* PersistProcessor.java
002: *
003: * Created on Feb 17, 2005
004: *
005: * Copyright (C) 2007 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.processor.recrawl;
024:
025: import java.io.BufferedReader;
026: import java.io.File;
027: import java.io.FileNotFoundException;
028: import java.io.FileReader;
029: import java.io.IOException;
030: import java.io.UnsupportedEncodingException;
031: import java.util.Iterator;
032: import java.util.Map.Entry;
033:
034: import org.apache.commons.codec.binary.Base64;
035: import org.archive.crawler.datamodel.CrawlURI;
036: import org.archive.crawler.framework.Processor;
037: import org.archive.crawler.io.CrawlerJournal;
038: import org.archive.util.IoUtils;
039: import org.archive.util.SURT;
040: import org.archive.util.bdbje.EnhancedEnvironment;
041: import org.archive.util.iterator.LineReadingIterator;
042:
043: import st.ata.util.AList;
044:
045: import com.sleepycat.bind.serial.SerialBinding;
046: import com.sleepycat.bind.serial.StoredClassCatalog;
047: import com.sleepycat.bind.tuple.StringBinding;
048: import com.sleepycat.collections.StoredIterator;
049: import com.sleepycat.collections.StoredSortedMap;
050: import com.sleepycat.je.Database;
051: import com.sleepycat.je.DatabaseConfig;
052: import com.sleepycat.je.DatabaseException;
053: import com.sleepycat.je.EnvironmentConfig;
054:
055: /**
056: * Superclass for Processors which utilize BDB-JE for URI state
057: * (including most notably history) persistence.
058: *
059: * @author gojomo
060: */
061: public abstract class PersistProcessor extends Processor {
062: /** name of history Database */
063: public static final String URI_HISTORY_DBNAME = "uri_history";
064:
065: /**
066: * @return DatabaseConfig for history Database
067: */
068: protected static DatabaseConfig historyDatabaseConfig() {
069: DatabaseConfig dbConfig = new DatabaseConfig();
070: dbConfig.setTransactional(false);
071: dbConfig.setAllowCreate(true);
072: dbConfig.setDeferredWrite(true);
073: return dbConfig;
074: }
075:
076: /**
077: * Usual constructor
078: *
079: * @param name
080: * @param string
081: */
082: public PersistProcessor(String name, String string) {
083: super (name, string);
084: }
085:
086: /**
087: * Return a preferred String key for persisting the given CrawlURI's
088: * AList state.
089: *
090: * @param curi CrawlURI
091: * @return String key
092: */
093: public String persistKeyFor(CrawlURI curi) {
094: // use a case-sensitive SURT for uniqueness and sorting benefits
095: return SURT.fromURI(curi.getUURI().toString(), true);
096: }
097:
098: /**
099: * Whether the current CrawlURI's state should be persisted (to log or
100: * direct to database)
101: *
102: * @param curi CrawlURI
103: * @return true if state should be stored; false to skip persistence
104: */
105: protected boolean shouldStore(CrawlURI curi) {
106: // TODO: don't store some codes, such as 304 unchanged?
107: return curi.isSuccess();
108: }
109:
110: /**
111: * Whether the current CrawlURI's state should be loaded
112: *
113: * @param curi CrawlURI
114: * @return true if state should be loaded; false to skip loading
115: */
116: protected boolean shouldLoad(CrawlURI curi) {
117: // TODO: don't load some (prereqs?)
118: return true;
119: }
120:
121: /**
122: * Utility main for importing a log into a BDB-JE environment or moving a
123: * database between environments (2 arguments), or simply dumping a log
124: * to stdout in a more readable format (1 argument).
125: *
126: * @param args command-line arguments
127: * @throws DatabaseException
128: * @throws IOException
129: */
130: public static void main(String[] args) throws DatabaseException,
131: IOException {
132: if (args.length == 2) {
133: main2args(args);
134: } else if (args.length == 1) {
135: main1arg(args);
136: } else {
137: System.out.println("Arguments: ");
138: System.out.println(" source [target]");
139: System.out
140: .println("...where source is either a txtser log file or BDB env dir");
141: System.out
142: .println("and target, if present, is a BDB env dir. ");
143: return;
144: }
145:
146: }
147:
148: /**
149: * Move the history information in the first argument (either the path
150: * to a log or to an environment containing a uri_history database) to
151: * the environment in the second environment (path; environment will
152: * be created if it dow not already exist).
153: *
154: * @param args command-line arguments
155: * @throws DatabaseException
156: * @throws FileNotFoundException
157: * @throws UnsupportedEncodingException
158: * @throws IOException
159: */
160: private static void main2args(String[] args)
161: throws DatabaseException, FileNotFoundException,
162: UnsupportedEncodingException, IOException {
163: File source = new File(args[0]);
164: File env = new File(args[1]);
165: if (!env.exists()) {
166: env.mkdirs();
167: }
168:
169: // setup target environment
170: EnhancedEnvironment targetEnv = setupEnvironment(env);
171: StoredClassCatalog classCatalog = targetEnv.getClassCatalog();
172: Database historyDB = targetEnv.openDatabase(null,
173: URI_HISTORY_DBNAME, historyDatabaseConfig());
174: StoredSortedMap historyMap = new StoredSortedMap(historyDB,
175: new StringBinding(), new SerialBinding(classCatalog,
176: AList.class), true);
177:
178: int count = 0;
179:
180: if (source.isFile()) {
181: // scan log, writing to database
182: BufferedReader br = CrawlerJournal
183: .getBufferedReader(source);
184: Iterator iter = new LineReadingIterator(br);
185: while (iter.hasNext()) {
186: String line = (String) iter.next();
187: String[] splits = line.split(" ");
188: historyMap.put(splits[0], IoUtils
189: .deserializeFromByteArray(Base64
190: .decodeBase64(splits[1]
191: .getBytes("UTF8"))));
192: count++;
193: }
194: br.close();
195: } else {
196: // open the source env history DB, copying entries to target env
197: EnhancedEnvironment sourceEnv = setupEnvironment(source);
198: StoredClassCatalog sourceClassCatalog = sourceEnv
199: .getClassCatalog();
200: Database sourceHistoryDB = sourceEnv.openDatabase(null,
201: URI_HISTORY_DBNAME, historyDatabaseConfig());
202: StoredSortedMap sourceHistoryMap = new StoredSortedMap(
203: sourceHistoryDB, new StringBinding(),
204: new SerialBinding(sourceClassCatalog, AList.class),
205: true);
206: Iterator iter = sourceHistoryMap.entrySet().iterator();
207: while (iter.hasNext()) {
208: Entry item = (Entry) iter.next();
209: historyMap.put(item.getKey(), item.getValue());
210: count++;
211: }
212: StoredIterator.close(iter);
213: sourceHistoryDB.close();
214: sourceEnv.close();
215: }
216:
217: // cleanup
218: historyDB.sync();
219: historyDB.close();
220: targetEnv.close();
221: System.out.println(count + " records imported from " + source
222: + " to BDB env " + env);
223: }
224:
225: /**
226: * Dump the contents of the argument (path to a persist log) to stdout
227: * in a slightly more readable format.
228: *
229: * @param args command-line arguments
230: * @throws DatabaseException
231: * @throws FileNotFoundException
232: * @throws UnsupportedEncodingException
233: * @throws IOException
234: */
235: private static void main1arg(String[] args)
236: throws DatabaseException, FileNotFoundException,
237: UnsupportedEncodingException, IOException {
238: File source = new File(args[0]);
239:
240: int count = 0;
241:
242: if (source.isFile()) {
243: // scan log, writing to database
244: BufferedReader br = CrawlerJournal
245: .getBufferedReader(source);
246: Iterator iter = new LineReadingIterator(br);
247: while (iter.hasNext()) {
248: String line = (String) iter.next();
249: String[] splits = line.split(" ");
250: AList alist = (AList) IoUtils
251: .deserializeFromByteArray(Base64
252: .decodeBase64(splits[1]
253: .getBytes("UTF8")));
254: System.out.println(splits[0] + " "
255: + alist.toPrettyString());
256: count++;
257: }
258: br.close();
259: } else {
260: // open the source env history DB, copying entries to target env
261: EnhancedEnvironment sourceEnv = setupEnvironment(source);
262: StoredClassCatalog sourceClassCatalog = sourceEnv
263: .getClassCatalog();
264: Database sourceHistoryDB = sourceEnv.openDatabase(null,
265: URI_HISTORY_DBNAME, historyDatabaseConfig());
266: StoredSortedMap sourceHistoryMap = new StoredSortedMap(
267: sourceHistoryDB, new StringBinding(),
268: new SerialBinding(sourceClassCatalog, AList.class),
269: true);
270: Iterator iter = sourceHistoryMap.entrySet().iterator();
271: while (iter.hasNext()) {
272: Entry item = (Entry) iter.next();
273: AList alist = (AList) item.getValue();
274: System.out.println(item.getKey() + " "
275: + alist.toPrettyString());
276: count++;
277: }
278: StoredIterator.close(iter);
279: sourceHistoryDB.close();
280: sourceEnv.close();
281: }
282:
283: System.out.println(count + " records dumped from " + source);
284: }
285:
286: private static EnhancedEnvironment setupEnvironment(File env)
287: throws DatabaseException {
288: EnvironmentConfig envConfig = new EnvironmentConfig();
289: envConfig.setAllowCreate(true);
290: return new EnhancedEnvironment(env, envConfig);
291: }
292: }
|