/* RecoveryLogMapper.java
 *
 * $Id: RecoveryLogMapper.java 4647 2006-09-22 18:39:39Z paul_jack $
 *
 * Created on Mar 7, 2005
 *
 * Copyright (C) 2005 Mike Schwartz.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.util;

import org.archive.crawler.frontier.RecoveryJournal;

import java.io.File;
import java.io.FileOutputStream;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Parses a Heritrix recovery log file (recover.gz) and builds maps that
 * allow a caller to look up any seed URL and get back an Iterator of all
 * URLs successfully crawled from that seed.
 *
 * Also allows lookup on any crawled URL to find the seed URL from which
 * the crawler reached that URL (through one or more discovered URL hops,
 * which are collapsed in this lookup).
 *
 * <p>This code creates some fairly large collections (proportionate in size
 * to the number of discovered URLs), so make sure you allocate it a large
 * heap to work in. It also takes a while to process a recover log.
 * <p>See the {@link #main(String[])} method at the end for test/demo code.
 * @author Mike Schwartz, schwartz at CodeOnTheRoad dot com
 */
public class RecoveryLogMapper {
    private static final char LOG_LINE_START_CHAR =
            RecoveryJournal.F_ADD.charAt(0);
    private static final Logger logger =
            Logger.getLogger(RecoveryLogMapper.class.getName());

    private PrintWriter seedNotFoundPrintWriter = null;

    /**
     * Tracks the seed for each crawled URL.
     */
    private Map<String, String> crawledUrlToSeedMap =
            new HashMap<String, String>();

    /**
     * Maps each seed URL to the Set of URLs discovered from it.
     */
    private Map<String, Set<String>> seedUrlToDiscoveredUrlsMap =
            new HashMap<String, Set<String>>();

    /**
     * Tracks which URLs were successfully crawled.
     */
    private Set<String> successfullyCrawledUrls = new HashSet<String>();

    /**
     * Normal constructor. If a not-found seed is encountered while loading
     * recoverLogFileName, throws SeedUrlNotFoundException. Use
     * {@link #RecoveryLogMapper(String, String)} if you want to just log
     * such cases and keep going. (Those should not happen if the recover
     * log is written correctly, but we see them in practice.)
     * @param recoverLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        load(recoverLogFileName);
    }

    /**
     * Constructor to use if you want to allow not-found seeds, logging
     * them to seedNotFoundLogFileName. In contrast, {@link
     * #RecoveryLogMapper(String)} will throw SeedUrlNotFoundException
     * when a seed isn't found.
     * @param recoverLogFileName
     * @param seedNotFoundLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName,
            String seedNotFoundLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        seedNotFoundPrintWriter = new PrintWriter(new FileOutputStream(
                seedNotFoundLogFileName));
        load(recoverLogFileName);
    }
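
    // A quick usage sketch of the two constructors (the file names here are
    // hypothetical, not part of this API):
    //
    //   // Strict: throws SeedUrlNotFoundException on a dangling via URL.
    //   RecoveryLogMapper strict = new RecoveryLogMapper("recover.gz");
    //
    //   // Lenient: logs dangling via URLs to the second file and continues.
    //   RecoveryLogMapper lenient =
    //       new RecoveryLogMapper("recover.gz", "seed-not-found.log");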

    protected void load(String recoverLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        LineNumberReader reader = new LineNumberReader(RecoveryJournal
                .getBufferedReader(new File(recoverLogFileName)));
        String curLine = null;
        while ((curLine = reader.readLine()) != null) {
            if (curLine.length() == 0
                    || curLine.charAt(0) != LOG_LINE_START_CHAR) {
                continue;
            }
            String[] args = curLine.split("\\s+");
            int curLineNumWords = args.length;
            if (curLineNumWords < 2) {
                // Malformed line: no URL after the action token.
                continue;
            }
            String firstUrl = args[1];
            // Ignore DNS log entries
            if (firstUrl.startsWith("dns:")) {
                continue;
            }
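
            // The journal lines handled below look roughly like this sketch
            // (example URLs are hypothetical; "F+" is RecoveryJournal.F_ADD,
            // and "Fs" is assumed here for RecoveryJournal.F_SUCCESS):
            //
            //   F+ http://example.com/                        seed (2 words)
            //   F+ http://example.com/a ... http://example.com/
            //                                (discovered URL; via URL last)
            //   Fs http://example.com/a                       fetch success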
            if (curLine.startsWith(RecoveryJournal.F_ADD)) {
                // Seed URL
                if (curLineNumWords == 2) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 2 words --> seed URL ("
                                + firstUrl + ")");
                    }
                    // Add seed the first time we find it
                    if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
                        seedUrlToDiscoveredUrlsMap.put(firstUrl,
                                new HashSet<String>());
                    }
                } else {
                    // URL found via an earlier seeded / discovered URL.
                    // Look for the seed from which firstUrl came, so we
                    // can collapse the new URL back to it.
                    String viaUrl = args[curLineNumWords - 1];
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 3+ words --> new URL "
                                + firstUrl + " via URL " + viaUrl);
                    }
                    String seedForFirstUrl = crawledUrlToSeedMap.get(viaUrl);
                    if (seedForFirstUrl == null) {
                        // viaUrl is itself a seed URL
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL is a seed");
                        }
                        crawledUrlToSeedMap.put(firstUrl, viaUrl);
                        seedForFirstUrl = viaUrl;
                    } else {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL discovered via seed URL "
                                    + seedForFirstUrl);
                        }
                        // Collapse
                        crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
                    }
                    Set<String> theSeedUrlSet = seedUrlToDiscoveredUrlsMap
                            .get(seedForFirstUrl);
                    if (theSeedUrlSet == null) {
                        String message = "recover log "
                                + recoverLogFileName + " at line "
                                + reader.getLineNumber()
                                + " listed F+ URL (" + viaUrl
                                + ") for which no seed list was found.";
                        if (seedNotFoundPrintWriter != null) {
                            seedNotFoundPrintWriter.println(message);
                        } else {
                            throw new SeedUrlNotFoundException(message);
                        }
                    } else {
                        theSeedUrlSet.add(firstUrl);
                    }
                }
            } else if (curLine.startsWith(RecoveryJournal.F_SUCCESS)) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("F_SUCCESS for URL " + firstUrl);
                }
                successfullyCrawledUrls.add(firstUrl);
            }
        }
        reader.close();
        if (seedNotFoundPrintWriter != null) {
            seedNotFoundPrintWriter.close();
        }
    }

    /**
     * Returns the seed URL from which urlString was reached: urlString
     * itself if it is a seed, otherwise the seed it collapses to (null if
     * no seed is found).
     * @param urlString
     * @return Seed.
     */
    public String getSeedForUrl(String urlString) {
        return seedUrlToDiscoveredUrlsMap.containsKey(urlString) ? urlString
                : crawledUrlToSeedMap.get(urlString);
    }
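
    // Example of the collapse semantics (hypothetical URLs): if seed S
    // discovered A, and A discovered B, then getSeedForUrl("B") returns S,
    // not A, because intermediate hops are collapsed during load().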

    /**
     * @return Returns the seedUrlToDiscoveredUrlsMap.
     */
    public Map<String, Set<String>> getSeedUrlToDiscoveredUrlsMap() {
        return this.seedUrlToDiscoveredUrlsMap;
    }

    /**
     * @return Returns the successfullyCrawledUrls.
     */
    public Set<String> getSuccessfullyCrawledUrls() {
        return this.successfullyCrawledUrls;
    }

    /**
     * @return Returns the logger.
     */
    public static Logger getLogger() {
        return logger;
    }

    private class SuccessfullyCrawledURLsIterator implements
            Iterator<String> {
        private String nextValue = null;
        private Iterator<String> discoveredUrlsIterator;

        public SuccessfullyCrawledURLsIterator(String seedUrlString)
                throws SeedUrlNotFoundException {
            Set<String> discoveredUrls =
                    getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
            if (discoveredUrls == null) {
                throw new SeedUrlNotFoundException("Seed URL "
                        + seedUrlString + " not found in seed list");
            }
            discoveredUrlsIterator = discoveredUrls.iterator();
        }

        /**
         * Idempotent method (because of the null check on nextValue).
         */
        private void populateNextValue() {
            while (nextValue == null && discoveredUrlsIterator.hasNext()) {
                String curDiscoveredUrl = discoveredUrlsIterator.next();
                boolean succCrawled = getSuccessfullyCrawledUrls()
                        .contains(curDiscoveredUrl);
                if (getLogger().isLoggable(Level.FINE)) {
                    getLogger().fine(
                            "populateNextValue: curDiscoveredUrl="
                                    + curDiscoveredUrl
                                    + ", succCrawled=" + succCrawled);
                }
                if (succCrawled) {
                    nextValue = curDiscoveredUrl;
                }
            }
        }

        public boolean hasNext() {
            populateNextValue();
            return (nextValue != null);
        }

        public String next() {
            populateNextValue();
            if (nextValue == null) {
                // Per the Iterator contract, fail rather than return null.
                throw new NoSuchElementException();
            }
            String returnValue = nextValue;
            nextValue = null;
            return returnValue;
        }

        /**
         * Remove operation is unsupported in this Iterator
         * (will throw UnsupportedOperationException if called).
         */
        public void remove() {
            throw new UnsupportedOperationException(
                    "SuccessfullyCrawledURLsIterator.remove: not supported.");
        }
    }
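
    // Design note: SuccessfullyCrawledURLsIterator filters lazily with a
    // one-element lookahead (nextValue), so no per-seed copy of the
    // discovered-URL set is built; hasNext()/next() just advance the
    // underlying iterator until the next successfully crawled URL.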

    public Iterator<String> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
            String seedUrlString) throws SeedUrlNotFoundException {
        return new SuccessfullyCrawledURLsIterator(seedUrlString);
    }

    public Collection<String> getSeedCollection() {
        return seedUrlToDiscoveredUrlsMap.keySet();
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: RecoveryLogMapper recoverLogFileName");
            System.exit(1);
        }
        String recoverLogFileName = args[0];
        try {
            RecoveryLogMapper myRecoveryLogMapper = new RecoveryLogMapper(
                    recoverLogFileName);
            for (String curSeedUrl : myRecoveryLogMapper.getSeedCollection()) {
                System.out.println("URLs successfully crawled from seed URL "
                        + curSeedUrl);
                Iterator<String> crawledUrlsIterator = myRecoveryLogMapper
                        .getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                                curSeedUrl);
                while (crawledUrlsIterator.hasNext()) {
                    System.out.println("    -> " + crawledUrlsIterator.next());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
|