001: /* SeedFileIterator
002: *
003: * $Id: SeedFileIterator.java 4651 2006-09-25 18:31:13Z paul_jack $
004: *
005: * Created on Mar 28, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.scope;
026:
027: import java.io.BufferedReader;
028: import java.io.IOException;
029: import java.io.Writer;
030: import java.util.logging.Level;
031: import java.util.logging.Logger;
032:
033: import org.apache.commons.httpclient.URIException;
034: import org.archive.net.UURI;
035: import org.archive.net.UURIFactory;
036: import org.archive.util.iterator.LineReadingIterator;
037: import org.archive.util.iterator.RegexpLineIterator;
038: import org.archive.util.iterator.TransformingIteratorWrapper;
039:
040: /**
041: * Iterator wrapper for seeds file on disk.
042: *
043: * @author gojomo
044: */
045: public class SeedFileIterator extends
046: TransformingIteratorWrapper<String, UURI> {
047: private static Logger logger = Logger
048: .getLogger(SeedFileIterator.class.getName());
049:
050: BufferedReader input;
051: Writer ignored;
052:
053: /**
054: * Construct a SeedFileIterator over the input available
055: * from the supplied BufferedReader.
056: * @param br BufferedReader from which to get seeds
057: */
058: public SeedFileIterator(BufferedReader br) {
059: this (br, null);
060: }
061:
062: /**
063: * Construct a SeedFileIterator over the input available
064: * from the supplied BufferedReader, reporting any nonblank
065: * noncomment entries which don't generate a valid seed to
066: * the supplied BufferedWriter.
067: *
068: * @param inputReader BufferedReader from which to get seeds
069: * @param ignoredWriter BufferedWriter to report any ignored input
070: */
071: public SeedFileIterator(BufferedReader inputReader,
072: Writer ignoredWriter) {
073: super ();
074: inner = new RegexpLineIterator(
075: new LineReadingIterator(inputReader),
076: RegexpLineIterator.COMMENT_LINE,
077: RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
078: RegexpLineIterator.ENTRY);
079: input = inputReader;
080: ignored = ignoredWriter;
081: }
082:
083: protected UURI transform(String uri) {
084: if (!uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { // Rfc2396 s3.1 scheme,
085: // minus '.'
086: // Does not begin with scheme, so try http://
087: uri = "http://" + uri;
088: }
089: try {
090: // TODO: ignore lines beginning with non-word char
091: return UURIFactory.getInstance(uri);
092: } catch (URIException e) {
093: logger.log(Level.INFO, "line in seed file ignored: "
094: + e.getMessage(), e);
095: if (ignored != null) {
096: try {
097: ignored.write(uri + "\n");
098: } catch (IOException e1) {
099: // TODO Auto-generated catch block
100: e1.printStackTrace();
101: }
102: }
103: return null;
104: }
105: }
106:
107: /**
108: * Clean-up when hasNext() has returned null: close open files.
109: *
110: * @see org.archive.util.iterator.TransformingIteratorWrapper#noteExhausted()
111: */
112: protected void noteExhausted() {
113: super .noteExhausted();
114: close();
115: }
116:
117: public void close() {
118: try {
119: if (input != null) {
120: input.close();
121: }
122: if (ignored != null) {
123: ignored.close();
124: }
125: } catch (IOException e) {
126: // TODO Auto-generated catch block
127: e.printStackTrace();
128: }
129: }
130: }
|