001: /* CharSequenceLinkExtractor
002: *
003: * $Id: CharSequenceLinkExtractor.java 4646 2006-09-22 17:23:04Z paul_jack $
004: *
005: * Created on Mar 17, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.extractor;
026:
027: import java.io.InputStream;
028: import java.nio.charset.Charset;
029: import java.util.LinkedList;
030: import java.util.List;
031: import java.util.NoSuchElementException;
032:
033: import org.archive.crawler.extractor.Link;
034: import org.archive.net.UURI;
035:
036: /**
037: * Abstract superclass providing utility methods for LinkExtractors which
038: * would prefer to work on a CharSequence rather than a stream.
039: *
040: * ROUGH DRAFT IN PROGRESS / incomplete... untested...
041: *
042: * @author gojomo
043: */
044: public abstract class CharSequenceLinkExtractor implements
045: LinkExtractor {
046:
047: protected UURI source;
048: protected UURI base;
049: protected ExtractErrorListener extractErrorListener;
050:
051: protected CharSequence sourceContent;
052: protected LinkedList<Link> next;
053:
054: public void setup(UURI source, UURI base, InputStream content,
055: Charset charset, ExtractErrorListener listener) {
056: setup(source, base, charSequenceFrom(content, charset),
057: listener);
058: }
059:
060: /**
061: * @param source
062: * @param base
063: * @param content
064: * @param listener
065: */
066: public void setup(UURI source, UURI base, CharSequence content,
067: ExtractErrorListener listener) {
068: this .source = source;
069: this .base = base;
070: this .extractErrorListener = listener;
071: this .sourceContent = content;
072: this .next = new LinkedList<Link>();
073: }
074:
075: /**
076: * Convenience method for when source and base are same.
077: *
078: * @param sourceandbase
079: * @param content
080: * @param listener
081: */
082: public void setup(UURI sourceandbase, CharSequence content,
083: ExtractErrorListener listener) {
084: setup(sourceandbase, sourceandbase, content, listener);
085: }
086:
087: /* (non-Javadoc)
088: * @see org.archive.extractor.LinkExtractor#setup(org.archive.crawler.datamodel.UURI, java.io.InputStream, java.nio.charset.Charset)
089: */
090: public void setup(UURI sourceandbase, InputStream content,
091: Charset charset, ExtractErrorListener listener) {
092: setup(sourceandbase, sourceandbase, content, charset, listener);
093: }
094:
095: /* (non-Javadoc)
096: * @see org.archive.extractor.LinkExtractor#nextLink()
097: */
098: public Link nextLink() {
099: if (!hasNext()) {
100: throw new NoSuchElementException();
101: }
102: // next will have been filled with at least one item
103: return (Link) next.removeFirst();
104: }
105:
106: /**
107: * Discard all state. Another setup() is required to use again.
108: */
109: public void reset() {
110: base = null;
111: source = null;
112: sourceContent = null; // TODO: discard other resources
113: }
114:
115: /* (non-Javadoc)
116: * @see java.util.Iterator#hasNext()
117: */
118: public boolean hasNext() {
119: if (!next.isEmpty()) {
120: return true;
121: }
122: return findNextLink();
123: }
124:
125: /**
126: * Scan to the next link(s), if any, loading it into the next buffer.
127: *
128: * @return true if any links are found/available, false otherwise
129: */
130: abstract protected boolean findNextLink();
131:
132: /* (non-Javadoc)
133: * @see java.util.Iterator#next()
134: */
135: public Object next() {
136: return nextLink();
137: }
138:
139: /* (non-Javadoc)
140: * @see java.util.Iterator#remove()
141: */
142: public void remove() {
143: throw new UnsupportedOperationException();
144: }
145:
146: /**
147: * @param content
148: * @param charset
149: * @return CharSequence obtained from stream in given charset
150: */
151: protected CharSequence charSequenceFrom(InputStream content,
152: Charset charset) {
153: // See if content InputStream can provide
154: if (content instanceof CharSequenceProvider) {
155: return ((CharSequenceProvider) content).getCharSequence();
156: }
157: // otherwise, create one
158: return createCharSequenceFrom(content, charset);
159: }
160:
161: /**
162: * @param content
163: * @param charset
164: * @return CharSequence built over given stream in given charset
165: */
166: protected CharSequence createCharSequenceFrom(InputStream content,
167: Charset charset) {
168: // TODO: implement
169: return null;
170: // TODO: consider cleanup in reset()
171: }
172:
173: /**
174: * Convenience method to do default extraction.
175: *
176: * @param content
177: * @param source
178: * @param base
179: * @param collector
180: * @param extractErrorListener
181: */
182: public static void extract(CharSequence content, UURI source,
183: UURI base, List<Link> collector,
184: ExtractErrorListener extractErrorListener) {
185: // TODO: arrange for inheritance of prefs... eg when HTML includes JS
186: // includes HTML, have inner HTML follow robots, etc from outer
187: CharSequenceLinkExtractor extractor = newDefaultInstance();
188: extractor.setup(source, base, content, extractErrorListener);
189: while (extractor.hasNext()) {
190: collector.add(extractor.nextLink());
191: }
192: extractor.reset();
193: }
194:
195: protected static CharSequenceLinkExtractor newDefaultInstance() {
196: // override in subclasses
197: return null;
198: }
199: }
|