001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.chars.CharArrays;
025: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
026: import it.unimi.dsi.io.FastBufferedReader;
027: import it.unimi.dsi.io.WordReader;
028: import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
029: import it.unimi.dsi.parser.BulletParser;
030: import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
031: import it.unimi.dsi.parser.callback.TextExtractor;
032: import it.unimi.dsi.util.Properties;
033:
034: import java.io.IOException;
035: import java.io.InputStream;
036: import java.io.InputStreamReader;
037: import java.io.ObjectInputStream;
038: import java.io.Reader;
039: import java.nio.charset.Charset;
040:
041: import org.apache.commons.configuration.ConfigurationException;
042:
043: /** A factory that provides fields for body and title of HTML documents.
044: * It uses internally a {@link BulletParser}.
045: * A default encoding can be provided
046: * using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}.
047: */
048:
049: public class HtmlDocumentFactory extends PropertyBasedDocumentFactory {
050: private static final long serialVersionUID = 1L;
051:
052: public static enum MetadataKeys {
053: /** The maximum number of characters before an anchor. */
054: MAXPREANCHOR,
055: /** The maximum number of characters in an anchor. */
056: MAXANCHOR,
057: /** The maximum number of characters after an anchor. */
058: MAXPOSTANCHOR,
059: };
060:
061: private static final int DEFAULT_BUFFER_SIZE = 16 * 1024;
062: /** A parser that will be used to extract text from HTML documents. */
063: private transient BulletParser parser;
064: /** The callback recording text. */
065: private transient TextExtractor textExtractor;
066: /** The callback for anchors. */
067: private transient AnchorExtractor anchorExtractor;
068: /** The word reader used for all documents. */
069: private transient WordReader wordReader;
070: /** The maximum number of characters before an anchor. */
071: private int maxPreAnchor;
072: /** The maximum number of characters in an anchor. */
073: private int maxAnchor;
074: /** The maximum number of characters after an anchor. */
075: private int maxPostAnchor;
076:
077: private transient char[] text;
078:
079: protected boolean parseProperty(final String key,
080: final String[] values,
081: final Reference2ObjectMap<Enum<?>, Object> metadata)
082: throws ConfigurationException {
083: if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE,
084: key)) {
085: metadata.put(
086: PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE,
087: ensureJustOne(key, values));
088: return true;
089: } else if (sameKey(
090: PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key)) {
091: metadata.put(
092: PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
093: Charset.forName(ensureJustOne(key, values))
094: .toString());
095: return true;
096: } else if (sameKey(MetadataKeys.MAXPREANCHOR, key)) {
097: metadata.put(MetadataKeys.MAXPREANCHOR, Integer
098: .valueOf(ensureJustOne(key, values)));
099: return true;
100: } else if (sameKey(MetadataKeys.MAXANCHOR, key)) {
101: metadata.put(MetadataKeys.MAXANCHOR, Integer
102: .valueOf(ensureJustOne(key, values)));
103: return true;
104: } else if (sameKey(MetadataKeys.MAXPOSTANCHOR, key)) {
105: metadata.put(MetadataKeys.MAXPOSTANCHOR, Integer
106: .valueOf(ensureJustOne(key, values)));
107: return true;
108: }
109:
110: return super .parseProperty(key, values, metadata);
111: }
112:
113: private void init() {
114: this .parser = new BulletParser();
115:
116: ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
117: composedBuilder.add(this .textExtractor = new TextExtractor());
118: composedBuilder.add(this .anchorExtractor = new AnchorExtractor(
119: maxPreAnchor, maxAnchor, maxPostAnchor));
120: parser.setCallback(composedBuilder.compose());
121:
122: this .wordReader = new FastBufferedReader();
123: text = new char[DEFAULT_BUFFER_SIZE];
124: }
125:
126: @SuppressWarnings("boxing")
127: private void initVars() {
128: maxPreAnchor = (Integer) resolve(MetadataKeys.MAXPREANCHOR,
129: defaultMetadata, 32);
130: maxAnchor = (Integer) resolve(MetadataKeys.MAXANCHOR,
131: defaultMetadata, 256);
132: maxPostAnchor = (Integer) resolve(MetadataKeys.MAXPOSTANCHOR,
133: defaultMetadata, 32);
134: }
135:
136: /** Returns a copy of this document factory. A new parser is allocated for the copy. */
137: public HtmlDocumentFactory copy() {
138: return new HtmlDocumentFactory(defaultMetadata);
139: }
140:
141: public HtmlDocumentFactory(final Properties properties)
142: throws ConfigurationException {
143: super (properties);
144: initVars();
145: init();
146: }
147:
148: public HtmlDocumentFactory(
149: final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) {
150: super (defaultMetadata);
151: initVars();
152: init();
153: }
154:
155: public HtmlDocumentFactory(final String[] property)
156: throws ConfigurationException {
157: super (property);
158: initVars();
159: init();
160: }
161:
162: public HtmlDocumentFactory() {
163: super ();
164: initVars();
165: init();
166: }
167:
168: public int numberOfFields() {
169: return 3;
170: }
171:
172: public String fieldName(final int field) {
173: ensureFieldIndex(field);
174: switch (field) {
175: case 0:
176: return "text";
177: case 1:
178: return "title";
179: case 2:
180: return "anchor";
181: default:
182: throw new IllegalArgumentException();
183: }
184: }
185:
186: public int fieldIndex(final String fieldName) {
187: for (int i = 0; i < numberOfFields(); i++)
188: if (fieldName(i).equals(fieldName))
189: return i;
190: return -1;
191: }
192:
193: public FieldType fieldType(final int field) {
194: ensureFieldIndex(field);
195: switch (field) {
196: case 0:
197: return FieldType.TEXT;
198: case 1:
199: return FieldType.TEXT;
200: case 2:
201: return FieldType.VIRTUAL;
202: default:
203: throw new IllegalArgumentException();
204: }
205: }
206:
207: private void readObject(final ObjectInputStream s)
208: throws IOException, ClassNotFoundException {
209: s.defaultReadObject();
210: init();
211: }
212:
213: /** An HTML document. If a <samp>TITLE</samp> element is available, it will be used for {@link #title()}
214: * instead of the default value.
215: *
216: * <p>We delay the actual parsing until it is actually necessary, so operations like
217: * getting the document URI will not require parsing. */
218:
219: protected class HtmlDocument extends AbstractDocument {
220: private final Reference2ObjectMap<Enum<?>, Object> metadata;
221: /** Whether we already parsed the document. */
222: private boolean parsed;
223: /** The cached raw content. */
224: private final InputStream rawContent;
225:
226: private void ensureParsed() throws IOException {
227: if (parsed)
228: return;
229:
230: int offset = 0, l;
231: Reader r = new InputStreamReader(
232: rawContent,
233: (String) resolveNotNull(
234: PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
235: metadata));
236: while ((l = r.read(text, offset, text.length - offset)) > 0) {
237: offset += l;
238: text = CharArrays.grow(text, offset + 1);
239: }
240: parser.parse(text, 0, offset);
241: textExtractor.title.trim();
242:
243: parsed = true;
244: }
245:
246: protected HtmlDocument(final InputStream rawContent,
247: final Reference2ObjectMap<Enum<?>, Object> metadata) {
248: this .metadata = metadata;
249: this .rawContent = rawContent;
250: }
251:
252: public CharSequence title() {
253: try {
254: ensureParsed();
255: } catch (IOException e) {
256: throw new RuntimeException(e);
257: }
258: return (CharSequence) (textExtractor.title.length() == 0 ? resolve(
259: PropertyBasedDocumentFactory.MetadataKeys.TITLE,
260: metadata)
261: : textExtractor.title);
262: }
263:
264: public String toString() {
265: return title().toString();
266: }
267:
268: public CharSequence uri() {
269: return (CharSequence) resolve(
270: PropertyBasedDocumentFactory.MetadataKeys.URI,
271: metadata);
272: }
273:
274: public Object content(final int field) throws IOException {
275: ensureFieldIndex(field);
276: ensureParsed();
277: switch (field) {
278: case 0:
279: return new FastBufferedReader(textExtractor.text);
280: case 1:
281: return new FastBufferedReader(textExtractor.title);
282: case 2:
283: return anchorExtractor.anchors;
284: default:
285: throw new IllegalArgumentException();
286: }
287: }
288:
289: public WordReader wordReader(final int field) {
290: ensureFieldIndex(field);
291: return wordReader;
292: }
293: }
294:
295: public Document getDocument(final InputStream rawContent,
296: final Reference2ObjectMap<Enum<?>, Object> metadata)
297: throws IOException {
298: return new HtmlDocument(rawContent, metadata);
299: }
300: }
|