001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
025: import it.unimi.dsi.io.FastBufferedReader;
026: import it.unimi.dsi.io.LineWordReader;
027: import it.unimi.dsi.io.WordReader;
028: import it.unimi.dsi.util.Properties;
029:
030: import java.io.InputStream;
031: import java.io.InputStreamReader;
032: import java.io.UnsupportedEncodingException;
033: import java.nio.charset.Charset;
034:
035: import org.apache.commons.configuration.ConfigurationException;
036:
037: /** A factory that provides a single field containing just the raw input stream; the encoding
038: * is set using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}.
039: * The field is named <samp>text</samp>, but you can change the name using the property
040: * <samp>fieldname</samp>.
041: *
042: * <p>By default, the {@link WordReader} provided by this factory
043: * is just a {@link FastBufferedReader}, but you can specify
044: * an alternative word reader using the property
045: * {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#WORDREADER}.
046: * For instance, if you need to index a list of identifiers to retrieve documents from
047: * the collection more easily, you can use a {@link LineWordReader}
048: * to index each line of a file as a whole.
049: *
050: */
051:
052: public class IdentityDocumentFactory extends
053: PropertyBasedDocumentFactory {
054: private static final long serialVersionUID = 1L;
055:
056: /** Case-insensitive keys for metadata.
057: *
058: * @see PropertyBasedDocumentFactory.MetadataKeys
059: */
060: public static enum MetadataKeys {
061: /** The tag for the optional name of the only field provided by this factory. */
062: FIELDNAME
063: };
064:
065: /** The name of the only field. */
066: private String fieldName;
067: /** The word reader used for all documents. */
068: private WordReader wordReader;
069: /** The class to be used to instantiate {@link #wordReader}. */
070: private Class<? extends WordReader> wordReaderClass;
071:
072: @SuppressWarnings("unchecked")
073: protected boolean parseProperty(final String key,
074: final String[] values,
075: final Reference2ObjectMap<Enum<?>, Object> metadata)
076: throws ConfigurationException {
077: if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
078: key)) {
079: metadata.put(
080: PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
081: Charset.forName(ensureJustOne(key, values))
082: .toString());
083: return true;
084: }
085: if (sameKey(
086: PropertyBasedDocumentFactory.MetadataKeys.WORDREADER,
087: key)) {
088: try {
089: metadata
090: .put(
091: PropertyBasedDocumentFactory.MetadataKeys.WORDREADER,
092: (wordReaderClass = (Class<? extends WordReader>) Class
093: .forName(ensureJustOne(key,
094: values))).toString());
095: } catch (ClassNotFoundException e) {
096: throw new ConfigurationException(e);
097: }
098: return true;
099: }
100: if (sameKey(MetadataKeys.FIELDNAME, key)) {
101: fieldName = ensureJustOne(key, values).toString();
102: return true;
103: }
104:
105: return super .parseProperty(key, values, metadata);
106: }
107:
108: public IdentityDocumentFactory() {
109: init();
110: }
111:
112: private void init() {
113: if (fieldName == null)
114: fieldName = "text";
115: try {
116: wordReader = wordReaderClass == null ? new FastBufferedReader()
117: : wordReaderClass.newInstance();
118: } catch (Exception e) {
119: throw new RuntimeException(e);
120: }
121: }
122:
123: public IdentityDocumentFactory(
124: final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) {
125: super (defaultMetadata);
126: init();
127: }
128:
129: public IdentityDocumentFactory(final Properties properties)
130: throws ConfigurationException {
131: super (properties);
132: init();
133: }
134:
135: public IdentityDocumentFactory(final String[] property)
136: throws ConfigurationException {
137: super (property);
138: init();
139: }
140:
141: public IdentityDocumentFactory copy() {
142: return new IdentityDocumentFactory(defaultMetadata);
143: }
144:
145: public int numberOfFields() {
146: return 1;
147: }
148:
149: public String fieldName(final int field) {
150: ensureFieldIndex(field);
151: return fieldName;
152: }
153:
154: public int fieldIndex(final String fieldName) {
155: return fieldName.equals(this .fieldName) ? 0 : -1;
156: }
157:
158: public FieldType fieldType(final int field) {
159: ensureFieldIndex(field);
160: return FieldType.TEXT;
161: }
162:
163: public Document getDocument(final InputStream rawContent,
164: final Reference2ObjectMap<Enum<?>, Object> metadata) {
165: return new Document() {
166:
167: public CharSequence title() {
168: return (CharSequence) resolve(
169: PropertyBasedDocumentFactory.MetadataKeys.TITLE,
170: metadata);
171: }
172:
173: public String toString() {
174: return title().toString();
175: }
176:
177: public CharSequence uri() {
178: return (CharSequence) resolve(
179: PropertyBasedDocumentFactory.MetadataKeys.URI,
180: metadata);
181: }
182:
183: public Object content(final int field) {
184: ensureFieldIndex(field);
185: try {
186: return new InputStreamReader(
187: rawContent,
188: (String) resolveNotNull(
189: PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
190: metadata));
191: } catch (UnsupportedEncodingException e) {
192: throw new RuntimeException(e);
193: }
194: }
195:
196: public WordReader wordReader(final int field) {
197: ensureFieldIndex(field);
198: // TODO: should depend on locale (or something)
199: return wordReader;
200: }
201:
202: public void close() {
203: }
204: };
205: }
206: }
|