001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023: import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
024: import it.unimi.dsi.fastutil.objects.ObjectArrayList;
025: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
026: import it.unimi.dsi.io.WordReader;
027:
028: import java.io.IOException;
029: import java.io.InputStream;
030:
031: /** A factory that replicates a given factory several times. A special case of a composite factory.
032: *
033: * <p>Note that in general replicated factories support only sequential access to
034: * field content (albeit skipping items is allowed).
035: */
036: public class ReplicatedDocumentFactory extends AbstractDocumentFactory {
037: private static final long serialVersionUID = 1L;
038:
039: /** The document factory that will be replicated. */
040: public final DocumentFactory documentFactory;
041: /** The number of copies. */
042: public final int numberOfCopies;
043: /** The map from field names to field indices. */
044: private final Object2IntOpenHashMap<String> field2Index;
045:
046: /** Creates a new replicated document factory.
047: *
048: * @param documentFactory the factory that will be replicated.
049: * @param numberOfCopies the number of copies.
050: * @param fieldName the names to be given to the fields of the new factory.
051: */
052: private ReplicatedDocumentFactory(
053: final DocumentFactory documentFactory,
054: final int numberOfCopies, final String[] fieldName) {
055: this .documentFactory = documentFactory;
056: this .numberOfCopies = numberOfCopies;
057: if (numberOfFields() != fieldName.length)
058: throw new IllegalArgumentException(
059: "The number of field names ("
060: + fieldName.length
061: + ") is not equal to the number of fields in the replicated factory ("
062: + numberOfFields() + ")");
063: field2Index = new Object2IntOpenHashMap<String>(
064: fieldName.length, .5f);
065: field2Index.defaultReturnValue(-1);
066: for (int i = 0; i < fieldName.length; i++)
067: field2Index.put(fieldName[i], i);
068: if (field2Index.size() != fieldName.length)
069: throw new IllegalArgumentException("The field name array "
070: + ObjectArrayList.wrap(fieldName)
071: + " contains duplicates");
072: }
073:
074: protected ReplicatedDocumentFactory(
075: final DocumentFactory documentFactory,
076: final int numberOfCopies,
077: final Object2IntOpenHashMap<String> field2Index) {
078: this .documentFactory = documentFactory;
079: this .numberOfCopies = numberOfCopies;
080: this .field2Index = field2Index;
081: }
082:
083: /** Returns a document factory replicating the given factory.
084: *
085: * @param documentFactory the factory that will be replicated.
086: * @param numberOfCopies the number of copies.
087: * @return a replicated document factory.
088: */
089: public static DocumentFactory getFactory(
090: final DocumentFactory documentFactory,
091: final int numberOfCopies, final String[] fieldName) {
092: //if ( numberOfCopies == 1 ) return documentFactory; TODO: should be optimised if no renaming is done.
093: return new ReplicatedDocumentFactory(documentFactory,
094: numberOfCopies, fieldName);
095: }
096:
097: public ReplicatedDocumentFactory copy() {
098: return new ReplicatedDocumentFactory(documentFactory.copy(),
099: numberOfCopies, field2Index);
100: }
101:
102: public int numberOfFields() {
103: return numberOfCopies * documentFactory.numberOfFields();
104: }
105:
106: public String fieldName(final int field) {
107: ensureFieldIndex(field);
108: return documentFactory.fieldName(field
109: % documentFactory.numberOfFields());
110: }
111:
112: public int fieldIndex(final String fieldName) {
113: return field2Index.getInt(fieldName);
114: }
115:
116: public FieldType fieldType(final int field) {
117: ensureFieldIndex(field);
118: return documentFactory.fieldType(field
119: % documentFactory.numberOfFields());
120: }
121:
122: /** A document obtained by replication of the underlying-factory document. */
123:
124: protected class ReplicatedDocument extends AbstractDocument {
125: /** The last returned field. */
126: private int currField = -1;
127: /** The current document. */
128: private Document currDocument;
129: /** The title returned by the first factory. */
130: private CharSequence title;
131: /** The uri returned by the first factory. */
132: private CharSequence uri;
133:
134: private final InputStream rawContent;
135: private final Reference2ObjectMap<Enum<?>, Object> metadata;
136:
137: protected ReplicatedDocument(final InputStream rawContent,
138: final Reference2ObjectMap<Enum<?>, Object> metadata)
139: throws IOException {
140: this .rawContent = rawContent;
141: this .metadata = metadata;
142: currDocument = documentFactory.getDocument(rawContent,
143: metadata);
144: title = currDocument.title();
145: uri = currDocument.uri();
146: }
147:
148: public CharSequence title() {
149: return title;
150: }
151:
152: public String toString() {
153: return title().toString();
154: }
155:
156: public CharSequence uri() {
157: return uri;
158: }
159:
160: public Object content(final int field) throws IOException {
161: ensureFieldIndex(field);
162: if (field <= currField)
163: throw new IOException(
164: "Composite document factories require sequential access");
165: while (currField < field) {
166: currField++;
167: if (currField % documentFactory.numberOfFields() == 0) {
168: if (currField > 0)
169: rawContent.reset();
170: currDocument = documentFactory.getDocument(
171: rawContent, metadata);
172: }
173: }
174: return currDocument.content(field
175: % documentFactory.numberOfFields());
176: }
177:
178: public WordReader wordReader(final int field) {
179: ensureFieldIndex(field);
180: return currDocument.wordReader(field
181: % documentFactory.numberOfFields());
182: }
183:
184: }
185:
186: public Document getDocument(final InputStream rawContent,
187: final Reference2ObjectMap<Enum<?>, Object> metadata)
188: throws IOException {
189: return new ReplicatedDocument(rawContent, metadata);
190: }
191: }
|