001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.Util;
025: import it.unimi.dsi.fastutil.objects.ObjectArrayList;
026: import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
027: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
028: import it.unimi.dsi.io.FastBufferedReader;
029: import it.unimi.dsi.io.WordReader;
030: import it.unimi.dsi.lang.MutableString;
031: import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
032: import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
033: import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
034:
035: import java.io.EOFException;
036: import java.io.File;
037: import java.io.FileInputStream;
038: import java.io.FileNotFoundException;
039: import java.io.IOException;
040: import java.io.InputStream;
041: import java.io.ObjectInputStream;
042: import java.io.Reader;
043: import java.io.Serializable;
044: import java.util.NoSuchElementException;
045: import java.util.zip.ZipEntry;
046: import java.util.zip.ZipFile;
047: import java.util.zip.ZipInputStream;
048:
049: import org.apache.log4j.Logger;
050:
051: /** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} produced from a document
052: * sequence using {@link it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder}.
053: *
054: * <p>The collection will produce the same documents as the original sequence whence it
055: * was produced, in the following sense:
056: *
057: * <ul>
058: * <li>the resulting collection has as many document as the original sequence, in the same order, with
059: * the same titles and URI;
060: * <li>every document has the same number of fields, with the same names and types;
061: * <li>non-textual non-virtual fields will be written out as objects, so they need to be serializable;
062: * <li>virtual fields will be written as a sequence of {@linkplain MutableString#writeSelfDelimUTF8(java.io.DataOutput) self-delimiting UTF-8 mutable strings}
063: * starting with the number of fragments (converted into a string with {@link String#valueOf(int)}),
064: * followed by a pair of strings for each fragment (the first string being the document specifier,
065: * and the second being the associated text);
066: * <li>textual fields will be written out in such a way that, when reading them, the same sequence
067: * of words and non-words will be produced; alternatively, one may produce a collection that only
068: * copies words (non-words are not copied).
069: * </ul>
070: *
071: * <p><strong>Warning:</strong> the {@link java.io.Reader} returned by {@link it.unimi.dsi.mg4j.document.Document#content(int)}
072: * for documents produced by this factory is just obtained as the concatenation of words and non-words returned by
073: * the word reader for that field.
074: *
075: * <p>The collection will be, as any other collection, serialized on a file, but it will refer to another
076: * zip file that is going to contain the documents themselves.
077: */
078: public class ZipDocumentCollection extends AbstractDocumentCollection
079: implements Serializable {
080: private static final long serialVersionUID = 1L;
081:
082: private static final Logger LOGGER = Util
083: .getLogger(ZipDocumentCollection.class);
084: private static final boolean DEBUG = false;
085:
086: /** The name of the zip collection file. */
087: private final String zipFilename;
088: /** The zip collection file. */
089: private transient ZipFile zipFile;
090: /** The factory used for the original document sequence. */
091: private final DocumentFactory underlyingFactory;
092: /** The factory used for this document collection. */
093: private transient DocumentFactory factory;
094: /** The number of documents. */
095: private final int numberOfDocuments;
096: /** <code>true</code> iff this is an exact reproduction of the original sequence (i.e., if also non-words are preserved). */
097: private final boolean exact;
098:
099: /** A factory tightly coupled to a {@link ZipDocumentCollection}. */
100: protected static class ZipFactory extends AbstractDocumentFactory {
101: private static final long serialVersionUID = 1L;
102:
103: private final boolean exact;
104: private final DocumentFactory underlyingFactory;
105:
106: protected ZipFactory(final boolean exact,
107: final DocumentFactory underlyingFactory) {
108: this .exact = exact;
109: this .underlyingFactory = underlyingFactory;
110: }
111:
112: public ZipFactory copy() {
113: return this ;
114: }
115:
116: public int numberOfFields() {
117: return underlyingFactory.numberOfFields();
118: }
119:
120: public String fieldName(final int field) {
121: ensureFieldIndex(field);
122: return underlyingFactory.fieldName(field);
123: }
124:
125: public int fieldIndex(final String fieldName) {
126: return underlyingFactory.fieldIndex(fieldName);
127: }
128:
129: public FieldType fieldType(final int field) {
130: ensureFieldIndex(field);
131: return underlyingFactory.fieldType(field);
132: }
133:
134: public Document getDocument(final InputStream rawContent,
135: final Reference2ObjectMap<Enum<?>, Object> metadata)
136: throws IOException {
137: return new AbstractDocument() {
138: int nextFieldToRead = 0;
139: final MutableString uri = new MutableString();
140:
141: {
142: uri.readSelfDelimUTF8(rawContent).compact();
143: }
144:
145: public CharSequence title() {
146: return (CharSequence) metadata
147: .get(MetadataKeys.TITLE);
148: }
149:
150: public String toString() {
151: return title().toString();
152: }
153:
154: public CharSequence uri() {
155: return uri;
156: }
157:
158: /** Skips until the end of the current field, and increments <code>nextFieldToRead</code>.
159: * @throws ClassNotFoundException
160: * @throws IOException
161: */
162: private void skipOneField() throws IOException,
163: ClassNotFoundException {
164: switch (fieldType(nextFieldToRead)) {
165: case TEXT:
166: MutableString word = new MutableString();
167: MutableString nonWord = new MutableString();
168: do {
169: word.readSelfDelimUTF8(rawContent);
170: if (exact)
171: nonWord.readSelfDelimUTF8(rawContent);
172: } while (word.length() > 0
173: || (exact && nonWord.length() > 0));
174: break;
175: case VIRTUAL:
176: MutableString dummy = new MutableString();
177: int nfrag = Integer.parseInt(dummy
178: .readSelfDelimUTF8(rawContent)
179: .toString());
180: for (int i = 0; i < 2 * nfrag; i++)
181: dummy.readSelfDelimUTF8(rawContent);
182: break;
183: default: // Non-text and non-virtual
184: new ObjectInputStream(rawContent).readObject();
185: }
186: nextFieldToRead++;
187: }
188:
189: /** Skips to the given field.
190: *
191: * @param field the field to skip to.
192: * @throws IOException
193: * @throws ClassNotFoundException
194: */
195: private void skipToField(final int field)
196: throws IOException, ClassNotFoundException {
197: if (nextFieldToRead > field)
198: throw new IllegalStateException(
199: "Trying to skip to field " + field
200: + " after " + nextFieldToRead);
201: while (nextFieldToRead < field)
202: skipOneField();
203: }
204:
205: public Object content(final int field) {
206: ensureFieldIndex(field);
207: Object result = null;
208: if (DEBUG)
209: LOGGER.debug("Called content(" + field
210: + "); nextField:" + nextFieldToRead);
211: try {
212: skipToField(field);
213: if (fieldType(nextFieldToRead) == FieldType.VIRTUAL) {
214: int nfrag = Integer
215: .parseInt(new MutableString()
216: .readSelfDelimUTF8(
217: rawContent)
218: .toString());
219: MutableString doc = new MutableString();
220: MutableString text = new MutableString();
221: VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[nfrag];
222: for (int i = 0; i < nfrag; i++) {
223: doc.readSelfDelimUTF8(rawContent);
224: text.readSelfDelimUTF8(rawContent);
225: fragArray[i] = new AnchorExtractor.Anchor(
226: doc.copy(), text.copy());
227: }
228: result = new ObjectArrayList<VirtualDocumentFragment>(
229: fragArray);
230: } else if (fieldType(nextFieldToRead) != FieldType.TEXT) {
231: result = new ObjectInputStream(rawContent)
232: .readObject();
233: if (DEBUG)
234: LOGGER.debug("Read " + result
235: + " from field "
236: + fieldName(nextFieldToRead)
237: + " of object " + title());
238: nextFieldToRead++;
239: } else {
240: if (DEBUG)
241: LOGGER.debug("Returning reader for "
242: + field);
243: result = new Reader() {
244: FastBufferedReader fbr = null;
245: int f = field;
246:
247: public void close() {
248: }
249:
250: public int read(final char[] cbuf,
251: final int off, final int len)
252: throws IOException {
253: if (fbr == null) {
254: if (DEBUG)
255: LOGGER
256: .debug("Initialising reader for content "
257: + f);
258: MutableString text = new MutableString();
259: MutableString word = new MutableString();
260: MutableString nonWord = new MutableString();
261: do {
262: text
263: .append(word
264: .readSelfDelimUTF8(rawContent));
265: if (exact)
266: text
267: .append(nonWord
268: .readSelfDelimUTF8(rawContent));
269: } while (word.length() > 0
270: || (exact && nonWord
271: .length() > 0));
272: fbr = new FastBufferedReader(
273: text);
274: nextFieldToRead++;
275: }
276: return fbr.read(cbuf, off, len);
277: }
278: };
279: }
280: } catch (IOException e) {
281: throw new RuntimeException(e);
282: } catch (ClassNotFoundException e) {
283: throw new RuntimeException(e);
284: }
285: return result;
286: }
287:
288: public WordReader wordReader(final int field) {
289: ensureFieldIndex(field);
290: if (DEBUG)
291: LOGGER
292: .debug("Called wordReader(" + field
293: + ")");
294: try {
295: skipToField(field);
296: } catch (Exception e) {
297: throw new RuntimeException(e);
298: }
299: //logger.debug( "Asked for a new word reader for field " + fieldName( field ) );
300: switch (fieldType(field)) {
301: case TEXT:
302: return new WordReader() {
303: private static final long serialVersionUID = 1L;
304:
305: public boolean next(
306: final MutableString word,
307: final MutableString nonWord)
308: throws IOException {
309: try {
310: word.readSelfDelimUTF8(rawContent);
311: } catch (EOFException e) {
312: return false; // TODO: a bit raw
313: }
314: nonWord.length(0);
315:
316: if (exact) {
317: try {
318: nonWord
319: .readSelfDelimUTF8(rawContent);
320: } catch (EOFException e) {
321: return true; // TODO: a bit raw
322: }
323: }
324:
325: final boolean goOn = word.length() != 0
326: || (exact && nonWord.length() != 0);
327: if (DEBUG)
328: LOGGER.debug("Got word <" + word
329: + "|" + nonWord
330: + "> exact=" + exact
331: + " returning " + goOn);
332: if (!goOn)
333: nextFieldToRead++;
334: return goOn;
335: }
336:
337: public WordReader setReader(
338: final Reader reader) {
339: return this ;
340: }
341:
342: public WordReader copy() {
343: throw new UnsupportedOperationException();
344: }
345: };
346: case VIRTUAL:
347: return new FastBufferedReader();
348: default:
349: return null;
350: }
351:
352: }
353: };
354: }
355: }
356:
357: /** Constructs a document collection (for reading) corresponding to a given zip collection file.
358: *
359: * @param zipFilename the filename of the zip collection.
360: * @param underlyingFactory the underlying document factory.
361: * @param numberOfDocuments the number of documents.
362: * @param exact <code>true</code> iff this is an exact reproduction of the original sequence.
363: * @throws IOException
364: */
365: public ZipDocumentCollection(final String zipFilename,
366: final DocumentFactory underlyingFactory,
367: final int numberOfDocuments, final boolean exact)
368: throws IOException {
369: this .zipFilename = zipFilename;
370: this .underlyingFactory = underlyingFactory;
371: this .numberOfDocuments = numberOfDocuments;
372: this .exact = exact;
373: zipFile = new ZipFile(new File(zipFilename));
374: // Creates the factory
375: factory = new ZipFactory(exact, underlyingFactory);
376: }
377:
378: public ZipDocumentCollection copy() {
379: try {
380: return new ZipDocumentCollection(zipFilename,
381: underlyingFactory, numberOfDocuments, exact);
382: } catch (IOException e) {
383: throw new RuntimeException(e);
384: }
385: }
386:
387: private Object readResolve() throws IOException {
388: super .close();
389: return new ZipDocumentCollection(zipFilename,
390: underlyingFactory, numberOfDocuments, exact);
391: }
392:
393: public DocumentFactory factory() {
394: return factory;
395: }
396:
397: public int size() {
398: return numberOfDocuments;
399: }
400:
401: private ZipEntry getEntry(final int index) {
402: ensureDocumentIndex(index);
403: final ZipEntry entry = zipFile
404: .getEntry(Integer.toString(index));
405: if (entry == null)
406: throw new NoSuchElementException(
407: "Failure retrieving entry " + index);
408: return entry;
409: }
410:
411: public Document document(final int index) throws IOException {
412: final ZipEntry entry = getEntry(index);
413: final Reference2ObjectMap<Enum<?>, Object> metadata = metadata(
414: index, entry);
415: InputStream is = zipFile.getInputStream(entry);
416: return factory.getDocument(is, metadata);
417: }
418:
419: private Reference2ObjectMap<Enum<?>, Object> metadata(
420: final int index, ZipEntry entry) {
421: if (entry == null)
422: entry = getEntry(index);
423: final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(
424: 1);
425: metadata.put(MetadataKeys.TITLE, entry.getComment());
426: return metadata;
427: }
428:
429: public Reference2ObjectMap<Enum<?>, Object> metadata(final int index) {
430: return metadata(index, null);
431: }
432:
433: public InputStream stream(final int index) throws IOException {
434: final ZipEntry entry = zipFile
435: .getEntry(Integer.toString(index));
436: entry.getComment(); // Just skip title
437: InputStream is = zipFile.getInputStream(entry);
438: return is;
439: }
440:
441: public DocumentIterator iterator() {
442: try {
443: return new AbstractDocumentIterator() {
444: final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(
445: new Enum[1], new Object[1]);
446:
447: ZipInputStream zis = new ZipInputStream(
448: new FileInputStream(zipFile.getName()));
449:
450: public Document nextDocument() throws IOException {
451: ZipEntry entry;
452: String name;
453: do {
454: entry = zis.getNextEntry();
455: if (entry == null)
456: return null;
457: name = entry.getName();
458: } while (!Character.isDigit(name.charAt(0)));
459: if (entry == null)
460: return null;
461: String title = entry.getComment();
462: if (DEBUG)
463: LOGGER.debug("Reading sequentially document "
464: + title + ", name: " + entry.getName());
465: InputStream is = zipFile.getInputStream(entry);
466: metadata.put(MetadataKeys.TITLE, title);
467: return factory.getDocument(is, metadata);
468: }
469: };
470: } catch (FileNotFoundException e) {
471: throw new RuntimeException(e);
472: }
473: }
474:
475: public void close() throws IOException {
476: super.close();
477: zipFile.close();
478: }
479: }
|