001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.Util;
025: import it.unimi.dsi.fastutil.io.BinIO;
026: import it.unimi.dsi.fastutil.objects.ObjectList;
027: import it.unimi.dsi.io.WordReader;
028: import it.unimi.dsi.lang.MutableString;
029: import it.unimi.dsi.logging.ProgressLogger;
030: import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;
031: import it.unimi.dsi.mg4j.tool.Scan;
032: import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
033: import it.unimi.dsi.mg4j.util.MG4JClassParser;
034:
035: import java.io.FileNotFoundException;
036: import java.io.FileOutputStream;
037: import java.io.IOException;
038: import java.io.ObjectOutputStream;
039: import java.io.Reader;
040: import java.lang.reflect.InvocationTargetException;
041: import java.util.zip.ZipEntry;
042: import java.util.zip.ZipOutputStream;
043:
044: import org.apache.log4j.Logger;
045:
046: import com.martiansoftware.jsap.FlaggedOption;
047: import com.martiansoftware.jsap.JSAP;
048: import com.martiansoftware.jsap.JSAPException;
049: import com.martiansoftware.jsap.JSAPResult;
050: import com.martiansoftware.jsap.Parameter;
051: import com.martiansoftware.jsap.SimpleJSAP;
052: import com.martiansoftware.jsap.Switch;
053: import com.martiansoftware.jsap.UnflaggedOption;
054:
055: /** A builder to create {@link ZipDocumentCollection}s.
056: *
057: * <p>After creating an instance of this class, it is possible to add incrementally
058: * new documents. Each document must be started with {@link #startDocument(CharSequence, CharSequence)}
059: * and ended with {@link #endDocument()}; inside each document, each non-text field must be written by passing
060: * an object to {@link #nonTextField(Object)}, whereas each text field must be
061: * started with {@link #startTextField()} and ended with {@link #endTextField()}: inbetween, a call
062: * to {@link #add(MutableString, MutableString)} must be made for each word/nonword pair retrieved
063: * from the original collection. At the end, {@link #close()} returns a {@link it.unimi.dsi.mg4j.document.ZipDocumentCollection}
064: * that must be serialised.
065: *
066: * <p>Alternatively, you can just call {@link #build(DocumentSequence)} and all the above will
067: * be handled for you.
068: *
069: * <p>Each Zip entry corresponds to a document: the title is recorded in the comment field, whereas the
070: * URI is written with {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)}
071: * directly to the zipped output stream. When building an <em>exact</em>
072: * {@linkplain it.unimi.dsi.mg4j.document.ZipDocumentCollection}
073: * subsequent word/nonword pairs are written in the same way, and
074: * delimited by two empty strings. If the collection is not exact, just words are written,
075: * and delimited by an empty string. Non-text fields are written directly to the zipped output stream.
076: */
077: public class ZipDocumentCollectionBuilder {
078: private static final Logger LOGGER = Util
079: .getLogger(ZipDocumentCollectionBuilder.class);
080:
081: private static final boolean DEBUG = false;
082:
083: /** The output stream of the zip file. */
084: private ZipOutputStream zipOut;
085: /** The number of documents written so far. */
086: private int numberOfDocuments;
087: /** True iff also non-words should be reproduced. */
088: private boolean exact;
089: /** The progress logger. */
090: private final ProgressLogger progressLogger;
091: /** The filename of the zip file. */
092: private final String zipFilename;
093: /** The factory of the base document sequence. */
094: private final DocumentFactory factory;
095: /** Whether a text field has started but not yet ended. */
096: private boolean inTextField;
097:
098: /** Creates a new zipped collection builder.
099: *
100: * @param zipFilename the filename of the zip file.
101: * @param factory the factory of the base document sequence.
102: * @param exact true iff also non-words should be preserved.
103: * @param progressLogger a progress logger.
104: */
105: public ZipDocumentCollectionBuilder(final String zipFilename,
106: final DocumentFactory factory, final boolean exact,
107: final ProgressLogger progressLogger)
108: throws FileNotFoundException {
109: this .zipFilename = zipFilename;
110: this .factory = factory;
111: this .zipOut = new ZipOutputStream(new FileOutputStream(
112: zipFilename));
113: this .exact = exact;
114: this .progressLogger = progressLogger;
115: this .inTextField = false;
116: }
117:
118: /** Starts a document entry.
119: *
120: * @param title the document title (usually, the result of {@link Document#title()}).
121: * @param uri the document uri (usually, the result of {@link Document#uri()}).
122: */
123:
124: public void startDocument(final CharSequence title,
125: final CharSequence uri) throws IOException {
126: final ZipEntry currEntry = new ZipEntry(Integer
127: .toString(numberOfDocuments));
128: currEntry.setComment(title.toString());
129: zipOut.putNextEntry(currEntry);
130: new MutableString(uri).writeSelfDelimUTF8(zipOut);
131: }
132:
133: /** Ends a document entry.
134: */
135:
136: public void endDocument() throws IOException {
137: zipOut.closeEntry();
138: numberOfDocuments++;
139: }
140:
141: /** Starts a new text field.
142: */
143:
144: public void startTextField() {
145: inTextField = true;
146: }
147:
148: /** Adds a non-text field.
149: *
150: * @param o the content of the non-text field.
151: */
152: public void nonTextField(final Object o) throws IOException {
153: if (DEBUG)
154: LOGGER.debug("Going to write non-text field " + o
155: + " of class " + o.getClass() + " for document #"
156: + numberOfDocuments);
157: ObjectOutputStream oos = new ObjectOutputStream(zipOut);
158: oos.writeObject(o);
159: oos.flush();
160: }
161:
162: /** Adds a virtual field.
163: *
164: * @param fragments the virtual fragments to be added.
165: *
166: */
167: public void virtualField(
168: final ObjectList<VirtualDocumentFragment> fragments)
169: throws IOException {
170: if (DEBUG)
171: LOGGER.debug("Going to write virtual field " + fragments
172: + " for document #" + numberOfDocuments);
173: new MutableString().append(String.valueOf(fragments.size()))
174: .writeSelfDelimUTF8(zipOut);
175: for (VirtualDocumentFragment fragment : fragments) {
176: fragment.documentSpecifier().writeSelfDelimUTF8(zipOut);
177: fragment.text().writeSelfDelimUTF8(zipOut);
178: }
179: }
180:
181: //This method can only be called if {@link #inTextField} is <code>true</code>, otherwise it will throw an {@link IllegalStateException}.
182: /** Ends a new text field. */
183: public void endTextField() throws IOException {
184: // Writing a 0 is like writing an empty string.
185: if (!inTextField)
186: throw new IllegalStateException();
187: inTextField = false;
188: zipOut.write(0);
189: if (exact)
190: zipOut.write(0);
191: }
192:
193: /** Adds a word and a nonword to the current text field, provided that a text field has {@linkplain #startTextField() started} but not yet {@linkplain #endTextField() ended};
194: * otherwise, doesn't do anything.
195: *
196: * <p>Usually, <code>word</code> e <code>nonWord</code> are just the result of a call
197: * to {@link WordReader#next(MutableString, MutableString)}.
198: *
199: * @param word a word.
200: * @param nonWord a nonword.
201: * */
202: public void add(final MutableString word,
203: final MutableString nonWord) throws IOException {
204: if (!inTextField)
205: return;
206: if (DEBUG)
207: LOGGER.debug("Going to write pair <" + word + "|" + nonWord
208: + ">");
209: if (exact || word.length() > 0)
210: word.writeSelfDelimUTF8(zipOut);
211: if (exact)
212: nonWord.writeSelfDelimUTF8(zipOut);
213: }
214:
215: /** Terminates the contruction of the zipped collection and returns it. */
216:
217: public ZipDocumentCollection close() throws IOException {
218: zipOut.close();
219: return new ZipDocumentCollection(zipFilename, factory,
220: numberOfDocuments, exact);
221: }
222:
223: /** A utility method copying all documents of an input sequence to a zipped collection. */
224:
225: @SuppressWarnings("unchecked")
226: public ZipDocumentCollection build(
227: final DocumentSequence inputSequence) throws IOException {
228: progressLogger.start("Zipping collection...");
229: numberOfDocuments = 0;
230:
231: final DocumentIterator docIt = inputSequence.iterator();
232: if (factory != inputSequence.factory())
233: throw new IllegalStateException(
234: "The factory provided by the constructor does not correspond to the factory of the input sequence");
235: final int numberOfFields = factory.numberOfFields();
236: WordReader wordReader;
237: MutableString word = new MutableString();
238: MutableString nonWord = new MutableString();
239:
240: for (;;) {
241: progressLogger.update();
242: Document document = docIt.nextDocument();
243: if (document == null)
244: break;
245: startDocument(document.title(), document.uri());
246:
247: for (int field = 0; field < numberOfFields; field++) {
248: Object content = document.content(field);
249: if (factory.fieldType(field) == FieldType.TEXT) {
250: startTextField();
251: wordReader = document.wordReader(field);
252: wordReader.setReader((Reader) content);
253: while (wordReader.next(word, nonWord))
254: add(word, nonWord);
255: endTextField();
256: } else if (factory.fieldType(field) == FieldType.VIRTUAL)
257: virtualField((ObjectList<VirtualDocumentFragment>) content);
258: else
259: nonTextField(content);
260: }
261: document.close();
262: endDocument();
263: }
264: progressLogger.done();
265: docIt.close();
266: return close();
267: }
268:
269: public static void main(final String[] arg) throws JSAPException,
270: IOException, ClassNotFoundException,
271: InvocationTargetException, NoSuchMethodException,
272: IllegalAccessException, InstantiationException {
273:
274: SimpleJSAP jsap = new SimpleJSAP(
275: ZipDocumentCollectionBuilder.class.getName(),
276: "Produces a zip document collection from an existing document sequence.",
277: new Parameter[] {
278: new FlaggedOption("sequence",
279: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
280: JSAP.NOT_REQUIRED, 'S', "sequence",
281: "A serialised document sequence that will be used instead of stdin."),
282: new FlaggedOption(
283: "factory",
284: MG4JClassParser.getParser(),
285: IdentityDocumentFactory.class.getName(),
286: JSAP.NOT_REQUIRED, 'f', "factory",
287: "A document factory with a standard constructor."),
288: new FlaggedOption("property",
289: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
290: JSAP.NOT_REQUIRED, 'p', "property",
291: "A 'key=value' specification, or the name of a property file")
292: .setAllowMultipleDeclarations(true),
293: new FlaggedOption(
294: "delimiter",
295: JSAP.INTEGER_PARSER,
296: Integer
297: .toString(Scan.DEFAULT_DELIMITER),
298: JSAP.NOT_REQUIRED, 'd', "delimiter",
299: "The document delimiter."),
300: new Switch("approximated", 'a', "approximated",
301: "If specified, non-words will not be copied."),
302: new FlaggedOption(
303: "logInterval",
304: JSAP.LONG_PARSER,
305: Long
306: .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
307: JSAP.NOT_REQUIRED, 'l', "log-interval",
308: "The minimum time interval between activity logs in milliseconds."),
309: new UnflaggedOption("collection",
310: JSAP.STRING_PARSER, JSAP.REQUIRED,
311: "The filename for the output document collection."),
312: new UnflaggedOption("zipfile",
313: JSAP.STRING_PARSER, JSAP.REQUIRED,
314: "The filename for the output zip file."), });
315: JSAPResult jsapResult = jsap.parse(arg);
316: if (jsap.messagePrinted())
317: return;
318:
319: DocumentSequence documentSequence = Scan.getSequence(jsapResult
320: .getString("sequence"), jsapResult.getClass("factory"),
321: jsapResult.getStringArray("property"), jsapResult
322: .getInt("delimiter"), LOGGER);
323: final ProgressLogger progressLogger = new ProgressLogger(
324: LOGGER, "documents");
325: if (documentSequence instanceof DocumentCollection)
326: progressLogger.expectedUpdates = ((DocumentCollection) documentSequence)
327: .size();
328: ZipDocumentCollectionBuilder builder = new ZipDocumentCollectionBuilder(
329: jsapResult.getString("zipfile"), documentSequence
330: .factory(), !jsapResult
331: .getBoolean("approximated"), progressLogger);
332: BinIO.storeObject(builder.build(documentSequence), jsapResult
333: .getString("collection"));
334: }
335: }
|