001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.io.BinIO;
025: import it.unimi.dsi.fastutil.objects.ObjectArrayList;
026: import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
027: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
028: import it.unimi.dsi.io.FileLinesCollection;
029: import it.unimi.dsi.io.NullInputStream;
030: import it.unimi.dsi.lang.MutableString;
031: import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
032: import it.unimi.dsi.mg4j.util.MG4JClassParser;
033: import it.unimi.dsi.mg4j.util.MimeTypeResolver;
034:
035: import java.io.BufferedReader;
036: import java.io.File;
037: import java.io.FileInputStream;
038: import java.io.IOException;
039: import java.io.InputStream;
040: import java.io.InputStreamReader;
041: import java.io.Serializable;
042: import java.lang.reflect.InvocationTargetException;
043: import java.util.Collection;
044:
045: import com.martiansoftware.jsap.FlaggedOption;
046: import com.martiansoftware.jsap.JSAP;
047: import com.martiansoftware.jsap.JSAPException;
048: import com.martiansoftware.jsap.JSAPResult;
049: import com.martiansoftware.jsap.Parameter;
050: import com.martiansoftware.jsap.SimpleJSAP;
051: import com.martiansoftware.jsap.UnflaggedOption;
052:
053: /** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} corresponding to
054: * a given set of files.
055: *
056: * <P>This class provides a main method with a flexible syntax that serialises
057: * into a document collection a list of files given on the command line or
058: * piped into standard input. Optionally, you can provide a parallel list of URIs
059: * that will be associated to each file.
060: */
061: public class FileSetDocumentCollection extends
062: AbstractDocumentCollection implements Serializable {
063:
064: private static final long serialVersionUID = 0L;
065:
066: /** The files in this collection. */
067: private final String[] file;
068: /** URIs for each file in this collection, or <code>null</code>, in which case the filename will be used as URI. */
069: private final String[] uri;
070: /** The factory to be used by this collection. */
071: private final DocumentFactory factory;
072: /** The last returned file input stream. */
073: private InputStream last = NullInputStream.getInstance();
074:
075: /** Builds a document collection corresponding to a given set of files specified as an array.
076: *
077: * <p><strong>Beware.</strong> This class is not guaranteed to work if files are
078: * deleted or modified after creation!
079: *
080: * @param file an array containing the files that will be contained inthe collection.
081: * @param factory the factory that will be used to create documents.
082: */
083: public FileSetDocumentCollection(final String[] file,
084: final DocumentFactory factory) {
085: this (file, null, factory);
086: }
087:
088: /** Builds a document collection corresponding to a given set of files specified as an array and
089: * a parallel array of URIs, one for each file.
090: *
091: * <p><strong>Beware.</strong> This class is not guaranteed to work if files are
092: * deleted or modified after creation!
093: *
094: * @param file an array containing the files that will be contained inthe collection.
095: * @param uri an array, parallel to <code>file</code>, containing URIs to be associated to each element of <code>file</code>.
096: * @param factory the factory that will be used to create documents.
097: */
098: public FileSetDocumentCollection(final String[] file,
099: final String uri[], final DocumentFactory factory) {
100: this .file = file;
101: this .uri = uri;
102: this .factory = factory;
103: }
104:
105: public DocumentFactory factory() {
106: return factory;
107: }
108:
109: public int size() {
110: return file.length;
111: }
112:
113: public Reference2ObjectMap<Enum<?>, Object> metadata(final int index) {
114: ensureDocumentIndex(index);
115: final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(
116: 2);
117: metadata.put(MetadataKeys.TITLE, file[index]);
118: if (uri != null)
119: metadata.put(MetadataKeys.URI, uri[index]);
120: else
121: metadata.put(MetadataKeys.URI, new File(file[index])
122: .toURI().toString());
123: metadata.put(MetadataKeys.MIMETYPE, MimeTypeResolver
124: .getContentType(file[index]));
125: return metadata;
126: }
127:
128: public Document document(final int index) throws IOException {
129: return factory.getDocument(stream(index), metadata(index));
130: }
131:
132: public InputStream stream(final int index) throws IOException {
133: ensureDocumentIndex(index);
134: last.close();
135: return last = new FileInputStream(file[index]);
136: }
137:
138: public FileSetDocumentCollection copy() {
139: return new FileSetDocumentCollection(file, uri, factory.copy());
140: }
141:
142: public void close() throws IOException {
143: last.close();
144: super .close();
145: }
146:
147: public static void main(final String[] arg) throws IOException,
148: JSAPException, InstantiationException,
149: IllegalAccessException, InvocationTargetException,
150: NoSuchMethodException {
151:
152: SimpleJSAP jsap = new SimpleJSAP(
153: FileSetDocumentCollection.class.getName(),
154: "Saves a serialised document collection based on a set of files.",
155: new Parameter[] {
156: new FlaggedOption(
157: "factory",
158: MG4JClassParser.getParser(),
159: IdentityDocumentFactory.class.getName(),
160: JSAP.NOT_REQUIRED, 'f', "factory",
161: "A document factory with a standard constructor."),
162: new FlaggedOption("property",
163: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
164: JSAP.NOT_REQUIRED, 'p', "property",
165: "A 'key=value' specification, or the name of a property file")
166: .setAllowMultipleDeclarations(true),
167: new FlaggedOption(
168: "uris",
169: JSAP.STRING_PARSER,
170: JSAP.NO_DEFAULT,
171: JSAP.NOT_REQUIRED,
172: 'u',
173: "uris",
174: "A file containing a list of URIs in ASCII encoding, one per line, that will be associated to each file"),
175: new UnflaggedOption("collection",
176: JSAP.STRING_PARSER, JSAP.REQUIRED,
177: "The filename for the serialised collection."),
178: new UnflaggedOption(
179: "file",
180: JSAP.STRING_PARSER,
181: JSAP.NO_DEFAULT,
182: JSAP.NOT_REQUIRED,
183: JSAP.GREEDY,
184: "A list of files that will be indexed. If missing, a list of files will be read from standard input.") });
185:
186: JSAPResult jsapResult = jsap.parse(arg);
187: if (jsap.messagePrinted())
188: return;
189:
190: String uri[] = null;
191: if (jsapResult.getString("uris") != null) {
192: Collection<MutableString> lines = new FileLinesCollection(
193: jsapResult.getString("uris"), "ASCII").allLines();
194: uri = new String[lines.size()];
195: int i = 0;
196: for (Object l : lines)
197: uri[i++] = l.toString();
198: }
199:
200: final DocumentFactory factory = PropertyBasedDocumentFactory
201: .getInstance(jsapResult.getClass("factory"), jsapResult
202: .getStringArray("property"));
203:
204: String[] file = (String[]) jsapResult.getObjectArray("file",
205: new String[0]);
206: if (file.length == 0) {
207: final ObjectArrayList<String> files = new ObjectArrayList<String>();
208: BufferedReader bufferedReader = new BufferedReader(
209: new InputStreamReader(System.in));
210: String s;
211: while ((s = bufferedReader.readLine()) != null)
212: files.add(s);
213: file = files.toArray(new String[0]);
214: }
215:
216: if (file.length == 0)
217: System.err.println("WARNING: empty file set.");
218: if (uri != null && file.length != uri.length)
219: throw new IllegalArgumentException("The number of files ("
220: + file.length + ") and the number of URIs ("
221: + uri.length + ") differ");
222: BinIO.storeObject(new FileSetDocumentCollection(file, uri,
223: factory), jsapResult.getString("collection"));
224: }
225: }
|