Source Code Cross Referenced for ZipDocumentCollectionBuilder.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » document » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.document

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package it.unimi.dsi.mg4j.document;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.Util;
025:        import it.unimi.dsi.fastutil.io.BinIO;
026:        import it.unimi.dsi.fastutil.objects.ObjectList;
027:        import it.unimi.dsi.io.WordReader;
028:        import it.unimi.dsi.lang.MutableString;
029:        import it.unimi.dsi.logging.ProgressLogger;
030:        import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;
031:        import it.unimi.dsi.mg4j.tool.Scan;
032:        import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
033:        import it.unimi.dsi.mg4j.util.MG4JClassParser;
034:
035:        import java.io.FileNotFoundException;
036:        import java.io.FileOutputStream;
037:        import java.io.IOException;
038:        import java.io.ObjectOutputStream;
039:        import java.io.Reader;
040:        import java.lang.reflect.InvocationTargetException;
041:        import java.util.zip.ZipEntry;
042:        import java.util.zip.ZipOutputStream;
043:
044:        import org.apache.log4j.Logger;
045:
046:        import com.martiansoftware.jsap.FlaggedOption;
047:        import com.martiansoftware.jsap.JSAP;
048:        import com.martiansoftware.jsap.JSAPException;
049:        import com.martiansoftware.jsap.JSAPResult;
050:        import com.martiansoftware.jsap.Parameter;
051:        import com.martiansoftware.jsap.SimpleJSAP;
052:        import com.martiansoftware.jsap.Switch;
053:        import com.martiansoftware.jsap.UnflaggedOption;
054:
055:        /** A builder to create {@link ZipDocumentCollection}s.
056:         * 
057:         * <p>After creating an instance of this class, it is possible to add incrementally
058:         * new documents. Each document must be started with {@link #startDocument(CharSequence, CharSequence)}
059:         * and ended with {@link #endDocument()}; inside each document, each non-text field must be written by passing
060:         * an object to {@link #nonTextField(Object)}, whereas each text field must be
061:         * started with {@link #startTextField()} and ended with {@link #endTextField()}: inbetween, a call
062:         * to {@link #add(MutableString, MutableString)} must be made for each word/nonword pair retrieved
063:         * from the original collection. At the end, {@link #close()} returns a {@link it.unimi.dsi.mg4j.document.ZipDocumentCollection}
064:         * that must be serialised.
065:         * 
066:         * <p>Alternatively, you can just call {@link #build(DocumentSequence)} and all the above will
067:         * be handled for you.
068:         * 
069:         * <p>Each Zip entry corresponds to a document: the title is recorded in the comment field, whereas the 
070:         * URI is written with {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)}
071:         * directly to the zipped output stream. When building an <em>exact</em>
072:         * {@linkplain it.unimi.dsi.mg4j.document.ZipDocumentCollection} 
073:         * subsequent word/nonword pairs are written in the same way, and
074:         * delimited by two empty strings. If the collection is not exact, just words are written,
075:         * and delimited by an empty string. Non-text fields are written directly to the zipped output stream.
076:         */
077:        public class ZipDocumentCollectionBuilder {
078:            private static final Logger LOGGER = Util
079:                    .getLogger(ZipDocumentCollectionBuilder.class);
080:
081:            private static final boolean DEBUG = false;
082:
083:            /** The output stream of the zip file. */
084:            private ZipOutputStream zipOut;
085:            /** The number of documents written so far. */
086:            private int numberOfDocuments;
087:            /** True iff also non-words should be reproduced. */
088:            private boolean exact;
089:            /** The progress logger. */
090:            private final ProgressLogger progressLogger;
091:            /** The filename of the zip file. */
092:            private final String zipFilename;
093:            /** The factory of the base document sequence. */
094:            private final DocumentFactory factory;
095:            /** Whether a text field has started but not yet ended. */
096:            private boolean inTextField;
097:
098:            /** Creates a new zipped collection builder.
099:             * 
100:             * @param zipFilename the filename of the zip file.
101:             * @param factory the factory of the base document sequence.
102:             * @param exact true iff also non-words should be preserved.
103:             * @param progressLogger a progress logger.
104:             */
105:            public ZipDocumentCollectionBuilder(final String zipFilename,
106:                    final DocumentFactory factory, final boolean exact,
107:                    final ProgressLogger progressLogger)
108:                    throws FileNotFoundException {
109:                this .zipFilename = zipFilename;
110:                this .factory = factory;
111:                this .zipOut = new ZipOutputStream(new FileOutputStream(
112:                        zipFilename));
113:                this .exact = exact;
114:                this .progressLogger = progressLogger;
115:                this .inTextField = false;
116:            }
117:
118:            /** Starts a document entry.
119:             * 
120:             * @param title the document title (usually, the result of {@link Document#title()}).
121:             * @param uri the document uri (usually, the result of {@link Document#uri()}).
122:             */
123:
124:            public void startDocument(final CharSequence title,
125:                    final CharSequence uri) throws IOException {
126:                final ZipEntry currEntry = new ZipEntry(Integer
127:                        .toString(numberOfDocuments));
128:                currEntry.setComment(title.toString());
129:                zipOut.putNextEntry(currEntry);
130:                new MutableString(uri).writeSelfDelimUTF8(zipOut);
131:            }
132:
133:            /** Ends a document entry. 
134:             */
135:
136:            public void endDocument() throws IOException {
137:                zipOut.closeEntry();
138:                numberOfDocuments++;
139:            }
140:
141:            /** Starts a new text field.
142:             */
143:
144:            public void startTextField() {
145:                inTextField = true;
146:            }
147:
148:            /** Adds a non-text field.
149:             * 
150:             * @param o the content of the non-text field.
151:             */
152:            public void nonTextField(final Object o) throws IOException {
153:                if (DEBUG)
154:                    LOGGER.debug("Going to write non-text field " + o
155:                            + " of class " + o.getClass() + " for document #"
156:                            + numberOfDocuments);
157:                ObjectOutputStream oos = new ObjectOutputStream(zipOut);
158:                oos.writeObject(o);
159:                oos.flush();
160:            }
161:
162:            /** Adds a virtual field.
163:             * 
164:             *  @param fragments the virtual fragments to be added.
165:             * 
166:             */
167:            public void virtualField(
168:                    final ObjectList<VirtualDocumentFragment> fragments)
169:                    throws IOException {
170:                if (DEBUG)
171:                    LOGGER.debug("Going to write virtual field " + fragments
172:                            + " for document #" + numberOfDocuments);
173:                new MutableString().append(String.valueOf(fragments.size()))
174:                        .writeSelfDelimUTF8(zipOut);
175:                for (VirtualDocumentFragment fragment : fragments) {
176:                    fragment.documentSpecifier().writeSelfDelimUTF8(zipOut);
177:                    fragment.text().writeSelfDelimUTF8(zipOut);
178:                }
179:            }
180:
181:            //This method can only be called if {@link #inTextField} is <code>true</code>, otherwise it will throw an {@link IllegalStateException}.
182:            /** Ends a new text field. */
183:            public void endTextField() throws IOException {
184:                // Writing a 0 is like writing an empty string.
185:                if (!inTextField)
186:                    throw new IllegalStateException();
187:                inTextField = false;
188:                zipOut.write(0);
189:                if (exact)
190:                    zipOut.write(0);
191:            }
192:
193:            /** Adds a word and a nonword to the current text field, provided that a text field has {@linkplain #startTextField() started} but not yet {@linkplain #endTextField() ended};
194:             *  otherwise, doesn't do anything.
195:             *
196:             * <p>Usually, <code>word</code> e <code>nonWord</code> are just the result of a call
197:             * to {@link WordReader#next(MutableString, MutableString)}.
198:             *  
199:             * @param word a word.
200:             * @param nonWord a nonword.
201:             * */
202:            public void add(final MutableString word,
203:                    final MutableString nonWord) throws IOException {
204:                if (!inTextField)
205:                    return;
206:                if (DEBUG)
207:                    LOGGER.debug("Going to write pair <" + word + "|" + nonWord
208:                            + ">");
209:                if (exact || word.length() > 0)
210:                    word.writeSelfDelimUTF8(zipOut);
211:                if (exact)
212:                    nonWord.writeSelfDelimUTF8(zipOut);
213:            }
214:
215:            /** Terminates the contruction of the zipped collection and returns it. */
216:
217:            public ZipDocumentCollection close() throws IOException {
218:                zipOut.close();
219:                return new ZipDocumentCollection(zipFilename, factory,
220:                        numberOfDocuments, exact);
221:            }
222:
223:            /** A utility method copying all documents of an input sequence to a zipped collection. */
224:
225:            @SuppressWarnings("unchecked")
226:            public ZipDocumentCollection build(
227:                    final DocumentSequence inputSequence) throws IOException {
228:                progressLogger.start("Zipping collection...");
229:                numberOfDocuments = 0;
230:
231:                final DocumentIterator docIt = inputSequence.iterator();
232:                if (factory != inputSequence.factory())
233:                    throw new IllegalStateException(
234:                            "The factory provided by the constructor does not correspond to the factory of the input sequence");
235:                final int numberOfFields = factory.numberOfFields();
236:                WordReader wordReader;
237:                MutableString word = new MutableString();
238:                MutableString nonWord = new MutableString();
239:
240:                for (;;) {
241:                    progressLogger.update();
242:                    Document document = docIt.nextDocument();
243:                    if (document == null)
244:                        break;
245:                    startDocument(document.title(), document.uri());
246:
247:                    for (int field = 0; field < numberOfFields; field++) {
248:                        Object content = document.content(field);
249:                        if (factory.fieldType(field) == FieldType.TEXT) {
250:                            startTextField();
251:                            wordReader = document.wordReader(field);
252:                            wordReader.setReader((Reader) content);
253:                            while (wordReader.next(word, nonWord))
254:                                add(word, nonWord);
255:                            endTextField();
256:                        } else if (factory.fieldType(field) == FieldType.VIRTUAL)
257:                            virtualField((ObjectList<VirtualDocumentFragment>) content);
258:                        else
259:                            nonTextField(content);
260:                    }
261:                    document.close();
262:                    endDocument();
263:                }
264:                progressLogger.done();
265:                docIt.close();
266:                return close();
267:            }
268:
269:            public static void main(final String[] arg) throws JSAPException,
270:                    IOException, ClassNotFoundException,
271:                    InvocationTargetException, NoSuchMethodException,
272:                    IllegalAccessException, InstantiationException {
273:
274:                SimpleJSAP jsap = new SimpleJSAP(
275:                        ZipDocumentCollectionBuilder.class.getName(),
276:                        "Produces a zip document collection from an existing document sequence.",
277:                        new Parameter[] {
278:                                new FlaggedOption("sequence",
279:                                        JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
280:                                        JSAP.NOT_REQUIRED, 'S', "sequence",
281:                                        "A serialised document sequence that will be used instead of stdin."),
282:                                new FlaggedOption(
283:                                        "factory",
284:                                        MG4JClassParser.getParser(),
285:                                        IdentityDocumentFactory.class.getName(),
286:                                        JSAP.NOT_REQUIRED, 'f', "factory",
287:                                        "A document factory with a standard constructor."),
288:                                new FlaggedOption("property",
289:                                        JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
290:                                        JSAP.NOT_REQUIRED, 'p', "property",
291:                                        "A 'key=value' specification, or the name of a property file")
292:                                        .setAllowMultipleDeclarations(true),
293:                                new FlaggedOption(
294:                                        "delimiter",
295:                                        JSAP.INTEGER_PARSER,
296:                                        Integer
297:                                                .toString(Scan.DEFAULT_DELIMITER),
298:                                        JSAP.NOT_REQUIRED, 'd', "delimiter",
299:                                        "The document delimiter."),
300:                                new Switch("approximated", 'a', "approximated",
301:                                        "If specified, non-words will not be copied."),
302:                                new FlaggedOption(
303:                                        "logInterval",
304:                                        JSAP.LONG_PARSER,
305:                                        Long
306:                                                .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
307:                                        JSAP.NOT_REQUIRED, 'l', "log-interval",
308:                                        "The minimum time interval between activity logs in milliseconds."),
309:                                new UnflaggedOption("collection",
310:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
311:                                        "The filename for the output document collection."),
312:                                new UnflaggedOption("zipfile",
313:                                        JSAP.STRING_PARSER, JSAP.REQUIRED,
314:                                        "The filename for the output zip file."), });
315:                JSAPResult jsapResult = jsap.parse(arg);
316:                if (jsap.messagePrinted())
317:                    return;
318:
319:                DocumentSequence documentSequence = Scan.getSequence(jsapResult
320:                        .getString("sequence"), jsapResult.getClass("factory"),
321:                        jsapResult.getStringArray("property"), jsapResult
322:                                .getInt("delimiter"), LOGGER);
323:                final ProgressLogger progressLogger = new ProgressLogger(
324:                        LOGGER, "documents");
325:                if (documentSequence instanceof  DocumentCollection)
326:                    progressLogger.expectedUpdates = ((DocumentCollection) documentSequence)
327:                            .size();
328:                ZipDocumentCollectionBuilder builder = new ZipDocumentCollectionBuilder(
329:                        jsapResult.getString("zipfile"), documentSequence
330:                                .factory(), !jsapResult
331:                                .getBoolean("approximated"), progressLogger);
332:                BinIO.storeObject(builder.build(documentSequence), jsapResult
333:                        .getString("collection"));
334:            }
335:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.