Source Code Cross Referenced for ZipDocumentCollection.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » document » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.document
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        package it.unimi.dsi.mg4j.document;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2005-2007 Paolo Boldi  
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.Util;
025:        import it.unimi.dsi.fastutil.objects.ObjectArrayList;
026:        import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
027:        import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
028:        import it.unimi.dsi.io.FastBufferedReader;
029:        import it.unimi.dsi.io.WordReader;
030:        import it.unimi.dsi.lang.MutableString;
031:        import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
032:        import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
033:        import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
034:
035:        import java.io.EOFException;
036:        import java.io.File;
037:        import java.io.FileInputStream;
038:        import java.io.FileNotFoundException;
039:        import java.io.IOException;
040:        import java.io.InputStream;
041:        import java.io.ObjectInputStream;
042:        import java.io.Reader;
043:        import java.io.Serializable;
044:        import java.util.NoSuchElementException;
045:        import java.util.zip.ZipEntry;
046:        import java.util.zip.ZipFile;
047:        import java.util.zip.ZipInputStream;
048:
049:        import org.apache.log4j.Logger;
050:
051:        /** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} produced from a document
052:         * sequence using {@link it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder}.
053:         * 
054:         * <p>The collection will produce the same documents as the original sequence whence it
055:         * was produced, in the following sense:
056:         * 
057:         *  <ul>
058:         *    <li>the resulting collection has as many document as the original sequence, in the same order, with
059:         *     the same titles and URI;
060:         *    <li>every document has the same number of fields, with the same names and types;
061:         *    <li>non-textual non-virtual fields will be written out as objects, so they need to be serializable;
062:         *    <li>virtual fields will be written as a sequence of {@linkplain MutableString#writeSelfDelimUTF8(java.io.DataOutput) self-delimiting UTF-8 mutable strings}
063:         *     starting with the number of fragments (converted into a string with {@link String#valueOf(int)}),
064:         *     followed by a pair of strings for each fragment (the first string being the document specifier,
065:         *     and the second being the associated text);
066:         *    <li>textual fields will be written out in such a way that, when reading them, the same sequence
067:         *     of words and non-words will be produced; alternatively, one may produce a collection that only
068:         *     copies words (non-words are not copied). 
069:         *  </ul>
070:         *  
071:         * <p><strong>Warning:</strong> the {@link java.io.Reader} returned by {@link it.unimi.dsi.mg4j.document.Document#content(int)}
072:         * for documents produced by this factory is just obtained as the concatenation of words and non-words returned by
073:         * the word reader for that field.
074:         * 
075:         * <p>The collection will be, as any other collection, serialized on a file, but it will refer to another
076:         * zip file that is going to contain the documents themselves.
077:         */
078:        public class ZipDocumentCollection extends AbstractDocumentCollection
079:                implements  Serializable {
080:            private static final long serialVersionUID = 1L;
081:
082:            private static final Logger LOGGER = Util
083:                    .getLogger(ZipDocumentCollection.class);
084:            private static final boolean DEBUG = false;
085:
086:            /** The name of the zip collection file. */
087:            private final String zipFilename;
088:            /** The zip collection file. */
089:            private transient ZipFile zipFile;
090:            /** The factory used for the original document sequence. */
091:            private final DocumentFactory underlyingFactory;
092:            /** The factory used for this document collection. */
093:            private transient DocumentFactory factory;
094:            /** The number of documents. */
095:            private final int numberOfDocuments;
096:            /** <code>true</code> iff this is an exact reproduction of the original sequence (i.e., if also non-words are preserved). */
097:            private final boolean exact;
098:
099:            /** A factory tightly coupled to a {@link ZipDocumentCollection}. */
100:            protected static class ZipFactory extends AbstractDocumentFactory {
101:                private static final long serialVersionUID = 1L;
102:
103:                private final boolean exact;
104:                private final DocumentFactory underlyingFactory;
105:
106:                protected ZipFactory(final boolean exact,
107:                        final DocumentFactory underlyingFactory) {
108:                    this .exact = exact;
109:                    this .underlyingFactory = underlyingFactory;
110:                }
111:
112:                public ZipFactory copy() {
113:                    return this ;
114:                }
115:
116:                public int numberOfFields() {
117:                    return underlyingFactory.numberOfFields();
118:                }
119:
120:                public String fieldName(final int field) {
121:                    ensureFieldIndex(field);
122:                    return underlyingFactory.fieldName(field);
123:                }
124:
125:                public int fieldIndex(final String fieldName) {
126:                    return underlyingFactory.fieldIndex(fieldName);
127:                }
128:
129:                public FieldType fieldType(final int field) {
130:                    ensureFieldIndex(field);
131:                    return underlyingFactory.fieldType(field);
132:                }
133:
134:                public Document getDocument(final InputStream rawContent,
135:                        final Reference2ObjectMap<Enum<?>, Object> metadata)
136:                        throws IOException {
137:                    return new AbstractDocument() {
138:                        int nextFieldToRead = 0;
139:                        final MutableString uri = new MutableString();
140:
141:                        {
142:                            uri.readSelfDelimUTF8(rawContent).compact();
143:                        }
144:
145:                        public CharSequence title() {
146:                            return (CharSequence) metadata
147:                                    .get(MetadataKeys.TITLE);
148:                        }
149:
150:                        public String toString() {
151:                            return title().toString();
152:                        }
153:
154:                        public CharSequence uri() {
155:                            return uri;
156:                        }
157:
158:                        /** Skips until the end of the current field, and increments <code>nextFieldToRead</code>.
159:                         * @throws ClassNotFoundException
160:                         * @throws IOException
161:                         */
162:                        private void skipOneField() throws IOException,
163:                                ClassNotFoundException {
164:                            switch (fieldType(nextFieldToRead)) {
165:                            case TEXT:
166:                                MutableString word = new MutableString();
167:                                MutableString nonWord = new MutableString();
168:                                do {
169:                                    word.readSelfDelimUTF8(rawContent);
170:                                    if (exact)
171:                                        nonWord.readSelfDelimUTF8(rawContent);
172:                                } while (word.length() > 0
173:                                        || (exact && nonWord.length() > 0));
174:                                break;
175:                            case VIRTUAL:
176:                                MutableString dummy = new MutableString();
177:                                int nfrag = Integer.parseInt(dummy
178:                                        .readSelfDelimUTF8(rawContent)
179:                                        .toString());
180:                                for (int i = 0; i < 2 * nfrag; i++)
181:                                    dummy.readSelfDelimUTF8(rawContent);
182:                                break;
183:                            default: // Non-text and non-virtual
184:                                new ObjectInputStream(rawContent).readObject();
185:                            }
186:                            nextFieldToRead++;
187:                        }
188:
189:                        /** Skips to the given field.
190:                         * 
191:                         * @param field the field to skip to.
192:                         * @throws IOException
193:                         * @throws ClassNotFoundException
194:                         */
195:                        private void skipToField(final int field)
196:                                throws IOException, ClassNotFoundException {
197:                            if (nextFieldToRead > field)
198:                                throw new IllegalStateException(
199:                                        "Trying to skip to field " + field
200:                                                + " after " + nextFieldToRead);
201:                            while (nextFieldToRead < field)
202:                                skipOneField();
203:                        }
204:
205:                        public Object content(final int field) {
206:                            ensureFieldIndex(field);
207:                            Object result = null;
208:                            if (DEBUG)
209:                                LOGGER.debug("Called content(" + field
210:                                        + "); nextField:" + nextFieldToRead);
211:                            try {
212:                                skipToField(field);
213:                                if (fieldType(nextFieldToRead) == FieldType.VIRTUAL) {
214:                                    int nfrag = Integer
215:                                            .parseInt(new MutableString()
216:                                                    .readSelfDelimUTF8(
217:                                                            rawContent)
218:                                                    .toString());
219:                                    MutableString doc = new MutableString();
220:                                    MutableString text = new MutableString();
221:                                    VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[nfrag];
222:                                    for (int i = 0; i < nfrag; i++) {
223:                                        doc.readSelfDelimUTF8(rawContent);
224:                                        text.readSelfDelimUTF8(rawContent);
225:                                        fragArray[i] = new AnchorExtractor.Anchor(
226:                                                doc.copy(), text.copy());
227:                                    }
228:                                    result = new ObjectArrayList<VirtualDocumentFragment>(
229:                                            fragArray);
230:                                } else if (fieldType(nextFieldToRead) != FieldType.TEXT) {
231:                                    result = new ObjectInputStream(rawContent)
232:                                            .readObject();
233:                                    if (DEBUG)
234:                                        LOGGER.debug("Read " + result
235:                                                + " from field "
236:                                                + fieldName(nextFieldToRead)
237:                                                + " of object " + title());
238:                                    nextFieldToRead++;
239:                                } else {
240:                                    if (DEBUG)
241:                                        LOGGER.debug("Returning reader for "
242:                                                + field);
243:                                    result = new Reader() {
244:                                        FastBufferedReader fbr = null;
245:                                        int f = field;
246:
247:                                        public void close() {
248:                                        }
249:
250:                                        public int read(final char[] cbuf,
251:                                                final int off, final int len)
252:                                                throws IOException {
253:                                            if (fbr == null) {
254:                                                if (DEBUG)
255:                                                    LOGGER
256:                                                            .debug("Initialising reader for content "
257:                                                                    + f);
258:                                                MutableString text = new MutableString();
259:                                                MutableString word = new MutableString();
260:                                                MutableString nonWord = new MutableString();
261:                                                do {
262:                                                    text
263:                                                            .append(word
264:                                                                    .readSelfDelimUTF8(rawContent));
265:                                                    if (exact)
266:                                                        text
267:                                                                .append(nonWord
268:                                                                        .readSelfDelimUTF8(rawContent));
269:                                                } while (word.length() > 0
270:                                                        || (exact && nonWord
271:                                                                .length() > 0));
272:                                                fbr = new FastBufferedReader(
273:                                                        text);
274:                                                nextFieldToRead++;
275:                                            }
276:                                            return fbr.read(cbuf, off, len);
277:                                        }
278:                                    };
279:                                }
280:                            } catch (IOException e) {
281:                                throw new RuntimeException(e);
282:                            } catch (ClassNotFoundException e) {
283:                                throw new RuntimeException(e);
284:                            }
285:                            return result;
286:                        }
287:
288:                        public WordReader wordReader(final int field) {
289:                            ensureFieldIndex(field);
290:                            if (DEBUG)
291:                                LOGGER
292:                                        .debug("Called wordReader(" + field
293:                                                + ")");
294:                            try {
295:                                skipToField(field);
296:                            } catch (Exception e) {
297:                                throw new RuntimeException(e);
298:                            }
299:                            //logger.debug( "Asked for a new word reader for field " + fieldName( field ) );
300:                            switch (fieldType(field)) {
301:                            case TEXT:
302:                                return new WordReader() {
303:                                    private static final long serialVersionUID = 1L;
304:
305:                                    public boolean next(
306:                                            final MutableString word,
307:                                            final MutableString nonWord)
308:                                            throws IOException {
309:                                        try {
310:                                            word.readSelfDelimUTF8(rawContent);
311:                                        } catch (EOFException e) {
312:                                            return false; // TODO: a bit raw
313:                                        }
314:                                        nonWord.length(0);
315:
316:                                        if (exact) {
317:                                            try {
318:                                                nonWord
319:                                                        .readSelfDelimUTF8(rawContent);
320:                                            } catch (EOFException e) {
321:                                                return true; // TODO: a bit raw
322:                                            }
323:                                        }
324:
325:                                        final boolean goOn = word.length() != 0
326:                                                || (exact && nonWord.length() != 0);
327:                                        if (DEBUG)
328:                                            LOGGER.debug("Got word <" + word
329:                                                    + "|" + nonWord
330:                                                    + "> exact=" + exact
331:                                                    + " returning " + goOn);
332:                                        if (!goOn)
333:                                            nextFieldToRead++;
334:                                        return goOn;
335:                                    }
336:
337:                                    public WordReader setReader(
338:                                            final Reader reader) {
339:                                        return this ;
340:                                    }
341:
342:                                    public WordReader copy() {
343:                                        throw new UnsupportedOperationException();
344:                                    }
345:                                };
346:                            case VIRTUAL:
347:                                return new FastBufferedReader();
348:                            default:
349:                                return null;
350:                            }
351:
352:                        }
353:                    };
354:                }
355:            }
356:
357:            /** Constructs a document collection (for reading) corresponding to a given zip collection file.
358:             * 
359:             * @param zipFilename the filename of the zip collection.
360:             * @param underlyingFactory the underlying document factory.
361:             * @param numberOfDocuments the number of documents.
362:             * @param exact <code>true</code> iff this is an exact reproduction of the original sequence.
363:             * @throws IOException
364:             */
365:            public ZipDocumentCollection(final String zipFilename,
366:                    final DocumentFactory underlyingFactory,
367:                    final int numberOfDocuments, final boolean exact)
368:                    throws IOException {
369:                this .zipFilename = zipFilename;
370:                this .underlyingFactory = underlyingFactory;
371:                this .numberOfDocuments = numberOfDocuments;
372:                this .exact = exact;
373:                zipFile = new ZipFile(new File(zipFilename));
374:                // Creates the factory
375:                factory = new ZipFactory(exact, underlyingFactory);
376:            }
377:
378:            public ZipDocumentCollection copy() {
379:                try {
380:                    return new ZipDocumentCollection(zipFilename,
381:                            underlyingFactory, numberOfDocuments, exact);
382:                } catch (IOException e) {
383:                    throw new RuntimeException(e);
384:                }
385:            }
386:
387:            private Object readResolve() throws IOException {
388:                super .close();
389:                return new ZipDocumentCollection(zipFilename,
390:                        underlyingFactory, numberOfDocuments, exact);
391:            }
392:
393:            public DocumentFactory factory() {
394:                return factory;
395:            }
396:
397:            public int size() {
398:                return numberOfDocuments;
399:            }
400:
401:            private ZipEntry getEntry(final int index) {
402:                ensureDocumentIndex(index);
403:                final ZipEntry entry = zipFile
404:                        .getEntry(Integer.toString(index));
405:                if (entry == null)
406:                    throw new NoSuchElementException(
407:                            "Failure retrieving entry " + index);
408:                return entry;
409:            }
410:
411:            public Document document(final int index) throws IOException {
412:                final ZipEntry entry = getEntry(index);
413:                final Reference2ObjectMap<Enum<?>, Object> metadata = metadata(
414:                        index, entry);
415:                InputStream is = zipFile.getInputStream(entry);
416:                return factory.getDocument(is, metadata);
417:            }
418:
419:            private Reference2ObjectMap<Enum<?>, Object> metadata(
420:                    final int index, ZipEntry entry) {
421:                if (entry == null)
422:                    entry = getEntry(index);
423:                final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(
424:                        1);
425:                metadata.put(MetadataKeys.TITLE, entry.getComment());
426:                return metadata;
427:            }
428:
429:            public Reference2ObjectMap<Enum<?>, Object> metadata(final int index) {
430:                return metadata(index, null);
431:            }
432:
433:            public InputStream stream(final int index) throws IOException {
434:                final ZipEntry entry = zipFile
435:                        .getEntry(Integer.toString(index));
436:                entry.getComment(); // Just skip title
437:                InputStream is = zipFile.getInputStream(entry);
438:                return is;
439:            }
440:
441:            public DocumentIterator iterator() {
442:                try {
443:                    return new AbstractDocumentIterator() {
444:                        final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(
445:                                new Enum[1], new Object[1]);
446:
447:                        ZipInputStream zis = new ZipInputStream(
448:                                new FileInputStream(zipFile.getName()));
449:
450:                        public Document nextDocument() throws IOException {
451:                            ZipEntry entry;
452:                            String name;
453:                            do {
454:                                entry = zis.getNextEntry();
455:                                if (entry == null)
456:                                    return null;
457:                                name = entry.getName();
458:                            } while (!Character.isDigit(name.charAt(0)));
459:                            if (entry == null)
460:                                return null;
461:                            String title = entry.getComment();
462:                            if (DEBUG)
463:                                LOGGER.debug("Reading sequentially document "
464:                                        + title + ", name: " + entry.getName());
465:                            InputStream is = zipFile.getInputStream(entry);
466:                            metadata.put(MetadataKeys.TITLE, title);
467:                            return factory.getDocument(is, metadata);
468:                        }
469:                    };
470:                } catch (FileNotFoundException e) {
471:                    throw new RuntimeException(e);
472:                }
473:            }
474:
475:            public void close() throws IOException {
476:                super.close();
477:                zipFile.close();
478:            }
479:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.