Source Code Cross Referenced for TermVectorsReader.java in  » Net » lucene-connector » org » apache » lucene » index » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Net » lucene connector » org.apache.lucene.index 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package org.apache.lucene.index;
002:
003:        /**
004:         * Licensed to the Apache Software Foundation (ASF) under one or more
005:         * contributor license agreements.  See the NOTICE file distributed with
006:         * this work for additional information regarding copyright ownership.
007:         * The ASF licenses this file to You under the Apache License, Version 2.0
008:         * (the "License"); you may not use this file except in compliance with
009:         * the License.  You may obtain a copy of the License at
010:         *
011:         *     http://www.apache.org/licenses/LICENSE-2.0
012:         *
013:         * Unless required by applicable law or agreed to in writing, software
014:         * distributed under the License is distributed on an "AS IS" BASIS,
015:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016:         * See the License for the specific language governing permissions and
017:         * limitations under the License.
018:         */
019:
020:        import org.apache.lucene.store.BufferedIndexInput;
021:        import org.apache.lucene.store.Directory;
022:        import org.apache.lucene.store.IndexInput;
023:
024:        import java.io.IOException;
025:
026:        /**
027:         * @version $Id: TermVectorsReader.java 601337 2007-12-05 13:59:37Z mikemccand $
028:         */
029:        class TermVectorsReader implements  Cloneable {
030:
031:            static final int FORMAT_VERSION = 2;
032:            //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file 
033:            static final int FORMAT_SIZE = 4;
034:
035:            static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
036:            static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
037:
038:            private FieldInfos fieldInfos;
039:
040:            private IndexInput tvx;
041:            private IndexInput tvd;
042:            private IndexInput tvf;
043:            private int size;
044:
045:            // The docID offset where our docs begin in the index
046:            // file.  This will be 0 if we have our own private file.
047:            private int docStoreOffset;
048:
049:            private int tvdFormat;
050:            private int tvfFormat;
051:
052:            TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
053:                    throws CorruptIndexException, IOException {
054:                this (d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
055:            }
056:
057:            TermVectorsReader(Directory d, String segment,
058:                    FieldInfos fieldInfos, int readBufferSize)
059:                    throws CorruptIndexException, IOException {
060:                this (d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE,
061:                        -1, 0);
062:            }
063:
064:            TermVectorsReader(Directory d, String segment,
065:                    FieldInfos fieldInfos, int readBufferSize,
066:                    int docStoreOffset, int size) throws CorruptIndexException,
067:                    IOException {
068:                boolean success = false;
069:
070:                try {
071:                    if (d.fileExists(segment + "."
072:                            + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
073:                        tvx = d.openInput(segment + "."
074:                                + IndexFileNames.VECTORS_INDEX_EXTENSION,
075:                                readBufferSize);
076:                        checkValidFormat(tvx);
077:                        tvd = d.openInput(segment + "."
078:                                + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION,
079:                                readBufferSize);
080:                        tvdFormat = checkValidFormat(tvd);
081:                        tvf = d.openInput(segment + "."
082:                                + IndexFileNames.VECTORS_FIELDS_EXTENSION,
083:                                readBufferSize);
084:                        tvfFormat = checkValidFormat(tvf);
085:                        if (-1 == docStoreOffset) {
086:                            this .docStoreOffset = 0;
087:                            this .size = (int) (tvx.length() >> 3);
088:                        } else {
089:                            this .docStoreOffset = docStoreOffset;
090:                            this .size = size;
091:                            // Verify the file is long enough to hold all of our
092:                            // docs
093:                            assert ((int) (tvx.length() / 8)) >= size
094:                                    + docStoreOffset;
095:                        }
096:                    }
097:
098:                    this .fieldInfos = fieldInfos;
099:                    success = true;
100:                } finally {
101:                    // With lock-less commits, it's entirely possible (and
102:                    // fine) to hit a FileNotFound exception above. In
103:                    // this case, we want to explicitly close any subset
104:                    // of things that were opened so that we don't have to
105:                    // wait for a GC to do so.
106:                    if (!success) {
107:                        close();
108:                    }
109:                }
110:            }
111:
112:            private int checkValidFormat(IndexInput in)
113:                    throws CorruptIndexException, IOException {
114:                int format = in.readInt();
115:                if (format > FORMAT_VERSION) {
116:                    throw new CorruptIndexException(
117:                            "Incompatible format version: " + format
118:                                    + " expected " + FORMAT_VERSION
119:                                    + " or less");
120:                }
121:                return format;
122:            }
123:
124:            void close() throws IOException {
125:                // make all effort to close up. Keep the first exception
126:                // and throw it as a new one.
127:                IOException keep = null;
128:                if (tvx != null)
129:                    try {
130:                        tvx.close();
131:                    } catch (IOException e) {
132:                        if (keep == null)
133:                            keep = e;
134:                    }
135:                if (tvd != null)
136:                    try {
137:                        tvd.close();
138:                    } catch (IOException e) {
139:                        if (keep == null)
140:                            keep = e;
141:                    }
142:                if (tvf != null)
143:                    try {
144:                        tvf.close();
145:                    } catch (IOException e) {
146:                        if (keep == null)
147:                            keep = e;
148:                    }
149:                if (keep != null)
150:                    throw (IOException) keep.fillInStackTrace();
151:            }
152:
153:            /**
154:             * 
155:             * @return The number of documents in the reader
156:             */
157:            int size() {
158:                return size;
159:            }
160:
161:            public void get(int docNum, String field, TermVectorMapper mapper)
162:                    throws IOException {
163:                if (tvx != null) {
164:                    int fieldNumber = fieldInfos.fieldNumber(field);
165:                    //We need to account for the FORMAT_SIZE at when seeking in the tvx
166:                    //We don't need to do this in other seeks because we already have the
167:                    // file pointer
168:                    //that was written in another file
169:                    tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
170:                    //System.out.println("TVX Pointer: " + tvx.getFilePointer());
171:                    long position = tvx.readLong();
172:
173:                    tvd.seek(position);
174:                    int fieldCount = tvd.readVInt();
175:                    //System.out.println("Num Fields: " + fieldCount);
176:                    // There are only a few fields per document. We opt for a full scan
177:                    // rather then requiring that they be ordered. We need to read through
178:                    // all of the fields anyway to get to the tvf pointers.
179:                    int number = 0;
180:                    int found = -1;
181:                    for (int i = 0; i < fieldCount; i++) {
182:                        if (tvdFormat == FORMAT_VERSION)
183:                            number = tvd.readVInt();
184:                        else
185:                            number += tvd.readVInt();
186:
187:                        if (number == fieldNumber)
188:                            found = i;
189:                    }
190:
191:                    // This field, although valid in the segment, was not found in this
192:                    // document
193:                    if (found != -1) {
194:                        // Compute position in the tvf file
195:                        position = 0;
196:                        for (int i = 0; i <= found; i++)
197:                            position += tvd.readVLong();
198:
199:                        mapper.setDocumentNumber(docNum);
200:                        readTermVector(field, position, mapper);
201:                    } else {
202:                        //System.out.println("Fieldable not found");
203:                    }
204:                } else {
205:                    //System.out.println("No tvx file");
206:                }
207:            }
208:
209:            /**
210:             * Retrieve the term vector for the given document and field
211:             * @param docNum The document number to retrieve the vector for
212:             * @param field The field within the document to retrieve
213:             * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
214:             * @throws IOException if there is an error reading the term vector files
215:             */
216:            TermFreqVector get(int docNum, String field) throws IOException {
217:                // Check if no term vectors are available for this segment at all
218:                ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
219:                get(docNum, field, mapper);
220:
221:                return mapper.materializeVector();
222:            }
223:
224:            /**
225:             * Return all term vectors stored for this document or null if the could not be read in.
226:             * 
227:             * @param docNum The document number to retrieve the vector for
228:             * @return All term frequency vectors
229:             * @throws IOException if there is an error reading the term vector files 
230:             */
231:            TermFreqVector[] get(int docNum) throws IOException {
232:                TermFreqVector[] result = null;
233:                if (tvx != null) {
234:                    //We need to offset by
235:                    tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
236:                    long position = tvx.readLong();
237:
238:                    tvd.seek(position);
239:                    int fieldCount = tvd.readVInt();
240:
241:                    // No fields are vectorized for this document
242:                    if (fieldCount != 0) {
243:                        int number = 0;
244:                        String[] fields = new String[fieldCount];
245:
246:                        for (int i = 0; i < fieldCount; i++) {
247:                            if (tvdFormat == FORMAT_VERSION)
248:                                number = tvd.readVInt();
249:                            else
250:                                number += tvd.readVInt();
251:
252:                            fields[i] = fieldInfos.fieldName(number);
253:                        }
254:
255:                        // Compute position in the tvf file
256:                        position = 0;
257:                        long[] tvfPointers = new long[fieldCount];
258:                        for (int i = 0; i < fieldCount; i++) {
259:                            position += tvd.readVLong();
260:                            tvfPointers[i] = position;
261:                        }
262:
263:                        result = readTermVectors(docNum, fields, tvfPointers);
264:                    }
265:                } else {
266:                    //System.out.println("No tvx file");
267:                }
268:                return result;
269:            }
270:
271:            public void get(int docNumber, TermVectorMapper mapper)
272:                    throws IOException {
273:                // Check if no term vectors are available for this segment at all
274:                if (tvx != null) {
275:                    //We need to offset by
276:                    tvx.seek((docNumber * 8L) + FORMAT_SIZE);
277:                    long position = tvx.readLong();
278:
279:                    tvd.seek(position);
280:                    int fieldCount = tvd.readVInt();
281:
282:                    // No fields are vectorized for this document
283:                    if (fieldCount != 0) {
284:                        int number = 0;
285:                        String[] fields = new String[fieldCount];
286:
287:                        for (int i = 0; i < fieldCount; i++) {
288:                            if (tvdFormat == FORMAT_VERSION)
289:                                number = tvd.readVInt();
290:                            else
291:                                number += tvd.readVInt();
292:
293:                            fields[i] = fieldInfos.fieldName(number);
294:                        }
295:
296:                        // Compute position in the tvf file
297:                        position = 0;
298:                        long[] tvfPointers = new long[fieldCount];
299:                        for (int i = 0; i < fieldCount; i++) {
300:                            position += tvd.readVLong();
301:                            tvfPointers[i] = position;
302:                        }
303:
304:                        mapper.setDocumentNumber(docNumber);
305:                        readTermVectors(fields, tvfPointers, mapper);
306:                    }
307:                } else {
308:                    //System.out.println("No tvx file");
309:                }
310:            }
311:
312:            private SegmentTermVector[] readTermVectors(int docNum,
313:                    String fields[], long tvfPointers[]) throws IOException {
314:                SegmentTermVector res[] = new SegmentTermVector[fields.length];
315:                for (int i = 0; i < fields.length; i++) {
316:                    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
317:                    mapper.setDocumentNumber(docNum);
318:                    readTermVector(fields[i], tvfPointers[i], mapper);
319:                    res[i] = (SegmentTermVector) mapper.materializeVector();
320:                }
321:                return res;
322:            }
323:
324:            private void readTermVectors(String fields[], long tvfPointers[],
325:                    TermVectorMapper mapper) throws IOException {
326:                for (int i = 0; i < fields.length; i++) {
327:                    readTermVector(fields[i], tvfPointers[i], mapper);
328:                }
329:
330:            }
331:
332:            /**
333:             * 
334:             * @param field The field to read in
335:             * @param tvfPointer The pointer within the tvf file where we should start reading
336:             * @param mapper The mapper used to map the TermVector
337:             * @return The TermVector located at that position
338:             * @throws IOException
339:
340:             */
341:            private void readTermVector(String field, long tvfPointer,
342:                    TermVectorMapper mapper) throws IOException {
343:
344:                // Now read the data from specified position
345:                //We don't need to offset by the FORMAT here since the pointer already includes the offset
346:                tvf.seek(tvfPointer);
347:
348:                int numTerms = tvf.readVInt();
349:                //System.out.println("Num Terms: " + numTerms);
350:                // If no terms - return a constant empty termvector. However, this should never occur!
351:                if (numTerms == 0)
352:                    return;
353:
354:                boolean storePositions;
355:                boolean storeOffsets;
356:
357:                if (tvfFormat == FORMAT_VERSION) {
358:                    byte bits = tvf.readByte();
359:                    storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
360:                    storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
361:                } else {
362:                    tvf.readVInt();
363:                    storePositions = false;
364:                    storeOffsets = false;
365:                }
366:                mapper.setExpectations(field, numTerms, storeOffsets,
367:                        storePositions);
368:                int start = 0;
369:                int deltaLength = 0;
370:                int totalLength = 0;
371:                char[] buffer = new char[10]; // init the buffer with a length of 10 character
372:                char[] previousBuffer = {};
373:
374:                for (int i = 0; i < numTerms; i++) {
375:                    start = tvf.readVInt();
376:                    deltaLength = tvf.readVInt();
377:                    totalLength = start + deltaLength;
378:                    if (buffer.length < totalLength) { // increase buffer
379:                        buffer = null; // give a hint to garbage collector
380:                        buffer = new char[totalLength];
381:
382:                        if (start > 0) // just copy if necessary
383:                            System.arraycopy(previousBuffer, 0, buffer, 0,
384:                                    start);
385:                    }
386:
387:                    tvf.readChars(buffer, start, deltaLength);
388:                    String term = new String(buffer, 0, totalLength);
389:                    previousBuffer = buffer;
390:                    int freq = tvf.readVInt();
391:                    int[] positions = null;
392:                    if (storePositions) { //read in the positions
393:                        //does the mapper even care about positions?
394:                        if (mapper.isIgnoringPositions() == false) {
395:                            positions = new int[freq];
396:                            int prevPosition = 0;
397:                            for (int j = 0; j < freq; j++) {
398:                                positions[j] = prevPosition + tvf.readVInt();
399:                                prevPosition = positions[j];
400:                            }
401:                        } else {
402:                            //we need to skip over the positions.  Since these are VInts, I don't believe there is anyway to know for sure how far to skip
403:                            //
404:                            for (int j = 0; j < freq; j++) {
405:                                tvf.readVInt();
406:                            }
407:                        }
408:                    }
409:                    TermVectorOffsetInfo[] offsets = null;
410:                    if (storeOffsets) {
411:                        //does the mapper even care about offsets?
412:                        if (mapper.isIgnoringOffsets() == false) {
413:                            offsets = new TermVectorOffsetInfo[freq];
414:                            int prevOffset = 0;
415:                            for (int j = 0; j < freq; j++) {
416:                                int startOffset = prevOffset + tvf.readVInt();
417:                                int endOffset = startOffset + tvf.readVInt();
418:                                offsets[j] = new TermVectorOffsetInfo(
419:                                        startOffset, endOffset);
420:                                prevOffset = endOffset;
421:                            }
422:                        } else {
423:                            for (int j = 0; j < freq; j++) {
424:                                tvf.readVInt();
425:                                tvf.readVInt();
426:                            }
427:                        }
428:                    }
429:                    mapper.map(term, freq, offsets, positions);
430:                }
431:            }
432:
433:            protected Object clone() {
434:
435:                if (tvx == null || tvd == null || tvf == null)
436:                    return null;
437:
438:                TermVectorsReader clone = null;
439:                try {
440:                    clone = (TermVectorsReader) super .clone();
441:                } catch (CloneNotSupportedException e) {
442:                }
443:
444:                clone.tvx = (IndexInput) tvx.clone();
445:                clone.tvd = (IndexInput) tvd.clone();
446:                clone.tvf = (IndexInput) tvf.clone();
447:
448:                return clone;
449:            }
450:
451:        }
452:
453:        /**
454:         * Models the existing parallel array structure
455:         */
456:        class ParallelArrayTermVectorMapper extends TermVectorMapper {
457:
458:            private String[] terms;
459:            private int[] termFreqs;
460:            private int positions[][];
461:            private TermVectorOffsetInfo offsets[][];
462:            private int currentPosition;
463:            private boolean storingOffsets;
464:            private boolean storingPositions;
465:            private String field;
466:
467:            public void setExpectations(String field, int numTerms,
468:                    boolean storeOffsets, boolean storePositions) {
469:                this .field = field;
470:                terms = new String[numTerms];
471:                termFreqs = new int[numTerms];
472:                this .storingOffsets = storeOffsets;
473:                this .storingPositions = storePositions;
474:                if (storePositions)
475:                    this .positions = new int[numTerms][];
476:                if (storeOffsets)
477:                    this .offsets = new TermVectorOffsetInfo[numTerms][];
478:            }
479:
480:            public void map(String term, int frequency,
481:                    TermVectorOffsetInfo[] offsets, int[] positions) {
482:                terms[currentPosition] = term;
483:                termFreqs[currentPosition] = frequency;
484:                if (storingOffsets) {
485:                    this .offsets[currentPosition] = offsets;
486:                }
487:                if (storingPositions) {
488:                    this .positions[currentPosition] = positions;
489:                }
490:                currentPosition++;
491:            }
492:
493:            /**
494:             * Construct the vector
495:             * @return The {@link TermFreqVector} based on the mappings.
496:             */
497:            public TermFreqVector materializeVector() {
498:                SegmentTermVector tv = null;
499:                if (field != null && terms != null) {
500:                    if (storingPositions || storingOffsets) {
501:                        tv = new SegmentTermPositionVector(field, terms,
502:                                termFreqs, positions, offsets);
503:                    } else {
504:                        tv = new SegmentTermVector(field, terms, termFreqs);
505:                    }
506:                }
507:                return tv;
508:            }
509:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.