Source Code Cross Referenced for MemoryIndex.java in » Net » lucene-connector » org.apache.lucene.index.memory



0001:        package org.apache.lucene.index.memory;
0002:
0003:        /**
0004:         * Licensed to the Apache Software Foundation (ASF) under one or more
0005:         * contributor license agreements.  See the NOTICE file distributed with
0006:         * this work for additional information regarding copyright ownership.
0007:         * The ASF licenses this file to You under the Apache License, Version 2.0
0008:         * (the "License"); you may not use this file except in compliance with
0009:         * the License.  You may obtain a copy of the License at
0010:         *
0011:         *     http://www.apache.org/licenses/LICENSE-2.0
0012:         *
0013:         * Unless required by applicable law or agreed to in writing, software
0014:         * distributed under the License is distributed on an "AS IS" BASIS,
0015:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0016:         * See the License for the specific language governing permissions and
0017:         * limitations under the License.
0018:         */
0019:
0020:        import java.io.IOException;
0021:        import java.io.Serializable;
0022:        import java.util.Arrays;
0023:        import java.util.Collection;
0024:        import java.util.Collections;
0025:        import java.util.Comparator;
0026:        import java.util.HashMap;
0027:        import java.util.Iterator;
0028:        import java.util.Map;
0029:
0030:        import org.apache.lucene.analysis.Analyzer;
0031:        import org.apache.lucene.analysis.Token;
0032:        import org.apache.lucene.analysis.TokenStream;
0033:        import org.apache.lucene.document.Document;
0034:        import org.apache.lucene.document.Field;
0035:        import org.apache.lucene.document.FieldSelector;
0036:        import org.apache.lucene.index.IndexReader;
0037:        import org.apache.lucene.index.Term;
0038:        import org.apache.lucene.index.TermDocs;
0039:        import org.apache.lucene.index.TermEnum;
0040:        import org.apache.lucene.index.TermFreqVector;
0041:        import org.apache.lucene.index.TermPositionVector;
0042:        import org.apache.lucene.index.TermPositions;
0043:        import org.apache.lucene.index.TermVectorMapper;
0044:        import org.apache.lucene.search.HitCollector;
0045:        import org.apache.lucene.search.IndexSearcher;
0046:        import org.apache.lucene.search.Query;
0047:        import org.apache.lucene.search.Searcher;
0048:        import org.apache.lucene.search.Similarity;
0049:
0050:        /**
0051:         * High-performance single-document main memory Apache Lucene fulltext search index. 
0052:         * 
0053:         * <h4>Overview</h4>
0054:         * 
0055:         * This class is a replacement/substitute for a large subset of
0056:         * {@link org.apache.lucene.store.RAMDirectory} functionality. It is designed to
0057:         * enable maximum efficiency for on-the-fly matchmaking combining structured and 
0058:         * fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML 
0059:         * message queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and 
0060:         * distribution systems, application level routers, firewalls, classifiers, etc. 
0062:         * Rather than targeting fulltext search of infrequent queries over huge persistent 
0062:         * data archives (historic search), this class targets fulltext search of huge 
0063:         * numbers of queries over comparatively small transient realtime data (prospective 
0064:         * search). 
0065:         * For example as in 
0066:         * <pre>
0067:         * float score = search(String text, Query query)
0068:         * </pre>
0069:         * <p>
0070:         * Each instance can hold at most one Lucene "document", with a document containing
0071:         * zero or more "fields", each field having a name and a fulltext value. The
0072:         * fulltext value is tokenized (split and transformed) into zero or more index terms 
0073:         * (aka words) on <code>addField()</code>, according to the policy implemented by an
0074:         * Analyzer. For example, Lucene analyzers can split on whitespace, normalize to lower case
0075:         * for case insensitivity, ignore common terms with little discriminatory value such as "he", "in", "and" (stop
0076:         * words), reduce the terms to their natural linguistic root form such as "fishing"
0077:         * being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri 
0078:         * (upon indexing and/or querying), etc. For details, see
0079:         * <a target="_blank" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Analyzer Intro</a>.
0080:         * <p>
0081:         * Arbitrary Lucene queries can be run against this class - see <a target="_blank" 
0082:         * href="http://lucene.apache.org/java/docs/queryparsersyntax.html">Lucene Query Syntax</a>
0083:         * as well as <a target="_blank" 
0084:         * href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">Query Parser Rules</a>.
0085:         * Note that a Lucene query selects on the field names and associated (indexed) 
0086:         * tokenized terms, not on the original fulltext(s) - the latter are not stored 
0087:         * but rather thrown away immediately after tokenization.
0088:         * <p>
0089:         * For some interesting background information on search technology, see Bob Wyman's
0090:         * <a target="_blank" 
0091:         * href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>, 
0092:         * Jim Gray's
0093:         * <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&pa=showpage&pid=293&page=4">
0094:         * A Call to Arms - Custom subscriptions</a>, and Tim Bray's
0095:         * <a target="_blank" 
0096:         * href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>.
0097:         * 
0098:         * 
0099:         * <h4>Example Usage</h4> 
0100:         * 
0101:         * <pre>
0102:         * Analyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER;
0103:         * //Analyzer analyzer = new SimpleAnalyzer();
0104:         * MemoryIndex index = new MemoryIndex();
0105:         * index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer);
0106:         * index.addField("author", "Tales of James", analyzer);
0107:         * QueryParser parser = new QueryParser("content", analyzer);
0108:         * float score = index.search(parser.parse("+author:james +salmon~ +fish* manual~"));
0109:         * if (score &gt; 0.0f) {
0110:         *     System.out.println("it's a match");
0111:         * } else {
0112:         *     System.out.println("no match found");
0113:         * }
0114:         * System.out.println("indexData=" + index.toString());
0115:         * </pre>
0116:         * 
0117:         * 
0118:         * <h4>Example XQuery Usage</h4> 
0119:         * 
0120:         * <pre>
0121:         * (: An XQuery that finds all books authored by James that have something to do with "salmon fishing manuals", sorted by relevance :)
0122:         * declare namespace lucene = "java:nux.xom.pool.FullTextUtil";
0123:         * declare variable $query := "+salmon~ +fish* manual~"; (: any arbitrary Lucene query can go here :)
0124:         * 
0125:         * for $book in /books/book[author="James" and lucene:match(abstract, $query) > 0.0]
0126:         * let $score := lucene:match($book/abstract, $query)
0127:         * order by $score descending
0128:         * return $book
0129:         * </pre>
0130:         * 
0131:         * 
0132:         * <h4>No thread safety guarantees</h4>
0133:         * 
0134:         * An instance can be queried multiple times with the same or different queries,
0135:         * but an instance is not thread-safe. If desired use idioms such as:
0136:         * <pre>
0137:         * MemoryIndex index = ...
0138:         * synchronized (index) {
0139:         *    // read and/or write index (i.e. add fields and/or query)
0140:         * } 
0141:         * </pre>
0142:         * 
0143:         * 
0144:         * <h4>Performance Notes</h4>
0145:         * 
0146:         * Internally there's a new data structure geared towards efficient indexing 
0147:         * and searching, plus the necessary support code to seamlessly plug into the Lucene 
0148:         * framework.
0149:         * <p>
0150:         * This class performs very well for very small texts (e.g. 10 chars) 
0151:         * as well as for large texts (e.g. 10 MB) and everything in between. 
0152:         * Typically, it is about 10-100 times faster than <code>RAMDirectory</code>.
0153:         * Note that <code>RAMDirectory</code> has particularly 
0154:         * large efficiency overheads for small to medium sized texts, both in time and space.
0155:         * Indexing a field with N tokens takes O(N) in the best case, and O(N logN) in the worst 
0156:         * case. Memory consumption is probably larger than for <code>RAMDirectory</code>.
0157:         * <p>
0158:         * Example throughput of many simple term queries over a single MemoryIndex: 
0159:         * ~500000 queries/sec on a MacBook Pro, jdk 1.5.0_06, server VM. 
0160:         * As always, your mileage may vary.
0161:         * <p>
0162:         * If you're curious about
0163:         * the whereabouts of bottlenecks, run java 1.5 with the non-perturbing '-server
0164:         * -agentlib:hprof=cpu=samples,depth=10' flags, then study the trace log and
0165:         * correlate its hotspot trailer with its call stack headers (see <a
0166:         * target="_blank"
0167:         * href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
0168:         * hprof tracing </a>).
0169:         * 
0170:         * @author whoschek.AT.lbl.DOT.gov
0171:         */
0172:        public class MemoryIndex {
0173:
0174:            /** info for each field: Map<String fieldName, Info field> */
0175:            private final HashMap fields = new HashMap();
0176:
0177:            /** fields sorted ascending by fieldName; lazily computed on demand */
0178:            private transient Map.Entry[] sortedFields;
0179:
0180:            /** 1 if only positions are stored, 3 if offsets are stored too; layout: pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2] */
0181:            private final int stride;
0182:
0183:            /** Could be made configurable; See {@link Document#setBoost(float)} */
0184:            private static final float docBoost = 1.0f;
0185:
0186:            private static final long serialVersionUID = 2782195016849084649L;
0187:
0188:            private static final boolean DEBUG = false;
0189:
0190:            /**
0191:             * Sorts term entries into ascending order; also works for
0192:             * Arrays.binarySearch() and Arrays.sort()
0193:             */
0194:            private static final Comparator termComparator = new Comparator() {
0195:                public int compare(Object o1, Object o2) {
0196:                    if (o1 instanceof  Map.Entry)
0197:                        o1 = ((Map.Entry) o1).getKey();
0198:                    if (o2 instanceof  Map.Entry)
0199:                        o2 = ((Map.Entry) o2).getKey();
0200:                    if (o1 == o2)
0201:                        return 0;
0202:                    return ((String) o1).compareTo((String) o2);
0203:                }
0204:            };
0205:
0206:            /**
0207:             * Constructs an empty instance.
0208:             */
0209:            public MemoryIndex() {
0210:                this (false);
0211:            }
0212:
0213:            /**
0214:             * Constructs an empty instance that can optionally store the start and end
0215:             * character offset of each token term in the text. This can be useful for
0216:             * highlighting of hit locations with the Lucene highlighter package.
0217:             * Private until the highlighter package matures, so that this can actually
0218:             * be meaningfully integrated.
0219:             * 
0220:             * @param storeOffsets
0221:             *            whether or not to store the start and end character offset of
0222:             *            each token term in the text
0223:             */
0224:            private MemoryIndex(boolean storeOffsets) {
0225:                this .stride = storeOffsets ? 3 : 1;
0226:            }
0227:
0228:            /**
0229:             * Convenience method; Tokenizes the given field text and adds the resulting
0230:             * terms to the index; Equivalent to adding an indexed non-keyword Lucene
0231:             * {@link org.apache.lucene.document.Field} that is
0232:             * {@link org.apache.lucene.document.Field.Index#TOKENIZED tokenized},
0233:             * {@link org.apache.lucene.document.Field.Store#NO not stored},
0234:             * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS termVectorStored with positions} (or
0235:             * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS termVectorStored with positions and offsets}).
0236:             * 
0237:             * @param fieldName
0238:             *            a name to be associated with the text
0239:             * @param text
0240:             *            the text to tokenize and index.
0241:             * @param analyzer
0242:             *            the analyzer to use for tokenization
0243:             */
0244:            public void addField(String fieldName, String text,
0245:                    Analyzer analyzer) {
0246:                if (fieldName == null)
0247:                    throw new IllegalArgumentException(
0248:                            "fieldName must not be null");
0249:                if (text == null)
0250:                    throw new IllegalArgumentException("text must not be null");
0251:                if (analyzer == null)
0252:                    throw new IllegalArgumentException(
0253:                            "analyzer must not be null");
0254:
0255:                TokenStream stream;
0256:                if (analyzer instanceof  PatternAnalyzer) {
0257:                    stream = ((PatternAnalyzer) analyzer).tokenStream(
0258:                            fieldName, text);
0259:                } else {
0260:                    stream = analyzer.tokenStream(fieldName,
0261:                            new PatternAnalyzer.FastStringReader(text));
0262:                }
0263:                addField(fieldName, stream);
0264:            }
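            // A minimal usage sketch for the convenience method above; the analyzer,
            // field name and text are illustrative assumptions (any Analyzer works):
            //
            //   Analyzer analyzer = new org.apache.lucene.analysis.SimpleAnalyzer();
            //   MemoryIndex index = new MemoryIndex();
            //   index.addField("content", "fast fuzzy fulltext matching", analyzer);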
0265:
0266:            /**
0267:             * Convenience method; Creates and returns a token stream that generates a
0268:             * token for each keyword in the given collection, "as is", without any
0269:             * transforming text analysis. The resulting token stream can be fed into
0270:             * {@link #addField(String, TokenStream)}, perhaps wrapped into another
0271:             * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
0272:             * 
0273:             * @param keywords
0274:             *            the keywords to generate tokens for
0275:             * @return the corresponding token stream
0276:             */
0277:            public TokenStream keywordTokenStream(final Collection keywords) {
0278:                // TODO: deprecate & move this method into AnalyzerUtil?
0279:                if (keywords == null)
0280:                    throw new IllegalArgumentException(
0281:                            "keywords must not be null");
0282:
0283:                return new TokenStream() {
0284:                    private Iterator iter = keywords.iterator();
0285:                    private int start = 0;
0286:
0287:                    public Token next() {
0288:                        if (!iter.hasNext())
0289:                            return null;
0290:
0291:                        Object obj = iter.next();
0292:                        if (obj == null)
0293:                            throw new IllegalArgumentException(
0294:                                    "keyword must not be null");
0295:
0296:                        String term = obj.toString();
0297:                        Token token = new Token(term, start, start
0298:                                + term.length());
0299:                        start += term.length() + 1; // separate words by 1 (blank) character
0300:                        return token;
0301:                    }
0302:                };
0303:            }
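            // A minimal sketch of indexing untokenized keywords via the method above;
            // the field name, keywords and query term are illustrative assumptions:
            //
            //   MemoryIndex index = new MemoryIndex();
            //   index.addField("tags", index.keywordTokenStream(
            //           Arrays.asList(new String[] { "alpha", "beta" })));
            //   float score = index.search(new org.apache.lucene.search.TermQuery(
            //           new Term("tags", "alpha"))); // > 0.0f because "alpha" was added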
0304:
0305:            /**
0306:             * Equivalent to <code>addField(fieldName, stream, 1.0f)</code>.
0307:             * 
0308:             * @param fieldName
0309:             *            a name to be associated with the text
0310:             * @param stream
0311:             *            the token stream to retrieve tokens from
0312:             */
0313:            public void addField(String fieldName, TokenStream stream) {
0314:                addField(fieldName, stream, 1.0f);
0315:            }
0316:
0317:            /**
0318:             * Iterates over the given token stream and adds the resulting terms to the index;
0319:             * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
0320:             * Lucene {@link org.apache.lucene.document.Field}.
0321:             * Finally closes the token stream. Note that untokenized keywords can be added with this method via 
0322:             * {@link #keywordTokenStream(Collection)}, the Lucene contrib <code>KeywordTokenizer</code> or similar utilities.
0323:             * 
0324:             * @param fieldName
0325:             *            a name to be associated with the text
0326:             * @param stream
0327:             *            the token stream to retrieve tokens from.
0328:             * @param boost
0329:             *            the boost factor for hits for this field
0330:             * @see Field#setBoost(float)
0331:             */
0332:            public void addField(String fieldName, TokenStream stream,
0333:                    float boost) {
0334:                /*
0335:                 * Note that this method signature avoids having a user call new
0336:                 * o.a.l.d.Field(...) which would be much too expensive due to the
0337:                 * String.intern() usage of that class.
0338:                 * 
0339:                 * More often than not, String.intern() leads to serious performance
0340:                 * degradations rather than improvements! If you're curious why, check
0341:                 * out the JDK's native code, see how it oscillates multiple times back
0342:                 * and forth between Java code and native code on each intern() call,
0343:                 * only to end up using a plain vanilla java.util.HashMap on the Java
0344:                 * heap for its interned strings! String.equals() has a small cost
0345:                 * compared to String.intern(), trust me. Application level interning
0346:                 * (e.g. a HashMap per Directory/Index) typically leads to better
0347:                 * solutions than frequent hidden low-level calls to String.intern().
0348:                 * 
0349:                 * Perhaps with some luck, Lucene's Field.java (and Term.java) and
0350:                 * cousins could be fixed to not use String.intern(). Sigh :-(
0351:                 */
0352:                try {
0353:                    if (fieldName == null)
0354:                        throw new IllegalArgumentException(
0355:                                "fieldName must not be null");
0356:                    if (stream == null)
0357:                        throw new IllegalArgumentException(
0358:                                "token stream must not be null");
0359:                    if (boost <= 0.0f)
0360:                        throw new IllegalArgumentException(
0361:                                "boost factor must be greater than 0.0");
0362:                    if (fields.get(fieldName) != null)
0363:                        throw new IllegalArgumentException(
0364:                                "field must not be added more than once");
0365:
0366:                    HashMap terms = new HashMap();
0367:                    int numTokens = 0;
0368:                    int pos = -1;
0369:                    Token token;
0370:
0371:                    while ((token = stream.next()) != null) {
0372:                        String term = token.termText();
0373:                        if (term.length() == 0)
0374:                            continue; // nothing to do
0375:                            //        if (DEBUG) System.err.println("token='" + term + "'");
0376:                        numTokens++;
0377:                        pos += token.getPositionIncrement();
0378:
0379:                        ArrayIntList positions = (ArrayIntList) terms.get(term);
0380:                        if (positions == null) { // term not seen before
0381:                            positions = new ArrayIntList(stride);
0382:                            terms.put(term, positions);
0383:                        }
0384:                        if (stride == 1) {
0385:                            positions.add(pos);
0386:                        } else {
0387:                            positions.add(pos, token.startOffset(), token
0388:                                    .endOffset());
0389:                        }
0390:                    }
0391:
0392:                    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
0393:                    if (numTokens > 0) {
0394:                        boost = boost * docBoost; // see DocumentWriter.addDocument(...)
0395:                        fields
0396:                                .put(fieldName, new Info(terms, numTokens,
0397:                                        boost));
0398:                        sortedFields = null; // invalidate sorted view, if any
0399:                    }
0400:                } catch (IOException e) { // can never happen
0401:                    throw new RuntimeException(e);
0402:                } finally {
0403:                    try {
0404:                        if (stream != null)
0405:                            stream.close();
0406:                    } catch (IOException e2) {
0407:                        throw new RuntimeException(e2);
0408:                    }
0409:                }
0410:            }
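            // A minimal sketch of adding a boosted field from an externally created
            // token stream; an existing index and analyzer are assumed, and the field
            // name, text and boost value are illustrative:
            //
            //   TokenStream stream = analyzer.tokenStream("title",
            //           new java.io.StringReader("Alaska fishing manual"));
            //   index.addField("title", stream, 2.0f); // hits on "title" weigh twice as much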
0411:
0412:            /**
0413:             * Creates and returns a searcher that can be used to execute arbitrary
0414:             * Lucene queries and to collect the resulting query results as hits.
0415:             * 
0416:             * @return a searcher
0417:             */
0418:            public IndexSearcher createSearcher() {
0419:                MemoryIndexReader reader = new MemoryIndexReader();
0420:                IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
0421:                reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()
0422:                return searcher;
0423:            }
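            // A minimal sketch of running a query through the returned searcher and
            // collecting conventional Hits; an existing MemoryIndex 'index' and a
            // Query 'query' built elsewhere (e.g. via QueryParser) are assumed:
            //
            //   IndexSearcher searcher = index.createSearcher();
            //   org.apache.lucene.search.Hits hits = searcher.search(query);
            //   float best = hits.length() > 0 ? hits.score(0) : 0.0f;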
0424:
0425:            /**
0426:             * Convenience method that efficiently returns the relevance score by
0427:             * matching this index against the given Lucene query expression.
0428:             * 
0429:             * @param query
0430:             *            an arbitrary Lucene query to run against this index
0431:             * @return the relevance score of the matchmaking; A number in the range
0432:             *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
0433:             *         the better the match.
0434:             * @see org.apache.lucene.queryParser.QueryParser#parse(String)
0435:             */
0436:            public float search(Query query) {
0437:                if (query == null)
0438:                    throw new IllegalArgumentException("query must not be null");
0439:
0440:                Searcher searcher = createSearcher();
0441:                try {
0442:                    final float[] scores = new float[1]; // inits to 0.0f (no match)
0443:                    searcher.search(query, new HitCollector() {
0444:                        public void collect(int doc, float score) {
0445:                            scores[0] = score;
0446:                        }
0447:                    });
0448:                    float score = scores[0];
0449:                    return score;
0450:                } catch (IOException e) { // can never happen (the in-memory reader performs no I/O)
0451:                    throw new RuntimeException(e);
0452:                } finally {
0453:                    // searcher.close();
0454:                    /*
0455:                     * Note that it is harmless and important for good performance to
0456:                     * NOT close the index reader!!! This avoids all sorts of
0457:                     * unnecessary baggage and locking in the Lucene IndexReader
0458:                     * superclass, all of which is completely unnecessary for this main
0459:                     * memory index data structure without thread-safety claims.
0460:                     * 
0461:                     * Wishing IndexReader would be an interface...
0462:                     * 
0463:                     * Actually with the new tight createSearcher() API auto-closing is now
0464:                     * made impossible, hence searcher.close() would be harmless and also 
0465:                     * would not degrade performance...
0466:                     */
0467:                }
0468:            }
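            // A minimal sketch of the matchmaking call above with a parsed query; an
            // existing index and analyzer are assumed, and the query string is
            // illustrative (see also the class level Example Usage):
            //
            //   QueryParser parser = new QueryParser("content", analyzer);
            //   float score = index.search(parser.parse("+salmon~ +fish*"));
            //   boolean isMatch = score > 0.0f;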
0469:
0470:            /**
0471:             * Returns a reasonable approximation of the main memory [bytes] consumed by
0472:             * this instance. Useful for smart memory-sensitive caches/pools. Assumes
0473:             * fieldNames are interned, whereas tokenized terms are memory-overlaid.
0474:             * 
0475:             * @return the main memory consumption
0476:             */
0477:            public int getMemorySize() {
0478:                // for example usage in a smart cache see nux.xom.pool.Pool    
0479:                int PTR = VM.PTR;
0480:                int INT = VM.INT;
0481:                int size = 0;
0482:                size += VM.sizeOfObject(2 * PTR + INT); // memory index
0483:                if (sortedFields != null)
0484:                    size += VM.sizeOfObjectArray(sortedFields.length);
0485:
0486:                size += VM.sizeOfHashMap(fields.size());
0487:                Iterator iter = fields.entrySet().iterator();
0488:                while (iter.hasNext()) { // for each Field Info
0489:                    Map.Entry entry = (Map.Entry) iter.next();
0490:                    Info info = (Info) entry.getValue();
0491:                    size += VM.sizeOfObject(2 * INT + 3 * PTR); // Info instance vars
0492:                    if (info.sortedTerms != null)
0493:                        size += VM.sizeOfObjectArray(info.sortedTerms.length);
0494:
0495:                    int len = info.terms.size();
0496:                    size += VM.sizeOfHashMap(len);
0497:                    Iterator iter2 = info.terms.entrySet().iterator();
0498:                    while (--len >= 0) { // for each term
0499:                        Map.Entry e = (Map.Entry) iter2.next();
0500:                        size += VM.sizeOfObject(PTR + 3 * INT); // assumes substring() memory overlay
0501:                        //        size += STR + 2 * ((String) e.getKey()).length();
0502:                        ArrayIntList positions = (ArrayIntList) e.getValue();
0503:                        size += VM.sizeOfArrayIntList(positions.size());
0504:                    }
0505:                }
0506:                return size;
0507:            }
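            // A minimal sketch of using the size estimate above for cache admission;
            // maxBytes, cache and key are hypothetical names, not part of this class:
            //
            //   int bytes = index.getMemorySize();
            //   if (bytes <= maxBytes) cache.put(key, index); // otherwise skip caching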
0508:
0509:            private int numPositions(ArrayIntList positions) {
0510:                return positions.size() / stride;
0511:            }
0512:
0513:            /** sorts into ascending order (on demand), reusing memory along the way */
0514:            private void sortFields() {
0515:                if (sortedFields == null)
0516:                    sortedFields = sort(fields);
0517:            }
0518:
0519:            /** returns a view of the given map's entries, sorted ascending by key */
0520:            private static Map.Entry[] sort(HashMap map) {
0521:                int size = map.size();
0522:                Map.Entry[] entries = new Map.Entry[size];
0523:
0524:                Iterator iter = map.entrySet().iterator();
0525:                for (int i = 0; i < size; i++) {
0526:                    entries[i] = (Map.Entry) iter.next();
0527:                }
0528:
0529:                if (size > 1)
0530:                    Arrays.sort(entries, termComparator);
0531:                return entries;
0532:            }
0533:
0534:            /**
0535:             * Returns a String representation of the index data for debugging purposes.
0536:             * 
0537:             * @return the string representation
0538:             */
0539:            public String toString() {
0540:                StringBuffer result = new StringBuffer(256);
0541:                sortFields();
0542:                int sumChars = 0;
0543:                int sumPositions = 0;
0544:                int sumTerms = 0;
0545:
0546:                for (int i = 0; i < sortedFields.length; i++) {
0547:                    Map.Entry entry = sortedFields[i];
0548:                    String fieldName = (String) entry.getKey();
0549:                    Info info = (Info) entry.getValue();
0550:                    info.sortTerms();
0551:                    result.append(fieldName + ":\n");
0552:
0553:                    int numChars = 0;
0554:                    int numPositions = 0;
0555:                    for (int j = 0; j < info.sortedTerms.length; j++) {
0556:                        Map.Entry e = info.sortedTerms[j];
0557:                        String term = (String) e.getKey();
0558:                        ArrayIntList positions = (ArrayIntList) e.getValue();
0559:                        result.append("\t'" + term + "':"
0560:                                + numPositions(positions) + ":");
0561:                        result.append(positions.toString(stride)); // ignore offsets
0562:                        result.append("\n");
0563:                        numPositions += numPositions(positions);
0564:                        numChars += term.length();
0565:                    }
0566:
0567:                    result.append("\tterms=" + info.sortedTerms.length);
0568:                    result.append(", positions=" + numPositions);
0569:                    result.append(", Kchars=" + (numChars / 1000.0f));
0570:                    result.append("\n");
0571:                    sumPositions += numPositions;
0572:                    sumChars += numChars;
0573:                    sumTerms += info.sortedTerms.length;
0574:                }
0575:
0576:                result.append("\nfields=" + sortedFields.length);
0577:                result.append(", terms=" + sumTerms);
0578:                result.append(", positions=" + sumPositions);
0579:                result.append(", Kchars=" + (sumChars / 1000.0f));
0580:                return result.toString();
0581:            }
0582:
0583:            ///////////////////////////////////////////////////////////////////////////////
0584:            // Nested classes:
0585:            ///////////////////////////////////////////////////////////////////////////////
0586:            /**
0587:             * Index data structure for a field; Contains the tokenized term texts and
0588:             * their positions.
0589:             */
0590:            private static final class Info implements  Serializable {
0591:
0592:                /**
0593:                 * Term strings and their positions for this field: Map <String
0594:                 * termText, ArrayIntList positions>
0595:                 */
0596:                private final HashMap terms;
0597:
0598:                /** Terms sorted ascending by term text; computed on demand */
0599:                private transient Map.Entry[] sortedTerms;
0600:
0601:                /** Number of added tokens for this field */
0602:                private final int numTokens;
0603:
0604:                /** Boost factor for hits for this field */
0605:                private final float boost;
0606:
0607:                /** Term for this field's fieldName, lazily computed on demand */
0608:                public transient Term template;
0609:
0610:                private static final long serialVersionUID = 2882195016849084649L;
0611:
0612:                public Info(HashMap terms, int numTokens, float boost) {
0613:                    this .terms = terms;
0614:                    this .numTokens = numTokens;
0615:                    this .boost = boost;
0616:                }
0617:
0618:                /**
0619:                 * Sorts hashed terms into ascending order, reusing memory along the
0620:                 * way. Note that sorting is lazily delayed until required (often it's
0621:                 * not required at all). If a sorted view is required then hashing +
0622:                 * sort + binary search is still faster and smaller than TreeMap usage
0623:                 * (which would be an alternative and somewhat more elegant approach,
0624:                 * apart from more sophisticated Tries / prefix trees).
0625:                 */
0626:                public void sortTerms() {
0627:                    if (sortedTerms == null)
0628:                        sortedTerms = sort(terms);
0629:                }
0630:
0631:                /** note that the frequency can be calculated as numPositions(getPositions(x)) */
0632:                public ArrayIntList getPositions(String term) {
0633:                    return (ArrayIntList) terms.get(term);
0634:                }
0635:
0636:                /** note that the frequency can be calculated as numPosition(getPositions(x)) */
0637:                public ArrayIntList getPositions(int pos) {
0638:                    return (ArrayIntList) sortedTerms[pos].getValue();
0639:                }
0640:
0641:                public float getBoost() {
0642:                    return boost;
0643:                }
0644:
0645:            }
0646:
0647:            ///////////////////////////////////////////////////////////////////////////////
0648:            // Nested classes:
0649:            ///////////////////////////////////////////////////////////////////////////////
0650:            /**
0651:             * Efficient resizable auto-expanding list holding <code>int</code> elements;
0652:             * implemented with arrays.
0653:             */
0654:            private static final class ArrayIntList implements  Serializable {
0655:
0656:                private int[] elements;
0657:                private int size = 0;
0658:
0659:                private static final long serialVersionUID = 2282195016849084649L;
0660:
0661:                public ArrayIntList() {
0662:                    this (10);
0663:                }
0664:
0665:                public ArrayIntList(int initialCapacity) {
0666:                    elements = new int[initialCapacity];
0667:                }
0668:
0669:                public void add(int elem) {
0670:                    if (size == elements.length)
0671:                        ensureCapacity(size + 1);
0672:                    elements[size++] = elem;
0673:                }
0674:
0675:                public void add(int pos, int start, int end) {
0676:                    if (size + 3 > elements.length)
0677:                        ensureCapacity(size + 3);
0678:                    elements[size] = pos;
0679:                    elements[size + 1] = start;
0680:                    elements[size + 2] = end;
0681:                    size += 3;
0682:                }
0683:
0684:                public int get(int index) {
0685:                    if (index >= size)
0686:                        throwIndex(index);
0687:                    return elements[index];
0688:                }
0689:
0690:                public int size() {
0691:                    return size;
0692:                }
0693:
0694:                public int[] toArray(int stride) {
0695:                    int[] arr = new int[size() / stride];
0696:                    if (stride == 1) {
0697:                        System.arraycopy(elements, 0, arr, 0, size); // fast path
0698:                    } else {
0699:                        for (int i = 0, j = 0; j < size; i++, j += stride)
0700:                            arr[i] = elements[j];
0701:                    }
0702:                    return arr;
0703:                }
0704:
0705:                private void ensureCapacity(int minCapacity) {
0706:                    int newCapacity = Math.max(minCapacity,
0707:                            (elements.length * 3) / 2 + 1);
0708:                    int[] newElements = new int[newCapacity];
0709:                    System.arraycopy(elements, 0, newElements, 0, size);
0710:                    elements = newElements;
0711:                }
0712:
0713:                private void throwIndex(int index) {
0714:                    throw new IndexOutOfBoundsException("index: " + index
0715:                            + ", size: " + size);
0716:                }
0717:
0718:                /** returns the first few positions (without offsets); debug only */
0719:                public String toString(int stride) {
0720:                    int s = size() / stride;
0721:                    int len = Math.min(10, s); // avoid printing huge lists
0722:                    StringBuffer buf = new StringBuffer(4 * len);
0723:                    buf.append("[");
0724:                    for (int i = 0; i < len; i++) {
0725:                        buf.append(get(i * stride));
0726:                        if (i < len - 1)
0727:                            buf.append(", ");
0728:                    }
0729:                    if (len != s)
0730:                        buf.append(", ..."); // and some more...
0731:                    buf.append("]");
0732:                    return buf.toString();
0733:                }
0734:            }
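            // A small sketch of the packed layout used by ArrayIntList, assuming
            // offsets are stored (stride == 3); with stride == 1 each element is a
            // plain position:
            //
            //   ArrayIntList positions = new ArrayIntList(3);
            //   positions.add(0, 10, 15);          // one (pos, startOffset, endOffset) triplet
            //   int freq = positions.size() / 3;   // == numPositions(positions) for stride == 3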
0735:
0736:            ///////////////////////////////////////////////////////////////////////////////
0737:            // Nested classes:
0738:            ///////////////////////////////////////////////////////////////////////////////
0739:            private static final Term MATCH_ALL_TERM = new Term("", "");
0740:
0741:            /**
0742:             * Search support for Lucene framework integration; implements all methods
0743:             * required by the Lucene IndexReader contracts.
0744:             */
0745:            private final class MemoryIndexReader extends IndexReader {
0746:
0747:                private Searcher searcher; // needed to find searcher.getSimilarity() 
0748:
0749:                private MemoryIndexReader() {
0750:                    super (null); // avoid as much superclass baggage as possible
0751:                }
0752:
0753:                // lucene >= 1.9 or lucene-1.4.3 with patch removing "final" in superclass
0754:                protected void finalize() {
0755:                }
0756:
0757:                private Info getInfo(String fieldName) {
0758:                    return (Info) fields.get(fieldName);
0759:                }
0760:
0761:                private Info getInfo(int pos) {
0762:                    return (Info) sortedFields[pos].getValue();
0763:                }
0764:
0765:                public int docFreq(Term term) {
0766:                    Info info = getInfo(term.field());
0767:                    int freq = 0;
0768:                    if (info != null)
0769:                        freq = info.getPositions(term.text()) != null ? 1 : 0;
0770:                    if (DEBUG)
0771:                        System.err.println("MemoryIndexReader.docFreq: " + term
0772:                                + ", freq:" + freq);
0773:                    return freq;
0774:                }
0775:
0776:                public TermEnum terms() {
0777:                    if (DEBUG)
0778:                        System.err.println("MemoryIndexReader.terms()");
0779:                    return terms(MATCH_ALL_TERM);
0780:                }
0781:
0782:                public TermEnum terms(Term term) {
0783:                    if (DEBUG)
0784:                        System.err.println("MemoryIndexReader.terms: " + term);
0785:
0786:                    int i; // index into info.sortedTerms
0787:                    int j; // index into sortedFields
0788:
0789:                    sortFields();
0790:                    if (sortedFields.length == 1
0791:                            && sortedFields[0].getKey() == term.field()) {
0792:                        j = 0; // fast path
0793:                    } else {
0794:                        j = Arrays.binarySearch(sortedFields, term.field(),
0795:                                termComparator);
0796:                    }
0797:
0798:                    if (j < 0) { // not found; choose successor
0799:                        j = -j - 1;
0800:                        i = 0;
0801:                        if (j < sortedFields.length)
0802:                            getInfo(j).sortTerms();
0803:                    } else { // found
0804:                        Info info = getInfo(j);
0805:                        info.sortTerms();
0806:                        i = Arrays.binarySearch(info.sortedTerms, term.text(),
0807:                                termComparator);
0808:                        if (i < 0) { // not found; choose successor
0809:                            i = -i - 1;
0810:                            if (i >= info.sortedTerms.length) { // move to next successor
0811:                                j++;
0812:                                i = 0;
0813:                                if (j < sortedFields.length)
0814:                                    getInfo(j).sortTerms();
0815:                            }
0816:                        }
0817:                    }
0818:                    final int ix = i;
0819:                    final int jx = j;
0820:
0821:                    return new TermEnum() {
0822:
0823:                        private int i = ix; // index into info.sortedTerms
0824:                        private int j = jx; // index into sortedFields
0825:
0826:                        public boolean next() {
0827:                            if (DEBUG)
0828:                                System.err.println("TermEnum.next");
0829:                            if (j >= sortedFields.length)
0830:                                return false;
0831:                            Info info = getInfo(j);
0832:                            if (++i < info.sortedTerms.length)
0833:                                return true;
0834:
0835:                            // move to successor
0836:                            j++;
0837:                            i = 0;
0838:                            if (j >= sortedFields.length)
0839:                                return false;
0840:                            getInfo(j).sortTerms();
0841:                            return true;
0842:                        }
0843:
0844:                        public Term term() {
0845:                            if (DEBUG)
0846:                                System.err.println("TermEnum.term: " + i);
0847:                            if (j >= sortedFields.length)
0848:                                return null;
0849:                            Info info = getInfo(j);
0850:                            if (i >= info.sortedTerms.length)
0851:                                return null;
0852:                            //          if (DEBUG) System.err.println("TermEnum.term: " + i + ", " + info.sortedTerms[i].getKey());
0853:                            return createTerm(info, j,
0854:                                    (String) info.sortedTerms[i].getKey());
0855:                        }
0856:
0857:                        public int docFreq() {
0858:                            if (DEBUG)
0859:                                System.err.println("TermEnum.docFreq");
0860:                            if (j >= sortedFields.length)
0861:                                return 0;
0862:                            Info info = getInfo(j);
0863:                            if (i >= info.sortedTerms.length)
0864:                                return 0;
0865:                            return numPositions(info.getPositions(i));
0866:                        }
0867:
0868:                        public void close() {
0869:                            if (DEBUG)
0870:                                System.err.println("TermEnum.close");
0871:                        }
0872:
0873:                        /** Returns a new Term object, minimizing String.intern() overheads. */
0874:                        private Term createTerm(Info info, int pos, String text) {
0875:                            // Assertion: sortFields has already been called before
0876:                            Term template = info.template;
0877:                            if (template == null) { // not yet cached?
0878:                                String fieldName = (String) sortedFields[pos]
0879:                                        .getKey();
0880:                                template = new Term(fieldName, "");
0881:                                info.template = template;
0882:                            }
0883:
0884:                            return template.createTerm(text);
0885:                        }
0886:
0887:                    };
0888:                }
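                // A minimal sketch of enumerating terms through a reader obtained via
                // createSearcher().getIndexReader(); the field name is illustrative.
                // As usual for terms(Term), the enum starts positioned at the first
                // term >= the given term:
                //
                //   TermEnum te = reader.terms(new Term("content", ""));
                //   while (te.term() != null) {
                //       System.out.println(te.term() + " freq=" + te.docFreq());
                //       if (!te.next()) break;
                //   }
                //   te.close();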
0889:
0890:                public TermPositions termPositions() {
0891:                    if (DEBUG)
0892:                        System.err.println("MemoryIndexReader.termPositions");
0893:
0894:                    return new TermPositions() {
0895:
0896:                        private boolean hasNext;
0897:                        private int cursor = 0;
0898:                        private ArrayIntList current;
0899:
0900:                        public void seek(Term term) {
0901:                            if (DEBUG)
0902:                                System.err.println(".seek: " + term);
0903:                            Info info = getInfo(term.field());
0904:                            current = info == null ? null : info
0905:                                    .getPositions(term.text());
0906:                            hasNext = (current != null);
0907:                            cursor = 0;
0908:                        }
0909:
0910:                        public void seek(TermEnum termEnum) {
0911:                            if (DEBUG)
0912:                                System.err.println(".seekEnum");
0913:                            seek(termEnum.term());
0914:                        }
0915:
0916:                        public int doc() {
0917:                            if (DEBUG)
0918:                                System.err.println(".doc");
0919:                            return 0;
0920:                        }
0921:
0922:                        public int freq() {
0923:                            int freq = current != null ? numPositions(current)
0924:                                    : 0;
0925:                            if (DEBUG)
0926:                                System.err.println(".freq: " + freq);
0927:                            return freq;
0928:                        }
0929:
0930:                        public boolean next() {
0931:                            if (DEBUG)
0932:                                System.err.println(".next: " + current
0933:                                        + ", oldHasNext=" + hasNext);
0934:                            boolean next = hasNext;
0935:                            hasNext = false;
0936:                            return next;
0937:                        }
0938:
0939:                        public int read(int[] docs, int[] freqs) {
0940:                            if (DEBUG)
0941:                                System.err.println(".read: " + docs.length);
0942:                            if (!hasNext)
0943:                                return 0;
0944:                            hasNext = false;
0945:                            docs[0] = 0;
0946:                            freqs[0] = freq();
0947:                            return 1;
0948:                        }
0949:
0950:                        public boolean skipTo(int target) {
0951:                            if (DEBUG)
0952:                                System.err.println(".skipTo: " + target);
0953:                            return next();
0954:                        }
0955:
0956:                        public void close() {
0957:                            if (DEBUG)
0958:                                System.err.println(".close");
0959:                        }
0960:
0961:                        public int nextPosition() { // implements TermPositions
0962:                            int pos = current.get(cursor);
0963:                            cursor += stride;
0964:                            if (DEBUG)
0965:                                System.err.println(".nextPosition: " + pos);
0966:                            return pos;
0967:                        }
0968:
0969:                        /**
0970:                         * Not implemented.
0971:                         * @throws UnsupportedOperationException
0972:                         */
0973:                        public int getPayloadLength() {
0974:                            throw new UnsupportedOperationException();
0975:                        }
0976:
0977:                        /**
0978:                         * Not implemented.
0979:                         * @throws UnsupportedOperationException
0980:                         */
0981:                        public byte[] getPayload(byte[] data, int offset)
0982:                                throws IOException {
0983:                            throw new UnsupportedOperationException();
0984:                        }
0985:
0986:                        public boolean isPayloadAvailable() {
0987:                            // unsupported; payloads are not stored by this index
0988:                            return false;
0989:                        }
0990:
0991:                    };
0992:                }
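                // A minimal sketch of walking the positions of one term via the
                // TermPositions returned above; the reader is obtained as in the
                // sketches above, and the field and term are illustrative:
                //
                //   TermPositions tp = reader.termPositions();
                //   tp.seek(new Term("content", "fish"));
                //   if (tp.next()) {                      // doc 0 is the only document
                //       for (int k = tp.freq(); --k >= 0;) {
                //           int position = tp.nextPosition();
                //       }
                //   }
                //   tp.close();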
0993:
0994:                public TermDocs termDocs() {
0995:                    if (DEBUG)
0996:                        System.err.println("MemoryIndexReader.termDocs");
0997:                    return termPositions();
0998:                }
0999:
1000:                public TermFreqVector[] getTermFreqVectors(int docNumber) {
1001:                    if (DEBUG)
1002:                        System.err
1003:                                .println("MemoryIndexReader.getTermFreqVectors");
1004:                    TermFreqVector[] vectors = new TermFreqVector[fields.size()];
1005:                    //      if (vectors.length == 0) return null;
1006:                    Iterator iter = fields.keySet().iterator();
1007:                    for (int i = 0; i < vectors.length; i++) {
1008:                        String fieldName = (String) iter.next();
1009:                        vectors[i] = getTermFreqVector(docNumber, fieldName);
1010:                    }
1011:                    return vectors;
1012:                }
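                // A minimal sketch of retrieving the term vector of the single document
                // (doc number 0) for one field; the reader is obtained as above, the
                // field name is illustrative, and the result is null if that field was
                // never added:
                //
                //   TermFreqVector vector = reader.getTermFreqVector(0, "content");
                //   String[] terms = vector.getTerms();          // sorted ascending
                //   int[] freqs = vector.getTermFrequencies();   // parallel to terms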
1013:
1014:                public void getTermFreqVector(int docNumber,
1015:                        TermVectorMapper mapper) throws IOException {
1016:                    if (DEBUG)
1017:                        System.err
1018:                                .println("MemoryIndexReader.getTermFreqVector");
1019:
1020:                    //      if (vectors.length == 0) return null;
1021:                    for (Iterator iterator = fields.keySet().iterator(); iterator
1022:                            .hasNext();) {
1023:                        String fieldName = (String) iterator.next();
1024:                        getTermFreqVector(docNumber, fieldName, mapper);
1025:                    }
1026:                }
1027:
1028:                public void getTermFreqVector(int docNumber, String field,
1029:                        TermVectorMapper mapper) throws IOException {
1030:                    if (DEBUG)
1031:                        System.err
1032:                                .println("MemoryIndexReader.getTermFreqVector");
1033:                    final Info info = getInfo(field);
1034:                    if (info == null) {
1035:                        return;
1036:                    }
1037:                    info.sortTerms();
1038:                    mapper.setExpectations(field, info.sortedTerms.length,
1039:                            stride != 1, true);
1040:                    for (int i = info.sortedTerms.length; --i >= 0;) {
1041:
1042:                        ArrayIntList positions = (ArrayIntList) info.sortedTerms[i]
1043:                                .getValue();
1044:                        int size = positions.size();
1045:                        // offsets exist only when they were stored (stride == 3); with
1046:                        // stride == 1 the positions list holds positions only, so pass null
1047:                        org.apache.lucene.index.TermVectorOffsetInfo[] offsets = stride == 1 ? null
1048:                                : new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
1049:                        for (int k = 0, j = 1; offsets != null && j < size; k++, j += stride) {
1050:                            int start = positions.get(j);
1051:                            int end = positions.get(j + 1);
1052:                            offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
1053:                        }
1054:                        mapper.map((String) info.sortedTerms[i].getKey(),
1055:                                numPositions((ArrayIntList) info.sortedTerms[i]
1056:                                        .getValue()), offsets,
1057:                                ((ArrayIntList) info.sortedTerms[i].getValue())
1058:                                        .toArray(stride));
1059:                    }
1060:                }
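                // Illustrative sketch (not part of the original source): a minimal
                // TermVectorMapper that simply prints what the method above feeds it.
                // The anonymous subclass and the field name "content" are assumptions
                // made for the example.
                //
                //   reader.getTermFreqVector(0, "content", new TermVectorMapper() {
                //       public void setExpectations(String field, int numTerms,
                //               boolean storeOffsets, boolean storePositions) {
                //           System.err.println(field + ": " + numTerms + " terms");
                //       }
                //       public void map(String term, int frequency,
                //               TermVectorOffsetInfo[] offsets, int[] positions) {
                //           System.err.println(term + " freq=" + frequency);
                //       }
                //   });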
1061:
1062:                public TermFreqVector getTermFreqVector(int docNumber,
1063:                        final String fieldName) {
1064:                    if (DEBUG)
1065:                        System.err
1066:                                .println("MemoryIndexReader.getTermFreqVector");
1067:                    final Info info = getInfo(fieldName);
1068:                    if (info == null)
1069:                        return null; // TODO: or return empty vector impl???
1070:                    info.sortTerms();
1071:
1072:                    return new TermPositionVector() {
1073:
1074:                        private final Map.Entry[] sortedTerms = info.sortedTerms;
1075:
1076:                        public String getField() {
1077:                            return fieldName;
1078:                        }
1079:
1080:                        public int size() {
1081:                            return sortedTerms.length;
1082:                        }
1083:
1084:                        public String[] getTerms() {
1085:                            String[] terms = new String[sortedTerms.length];
1086:                            for (int i = sortedTerms.length; --i >= 0;) {
1087:                                terms[i] = (String) sortedTerms[i].getKey();
1088:                            }
1089:                            return terms;
1090:                        }
1091:
1092:                        public int[] getTermFrequencies() {
1093:                            int[] freqs = new int[sortedTerms.length];
1094:                            for (int i = sortedTerms.length; --i >= 0;) {
1095:                                freqs[i] = numPositions((ArrayIntList) sortedTerms[i]
1096:                                        .getValue());
1097:                            }
1098:                            return freqs;
1099:                        }
1100:
1101:                        public int indexOf(String term) {
1102:                            int i = Arrays.binarySearch(sortedTerms, term,
1103:                                    termComparator);
1104:                            return i >= 0 ? i : -1;
1105:                        }
1106:
1107:                        public int[] indexesOf(String[] terms, int start,
1108:                                int len) {
1109:                            int[] indexes = new int[len];
1110:                            for (int i = 0; i < len; i++) {
1111:                                indexes[i] = indexOf(terms[start++]);
1112:                            }
1113:                            return indexes;
1114:                        }
1115:
1116:                        // lucene >= 1.4.3
1117:                        public int[] getTermPositions(int index) {
1118:                            return ((ArrayIntList) sortedTerms[index]
1119:                                    .getValue()).toArray(stride);
1120:                        }
1121:
1122:                        // lucene >= 1.9 (remove this method for lucene-1.4.3)
1123:                        public org.apache.lucene.index.TermVectorOffsetInfo[] getOffsets(
1124:                                int index) {
1125:                            if (stride == 1)
1126:                                return null; // no offsets stored
1127:
1128:                            ArrayIntList positions = (ArrayIntList) sortedTerms[index]
1129:                                    .getValue();
1130:                            int size = positions.size();
1131:                            org.apache.lucene.index.TermVectorOffsetInfo[] offsets = new org.apache.lucene.index.TermVectorOffsetInfo[size
1132:                                    / stride];
1133:
1134:                            for (int i = 0, j = 1; j < size; i++, j += stride) {
1135:                                int start = positions.get(j);
1136:                                int end = positions.get(j + 1);
1137:                                offsets[i] = new org.apache.lucene.index.TermVectorOffsetInfo(
1138:                                        start, end);
1139:                            }
1140:                            return offsets;
1141:                        }
1142:
1143:                    };
1144:                }
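                // Illustrative sketch (not part of the original source): how a caller
                // might walk the TermPositionVector built above; docNumber 0 and the
                // field name "content" are assumptions made for the example.
                //
                //   TermPositionVector tpv = (TermPositionVector)
                //           reader.getTermFreqVector(0, "content");
                //   String[] terms = tpv.getTerms();
                //   int[] freqs = tpv.getTermFrequencies();
                //   for (int t = 0; t < tpv.size(); t++) {
                //       int[] positions = tpv.getTermPositions(t);          // token positions
                //       TermVectorOffsetInfo[] offsets = tpv.getOffsets(t); // null when no offsets were stored
                //       System.err.println(terms[t] + " freq=" + freqs[t]);
                //   }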
1145:
1146:                private Similarity getSimilarity() {
1147:                    if (searcher != null)
1148:                        return searcher.getSimilarity();
1149:                    return Similarity.getDefault();
1150:                }
1151:
1152:                private void setSearcher(Searcher searcher) {
1153:                    this.searcher = searcher;
1154:                }
1155:
1156:                /** performance hack: cache norms to avoid repeated expensive calculations */
1157:                private byte[] cachedNorms;
1158:                private String cachedFieldName;
1159:                private Similarity cachedSimilarity;
1160:
1161:                public byte[] norms(String fieldName) {
1162:                    byte[] norms = cachedNorms;
1163:                    Similarity sim = getSimilarity();
1164:                    if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached?
1165:                        Info info = getInfo(fieldName);
1166:                        int numTokens = info != null ? info.numTokens : 0;
1167:                        float n = sim.lengthNorm(fieldName, numTokens);
1168:                        float boost = info != null ? info.getBoost() : 1.0f;
1169:                        n = n * boost; // see DocumentWriter.writeNorms(String segment)                
1170:                        byte norm = Similarity.encodeNorm(n);
1171:                        norms = new byte[] { norm };
1172:
1173:                        // cache it for future reuse
1174:                        cachedNorms = norms;
1175:                        cachedFieldName = fieldName;
1176:                        cachedSimilarity = sim;
1177:                        if (DEBUG)
1178:                            System.err.println("MemoryIndexReader.norms: "
1179:                                    + fieldName + ":" + n + ":" + norm + ":"
1180:                                    + numTokens);
1181:                    }
1182:                    return norms;
1183:                }
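                // Worked example (illustration, not from the source): with the stock
                // DefaultSimilarity, lengthNorm(fieldName, numTokens) is
                // 1/sqrt(numTokens), so a field of 4 tokens with boost 1.0f gives
                // n = (1/sqrt(4)) * 1.0f = 0.5f, which encodeNorm then packs into the
                // single cached norm byte returned above.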
1184:
1185:                public void norms(String fieldName, byte[] bytes, int offset) {
1186:                    if (DEBUG)
1187:                        System.err.println("MemoryIndexReader.norms*: "
1188:                                + fieldName);
1189:                    byte[] norms = norms(fieldName);
1190:                    System.arraycopy(norms, 0, bytes, offset, norms.length);
1191:                }
1192:
1193:                protected void doSetNorm(int doc, String fieldName, byte value) {
1194:                    throw new UnsupportedOperationException();
1195:                }
1196:
1197:                public int numDocs() {
1198:                    if (DEBUG)
1199:                        System.err.println("MemoryIndexReader.numDocs");
1200:                    return fields.size() > 0 ? 1 : 0;
1201:                }
1202:
1203:                public int maxDoc() {
1204:                    if (DEBUG)
1205:                        System.err.println("MemoryIndexReader.maxDoc");
1206:                    return 1;
1207:                }
1208:
1209:                public Document document(int n) {
1210:                    if (DEBUG)
1211:                        System.err.println("MemoryIndexReader.document");
1212:                    return new Document(); // there are no stored fields
1213:                }
1214:
1215:                //When we convert to JDK 1.5 make this Set<String>
1216:                public Document document(int n, FieldSelector fieldSelector)
1217:                        throws IOException {
1218:                    if (DEBUG)
1219:                        System.err.println("MemoryIndexReader.document");
1220:                    return new Document(); // there are no stored fields
1221:                }
1222:
1223:                public boolean isDeleted(int n) {
1224:                    if (DEBUG)
1225:                        System.err.println("MemoryIndexReader.isDeleted");
1226:                    return false;
1227:                }
1228:
1229:                public boolean hasDeletions() {
1230:                    if (DEBUG)
1231:                        System.err.println("MemoryIndexReader.hasDeletions");
1232:                    return false;
1233:                }
1234:
1235:                protected void doDelete(int docNum) {
1236:                    throw new UnsupportedOperationException();
1237:                }
1238:
1239:                protected void doUndeleteAll() {
1240:                    throw new UnsupportedOperationException();
1241:                }
1242:
1243:                protected void doCommit() {
1244:                    if (DEBUG)
1245:                        System.err.println("MemoryIndexReader.doCommit");
1246:                }
1247:
1248:                protected void doClose() {
1249:                    if (DEBUG)
1250:                        System.err.println("MemoryIndexReader.doClose");
1251:                }
1252:
1253:                // lucene >= 1.9 (remove this method for lucene-1.4.3)
1254:                public Collection getFieldNames(FieldOption fieldOption) {
1255:                    if (DEBUG)
1256:                        System.err
1257:                                .println("MemoryIndexReader.getFieldNamesOption");
1258:                    if (fieldOption == FieldOption.UNINDEXED)
1259:                        return Collections.EMPTY_SET;
1260:                    if (fieldOption == FieldOption.INDEXED_NO_TERMVECTOR)
1261:                        return Collections.EMPTY_SET;
1262:                    if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET
1263:                            && stride == 1)
1264:                        return Collections.EMPTY_SET;
1265:                    if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET
1266:                            && stride == 1)
1267:                        return Collections.EMPTY_SET;
1268:
1269:                    return Collections.unmodifiableSet(fields.keySet());
1270:                }
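                // Illustrative note (not part of the original source): the checks above
                // unconditionally return empty sets for UNINDEXED and
                // INDEXED_NO_TERMVECTOR, and return empty sets for the offset-related
                // options only when no offsets were stored (stride == 1); a hypothetical
                // call such as
                //
                //   reader.getFieldNames(IndexReader.FieldOption.ALL)
                //
                // falls through to the last line and yields the full, unmodifiable set
                // of field names held by this MemoryIndex.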
1271:            }
1272:
1273:            ///////////////////////////////////////////////////////////////////////////////
1274:            // Nested classes:
1275:            ///////////////////////////////////////////////////////////////////////////////
1276:            private static final class VM {
1277:
1278:                public static final int PTR = is64BitVM() ? 8 : 4;
1279:
1280:                // bytes occupied by primitive data types
1281:                public static final int BOOLEAN = 1;
1282:                public static final int BYTE = 1;
1283:                public static final int CHAR = 2;
1284:                public static final int SHORT = 2;
1285:                public static final int INT = 4;
1286:                public static final int LONG = 8;
1287:                public static final int FLOAT = 4;
1288:                public static final int DOUBLE = 8;
1289:
1290:                private static final int LOG_PTR = (int) Math.round(log2(PTR));
1291:
1292:                /**
1293:                 * Object header of any heap allocated Java object. 
1294:                 * ptr to class, info for monitor, gc, hash, etc.
1295:                 */
1296:                //	private static final int OBJECT_HEADER = 2*4; // even on 64 bit VMs?
1297:                private static final int OBJECT_HEADER = 2 * PTR;
1298:
1299:                /**
1300:                 * Modern VMs tend to trade space for time, allocating memory on word
1301:                 * boundaries. For example, on a 64 bit VM, the variables of a class with
1302:                 * one 32 bit integer and one Java char really consume 8 bytes instead of 6
1303:                 * bytes. 2 bytes are spent on padding. Similarly, on a 64 bit VM a
1304:                 * java.lang.Integer consumes OBJECT_HEADER + 8 bytes rather than
1305:                 * OBJECT_HEADER + 4 bytes.
1306:                 */
1307:                private static final boolean IS_WORD_ALIGNED_VM = true;
1308:
1309:                private VM() {
1310:                } // not instantiable
1311:
1312:                //  assumes n > 0
1313:                //  64 bit VM:
1314:                //    0     --> 0*PTR
1315:                //    1..8  --> 1*PTR
1316:                //    9..16 --> 2*PTR
1317:                private static int sizeOf(int n) {
1318:                    return IS_WORD_ALIGNED_VM ?
1319:                    //              ((n-1)/PTR + 1) * PTR :               // slow version
1320:                    (((n - 1) >> LOG_PTR) + 1) << LOG_PTR
1321:                            : // fast version
1322:                            n;
1323:                }
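                // Worked example (illustration only): on a 64 bit VM, PTR == 8 and
                // LOG_PTR == 3, so sizeOf(20) = (((20 - 1) >> 3) + 1) << 3 = 24.
                // That matches the java.lang.Integer case from the comment above:
                // OBJECT_HEADER (2 * 8 = 16) plus one int (4) is 20 bytes, padded
                // up to 24, i.e. OBJECT_HEADER + 8 rather than OBJECT_HEADER + 4.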
1324:
1325:                public static int sizeOfObject(int n) {
1326:                    return sizeOf(OBJECT_HEADER + n);
1327:                }
1328:
1329:                public static int sizeOfObjectArray(int len) {
1330:                    return sizeOfObject(INT + PTR * len);
1331:                }
1332:
1333:                public static int sizeOfCharArray(int len) {
1334:                    return sizeOfObject(INT + CHAR * len);
1335:                }
1336:
1337:                public static int sizeOfIntArray(int len) {
1338:                    return sizeOfObject(INT + INT * len);
1339:                }
1340:
1341:                public static int sizeOfString(int len) {
1342:                    return sizeOfObject(3 * INT + PTR) + sizeOfCharArray(len);
1343:                }
1344:
1345:                public static int sizeOfHashMap(int len) {
1346:                    return sizeOfObject(4 * PTR + 4 * INT)
1347:                            + sizeOfObjectArray(len) + len
1348:                            * sizeOfObject(3 * PTR + INT); // entries
1349:                }
1350:
1351:                // note: does not include referenced objects
1352:                public static int sizeOfArrayList(int len) {
1353:                    return sizeOfObject(PTR + 2 * INT) + sizeOfObjectArray(len);
1354:                }
1355:
1356:                public static int sizeOfArrayIntList(int len) {
1357:                    return sizeOfObject(PTR + INT) + sizeOfIntArray(len);
1358:                }
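                // Worked example (illustration only): composing the estimators on a
                // 64 bit VM for a 5-character String:
                //   sizeOfString(5) = sizeOfObject(3*INT + PTR) + sizeOfCharArray(5)
                //                   = sizeOf(16 + 20)           + sizeOf(16 + 4 + 2*5)
                //                   = 40                        + 32
                //                   = 72 bytes, per this model.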
1359:
1360:                private static boolean is64BitVM() {
1361:                    try {
1362:                        int bits = Integer.getInteger("sun.arch.data.model", 0)
1363:                                .intValue();
1364:                        if (bits != 0)
1365:                            return bits == 64;
1366:
1367:                        // fallback if sun.arch.data.model isn't available
1368:                        return System.getProperty("java.vm.name").toLowerCase()
1369:                                .indexOf("64") >= 0;
1370:                    } catch (Throwable t) {
1371:                        return false; // better safe than sorry (applets, security managers, etc.) ...
1372:                    }
1373:                }
1374:
1375:                /** logarithm to the base 2. Example: log2(4) == 2, log2(8) == 3 */
1376:                private static double log2(double value) {
1377:                    return Math.log(value) / Math.log(2);
1378:                }
1379:
1380:            }
1381:
1382:        }