package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer} and
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * <p>
 * If you are unsure how exactly a regular expression should look, consider
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to
 * PatternAnalyzer. Also see <a target="_blank"
 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
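 * <p>
 * For instance, a candidate pattern can be sanity-checked outside Lucene
 * like this (purely illustrative):
 * <pre>
 * // "\\W+" splits at runs of non-word characters, as NON_WORD_PATTERN does
 * String[] terms = "James is running round in the woods".split("\\W+");
 * // terms == {"James", "is", "running", "round", "in", "the", "woods"}
 * </pre>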
 * <p>
 * This class can be considerably faster than the "normal" Lucene tokenizers.
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
 * stemming example:
 * <pre>
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"),
 *     "English");
 * </pre>
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzer extends Analyzer {

  /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");

  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
      "a", "about", "above", "across", "adj", "after", "afterwards",
      "again", "against", "albeit", "all", "almost", "alone", "along",
      "already", "also", "although", "always", "among", "amongst", "an",
      "and", "another", "any", "anyhow", "anyone", "anything", "anywhere",
      "are", "around", "as", "at", "be", "became", "because", "become",
      "becomes", "becoming", "been", "before", "beforehand", "behind",
      "being", "below", "beside", "besides", "between", "beyond", "both",
      "but", "by", "can", "cannot", "co", "could", "down", "during",
      "each", "eg", "either", "else", "elsewhere", "enough", "etc",
      "even", "ever", "every", "everyone", "everything", "everywhere",
      "except", "few", "first", "for", "former", "formerly", "from",
      "further", "had", "has", "have", "he", "hence", "her", "here",
      "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
      "him", "himself", "his", "how", "however", "i", "ie", "if", "in",
      "inc", "indeed", "into", "is", "it", "its", "itself", "last",
      "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
      "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
      "must", "my", "myself", "namely", "neither", "never",
      "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
      "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
      "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
      "rather", "s", "same", "seem", "seemed", "seeming", "seems",
      "several", "she", "should", "since", "so", "some", "somehow",
      "someone", "something", "sometime", "sometimes", "somewhere",
      "still", "such", "t", "than", "that", "the", "their", "them",
      "themselves", "then", "thence", "there", "thereafter", "thereby",
      "therefor", "therein", "thereupon", "these", "they", "this",
      "those", "though", "through", "throughout", "thru", "thus", "to",
      "together", "too", "toward", "towards", "under", "until", "up",
      "upon", "us", "very", "via", "was", "we", "well", "were", "what",
      "whatever", "whatsoever", "when", "whence", "whenever",
      "whensoever", "where", "whereafter", "whereas", "whereat",
      "whereby", "wherefrom", "wherein", "whereinto", "whereof",
      "whereon", "whereto", "whereunto", "whereupon", "wherever",
      "wherewith", "whether", "which", "whichever", "whichsoever",
      "while", "whilst", "whither", "who", "whoever", "whole", "whom",
      "whomever", "whomsoever", "whose", "whosoever", "why", "will",
      "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
      "yourselves" });

  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
      NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

  /**
   * A lower-casing word analyzer with <b>extended</b> English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
      NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);

  private final Pattern pattern;
  private final boolean toLowerCase;
  private final Set stopWords;

  /**
   * Constructs a new instance with the given parameters.
   *
   * @param pattern
   *            a regular expression delimiting tokens
   * @param toLowerCase
   *            if <code>true</code> returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
   *            or <a href="http://www.unine.ch/info/clef/">other stop word
   *            lists</a>.
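   * <p>
   * A minimal construction sketch (the stop word array here is arbitrary):
   * <pre>
   * Set stopWords = StopFilter.makeStopSet(new String[] {"the", "a", "an"});
   * PatternAnalyzer analyzer = new PatternAnalyzer(
   *     PatternAnalyzer.WHITESPACE_PATTERN, true, stopWords);
   * </pre>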
   */
  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null)
      throw new IllegalArgumentException("pattern must not be null");

    if (eqPattern(NON_WORD_PATTERN, pattern))
      pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern))
      pattern = WHITESPACE_PATTERN;

    if (stopWords != null && stopWords.size() == 0)
      stopWords = null;

    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
  }

  /**
   * Creates a token stream that tokenizes the given string into token terms
   * (aka words).
   *
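   * <p>
   * A minimal sketch of consuming the resulting stream, using the old-style
   * <code>Token next()</code> API that this class implements
   * (<code>analyzer</code> stands for any PatternAnalyzer instance):
   * <pre>
   * TokenStream stream = analyzer.tokenStream("content", "The quick brown fox");
   * for (Token t = stream.next(); t != null; t = stream.next()) {
   *   System.out.println(t.termText());
   * }
   * </pre>
   *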
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null)
      throw new IllegalArgumentException("text must not be null");

    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    } else if (pattern == WHITESPACE_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    } else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
      if (stopWords != null)
        stream = new StopFilter(stream, stopWords);
    }

    return stream;
  }

  /**
   * Creates a token stream that tokenizes all the text in the given Reader;
   * this implementation reads the entire reader into a String and forwards
   * to <code>tokenStream(String, String)</code>, so it is less efficient
   * than calling that method with the String directly.
   *
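   * <p>
   * Callers within this package can avoid the read-and-copy overhead by
   * wrapping an already available String in the package-private
   * {@link FastStringReader} (a sketch of the intended fast path):
   * <pre>
   * TokenStream stream = analyzer.tokenStream("content",
   *     new PatternAnalyzer.FastStringReader(text));
   * </pre>
   *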
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    if (reader instanceof FastStringReader) { // fast path
      return tokenStream(fieldName, ((FastStringReader) reader).getString());
    }

    try {
      String text = toString(reader);
      return tokenStream(fieldName, text);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Indicates whether some other object is "equal to" this one.
   *
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  public boolean equals(Object other) {
    if (this == other)
      return true;
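    // fast path: the two shared singletons are known to be unequal
    // (they differ only in their stop word sets), so return early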
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER)
      return false;
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER)
      return false;

    if (other instanceof PatternAnalyzer) {
      PatternAnalyzer p2 = (PatternAnalyzer) other;
      return toLowerCase == p2.toLowerCase
          && eqPattern(pattern, p2.pattern)
          && eq(stopWords, p2.stopWords);
    }
    return false;
  }

  /**
   * Returns a hash code value for the object.
   *
   * @return the hash code.
   */
  public int hashCode() {
    if (this == DEFAULT_ANALYZER)
      return -1218418418; // fast path
    if (this == EXTENDED_ANALYZER)
      return 1303507063; // fast path

    int h = 1;
    h = 31 * h + pattern.pattern().hashCode();
    h = 31 * h + pattern.flags();
    h = 31 * h + (toLowerCase ? 1231 : 1237);
    h = 31 * h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }

  /** equality where o1 and/or o2 can be null */
  private static boolean eq(Object o1, Object o2) {
    return (o1 == o2) || (o1 != null && o1.equals(o2));
  }

  /** assumes p1 and p2 are not null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    return p1 == p2
        || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
  }

  /**
   * Reads until end-of-stream and returns all read chars, finally closes the stream.
   *
   * @param input the reader to read from
   * @return the reader's contents, as a String
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      int len = 256;
      char[] buffer = new char[len];
      char[] output = new char[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          char[] tmp = new char[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
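          // the old output array's contents were just copied into tmp, so
          // it can safely be recycled as the (at least as large) read buffer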
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      return new String(output, 0, len);
    } finally {
      if (input != null)
        input.close();
    }
  }

  /** somewhat oversized to minimize hash collisions */
  private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    stops.addAll(Arrays.asList(stopWords));
    return stops;
    // return Collections.unmodifiableSet(stops);
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The workhorse; performance isn't fantastic, but it's not nearly as bad
   * as one might think - kudos to the Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {

    private final String str;
    private final boolean toLowerCase;
    private Matcher matcher;
    private int pos = 0;
    private static final Locale locale = Locale.getDefault();

    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.str = str;
      this.matcher = pattern.matcher(str);
      this.toLowerCase = toLowerCase;
    }

    public Token next() {
      if (matcher == null)
        return null;

      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        boolean isMatch = matcher.find();
        if (isMatch) {
          end = matcher.start();
          pos = matcher.end();
        } else {
          end = str.length();
          matcher = null; // we're finished
        }

        if (start != end) { // non-empty token (also covers leading/trailing text)
          String text = str.substring(start, end);
          if (toLowerCase)
            text = text.toLowerCase(locale);
          return new Token(text, start, end);
        }
        if (!isMatch)
          return null;
      }
    }

  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary.
   */
  private static final class FastStringTokenizer extends TokenStream {

    private final String str;
    private int pos;
    private final boolean isLetter;
    private final boolean toLowerCase;
    private final Set stopWords;
    private static final Locale locale = Locale.getDefault();

    public FastStringTokenizer(String str, boolean isLetter,
        boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }

    public Token next() {
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
      int i = pos;
      boolean letter = isLetter;

      int start = 0;
      String text;
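      // scan forward, skipping any stop words, until a non-stop-word token
      // is found or the end of the string is reached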
      do {
        // find beginning of token
        text = null;
        while (i < len && !isTokenChar(s.charAt(i), letter)) {
          i++;
        }

        if (i < len) { // found beginning; now find end of token
          start = i;
          while (i < len && isTokenChar(s.charAt(i), letter)) {
            i++;
          }

          text = s.substring(start, i);
          if (toLowerCase)
            text = text.toLowerCase(locale);
          // if (toLowerCase) {
          //// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
          //// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
          // text = s.substring(start, i).toLowerCase();
          //// char[] chars = new char[i-start];
          //// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
          //// text = new String(chars);
          // } else {
          // text = s.substring(start, i);
          // }
        }
      } while (text != null && isStopWord(text));

      pos = i;
      return text != null ? new Token(text, start, i) : null;
    }

    private boolean isTokenChar(char c, boolean isLetter) {
      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
    }

    private boolean isStopWord(String text) {
      return stopWords != null && stopWords.contains(text);
    }

  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {

    private final String s;

    FastStringReader(String s) {
      super(s);
      this.s = s;
    }

    String getString() {
      return s;
    }
  }

}
|