001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.analysis.standard;
017:
018: import java.io.IOException;
019: import java.io.Reader;
020:
021: import org.apache.lucene.analysis.Token;
022: import org.apache.lucene.analysis.Tokenizer;
023:
024: /** A grammar-based tokenizer constructed with JFlex
025: *
026: * <p> This should be a good tokenizer for most European-language documents:
027: *
028: * <ul>
029: * <li>Splits words at punctuation characters, removing punctuation. However, a
030: * dot that's not followed by whitespace is considered part of a token.
031: * <li>Splits words at hyphens, unless there's a number in the token, in which case
032: * the whole token is interpreted as a product number and is not split.
033: * <li>Recognizes email addresses and internet hostnames as one token.
034: * </ul>
035: *
036: * <p>Many applications have specific tokenizer needs. If this tokenizer does
037: * not suit your application, please consider copying this source code
038: * directory to your project and maintaining your own grammar-based tokenizer.
039: */
040:
041: public class StandardTokenizer extends Tokenizer {
042: /** A private instance of the JFlex-constructed scanner */
043: private final StandardTokenizerImpl scanner;
044:
045: /**
046: * Specifies whether deprecated acronyms should be replaced with HOST type.
047: * This is false by default to support backward compatibility.
048: *<p/>
049: * See http://issues.apache.org/jira/browse/LUCENE-1068
050: *
051: * @deprecated this should be removed in the next release (3.0).
052: */
053: private boolean replaceInvalidAcronym = false;
054:
055: void setInput(Reader reader) {
056: this .input = reader;
057: }
058:
059: private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
060:
061: /** Set the max allowed token length. Any token longer
062: * than this is skipped. */
063: public void setMaxTokenLength(int length) {
064: this .maxTokenLength = length;
065: }
066:
067: /** @see #setMaxTokenLength */
068: public int getMaxTokenLength() {
069: return maxTokenLength;
070: }
071:
072: /**
073: * Creates a new instance of the {@link StandardTokenizer}. Attaches the
074: * <code>input</code> to a newly created JFlex scanner.
075: */
076: public StandardTokenizer(Reader input) {
077: this .input = input;
078: this .scanner = new StandardTokenizerImpl(input);
079: }
080:
081: /**
082: * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
083: * the <code>input</code> to the newly created JFlex scanner.
084: *
085: * @param input The input reader
086: * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST.
087: *
088: * See http://issues.apache.org/jira/browse/LUCENE-1068
089: */
090: public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
091: this .replaceInvalidAcronym = replaceInvalidAcronym;
092: this .input = input;
093: this .scanner = new StandardTokenizerImpl(input);
094: }
095:
096: /*
097: * (non-Javadoc)
098: *
099: * @see org.apache.lucene.analysis.TokenStream#next()
100: */
101: public Token next(Token result) throws IOException {
102: int posIncr = 1;
103:
104: while (true) {
105: int tokenType = scanner.getNextToken();
106:
107: if (tokenType == StandardTokenizerImpl.YYEOF) {
108: return null;
109: }
110:
111: if (scanner.yylength() <= maxTokenLength) {
112: result.clear();
113: result.setPositionIncrement(posIncr);
114: scanner.getText(result);
115: final int start = scanner.yychar();
116: result.setStartOffset(start);
117: result.setEndOffset(start + result.termLength());
118: // This 'if' should be removed in the next release. For now, it converts
119: // invalid acronyms to HOST. When removed, only the 'else' part should
120: // remain.
121: if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
122: if (replaceInvalidAcronym) {
123: result
124: .setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
125: result.setTermLength(result.termLength() - 1); // remove extra '.'
126: } else {
127: result
128: .setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
129: }
130: } else {
131: result
132: .setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
133: }
134: return result;
135: } else
136: // When we skip a too-long term, we still increment the
137: // position increment
138: posIncr++;
139: }
140: }
141:
142: /*
143: * (non-Javadoc)
144: *
145: * @see org.apache.lucene.analysis.TokenStream#reset()
146: */
147: public void reset() throws IOException {
148: super .reset();
149: scanner.yyreset(input);
150: }
151:
152: public void reset(Reader reader) throws IOException {
153: input = reader;
154: reset();
155: }
156:
157: /**
158: * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
159: * when they should have been labeled as hosts instead.
160: * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
161: *
162: * @deprecated Remove in 3.X and make true the only valid value
163: */
164: public boolean isReplaceInvalidAcronym() {
165: return replaceInvalidAcronym;
166: }
167:
168: /**
169: *
170: * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
171: * @deprecated Remove in 3.X and make true the only valid value
172: *
173: * See https://issues.apache.org/jira/browse/LUCENE-1068
174: */
175: public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
176: this.replaceInvalidAcronym = replaceInvalidAcronym;
177: }
178: }
|