package org.apache.lucene.analysis;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.LuceneTestCase;

import java.io.StringReader;

/**
 * Copyright 2004 The Apache Software Foundation
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TestStandardAnalyzer extends LuceneTestCase {

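  // Default analyzer, shared by most of the tests below.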
  private Analyzer a = new StandardAnalyzer();

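  /**
   * Tokenizes the input with the given analyzer and checks the produced
   * tokens against the expected term texts. The overloads below additionally
   * verify token types and position increments when those are supplied.
   */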
  public void assertAnalyzesTo(Analyzer a, String input,
                               String[] expected) throws Exception {
    assertAnalyzesTo(a, input, expected, null);
  }

  public void assertAnalyzesTo(Analyzer a, String input,
                               String[] expectedImages,
                               String[] expectedTypes) throws Exception {
    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
  }

  public void assertAnalyzesTo(Analyzer a, String input,
                               String[] expectedImages,
                               String[] expectedTypes,
                               int[] expectedPosIncrs) throws Exception {
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    for (int i = 0; i < expectedImages.length; i++) {
      Token t = ts.next();
      assertNotNull(t);
      assertEquals(expectedImages[i], t.termText());
      if (expectedTypes != null) {
        assertEquals(expectedTypes[i], t.type());
      }
      if (expectedPosIncrs != null) {
        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
      }
    }
    assertNull(ts.next());
    ts.close();
  }

  public void testMaxTermLength() throws Exception {
    StandardAnalyzer sa = new StandardAnalyzer();
    sa.setMaxTokenLength(5);
    assertAnalyzesTo(sa, "ab cd toolong xy z",
        new String[] { "ab", "cd", "xy", "z" });
  }

  public void testMaxTermLength2() throws Exception {
    StandardAnalyzer sa = new StandardAnalyzer();
    assertAnalyzesTo(sa, "ab cd toolong xy z",
        new String[] { "ab", "cd", "toolong", "xy", "z" });
    sa.setMaxTokenLength(5);

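    // With the 5-character limit, "toolong" is dropped but still counted, so
    // "xy" follows "cd" with a position increment of 2.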
    assertAnalyzesTo(sa, "ab cd toolong xy z",
        new String[] { "ab", "cd", "xy", "z" },
        null, new int[] { 1, 1, 2, 1 });
  }

  public void testMaxTermLength3() throws Exception {
    char[] chars = new char[255];
    for (int i = 0; i < 255; i++)
      chars[i] = 'a';
    String longTerm = new String(chars, 0, 255);

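    // longTerm is exactly 255 characters, the analyzer's default maximum token
    // length, so it is kept; one extra character pushes it over the limit and
    // the whole term is dropped.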
    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z",
        new String[] { "ab", "cd", longTerm, "xy", "z" });
    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z",
        new String[] { "ab", "cd", "xy", "z" });
  }

  public void testAlphanumeric() throws Exception {
    // alphanumeric tokens
    assertAnalyzesTo(a, "B2B", new String[] { "b2b" });
    assertAnalyzesTo(a, "2B", new String[] { "2b" });
  }

  public void testUnderscores() throws Exception {
    // underscores are delimiters, but not in email addresses (below)
    assertAnalyzesTo(a, "word_having_underscore",
        new String[] { "word", "having", "underscore" });
    assertAnalyzesTo(a, "word_with_underscore_and_stopwords",
        new String[] { "word", "underscore", "stopwords" });
  }

  public void testDelimiters() throws Exception {
    // other delimiters: "-", "/", ","
    assertAnalyzesTo(a, "some-dashed-phrase",
        new String[] { "some", "dashed", "phrase" });
    assertAnalyzesTo(a, "dogs,chase,cats",
        new String[] { "dogs", "chase", "cats" });
    assertAnalyzesTo(a, "ac/dc", new String[] { "ac", "dc" });
  }

  public void testApostrophes() throws Exception {
    // internal apostrophes: O'Reilly, you're, O'Reilly's
    // possessives are actually removed by StandardFilter, not the tokenizer
    assertAnalyzesTo(a, "O'Reilly", new String[] { "o'reilly" });
    assertAnalyzesTo(a, "you're", new String[] { "you're" });
    assertAnalyzesTo(a, "she's", new String[] { "she" });
    assertAnalyzesTo(a, "Jim's", new String[] { "jim" });
    assertAnalyzesTo(a, "don't", new String[] { "don't" });
    assertAnalyzesTo(a, "O'Reilly's", new String[] { "o'reilly" });
  }

  public void testTSADash() throws Exception {
    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
    // to correctly search for these terms:
    assertAnalyzesTo(a, "s-class", new String[] { "s", "class" });
    assertAnalyzesTo(a, "t-com", new String[] { "t", "com" });
    // 'a' is still a stopword:
    assertAnalyzesTo(a, "a-class", new String[] { "class" });
  }

  public void testCompanyNames() throws Exception {
    // company names
    assertAnalyzesTo(a, "AT&T", new String[] { "at&t" });
    assertAnalyzesTo(a, "Excite@Home", new String[] { "excite@home" });
  }

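  // LUCENE-1140: analyzing "www.nutch.org." must not throw a NullPointerException.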
  public void testLucene1140() throws Exception {
    try {
      StandardAnalyzer analyzer = new StandardAnalyzer(true);
      assertAnalyzesTo(analyzer, "www.nutch.org.",
          new String[] { "www.nutch.org" },
          new String[] { "<HOST>" });
    } catch (NullPointerException e) {
      fail("Should not throw an NPE and it did");
    }
  }

  public void testDomainNames() throws Exception {
    // domain names
    assertAnalyzesTo(a, "www.nutch.org",
        new String[] { "www.nutch.org" });
    // Note the trailing "."; see https://issues.apache.org/jira/browse/LUCENE-1068.
    // TODO: remove in 3.x
    assertAnalyzesTo(a, "www.nutch.org.",
        new String[] { "wwwnutchorg" },
        new String[] { "<ACRONYM>" });
    // The following should be recognized as HOST. The code that sets
    // replaceInvalidAcronym should be removed in the next release.
    ((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
    assertAnalyzesTo(a, "www.nutch.org.",
        new String[] { "www.nutch.org" },
        new String[] { "<HOST>" });
  }

  public void testEMailAddresses() throws Exception {
    // email addresses, possibly with underscores, periods, etc
    assertAnalyzesTo(a, "test@example.com",
        new String[] { "test@example.com" });
    assertAnalyzesTo(a, "first.lastname@example.com",
        new String[] { "first.lastname@example.com" });
    assertAnalyzesTo(a, "first_lastname@example.com",
        new String[] { "first_lastname@example.com" });
  }

  public void testNumeric() throws Exception {
    // floating point, serial, model numbers, ip addresses, etc.
    // every other segment must have at least one digit
    assertAnalyzesTo(a, "21.35", new String[] { "21.35" });
    assertAnalyzesTo(a, "R2D2 C3PO", new String[] { "r2d2", "c3po" });
    assertAnalyzesTo(a, "216.239.63.104",
        new String[] { "216.239.63.104" });
    assertAnalyzesTo(a, "1-2-3", new String[] { "1-2-3" });
    assertAnalyzesTo(a, "a1-b2-c3", new String[] { "a1-b2-c3" });
    assertAnalyzesTo(a, "a1-b-c3", new String[] { "a1-b-c3" });
  }

  public void testTextWithNumbers() throws Exception {
    // numbers
    assertAnalyzesTo(a, "David has 5000 bones",
        new String[] { "david", "has", "5000", "bones" });
  }

  public void testVariousText() throws Exception {
    // various
    assertAnalyzesTo(a, "C embedded developers wanted",
        new String[] { "c", "embedded", "developers", "wanted" });
    assertAnalyzesTo(a, "foo bar FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR",
        new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
        new String[] { "quoted", "word" });
  }

  public void testAcronyms() throws Exception {
    // acronyms have their dots stripped
    assertAnalyzesTo(a, "U.S.A.", new String[] { "usa" });
  }

  public void testCPlusPlusHash() throws Exception {
    // It would be nice to change the grammar in StandardTokenizer.jj to make
    // "C#" and "C++" end up as tokens.
    assertAnalyzesTo(a, "C++", new String[] { "c" });
    assertAnalyzesTo(a, "C#", new String[] { "c" });
  }

  public void testKorean() throws Exception {
    // Korean words
    assertAnalyzesTo(a, "안녕하세요 한글입니다",
        new String[] { "안녕하세요", "한글입니다" });
  }

  // Compliance with the "old" JavaCC-based analyzer, see:
  // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752

  public void testComplianceFileName() throws Exception {
    assertAnalyzesTo(a, "2004.jpg",
        new String[] { "2004.jpg" },
        new String[] { "<HOST>" });
  }

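  // "Incorrect" refers to the token type: "62.46" comes out as <HOST> rather
  // than <NUM>, matching the behavior of the old JavaCC-based analyzer.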
  public void testComplianceNumericIncorrect() throws Exception {
    assertAnalyzesTo(a, "62.46",
        new String[] { "62.46" },
        new String[] { "<HOST>" });
  }

  public void testComplianceNumericLong() throws Exception {
    assertAnalyzesTo(a, "978-0-94045043-1",
        new String[] { "978-0-94045043-1" },
        new String[] { "<NUM>" });
  }

  public void testComplianceNumericFile() throws Exception {
    assertAnalyzesTo(a, "78academyawards/rules/rule02.html",
        new String[] { "78academyawards/rules/rule02.html" },
        new String[] { "<NUM>" });
  }

  public void testComplianceNumericWithUnderscores() throws Exception {
    assertAnalyzesTo(a, "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
        new String[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs" },
        new String[] { "<NUM>" });
  }

  public void testComplianceNumericWithDash() throws Exception {
    assertAnalyzesTo(a, "mid-20th",
        new String[] { "mid-20th" },
        new String[] { "<NUM>" });
  }

  public void testComplianceManyTokens() throws Exception {
    assertAnalyzesTo(a,
        "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
            + "safari-0-sheikh-zayed-grand-mosque.jpg",
        new String[] { "money.cnn.com", "magazines", "fortune", "fortune",
            "archive/2007/03/19/8402357", "index.htm", "safari-0-sheikh",
            "zayed", "grand", "mosque.jpg" },
        new String[] { "<HOST>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
            "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>",
            "<HOST>" });
  }

  /** @deprecated this should be removed in 3.0 */
  public void testDeprecatedAcronyms() throws Exception {
    // Test backward compatibility for applications that require the old behavior.
    // This should be removed once replaceInvalidAcronym is removed.
    assertAnalyzesTo(a, "lucene.apache.org.",
        new String[] { "luceneapacheorg" },
        new String[] { "<ACRONYM>" });
  }
}