001: package org.apache.lucene.analysis.fr;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2004 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import java.io.StringReader;
058:
059: import junit.framework.TestCase;
060:
061: import org.apache.lucene.analysis.Analyzer;
062: import org.apache.lucene.analysis.Token;
063: import org.apache.lucene.analysis.TokenStream;
064:
065: /**
066: * Test case for FrenchAnalyzer.
067: *
068: * @author Jean-François Halleux
069: * @version $version$
070: */
071:
072: public class TestFrenchAnalyzer extends TestCase {
073:
074: // Method copied from TestAnalyzers, maybe should be refactored
075: public void assertAnalyzesTo(Analyzer a, String input,
076: String[] output) throws Exception {
077:
078: TokenStream ts = a
079: .tokenStream("dummy", new StringReader(input));
080:
081: for (int i = 0; i < output.length; i++) {
082: Token t = ts.next();
083: assertNotNull(t);
084: assertEquals(t.termText(), output[i]);
085: }
086: assertNull(ts.next());
087: ts.close();
088: }
089:
090: public void testAnalyzer() throws Exception {
091: FrenchAnalyzer fa = new FrenchAnalyzer();
092:
093: // test null reader
094: boolean iaeFlag = false;
095: try {
096: fa.tokenStream("dummy", null);
097: } catch (IllegalArgumentException iae) {
098: iaeFlag = true;
099: }
100: assertEquals(iaeFlag, true);
101:
102: // test null fieldname
103: iaeFlag = false;
104: try {
105: fa.tokenStream(null, new StringReader("dummy"));
106: } catch (IllegalArgumentException iae) {
107: iaeFlag = true;
108: }
109: assertEquals(iaeFlag, true);
110:
111: assertAnalyzesTo(fa, "", new String[] {});
112:
113: assertAnalyzesTo(fa, "chien chat cheval", new String[] {
114: "chien", "chat", "cheval" });
115:
116: assertAnalyzesTo(fa, "chien CHAT CHEVAL", new String[] {
117: "chien", "chat", "cheval" });
118:
119: assertAnalyzesTo(fa, " chien ,? + = - CHAT /: > CHEVAL",
120: new String[] { "chien", "chat", "cheval" });
121:
122: assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
123:
124: assertAnalyzesTo(fa, "mot \"entreguillemet\"", new String[] {
125: "mot", "entreguillemet" });
126:
127: // let's do some french specific tests now
128:
129: /* 1. couldn't resist
130: I would expect this to stay one term as in French the minus
131: sign is often used for composing words */
132: assertAnalyzesTo(fa, "Jean-François", new String[] { "jean",
133: "françois" });
134:
135: // 2. stopwords
136: assertAnalyzesTo(fa,
137: "le la chien les aux chat du des à cheval",
138: new String[] { "chien", "chat", "cheval" });
139:
140: // some nouns and adjectives
141: assertAnalyzesTo(fa,
142: "lances chismes habitable chiste éléments captifs",
143: new String[] { "lanc", "chism", "habit", "chist",
144: "élément", "captif" });
145:
146: // some verbs
147: assertAnalyzesTo(fa, "finissions souffrirent rugissante",
148: new String[] { "fin", "souffr", "rug" });
149:
150: // some everything else
151: // aujourd'hui stays one term which is OK
152: assertAnalyzesTo(
153: fa,
154: "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
155: new String[] { "c3po", "aujourd'hui", "oeuf",
156: "ïâöûàä", "anticonstitutionnel", "jav" });
157:
158: // some more everything else
159: // here 1940-1945 stays as one term, 1940:1945 not ?
160: assertAnalyzesTo(fa, "33Bis 1940-1945 1940:1945 (---i+++)*",
161: new String[] { "33bis", "1940-1945", "1940", "1945",
162: "i" });
163:
164: }
165:
166: }
|