001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.wikipedia.analysis;
017:
018: import junit.framework.TestCase;
019: import org.apache.lucene.analysis.Token;
020:
021: import java.io.StringReader;
022: import java.util.HashMap;
023: import java.util.Map;
024:
025: /**
026: *
027: *
028: **/
029: public class WikipediaTokenizerTest extends TestCase {
030:
031: public WikipediaTokenizerTest(String s) {
032: super (s);
033: }
034:
035: protected void setUp() {
036: }
037:
038: protected void tearDown() {
039:
040: }
041:
042: public void testHandwritten() throws Exception {
043: //make sure all tokens are in only one type
044: String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] "
045: + "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] "
046: + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
047: + " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. "
048: + "==heading== ===sub head=== followed by some text [[Category:blah| ]] "
049: + "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed."
050: + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
051: + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
052: + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
053: Map tcm = new HashMap();//map tokens to types
054: tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
055: tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
056: tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
057:
058: tcm.put("http://lucene.apache.org",
059: WikipediaTokenizer.EXTERNAL_LINK_URL);
060: tcm.put("http://foo.boo.com/test/test/",
061: WikipediaTokenizer.EXTERNAL_LINK_URL);
062: tcm.put("http://foo.boo.com/test/test/test.html",
063: WikipediaTokenizer.EXTERNAL_LINK_URL);
064: tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d",
065: WikipediaTokenizer.EXTERNAL_LINK_URL);
066: tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
067:
068: //alphanums
069: tcm.put("This", "<ALPHANUM>");
070: tcm.put("is", "<ALPHANUM>");
071: tcm.put("a", "<ALPHANUM>");
072: tcm.put("Category", "<ALPHANUM>");
073: tcm.put("linked", "<ALPHANUM>");
074: tcm.put("parens", "<ALPHANUM>");
075: tcm.put("external", "<ALPHANUM>");
076: tcm.put("URL", "<ALPHANUM>");
077: tcm.put("and", "<ALPHANUM>");
078: tcm.put("period", "<ALPHANUM>");
079: tcm.put("Here", "<ALPHANUM>");
080: tcm.put("Here's", "<APOSTROPHE>");
081: tcm.put("here", "<ALPHANUM>");
082: tcm.put("Johnny", "<ALPHANUM>");
083: tcm.put("followed", "<ALPHANUM>");
084: tcm.put("by", "<ALPHANUM>");
085: tcm.put("text", "<ALPHANUM>");
086: tcm.put("that", "<ALPHANUM>");
087: tcm.put("but", "<ALPHANUM>");
088: tcm.put("never", "<ALPHANUM>");
089: tcm.put("closed", "<ALPHANUM>");
090: tcm.put("goes", "<ALPHANUM>");
091: tcm.put("for", "<ALPHANUM>");
092: tcm.put("this", "<ALPHANUM>");
093: tcm.put("an", "<ALPHANUM>");
094: tcm.put("some", "<ALPHANUM>");
095: tcm.put("martian", "<ALPHANUM>");
096: tcm.put("code", "<ALPHANUM>");
097:
098: tcm.put("foo", WikipediaTokenizer.CATEGORY);
099: tcm.put("bar", WikipediaTokenizer.CATEGORY);
100: tcm.put("none", WikipediaTokenizer.CATEGORY);
101: tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
102: tcm.put("blah", WikipediaTokenizer.CATEGORY);
103: tcm.put("ital", WikipediaTokenizer.CATEGORY);
104: tcm.put("cat", WikipediaTokenizer.CATEGORY);
105:
106: tcm.put("italics", WikipediaTokenizer.ITALICS);
107: tcm.put("more", WikipediaTokenizer.ITALICS);
108: tcm.put("bold", WikipediaTokenizer.BOLD);
109: tcm.put("same", WikipediaTokenizer.BOLD);
110: tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
111: tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
112: tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
113:
114: tcm.put("heading", WikipediaTokenizer.HEADING);
115: tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
116: tcm.put("head", WikipediaTokenizer.SUB_HEADING);
117:
118: tcm.put("Citation", WikipediaTokenizer.CITATION);
119:
120: tcm.put("3.25", "<NUM>");
121: tcm.put("3.50", "<NUM>");
122: WikipediaTokenizer tf = new WikipediaTokenizer(
123: new StringReader(test));
124: Token token = new Token();
125: int count = 0;
126: int numItalics = 0;
127: int numBoldItalics = 0;
128: int numCategory = 0;
129: int numCitation = 0;
130: while ((token = tf.next(token)) != null) {
131: String tokText = token.termText();
132: //System.out.println("Text: " + tokText + " Type: " + token.type());
133: assertTrue("token is null and it shouldn't be",
134: token != null);
135: String expectedType = (String) tcm.get(tokText);
136: assertTrue("expectedType is null and it shouldn't be for: "
137: + token, expectedType != null);
138: assertTrue(token.type() + " is not equal to "
139: + expectedType + " for " + token, token.type()
140: .equals(expectedType) == true);
141: count++;
142: if (token.type().equals(WikipediaTokenizer.ITALICS) == true) {
143: numItalics++;
144: } else if (token.type().equals(
145: WikipediaTokenizer.BOLD_ITALICS) == true) {
146: numBoldItalics++;
147: } else if (token.type().equals(WikipediaTokenizer.CATEGORY) == true) {
148: numCategory++;
149: } else if (token.type().equals(WikipediaTokenizer.CITATION) == true) {
150: numCitation++;
151: }
152: }
153: assertTrue("We have not seen enough tokens: " + count
154: + " is not >= " + tcm.size(), count >= tcm.size());
155: assertTrue(numItalics + " does not equal: " + 4
156: + " for numItalics", numItalics == 4);
157: assertTrue(numBoldItalics + " does not equal: " + 3
158: + " for numBoldItalics", numBoldItalics == 3);
159: assertTrue(numCategory + " does not equal: " + 10
160: + " for numCategory", numCategory == 10);
161: assertTrue(numCitation + " does not equal: " + 1
162: + " for numCitation", numCitation == 1);
163: }
164:
165: public void testLinkPhrases() throws Exception {
166: String test = "click [[link here again]] click [http://lucene.apache.org here again]";
167: WikipediaTokenizer tf = new WikipediaTokenizer(
168: new StringReader(test));
169: Token token = new Token();
170: token = tf.next(token);
171: assertTrue("token is null and it shouldn't be", token != null);
172: assertTrue(
173: new String(token.termBuffer(), 0, token.termLength())
174: + " is not equal to " + "click", new String(
175: token.termBuffer(), 0, token.termLength())
176: .equals("click") == true);
177: assertTrue(token.getPositionIncrement() + " does not equal: "
178: + 1, token.getPositionIncrement() == 1);
179: token = tf.next(token);
180: assertTrue("token is null and it shouldn't be", token != null);
181: assertTrue(
182: new String(token.termBuffer(), 0, token.termLength())
183: + " is not equal to " + "link", new String(
184: token.termBuffer(), 0, token.termLength())
185: .equals("link") == true);
186: assertTrue(token.getPositionIncrement() + " does not equal: "
187: + 1, token.getPositionIncrement() == 1);
188: token = tf.next(token);
189: assertTrue("token is null and it shouldn't be", token != null);
190: assertTrue(
191: new String(token.termBuffer(), 0, token.termLength())
192: + " is not equal to " + "here", new String(
193: token.termBuffer(), 0, token.termLength())
194: .equals("here") == true);
195: //The link, and here should be at the same position for phrases to work
196: assertTrue(token.getPositionIncrement() + " does not equal: "
197: + 1, token.getPositionIncrement() == 1);
198: token = tf.next(token);
199: assertTrue("token is null and it shouldn't be", token != null);
200: assertTrue(
201: new String(token.termBuffer(), 0, token.termLength())
202: + " is not equal to " + "again", new String(
203: token.termBuffer(), 0, token.termLength())
204: .equals("again") == true);
205: assertTrue(token.getPositionIncrement() + " does not equal: "
206: + 1, token.getPositionIncrement() == 1);
207:
208: token = tf.next(token);
209: assertTrue("token is null and it shouldn't be", token != null);
210: assertTrue(
211: new String(token.termBuffer(), 0, token.termLength())
212: + " is not equal to " + "click", new String(
213: token.termBuffer(), 0, token.termLength())
214: .equals("click") == true);
215: assertTrue(token.getPositionIncrement() + " does not equal: "
216: + 1, token.getPositionIncrement() == 1);
217:
218: token = tf.next(token);
219: assertTrue("token is null and it shouldn't be", token != null);
220: assertTrue(
221: new String(token.termBuffer(), 0, token.termLength())
222: + " is not equal to "
223: + "http://lucene.apache.org", new String(token
224: .termBuffer(), 0, token.termLength())
225: .equals("http://lucene.apache.org") == true);
226: assertTrue(token.getPositionIncrement() + " does not equal: "
227: + 1, token.getPositionIncrement() == 1);
228:
229: token = tf.next(token);
230: assertTrue("token is null and it shouldn't be", token != null);
231: assertTrue(
232: new String(token.termBuffer(), 0, token.termLength())
233: + " is not equal to " + "here", new String(
234: token.termBuffer(), 0, token.termLength())
235: .equals("here") == true);
236: assertTrue(token.getPositionIncrement() + " does not equal: "
237: + 0, token.getPositionIncrement() == 0);
238:
239: token = tf.next(token);
240: assertTrue("token is null and it shouldn't be", token != null);
241: assertTrue(
242: new String(token.termBuffer(), 0, token.termLength())
243: + " is not equal to " + "again", new String(
244: token.termBuffer(), 0, token.termLength())
245: .equals("again") == true);
246: assertTrue(token.getPositionIncrement() + " does not equal: "
247: + 1, token.getPositionIncrement() == 1);
248:
249: }
250:
251: public void testLinks() throws Exception {
252: String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
253: WikipediaTokenizer tf = new WikipediaTokenizer(
254: new StringReader(test));
255: Token token = new Token();
256: token = tf.next(token);
257: assertTrue("token is null and it shouldn't be", token != null);
258: assertTrue(
259: new String(token.termBuffer(), 0, token.termLength())
260: + " is not equal to "
261: + "http://lucene.apache.org/java/docs/index.html#news",
262: new String(token.termBuffer(), 0, token.termLength())
263: .equals("http://lucene.apache.org/java/docs/index.html#news") == true);
264: assertTrue(token.type() + " is not equal to "
265: + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type()
266: .equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
267: tf.next(token);//skip here
268: token = tf.next(token);
269: assertTrue("token is null and it shouldn't be", token != null);
270: assertTrue(
271: new String(token.termBuffer(), 0, token.termLength())
272: + " is not equal to "
273: + "http://lucene.apache.org/java/docs/index.html?b=c",
274: new String(token.termBuffer(), 0, token.termLength())
275: .equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
276: assertTrue(token.type() + " is not equal to "
277: + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type()
278: .equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
279: tf.next(token);//skip here
280: token = tf.next(token);
281: assertTrue("token is null and it shouldn't be", token != null);
282: assertTrue(
283: new String(token.termBuffer(), 0, token.termLength())
284: + " is not equal to "
285: + "https://lucene.apache.org/java/docs/index.html?b=c",
286: new String(token.termBuffer(), 0, token.termLength())
287: .equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
288: assertTrue(token.type() + " is not equal to "
289: + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type()
290: .equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
291: }
292: }
|