package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import junit.framework.TestCase;

import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Verifies that Lucene PatternAnalyzer and the equivalent plain Lucene
 * analyzers exhibit the same behaviour, i.e. return the same tokens for
 * any given free text. Runs a set of texts against both kinds of
 * tokenizers/analyzers. Can also be used as a simple benchmark.
 * <p>
 * Example usage:
 * <pre>
 * cd lucene-cvs
 * java org.apache.lucene.index.memory.PatternAnalyzerTest 1 1 patluc 1 2 2 *.txt *.xml docs/*.html src/java/org/apache/lucene/index/*.java xdocs/*.xml ../nux/samples/data/*.xml
 * </pre>
 *
 * Running with WhitespaceAnalyzer can surface discrepancies. These are not
 * bugs but a questionable Lucene limitation: CharTokenizer.MAX_WORD_LEN = 255,
 * so WhitespaceAnalyzer silently truncates tokens longer than 255 characters,
 * whereas PatternAnalyzer emits them in full; the comparison in
 * assertEquals() then fails.
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzerTest extends TestCase {

  /** Runs the tests and/or benchmark */
  public static void main(String[] args) throws Throwable {
    new PatternAnalyzerTest().run(args);
  }

  public void testMany() throws Throwable {
//    String[] files = MemoryIndexTest.listFiles(new String[] {
//      "*.txt", "*.html", "*.xml", "xdocs/*.xml",
//      "src/test/org/apache/lucene/queryParser/*.java",
//      "src/org/apache/lucene/index/memory/*.java",
//    });
    String[] files = MemoryIndexTest.listFiles(new String[] {
      "../../*.txt", "../../*.html", "../../*.xml", "../../xdocs/*.xml",
      "../../src/test/org/apache/lucene/queryParser/*.java",
      "src/java/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] { "1", "1", "patluc", "1", "2", "2", };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  }

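  /**
   * Parses the positional arguments in the order
   * [iters] [runs] [cmd] [maxLetters] [maxToLower] [maxStops] [files...],
   * where cmd contains "pat" and/or "luc" to select which analyzers to
   * exercise; any trailing arguments that are omitted fall back to the
   * defaults assigned below.
   */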
  private void run(String[] args) throws Throwable {
    int k = -1;

    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));

    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));

    String cmd = "patluc";
    if (args.length > ++k) cmd = args[k];
    boolean usePattern = cmd.indexOf("pat") >= 0;
    boolean useLucene = cmd.indexOf("luc") >= 0;

    int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc
    if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);

    int maxToLower = 2;
    if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);

    int maxStops = 2;
    if (args.length > ++k) maxStops = Integer.parseInt(args[k]);

    File[] files = new File[] { new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i = k; i < args.length; i++) {
        files[i - k] = new File(args[i]);
      }
    }

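    // exercise every parameter combination over all files, iters times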
    for (int iter = 0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();
      long bytes = 0;

      for (int i = 0; i < files.length; i++) {
        File file = files[i];
        if (!file.exists() || file.isDirectory()) continue; // ignore
        bytes += file.length();
        String text = toString(new FileInputStream(file), null);
        System.out.println("\n*********** FILE=" + file);

        for (int letters = 0; letters < maxLetters; letters++) {
          boolean lettersOnly = letters == 0;

          for (int stops = 0; stops < maxStops; stops++) {
            Set stopWords = null;
            if (stops != 0)
              stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);

            for (int toLower = 0; toLower < maxToLower; toLower++) {
              boolean toLowerCase = toLower != 0;

              for (int run = 0; run < runs; run++) {
                List tokens1 = null;
                List tokens2 = null;
                try {
                  if (usePattern)
                    tokens1 = getTokens(patternTokenStream(
                        text, lettersOnly, toLowerCase, stopWords));
                  if (useLucene)
                    tokens2 = getTokens(luceneTokenStream(
                        text, lettersOnly, toLowerCase, stopWords));
                  if (usePattern && useLucene)
                    assertEquals(tokens1, tokens2);
                } catch (Throwable t) {
                  if (t instanceof OutOfMemoryError) t.printStackTrace();
                  System.out.println("fatal error at file=" + file
                      + ", letters=" + lettersOnly
                      + ", toLowerCase=" + toLowerCase
                      + ", stopwords=" + (stopWords != null ? "english" : "none"));
                  System.out.println("\n\ntokens1=" + toString(tokens1));
                  System.out.println("\n\ntokens2=" + toString(tokens2));
                  throw t;
                }
              }
            }
          }
        }
        long end = System.currentTimeMillis();
        System.out.println("\nsecs = " + ((end - start) / 1000.0f));
        System.out.println("files/sec= "
            + (1.0f * runs * maxLetters * maxToLower * maxStops
                * files.length / ((end - start) / 1000.0f)));
        float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops)
            / (1024.0f * 1024.0f);
        System.out.println("MB/sec = " + (mb / ((end - start) / 1000.0f)));
      }
    }

    if (usePattern && useLucene)
      System.out.println("No bug found. Done.");
    else
      System.out.println("Done benchmarking (without checking correctness).");
  }

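  // builds the PatternAnalyzer token stream that should be equivalent to
  // the plain Lucene pipeline constructed in luceneTokenStream() below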
  private TokenStream patternTokenStream(String text, boolean letters,
      boolean toLowerCase, Set stopWords) {
    Pattern pattern;
    if (letters)
      pattern = PatternAnalyzer.NON_WORD_PATTERN;
    else
      pattern = PatternAnalyzer.WHITESPACE_PATTERN;
    PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
    return analyzer.tokenStream("", text);
  }

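  // builds the reference pipeline: (Letter|Whitespace)Tokenizer,
  // optionally followed by LowerCaseFilter and StopFilter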
  private TokenStream luceneTokenStream(String text, boolean letters,
      boolean toLowerCase, Set stopWords) {
    TokenStream stream;
    if (letters)
      stream = new LetterTokenizer(new StringReader(text));
    else
      stream = new WhitespaceTokenizer(new StringReader(text));
    if (toLowerCase) stream = new LowerCaseFilter(stream);
    if (stopWords != null) stream = new StopFilter(stream, stopWords);
    return stream;
  }

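  // drains the stream into a list of tokens for later comparison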
  private List getTokens(TokenStream stream) throws IOException {
    ArrayList tokens = new ArrayList();
    Token token;
    while ((token = stream.next()) != null) {
      tokens.add(token);
    }
    return tokens;
  }

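  // asserts both token lists agree on termText, offsets, type and size,
  // dumping some context around the mismatch before rethrowing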
  private void assertEquals(List tokens1, List tokens2) {
    int size = Math.min(tokens1.size(), tokens2.size());
    int i = 0;
    try {
      for (; i < size; i++) {
        Token t1 = (Token) tokens1.get(i);
        Token t2 = (Token) tokens2.get(i);
        if (!(t1.termText().equals(t2.termText())))
          throw new IllegalStateException("termText");
        if (t1.startOffset() != t2.startOffset())
          throw new IllegalStateException("startOffset");
        if (t1.endOffset() != t2.endOffset())
          throw new IllegalStateException("endOffset");
        if (!(t1.type().equals(t2.type())))
          throw new IllegalStateException("type");
      }
      if (tokens1.size() != tokens2.size())
        throw new IllegalStateException("size1=" + tokens1.size()
            + ", size2=" + tokens2.size());
    } catch (IllegalStateException e) {
      if (size > 0) {
        int j = Math.min(i, size - 1); // index at (or just before) the mismatch
        System.out.println("i=" + i + ", size=" + size);
        System.out.println("t1[" + j + "]='"
            + ((Token) tokens1.get(j)).termText() + "'");
        System.out.println("t2[" + j + "]='"
            + ((Token) tokens2.get(j)).termText() + "'");
      }
      throw e;
    }
  }

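  // renders a token list as a string of quoted terms for diagnostics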
  private String toString(List tokens) {
    if (tokens == null) return "null";
    StringBuffer str = new StringBuffer("[");
    for (int i = 0; i < tokens.size(); i++) {
      Token t1 = (Token) tokens.get(i);
      str.append("'").append(t1.termText()).append("', ");
    }
    return str.append("]").toString();
  }

  // trick to detect the default platform charset
  private static final Charset DEFAULT_PLATFORM_CHARSET = Charset
      .forName(new InputStreamReader(new ByteArrayInputStream(new byte[0]))
          .getEncoding());

  // the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
  private static String toString(InputStream input, Charset charset)
      throws IOException {
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
  }

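  // reads the entire stream into a byte array, doubling the buffer as
  // needed; always closes the input stream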
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte[] tmp = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }

}
|