001: /*
002: * TestTokenizerSource.java: JUnit test for a Tokenizer
003: *
004: * Copyright (C) 2004 Heiko Blau
005: *
006: * This file belongs to the JTopas test suite.
007: * The JTopas test suite is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your option)
010: * any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with the JTopas test suite. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
028: * You should have received a copy of their JUnit licence agreement along with
029: * the JTopas test suite.
030: *
* We do NOT provide the JUnit archive junit.jar necessary to compile and run
* our tests, since we assume that you either already have it or would like
* to obtain the current release yourself.
034: * Please visit either:
035: * http://sourceforge.net/projects/junit
036: * or
037: * http://junit.org
038: * to obtain JUnit.
039: *
040: * Contact:
041: * email: heiko@susebox.de
042: */
043:
044: package de.susebox.jtopas;
045:
046: //-----------------------------------------------------------------------------
047: // Imports
048: //
049: import java.io.Reader;
050: import java.io.StringReader;
051: import java.io.File;
052: import java.io.PrintWriter;
053: import java.util.Iterator;
054: import java.util.List;
055: import java.util.LinkedList;
056:
057: import junit.framework.Test;
058: import junit.framework.TestCase;
059: import junit.framework.TestSuite;
060: import junit.framework.Assert;
061:
062: import de.susebox.TestUtilities;
063:
064: //-----------------------------------------------------------------------------
065: // Class TestTokenizerSource
066: //
067:
068: /**<p>
069: * The class contains a number of test cases that are supposed to be difficult
070: * to handle for a {@link Tokenizer}, e.g. EOF conditions inside strings etc.
071: *</p>
072: *
073: * @see TokenizerSource
074: * @author Heiko Blau
075: */
076: public class TestTokenizerSource extends TestCase {
077:
078: //---------------------------------------------------------------------------
079: // properties
080: //
081:
082: //---------------------------------------------------------------------------
083: // main method
084: //
085:
086: /**
087: * call this method to invoke the tests
088: */
089: public static void main(String[] args) {
090: String[] tests = { TestTokenizerSource.class.getName() };
091:
092: TestUtilities.run(tests, args);
093: }
094:
095: //---------------------------------------------------------------------------
096: // suite method
097: //
098:
099: /**
100: * Implementation of the JUnit method <code>suite</code>. For each set of test
101: * properties one or more tests are instantiated.
102: *
103: * @return a test suite
104: */
105: public static Test suite() {
106: TestSuite suite = new TestSuite(TestTokenizerSource.class
107: .getName());
108:
109: suite.addTest(new TestTokenizerSource("testEmptySource"));
110: suite.addTest(new TestTokenizerSource("testSmallBuffer"));
111: suite.addTest(new TestTokenizerSource("testLargeBuffer"));
112: suite.addTest(new TestTokenizerSource("testSpeed"));
113: suite.addTest(new TestTokenizerSource("testSimilarResults"));
114: suite.addTest(new TestTokenizerSource("testLargeSource"));
115: return suite;
116: }
117:
118: //---------------------------------------------------------------------------
119: // Constructor
120: //
121:
122: /**
123: * Default constructor. Standard input {@link java.lang.System#in} is used
124: * to construct the input stream reader.
125: */
126: public TestTokenizerSource(String test) {
127: super (test);
128: }
129:
130: //---------------------------------------------------------------------------
131: // Fixture setup and release
132: //
133:
134: /**
135: * Sets up the fixture, for example, open a network connection.
136: * This method is called before a test is executed.
137: */
138: protected void setUp() throws Exception {
139: }
140:
141: /**
142: * Tears down the fixture, for example, close a network connection.
143: * This method is called after a test is executed.
144: */
145: protected void tearDown() throws Exception {
146: }
147:
148: //---------------------------------------------------------------------------
149: // test cases
150: //
151:
152: /**
153: * Test empty data sources
154: */
155: public void testEmptySource() throws Throwable {
156: TokenizerSource[] source = { null, null, null, null, null, null };
157: char[] cbuf = new char[8129];
158: int count;
159:
160: source[0] = new CharArraySource(null);
161: source[1] = new ReaderSource((java.io.InputStream) null);
162: source[2] = new StringSource(null);
163: source[3] = new CharArraySource(new char[0]);
164: source[4] = new ReaderSource(new StringReader(""));
165: source[5] = new StringSource("");
166: for (int index = 0; index < source.length; ++index) {
167: count = source[index].read(cbuf, 0, cbuf.length);
168: assertTrue(source[index].getClass().getName()
169: + ": expected -1, got " + count, count == -1);
170: }
171: }
172:
173: /**
174: * Test a buffer that is smaller than the available data
175: */
176: public void testSmallBuffer() throws Throwable {
177: TokenizerSource[] source = { null, null, null };
178: char[] cbuf = new char[1];
179: char[] text = new char[DATA.length()];
180: int count;
181:
182: DATA.getChars(0, DATA.length(), text, 0);
183: source[0] = new CharArraySource(text);
184: source[1] = new ReaderSource(new StringReader(DATA));
185: source[2] = new StringSource(DATA);
186: for (int index = 0; index < source.length; ++index) {
187: for (int readIndex = 0; readIndex < DATA.length(); ++readIndex) {
188: count = source[index].read(cbuf, 0, cbuf.length);
189: assertTrue(source[index].getClass().getName()
190: + ": expected 1, got " + count, count == 1);
191: }
192: count = source[index].read(cbuf, 0, cbuf.length);
193: assertTrue(source[index].getClass().getName()
194: + ": expected -1, got " + count, count == -1);
195: }
196: }
197:
198: /**
199: * Test a buffer that is larger than the available data
200: */
201: public void testLargeBuffer() throws Throwable {
202: TokenizerSource[] source = { null, null, null };
203: char[] cbuf = new char[8192];
204: char[] text = new char[DATA.length()];
205: int count;
206:
207: DATA.getChars(0, DATA.length(), text, 0);
208: source[0] = new CharArraySource(text);
209: source[1] = new ReaderSource(new StringReader(DATA));
210: source[2] = new StringSource(DATA);
211: for (int index = 0; index < source.length; ++index) {
212: count = source[index].read(cbuf, 0, cbuf.length);
213: assertTrue(source[index].getClass().getName()
214: + ": expected " + DATA.length() + ", got " + count,
215: count == DATA.length());
216: count = source[index].read(cbuf, 0, cbuf.length);
217: assertTrue(source[index].getClass().getName()
218: + ": expected -1, got " + count, count == -1);
219: }
220: }
221:
222: /**
223: * Test speed
224: */
225: public void testSpeed() throws Throwable {
226: // construct a really huge string
227: TokenizerSource source;
228: char[] buffer;
229: String text = expandData(20000);
230: char[] cbuf = new char[text.length()];
231:
232: text.getChars(0, text.length(), cbuf, 0);
233:
234: for (int bufferSize = 8; bufferSize < 0x20000; bufferSize *= 2) {
235: System.out.println("Buffer size " + bufferSize + ":");
236: buffer = new char[bufferSize];
237:
238: // CharArraySource
239: readSource(new CharArraySource(cbuf), buffer);
240:
241: // ReaderSource
242: readSource(new ReaderSource(new StringReader(text)), buffer);
243:
244: // StringSource
245: readSource(new StringSource(text), buffer);
246: }
247: }
248:
249: /**
250: * Test similar special sequences.
251: */
252: public void testSimilarResults() throws Throwable {
253: // construct a really huge string
254: String text = expandData(1000);
255:
256: // initialize the properties
257: TokenizerProperties props = new StandardTokenizerProperties();
258: StandardTokenizer tokenizer = new StandardTokenizer();
259: TokenizerSource source;
260: long startTime;
261:
262: props.addSpecialSequence(ORIG_SMILEY, ORIG_SMILEY);
263: props.addSpecialSequence(FRIGHTENED_SMIKEY, FRIGHTENED_SMIKEY);
264: props.addSpecialSequence(WINKING_SMILEY, WINKING_SMILEY);
265: props.addString("\"", "\"", "\\");
266: props.addString("'", "'", "\\");
267:
268: try {
269: tokenizer.setTokenizerProperties(props);
270:
271: // CharArraySource
272: char[] cbuf = new char[text.length()];
273:
274: text.getChars(0, text.length(), cbuf, 0);
275:
276: // tokenize several times to avoid JIT or hotspot optimization effects
277: int loopCount = 100;
278: int loops = 0;
279: long timeTotal1 = 0;
280: long timeTotal2 = 0;
281: long timeTotal3 = 0;
282:
283: while (loops++ < loopCount) {
284: tokenizer.setSource(new CharArraySource(cbuf));
285:
286: startTime = System.currentTimeMillis();
287: List list1 = tokenize(tokenizer);
288: long time1 = System.currentTimeMillis() - startTime;
289: System.out.println("Loop #" + loops
290: + ": CharArraySource needed " + time1
291: + "ms for " + list1.size() + " token.");
292: timeTotal1 += time1;
293:
294: // ReaderSource
295: tokenizer.setSource(new ReaderSource(new StringReader(
296: text)));
297:
298: startTime = System.currentTimeMillis();
299: List list2 = tokenize(tokenizer);
300: long time2 = System.currentTimeMillis() - startTime;
301: System.out.println("Loop #" + loops
302: + ": ReaderSource needed " + time2 + "ms for "
303: + list2.size() + " token.");
304: timeTotal2 += time2;
305:
306: // StringSource
307: tokenizer.setSource(new StringSource(text));
308:
309: startTime = System.currentTimeMillis();
310: List list3 = tokenize(tokenizer);
311: long time3 = System.currentTimeMillis() - startTime;
312: System.out.println("Loop #" + loops
313: + ": StringSource needed " + time3 + "ms for "
314: + list3.size() + " token.");
315: timeTotal3 += time3;
316:
317: System.out.println("CharArraySource has "
318: + list1.size() + " token.");
319: System.out.println("ReaderSource has " + list2.size()
320: + " token.");
321: System.out.println("StringSource has " + list3.size()
322: + " token.");
323:
324: // any list shorter than the others?
325: assertTrue(
326: "CharArraySource token count differs from ReaderSource token count.",
327: list1.size() == list2.size());
328: assertTrue(
329: "CharArraySource token count differs from StringSource token count.",
330: list1.size() == list3.size());
331:
332: // check token list only once
333: if (loops == loopCount) {
334: System.out.println("CharArraySource total time: "
335: + timeTotal1 + "ms.");
336: System.out.println("ReaderSource total time: "
337: + timeTotal2 + "ms.");
338: System.out.println("StringSource total time: "
339: + timeTotal3 + "ms.");
340:
341: Iterator iter1 = list1.iterator();
342: Iterator iter2 = list2.iterator();
343: Iterator iter3 = list3.iterator();
344: int index = 0;
345: while (iter1.hasNext()) {
346: // compare token
347: Token token1 = (Token) iter1.next();
348: Token token2 = (Token) iter2.next();
349: Token token3 = (Token) iter3.next();
350:
351: assertTrue("Token mismatch at position "
352: + index + ": CharArraySource \""
353: + token1 + "\", ReaderSource \""
354: + token2 + "\"", token1.equals(token2));
355: assertTrue("Token mismatch at position "
356: + index + ": CharArraySource \""
357: + token1 + "\", StringSource \""
358: + token3 + "\"", token1.equals(token3));
359: index++;
360: }
361: }
362: }
363: } finally {
364: tokenizer.close();
365: }
366: }
367:
368: /**
369: * Test similar special sequences.
370: */
371: public void testLargeSource() throws Throwable {
372: // construct a large data source
373: String dataItem = "/*\n"
374: + "* This is a Java style data item.\n"
375: + "* It is concatenated \"multible\" times to get a real\n"
376: + "* big chunk of data.\n"
377: + "* With such a lot of characters the speed of the tokenizers\n"
378: + "* can be compared.\n" + "*/\n"
379: + "package org.muppets.gonzo;\n\n" + "/**\n"
380: + "* This is a class comment :-)\n" + "*/\n"
381: + "public class Gonzo extends Serializable {\n\n"
382: + " /** The standard constructor */\n"
383: + " public Gonzo() {\n" + " // nothing todo here\n"
384: + " }\n\n" + " /** a method */\n"
385: + " public String toString() {\n"
386: + " return \"This is Gonzo\";\n" + " }\n\n"
387: + "}\n\n\n";
388: int tokenCountPerItem = 35;
389: int tokenCount = 0;
390: int maxSize = 0x80000;
391: StringBuffer data = new StringBuffer(maxSize);
392:
393: while (data.length() < maxSize) {
394: data.append(dataItem);
395: tokenCount += tokenCountPerItem;
396: }
397: tokenCount++; // EOF token
398:
399: // Set up the Properties
400: TokenizerProperties props = new StandardTokenizerProperties();
401:
402: props
403: .setParseFlags(Flags.F_RETURN_BLOCK_COMMENTS
404: + Flags.F_RETURN_LINE_COMMENTS
405: + Flags.F_TOKEN_POS_ONLY);
406: props.addBlockComment("/*", "*/");
407: props.addBlockComment("/**", "*/");
408: props.addLineComment("//");
409: props.addString("\"", "\"", "\\");
410: props.addString("'", "'", "\\");
411: props.addKeyword("package");
412: props.addKeyword("public");
413: props.addKeyword("class");
414: props.addKeyword("extends");
415: props.addKeyword("return");
416: props.addKeyword("if");
417: props.addKeyword("then");
418: props.addKeyword("while");
419: props.addKeyword("for");
420: props.addKeyword("int");
421: props.addKeyword("char");
422: props.addSpecialSequence("(");
423: props.addSpecialSequence(")");
424: props.addSpecialSequence(";");
425: props.addSpecialSequence("==");
426: props.addSpecialSequence("!=");
427: props.addSpecialSequence("<=");
428: props.addSpecialSequence(">=");
429:
430: // create the tokenizers.
431: // NOTE: the sources have a special structure that is required for the
432: // analysis below
433: Tokenizer tokenizer = new StandardTokenizer(props);
434: Object[] sources = new Object[] {
435: new StringSource(data.toString()),
436: new ReaderSource(new StringReader(data.toString())),
437: new StringSource(data.toString().substring(0,
438: data.toString().length() / 2)),
439: new ReaderSource(new StringReader(data.toString()
440: .substring(0, data.toString().length() / 2))),
441: new StringSource(data.toString().substring(0,
442: data.toString().length() / 5)),
443: new ReaderSource(new StringReader(data.toString()
444: .substring(0, data.toString().length() / 5))),
445: new StringSource(data.toString().substring(0,
446: data.toString().length() / 20)),
447: new ReaderSource(new StringReader(data.toString()
448: .substring(0, data.toString().length() / 20))) };
449: Object[] tokenLists = new Object[] { null, null, null, null,
450: null, null, null, null };
451:
452: try {
453: for (int index = 0; index < sources.length; ++index) {
454: long start = System.currentTimeMillis();
455:
456: System.out.println(sources[index].getClass().getName()
457: + ": running ...");
458: tokenizer.setSource((TokenizerSource) sources[index]);
459:
460: tokenLists[index] = tokenize(tokenizer);
461:
462: System.out.println(sources[index].getClass().getName()
463: + ": " + (System.currentTimeMillis() - start)
464: + "ms.");
465: }
466: } finally {
467: tokenizer.close();
468: }
469:
470: // check the results
471: for (int index = 0; index < sources.length; ++index) {
472: List tokenList = (List) tokenLists[index];
473:
474: System.out.println(sources[index].getClass().getName()
475: + " has " + tokenList.size() + " token.");
476:
477: // only the first 2 data sources have the full token count
478: if (index < 2) {
479: assertTrue("Expected " + tokenCount + " token, got "
480: + tokenList.size(), tokenCount == tokenList
481: .size());
482: }
483:
484: // compare two lists with the same amount of data
485: if (index % 2 == 1) {
486: List tokenList0 = (List) tokenLists[index - 1];
487: Iterator iter0 = tokenList0.iterator();
488: Iterator iter = tokenList.iterator();
489: int tokenIndex = 0;
490:
491: while (iter.hasNext()) {
492: Token token0 = (Token) iter0.next();
493: Token token = (Token) iter.next();
494:
495: assertTrue("Token #" + tokenIndex + "differs:\n"
496: + token0 + "\n" + token, token0
497: .equals(token));
498: tokenIndex++;
499: }
500: }
501: }
502: }
503:
504: //---------------------------------------------------------------------------
505: // helpers
506: //
507:
508: /**
509: * This method returns a {@link java.util.List} of Token.
510: */
511: private List tokenize(Tokenizer tokenizer) throws Throwable {
512: List list = new LinkedList();
513: //File file = File.createTempFile(tokenizer.getSource().getClass().getName(), null);
514: //PrintWriter writer = new PrintWriter(file.getAbsolutePath());
515:
516: try {
517: while (tokenizer.hasMoreToken()) {
518: Token token = tokenizer.nextToken();
519:
520: // writer.println(token);
521: list.add(token);
522: }
523: } finally {
524: // writer.close();
525: }
526: return list;
527: }
528:
529: /**
530: * Expand some text
531: */
532: private String expandData(int factor) {
533: StringBuffer expandedData = new StringBuffer(DATA.length()
534: * factor);
535:
536: for (int ii = 0; ii < factor; ++ii) {
537: expandedData.append(DATA);
538: }
539: return expandedData.toString();
540: }
541:
542: /**
543: * Read the full source
544: */
545: private void readSource(TokenizerSource source, char[] buffer)
546: throws Throwable {
547: long startTime = System.currentTimeMillis();
548: int chars;
549:
550: while ((chars = source.read(buffer, 0, buffer.length)) > 0)
551: ;
552: System.out.println(source.getClass().getName() + " needed "
553: + (System.currentTimeMillis() - startTime) + "ms.");
554: }
555:
556: //---------------------------------------------------------------------------
557: // members
558: //
559:
560: // various constants
561: private static final String ORIG_SMILEY = ":-)";
562: private static final String FRIGHTENED_SMIKEY = "=8-[";
563: private static final String WINKING_SMILEY = ".-\\";
564:
565: // Text data for the tests
566: private static final String DATA = "this is a simple text with a lot of perfectly normal\n"
567: + "token. And a few separators (brackets are some, for instance)\n"
568: + "as well. There could\talso be some\ttabs (\"\\t\")\n"
569: + "in between. And 'some strings' :-).\n"
570: + "And the smileys (;-), =8-[, .-\\ etc.) should be regarded as\n"
571: + "'special sequences'.\n\n";
572: }
|