001: /*
002: * TestStandardTokenizer.java: JUnit test for the StandardTokenizer
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the JTopas test suite.
007: * The JTopas test suite is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your option)
010: * any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with the JTopas test suite. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
028: * You should have received a copy of their JUnit licence agreement along with
029: * the JTopas test suite.
030: *
* We do NOT provide the JUnit archive junit.jar necessary to compile and run
* our tests, since we assume that you either have it already or would like
* to obtain the current release yourself.
034: * Please visit either:
035: * http://sourceforge.net/projects/junit
036: * or
037: * http://junit.org
038: * to obtain JUnit.
039: *
040: * Contact:
041: * email: heiko@susebox.de
042: */
043:
044: package de.susebox.jtopas;
045:
046: //-----------------------------------------------------------------------------
047: // Imports
048: //
049: import java.io.InputStream;
050: import java.io.FileInputStream;
051: import java.io.InputStreamReader;
052: import java.util.Vector;
053: import java.util.Properties;
054: import java.net.URL;
055:
056: import junit.framework.Test;
057: import junit.framework.TestCase;
058: import junit.framework.TestSuite;
059: import junit.framework.Assert;
060:
061: import de.susebox.java.lang.ExtRuntimeException;
062:
063: import de.susebox.TestUtilities;
064:
065: //-----------------------------------------------------------------------------
066: // Class TestStandardTokenizer
067: //
068:
069: /**<p>
070: * This test suite works with a test configuration file. This file contains some
071: * sets of properties, each set for one or more different test runs.
072: *</p><p>
073: * The properties are defined as class constants. In the configuration file, a
074: * property consists of the property name and a number identifying the property
075: * set.
076: *</p>
077: *
078: * @see Tokenizer
079: * @see AbstractTokenizer
080: * @see java.io.InputStreamReader
081: * @author Heiko Blau
082: */
083: public class TestStandardTokenizer extends TestCase {
084:
085: //---------------------------------------------------------------------------
086: // properties
087: //
088:
089: /**
090: * The name of the test configuration file. This file will be read by
091: * {@link java.lang.Class#getResourceAsStream}.
092: */
093: public static final String CONFIG_FILE = "TestStandardTokenizer.conf";
094:
095: /**
096: * Property for the tests {@link #testLinkParsing} and {@link #testContentsParsing}
097: */
098: public static final String PROP_PATH = "Path";
099:
100: /**
101: * Property for the test {@link #testLineCounting}.
102: */
103: public static final String PROP_COUNTLINES_PATH = "CountLinesPath";
104:
105: //---------------------------------------------------------------------------
106: // main method
107: //
108:
109: /**
110: * call this method to invoke the tests
111: */
112: public static void main(String[] args) {
113: String[] tests = { TestStandardTokenizer.class.getName() };
114:
115: TestUtilities.run(tests, args);
116: }
117:
118: //---------------------------------------------------------------------------
119: // suite method
120: //
121:
122: /**
123: * Implementation of the JUnit method <code>suite</code>. For each set of test
124: * properties one or more tests are instantiated.
125: *
126: * @return a test suite
127: */
128: public static Test suite() {
129: TestSuite suite = new TestSuite(TestStandardTokenizer.class
130: .getName());
131: Properties props = new Properties();
132: int count = 1;
133: String path;
134: URL url;
135:
136: try {
137: props.load(TestStandardTokenizer.class
138: .getResourceAsStream(CONFIG_FILE));
139: } catch (Exception ex) {
140: throw new ExtRuntimeException(ex);
141: }
142:
143: while ((path = props.getProperty(PROP_PATH + count)) != null) {
144: if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
145: path = url.getFile();
146: }
147: suite.addTest(new TestStandardTokenizer("testLinkParsing",
148: path));
149: suite.addTest(new TestStandardTokenizer(
150: "testContentsParsing", path));
151: suite.addTest(new TestStandardTokenizer(
152: "testContentsFormatting", path));
153: count++;
154: }
155: count = 1;
156: while ((path = props.getProperty(PROP_COUNTLINES_PATH + count)) != null) {
157: if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
158: path = url.getFile();
159: }
160: suite.addTest(new TestStandardTokenizer("testLineCounting",
161: path));
162: count++;
163: }
164: return suite;
165: }
166:
167: //---------------------------------------------------------------------------
168: // Constructor
169: //
170:
171: /**
172: * Default constructor. Standard input {@link java.lang.System#in} is used
173: * to construct the input stream reader.
174: */
175: public TestStandardTokenizer(String test, String path) {
176: super (test);
177: _path = path;
178: }
179:
180: //---------------------------------------------------------------------------
181: // Fixture setup and release
182: //
183:
184: /**
185: * Sets up the fixture, for example, open a network connection.
186: * This method is called before a test is executed.
187: */
188: protected void setUp() throws Exception {
189: InputStream stream = new FileInputStream(_path);
190:
191: _reader = new InputStreamReader(stream);
192: }
193:
194: /**
195: * Tears down the fixture, for example, close a network connection.
196: * This method is called after a test is executed.
197: */
198: protected void tearDown() throws Exception {
199: _reader.close();
200: }
201:
202: //---------------------------------------------------------------------------
203: // test cases
204: //
205:
206: public void testLinkParsing() throws Throwable {
207: long start = System.currentTimeMillis();
208: TokenizerProperties props = new StandardTokenizerProperties();
209: Tokenizer tokenizer = new StandardTokenizer(props);
210: Vector links = new Vector();
211: Token token;
212:
213: try {
214: props.setParseFlags(Flags.F_NO_CASE);
215: props.setSeparators("=");
216: props.addString("\"", "\"", "\\");
217: props.addBlockComment(">", "<"); // overread everything outside of tags
218: props.addBlockComment("SCRIPT", "/SCRIPT"); // overread script parts
219: props.addBlockComment("!--", "--"); // overread HTML comments without < and >
220: props.addKeyword("HREF");
221: tokenizer.setSource(new ReaderSource(_reader));
222:
223: System.out.println("\nStart looking for links in \""
224: + _path + "\"");
225: while (tokenizer.hasMoreToken()) {
226: token = tokenizer.nextToken();
227: if (token.getType() == Token.KEYWORD) {
228: tokenizer.nextToken(); // should be the '=' character
229: System.out.println(" " + tokenizer.nextImage());
230: assertTrue(tokenizer.currentImage() != null);
231: assertTrue(tokenizer.currentToken().getType() == Token.STRING);
232: }
233: }
234: } finally {
235: // Cleanup
236: tokenizer.close();
237: }
238:
239: long diff = System.currentTimeMillis() - start;
240: System.out.println("Finished after " + diff + " milliseconds");
241: }
242:
243: /**
244: * Extracting the pure contents of a HTML stream.
245: */
246: public void testContentsParsing() throws Throwable {
247: long start = System.currentTimeMillis();
248: TokenizerProperties props = new StandardTokenizerProperties();
249: Tokenizer tokenizer = new StandardTokenizer(props);
250:
251: try {
252: tokenizer.setSource(new ReaderSource(_reader));
253: System.out.println("\nStart extracting contents in \""
254: + _path + "\"");
255:
256: props.setParseFlags(Flags.F_NO_CASE
257: | Flags.F_TOKEN_POS_ONLY);
258: props.setWhitespaces(null);
259: props.setSeparators(null);
260: props.addBlockComment("<", ">"); // overread HTML tags
261: props.addBlockComment("<HEAD>", "</HEAD>"); // overread HTML header
262: props.addBlockComment("<!--", "-->"); // overread HTML comments
263:
264: while (tokenizer.hasMoreToken()) {
265: tokenizer.nextToken();
266: if (tokenizer.currentToken().getType() != Token.EOF) {
267: System.out.println(tokenizer.currentImage());
268: assertTrue("Method currentImage() returned null.",
269: tokenizer.currentImage() != null);
270: }
271: assertTrue(
272: "Found token type "
273: + tokenizer.currentToken().getType()
274: + ", expected NORMAL (" + Token.NORMAL
275: + ") or EOF (" + Token.EOF + ").",
276: tokenizer.currentToken().getType() == Token.NORMAL
277: || tokenizer.currentToken().getType() == Token.EOF);
278: }
279: } finally {
280: tokenizer.close();
281: }
282:
283: long diff = System.currentTimeMillis() - start;
284: System.out.println("Finished after " + diff + " milliseconds");
285: }
286:
287: /**
288: * Testing the line and column counting correctness. This is done by using a
289: * specially formatted file. At a line x and a column y, the method expects
290: * the token "x/y", e.g. "0/0" at the very beginning of the file.
291: */
292: public void testLineCounting() throws Throwable {
293: long start = System.currentTimeMillis();
294: TokenizerProperties props = new StandardTokenizerProperties();
295: Tokenizer tokenizer = new StandardTokenizer(props);
296: Token token;
297: String image;
298: int delPos;
299: int line;
300: int col;
301:
302: System.out.println("\nStart counting lines in \"" + _path
303: + "\"");
304:
305: try {
306: tokenizer.setSource(new ReaderSource(_reader));
307: props.setParseFlags(Flags.F_TOKEN_POS_ONLY
308: | Flags.F_COUNT_LINES);
309: props
310: .setWhitespaces(TokenizerProperties.DEFAULT_WHITESPACES);
311: props.setSeparators(TokenizerProperties.DEFAULT_SEPARATORS);
312: props
313: .addLineComment(TokenizerProperties.DEFAULT_LINE_COMMENT);
314: props.addBlockComment(
315: TokenizerProperties.DEFAULT_BLOCK_COMMENT_START,
316: TokenizerProperties.DEFAULT_BLOCK_COMMENT_END);
317:
318: while (tokenizer.hasMoreToken()) {
319: token = tokenizer.nextToken();
320: switch (token.getType()) {
321: case Token.NORMAL:
322: image = tokenizer.currentImage();
323: line = Integer.parseInt(image);
324: assertTrue("Missing separator \"/\".", tokenizer
325: .nextToken().getType() == Token.SEPARATOR
326: && tokenizer.currentImage().equals("/"));
327: assertTrue("Missing column number", tokenizer
328: .nextToken().getType() == Token.NORMAL);
329: image = tokenizer.currentImage();
330: col = Integer.parseInt(image);
331: assertTrue("Found line number "
332: + token.getStartLine()
333: + " does not match expected line number "
334: + line, line == token.getStartLine());
335: assertTrue("Found column number "
336: + token.getStartColumn()
337: + " does not match expected column number "
338: + col, col == token.getStartColumn());
339: assertTrue("Found token length "
340: + tokenizer.currentToken().getLength()
341: + " does not match expected length "
342: + image.length(),
343: image.length() == tokenizer.currentToken()
344: .getLength());
345: break;
346: }
347: }
348: } finally {
349: tokenizer.close();
350: }
351:
352: long diff = System.currentTimeMillis() - start;
353: System.out.println("Finished after " + diff + " milliseconds");
354: }
355:
356: /**
357: * Advanced contents extracting. Lines will be around 80 characters, a basic
358: * paragraph recognition takes place.
359: */
360: public void testContentsFormatting() throws Throwable {
361: long start = System.currentTimeMillis();
362: TokenizerProperties props = new StandardTokenizerProperties();
363: Tokenizer tokenizer = new StandardTokenizer(props);
364: Token token;
365: String image;
366: int len;
367: Object startPRE = new Object();
368: Object endPRE = new Object();
369: int inPRE = 0;
370:
371: // Counter for expected parts
372: int wsCount = 0;
373: int normalCount = 0;
374: int specCount = 0;
375: int commentCount = 0;
376:
377: System.out.println("\nStart formatting contents in \"" + _path
378: + "\"");
379:
380: try {
381: tokenizer.setSource(new ReaderSource(_reader));
382: props.setParseFlags(Flags.F_NO_CASE
383: | Flags.F_TOKEN_POS_ONLY
384: | Flags.F_RETURN_WHITESPACES);
385: props.setSeparators(null);
386: props.addBlockComment("<", ">");
387: props.addBlockComment("<HEAD>", "</HEAD>");
388: props.addBlockComment("<!--", "-->");
389: props.addSpecialSequence("<b>", "");
390: props.addSpecialSequence("</b>", "");
391: props.addSpecialSequence("<i>", "");
392: props.addSpecialSequence("</i>", "");
393: props.addSpecialSequence("<code>", "");
394: props.addSpecialSequence("</code>", "");
395: props.addSpecialSequence("<pre>", startPRE);
396: props.addSpecialSequence("</pre>", endPRE);
397: props.addSpecialSequence("ä", "\u00E4", 0,
398: Flags.F_NO_CASE);
399: props.addSpecialSequence("ö", "\u00F6", 0,
400: Flags.F_NO_CASE);
401: props.addSpecialSequence("ü", "\u00FC", 0,
402: Flags.F_NO_CASE);
403: props.addSpecialSequence("ß", "\u00DF", 0,
404: Flags.F_NO_CASE);
405: props.addSpecialSequence("Ä", "\u00C4", 0,
406: Flags.F_NO_CASE);
407: props.addSpecialSequence("Ö", "\u00D6", 0,
408: Flags.F_NO_CASE);
409: props.addSpecialSequence("Ü", "\u00DC", 0,
410: Flags.F_NO_CASE);
411: props.addSpecialSequence(" ", " ", 0, Flags.F_NO_CASE);
412: props.addSpecialSequence(">", ">", 0, Flags.F_NO_CASE);
413: props.addSpecialSequence("<", "<", 0, Flags.F_NO_CASE);
414: props.addSpecialSequence("©", "\u00A9");
415: props.addSpecialSequence("€", "\u20AC");
416:
417: len = 0;
418: while (tokenizer.hasMoreToken()) {
419: token = tokenizer.nextToken();
420: switch (token.getType()) {
421: case Token.NORMAL:
422: image = tokenizer.currentImage();
423: assertTrue("Found HTML tag in normal token: "
424: + image, image.indexOf('<') < 0);
425: System.out.print(image);
426: if (inPRE <= 0) {
427: len += token.getLength();
428: }
429: normalCount++;
430: break;
431:
432: case Token.SPECIAL_SEQUENCE:
433: image = tokenizer.currentImage();
434: assertTrue(
435: "Couldn't find special sequence in properties: "
436: + image, props
437: .specialSequenceExists(image));
438: if (token.getCompanion() == startPRE) {
439: System.out.println();
440: len = 0;
441: inPRE++;
442: } else if (token.getCompanion() == endPRE) {
443: System.out.println();
444: len = 0;
445: inPRE--;
446: } else {
447: System.out.print((String) token.getCompanion());
448: }
449: specCount++;
450: break;
451:
452: case Token.BLOCK_COMMENT:
453: if (len > 0) {
454: System.out.println();
455: len = 0;
456: }
457: commentCount++;
458: break;
459:
460: case Token.WHITESPACE:
461: if (inPRE > 0) {
462: System.out.print(tokenizer.currentImage());
463: } else if (len > 75) {
464: System.out.println();
465: len = 0;
466: } else if (len > 0) {
467: System.out.print(' ');
468: len++;
469: }
470: wsCount++;
471: break;
472: }
473: }
474:
475: // Where should have been something of each categorie
476: assertTrue("Not one simple context part was found in file "
477: + _path + ".", normalCount > 0);
478: assertTrue("No HTML tag found " + _path + ".",
479: commentCount > 0);
480: assertTrue("No whitespaces found " + _path + ".",
481: wsCount > 0);
482:
483: } finally {
484: // cleanup
485: tokenizer.close();
486: }
487:
488: // Ready
489: long diff = System.currentTimeMillis() - start;
490: System.out.println("Finished after " + diff + " milliseconds");
491: }
492:
493: //---------------------------------------------------------------------------
494: // Members
495: //
496: protected InputStreamReader _reader = null;
497: protected String _path = null;
498: }
|