001: /*
002: * TestStandardTokenizer.java: JUnit test for the StandardTokenizer
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the JTopas test suite.
007: * The JTopas test suite is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your option)
010: * any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with the JTopas test suite. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
028: * You should have received a copy of their JUnit licence agreement along with
029: * the JTopas test suite.
030: *
* We do NOT provide the JUnit archive junit.jar necessary to compile and run
* our tests, since we assume that you either have it already or would like
* to obtain the current release yourself.
034: * Please visit either:
035: * http://sourceforge.net/projects/junit
036: * or
037: * http://junit.org
038: * to obtain JUnit.
039: *
040: * Contact:
041: * email: heiko@susebox.de
042: */
043:
044: package de.susebox.jtopas;
045:
046: //-----------------------------------------------------------------------------
047: // Imports
048: //
049: import java.io.InputStream;
050: import java.io.FileInputStream;
051: import java.io.InputStreamReader;
052: import java.util.Vector;
053: import java.util.Properties;
054: import java.net.URL;
055:
056: import junit.framework.Test;
057: import junit.framework.TestCase;
058: import junit.framework.TestSuite;
059: import junit.framework.Assert;
060:
061: import de.susebox.java.lang.ExtRuntimeException;
062:
063: import de.susebox.TestUtilities;
064:
065: //-----------------------------------------------------------------------------
066: // Class TestStandardTokenizer
067: //
068:
069: /**<p>
070: * This test suite works with a test configuration file. This file contains some
071: * sets of properties, each set for one or more different test runs.
072: *</p><p>
073: * The properties are defined as class constants. In the configuration file, a
074: * property consists of the property name and a number identifying the property
075: * set.
076: *</p>
077: *
078: * @see Tokenizer
079: * @see AbstractTokenizer
080: * @see java.io.InputStreamReader
081: * @author Heiko Blau
082: */
083: public class TestStandardTokenizer extends TestCase {
084:
085: //---------------------------------------------------------------------------
086: // properties
087: //
088:
089: /**
090: * The name of the test configuration file. This file will be read by
091: * {@link java.lang.Class#getResourceAsStream}.
092: */
093: public static final String CONFIG_FILE = "TestStandardTokenizer.conf";
094:
095: /**
096: * Property for the tests {@link #testLinkParsing} and {@link #testContentsParsing}
097: */
098: public static final String PROP_PATH = "Path";
099:
100: /**
101: * Property for the test {@link #testLineCounting}.
102: */
103: public static final String PROP_COUNTLINES_PATH = "CountLinesPath";
104:
105: //---------------------------------------------------------------------------
106: // main method
107: //
108:
109: /**
110: * call this method to invoke the tests
111: */
112: public static void main(String[] args) {
113: String[] tests = { TestStandardTokenizer.class.getName() };
114:
115: TestUtilities.run(tests, args);
116: }
117:
118: //---------------------------------------------------------------------------
119: // suite method
120: //
121:
122: /**
123: * Implementation of the JUnit method <code>suite</code>. For each set of test
124: * properties one or more tests are instantiated.
125: *
126: * @return a test suite
127: */
128: public static Test suite() {
129: TestSuite suite = new TestSuite(TestStandardTokenizer.class
130: .getName());
131: Properties props = new Properties();
132: int count = 1;
133: String path;
134: URL url;
135:
136: try {
137: props.load(TestStandardTokenizer.class
138: .getResourceAsStream(CONFIG_FILE));
139: } catch (Exception ex) {
140: throw new ExtRuntimeException(ex);
141: }
142:
143: while ((path = props.getProperty(PROP_PATH + count)) != null) {
144: if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
145: path = url.getFile();
146: }
147: suite.addTest(new TestStandardTokenizer("testLinkParsing",
148: path));
149: suite.addTest(new TestStandardTokenizer(
150: "testContentsParsing", path));
151: suite.addTest(new TestStandardTokenizer(
152: "testContentsFormatting", path));
153: count++;
154: }
155: count = 1;
156: while ((path = props.getProperty(PROP_COUNTLINES_PATH + count)) != null) {
157: if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
158: path = url.getFile();
159: }
160: suite.addTest(new TestStandardTokenizer("testLineCounting",
161: path));
162: count++;
163: }
164: return suite;
165: }
166:
167: //---------------------------------------------------------------------------
168: // Constructor
169: //
170:
171: /**
172: * Default constructor. Standard input {@link java.lang.System#in} is used
173: * to construct the input stream reader.
174: */
175: public TestStandardTokenizer(String test, String path) {
176: super (test);
177: _path = path;
178: }
179:
180: //---------------------------------------------------------------------------
181: // Fixture setup and release
182: //
183:
184: /**
185: * Sets up the fixture, for example, open a network connection.
186: * This method is called before a test is executed.
187: */
188: protected void setUp() throws Exception {
189: InputStream stream = new FileInputStream(_path);
190:
191: _reader = new InputStreamReader(stream);
192: }
193:
194: /**
195: * Tears down the fixture, for example, close a network connection.
196: * This method is called after a test is executed.
197: */
198: protected void tearDown() throws Exception {
199: _reader.close();
200: }
201:
202: //---------------------------------------------------------------------------
203: // test cases
204: //
205:
206: public void testLinkParsing() throws Throwable {
207: long start = System.currentTimeMillis();
208: TokenizerProperties props = new StandardTokenizerProperties();
209: Tokenizer tokenizer = new StandardTokenizer(props);
210: Vector links = new Vector();
211: Token token;
212:
213: try {
214: props.setParseFlags(Flags.F_NO_CASE);
215: props.setSeparators("=");
216: props.addString("\"", "\"", "\\");
217: props.addBlockComment(">", "<"); // overread everything outside of tags
218: props.addBlockComment("SCRIPT", "/SCRIPT"); // overread script parts
219: props.addBlockComment("!--", "--"); // overread HTML comments without < and >
220: props.addKeyword("HREF");
221: tokenizer.setSource(new ReaderSource(_reader));
222:
223: System.out.println("\nStart looking for links in \""
224: + _path + "\"");
225: while (tokenizer.hasMoreToken()) {
226: token = tokenizer.nextToken();
227: if (token.getType() == Token.KEYWORD) {
228: tokenizer.nextToken(); // should be the '=' character
229: System.out.println(" " + tokenizer.nextImage());
230: assertTrue(tokenizer.currentImage() != null);
231: assertTrue(tokenizer.currentToken().getType() == Token.STRING);
232: }
233: }
234: } finally {
235: // Cleanup
236: tokenizer.close();
237: }
238:
239: long diff = System.currentTimeMillis() - start;
240: System.out.println("Finished after " + diff + " milliseconds");
241: }
242:
243: /**
244: * Extracting the pure contents of a HTML stream.
245: */
246: public void testContentsParsing() throws Throwable {
247: long start = System.currentTimeMillis();
248: TokenizerProperties props = new StandardTokenizerProperties();
249: Tokenizer tokenizer = new StandardTokenizer(props);
250:
251: try {
252: tokenizer.setSource(new ReaderSource(_reader));
253: System.out.println("\nStart extracting contents in \""
254: + _path + "\"");
255:
256: props.setParseFlags(Flags.F_NO_CASE
257: | Flags.F_TOKEN_POS_ONLY);
258: props.setWhitespaces(null);
259: props.setSeparators(null);
260: props.addBlockComment("<", ">"); // overread HTML tags
261: props.addBlockComment("<HEAD>", "</HEAD>"); // overread HTML header
262: props.addBlockComment("<!--", "-->"); // overread HTML comments
263:
264: while (tokenizer.hasMoreToken()) {
265: tokenizer.nextToken();
266: if (tokenizer.currentToken().getType() != Token.EOF) {
267: System.out.println(tokenizer.currentImage());
268: assertTrue("Method currentImage() returned null.",
269: tokenizer.currentImage() != null);
270: }
271: assertTrue(
272: "Found token type "
273: + tokenizer.currentToken().getType()
274: + ", expected NORMAL (" + Token.NORMAL
275: + ") or EOF (" + Token.EOF + ").",
276: tokenizer.currentToken().getType() == Token.NORMAL
277: || tokenizer.currentToken().getType() == Token.EOF);
278: }
279: } finally {
280: tokenizer.close();
281: }
282:
283: long diff = System.currentTimeMillis() - start;
284: System.out.println("Finished after " + diff + " milliseconds");
285: }
286:
287: /**
288: * Testing the line and column counting correctness. This is done by using a
289: * specially formatted file. At a line x and a column y, the method expects
290: * the token "x/y", e.g. "0/0" at the very beginning of the file.
291: */
292: public void testLineCounting() throws Throwable {
293: long start = System.currentTimeMillis();
294: TokenizerProperties props = new StandardTokenizerProperties();
295: Tokenizer tokenizer = new StandardTokenizer(props);
296: Token token;
297: String image;
298: int delPos;
299: int line;
300: int col;
301:
302: System.out.println("\nStart counting lines in \"" + _path
303: + "\"");
304:
305: try {
306: tokenizer.setSource(new ReaderSource(_reader));
307: props.setParseFlags(Flags.F_TOKEN_POS_ONLY
308: | Flags.F_COUNT_LINES);
309: props
310: .setWhitespaces(TokenizerProperties.DEFAULT_WHITESPACES);
311: props.setSeparators(TokenizerProperties.DEFAULT_SEPARATORS);
312: props
313: .addLineComment(TokenizerProperties.DEFAULT_LINE_COMMENT);
314: props.addBlockComment(
315: TokenizerProperties.DEFAULT_BLOCK_COMMENT_START,
316: TokenizerProperties.DEFAULT_BLOCK_COMMENT_END);
317:
318: while (tokenizer.hasMoreToken()) {
319: token = tokenizer.nextToken();
320: switch (token.getType()) {
321: case Token.NORMAL:
322: image = tokenizer.currentImage();
323: line = Integer.parseInt(image);
324: assertTrue("Missing separator \"/\".", tokenizer
325: .nextToken().getType() == Token.SEPARATOR
326: && tokenizer.currentImage().equals("/"));
327: assertTrue("Missing column number", tokenizer
328: .nextToken().getType() == Token.NORMAL);
329: image = tokenizer.currentImage();
330: col = Integer.parseInt(image);
331: assertTrue("Found line number "
332: + token.getStartLine()
333: + " does not match expected line number "
334: + line, line == token.getStartLine());
335: assertTrue("Found column number "
336: + token.getStartColumn()
337: + " does not match expected column number "
338: + col, col == token.getStartColumn());
339: assertTrue("Found token length "
340: + tokenizer.currentToken().getLength()
341: + " does not match expected length "
342: + image.length(),
343: image.length() == tokenizer.currentToken()
344: .getLength());
345: break;
346: }
347: }
348: } finally {
349: tokenizer.close();
350: }
351:
352: long diff = System.currentTimeMillis() - start;
353: System.out.println("Finished after " + diff + " milliseconds");
354: }
355:
356: /**
357: * Advanced contents extracting. Lines will be around 80 characters, a basic
358: * paragraph recognition takes place.
359: */
360: public void testContentsFormatting() throws Throwable {
361: long start = System.currentTimeMillis();
362: TokenizerProperties props = new StandardTokenizerProperties();
363: Tokenizer tokenizer = new StandardTokenizer(props);
364: Token token;
365: String image;
366: int len;
367: Object startPRE = new Object();
368: Object endPRE = new Object();
369: int inPRE = 0;
370:
371: // Counter for expected parts
372: int wsCount = 0;
373: int normalCount = 0;
374: int specCount = 0;
375: int commentCount = 0;
376:
377: System.out.println("\nStart formatting contents in \"" + _path
378: + "\"");
379:
380: try {
381: tokenizer.setSource(new ReaderSource(_reader));
382: props.setParseFlags(Flags.F_NO_CASE
383: | Flags.F_TOKEN_POS_ONLY
384: | Flags.F_RETURN_WHITESPACES);
385: props.setSeparators(null);
386: props.addBlockComment("<", ">");
387: props.addBlockComment("<HEAD>", "</HEAD>");
388: props.addBlockComment("<!--", "-->");
389: props.addSpecialSequence("<b>", "");
390: props.addSpecialSequence("</b>", "");
391: props.addSpecialSequence("<i>", "");
392: props.addSpecialSequence("</i>", "");
393: props.addSpecialSequence("<code>", "");
394: props.addSpecialSequence("</code>", "");
395: props.addSpecialSequence("<pre>", startPRE);
396: props.addSpecialSequence("</pre>", endPRE);
397: props.addSpecialSequence("ä", "\u00E4", 0,
398: Flags.F_NO_CASE);
399: props.addSpecialSequence("ö", "\u00F6", 0,
400: Flags.F_NO_CASE);
401: props.addSpecialSequence("ü", "\u00FC", 0,
402: Flags.F_NO_CASE);
403: props.addSpecialSequence("ß", "\u00DF", 0,
404: Flags.F_NO_CASE);
405: props.addSpecialSequence("Ä", "\u00C4", 0,
406: Flags.F_NO_CASE);
407: props.addSpecialSequence("Ö", "\u00D6", 0,
408: Flags.F_NO_CASE);
409: props.addSpecialSequence("Ü", "\u00DC", 0,
410: Flags.F_NO_CASE);
411: props.addSpecialSequence(" ", " ", 0, Flags.F_NO_CASE);
412: props.addSpecialSequence(">", ">", 0, Flags.F_NO_CASE);
413: props.addSpecialSequence("<", "<", 0, Flags.F_NO_CASE);
414: props.addSpecialSequence("©", "\u00A9");
415: props.addSpecialSequence("€", "\u20AC");
416:
417: len = 0;
418: while (tokenizer.hasMoreToken()) {
419: token = tokenizer.nextToken();
420: switch (token.getType()) {
421: case Token.NORMAL:
422: image = tokenizer.currentImage();
423: assertTrue("Found HTML tag in normal token: "
424: + image, image.indexOf('<') < 0);
425: System.out.print(image);
426: if (inPRE <= 0) {
427: len += token.getLength();
428: }
429: normalCount++;
430: break;
431:
432: case Token.SPECIAL_SEQUENCE:
433: image = tokenizer.currentImage();
434: assertTrue(
435: "Couldn't find special sequence in properties: "
436: + image, props
437: .specialSequenceExists(image));
438: if (token.getCompanion() == startPRE) {
439: System.out.println();
440: len = 0;
441: inPRE++;
442: } else if (token.getCompanion() == endPRE) {
443: System.out.println();
444: len = 0;
445: inPRE--;
446: } else {
447: System.out.print((String) token.getCompanion());
448: }
449: specCount++;
450: break;
451:
452: case Token.BLOCK_COMMENT:
453: if (len > 0) {
454: System.out.println();
455: len = 0;
456: }
457: commentCount++;
458: break;
459:
460: case Token.WHITESPACE:
461: if (inPRE > 0) {
462: System.out.print(tokenizer.currentImage());
463: } else if (len > 75) {
464: System.out.println();
465: len = 0;
466: } else if (len > 0) {
467: System.out.print(' ');
468: len++;
469: }
470: wsCount++;
471: break;
472: }
473: }
474:
475: // Where should have been something of each categorie
476: assertTrue("Not one simple context part was found in file "
477: + _path + ".", normalCount > 0);
478: assertTrue("No HTML tag found " + _path + ".",
479: commentCount > 0);
480: assertTrue("No whitespaces found " + _path + ".",
481: wsCount > 0);
482:
483: } finally {
484: // cleanup
485: tokenizer.close();
486: }
487:
488: // Ready
489: long diff = System.currentTimeMillis() - start;
490: System.out.println("Finished after " + diff + " milliseconds");
491: }
492:
493: //---------------------------------------------------------------------------
494: // Members
495: //
496: protected InputStreamReader _reader = null;
497: protected String _path = null;
498: }
|