0001: /*
0002: * AbstractTokenizer.java: base class for Tokenizer implementations.
0003: *
0004: * Copyright (C) 2004 Heiko Blau
0005: *
0006: * This file belongs to the JTopas Library.
0007: * JTopas is free software; you can redistribute it and/or modify it
0008: * under the terms of the GNU Lesser General Public License as published by the
0009: * Free Software Foundation; either version 2.1 of the License, or (at your
0010: * option) any later version.
0011: *
0012: * This software is distributed in the hope that it will be useful, but WITHOUT
0013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0014: * FITNESS FOR A PARTICULAR PURPOSE.
0015: * See the GNU Lesser General Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser General Public License along
0018: * with JTopas. If not, write to the
0019: *
0020: * Free Software Foundation, Inc.
0021: * 59 Temple Place, Suite 330,
0022: * Boston, MA 02111-1307
0023: * USA
0024: *
0025: * or check the Internet: http://www.fsf.org
0026: *
0027: * Contact:
0028: * email: heiko@susebox.de
0029: */
0030:
0031: package de.susebox.jtopas;
0032:
0033: //-----------------------------------------------------------------------------
0034: // Imports
0035: //
0036: import java.io.Reader;
0037: import java.util.SortedMap;
0038: import java.util.TreeMap;
0039: import java.util.LinkedList;
0040: import java.util.Arrays;
0041:
0042: import de.susebox.java.lang.ExtIndexOutOfBoundsException;
0043:
0044: import de.susebox.jtopas.spi.WhitespaceHandler;
0045: import de.susebox.jtopas.spi.KeywordHandler;
0046: import de.susebox.jtopas.spi.PatternHandler;
0047: import de.susebox.jtopas.spi.SeparatorHandler;
0048: import de.susebox.jtopas.spi.SequenceHandler;
0049:
0050: import de.susebox.jtopas.spi.StandardWhitespaceHandler;
0051: import de.susebox.jtopas.spi.StandardKeywordHandler;
0052: import de.susebox.jtopas.spi.StandardSeparatorHandler;
0053: import de.susebox.jtopas.spi.StandardSequenceHandler;
0054:
0055: import de.susebox.jtopas.spi.DataProvider;
0056: import de.susebox.jtopas.spi.DataMapper;
0057:
0058: //-----------------------------------------------------------------------------
0059: // Class AbstractTokenizer
0060: //
0061:
0062: /**<p>
0063: * Base class for {@link Tokenizer} implementations. <code>AbstractTokenizer</code>
0064: * separates the data analysis from the actual data provision. Although the class
0065: * maintains read and write positions the physical representation of the logical
0066: * character buffer behind these positions concerns only the subclasses.
0067: *</p>
0068: *
0069: * @see Tokenizer
0070: * @see TokenizerProperties
0071: * @author Heiko Blau
0072: */
0073: public abstract class AbstractTokenizer implements Tokenizer,
0074: TokenizerPropertyListener {
0075:
0076: //---------------------------------------------------------------------------
0077: // Abstract methods
0078: //
0079:
  /**
   * Subclasses have to provide {@link de.susebox.jtopas.spi.DataProvider}
   * instances for various token type handlers. The given start position is the
   * absolute number of characters from the beginning of the data source.
   *<br>
   * Implementations are expected to map the logical character range
   * [startPos, startPos + length) onto their physical buffer.
   *
   * @param startPos position in the input data (absolute character offset)
   * @param length number of characters
   * @return the <code>DataProvider</code> for the given data range
   */
  protected abstract DataProvider getDataProvider(int startPos,
      int length);
0091:
  /**
   * This method is called when the tokenizer runs out of data. Its main purpose
   * is to call the {@link TokenizerSource#read} method. It is also responsible
   * for handling the flag {@link TokenizerProperties#F_KEEP_DATA}.
   *
   * @return number of read bytes or -1 if an end-of-file condition occured
   * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
   * method
   */
  protected abstract int readMoreData() throws TokenizerException;
0102:
0103: //---------------------------------------------------------------------------
0104: // Constructors
0105: //
0106:
0107: /**
0108: * Default constructor that sets the tokenizer control flags as it would be
0109: * approbriate for C/C++ and Java. Found token images are copied. No line nor
0110: * column informations are provided. Nested comments are not allowed.
0111: *<br>
0112: * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
0113: * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
0114: * separator handling.
0115: */
0116: public AbstractTokenizer() {
0117: _baseTokenizer = this ;
0118: if (_defaultProperties == null) {
0119: _defaultProperties = new StandardTokenizerProperties();
0120: }
0121: setTokenizerProperties(_defaultProperties);
0122: }
0123:
  /**
   * Constructing an <code>AbstractTokenizer</code> with a backing {@link TokenizerProperties}
   * instance.
   *
   * @param properties a {@link TokenizerProperties} object containing the
   * settings for the tokenizing process
   */
  public AbstractTokenizer(TokenizerProperties properties) {
    _baseTokenizer = this ;
    setTokenizerProperties(properties);
  }
0135:
0136: //---------------------------------------------------------------------------
0137: // data source
0138: //
0139:
0140: /**
0141: * Setting the source of data. This method is usually called during setup of
0142: * the <code>Tokenizer</code> but may also be invoked while the tokenizing
0143: * is in progress. It will reset the tokenizers input buffer, line and column
0144: * counters etc.
0145: *<br>
0146: * Subclasses should override this method to do their own actions on a data source
0147: * change. Generally, this base method should be called first in the subclass
0148: * implementation of <code>setSource</code> (equivalent to super calls in
0149: * constructors of derived classes).
0150: *
0151: * @param source a {@link TokenizerSource} to read data from
0152: * @see #getSource
0153: */
0154: public void setSource(TokenizerSource source) {
0155: _source = source;
0156: _eofReached = false;
0157: _currentReadPos = 0;
0158: _currentWritePos = 0;
0159: if (isFlagSet(Flags.F_COUNT_LINES)) {
0160: _lineNumber = 0;
0161: _columnNumber = 0;
0162: } else {
0163: _lineNumber = -1;
0164: _columnNumber = -1;
0165: }
0166: Arrays.fill(_scannedToken, null);
0167: }
0168:
  /**
   * Convenience method to avoid the construction of a {@link TokenizerSource}
   * from the most important data source {@link java.io.Reader}. The reader is
   * wrapped in a {@link ReaderSource}.
   *
   * @param reader the {@link java.io.Reader} to get data from
   */
  public void setSource(Reader reader) {
    setSource(new ReaderSource(reader));
  }
0178:
  /**
   * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
   * method may return <code>null</code> if there is no <code>TokenizerSource</code>
   * associated with it.
   *
   * @return the {@link TokenizerSource} associated with this <code>Tokenizer</code>
   * @see #setSource
   */
  public TokenizerSource getSource() {
    return _source;
  }
0190:
0191: //---------------------------------------------------------------------------
0192: // Methods of the Tokenizer interface
0193: //
0194:
0195: /**
0196: * Setting the tokenizer characteristics. See the method description in
0197: * {@link Tokenizer}.
0198: *
0199: * @param props the {@link TokenizerProperties} for this tokenizer
0200: * @throws NullPointerException if the <code>null</code> is passed to the call
0201: * @see #getTokenizerProperties
0202: */
0203: public void setTokenizerProperties(TokenizerProperties props)
0204: throws NullPointerException {
0205: if (props == null) {
0206: throw new NullPointerException();
0207: }
0208:
0209: // set properties
0210: if (_properties != null) {
0211: _properties.removeTokenizerPropertyListener(this );
0212: }
0213: _properties = props;
0214: _properties.addTokenizerPropertyListener(this );
0215:
0216: // who is going to handle the various token types ?
0217: if (_properties instanceof WhitespaceHandler) {
0218: setWhitespaceHandler((WhitespaceHandler) _properties);
0219: } else {
0220: setWhitespaceHandler(new StandardWhitespaceHandler(
0221: _properties));
0222: }
0223: if (_properties instanceof SeparatorHandler) {
0224: setSeparatorHandler((SeparatorHandler) _properties);
0225: } else {
0226: setSeparatorHandler(new StandardSeparatorHandler(
0227: _properties));
0228: }
0229: if (_properties instanceof SequenceHandler) {
0230: setSequenceHandler((SequenceHandler) _properties);
0231: } else {
0232: setSequenceHandler(new StandardSequenceHandler(_properties));
0233: }
0234: if (props instanceof KeywordHandler) {
0235: setKeywordHandler((KeywordHandler) props);
0236: } else {
0237: setKeywordHandler(new StandardKeywordHandler(_properties));
0238: }
0239: if (_properties instanceof PatternHandler) {
0240: setPatternHandler((PatternHandler) _properties);
0241: } else {
0242: setPatternHandler(null);
0243: }
0244:
0245: // flag handling
0246: int newFlags = _properties.getParseFlags();
0247:
0248: if (newFlags != _flags) {
0249: propertyChanged(new TokenizerPropertyEvent(
0250: TokenizerPropertyEvent.PROPERTY_MODIFIED,
0251: new TokenizerProperty(
0252: TokenizerProperty.PARSE_FLAG_MASK,
0253: new String[] { Integer
0254: .toBinaryString(newFlags) }),
0255: new TokenizerProperty(
0256: TokenizerProperty.PARSE_FLAG_MASK,
0257: new String[] { Integer
0258: .toBinaryString(_flags) })));
0259: }
0260: }
0261:
  /**
   * Retrieving the current tokenizer characteristics. See the method description
   * in {@link Tokenizer}.
   *
   * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
   * @see #setTokenizerProperties
   */
  public TokenizerProperties getTokenizerProperties() {
    return _properties;
  }
0272:
0273: /**
0274: * Setting the control flags of the <code>Tokenizer</code>. See the method
0275: * description in {@link Tokenizer}.
0276: *
0277: * @param flags the parser control flags
0278: * @param mask the mask for the flags to set or unset
0279: * @throws TokenizerException if one or more of the flags given cannot be honored
0280: * @see #getParseFlags
0281: */
0282: public void changeParseFlags(int flags, int mask)
0283: throws TokenizerException {
0284: // test the given flags
0285: if ((mask | VALID_FLAGS_MASK) != VALID_FLAGS_MASK) {
0286: throw new TokenizerException(
0287: "One or more flags cannot be set separately for a {0}. Violating flags in {1}: {2}.",
0288: new Object[] {
0289: AbstractTokenizer.class.getName(),
0290: Integer.toHexString(flags),
0291: Integer.toHexString(mask
0292: & ~VALID_FLAGS_MASK) });
0293: }
0294:
0295: // set the new flags for this tokenizer
0296: _flagMask = mask;
0297: _flags = (flags & mask)
0298: | (getTokenizerProperties().getParseFlags() & ~mask);
0299:
0300: // when counting lines initialize the current line and column position
0301: if (!isFlagSet(Flags.F_COUNT_LINES)) {
0302: _lineNumber = 0;
0303: _columnNumber = 0;
0304: }
0305: }
0306:
0307: /**
0308: * Retrieving the parser control flags. See the method description in
0309: * {@link Tokenizer}.
0310: *
0311: * @return the current parser control flags
0312: * @see #changeParseFlags
0313: */
0314: public int getParseFlags() {
0315: return (getTokenizerProperties().getParseFlags() & ~_flagMask)
0316: + (_flags & _flagMask);
0317: }
0318:
0319: /**
0320: * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
0321: * previously installed one. See the method description in {@link Tokenizer}.
0322: *
0323: * @param handler the (new) {@link KeywordHandler} to use or <code>null</code>
0324: * to remove it
0325: */
0326: public void setKeywordHandler(
0327: de.susebox.jtopas.spi.KeywordHandler handler) {
0328: synchronized (this ) {
0329: if (handler == _properties) {
0330: if (_properties != null
0331: && _properties.getKeywords().hasNext()) {
0332: _keywordHandler = handler;
0333: } else {
0334: _keywordHandler = null;
0335: }
0336: _internalFlags &= ~IFLAG_EXTERNAL_KEYWORD_HANDLER;
0337: } else {
0338: _keywordHandler = handler;
0339: _internalFlags |= IFLAG_EXTERNAL_KEYWORD_HANDLER;
0340: }
0341: }
0342: }
0343:
0344: /**
0345: * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. See the
0346: * method description in {@link Tokenizer}.
0347: *
0348: * @return the currently active whitespace keyword or <code>null</code>, if
0349: * keyword support is switched off
0350: */
0351: public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler() {
0352: synchronized (this ) {
0353: if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0) {
0354: return (de.susebox.jtopas.spi.KeywordHandler) getTokenizerProperties();
0355: } else {
0356: return _keywordHandler;
0357: }
0358: }
0359: }
0360:
  /**
   * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   * Unlike the keyword/sequence/pattern handler setters, this one performs no
   * synchronization and no special handling of the backing properties object.
   *
   * @param handler the (new) whitespace handler to use or <code>null</code> to
   * switch off whitespace handling
   * @see #getWhitespaceHandler
   */
  public void setWhitespaceHandler(
      de.susebox.jtopas.spi.WhitespaceHandler handler) {
    _whitespaceHandler = handler;
  }
0373:
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return the currently active whitespace handler or null, if the base
   * implementation is working
   */
  public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler() {
    return _whitespaceHandler;
  }
0384:
  /**
   * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
   * previously installed <code>SeparatorHandler</code>. See the method description
   * in {@link Tokenizer}.
   *
   * @param handler the (new) separator handler to use or <code>null</code> to
   * remove it
   * @see #getSeparatorHandler
   */
  public void setSeparatorHandler(
      de.susebox.jtopas.spi.SeparatorHandler handler) {
    _separatorHandler = handler;
  }
0398:
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return the currently active {@link SeparatorHandler} or <code>null</code>,
   * if separators aren't recognized by the tokenizer
   * @see #setSeparatorHandler
   */
  public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler() {
    return _separatorHandler;
  }
0410:
0411: /**
0412: * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
0413: * previously installed one. See the method description in {@link Tokenizer}.
0414: *
0415: * @param handler the (new) {@link SequenceHandler} to use or null to remove it
0416: */
0417: public void setSequenceHandler(
0418: de.susebox.jtopas.spi.SequenceHandler handler) {
0419: synchronized (this ) {
0420: if (handler == _properties) {
0421: if (_properties != null
0422: && (_properties.getSpecialSequences().hasNext()
0423: || _properties.getStrings().hasNext()
0424: || _properties.getBlockComments()
0425: .hasNext() || _properties
0426: .getLineComments().hasNext())) {
0427: _sequenceHandler = handler;
0428: } else {
0429: _sequenceHandler = null;
0430: }
0431: _internalFlags &= ~IFLAG_EXTERNAL_SEQUENCE_HANDLER;
0432: } else {
0433: _sequenceHandler = handler;
0434: _internalFlags |= IFLAG_EXTERNAL_SEQUENCE_HANDLER;
0435: }
0436: }
0437: }
0438:
0439: /**
0440: * Retrieving the current {@link SequenceHandler}. See the method description
0441: * in {@link Tokenizer}.
0442: *
0443: * @return the currently active {@link SequenceHandler} or null, if the base
0444: * implementation is working
0445: */
0446: public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler() {
0447: synchronized (this ) {
0448: if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0) {
0449: return (de.susebox.jtopas.spi.SequenceHandler) getTokenizerProperties();
0450: } else {
0451: return _sequenceHandler;
0452: }
0453: }
0454: }
0455:
0456: /**
0457: * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
0458: * previously installed one. See the method description in {@link Tokenizer}.
0459: *
0460: * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
0461: * use or <code>null</code> to remove it
0462: * @see #getPatternHandler
0463: */
0464: public void setPatternHandler(
0465: de.susebox.jtopas.spi.PatternHandler handler) {
0466: synchronized (this ) {
0467: if (handler == _properties) {
0468: if (_properties != null
0469: && _properties.getPatterns().hasNext()) {
0470: _patternHandler = handler;
0471: } else {
0472: _patternHandler = null;
0473: }
0474: _internalFlags &= ~IFLAG_EXTERNAL_PATTERN_HANDLER;
0475: } else {
0476: _patternHandler = handler;
0477: _internalFlags |= IFLAG_EXTERNAL_PATTERN_HANDLER;
0478: }
0479: }
0480: }
0481:
0482: /**
0483: * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. See the
0484: * method description in {@link Tokenizer}.
0485: *
0486: * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
0487: * or <code>null</code>, if patterns are not recognized by the tokenizer
0488: * @see #setPatternHandler
0489: */
0490: public de.susebox.jtopas.spi.PatternHandler getPatternHandler() {
0491: synchronized (this ) {
0492: if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0) {
0493: return (de.susebox.jtopas.spi.PatternHandler) getTokenizerProperties();
0494: } else {
0495: return _patternHandler;
0496: }
0497: }
0498: }
0499:
  /**
   * Query the current row. The method can only be used if the flag {@link TokenizerProperties#F_COUNT_LINES}
   * has been set. Without this flag being set, the return value is undefined.
   *<br>
   * Note that row counting starts with 0, while editors often use 1 for the first
   * row.
   *
   * @return current row (starting with 0)
   * or -1 if the flag {@link TokenizerProperties#F_COUNT_LINES} is not set
   */
  public int getCurrentLine() {
    return _lineNumber;
  }
0513:
  /**
   * Retrieve the current column. The method can only be used if the flag <code>F_COUNT_LINES</code>
   * has been set.
   * Without this flag being set, the return value is undefined.
   * Note that column counting starts with 0, while editors often use 1 for the first
   * column in one row.
   *
   * @return current column number (starting with 0)
   */
  public int getCurrentColumn() {
    return _columnNumber;
  }
0526:
  /**
   * Checking if there are more tokens available. See the method description in
   * {@link Tokenizer}. More tokens are available as long as no EOF token has
   * been fetched yet (or no token at all).
   *
   * @return <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
   * will succeed, <code>false</code> otherwise
   */
  public boolean hasMoreToken() {
    return _scannedToken[0] == null
        || _scannedToken[0].getType() != Token.EOF;
  }
0538:
  /**
   * Retrieving the next {@link Token}. See the method description in
   * {@link Tokenizer}.
   *<br>
   * The method works on a small look-ahead array <code>_scannedToken</code>:
   * index 0 holds the current token, 1 and 2 possible look-ahead tokens. The
   * type classifiers (<code>isEOF</code>, <code>isWhitespace</code>, ...) fill
   * slot 1 as a side effect; only if none of them matched a NORMAL token is
   * created. Non-returnable tokens (plain whitespace, comments, depending on
   * flags) are consumed and the loop continues until a returnable token
   * appears.
   *
   * @return found {@link Token} including the EOF token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   * (IOExceptions for instance)
   */
  public Token nextToken() throws TokenizerException {
    boolean returnIt = false;

    // Get the next token
    __MAIN_LOOP__: do {
      // analyze look-ahead token; the classifier chain stops at the first
      // match, each classifier is expected to fill _scannedToken[1]
      if (_scannedToken[1] == null) {
        if (!isEOF(0)) {
          if (!isWhitespace(0)) {
            if (!isPattern(0, false)) {
              if (!isSpecialSequence(0)) {
                if (!isSeparator(0)) {
                  _scannedToken[1] = new Token(
                      Token.NORMAL);
                }
              }
            }
          }
        }
      }
      // shift the look-ahead pipeline by one position
      _scannedToken[0] = _scannedToken[1];
      _scannedToken[1] = _scannedToken[2];
      _scannedToken[2] = null;

      // get new token or complete the previously found look-ahead token;
      // the companion set by the classifiers is the TokenizerProperty that
      // matched (may be null for NORMAL tokens)
      Token token = _scannedToken[0];
      TokenizerProperty prop = (TokenizerProperty) token
          .getCompanion();

      token.setCompanion((prop != null) ? prop.getCompanion()
          : null);
      token.setStartPosition(getReadPosition());
      token.setStartLine(_lineNumber);
      token.setStartColumn(_columnNumber);

      returnIt = true;

      // determine the token length and whether the token is returned to the
      // caller (whitespaces and comments depend on the respective flags)
      switch (token.getType()) {
      case Token.EOF:
        token.setLength(0);
        break;
      case Token.WHITESPACE:
        token.setLength(completeWhitespace());
        returnIt = isFlagSet(Flags.F_RETURN_SIMPLE_WHITESPACES);
        break;
      case Token.SEPARATOR: // Separators are always single characters.
        token.setLength(1);
        break;
      case Token.STRING:
        token.setLength(completeString(prop));
        break;
      case Token.LINE_COMMENT:
        token.setLength(completeLineComment(prop));
        returnIt = isFlagSet(prop, Flags.F_RETURN_LINE_COMMENTS);
        break;
      case Token.BLOCK_COMMENT:
        token.setLength(completeBlockComment(prop));
        returnIt = isFlagSet(prop,
            Flags.F_RETURN_BLOCK_COMMENTS);
        break;
      case Token.SPECIAL_SEQUENCE:
        token.setLength(prop.getImages()[0].length());
        break;
      case Token.PATTERN:
        // already contained in the first look-ahead token, see token shifting
        break;
      default:
        prop = completeBoundedToken(token);
      }

      // compute new line and column positions (if flag is set) and complete
      // the token
      adjustLineAndColumn(token.getType(), token.getLength());
      token.setEndLine(_lineNumber);
      token.setEndColumn(_columnNumber);

      // need to extract the image ? Flags on the matched property override
      // the tokenizer-wide flags.
      if (returnIt) {
        boolean tokenPosOnly = (prop != null) ? isFlagSet(prop,
            Flags.F_TOKEN_POS_ONLY)
            : isFlagSet(Flags.F_TOKEN_POS_ONLY);
        boolean returnImageParts = (prop != null) ? isFlagSet(
            prop, Flags.F_RETURN_IMAGE_PARTS)
            : isFlagSet(Flags.F_RETURN_IMAGE_PARTS);
        if (!tokenPosOnly || returnImageParts) {
          token.setImage(getText(_currentReadPos, token
              .getLength()));
        }
        // split the image into its logical parts (lines, string contents,
        // comment body) depending on the token type
        if (returnImageParts) {
          switch (token.getType()) {
          case Token.WHITESPACE:
            token.setImageParts(splitIntoLines(token
                .getImage()));
            break;
          case Token.STRING:
            token.setImageParts(splitString(prop, token
                .getImage()));
            break;
          case Token.LINE_COMMENT:
            // strip the leading comment starter sequence
            token.setImageParts(splitIntoLines(token
                .getImage().substring(
                    prop.getImages()[0].length())));
            break;
          case Token.BLOCK_COMMENT:
            token.setImageParts(splitBlockComment(prop,
                token.getImage()));
            break;
          case Token.PATTERN:
            break;
          case Token.EOF:
            token.setImageParts(new String[] {});
            break;
          default:
            token.setImageParts(new String[] { token
                .getImage() });
          }
        }
      }

      // this is the one and only point where the current read position is
      // adjusted (except for the data shifting in readMoreData).
      _currentReadPos += token.getLength();

    } while (!returnIt);

    // the current token is the first in the list
    return _scannedToken[0];
  }
0675:
  /**
   * This method is a convenience method. It returns only the next token image
   * without any informations about its type or associated information. See the
   * method description in {@link Tokenizer}.
   *
   * @return the token image of the next token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   * (IOExceptions for instance)
   * @see #currentImage
   */
  public String nextImage() throws TokenizerException {
    nextToken();
    return currentImage();
  }
0690:
  /**
   * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}.
   * See the method description in {@link Tokenizer}.
   *
   * @return the {@link Token} retrieved by the last call to {@link #nextToken}.
   * @throws TokenizerException if the tokenizer has no current token
   */
  public Token currentToken() throws TokenizerException {
    if (_scannedToken[0] == null) {
      throw new TokenizerException(
          "No current token available (nextToken was not called / read position changed)");
    }
    return _scannedToken[0];
  }
0705:
0706: /**
0707: * Convenience method to retrieve only the token image of the {@link Token} that
0708: * would be returned by {@link #currentToken}. See the method description in
0709: * {@link Tokenizer}.
0710: *
0711: * @return the token image of the current token
0712: * @see #currentToken
0713: */
0714: public String currentImage() throws TokenizerException {
0715: Token token = currentToken();
0716:
0717: if (token.getType() == Token.EOF) {
0718: return null;
0719: } else if (!isFlagSet(Flags.F_TOKEN_POS_ONLY)
0720: || token.getImage() != null) {
0721: return token.getImage();
0722: } else {
0723: return getText(token.getStartPosition(), token.getLength());
0724: }
0725: }
0726:
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the line number starting with 0 in the input stream. See the method
   * description in {@link Tokenizer}.
   *
   * @return the current line number starting with 0 or -1 if no line numbers are supplied.
   * @see #getColumnNumber
   */
  public int getLineNumber() {
    return _lineNumber;
  }
0738:
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the current column position starting with 0 in the input stream. See
   * the method description in {@link Tokenizer}.
   *
   * @return the current column position or -1 if no column counting is done
   * @see #getLineNumber
   */
  public int getColumnNumber() {
    return _columnNumber;
  }
0750:
  /**
   * Getting the current read offset. See the method description in
   * {@link Tokenizer}.
   *
   * @return the absolute offset in characters from the start of the data source
   * of the Tokenizer where reading will be continued
   * @see #setReadPositionAbsolute
   * @see #setReadPositionRelative
   */
  public int getReadPosition() {
    return _currentReadPos;
  }
0763:
  /**
   * Retrieving the number of the currently available characters. See the method
   * description in {@link Tokenizer}. This is the size of the text window,
   * i.e. the distance between the window start and the write position.
   *
   * @return number of currently available characters
   */
  public int currentlyAvailable() {
    return _currentWritePos - getRangeStart();
  }
0773:
  /**
   * Try to read more data into the text buffer of the tokenizer. See the method
   * description in {@link Tokenizer}. The actual reading is delegated to the
   * base tokenizer so that embedded tokenizers share one buffer.
   *
   * @return the number of character now available
   * @throws TokenizerException generic exception (list) for all problems that
   * may occur while reading (IOExceptions for instance)
   */
  public int readMore() throws TokenizerException {
    readMoreDataFromBase();
    return currentlyAvailable();
  }
0786:
  /**
   * Returns the character at the given position. The method does not attempt to
   * read more data; the position must lie inside the current text window.
   *
   * @param pos get character on this position in the data stream
   * @return the character at the given position
   * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
   * in the available text range (text window)
   */
  public char getChar(int pos) throws IndexOutOfBoundsException {
    return getBaseDataProvider(pos, 1).getCharAt(0);
  }
0799:
  /**
   * Retrieve text from the currently available range. See the method description
   * in {@link Tokenizer}.
   *
   * @param start position where the text begins
   * @param len length of the text
   * @return the text beginning at the given position with the given length
   * @throws IndexOutOfBoundsException if the starting position or the length
   * is out of the current text window
   */
  public String getText(int start, int len)
      throws IndexOutOfBoundsException {
    return getBaseDataProvider(start, len).toString();
  }
0814:
  /**
   * This method sets the tokenizers current read position to the given absolute
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *<br>
   * Repositioning invalidates all look-ahead tokens; a following
   * {@link #currentToken} call will fail until {@link #nextToken} is invoked.
   *
   * @param position absolute position for the next parse operation
   * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
   * not in the available text range (text window)
   * @see #setReadPositionRelative
   */
  public void setReadPositionAbsolute(int position)
      throws IndexOutOfBoundsException {
    // validate against the current text window; note that the second check
    // uses a strict '>' although the message says "at or above" —
    // NOTE(review): position == _currentWritePos is accepted here, confirm
    // that this (empty remainder) is intended
    if (position < getRangeStart()) {
      throw new ExtIndexOutOfBoundsException(
          "Invalid read position {0} below the current text window start {1}.",
          new Object[] { new Integer(position),
              new Integer(getRangeStart()) });
    } else if (position > _currentWritePos) {
      throw new ExtIndexOutOfBoundsException(
          "Invalid read position {0} at or above the current text window end {1}.",
          new Object[] {
              new Integer(position),
              new Integer(currentlyAvailable()
                  + getRangeStart()) });
    }
    _currentReadPos = position;
    // discard look-ahead tokens gathered for the old position
    Arrays.fill(_scannedToken, null);

    // adjust line and column counting: find the last line start at or before
    // the new position in the position-to-line map
    if (isFlagSet(Flags.F_COUNT_LINES)) {
      SortedMap map = _position2LineMap.headMap(new Integer(
          position + 1));

      if (map != null && !map.isEmpty()) {
        Integer lastLineStart = (Integer) map.lastKey();

        _lineNumber = ((Integer) map.get(lastLineStart))
            .intValue();
        _columnNumber = position - lastLineStart.intValue();
      } else {
        // position lies before the first recorded line start
        _lineNumber = 0;
        _columnNumber = position;
      }
    }
  }
0864:
  /**
   * This method sets the tokenizers new read position the given number of characters
   * forward (positive value) or backward (negative value) starting from the current
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *
   * @param offset number of characters to move forward (positive offset) or
   * backward (negative offset)
   * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
   * move the read position out of the available text range (text window)
   * @see #setReadPositionAbsolute
   */
  public void setReadPositionRelative(int offset)
      throws IndexOutOfBoundsException {
    setReadPositionAbsolute(getReadPosition() + offset);
  }
0885:
  /**
   * Closing this tokenizer frees resources and deregisters from the
   * associated {@link TokenizerProperties} object. After closing, the
   * tokenizer is in the same (unusable) state as a freshly constructed one
   * without a data source; line/column counters are marked unavailable (-1).
   */
  public void close() {
    // deregister from the properties
    if (_properties != null) {
      _properties.removeTokenizerPropertyListener(this );
      _properties = null;
    }

    // freeing memory
    if (_position2LineMap != null) {
      _position2LineMap.clear();
      _position2LineMap = null;
    }

    // adjust members: reset all positions, flags, handlers and the links
    // into the embedded-tokenizer chain
    _eofReached = true;
    _flags = 0;
    _flagMask = 0;
    _internalFlags = 0;
    _currentReadPos = 0;
    _currentWritePos = 0;
    _lineNumber = -1;
    _columnNumber = -1;
    _nextTokenizer = null;
    _prevTokenizer = null;
    _whitespaceHandler = null;
    _separatorHandler = null;
    _keywordHandler = null;
    _sequenceHandler = null;
    _patternHandler = null;
    _source = null;
    Arrays.fill(_scannedToken, null);
  }
0922:
0923: //---------------------------------------------------------------------------
0924: // embedded tokenizer support
0925: //
0926:
0927: /**
0928: * Adding an embedded tokenizer. Embedded tokenizer work on the same input
0929: * buffer as their base tokenizer. A situation where embedded tokenizer could
0930: * be applied, is a HTML stream with cascading style sheet (CSS) and JavaScript
0931: * parts.
0932: *<br>
0933: * There are no internal means of switching from one tokenizer to another.
0934: * This should be done by the caller using the method {@link #switchTo}.
0935: *<br>
0936: * The {@link TokenizerProperties#F_KEEP_DATA} and {@link TokenizerProperties#F_COUNT_LINES}
0937: * flags of the base tokenizer take effect also in the embedded tokenizers.
0938: *<br>
0939: * Since is might be possible that the given <code>tokenizer</code> is a
0940: * derivation of the <code>AbstractTokenizer</code> class, this method is
0941: * synchronized on <code>tokenizer</code>.
0942: *
0943: * @param tokenizer an embedded tokenizer
0944: * @throws TokenizerException if something goes wrong (not likely :-)
0945: */
0946: public void addTokenizer(AbstractTokenizer tokenizer)
0947: throws TokenizerException {
0948: AbstractTokenizer curr = this ;
0949:
0950: while (curr._nextTokenizer != null) {
0951: curr = curr._nextTokenizer;
0952: }
0953:
0954: if (tokenizer != null) {
0955: synchronized (tokenizer) {
0956: curr._nextTokenizer = tokenizer;
0957: tokenizer._prevTokenizer = curr;
0958:
0959: // share the input buffer of the base tokenizer
0960: AbstractTokenizer baseTokenizer = getBaseTokenizer();
0961:
0962: tokenizer._baseTokenizer = baseTokenizer;
0963:
0964: // inherited flags
0965: tokenizer.changeParseFlags(baseTokenizer
0966: .getParseFlags(), Flags.F_COUNT_LINES);
0967: }
0968: }
0969: }
0970:
0971: /**
0972: * Changing fron one tokenizer to another. If the given tokenizer has not been
0973: * added with {@link #addTokenizer}, an exception is thrown.<br>
0974: * The <code>switchTo</code> method does the nessecary synchronisation between
0975: * <code>this</code> and the given tokenizer. The user is therefore responsible
0976: * to use <code>switchTo</code> whenever a tokenizer change is nessecary. It
0977: * must be done this way:
0978: *<blockquote><pre>
0979: * Tokenizer base = new MyTokenizer(...)
0980: * Tokenizer embedded = new MyTokenizer(...)
0981: *
0982: * // setting properties (comments, keywords etc.)
0983: * ...
0984: *
0985: * // embedding a tokenizer
0986: * base.addTokenizer(embedded);
0987: *
0988: * // tokenizing with base
0989: * ...
0990: * if (<i>switch_condition</i>) {
0991: * base.switchTo(embedded);
0992: * }
0993: *
0994: * // tokenizing with embedded
0995: * ...
0996: * if (<i>switch_condition</i>) {
0997: * embedded.switchTo(base);
0998: * }
0999: *</pre></blockquote>
1000: * That way we avoid a more complex synchronisation between tokenizers whenever
1001: * one of them parses the next data in the input stream. However, the danger
1002: * of not synchronized tokenizers remains, so take care.
1003: *<br>
1004: * Since is might be possible that the given <code>tokenizer</code> is a
1005: * derivation of the <code>AbstractTokenizer</code> class, this method is
1006: * synchronized on <code>tokenizer</code>.
1007: *
1008: * @param tokenizer the tokenizer that should be used from now on
1009: */
1010: public void switchTo(AbstractTokenizer tokenizer)
1011: throws TokenizerException {
1012: if (tokenizer != null) {
1013: synchronized (tokenizer) {
1014: if (tokenizer._baseTokenizer != _baseTokenizer) {
1015: throw new TokenizerException(
1016: "Trying to switch to an alien tokenizer (not added with addTokenizer).",
1017: null);
1018: }
1019: tokenizer._eofReached = this ._eofReached;
1020: tokenizer._currentReadPos = this ._currentReadPos;
1021: tokenizer._currentWritePos = this ._currentWritePos;
1022: tokenizer._columnNumber = this ._columnNumber;
1023: tokenizer._lineNumber = this ._lineNumber;
1024: tokenizer._position2LineMap = this ._position2LineMap;
1025: }
1026: } else {
1027: throw new TokenizerException(new NullPointerException());
1028: }
1029: }
1030:
1031: //---------------------------------------------------------------------------
1032: // Methods that may be overwritten in derived classes
1033: //
1034:
1035: /**
1036: * This method checks if the character is a whitespace. Implement Your own
1037: * code for situations where this default implementation is not fast enough
1038: * or otherwise not really good.
1039: *
1040: * @param testChar check this character
1041: * @return <code>true</code> if the given character is a whitespace,
1042: * <code>false</code> otherwise
1043: */
1044: protected boolean isWhitespace(char testChar) {
1045: if (_whitespaceHandler != null) {
1046: return _whitespaceHandler.isWhitespace(testChar);
1047: } else {
1048: return false;
1049: }
1050: }
1051:
1052: /**
1053: * This method detects the number of whitespace characters starting at the given
1054: * position. It should return the number of characters identified as whitespaces
1055: * starting from and including the given start position.
1056: *<br>
1057: * Then overriding this method, use {@link #getBaseDataProvider} to access characters.
1058: *<br>
1059: * Do not attempt to actually read more data or do anything that leads to the
1060: * change of the data source or to tokenizer switching. This is done by the
1061: * tokenizer framework.
1062: *
1063: * @param startingAtPos start checking for whitespace from this position
1064: * @param maxChars if there is no non-whitespace character, read up to this number of characters
1065: * @return number of whitespace characters starting from the given offset
1066: * @throws TokenizerException failure while reading data from the input stream
1067: */
1068: protected int readWhitespaces(int startingAtPos, int maxChars)
1069: throws TokenizerException {
1070: if (_whitespaceHandler != null) {
1071: DataProvider dataProvider = getBaseDataProvider(
1072: startingAtPos, maxChars);
1073: return _whitespaceHandler
1074: .countLeadingWhitespaces(dataProvider);
1075: } else {
1076: return 0;
1077: }
1078: }
1079:
1080: /**
1081: * This method checks if the character sequence starting at a given position
1082: * with a given lenghth is a keyword. If so, it returns the keyword description
1083: * as {@link TokenizerProperty} object.
1084: *
1085: * @param startingAtPos check at this position
1086: * @param length the candidate has this number of characters
1087: * @throws TokenizerException routed exception from the active {@link de.susebox.jtopas.spi.KeywordHandler}
1088: * @return {@link TokenizerProperty} describing the keyword or <code>null</code>
1089: */
1090: protected TokenizerProperty isKeyword(int startingAtPos, int length)
1091: throws TokenizerException {
1092: if (_keywordHandler != null) {
1093: DataProvider dataProvider = getBaseDataProvider(
1094: startingAtPos, length);
1095: return _keywordHandler.isKeyword(dataProvider);
1096: } else {
1097: return null;
1098: }
1099: }
1100:
1101: //---------------------------------------------------------------------------
1102: // TokenizerPropertyListener methods
1103: //
1104:
1105: /**
1106: * Splits a given String into lines. The method ist used to retrieve the
1107: * image parts of several token types.
1108: *
1109: * @param image split this string into lines
1110: * @return an array containing the lines of the image without line separator
1111: * characters
1112: */
1113: protected String[] splitIntoLines(String image) {
1114: LinkedList lines = new LinkedList();
1115: int index = 0;
1116: int start = 0;
1117:
1118: while (index < image.length()) {
1119: switch (image.charAt(index)) {
1120: case '\r':
1121: lines.add(image.substring(start, index));
1122: if (index + 1 < image.length()
1123: && image.charAt(index + 1) == '\n') {
1124: index += 2;
1125: } else {
1126: index++;
1127: }
1128: start = index;
1129: break;
1130: case '\n':
1131: lines.add(image.substring(start, index));
1132: start = ++index;
1133: break;
1134: default:
1135: index++;
1136: }
1137: }
1138:
1139: if (start < index || start > 0) {
1140: lines.add(image.substring(start, index));
1141: }
1142:
1143: return (String[]) lines.toArray(new String[lines.size()]);
1144: }
1145:
1146: /**
1147: * Splits a given string into lines and removing string escapes. The method is
1148: * used to retrieve the image parts for string token types.
1149: *
1150: * @param prop the {@link TokenizerProperty} describing a string
1151: * @param image split this string into lines
1152: * @return an array containing the lines of the image without line separator
1153: * characters
1154: */
1155: protected String[] splitString(TokenizerProperty prop, String image) {
1156: // complete string
1157: String[] images = prop.getImages();
1158: String begin = images[0];
1159: String end = images[1];
1160: String esc = images[2];
1161: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1162: boolean escEqualsEnd = (!noCase && esc.compareTo(end) == 0)
1163: || (noCase && esc.compareToIgnoreCase(end) == 0);
1164:
1165: StringBuffer buffer = null;
1166: int index = begin.length();
1167: int start = index;
1168: int endIndex;
1169:
1170: if (image.length() - start >= end.length()
1171: && ((!noCase && end.equals(image.substring(image
1172: .length()
1173: - end.length()))) || (noCase && end
1174: .equalsIgnoreCase(image.substring(image
1175: .length()
1176: - end.length()))))) {
1177: endIndex = image.length() - end.length();
1178: } else {
1179: endIndex = image.length();
1180: }
1181:
1182: while (index < endIndex) {
1183: if ((!noCase && image.startsWith(esc, index))
1184: || (noCase && image.substring(index,
1185: index + esc.length()).equalsIgnoreCase(esc))) {
1186: if (buffer == null) {
1187: buffer = new StringBuffer(image.length());
1188: }
1189: buffer.append(image.substring(start, index));
1190: index += esc.length();
1191: if (index < image.length()) {
1192: if ((!noCase && image.startsWith(esc, index))
1193: || (noCase && image.substring(index,
1194: index + esc.length())
1195: .equalsIgnoreCase(esc))) {
1196: buffer.append(esc);
1197: index += esc.length();
1198: } else if ((!noCase && image.startsWith(begin,
1199: index))
1200: || (noCase && image.substring(index,
1201: index + begin.length())
1202: .equalsIgnoreCase(begin))) {
1203: buffer.append(begin);
1204: index += begin.length();
1205: } else if ((!noCase && image.startsWith(end, index))
1206: || (noCase && image.substring(index,
1207: index + end.length())
1208: .equalsIgnoreCase(end))) {
1209: buffer.append(end);
1210: index += end.length();
1211: }
1212: }
1213: start = index;
1214: }
1215: index++;
1216: }
1217:
1218: if (buffer != null && start < index) {
1219: buffer.append(image.substring(start, endIndex));
1220: }
1221:
1222: return splitIntoLines((buffer != null) ? buffer.toString()
1223: : image.substring(start, endIndex));
1224: }
1225:
1226: /**
1227: * Splits a given block comment into lines. The method is used to retrieve the
1228: * image parts for block comment token types.
1229: *
1230: * @param prop the {@link TokenizerProperty} describing a block comment
1231: * @param image split this string into lines
1232: * @return an array containing the lines of the image without line separator
1233: * characters
1234: */
1235: protected String[] splitBlockComment(TokenizerProperty prop,
1236: String image) {
1237: // complete string
1238: String[] images = prop.getImages();
1239: String start = images[0];
1240: String end = images[1];
1241: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1242:
1243: if (image.length() - start.length() >= end.length()
1244: && ((!noCase && end.equals(image.substring(image
1245: .length()
1246: - end.length()))) || (noCase && end
1247: .equalsIgnoreCase(image.substring(image
1248: .length()
1249: - end.length()))))) {
1250: return splitIntoLines(image.substring(start.length(), image
1251: .length()
1252: - end.length()));
1253: } else {
1254: return splitIntoLines(image.substring(start.length()));
1255: }
1256: }
1257:
1258: /**
1259: * Event handler method. The given {@link TokenizerPropertyEvent} parameter
1260: * contains the nessecary information about the property change. We choose
1261: * one single method in favour of various more specialized methods since the
1262: * reactions on adding, removing and modifying tokenizer properties are often
1263: * the same (flushing cash, rereading information etc.) are probably not very
1264: * different.
1265: *<br>
1266: * Note that a modification of the parse flags in the backing {@link TokenizerProperties}
1267: * object removes all flags previously modified through {@link #changeParseFlags}.
1268: *
1269: * @param event the {@link TokenizerPropertyEvent} that describes the change
1270: */
1271: public void propertyChanged(TokenizerPropertyEvent event) {
1272: TokenizerProperty prop = event.getProperty();
1273: String[] images = prop.getImages();
1274:
1275: synchronized (this ) {
1276: switch (event.getType()) {
1277: case TokenizerPropertyEvent.PROPERTY_ADDED:
1278: case TokenizerPropertyEvent.PROPERTY_REMOVED:
1279: switch (prop.getType()) {
1280: case Token.LINE_COMMENT:
1281: case Token.BLOCK_COMMENT:
1282: case Token.STRING:
1283: case Token.SPECIAL_SEQUENCE:
1284: if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0
1285: && _properties instanceof de.susebox.jtopas.spi.SequenceHandler) {
1286: setSequenceHandler((de.susebox.jtopas.spi.SequenceHandler) _properties);
1287: }
1288: break;
1289: case Token.KEYWORD:
1290: if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0
1291: && _properties instanceof de.susebox.jtopas.spi.KeywordHandler) {
1292: setKeywordHandler((de.susebox.jtopas.spi.KeywordHandler) _properties);
1293: }
1294: break;
1295: case Token.PATTERN:
1296: if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0
1297: && _properties instanceof de.susebox.jtopas.spi.PatternHandler) {
1298: setPatternHandler((de.susebox.jtopas.spi.PatternHandler) _properties);
1299: }
1300: break;
1301: }
1302: break;
1303:
1304: case TokenizerPropertyEvent.PROPERTY_MODIFIED:
1305: switch (prop.getType()) {
1306: case TokenizerProperty.PARSE_FLAG_MASK:
1307: _flags = getTokenizerProperties().getParseFlags();
1308: _flagMask = 0;
1309: if (isFlagSet(Flags.F_COUNT_LINES)) {
1310: if (_lineNumber < 0) {
1311: if (_position2LineMap != null) {
1312: _position2LineMap.clear();
1313: }
1314: _lineNumber = 0;
1315: putPosition(_currentReadPos, _lineNumber);
1316: }
1317: if (_columnNumber < 0) {
1318: _columnNumber = 0;
1319: }
1320: } else {
1321: _lineNumber = -1;
1322: _columnNumber = -1;
1323: }
1324: break;
1325: }
1326: break;
1327: }
1328: }
1329: }
1330:
1331: //---------------------------------------------------------------------------
1332: // Implementation
1333: //
1334:
1335: /**
1336: * Embedded tokenizers have their base tokenizer they share the input stream
1337: * with.
1338: *
1339: * @return the base tokenizer (the one owning the input stream and text buffer)
1340: */
1341: protected AbstractTokenizer getBaseTokenizer() {
1342: return _baseTokenizer;
1343: }
1344:
1345: /**
1346: * Returns the {@link de.susebox.jtopas.spi.DataProvider} of the base tokenizer.
1347: * This is this tokenizer if it is not an embedded one.
1348: *
1349: * @param startPos position in the input data
1350: * @param length number of characters
1351: * @return the <code>DataProvider</code> for the given data range
1352: */
1353: protected DataProvider getBaseDataProvider(int startPos, int length) {
1354: return getBaseTokenizer().getDataProvider(startPos, length);
1355: }
1356:
1357: /**
1358: * This method organizes the input buffer. It moves the current text window if
1359: * nessecary or allocates more space, if data should be kept completely (see the
1360: * {@link TokenizerProperties#F_KEEP_DATA} flag).
1361: * Its main purpose is to call the {@link TokenizerSource#read} method.
1362: *
1363: * @return number of read bytes or -1 if an end-of-file condition occured
1364: * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
1365: * method
1366: */
1367: protected int readMoreDataFromBase() throws TokenizerException {
1368: // its always the base tokenizer doing the reading
1369: int readChars = -1;
1370:
1371: if (!_eofReached) {
1372: AbstractTokenizer baseTokenizer = getBaseTokenizer();
1373:
1374: if (baseTokenizer != this ) {
1375: readChars = baseTokenizer.readMoreData();
1376: } else {
1377: readChars = readMoreData();
1378: }
1379: if (readChars > 0) {
1380: _currentWritePos += readChars;
1381: } else if (readChars < 0) {
1382: readChars = -1;
1383: _eofReached = true;
1384: }
1385:
1386: // Inform all embedded tokenizers about input buffer changes
1387: synchronizeAll();
1388: }
1389: return readChars;
1390: }
1391:
1392: /**
1393: * When the method {@link #readMoreData} changes the contents of the input buffer
1394: * or the input buffer itself, all embedded tokenizers must be synchronized.
1395: * That means their member variables are adjusted to the base tokenizer.
1396: *
1397: * @throws TokenizerException if something goes wrong
1398: */
1399: protected void synchronizeAll() throws TokenizerException {
1400: AbstractTokenizer embedded = getBaseTokenizer();
1401:
1402: while ((embedded = embedded._nextTokenizer) != null) {
1403: switchTo(embedded); // adjust the member variables
1404: }
1405: }
1406:
1407: /**
1408: * Checks the EOF condition at the given offset.
1409: *
1410: * @param offset check at this position relative to the current read position
1411: * @return <code>true</code> if EOF has been reached, <code>false</code> otherwise
1412: * @throws TokenizerException failure while reading data from the input stream
1413: */
1414: protected boolean isEOF(int offset) throws TokenizerException {
1415: if (_currentReadPos + offset < _currentWritePos
1416: || readMoreDataFromBase() > 0) {
1417: return false;
1418: } else {
1419: _scannedToken[1] = new Token(Token.EOF);
1420: return true;
1421: }
1422: }
1423:
1424: /**
1425: * The number of characters until the next comment, whitespace, string, special
1426: * sequence or separator are determined. The character sequnce is then checked
1427: * for keyword or pattern matching.
1428: *
1429: * @param token buffer to receive information about the keyword or normal token
1430: * @return <code>null</code> or a {@link TokenizerProperty} if a keyword or pattern is found
1431: * @throws TokenizerException failure while reading data from the input stream
1432: */
1433: protected TokenizerProperty completeBoundedToken(Token token)
1434: throws TokenizerException {
1435: // find out the return value (length of normal token)
1436: int len = 1; // the first character is a normal one, see call of this method
1437:
1438: while (!(isEOF(len) || isWhitespace(len)
1439: || isPattern(len, true) || isSpecialSequence(len) || isSeparator(len))) {
1440: len++;
1441: }
1442: token.setLength(len);
1443:
1444: // test on keyword or non-free pattern
1445: TokenizerProperty prop = null;
1446: PatternHandler.Result result;
1447:
1448: if ((prop = isKeyword(_currentReadPos, len)) != null) {
1449: token.setType(Token.KEYWORD);
1450: token.setCompanion(prop.getCompanion());
1451: } else {
1452: token.setType(Token.NORMAL);
1453: }
1454: return prop;
1455: }
1456:
1457: /**
1458: * After having identified a whitespace, this method continues to read data
1459: * until it detects a non-whitespace.
1460: *
1461: * @return number of consecutive whitespaces
1462: * @throws TokenizerException failure while reading data from the input stream
1463: */
1464: protected int completeWhitespace() throws TokenizerException {
1465: int start = _currentReadPos + 1; // the first whitespace we have already
1466: int available = _currentWritePos - start;
1467: int len = readWhitespaces(start, available);
1468:
1469: while (len == available) {
1470: if (readMoreDataFromBase() <= 0) {
1471: break;
1472: }
1473: start += len;
1474: available = _currentWritePos - start;
1475: len += readWhitespaces(start, available);
1476: }
1477: return len + 1; // the first whitespace we had already
1478: }
1479:
1480: /**
1481: * This method checks at the given offset if it is a whitespace.
1482: *
1483: * @param offset check at this position relative to the current read position
1484: * @throws TokenizerException failure while reading data from the input stream
1485: * @return <code>true</code> if a whitespace sequence was found at the given offset,
1486: * <code>false</code> otherwise
1487: */
1488: protected boolean isWhitespace(int offset)
1489: throws TokenizerException {
1490: if (_whitespaceHandler != null) {
1491: if (_currentReadPos + offset >= _currentWritePos
1492: && readMoreDataFromBase() < 0) {
1493: return false;
1494: }
1495:
1496: if (isWhitespace(getChar(_currentReadPos + offset))) {
1497: _scannedToken[1] = new Token(Token.WHITESPACE);
1498: return true;
1499: }
1500: }
1501: return false;
1502: }
1503:
1504: /**
1505: * This method checks at the given offset if it contains a separator.
1506: *
1507: * @param offset check at this position relative to the current read position
1508: * @throws TokenizerException failure while reading data from the input stream
1509: * @return <code>true</code> if a separator was found atthe given offset,
1510: * <code>false</code> otherwise
1511: */
1512: protected boolean isSeparator(int offset) throws TokenizerException {
1513: if (_separatorHandler != null
1514: && (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0)
1515: && _separatorHandler
1516: .isSeparator(getChar(_currentReadPos + offset))) {
1517: _scannedToken[1] = new Token(Token.SEPARATOR);
1518: return true;
1519: } else {
1520: return false;
1521: }
1522: }
1523:
1524: /**
1525: * Testing for pattern matching.
1526: *
1527: * @param offset check at this position relative to the current read position
1528: * @param freePatternOnly if <code>true</code> consider only pattern that can occur anywhere in the data
1529: * @throws TokenizerException failure while reading data from the input stream
1530: * @return <code>true</code> if a pattern match was found at the given offset,
1531: * <code>false</code> otherwise
1532: */
1533: protected boolean isPattern(int offset, boolean freePatternOnly)
1534: throws TokenizerException {
1535: if (_patternHandler != null) {
1536: // for pattern, we might need a lot of data
1537: int startingAtPos = _currentReadPos + offset;
1538:
1539: while (_currentWritePos - startingAtPos < PATTERN_MAX_SIZE) {
1540: if (readMoreDataFromBase() <= 0) {
1541: break;
1542: }
1543: }
1544:
1545: // try pattern matching
1546: DataProvider dataProvider = getBaseDataProvider(
1547: startingAtPos, _currentWritePos - startingAtPos);
1548: PatternHandler.Result result = _patternHandler
1549: .matches(dataProvider);
1550: boolean isFree = (result != null) ? isFlagSet(result
1551: .getProperty(), Flags.F_FREE_PATTERN) : false;
1552:
1553: if (result != null && (isFree || !freePatternOnly)) {
1554: if (!isFree) {
1555: int nextOffset = offset + result.getLengthOfMatch();
1556:
1557: if (isEOF(nextOffset) || isWhitespace(nextOffset)
1558: || isPattern(nextOffset, true)
1559: || isSpecialSequence(nextOffset)
1560: || isSeparator(nextOffset)) {
1561: _scannedToken[2] = _scannedToken[1];
1562: } else {
1563: return false;
1564: }
1565: }
1566: _scannedToken[1] = new Token(Token.PATTERN, null,
1567: result.getProperty());
1568: _scannedToken[1].setLength(result.getLengthOfMatch());
1569: if (isFlagSet(result.getProperty(),
1570: Flags.F_RETURN_IMAGE_PARTS)) {
1571: _scannedToken[1].setImageParts(result.getGroups());
1572: }
1573: return true;
1574: }
1575: }
1576:
1577: // no pattern matching available or no match found
1578: return false;
1579: }
1580:
1581: /**
1582: * This method checks at the given offset if it contains a a special sequence.
1583: * Unlike the method {@link #test4SpecialSequence} it does nothing more.
1584: *
1585: * @param offset check at this position relative to the current read position
1586: * @throws TokenizerException failure while reading data from the input stream
1587: * @return <code>true</code> if a special sequence was found at the given offset,
1588: * <code>false</code> otherwise
1589: */
1590: protected boolean isSpecialSequence(int offset)
1591: throws TokenizerException {
1592: if (_sequenceHandler != null) {
1593: // do we need more data to ensure enough characters for even the longest
1594: // possible sequence match
1595: int startingAtPos = _currentReadPos + offset;
1596:
1597: while (_sequenceHandler.getSequenceMaxLength() > _currentWritePos
1598: - startingAtPos) {
1599: if (readMoreDataFromBase() <= 0) {
1600: break;
1601: }
1602: }
1603:
1604: // invoke the sequence handler
1605: DataProvider dataProvider = getBaseDataProvider(
1606: startingAtPos, _currentWritePos - startingAtPos);
1607: TokenizerProperty prop = _sequenceHandler
1608: .startsWithSequenceCommentOrString(dataProvider);
1609:
1610: if (prop != null) {
1611: _scannedToken[1] = new Token(prop.getType(), null, prop);
1612: return true;
1613: }
1614: }
1615:
1616: // no sequence handler given or no special sequence at given offset
1617: return false;
1618: }
1619:
1620: /**
1621: * Completing a line comment. After a line comment sequence has been found, all
1622: * characters up to and including the end-of-line combination belong to the
1623: * line comment. Note that on reaching end-of-file a line comment does not
1624: * nessecarily ends with an end-of-line sequence (linefeed for example).
1625: *
1626: * @param prop the property describing the line comment to complete
1627: * @return length of the line comment
1628: * @throws TokenizerException failure while reading data from the input stream
1629: */
1630: protected int completeLineComment(TokenizerProperty prop)
1631: throws TokenizerException {
1632: String[] images = prop.getImages();
1633: int len = images[0].length();
1634:
1635: while (_currentReadPos + len < _currentWritePos
1636: || readMoreDataFromBase() > 0) {
1637: switch (getChar(_currentReadPos + len)) {
1638: case '\r':
1639: len++;
1640: if (_currentReadPos + len < _currentWritePos
1641: || readMoreDataFromBase() > 0) {
1642: if (getChar(_currentReadPos + len) == '\n') {
1643: len++;
1644: }
1645: }
1646: return len;
1647: case '\n':
1648: len++;
1649: return len;
1650: default:
1651: len++;
1652: }
1653: }
1654: return len;
1655: }
1656:
1657: /**
1658: * Completing a block comment. After a block comment sequence has been found, all
1659: * characters up to and including the end sequence of the block comment belong
1660: * to the block comment. Note that on reaching end-of-file a block comment does
1661: * not nessecarily ends with an end-of-block-comment sequence.
1662: *
1663: * @param prop the property describing the block comment to complete
1664: * @return length of the block comment
1665: * @throws TokenizerException failure while reading data from the input stream
1666: */
1667: protected int completeBlockComment(TokenizerProperty prop)
1668: throws TokenizerException {
1669: String[] images = prop.getImages();
1670: String start = images[0];
1671: String end = images[1];
1672: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1673: boolean nested = isFlagSet(prop, Flags.F_ALLOW_NESTED_COMMENTS);
1674: int len = start.length();
1675: int level = 0;
1676:
1677: __LOOP__: do {
1678: // test on nested comments: we take only care for nesting the same
1679: // block comment
1680: if (nested) {
1681: switch (comparePrefix(len, start, noCase)) {
1682: case 0: // comment start identified
1683: level++;
1684: len += start.length();
1685: continue __LOOP__;
1686: case -1: // EOF reached
1687: return _currentWritePos - _currentReadPos;
1688: }
1689: }
1690:
1691: // is it the end ?
1692: switch (comparePrefix(len, end, noCase)) {
1693: case 0: // comment end identified
1694: level--;
1695: len += end.length();
1696: break;
1697: case -1: // EOF reached
1698: return _currentWritePos - _currentReadPos;
1699: default:
1700: len++;
1701: }
1702: } while (level >= 0);
1703:
1704: // block comment regularly terminated
1705: return len;
1706: }
1707:
1708: /**
1709: * Completing a string. After a string start sequence has been found, all
1710: * characters up to and including the end-of-string sequence belong to the
1711: * string. Note that on reaching end-of-file a string does not nessecarily ends
1712: * with an end-of-string sequence.
1713: *
1714: * @param prop the property describing the string to complete
1715: * @return length of the string
1716: * @throws TokenizerException failure while reading data from the input stream
1717: */
1718: protected int completeString(TokenizerProperty prop)
1719: throws TokenizerException {
1720: // complete string
1721: String[] images = prop.getImages();
1722: String start = images[0];
1723: String end = images[1];
1724: String esc = images[2];
1725: int len = start.length();
1726: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1727: boolean escEqualsEnd = (!noCase && esc.compareTo(end) == 0)
1728: || (noCase && esc.compareToIgnoreCase(end) == 0);
1729:
1730: while (true) {
1731: // test on escape
1732: if (esc != null) {
1733: switch (comparePrefix(len, esc, noCase)) {
1734: case 0: // escape found
1735: len += esc.length();
1736: if (escEqualsEnd) {
1737: switch (comparePrefix(len, end, noCase)) {
1738: case 0:
1739: len += end.length();
1740: break;
1741: case -1: // EOF reached
1742: return _currentWritePos - _currentReadPos;
1743: default: // this is the regular return point if the esc is the string end
1744: return len;
1745: }
1746: } else {
1747: len++; // esc != string end: skip the next character
1748: }
1749: continue;
1750: case -1: // EOF reached
1751: return _currentWritePos - _currentReadPos;
1752: }
1753: }
1754:
1755: // test on end sequence
1756: switch (comparePrefix(len, end, noCase)) {
1757: case 0: // this is the regular return point if esc != string end
1758: len += end.length();
1759: return len;
1760: case -1: // EOF reached
1761: return _currentWritePos - _currentReadPos;
1762: default:
1763: len++;
1764: }
1765: }
1766: }
1767:
1768: /**
1769: * This method compares the characters at the given offset (from the current
1770: * read position) with the given prefix.
1771: *
1772: * @param offset start comparing at this offset from the current read position
1773: * @param prefic compare read data with this prefix
1774: * @param noCase case- or not case-sensitive comparison
1775: * @throws TokenizerException failure while reading data from the input stream
1776: * @return 0 if the the given prefix matches the input stream, -1 on EOF and
1777: * 1 if not matching
1778: */
1779: protected int comparePrefix(int offset, String prefix,
1780: boolean noCase) throws TokenizerException {
1781: // compare
1782: int len = prefix.length();
1783:
1784: for (int pos = offset; pos < offset + len; ++pos) {
1785: // do we have enough data
1786: if (_currentReadPos + pos >= _currentWritePos
1787: && readMoreDataFromBase() < 0) {
1788: return -1;
1789: }
1790:
1791: // compare single character
1792: char c1 = prefix.charAt(pos - offset);
1793: char c2 = getChar(_currentReadPos + pos);
1794:
1795: if (c1 != c2
1796: && (!noCase || Character.toUpperCase(c1) != Character
1797: .toUpperCase(c2))) {
1798: return 1;
1799: }
1800: }
1801:
1802: // found
1803: return 0;
1804: }
1805:
1806: /**
1807: * The method recomputes the line and column position of the tokenizer, if the
1808: * flag {@link TokenizerProperties#F_COUNT_LINES} is set. It gets the token type of the
1809: * {@link Token} that has been retrieved by the calling {@link #nextToken}.
1810: * Using the tokenizer control flags and certain other information it tries to
1811: * to find end-of-line sequences as fast as possible. For example, a line
1812: * comment should always contain a end-of-line sequence, so we can simply
1813: * increase the line count and set the column count to 0.
1814: *
1815: * @param type the type of the current token
1816: * @param length the length of the current token
1817: */
1818: protected void adjustLineAndColumn(int type, int length) {
1819: // line and column counting not required
1820: if (!isFlagSet(Flags.F_COUNT_LINES)) {
1821: return;
1822: }
1823:
1824: // there might be a simple way to determine the current line and column position
1825: switch (type) {
1826: case Token.EOF:
1827: return;
1828:
1829: case Token.LINE_COMMENT: // a line comment always ends with a newline
1830: _lineNumber++;
1831: _columnNumber = 0;
1832: putPosition(_currentReadPos + length, _lineNumber);
1833: return;
1834:
1835: case Token.SPECIAL_SEQUENCE:
1836: case Token.SEPARATOR:
1837: case Token.NORMAL:
1838: case Token.KEYWORD:
1839: if (_whitespaceHandler != null
1840: && _whitespaceHandler.newlineIsWhitespace()) { // newline is a whitespace character
1841: _columnNumber += length; // it should therefore not occure in other
1842: return; // tokens
1843: }
1844: break;
1845:
1846: case Token.WHITESPACE:
1847: if (!(_whitespaceHandler.isWhitespace('\n') || _whitespaceHandler
1848: .isWhitespace('\r'))) {
1849: _columnNumber += length; // newline is not a whitespace; we do not have
1850: return; // to test for it in the current token
1851: }
1852: break;
1853: }
1854:
1855: // count it
1856: int newLineNumber = _lineNumber;
1857:
1858: for (int pos = _currentReadPos; pos < _currentReadPos + length; ++pos) {
1859: switch (getChar(pos)) {
1860: case '\r':
1861: if (pos + 1 >= _currentReadPos + length
1862: || getChar(pos + 1) != '\n') {
1863: _lineNumber++;
1864: _columnNumber = 0;
1865: putPosition(pos + 1, _lineNumber);
1866: break;
1867: }
1868: pos++;
1869: /* no break; */
1870: case '\n':
1871: _lineNumber++;
1872: _columnNumber = 0;
1873: putPosition(pos + 1, _lineNumber);
1874: break;
1875:
1876: default:
1877: _columnNumber++;
1878: }
1879: }
1880: }
1881:
1882: /**
1883: * Putting a new position into the position-to-line-number map.
1884: *
1885: * @param position the position to map to the current line number
1886: */
1887: private void putPosition(int position, int lineNumber) {
1888: if (_position2LineMap == null) {
1889: _position2LineMap = new TreeMap();
1890: }
1891: _position2LineMap.put(new Integer(position), new Integer(
1892: lineNumber));
1893: }
1894:
1895: /**
1896: * Checking a given flag. The method considers both the globally set flags
1897: * in the associated {@link TokenizerProperties} instance and the locally set
1898: * by {@link #changeParseFlags}.
1899: *
1900: * @param flag one of the <code>F_...</code> flags defined in {@link TokenizerProperties}
1901: */
1902: protected boolean isFlagSet(int flag) {
1903: return (getParseFlags() & flag) != 0;
1904: }
1905:
1906: /**
1907: * Checking if a given flag is set for the given {@link TokenizerProperty}, for
1908: * this <code>Tokenizer</code> or for the used {@link TokenizerProperties}. The method considers both the globally set flags
1909: * in the associated {@link TokenizerProperties} instance and the locally set
1910: * by {@link #changeParseFlags}.
1911: *
1912: * @param prop check the flag for this property
1913: * @param flag one of the {@link Flags} constants
1914: */
1915: protected boolean isFlagSet(TokenizerProperty prop, int flag) {
1916: return prop.isFlagSet(flag, (getTokenizerProperties()
1917: .getParseFlags() & flag) != 0
1918: || isFlagSet(flag));
1919: }
1920:
1921: //---------------------------------------------------------------------------
1922: // Class members
1923: //
1924:
1925: /**
1926: * mask of flags that can be set separately for a <code>AbstractTokenizer</code>.
1927: */
1928: protected static final int VALID_FLAGS_MASK = Flags.F_RETURN_WHITESPACES
1929: | Flags.F_TOKEN_POS_ONLY
1930: | Flags.F_KEEP_DATA
1931: | Flags.F_COUNT_LINES;
1932:
1933: /**
* {@link TokenizerProperties} that are used if no others have been
1935: * specified by calling {@link #setTokenizerProperties}.
1936: */
1937: protected StandardTokenizerProperties _defaultProperties = null;
1938:
1939: /**
1940: * Buffer sizes
1941: */
1942: private static final int PATTERN_MAX_SIZE = 0x40000; // 256K
1943:
1944: /**
1945: * Bits for the internal flag bitmask
1946: */
1947: private static final byte IFLAG_EXTERNAL_PATTERN_HANDLER = 0x01;
1948: private static final byte IFLAG_EXTERNAL_KEYWORD_HANDLER = 0x02;
1949: private static final byte IFLAG_EXTERNAL_SEQUENCE_HANDLER = 0x04;
1950:
1951: //---------------------------------------------------------------------------
1952: // Members
1953: //
1954:
1955: /**
1956: * overall tokenizer flags.
1957: */
1958: protected int _flags = 0;
1959:
1960: /**
1961: * a combination of <code>F_...</code> constants defined in {@link TokenizerProperties}
1962: * indicating which bits in the {@link #_flags} member are valid. All other
1963: * flags are taken from the associated {@link TokenizerProperties} object.
1964: *
1965: * @see #changeParseFlags
1966: */
1967: private int _flagMask = 0;
1968:
1969: /**
* Flag if EOF has been reached. The flag should speed up calls to {@link #readMoreDataFromBase}
1971: */
1972: private boolean _eofReached = true;
1973:
1974: /**
* Data index where {@link #nextToken} will start parsing.
1976: */
1977: protected int _currentReadPos = 0;
1978:
1979: /**
* Data index where {@link #readMoreDataFromBase} will fill in new data.
1981: */
1982: protected int _currentWritePos = 0;
1983:
1984: /**
1985: * if line counting is enabled, this contains the current line number starting
1986: * with 0.
1987: */
1988: protected int _lineNumber = -1;
1989:
1990: /**
1991: * if line counting is enabled, this contains the current column number starting
1992: * with 0.
1993: */
1994: protected int _columnNumber = -1;
1995:
1996: /**
1997: * List of currently known token. The first element is the current token returned
1998: * by the last call to {@link #nextToken}. The following elements are look-ahead
1999: * token that have already been identified when extracting the current token.
2000: */
2001: protected Token[] _scannedToken = new Token[] { null, null, null };
2002:
2003: /**
* For embedded tokenizers: this is the list of the succeeding tokenizers
2005: */
2006: protected AbstractTokenizer _nextTokenizer = null;
2007:
2008: /**
2009: * For embedded tokenizers: this is the base tokenizer that reads the data
2010: */
2011: protected AbstractTokenizer _baseTokenizer = null;
2012:
2013: /**
2014: * For embedded tokenizers: this is the list of the previous tokenizers
2015: */
2016: protected AbstractTokenizer _prevTokenizer = null;
2017:
2018: /**
2019: * Whitespace handler
2020: */
2021: private de.susebox.jtopas.spi.WhitespaceHandler _whitespaceHandler = null;
2022:
2023: /**
2024: * Separator handler
2025: */
2026: private de.susebox.jtopas.spi.SeparatorHandler _separatorHandler = null;
2027:
2028: /**
2029: * Keyword handler
2030: */
2031: private de.susebox.jtopas.spi.KeywordHandler _keywordHandler = null;
2032:
2033: /**
2034: * Sequence handler
2035: */
2036: private de.susebox.jtopas.spi.SequenceHandler _sequenceHandler = null;
2037:
2038: /**
* Pattern handler
2040: */
2041: private de.susebox.jtopas.spi.PatternHandler _patternHandler = null;
2042:
2043: /**
2044: * The source of input data
2045: */
2046: private TokenizerSource _source = null;
2047:
2048: /**
2049: * The characteristics of this tokenizer.
2050: */
2051: private TokenizerProperties _properties = null;
2052:
2053: /**
* Position to line number mapping
2055: */
2056: private TreeMap _position2LineMap = null;
2057:
2058: /**
2059: * Control flags for the internal work
2060: */
2061: private long _internalFlags = 0;
2062: }
|