0001: /*
0002: * AbstractTokenizer.java: base class for Tokenizer implementations.
0003: *
0004: * Copyright (C) 2004 Heiko Blau
0005: *
0006: * This file belongs to the JTopas Library.
0007: * JTopas is free software; you can redistribute it and/or modify it
0008: * under the terms of the GNU Lesser General Public License as published by the
0009: * Free Software Foundation; either version 2.1 of the License, or (at your
0010: * option) any later version.
0011: *
0012: * This software is distributed in the hope that it will be useful, but WITHOUT
0013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0014: * FITNESS FOR A PARTICULAR PURPOSE.
0015: * See the GNU Lesser General Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser General Public License along
0018: * with JTopas. If not, write to the
0019: *
0020: * Free Software Foundation, Inc.
0021: * 59 Temple Place, Suite 330,
0022: * Boston, MA 02111-1307
0023: * USA
0024: *
0025: * or check the Internet: http://www.fsf.org
0026: *
0027: * Contact:
0028: * email: heiko@susebox.de
0029: */
0030:
0031: package de.susebox.jtopas;
0032:
0033: //-----------------------------------------------------------------------------
0034: // Imports
0035: //
0036: import java.io.Reader;
0037: import java.util.SortedMap;
0038: import java.util.TreeMap;
0039: import java.util.LinkedList;
0040: import java.util.Arrays;
0041:
0042: import de.susebox.java.lang.ExtIndexOutOfBoundsException;
0043:
0044: import de.susebox.jtopas.spi.WhitespaceHandler;
0045: import de.susebox.jtopas.spi.KeywordHandler;
0046: import de.susebox.jtopas.spi.PatternHandler;
0047: import de.susebox.jtopas.spi.SeparatorHandler;
0048: import de.susebox.jtopas.spi.SequenceHandler;
0049:
0050: import de.susebox.jtopas.spi.StandardWhitespaceHandler;
0051: import de.susebox.jtopas.spi.StandardKeywordHandler;
0052: import de.susebox.jtopas.spi.StandardSeparatorHandler;
0053: import de.susebox.jtopas.spi.StandardSequenceHandler;
0054:
0055: import de.susebox.jtopas.spi.DataProvider;
0056: import de.susebox.jtopas.spi.DataMapper;
0057:
0058: //-----------------------------------------------------------------------------
0059: // Class AbstractTokenizer
0060: //
0061:
0062: /**<p>
0063: * Base class for {@link Tokenizer} implementations. <code>AbstractTokenizer</code>
0064: * separates the data analysis from the actual data provision. Although the class
0065: * maintains read and write positions the physical representation of the logical
0066: * character buffer behind these positions concerns only the subclasses.
0067: *</p>
0068: *
0069: * @see Tokenizer
0070: * @see TokenizerProperties
0071: * @author Heiko Blau
0072: */
0073: public abstract class AbstractTokenizer implements Tokenizer,
0074: TokenizerPropertyListener {
0075:
0076: //---------------------------------------------------------------------------
0077: // Abstract methods
0078: //
0079:
  /**
   * Subclasses have to provide {@link de.susebox.jtopas.spi.DataProvider}
   * instances for various token type handlers. The given start position is the
   * absolute number of characters from the beginning of the data source.
   *<br>
   * Implementations are expected to map the logical character range
   * [startPos, startPos + length) onto their physical buffer.
   *
   * @param startPos position in the input data (absolute character offset)
   * @param length number of characters
   * @return the <code>DataProvider</code> for the given data range
   */
  protected abstract DataProvider getDataProvider(int startPos,
      int length);
0091:
  /**
   * This method is called when the tokenizer runs out of data. Its main purpose
   * is to call the {@link TokenizerSource#read} method. It is also responsible
   * for handling the flag {@link TokenizerProperties#F_KEEP_DATA}.
   *
   * @return number of read bytes or -1 if an end-of-file condition occured
   * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
   * method
   */
  protected abstract int readMoreData() throws TokenizerException;
0102:
0103: //---------------------------------------------------------------------------
0104: // Constructors
0105: //
0106:
0107: /**
0108: * Default constructor that sets the tokenizer control flags as it would be
0109: * approbriate for C/C++ and Java. Found token images are copied. No line nor
0110: * column informations are provided. Nested comments are not allowed.
0111: *<br>
0112: * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
0113: * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
0114: * separator handling.
0115: */
0116: public AbstractTokenizer() {
0117: _baseTokenizer = this ;
0118: if (_defaultProperties == null) {
0119: _defaultProperties = new StandardTokenizerProperties();
0120: }
0121: setTokenizerProperties(_defaultProperties);
0122: }
0123:
  /**
   * Constructing an <code>AbstractTokenizer</code> with a backing {@link TokenizerProperties}
   * instance.
   *
   * @param properties a {@link TokenizerProperties} object containing the
   * settings for the tokenizing process
   */
  public AbstractTokenizer(TokenizerProperties properties) {
    _baseTokenizer = this ;
    setTokenizerProperties(properties);
  }
0135:
0136: //---------------------------------------------------------------------------
0137: // data source
0138: //
0139:
0140: /**
0141: * Setting the source of data. This method is usually called during setup of
0142: * the <code>Tokenizer</code> but may also be invoked while the tokenizing
0143: * is in progress. It will reset the tokenizers input buffer, line and column
0144: * counters etc.
0145: *<br>
0146: * Subclasses should override this method to do their own actions on a data source
0147: * change. Generally, this base method should be called first in the subclass
0148: * implementation of <code>setSource</code> (equivalent to super calls in
0149: * constructors of derived classes).
0150: *
0151: * @param source a {@link TokenizerSource} to read data from
0152: * @see #getSource
0153: */
0154: public void setSource(TokenizerSource source) {
0155: _source = source;
0156: _eofReached = false;
0157: _currentReadPos = 0;
0158: _currentWritePos = 0;
0159: if (isFlagSet(Flags.F_COUNT_LINES)) {
0160: _lineNumber = 0;
0161: _columnNumber = 0;
0162: } else {
0163: _lineNumber = -1;
0164: _columnNumber = -1;
0165: }
0166: Arrays.fill(_scannedToken, null);
0167: }
0168:
  /**
   * Convenience method to avoid the construction of a {@link TokenizerSource}
   * from the most important data source {@link java.io.Reader}. The reader is
   * wrapped in a {@link ReaderSource}.
   *
   * @param reader the {@link java.io.Reader} to get data from
   */
  public void setSource(Reader reader) {
    setSource(new ReaderSource(reader));
  }
0178:
  /**
   * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
   * method may return <code>null</code> if there is no <code>TokenizerSource</code>
   * associated with it.
   *
   * @return the {@link TokenizerSource} associated with this <code>Tokenizer</code>
   * @see #setSource
   */
  public TokenizerSource getSource() {
    return _source;
  }
0190:
0191: //---------------------------------------------------------------------------
0192: // Methods of the Tokenizer interface
0193: //
0194:
0195: /**
0196: * Setting the tokenizer characteristics. See the method description in
0197: * {@link Tokenizer}.
0198: *
0199: * @param props the {@link TokenizerProperties} for this tokenizer
0200: * @throws NullPointerException if the <code>null</code> is passed to the call
0201: * @see #getTokenizerProperties
0202: */
0203: public void setTokenizerProperties(TokenizerProperties props)
0204: throws NullPointerException {
0205: if (props == null) {
0206: throw new NullPointerException();
0207: }
0208:
0209: // set properties
0210: if (_properties != null) {
0211: _properties.removeTokenizerPropertyListener(this );
0212: }
0213: _properties = props;
0214: _properties.addTokenizerPropertyListener(this );
0215:
0216: // who is going to handle the various token types ?
0217: if (_properties instanceof WhitespaceHandler) {
0218: setWhitespaceHandler((WhitespaceHandler) _properties);
0219: } else {
0220: setWhitespaceHandler(new StandardWhitespaceHandler(
0221: _properties));
0222: }
0223: if (_properties instanceof SeparatorHandler) {
0224: setSeparatorHandler((SeparatorHandler) _properties);
0225: } else {
0226: setSeparatorHandler(new StandardSeparatorHandler(
0227: _properties));
0228: }
0229: if (_properties instanceof SequenceHandler) {
0230: setSequenceHandler((SequenceHandler) _properties);
0231: } else {
0232: setSequenceHandler(new StandardSequenceHandler(_properties));
0233: }
0234: if (props instanceof KeywordHandler) {
0235: setKeywordHandler((KeywordHandler) props);
0236: } else {
0237: setKeywordHandler(new StandardKeywordHandler(_properties));
0238: }
0239: if (_properties instanceof PatternHandler) {
0240: setPatternHandler((PatternHandler) _properties);
0241: } else {
0242: setPatternHandler(null);
0243: }
0244:
0245: // flag handling
0246: int newFlags = _properties.getParseFlags();
0247:
0248: if (newFlags != _flags) {
0249: propertyChanged(new TokenizerPropertyEvent(
0250: TokenizerPropertyEvent.PROPERTY_MODIFIED,
0251: new TokenizerProperty(
0252: TokenizerProperty.PARSE_FLAG_MASK,
0253: new String[] { Integer
0254: .toBinaryString(newFlags) }),
0255: new TokenizerProperty(
0256: TokenizerProperty.PARSE_FLAG_MASK,
0257: new String[] { Integer
0258: .toBinaryString(_flags) })));
0259: }
0260: }
0261:
  /**
   * Retrieving the current tokenizer characteristics. See the method description
   * in {@link Tokenizer}.
   *
   * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
   * @see #setTokenizerProperties
   */
  public TokenizerProperties getTokenizerProperties() {
    return _properties;
  }
0272:
0273: /**
0274: * Setting the control flags of the <code>Tokenizer</code>. See the method
0275: * description in {@link Tokenizer}.
0276: *
0277: * @param flags the parser control flags
0278: * @param mask the mask for the flags to set or unset
0279: * @throws TokenizerException if one or more of the flags given cannot be honored
0280: * @see #getParseFlags
0281: */
0282: public void changeParseFlags(int flags, int mask)
0283: throws TokenizerException {
0284: // test the given flags
0285: if ((mask | VALID_FLAGS_MASK) != VALID_FLAGS_MASK) {
0286: throw new TokenizerException(
0287: "One or more flags cannot be set separately for a {0}. Violating flags in {1}: {2}.",
0288: new Object[] {
0289: AbstractTokenizer.class.getName(),
0290: Integer.toHexString(flags),
0291: Integer.toHexString(mask
0292: & ~VALID_FLAGS_MASK) });
0293: }
0294:
0295: // set the new flags for this tokenizer
0296: _flagMask = mask;
0297: _flags = (flags & mask)
0298: | (getTokenizerProperties().getParseFlags() & ~mask);
0299:
0300: // when counting lines initialize the current line and column position
0301: if (!isFlagSet(Flags.F_COUNT_LINES)) {
0302: _lineNumber = 0;
0303: _columnNumber = 0;
0304: }
0305: }
0306:
0307: /**
0308: * Retrieving the parser control flags. See the method description in
0309: * {@link Tokenizer}.
0310: *
0311: * @return the current parser control flags
0312: * @see #changeParseFlags
0313: */
0314: public int getParseFlags() {
0315: return (getTokenizerProperties().getParseFlags() & ~_flagMask)
0316: + (_flags & _flagMask);
0317: }
0318:
0319: /**
0320: * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
0321: * previously installed one. See the method description in {@link Tokenizer}.
0322: *
0323: * @param handler the (new) {@link KeywordHandler} to use or <code>null</code>
0324: * to remove it
0325: */
0326: public void setKeywordHandler(
0327: de.susebox.jtopas.spi.KeywordHandler handler) {
0328: synchronized (this ) {
0329: if (handler == _properties) {
0330: if (_properties != null
0331: && _properties.getKeywords().hasNext()) {
0332: _keywordHandler = handler;
0333: } else {
0334: _keywordHandler = null;
0335: }
0336: _internalFlags &= ~IFLAG_EXTERNAL_KEYWORD_HANDLER;
0337: } else {
0338: _keywordHandler = handler;
0339: _internalFlags |= IFLAG_EXTERNAL_KEYWORD_HANDLER;
0340: }
0341: }
0342: }
0343:
0344: /**
0345: * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. See the
0346: * method description in {@link Tokenizer}.
0347: *
0348: * @return the currently active whitespace keyword or <code>null</code>, if
0349: * keyword support is switched off
0350: */
0351: public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler() {
0352: synchronized (this ) {
0353: if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0) {
0354: return (de.susebox.jtopas.spi.KeywordHandler) getTokenizerProperties();
0355: } else {
0356: return _keywordHandler;
0357: }
0358: }
0359: }
0360:
  /**
   * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   * Unlike the keyword/sequence/pattern handler setters, this one performs no
   * synchronization and no special handling of the backing properties object.
   *
   * @param handler the (new) whitespace handler to use or <code>null</code> to
   * switch off whitespace handling
   * @see #getWhitespaceHandler
   */
  public void setWhitespaceHandler(
      de.susebox.jtopas.spi.WhitespaceHandler handler) {
    _whitespaceHandler = handler;
  }
0373:
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return the currently active whitespace handler or null, if the base
   * implementation is working
   */
  public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler() {
    return _whitespaceHandler;
  }
0384:
  /**
   * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
   * previously installed <code>SeparatorHandler</code>. See the method description
   * in {@link Tokenizer}.
   *
   * @param handler the (new) separator handler to use or <code>null</code> to
   * remove it
   * @see #getSeparatorHandler
   */
  public void setSeparatorHandler(
      de.susebox.jtopas.spi.SeparatorHandler handler) {
    _separatorHandler = handler;
  }
0398:
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return the currently active {@link SeparatorHandler} or <code>null</code>,
   * if separators aren't recognized by the tokenizer
   * @see #setSeparatorHandler
   */
  public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler() {
    return _separatorHandler;
  }
0410:
0411: /**
0412: * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
0413: * previously installed one. See the method description in {@link Tokenizer}.
0414: *
0415: * @param handler the (new) {@link SequenceHandler} to use or null to remove it
0416: */
0417: public void setSequenceHandler(
0418: de.susebox.jtopas.spi.SequenceHandler handler) {
0419: synchronized (this ) {
0420: if (handler == _properties) {
0421: if (_properties != null
0422: && (_properties.getSpecialSequences().hasNext()
0423: || _properties.getStrings().hasNext()
0424: || _properties.getBlockComments()
0425: .hasNext() || _properties
0426: .getLineComments().hasNext())) {
0427: _sequenceHandler = handler;
0428: } else {
0429: _sequenceHandler = null;
0430: }
0431: _internalFlags &= ~IFLAG_EXTERNAL_SEQUENCE_HANDLER;
0432: } else {
0433: _sequenceHandler = handler;
0434: _internalFlags |= IFLAG_EXTERNAL_SEQUENCE_HANDLER;
0435: }
0436: }
0437: }
0438:
0439: /**
0440: * Retrieving the current {@link SequenceHandler}. See the method description
0441: * in {@link Tokenizer}.
0442: *
0443: * @return the currently active {@link SequenceHandler} or null, if the base
0444: * implementation is working
0445: */
0446: public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler() {
0447: synchronized (this ) {
0448: if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0) {
0449: return (de.susebox.jtopas.spi.SequenceHandler) getTokenizerProperties();
0450: } else {
0451: return _sequenceHandler;
0452: }
0453: }
0454: }
0455:
0456: /**
0457: * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
0458: * previously installed one. See the method description in {@link Tokenizer}.
0459: *
0460: * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
0461: * use or <code>null</code> to remove it
0462: * @see #getPatternHandler
0463: */
0464: public void setPatternHandler(
0465: de.susebox.jtopas.spi.PatternHandler handler) {
0466: synchronized (this ) {
0467: if (handler == _properties) {
0468: if (_properties != null
0469: && _properties.getPatterns().hasNext()) {
0470: _patternHandler = handler;
0471: } else {
0472: _patternHandler = null;
0473: }
0474: _internalFlags &= ~IFLAG_EXTERNAL_PATTERN_HANDLER;
0475: } else {
0476: _patternHandler = handler;
0477: _internalFlags |= IFLAG_EXTERNAL_PATTERN_HANDLER;
0478: }
0479: }
0480: }
0481:
0482: /**
0483: * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. See the
0484: * method description in {@link Tokenizer}.
0485: *
0486: * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
0487: * or <code>null</code>, if patterns are not recognized by the tokenizer
0488: * @see #setPatternHandler
0489: */
0490: public de.susebox.jtopas.spi.PatternHandler getPatternHandler() {
0491: synchronized (this ) {
0492: if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0) {
0493: return (de.susebox.jtopas.spi.PatternHandler) getTokenizerProperties();
0494: } else {
0495: return _patternHandler;
0496: }
0497: }
0498: }
0499:
  /**
   * Query the current row. The method can only be used if the flag {@link TokenizerProperties#F_COUNT_LINES}
   * has been set. Without this flag being set, the return value is undefined.
   *<br>
   * Note that row counting starts with 0, while editors often use 1 for the first
   * row.
   *
   * @return current row (starting with 0)
   * or -1 if the flag {@link TokenizerProperties#F_COUNT_LINES} is not set
   */
  public int getCurrentLine() {
    return _lineNumber;
  }
0513:
  /**
   * Retrieve the current column. The method can only be used if the flag <code>F_COUNT_LINES</code>
   * has been set.
   * Without this flag being set, the return value is undefined.
   * Note that column counting starts with 0, while editors often use 1 for the first
   * column in one row.
   *
   * @return current column number (starting with 0)
   */
  public int getCurrentColumn() {
    return _columnNumber;
  }
0526:
  /**
   * Checking if there are more tokens available. See the method description in
   * {@link Tokenizer}. More tokens are available as long as no EOF token has
   * been fetched yet (or no token at all).
   *
   * @return <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
   * will succeed, <code>false</code> otherwise
   */
  public boolean hasMoreToken() {
    return _scannedToken[0] == null
        || _scannedToken[0].getType() != Token.EOF;
  }
0538:
  /**
   * Retrieving the next {@link Token}. See the method description in
   * {@link Tokenizer}.
   *<br>
   * The method works on a small look-ahead array <code>_scannedToken</code>:
   * index 0 holds the current token, 1 and 2 possible look-ahead tokens. The
   * type classifiers (<code>isEOF</code>, <code>isWhitespace</code>, ...) fill
   * slot 1 as a side effect; only if none of them matched a NORMAL token is
   * created. Non-returnable tokens (plain whitespace, comments, depending on
   * flags) are consumed and the loop continues until a returnable token
   * appears.
   *
   * @return found {@link Token} including the EOF token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   * (IOExceptions for instance)
   */
  public Token nextToken() throws TokenizerException {
    boolean returnIt = false;

    // Get the next token
    __MAIN_LOOP__: do {
      // analyze look-ahead token; the classifier chain stops at the first
      // match, each classifier is expected to fill _scannedToken[1]
      if (_scannedToken[1] == null) {
        if (!isEOF(0)) {
          if (!isWhitespace(0)) {
            if (!isPattern(0, false)) {
              if (!isSpecialSequence(0)) {
                if (!isSeparator(0)) {
                  _scannedToken[1] = new Token(
                      Token.NORMAL);
                }
              }
            }
          }
        }
      }
      // shift the look-ahead pipeline by one position
      _scannedToken[0] = _scannedToken[1];
      _scannedToken[1] = _scannedToken[2];
      _scannedToken[2] = null;

      // get new token or complete the previously found look-ahead token;
      // the companion set by the classifiers is the TokenizerProperty that
      // matched (may be null for NORMAL tokens)
      Token token = _scannedToken[0];
      TokenizerProperty prop = (TokenizerProperty) token
          .getCompanion();

      token.setCompanion((prop != null) ? prop.getCompanion()
          : null);
      token.setStartPosition(getReadPosition());
      token.setStartLine(_lineNumber);
      token.setStartColumn(_columnNumber);

      returnIt = true;

      // determine the token length and whether the token is returned to the
      // caller (whitespaces and comments depend on the respective flags)
      switch (token.getType()) {
      case Token.EOF:
        token.setLength(0);
        break;
      case Token.WHITESPACE:
        token.setLength(completeWhitespace());
        returnIt = isFlagSet(Flags.F_RETURN_SIMPLE_WHITESPACES);
        break;
      case Token.SEPARATOR: // Separators are always single characters.
        token.setLength(1);
        break;
      case Token.STRING:
        token.setLength(completeString(prop));
        break;
      case Token.LINE_COMMENT:
        token.setLength(completeLineComment(prop));
        returnIt = isFlagSet(prop, Flags.F_RETURN_LINE_COMMENTS);
        break;
      case Token.BLOCK_COMMENT:
        token.setLength(completeBlockComment(prop));
        returnIt = isFlagSet(prop,
            Flags.F_RETURN_BLOCK_COMMENTS);
        break;
      case Token.SPECIAL_SEQUENCE:
        token.setLength(prop.getImages()[0].length());
        break;
      case Token.PATTERN:
        // already contained in the first look-ahead token, see token shifting
        break;
      default:
        prop = completeBoundedToken(token);
      }

      // compute new line and column positions (if flag is set) and complete
      // the token
      adjustLineAndColumn(token.getType(), token.getLength());
      token.setEndLine(_lineNumber);
      token.setEndColumn(_columnNumber);

      // need to extract the image ? Flags on the matched property override
      // the tokenizer-wide flags.
      if (returnIt) {
        boolean tokenPosOnly = (prop != null) ? isFlagSet(prop,
            Flags.F_TOKEN_POS_ONLY)
            : isFlagSet(Flags.F_TOKEN_POS_ONLY);
        boolean returnImageParts = (prop != null) ? isFlagSet(
            prop, Flags.F_RETURN_IMAGE_PARTS)
            : isFlagSet(Flags.F_RETURN_IMAGE_PARTS);
        if (!tokenPosOnly || returnImageParts) {
          token.setImage(getText(_currentReadPos, token
              .getLength()));
        }
        // split the image into its logical parts (lines, string contents,
        // comment body) depending on the token type
        if (returnImageParts) {
          switch (token.getType()) {
          case Token.WHITESPACE:
            token.setImageParts(splitIntoLines(token
                .getImage()));
            break;
          case Token.STRING:
            token.setImageParts(splitString(prop, token
                .getImage()));
            break;
          case Token.LINE_COMMENT:
            // strip the leading comment starter sequence
            token.setImageParts(splitIntoLines(token
                .getImage().substring(
                    prop.getImages()[0].length())));
            break;
          case Token.BLOCK_COMMENT:
            token.setImageParts(splitBlockComment(prop,
                token.getImage()));
            break;
          case Token.PATTERN:
            break;
          case Token.EOF:
            token.setImageParts(new String[] {});
            break;
          default:
            token.setImageParts(new String[] { token
                .getImage() });
          }
        }
      }

      // this is the one and only point where the current read position is
      // adjusted (except for the data shifting in readMoreData).
      _currentReadPos += token.getLength();

    } while (!returnIt);

    // the current token is the first in the list
    return _scannedToken[0];
  }
0675:
  /**
   * This method is a convenience method. It returns only the next token image
   * without any informations about its type or associated information. See the
   * method description in {@link Tokenizer}.
   *
   * @return the token image of the next token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   * (IOExceptions for instance)
   * @see #currentImage
   */
  public String nextImage() throws TokenizerException {
    nextToken();
    return currentImage();
  }
0690:
  /**
   * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}.
   * See the method description in {@link Tokenizer}.
   *
   * @return the {@link Token} retrieved by the last call to {@link #nextToken}.
   * @throws TokenizerException if the tokenizer has no current token
   */
  public Token currentToken() throws TokenizerException {
    if (_scannedToken[0] == null) {
      throw new TokenizerException(
          "No current token available (nextToken was not called / read position changed)");
    }
    return _scannedToken[0];
  }
0705:
0706: /**
0707: * Convenience method to retrieve only the token image of the {@link Token} that
0708: * would be returned by {@link #currentToken}. See the method description in
0709: * {@link Tokenizer}.
0710: *
0711: * @return the token image of the current token
0712: * @see #currentToken
0713: */
0714: public String currentImage() throws TokenizerException {
0715: Token token = currentToken();
0716:
0717: if (token.getType() == Token.EOF) {
0718: return null;
0719: } else if (!isFlagSet(Flags.F_TOKEN_POS_ONLY)
0720: || token.getImage() != null) {
0721: return token.getImage();
0722: } else {
0723: return getText(token.getStartPosition(), token.getLength());
0724: }
0725: }
0726:
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the line number starting with 0 in the input stream. See the method
   * description in {@link Tokenizer}.
   *
   * @return the current line number starting with 0 or -1 if no line numbers are supplied.
   * @see #getColumnNumber
   */
  public int getLineNumber() {
    return _lineNumber;
  }
0738:
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the current column position starting with 0 in the input stream. See
   * the method description in {@link Tokenizer}.
   *
   * @return the current column position or -1 if no column counting is done
   * @see #getLineNumber
   */
  public int getColumnNumber() {
    return _columnNumber;
  }
0750:
  /**
   * Getting the current read offset. See the method description in
   * {@link Tokenizer}.
   *
   * @return the absolute offset in characters from the start of the data source
   * of the Tokenizer where reading will be continued
   * @see #setReadPositionAbsolute
   * @see #setReadPositionRelative
   */
  public int getReadPosition() {
    return _currentReadPos;
  }
0763:
  /**
   * Retrieving the number of the currently available characters. See the method
   * description in {@link Tokenizer}. This is the size of the text window,
   * i.e. the distance between the window start and the write position.
   *
   * @return number of currently available characters
   */
  public int currentlyAvailable() {
    return _currentWritePos - getRangeStart();
  }
0773:
  /**
   * Try to read more data into the text buffer of the tokenizer. See the method
   * description in {@link Tokenizer}. The actual reading is delegated to the
   * base tokenizer so that embedded tokenizers share one buffer.
   *
   * @return the number of character now available
   * @throws TokenizerException generic exception (list) for all problems that
   * may occur while reading (IOExceptions for instance)
   */
  public int readMore() throws TokenizerException {
    readMoreDataFromBase();
    return currentlyAvailable();
  }
0786:
  /**
   * Returns the character at the given position. The method does not attempt to
   * read more data; the position must lie inside the current text window.
   *
   * @param pos get character on this position in the data stream
   * @return the character at the given position
   * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
   * in the available text range (text window)
   */
  public char getChar(int pos) throws IndexOutOfBoundsException {
    return getBaseDataProvider(pos, 1).getCharAt(0);
  }
0799:
  /**
   * Retrieve text from the currently available range. See the method description
   * in {@link Tokenizer}.
   *
   * @param start position where the text begins
   * @param len length of the text
   * @return the text beginning at the given position with the given length
   * @throws IndexOutOfBoundsException if the starting position or the length
   * is out of the current text window
   */
  public String getText(int start, int len)
      throws IndexOutOfBoundsException {
    return getBaseDataProvider(start, len).toString();
  }
0814:
  /**
   * This method sets the tokenizers current read position to the given absolute
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *<br>
   * Repositioning invalidates all look-ahead tokens; a following
   * {@link #currentToken} call will fail until {@link #nextToken} is invoked.
   *
   * @param position absolute position for the next parse operation
   * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
   * not in the available text range (text window)
   * @see #setReadPositionRelative
   */
  public void setReadPositionAbsolute(int position)
      throws IndexOutOfBoundsException {
    // validate against the current text window; note that the second check
    // uses a strict '>' although the message says "at or above" —
    // NOTE(review): position == _currentWritePos is accepted here, confirm
    // that this (empty remainder) is intended
    if (position < getRangeStart()) {
      throw new ExtIndexOutOfBoundsException(
          "Invalid read position {0} below the current text window start {1}.",
          new Object[] { new Integer(position),
              new Integer(getRangeStart()) });
    } else if (position > _currentWritePos) {
      throw new ExtIndexOutOfBoundsException(
          "Invalid read position {0} at or above the current text window end {1}.",
          new Object[] {
              new Integer(position),
              new Integer(currentlyAvailable()
                  + getRangeStart()) });
    }
    _currentReadPos = position;
    // discard look-ahead tokens gathered for the old position
    Arrays.fill(_scannedToken, null);

    // adjust line and column counting: find the last line start at or before
    // the new position in the position-to-line map
    if (isFlagSet(Flags.F_COUNT_LINES)) {
      SortedMap map = _position2LineMap.headMap(new Integer(
          position + 1));

      if (map != null && !map.isEmpty()) {
        Integer lastLineStart = (Integer) map.lastKey();

        _lineNumber = ((Integer) map.get(lastLineStart))
            .intValue();
        _columnNumber = position - lastLineStart.intValue();
      } else {
        // position lies before the first recorded line start
        _lineNumber = 0;
        _columnNumber = position;
      }
    }
  }
0864:
  /**
   * This method sets the tokenizers new read position the given number of characters
   * forward (positive value) or backward (negative value) starting from the current
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *
   * @param offset number of characters to move forward (positive offset) or
   * backward (negative offset)
   * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
   * move the read position out of the available text range (text window)
   * @see #setReadPositionAbsolute
   */
  public void setReadPositionRelative(int offset)
      throws IndexOutOfBoundsException {
    setReadPositionAbsolute(getReadPosition() + offset);
  }
0885:
  /**
   * Closing this tokenizer frees resources and deregisters from the
   * associated {@link TokenizerProperties} object. After closing, the
   * tokenizer is in the same (unusable) state as a freshly constructed one
   * without a data source; line/column counters are marked unavailable (-1).
   */
  public void close() {
    // deregister from the properties
    if (_properties != null) {
      _properties.removeTokenizerPropertyListener(this );
      _properties = null;
    }

    // freeing memory
    if (_position2LineMap != null) {
      _position2LineMap.clear();
      _position2LineMap = null;
    }

    // adjust members: reset all positions, flags, handlers and the links
    // into the embedded-tokenizer chain
    _eofReached = true;
    _flags = 0;
    _flagMask = 0;
    _internalFlags = 0;
    _currentReadPos = 0;
    _currentWritePos = 0;
    _lineNumber = -1;
    _columnNumber = -1;
    _nextTokenizer = null;
    _prevTokenizer = null;
    _whitespaceHandler = null;
    _separatorHandler = null;
    _keywordHandler = null;
    _sequenceHandler = null;
    _patternHandler = null;
    _source = null;
    Arrays.fill(_scannedToken, null);
  }
0922:
0923: //---------------------------------------------------------------------------
0924: // embedded tokenizer support
0925: //
0926:
0927: /**
0928: * Adding an embedded tokenizer. Embedded tokenizer work on the same input
0929: * buffer as their base tokenizer. A situation where embedded tokenizer could
0930: * be applied, is a HTML stream with cascading style sheet (CSS) and JavaScript
0931: * parts.
0932: *<br>
0933: * There are no internal means of switching from one tokenizer to another.
0934: * This should be done by the caller using the method {@link #switchTo}.
0935: *<br>
0936: * The {@link TokenizerProperties#F_KEEP_DATA} and {@link TokenizerProperties#F_COUNT_LINES}
0937: * flags of the base tokenizer take effect also in the embedded tokenizers.
0938: *<br>
0939: * Since is might be possible that the given <code>tokenizer</code> is a
0940: * derivation of the <code>AbstractTokenizer</code> class, this method is
0941: * synchronized on <code>tokenizer</code>.
0942: *
0943: * @param tokenizer an embedded tokenizer
0944: * @throws TokenizerException if something goes wrong (not likely :-)
0945: */
0946: public void addTokenizer(AbstractTokenizer tokenizer)
0947: throws TokenizerException {
0948: AbstractTokenizer curr = this ;
0949:
0950: while (curr._nextTokenizer != null) {
0951: curr = curr._nextTokenizer;
0952: }
0953:
0954: if (tokenizer != null) {
0955: synchronized (tokenizer) {
0956: curr._nextTokenizer = tokenizer;
0957: tokenizer._prevTokenizer = curr;
0958:
0959: // share the input buffer of the base tokenizer
0960: AbstractTokenizer baseTokenizer = getBaseTokenizer();
0961:
0962: tokenizer._baseTokenizer = baseTokenizer;
0963:
0964: // inherited flags
0965: tokenizer.changeParseFlags(baseTokenizer
0966: .getParseFlags(), Flags.F_COUNT_LINES);
0967: }
0968: }
0969: }
0970:
0971: /**
0972: * Changing fron one tokenizer to another. If the given tokenizer has not been
0973: * added with {@link #addTokenizer}, an exception is thrown.<br>
0974: * The <code>switchTo</code> method does the nessecary synchronisation between
0975: * <code>this</code> and the given tokenizer. The user is therefore responsible
0976: * to use <code>switchTo</code> whenever a tokenizer change is nessecary. It
0977: * must be done this way:
0978: *<blockquote><pre>
0979: * Tokenizer base = new MyTokenizer(...)
0980: * Tokenizer embedded = new MyTokenizer(...)
0981: *
0982: * // setting properties (comments, keywords etc.)
0983: * ...
0984: *
0985: * // embedding a tokenizer
0986: * base.addTokenizer(embedded);
0987: *
0988: * // tokenizing with base
0989: * ...
0990: * if (<i>switch_condition</i>) {
0991: * base.switchTo(embedded);
0992: * }
0993: *
0994: * // tokenizing with embedded
0995: * ...
0996: * if (<i>switch_condition</i>) {
0997: * embedded.switchTo(base);
0998: * }
0999: *</pre></blockquote>
1000: * That way we avoid a more complex synchronisation between tokenizers whenever
1001: * one of them parses the next data in the input stream. However, the danger
1002: * of not synchronized tokenizers remains, so take care.
1003: *<br>
1004: * Since is might be possible that the given <code>tokenizer</code> is a
1005: * derivation of the <code>AbstractTokenizer</code> class, this method is
1006: * synchronized on <code>tokenizer</code>.
1007: *
1008: * @param tokenizer the tokenizer that should be used from now on
1009: */
1010: public void switchTo(AbstractTokenizer tokenizer)
1011: throws TokenizerException {
1012: if (tokenizer != null) {
1013: synchronized (tokenizer) {
1014: if (tokenizer._baseTokenizer != _baseTokenizer) {
1015: throw new TokenizerException(
1016: "Trying to switch to an alien tokenizer (not added with addTokenizer).",
1017: null);
1018: }
1019: tokenizer._eofReached = this ._eofReached;
1020: tokenizer._currentReadPos = this ._currentReadPos;
1021: tokenizer._currentWritePos = this ._currentWritePos;
1022: tokenizer._columnNumber = this ._columnNumber;
1023: tokenizer._lineNumber = this ._lineNumber;
1024: tokenizer._position2LineMap = this ._position2LineMap;
1025: }
1026: } else {
1027: throw new TokenizerException(new NullPointerException());
1028: }
1029: }
1030:
1031: //---------------------------------------------------------------------------
1032: // Methods that may be overwritten in derived classes
1033: //
1034:
1035: /**
1036: * This method checks if the character is a whitespace. Implement Your own
1037: * code for situations where this default implementation is not fast enough
1038: * or otherwise not really good.
1039: *
1040: * @param testChar check this character
1041: * @return <code>true</code> if the given character is a whitespace,
1042: * <code>false</code> otherwise
1043: */
1044: protected boolean isWhitespace(char testChar) {
1045: if (_whitespaceHandler != null) {
1046: return _whitespaceHandler.isWhitespace(testChar);
1047: } else {
1048: return false;
1049: }
1050: }
1051:
1052: /**
1053: * This method detects the number of whitespace characters starting at the given
1054: * position. It should return the number of characters identified as whitespaces
1055: * starting from and including the given start position.
1056: *<br>
1057: * Then overriding this method, use {@link #getBaseDataProvider} to access characters.
1058: *<br>
1059: * Do not attempt to actually read more data or do anything that leads to the
1060: * change of the data source or to tokenizer switching. This is done by the
1061: * tokenizer framework.
1062: *
1063: * @param startingAtPos start checking for whitespace from this position
1064: * @param maxChars if there is no non-whitespace character, read up to this number of characters
1065: * @return number of whitespace characters starting from the given offset
1066: * @throws TokenizerException failure while reading data from the input stream
1067: */
1068: protected int readWhitespaces(int startingAtPos, int maxChars)
1069: throws TokenizerException {
1070: if (_whitespaceHandler != null) {
1071: DataProvider dataProvider = getBaseDataProvider(
1072: startingAtPos, maxChars);
1073: return _whitespaceHandler
1074: .countLeadingWhitespaces(dataProvider);
1075: } else {
1076: return 0;
1077: }
1078: }
1079:
1080: /**
1081: * This method checks if the character sequence starting at a given position
1082: * with a given lenghth is a keyword. If so, it returns the keyword description
1083: * as {@link TokenizerProperty} object.
1084: *
1085: * @param startingAtPos check at this position
1086: * @param length the candidate has this number of characters
1087: * @throws TokenizerException routed exception from the active {@link de.susebox.jtopas.spi.KeywordHandler}
1088: * @return {@link TokenizerProperty} describing the keyword or <code>null</code>
1089: */
1090: protected TokenizerProperty isKeyword(int startingAtPos, int length)
1091: throws TokenizerException {
1092: if (_keywordHandler != null) {
1093: DataProvider dataProvider = getBaseDataProvider(
1094: startingAtPos, length);
1095: return _keywordHandler.isKeyword(dataProvider);
1096: } else {
1097: return null;
1098: }
1099: }
1100:
1101: //---------------------------------------------------------------------------
1102: // TokenizerPropertyListener methods
1103: //
1104:
1105: /**
1106: * Splits a given String into lines. The method ist used to retrieve the
1107: * image parts of several token types.
1108: *
1109: * @param image split this string into lines
1110: * @return an array containing the lines of the image without line separator
1111: * characters
1112: */
1113: protected String[] splitIntoLines(String image) {
1114: LinkedList lines = new LinkedList();
1115: int index = 0;
1116: int start = 0;
1117:
1118: while (index < image.length()) {
1119: switch (image.charAt(index)) {
1120: case '\r':
1121: lines.add(image.substring(start, index));
1122: if (index + 1 < image.length()
1123: && image.charAt(index + 1) == '\n') {
1124: index += 2;
1125: } else {
1126: index++;
1127: }
1128: start = index;
1129: break;
1130: case '\n':
1131: lines.add(image.substring(start, index));
1132: start = ++index;
1133: break;
1134: default:
1135: index++;
1136: }
1137: }
1138:
1139: if (start < index || start > 0) {
1140: lines.add(image.substring(start, index));
1141: }
1142:
1143: return (String[]) lines.toArray(new String[lines.size()]);
1144: }
1145:
1146: /**
1147: * Splits a given string into lines and removing string escapes. The method is
1148: * used to retrieve the image parts for string token types.
1149: *
1150: * @param prop the {@link TokenizerProperty} describing a string
1151: * @param image split this string into lines
1152: * @return an array containing the lines of the image without line separator
1153: * characters
1154: */
1155: protected String[] splitString(TokenizerProperty prop, String image) {
1156: // complete string
1157: String[] images = prop.getImages();
1158: String begin = images[0];
1159: String end = images[1];
1160: String esc = images[2];
1161: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1162: boolean escEqualsEnd = (!noCase && esc.compareTo(end) == 0)
1163: || (noCase && esc.compareToIgnoreCase(end) == 0);
1164:
1165: StringBuffer buffer = null;
1166: int index = begin.length();
1167: int start = index;
1168: int endIndex;
1169:
1170: if (image.length() - start >= end.length()
1171: && ((!noCase && end.equals(image.substring(image
1172: .length()
1173: - end.length()))) || (noCase && end
1174: .equalsIgnoreCase(image.substring(image
1175: .length()
1176: - end.length()))))) {
1177: endIndex = image.length() - end.length();
1178: } else {
1179: endIndex = image.length();
1180: }
1181:
1182: while (index < endIndex) {
1183: if ((!noCase && image.startsWith(esc, index))
1184: || (noCase && image.substring(index,
1185: index + esc.length()).equalsIgnoreCase(esc))) {
1186: if (buffer == null) {
1187: buffer = new StringBuffer(image.length());
1188: }
1189: buffer.append(image.substring(start, index));
1190: index += esc.length();
1191: if (index < image.length()) {
1192: if ((!noCase && image.startsWith(esc, index))
1193: || (noCase && image.substring(index,
1194: index + esc.length())
1195: .equalsIgnoreCase(esc))) {
1196: buffer.append(esc);
1197: index += esc.length();
1198: } else if ((!noCase && image.startsWith(begin,
1199: index))
1200: || (noCase && image.substring(index,
1201: index + begin.length())
1202: .equalsIgnoreCase(begin))) {
1203: buffer.append(begin);
1204: index += begin.length();
1205: } else if ((!noCase && image.startsWith(end, index))
1206: || (noCase && image.substring(index,
1207: index + end.length())
1208: .equalsIgnoreCase(end))) {
1209: buffer.append(end);
1210: index += end.length();
1211: }
1212: }
1213: start = index;
1214: }
1215: index++;
1216: }
1217:
1218: if (buffer != null && start < index) {
1219: buffer.append(image.substring(start, endIndex));
1220: }
1221:
1222: return splitIntoLines((buffer != null) ? buffer.toString()
1223: : image.substring(start, endIndex));
1224: }
1225:
1226: /**
1227: * Splits a given block comment into lines. The method is used to retrieve the
1228: * image parts for block comment token types.
1229: *
1230: * @param prop the {@link TokenizerProperty} describing a block comment
1231: * @param image split this string into lines
1232: * @return an array containing the lines of the image without line separator
1233: * characters
1234: */
1235: protected String[] splitBlockComment(TokenizerProperty prop,
1236: String image) {
1237: // complete string
1238: String[] images = prop.getImages();
1239: String start = images[0];
1240: String end = images[1];
1241: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1242:
1243: if (image.length() - start.length() >= end.length()
1244: && ((!noCase && end.equals(image.substring(image
1245: .length()
1246: - end.length()))) || (noCase && end
1247: .equalsIgnoreCase(image.substring(image
1248: .length()
1249: - end.length()))))) {
1250: return splitIntoLines(image.substring(start.length(), image
1251: .length()
1252: - end.length()));
1253: } else {
1254: return splitIntoLines(image.substring(start.length()));
1255: }
1256: }
1257:
1258: /**
1259: * Event handler method. The given {@link TokenizerPropertyEvent} parameter
1260: * contains the nessecary information about the property change. We choose
1261: * one single method in favour of various more specialized methods since the
1262: * reactions on adding, removing and modifying tokenizer properties are often
1263: * the same (flushing cash, rereading information etc.) are probably not very
1264: * different.
1265: *<br>
1266: * Note that a modification of the parse flags in the backing {@link TokenizerProperties}
1267: * object removes all flags previously modified through {@link #changeParseFlags}.
1268: *
1269: * @param event the {@link TokenizerPropertyEvent} that describes the change
1270: */
1271: public void propertyChanged(TokenizerPropertyEvent event) {
1272: TokenizerProperty prop = event.getProperty();
1273: String[] images = prop.getImages();
1274:
1275: synchronized (this ) {
1276: switch (event.getType()) {
1277: case TokenizerPropertyEvent.PROPERTY_ADDED:
1278: case TokenizerPropertyEvent.PROPERTY_REMOVED:
1279: switch (prop.getType()) {
1280: case Token.LINE_COMMENT:
1281: case Token.BLOCK_COMMENT:
1282: case Token.STRING:
1283: case Token.SPECIAL_SEQUENCE:
1284: if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0
1285: && _properties instanceof de.susebox.jtopas.spi.SequenceHandler) {
1286: setSequenceHandler((de.susebox.jtopas.spi.SequenceHandler) _properties);
1287: }
1288: break;
1289: case Token.KEYWORD:
1290: if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0
1291: && _properties instanceof de.susebox.jtopas.spi.KeywordHandler) {
1292: setKeywordHandler((de.susebox.jtopas.spi.KeywordHandler) _properties);
1293: }
1294: break;
1295: case Token.PATTERN:
1296: if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0
1297: && _properties instanceof de.susebox.jtopas.spi.PatternHandler) {
1298: setPatternHandler((de.susebox.jtopas.spi.PatternHandler) _properties);
1299: }
1300: break;
1301: }
1302: break;
1303:
1304: case TokenizerPropertyEvent.PROPERTY_MODIFIED:
1305: switch (prop.getType()) {
1306: case TokenizerProperty.PARSE_FLAG_MASK:
1307: _flags = getTokenizerProperties().getParseFlags();
1308: _flagMask = 0;
1309: if (isFlagSet(Flags.F_COUNT_LINES)) {
1310: if (_lineNumber < 0) {
1311: if (_position2LineMap != null) {
1312: _position2LineMap.clear();
1313: }
1314: _lineNumber = 0;
1315: putPosition(_currentReadPos, _lineNumber);
1316: }
1317: if (_columnNumber < 0) {
1318: _columnNumber = 0;
1319: }
1320: } else {
1321: _lineNumber = -1;
1322: _columnNumber = -1;
1323: }
1324: break;
1325: }
1326: break;
1327: }
1328: }
1329: }
1330:
1331: //---------------------------------------------------------------------------
1332: // Implementation
1333: //
1334:
1335: /**
1336: * Embedded tokenizers have their base tokenizer they share the input stream
1337: * with.
1338: *
1339: * @return the base tokenizer (the one owning the input stream and text buffer)
1340: */
1341: protected AbstractTokenizer getBaseTokenizer() {
1342: return _baseTokenizer;
1343: }
1344:
1345: /**
1346: * Returns the {@link de.susebox.jtopas.spi.DataProvider} of the base tokenizer.
1347: * This is this tokenizer if it is not an embedded one.
1348: *
1349: * @param startPos position in the input data
1350: * @param length number of characters
1351: * @return the <code>DataProvider</code> for the given data range
1352: */
1353: protected DataProvider getBaseDataProvider(int startPos, int length) {
1354: return getBaseTokenizer().getDataProvider(startPos, length);
1355: }
1356:
1357: /**
1358: * This method organizes the input buffer. It moves the current text window if
1359: * nessecary or allocates more space, if data should be kept completely (see the
1360: * {@link TokenizerProperties#F_KEEP_DATA} flag).
1361: * Its main purpose is to call the {@link TokenizerSource#read} method.
1362: *
1363: * @return number of read bytes or -1 if an end-of-file condition occured
1364: * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
1365: * method
1366: */
1367: protected int readMoreDataFromBase() throws TokenizerException {
1368: // its always the base tokenizer doing the reading
1369: int readChars = -1;
1370:
1371: if (!_eofReached) {
1372: AbstractTokenizer baseTokenizer = getBaseTokenizer();
1373:
1374: if (baseTokenizer != this ) {
1375: readChars = baseTokenizer.readMoreData();
1376: } else {
1377: readChars = readMoreData();
1378: }
1379: if (readChars > 0) {
1380: _currentWritePos += readChars;
1381: } else if (readChars < 0) {
1382: readChars = -1;
1383: _eofReached = true;
1384: }
1385:
1386: // Inform all embedded tokenizers about input buffer changes
1387: synchronizeAll();
1388: }
1389: return readChars;
1390: }
1391:
1392: /**
1393: * When the method {@link #readMoreData} changes the contents of the input buffer
1394: * or the input buffer itself, all embedded tokenizers must be synchronized.
1395: * That means their member variables are adjusted to the base tokenizer.
1396: *
1397: * @throws TokenizerException if something goes wrong
1398: */
1399: protected void synchronizeAll() throws TokenizerException {
1400: AbstractTokenizer embedded = getBaseTokenizer();
1401:
1402: while ((embedded = embedded._nextTokenizer) != null) {
1403: switchTo(embedded); // adjust the member variables
1404: }
1405: }
1406:
1407: /**
1408: * Checks the EOF condition at the given offset.
1409: *
1410: * @param offset check at this position relative to the current read position
1411: * @return <code>true</code> if EOF has been reached, <code>false</code> otherwise
1412: * @throws TokenizerException failure while reading data from the input stream
1413: */
1414: protected boolean isEOF(int offset) throws TokenizerException {
1415: if (_currentReadPos + offset < _currentWritePos
1416: || readMoreDataFromBase() > 0) {
1417: return false;
1418: } else {
1419: _scannedToken[1] = new Token(Token.EOF);
1420: return true;
1421: }
1422: }
1423:
1424: /**
1425: * The number of characters until the next comment, whitespace, string, special
1426: * sequence or separator are determined. The character sequnce is then checked
1427: * for keyword or pattern matching.
1428: *
1429: * @param token buffer to receive information about the keyword or normal token
1430: * @return <code>null</code> or a {@link TokenizerProperty} if a keyword or pattern is found
1431: * @throws TokenizerException failure while reading data from the input stream
1432: */
1433: protected TokenizerProperty completeBoundedToken(Token token)
1434: throws TokenizerException {
1435: // find out the return value (length of normal token)
1436: int len = 1; // the first character is a normal one, see call of this method
1437:
1438: while (!(isEOF(len) || isWhitespace(len)
1439: || isPattern(len, true) || isSpecialSequence(len) || isSeparator(len))) {
1440: len++;
1441: }
1442: token.setLength(len);
1443:
1444: // test on keyword or non-free pattern
1445: TokenizerProperty prop = null;
1446: PatternHandler.Result result;
1447:
1448: if ((prop = isKeyword(_currentReadPos, len)) != null) {
1449: token.setType(Token.KEYWORD);
1450: token.setCompanion(prop.getCompanion());
1451: } else {
1452: token.setType(Token.NORMAL);
1453: }
1454: return prop;
1455: }
1456:
1457: /**
1458: * After having identified a whitespace, this method continues to read data
1459: * until it detects a non-whitespace.
1460: *
1461: * @return number of consecutive whitespaces
1462: * @throws TokenizerException failure while reading data from the input stream
1463: */
1464: protected int completeWhitespace() throws TokenizerException {
1465: int start = _currentReadPos + 1; // the first whitespace we have already
1466: int available = _currentWritePos - start;
1467: int len = readWhitespaces(start, available);
1468:
1469: while (len == available) {
1470: if (readMoreDataFromBase() <= 0) {
1471: break;
1472: }
1473: start += len;
1474: available = _currentWritePos - start;
1475: len += readWhitespaces(start, available);
1476: }
1477: return len + 1; // the first whitespace we had already
1478: }
1479:
1480: /**
1481: * This method checks at the given offset if it is a whitespace.
1482: *
1483: * @param offset check at this position relative to the current read position
1484: * @throws TokenizerException failure while reading data from the input stream
1485: * @return <code>true</code> if a whitespace sequence was found at the given offset,
1486: * <code>false</code> otherwise
1487: */
1488: protected boolean isWhitespace(int offset)
1489: throws TokenizerException {
1490: if (_whitespaceHandler != null) {
1491: if (_currentReadPos + offset >= _currentWritePos
1492: && readMoreDataFromBase() < 0) {
1493: return false;
1494: }
1495:
1496: if (isWhitespace(getChar(_currentReadPos + offset))) {
1497: _scannedToken[1] = new Token(Token.WHITESPACE);
1498: return true;
1499: }
1500: }
1501: return false;
1502: }
1503:
1504: /**
1505: * This method checks at the given offset if it contains a separator.
1506: *
1507: * @param offset check at this position relative to the current read position
1508: * @throws TokenizerException failure while reading data from the input stream
1509: * @return <code>true</code> if a separator was found atthe given offset,
1510: * <code>false</code> otherwise
1511: */
1512: protected boolean isSeparator(int offset) throws TokenizerException {
1513: if (_separatorHandler != null
1514: && (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0)
1515: && _separatorHandler
1516: .isSeparator(getChar(_currentReadPos + offset))) {
1517: _scannedToken[1] = new Token(Token.SEPARATOR);
1518: return true;
1519: } else {
1520: return false;
1521: }
1522: }
1523:
1524: /**
1525: * Testing for pattern matching.
1526: *
1527: * @param offset check at this position relative to the current read position
1528: * @param freePatternOnly if <code>true</code> consider only pattern that can occur anywhere in the data
1529: * @throws TokenizerException failure while reading data from the input stream
1530: * @return <code>true</code> if a pattern match was found at the given offset,
1531: * <code>false</code> otherwise
1532: */
1533: protected boolean isPattern(int offset, boolean freePatternOnly)
1534: throws TokenizerException {
1535: if (_patternHandler != null) {
1536: // for pattern, we might need a lot of data
1537: int startingAtPos = _currentReadPos + offset;
1538:
1539: while (_currentWritePos - startingAtPos < PATTERN_MAX_SIZE) {
1540: if (readMoreDataFromBase() <= 0) {
1541: break;
1542: }
1543: }
1544:
1545: // try pattern matching
1546: DataProvider dataProvider = getBaseDataProvider(
1547: startingAtPos, _currentWritePos - startingAtPos);
1548: PatternHandler.Result result = _patternHandler
1549: .matches(dataProvider);
1550: boolean isFree = (result != null) ? isFlagSet(result
1551: .getProperty(), Flags.F_FREE_PATTERN) : false;
1552:
1553: if (result != null && (isFree || !freePatternOnly)) {
1554: if (!isFree) {
1555: int nextOffset = offset + result.getLengthOfMatch();
1556:
1557: if (isEOF(nextOffset) || isWhitespace(nextOffset)
1558: || isPattern(nextOffset, true)
1559: || isSpecialSequence(nextOffset)
1560: || isSeparator(nextOffset)) {
1561: _scannedToken[2] = _scannedToken[1];
1562: } else {
1563: return false;
1564: }
1565: }
1566: _scannedToken[1] = new Token(Token.PATTERN, null,
1567: result.getProperty());
1568: _scannedToken[1].setLength(result.getLengthOfMatch());
1569: if (isFlagSet(result.getProperty(),
1570: Flags.F_RETURN_IMAGE_PARTS)) {
1571: _scannedToken[1].setImageParts(result.getGroups());
1572: }
1573: return true;
1574: }
1575: }
1576:
1577: // no pattern matching available or no match found
1578: return false;
1579: }
1580:
1581: /**
1582: * This method checks at the given offset if it contains a a special sequence.
1583: * Unlike the method {@link #test4SpecialSequence} it does nothing more.
1584: *
1585: * @param offset check at this position relative to the current read position
1586: * @throws TokenizerException failure while reading data from the input stream
1587: * @return <code>true</code> if a special sequence was found at the given offset,
1588: * <code>false</code> otherwise
1589: */
1590: protected boolean isSpecialSequence(int offset)
1591: throws TokenizerException {
1592: if (_sequenceHandler != null) {
1593: // do we need more data to ensure enough characters for even the longest
1594: // possible sequence match
1595: int startingAtPos = _currentReadPos + offset;
1596:
1597: while (_sequenceHandler.getSequenceMaxLength() > _currentWritePos
1598: - startingAtPos) {
1599: if (readMoreDataFromBase() <= 0) {
1600: break;
1601: }
1602: }
1603:
1604: // invoke the sequence handler
1605: DataProvider dataProvider = getBaseDataProvider(
1606: startingAtPos, _currentWritePos - startingAtPos);
1607: TokenizerProperty prop = _sequenceHandler
1608: .startsWithSequenceCommentOrString(dataProvider);
1609:
1610: if (prop != null) {
1611: _scannedToken[1] = new Token(prop.getType(), null, prop);
1612: return true;
1613: }
1614: }
1615:
1616: // no sequence handler given or no special sequence at given offset
1617: return false;
1618: }
1619:
1620: /**
1621: * Completing a line comment. After a line comment sequence has been found, all
1622: * characters up to and including the end-of-line combination belong to the
1623: * line comment. Note that on reaching end-of-file a line comment does not
1624: * nessecarily ends with an end-of-line sequence (linefeed for example).
1625: *
1626: * @param prop the property describing the line comment to complete
1627: * @return length of the line comment
1628: * @throws TokenizerException failure while reading data from the input stream
1629: */
1630: protected int completeLineComment(TokenizerProperty prop)
1631: throws TokenizerException {
1632: String[] images = prop.getImages();
1633: int len = images[0].length();
1634:
1635: while (_currentReadPos + len < _currentWritePos
1636: || readMoreDataFromBase() > 0) {
1637: switch (getChar(_currentReadPos + len)) {
1638: case '\r':
1639: len++;
1640: if (_currentReadPos + len < _currentWritePos
1641: || readMoreDataFromBase() > 0) {
1642: if (getChar(_currentReadPos + len) == '\n') {
1643: len++;
1644: }
1645: }
1646: return len;
1647: case '\n':
1648: len++;
1649: return len;
1650: default:
1651: len++;
1652: }
1653: }
1654: return len;
1655: }
1656:
1657: /**
1658: * Completing a block comment. After a block comment sequence has been found, all
1659: * characters up to and including the end sequence of the block comment belong
1660: * to the block comment. Note that on reaching end-of-file a block comment does
1661: * not nessecarily ends with an end-of-block-comment sequence.
1662: *
1663: * @param prop the property describing the block comment to complete
1664: * @return length of the block comment
1665: * @throws TokenizerException failure while reading data from the input stream
1666: */
1667: protected int completeBlockComment(TokenizerProperty prop)
1668: throws TokenizerException {
1669: String[] images = prop.getImages();
1670: String start = images[0];
1671: String end = images[1];
1672: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1673: boolean nested = isFlagSet(prop, Flags.F_ALLOW_NESTED_COMMENTS);
1674: int len = start.length();
1675: int level = 0;
1676:
1677: __LOOP__: do {
1678: // test on nested comments: we take only care for nesting the same
1679: // block comment
1680: if (nested) {
1681: switch (comparePrefix(len, start, noCase)) {
1682: case 0: // comment start identified
1683: level++;
1684: len += start.length();
1685: continue __LOOP__;
1686: case -1: // EOF reached
1687: return _currentWritePos - _currentReadPos;
1688: }
1689: }
1690:
1691: // is it the end ?
1692: switch (comparePrefix(len, end, noCase)) {
1693: case 0: // comment end identified
1694: level--;
1695: len += end.length();
1696: break;
1697: case -1: // EOF reached
1698: return _currentWritePos - _currentReadPos;
1699: default:
1700: len++;
1701: }
1702: } while (level >= 0);
1703:
1704: // block comment regularly terminated
1705: return len;
1706: }
1707:
1708: /**
1709: * Completing a string. After a string start sequence has been found, all
1710: * characters up to and including the end-of-string sequence belong to the
1711: * string. Note that on reaching end-of-file a string does not nessecarily ends
1712: * with an end-of-string sequence.
1713: *
1714: * @param prop the property describing the string to complete
1715: * @return length of the string
1716: * @throws TokenizerException failure while reading data from the input stream
1717: */
1718: protected int completeString(TokenizerProperty prop)
1719: throws TokenizerException {
1720: // complete string
1721: String[] images = prop.getImages();
1722: String start = images[0];
1723: String end = images[1];
1724: String esc = images[2];
1725: int len = start.length();
1726: boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1727: boolean escEqualsEnd = (!noCase && esc.compareTo(end) == 0)
1728: || (noCase && esc.compareToIgnoreCase(end) == 0);
1729:
1730: while (true) {
1731: // test on escape
1732: if (esc != null) {
1733: switch (comparePrefix(len, esc, noCase)) {
1734: case 0: // escape found
1735: len += esc.length();
1736: if (escEqualsEnd) {
1737: switch (comparePrefix(len, end, noCase)) {
1738: case 0:
1739: len += end.length();
1740: break;
1741: case -1: // EOF reached
1742: return _currentWritePos - _currentReadPos;
1743: default: // this is the regular return point if the esc is the string end
1744: return len;
1745: }
1746: } else {
1747: len++; // esc != string end: skip the next character
1748: }
1749: continue;
1750: case -1: // EOF reached
1751: return _currentWritePos - _currentReadPos;
1752: }
1753: }
1754:
1755: // test on end sequence
1756: switch (comparePrefix(len, end, noCase)) {
1757: case 0: // this is the regular return point if esc != string end
1758: len += end.length();
1759: return len;
1760: case -1: // EOF reached
1761: return _currentWritePos - _currentReadPos;
1762: default:
1763: len++;
1764: }
1765: }
1766: }
1767:
1768: /**
1769: * This method compares the characters at the given offset (from the current
1770: * read position) with the given prefix.
1771: *
1772: * @param offset start comparing at this offset from the current read position
1773: * @param prefic compare read data with this prefix
1774: * @param noCase case- or not case-sensitive comparison
1775: * @throws TokenizerException failure while reading data from the input stream
1776: * @return 0 if the the given prefix matches the input stream, -1 on EOF and
1777: * 1 if not matching
1778: */
1779: protected int comparePrefix(int offset, String prefix,
1780: boolean noCase) throws TokenizerException {
1781: // compare
1782: int len = prefix.length();
1783:
1784: for (int pos = offset; pos < offset + len; ++pos) {
1785: // do we have enough data
1786: if (_currentReadPos + pos >= _currentWritePos
1787: && readMoreDataFromBase() < 0) {
1788: return -1;
1789: }
1790:
1791: // compare single character
1792: char c1 = prefix.charAt(pos - offset);
1793: char c2 = getChar(_currentReadPos + pos);
1794:
1795: if (c1 != c2
1796: && (!noCase || Character.toUpperCase(c1) != Character
1797: .toUpperCase(c2))) {
1798: return 1;
1799: }
1800: }
1801:
1802: // found
1803: return 0;
1804: }
1805:
1806: /**
1807: * The method recomputes the line and column position of the tokenizer, if the
1808: * flag {@link TokenizerProperties#F_COUNT_LINES} is set. It gets the token type of the
1809: * {@link Token} that has been retrieved by the calling {@link #nextToken}.
1810: * Using the tokenizer control flags and certain other information it tries to
1811: * to find end-of-line sequences as fast as possible. For example, a line
1812: * comment should always contain a end-of-line sequence, so we can simply
1813: * increase the line count and set the column count to 0.
1814: *
1815: * @param type the type of the current token
1816: * @param length the length of the current token
1817: */
1818: protected void adjustLineAndColumn(int type, int length) {
1819: // line and column counting not required
1820: if (!isFlagSet(Flags.F_COUNT_LINES)) {
1821: return;
1822: }
1823:
1824: // there might be a simple way to determine the current line and column position
1825: switch (type) {
1826: case Token.EOF:
1827: return;
1828:
1829: case Token.LINE_COMMENT: // a line comment always ends with a newline
1830: _lineNumber++;
1831: _columnNumber = 0;
1832: putPosition(_currentReadPos + length, _lineNumber);
1833: return;
1834:
1835: case Token.SPECIAL_SEQUENCE:
1836: case Token.SEPARATOR:
1837: case Token.NORMAL:
1838: case Token.KEYWORD:
1839: if (_whitespaceHandler != null
1840: && _whitespaceHandler.newlineIsWhitespace()) { // newline is a whitespace character
1841: _columnNumber += length; // it should therefore not occure in other
1842: return; // tokens
1843: }
1844: break;
1845:
1846: case Token.WHITESPACE:
1847: if (!(_whitespaceHandler.isWhitespace('\n') || _whitespaceHandler
1848: .isWhitespace('\r'))) {
1849: _columnNumber += length; // newline is not a whitespace; we do not have
1850: return; // to test for it in the current token
1851: }
1852: break;
1853: }
1854:
1855: // count it
1856: int newLineNumber = _lineNumber;
1857:
1858: for (int pos = _currentReadPos; pos < _currentReadPos + length; ++pos) {
1859: switch (getChar(pos)) {
1860: case '\r':
1861: if (pos + 1 >= _currentReadPos + length
1862: || getChar(pos + 1) != '\n') {
1863: _lineNumber++;
1864: _columnNumber = 0;
1865: putPosition(pos + 1, _lineNumber);
1866: break;
1867: }
1868: pos++;
1869: /* no break; */
1870: case '\n':
1871: _lineNumber++;
1872: _columnNumber = 0;
1873: putPosition(pos + 1, _lineNumber);
1874: break;
1875:
1876: default:
1877: _columnNumber++;
1878: }
1879: }
1880: }
1881:
1882: /**
1883: * Putting a new position into the position-to-line-number map.
1884: *
1885: * @param position the position to map to the current line number
1886: */
1887: private void putPosition(int position, int lineNumber) {
1888: if (_position2LineMap == null) {
1889: _position2LineMap = new TreeMap();
1890: }
1891: _position2LineMap.put(new Integer(position), new Integer(
1892: lineNumber));
1893: }
1894:
1895: /**
1896: * Checking a given flag. The method considers both the globally set flags
1897: * in the associated {@link TokenizerProperties} instance and the locally set
1898: * by {@link #changeParseFlags}.
1899: *
1900: * @param flag one of the <code>F_...</code> flags defined in {@link TokenizerProperties}
1901: */
1902: protected boolean isFlagSet(int flag) {
1903: return (getParseFlags() & flag) != 0;
1904: }
1905:
1906: /**
1907: * Checking if a given flag is set for the given {@link TokenizerProperty}, for
1908: * this <code>Tokenizer</code> or for the used {@link TokenizerProperties}. The method considers both the globally set flags
1909: * in the associated {@link TokenizerProperties} instance and the locally set
1910: * by {@link #changeParseFlags}.
1911: *
1912: * @param prop check the flag for this property
1913: * @param flag one of the {@link Flags} constants
1914: */
1915: protected boolean isFlagSet(TokenizerProperty prop, int flag) {
1916: return prop.isFlagSet(flag, (getTokenizerProperties()
1917: .getParseFlags() & flag) != 0
1918: || isFlagSet(flag));
1919: }
1920:
1921: //---------------------------------------------------------------------------
1922: // Class members
1923: //
1924:
1925: /**
1926: * mask of flags that can be set separately for a <code>AbstractTokenizer</code>.
1927: */
1928: protected static final int VALID_FLAGS_MASK = Flags.F_RETURN_WHITESPACES
1929: | Flags.F_TOKEN_POS_ONLY
1930: | Flags.F_KEEP_DATA
1931: | Flags.F_COUNT_LINES;
1932:
1933: /**
* {@link TokenizerProperties} that are used if no others have been
1935: * specified by calling {@link #setTokenizerProperties}.
1936: */
1937: protected StandardTokenizerProperties _defaultProperties = null;
1938:
1939: /**
1940: * Buffer sizes
1941: */
1942: private static final int PATTERN_MAX_SIZE = 0x40000; // 256K
1943:
1944: /**
1945: * Bits for the internal flag bitmask
1946: */
1947: private static final byte IFLAG_EXTERNAL_PATTERN_HANDLER = 0x01;
1948: private static final byte IFLAG_EXTERNAL_KEYWORD_HANDLER = 0x02;
1949: private static final byte IFLAG_EXTERNAL_SEQUENCE_HANDLER = 0x04;
1950:
1951: //---------------------------------------------------------------------------
1952: // Members
1953: //
1954:
1955: /**
1956: * overall tokenizer flags.
1957: */
1958: protected int _flags = 0;
1959:
1960: /**
1961: * a combination of <code>F_...</code> constants defined in {@link TokenizerProperties}
1962: * indicating which bits in the {@link #_flags} member are valid. All other
1963: * flags are taken from the associated {@link TokenizerProperties} object.
1964: *
1965: * @see #changeParseFlags
1966: */
1967: private int _flagMask = 0;
1968:
1969: /**
* Flag if EOF has been reached. The flag should speed up calls to {@link #readMoreDataFromBase}
1971: */
1972: private boolean _eofReached = true;
1973:
1974: /**
* Data index where {@link #nextToken} will start parsing.
1976: */
1977: protected int _currentReadPos = 0;
1978:
1979: /**
* Data index where {@link #readMoreDataFromBase} will fill in new data.
1981: */
1982: protected int _currentWritePos = 0;
1983:
1984: /**
1985: * if line counting is enabled, this contains the current line number starting
1986: * with 0.
1987: */
1988: protected int _lineNumber = -1;
1989:
1990: /**
1991: * if line counting is enabled, this contains the current column number starting
1992: * with 0.
1993: */
1994: protected int _columnNumber = -1;
1995:
1996: /**
1997: * List of currently known token. The first element is the current token returned
1998: * by the last call to {@link #nextToken}. The following elements are look-ahead
1999: * token that have already been identified when extracting the current token.
2000: */
2001: protected Token[] _scannedToken = new Token[] { null, null, null };
2002:
2003: /**
* For embedded tokenizers: this is the list of the succeeding tokenizers
2005: */
2006: protected AbstractTokenizer _nextTokenizer = null;
2007:
2008: /**
2009: * For embedded tokenizers: this is the base tokenizer that reads the data
2010: */
2011: protected AbstractTokenizer _baseTokenizer = null;
2012:
2013: /**
2014: * For embedded tokenizers: this is the list of the previous tokenizers
2015: */
2016: protected AbstractTokenizer _prevTokenizer = null;
2017:
2018: /**
2019: * Whitespace handler
2020: */
2021: private de.susebox.jtopas.spi.WhitespaceHandler _whitespaceHandler = null;
2022:
2023: /**
2024: * Separator handler
2025: */
2026: private de.susebox.jtopas.spi.SeparatorHandler _separatorHandler = null;
2027:
2028: /**
2029: * Keyword handler
2030: */
2031: private de.susebox.jtopas.spi.KeywordHandler _keywordHandler = null;
2032:
2033: /**
2034: * Sequence handler
2035: */
2036: private de.susebox.jtopas.spi.SequenceHandler _sequenceHandler = null;
2037:
2038: /**
* Pattern handler
2040: */
2041: private de.susebox.jtopas.spi.PatternHandler _patternHandler = null;
2042:
2043: /**
2044: * The source of input data
2045: */
2046: private TokenizerSource _source = null;
2047:
2048: /**
2049: * The characteristics of this tokenizer.
2050: */
2051: private TokenizerProperties _properties = null;
2052:
2053: /**
* Position to line number mapping
2055: */
2056: private TreeMap _position2LineMap = null;
2057:
2058: /**
2059: * Control flags for the internal work
2060: */
2061: private long _internalFlags = 0;
2062: }
|