0001: /*
0002: * StandardTokenizerProperties.java: general-use TokenizerProperties implementation
0003: *
0004: * Copyright (C) 2002 Heiko Blau
0005: *
0006: * This file belongs to the JTopas Library.
0007: * JTopas is free software; you can redistribute it and/or modify it
0008: * under the terms of the GNU Lesser General Public License as published by the
0009: * Free Software Foundation; either version 2.1 of the License, or (at your
0010: * option) any later version.
0011: *
0012: * This software is distributed in the hope that it will be useful, but WITHOUT
0013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0014: * FITNESS FOR A PARTICULAR PURPOSE.
0015: * See the GNU Lesser General Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser General Public License along
0018: * with JTopas. If not, write to the
0019: *
0020: * Free Software Foundation, Inc.
0021: * 59 Temple Place, Suite 330,
0022: * Boston, MA 02111-1307
0023: * USA
0024: *
0025: * or check the Internet: http://www.fsf.org
0026: *
0027: * Contact:
0028: * email: heiko@susebox.de
0029: */
0030:
0031: package de.susebox.jtopas;
0032:
0033: //-----------------------------------------------------------------------------
0034: // Imports
0035: //
0036: import java.util.Arrays;
0037: import java.util.ArrayList;
0038: import java.util.Map;
0039: import java.util.HashMap;
0040: import java.util.Iterator;
0041: import java.util.NoSuchElementException;
0042:
0043: import de.susebox.java.lang.ExtRuntimeException;
0044: import de.susebox.java.lang.ExtUnsupportedOperationException;
0045: import de.susebox.java.lang.ExtIllegalArgumentException;
0046:
0047: import de.susebox.jtopas.spi.DataMapper;
0048: import de.susebox.jtopas.spi.DataProvider;
0049: import de.susebox.jtopas.spi.PatternHandler;
0050:
0051: import de.susebox.jtopas.impl.PatternMatcher;
0052: import de.susebox.jtopas.impl.SequenceStore;
0053: import de.susebox.jtopas.impl.NoCaseSequenceStore;
0054:
0055: //-----------------------------------------------------------------------------
0056: // Class StandardTokenizerProperties
0057: //
0058:
0059: /**<p>
0060: * The class <code>StandardTokenizerProperties</code> provides a simple implementation
0061: * of the {@link TokenizerProperties} interface for use in most situations.
0062: *</p><p>
0063: * Note that this class takes advantage of JTopas features that use Java 1.4 or
0064: * higher. It can still be used in older environments but not compiled with JDK
0065: * versions below 1.4!
0066: *</p>
0067: *
0068: * @see TokenizerProperties
0069: * @see Tokenizer
0070: * @author Heiko Blau
0071: */
0072: public class StandardTokenizerProperties extends
0073: AbstractTokenizerProperties implements TokenizerProperties,
0074: DataMapper {
0075:
0076: //---------------------------------------------------------------------------
0077: // Properties
0078: //
0079:
0080: /**
0081: * Maximum length of a non-free pattern match. These are patterns that don't
0082: * have the {@link TokenizerProperties#F_FREE_PATTERN} flag set. A common
0083: * example are number patterns.
0084: */
0085: public static final short MAX_NONFREE_MATCHLEN = 1024;
0086:
0087: //---------------------------------------------------------------------------
0088: // Constructors
0089: //
0090:
0091: /**
0092: * Default constructor that initializes an instance with the default whitespaces
0093: * and separator sets. {@link Tokenizer} instances using this <code>StandardTokenizerProperties</code>
0094: * object, split text between spaces, tabs and line ending sequences as well
0095: * as between punctuation characters.
0096: */
public StandardTokenizerProperties() {
  // No parse flags; the default whitespace and separator sets are passed on
  // directly to the three-argument constructor.
  this(0, DEFAULT_WHITESPACES, DEFAULT_SEPARATORS);
}
0100:
0101: /**
0102: * This constructor takes the control flags to be used. It is a shortcut to:
0103: * <pre>
0104: * TokenizerProperties props = new StandardTokenizerProperties();
0105: *
0106: * props.setParseFlags(flags);
0107: * </pre>
0108: * See the {@link TokenizerProperties} interface for the supported flags.
0109: *<br>
0110: * The {@link TokenizerProperties#DEFAULT_WHITESPACES} and
0111: * {@link TokenizerProperties#DEFAULT_SEPARATORS} are used for whitespace and
0112: * separator handling if no explicit calls to {@link #setWhitespaces} and
0113: * {@link #setSeparators} will follow subsequently.
0114: *
0115: * @param flags tokenizer control flags
0116: * @see #setParseFlags
0117: */
public StandardTokenizerProperties(int flags) {
  // Caller-supplied flags combined with the default character sets.
  this(flags,
       TokenizerProperties.DEFAULT_WHITESPACES,
       TokenizerProperties.DEFAULT_SEPARATORS);
}
0121:
0122: /**
0123: * This constructor takes the whitespace and separator sets to be used. It is
0124: * a shortcut to:
0125: * <pre>
0126: * TokenizerProperties props = new StandardTokenizerProperties();
0127: *
0128: * props.setWhitespaces(ws);
0129: * props.setSeparators(sep);
0130: * </pre>
0131: *
0132: * @param flags tokenizer control flags
0133: * @param whitespaces the whitespace set
0134: * @param separators the set of separating characters
0135: * @see #setParseFlags
0136: * @see #setWhitespaces
0137: * @see #setSeparators
0138: */
public StandardTokenizerProperties(int flags, String whitespaces, String separators) {
  // Start with a character table that has no flag set at all.
  Arrays.fill(_charFlags, 0);

  // The flags are applied before the character sets: doSetWhitespaces and
  // doSetSeparators consult the F_NO_CASE bit of _flags (presumably updated
  // by setParseFlags -- confirm in the base class).
  setParseFlags(flags);

  // Register the character classes.
  setWhitespaces(whitespaces);
  setSeparators(separators);
}
0146:
0147: //---------------------------------------------------------------------------
0148: // Abstract methods of the base class
0149: //
0150:
0151: /**
0152: * Retrieving a property by a given type and image. See the method description
0153: * in {@link AbstractTokenizerProperties} for details.
0154: *
0155: * @param type the type the returned property should have
0156: * @param startImage the (starting) image
0157: * @return the token description for the image or <code>null</code>
0158: */
0159: protected TokenizerProperty doGetProperty(int type,
0160: String startImage) {
0161: TokenizerProperty prop = null;
0162:
0163: switch (type) {
0164: case Token.KEYWORD:
0165: if (_keywords[0] != null) {
0166: prop = _keywords[0].getKeyword(startImage);
0167: }
0168: if (prop == null && _keywords[1] != null) {
0169: prop = _keywords[1].getKeyword(startImage);
0170: }
0171: break;
0172:
0173: case Token.STRING:
0174: case Token.LINE_COMMENT:
0175: case Token.BLOCK_COMMENT:
0176: case Token.SPECIAL_SEQUENCE:
0177: if (_sequences[0] != null) {
0178: prop = _sequences[0].getSpecialSequence(startImage);
0179: }
0180: if (prop == null && _sequences[1] != null) {
0181: prop = _sequences[1].getSpecialSequence(startImage);
0182: }
0183: break;
0184:
0185: case Token.PATTERN:
0186: for (int index = 0; index < _patterns.size(); ++index) {
0187: PatternMatcher data = (PatternMatcher) _patterns
0188: .get(index);
0189:
0190: prop = data.getProperty();
0191: if (prop.getImages()[0].equals(startImage)) {
0192: break;
0193: }
0194: prop = null;
0195: }
0196: break;
0197:
0198: case Token.WHITESPACE:
0199: case Token.SEPARATOR:
0200: default:
0201: throw new ExtIllegalArgumentException(
0202: "Unsupported property type {0}. (Leading) image \"{1}\".",
0203: new Object[] { new Integer(type), startImage });
0204: }
0205:
0206: // either the required property or null
0207: return prop;
0208: }
0209:
0210: /**
0211: * Setting a new separator set. See the method description in
0212: * {@link AbstractTokenizerProperties} for details.
0213: *
0214: * @param separators the set of separators including ranges
0215: * @return the replaced separator set or <code>null</code>
0216: */
0217: protected String doSetSeparators(String separators) {
0218: String oldValue;
0219:
0220: // which separators should be set?
0221: if ((_flags & Flags.F_NO_CASE) == 0) {
0222: oldValue = (_separatorsCase.length() > 0) ? _separatorsCase
0223: : _separatorsNoCase;
0224: _separatorsCase = separators;
0225: _separatorsNoCase = "";
0226: } else {
0227: oldValue = (_separatorsNoCase.length() > 0) ? _separatorsNoCase
0228: : _separatorsCase;
0229: _separatorsCase = "";
0230: _separatorsNoCase = separators;
0231: }
0232:
0233: // mark seaparators in character table
0234: putCharSet(oldValue, Token.SEPARATOR, false);
0235: putCharSet(separators, Token.SEPARATOR, true);
0236:
0237: // normalize the old value
0238: if (oldValue == null || oldValue.length() == 0) {
0239: return null;
0240: } else {
0241: return oldValue;
0242: }
0243: }
0244:
0245: /**
0246: * Setting a new whitespace set. See the method description in
0247: * {@link AbstractTokenizerProperties} for details.
0248: *
0249: * @param whitespaces the set of whitespaces including ranges
0250: * @return the replaced whitespace set or <code>null</code>
0251: */
0252: protected String doSetWhitespaces(String whitespaces) {
0253: // set the right whitespaces
0254: String oldValue;
0255:
0256: if ((_flags & Flags.F_NO_CASE) == 0) {
0257: oldValue = (_whitespacesCase.length() > 0) ? _whitespacesCase
0258: : _whitespacesNoCase;
0259: _whitespacesCase = whitespaces;
0260: _whitespacesNoCase = "";
0261: } else {
0262: oldValue = (_whitespacesNoCase.length() > 0) ? _whitespacesNoCase
0263: : _whitespacesCase;
0264: _whitespacesCase = "";
0265: _whitespacesNoCase = whitespaces;
0266: }
0267:
0268: // mark whitespaces in character table
0269: putCharSet(oldValue, Token.WHITESPACE, false);
0270: putCharSet(whitespaces, Token.WHITESPACE, true);
0271:
0272: // return changes
0273: if (oldValue == null || oldValue.length() == 0) {
0274: return null;
0275: } else {
0276: return oldValue;
0277: }
0278: }
0279:
0280: /**
0281: * Registering a {@link TokenizerProperty}.
0282: * See the method description in {@link AbstractTokenizerProperties}.
0283: *
0284: * @param property property to register
0285: * @return the replaced property or <code>null</code>
0286: */
0287: protected TokenizerProperty doAddProperty(TokenizerProperty property) {
0288: switch (property.getType()) {
0289: case Token.STRING:
0290: case Token.LINE_COMMENT:
0291: case Token.BLOCK_COMMENT:
0292: case Token.SPECIAL_SEQUENCE:
0293: return addSpecialSequence(property);
0294:
0295: case Token.KEYWORD:
0296: return addKeyword(property);
0297:
0298: case Token.PATTERN:
0299: return addPattern(property);
0300:
0301: case Token.WHITESPACE:
0302: case Token.SEPARATOR:
0303: default:
0304: throw new ExtIllegalArgumentException(
0305: "Unsupported property type {0}. (Leading) image \"{1}\".",
0306: new Object[] { new Integer(property.getType()),
0307: property.getImages()[0] });
0308: }
0309: }
0310:
0311: /**
0312: * Deregistering a {@link TokenizerProperty} from the store.
0313: * See the method description in {@link AbstractTokenizerProperties}.
0314: *
0315: * @param property property to remove
0316: * @return the replaced property or <code>null</code>
0317: */
0318: protected TokenizerProperty doRemoveProperty(
0319: TokenizerProperty property) {
0320: // removing property according to type
0321: TokenizerProperty prop = null;
0322: String image = property.getImages()[0];
0323:
0324: switch (property.getType()) {
0325: case Token.LINE_COMMENT:
0326: case Token.BLOCK_COMMENT:
0327: case Token.STRING:
0328: case Token.SPECIAL_SEQUENCE:
0329: if (_sequences[0] != null) {
0330: prop = _sequences[0].removeSpecialSequence(image);
0331: }
0332: if (prop == null && _sequences[1] != null) {
0333: prop = _sequences[1].removeSpecialSequence(image);
0334: }
0335: break;
0336:
0337: case Token.KEYWORD:
0338: if (_keywords[0] != null) {
0339: prop = _keywords[0].removeKeyword(image);
0340: }
0341: if (prop == null && _keywords[1] != null) {
0342: prop = _keywords[1].removeKeyword(image);
0343: }
0344: break;
0345:
0346: case Token.PATTERN:
0347: for (int index = 0; index < _patterns.size(); ++index) {
0348: PatternMatcher data = (PatternMatcher) _patterns
0349: .get(index);
0350:
0351: prop = data.getProperty();
0352: if (prop.getImages()[0].equals(image)) {
0353: _patterns.remove(index);
0354: break;
0355: } else {
0356: prop = null;
0357: }
0358: }
0359: break;
0360:
0361: case Token.WHITESPACE:
0362: case Token.SEPARATOR:
0363: default:
0364: throw new ExtIllegalArgumentException(
0365: "Unsupported property type {0}. (Leading) image \"{1}\".",
0366: new Object[] { new Integer(property.getType()),
0367: image });
0368: }
0369:
0370: // return removed property
0371: return prop;
0372: }
0373:
0374: //---------------------------------------------------------------------------
0375: // Methods of the TokenizerProperties interface
0376: //
0377:
0378: /**
0379: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0380: * objects. See the method description in {@link TokenizerProperties}.
0381: *
0382: * @return enumeration of {@link TokenizerProperty} objects
0383: */
0384: public Iterator getStrings() {
0385: return new SpecialSequencesIterator(this , _sequences,
0386: Token.STRING);
0387: }
0388:
0389: /**
0390: * Obtaining the whitespace character set.
0391: * See the method description in {@link TokenizerProperties}.
0392: *
0393: * @see #setWhitespaces
0394: * @return the currently active whitespace set
0395: */
0396: public String getWhitespaces() {
0397: synchronized (this ) {
0398: return _whitespacesCase + _whitespacesNoCase;
0399: }
0400: }
0401:
0402: /**
0403: * Obtaining the separator set of the <code>Tokenizer</code>.
0404: * See the method description in {@link TokenizerProperties}.
0405: *
0406: * @see #setSeparators
0407: * @return the currently used set of separating characters
0408: */
0409: public String getSeparators() {
0410: synchronized (this ) {
0411: return _separatorsCase + _separatorsNoCase;
0412: }
0413: }
0414:
0415: /**
0416: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0417: * objects.
0418: * See the method description in {@link TokenizerProperties}.
0419: *
0420: * @return enumeration of {@link TokenizerProperty} objects
0421: */
0422: public Iterator getLineComments() {
0423: return new SpecialSequencesIterator(this , _sequences,
0424: Token.LINE_COMMENT);
0425: }
0426:
0427: /**
0428: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0429: * objects.
0430: * See the method description in {@link TokenizerProperties}.
0431: *
0432: * @return enumeration of {@link TokenizerProperty} objects
0433: */
0434: public Iterator getBlockComments() {
0435: return new SpecialSequencesIterator(this , _sequences,
0436: Token.BLOCK_COMMENT);
0437: }
0438:
0439: /**
0440: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0441: * objects.
0442: * See the method description in {@link TokenizerProperties}.
0443: *
0444: * @return enumeration of {@link TokenizerProperty} objects
0445: */
0446: public Iterator getSpecialSequences() {
0447: return new SpecialSequencesIterator(this , _sequences,
0448: Token.SPECIAL_SEQUENCE);
0449: }
0450:
0451: /**
0452: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0453: * objects.
0454: * See the method description in {@link TokenizerProperties}.
0455: *
0456: * @return iteration of {@link TokenizerProperty} objects
0457: */
0458: public Iterator getKeywords() {
0459: return new SpecialSequencesIterator(this , _keywords,
0460: Token.KEYWORD);
0461: }
0462:
0463: /**
0464: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0465: * objects. Each <code>TokenizerProperty</code> object contains a pattern and
0466: * its companion if such an associated object exists.
0467: *
0468: * @return enumeration of {@link TokenizerProperty} objects
0469: */
0470: public Iterator getPatterns() {
0471: return new PatternIterator(this );
0472: }
0473:
0474: /**
0475: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
0476: * objects.
0477: * See the method description in {@link TokenizerProperties}.
0478: *
0479: * @return enumeration of {@link TokenizerProperty} objects
0480: */
0481: public Iterator getProperties() {
0482: return new FullIterator(this );
0483: }
0484:
0485: //---------------------------------------------------------------------------
0486: // Methods of the DataMapper interface
0487: //
0488:
0489: /**
0490: * Setting the backing {@link TokenizerProperties} instance this <code>DataMapper</code>
0491: * is working with. Usually, the <code>DataMapper</code>
0492: * interface is implemented by <code>TokenizerProperties</code> implementations,
0493: * too. Otherwise the {@link Tokenizer} using the <code>TokenizerProperties</code>,
0494: * will construct a default <code>DataMapper</code> and propagate the
0495: * <code>TokenizerProperties</code> instance by calling this method.
0496: *<br>
0497: * The method should throw an {@link java.lang.UnsupportedOperationException}
0498: * if this <code>DataMapper</code> is an extension to a <code>TokenizerProperties</code>
0499: * implementation.
0500: *
0501: * @param props the {@link de.susebox.jtopas.TokenizerProperties}
0502: * @throws UnsupportedOperationException if this is a <code>DataMapper</code>
0503: * implemented by a {@link de.susebox.jtopas.TokenizerProperties}
0504: * implementation
0505: * @throws NullPointerException if no {@link TokenizerProperties} are given
0506: */
0507: public void setTokenizerProperties(TokenizerProperties props)
0508: throws UnsupportedOperationException, NullPointerException {
0509: throw new ExtUnsupportedOperationException(
0510: "Class {0} already defines the {1} interface.",
0511: new Object[] {
0512: StandardTokenizerProperties.class.getName(),
0513: DataMapper.class.getName() });
0514: }
0515:
0516: /**
0517: * The method retrieves the backing {@link de.susebox.jtopas.TokenizerProperties}
0518: * instance, this <code>DataMapper</code> is working on. For implementations
0519: * of the <code>TokenizerProperties</code> interface that also implement the
0520: * <code>DataMapper</code> interface, this method returns the instance itself
0521: * it is called on.
0522: *<br>
0523: * Otherwise the method returns the <code>TokenizerProperties</code> instance
0524: * passed through the last call to {@link #setTokenizerProperties} or <code>null</code>
0525: * if no such call has taken place so far.
0526: *
0527: * @return the backing {@link de.susebox.jtopas.TokenizerProperties} or <code>null</code>
0528: */
0529: public TokenizerProperties getTokenizerProperties() {
0530: return this ;
0531: }
0532:
0533: /**
0534: * This method checks if the character is a whitespace. Implement Your own
0535: * code for situations where this default implementation is not fast enough
0536: * or otherwise not really good.
0537: *
0538: * @param testChar check this character
0539: * @return <code>true</code> if the given character is a whitespace,
0540: * <code>false</code> otherwise
0541: */
0542: public boolean isWhitespace(char testChar) {
0543: try {
0544: return (_charFlags[testChar] & CHARFLAG_WHITESPACE) != 0;
0545: } catch (ArrayIndexOutOfBoundsException ex) {
0546: Integer extFlags = (Integer) _extCharFlags.get(new Integer(
0547: testChar));
0548: return (extFlags != null && (extFlags.intValue() & CHARFLAG_WHITESPACE) != 0);
0549: }
0550: }
0551:
0552: /**
0553: * This method detects the number of whitespace characters the data range given
0554: * through the {@link DataProvider} parameter starts with.
0555: *
0556: * @param dataProvider the source to get the data range from
0557: * @return number of whitespace characters starting from the given offset
0558: * @throws TokenizerException failure while reading data from the input stream
0559: * @throws NullPointerException if no {@link DataProvider} is given
0560: * @see de.susebox.jtopas.spi.DataProvider
0561: */
0562: public int countLeadingWhitespaces(DataProvider dataProvider)
0563: throws NullPointerException {
0564: int maxChars = dataProvider.getLength();
0565: int len = 0;
0566:
0567: while (len < maxChars
0568: && isWhitespace(dataProvider.getCharAt(len))) {
0569: len++;
0570: }
0571: return len;
0572: }
0573:
0574: /**
0575: * If a {@link Tokenizer} performs line counting, it is often necessary to
0576: * know if newline characters are considered to be whitespace. See {@link WhitespaceHandler}
0577: * for details.
0578: *
0579: * @return <code>true</code> if newline characters are in the current whitespace set,
0580: * <code>false</code> otherwise
0581: *
0582: */
0583: public boolean newlineIsWhitespace() {
0584: return (_charFlags['\n'] & CHARFLAG_WHITESPACE) != 0
0585: && (_charFlags['\r'] & CHARFLAG_WHITESPACE) != 0;
0586: }
0587:
0588: /**
0589: * This method checks the given character if it is a separator.
0590: *
0591: * @param testChar check this character
0592: * @return <code>true</code> if the given character is a separator,
0593: * <code>false</code> otherwise
0594: */
0595: public boolean isSeparator(char testChar) {
0596: try {
0597: return (_charFlags[testChar] & CHARFLAG_SEPARATOR) != 0;
0598: } catch (ArrayIndexOutOfBoundsException ex) {
0599: Integer extFlags = (Integer) _extCharFlags.get(new Integer(
0600: testChar));
0601: return (extFlags != null && (extFlags.intValue() & CHARFLAG_SEPARATOR) != 0);
0602: }
0603: }
0604:
0605: /**
0606: * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
0607: * for a fast detection if special sequence checking must be performed at all.
0608: * If the method returns <code>false</code> time-consuming preparations can be
0609: * skipped.
0610: *
0611: * @return <code>true</code> if there actually are pattern that can be tested
0612: * for a match, <code>false</code> otherwise.
0613: */
0614: public boolean hasSequenceCommentOrString() {
0615: synchronized (_sequences) {
0616: return (_sequences[0] != null || _sequences[1] != null);
0617: }
0618: }
0619:
0620: /**
0621: * This method checks if a given range of data starts with a special sequence,
0622: * a comment or a string. These three types of token are tested together since
0623: * both comment and string prefixes are ordinary special sequences. Only the
0624: * actions performed <strong>after</strong> a string or comment has been detected,
0625: * are different.
0626: *<br>
0627: * The method returns <code>null</code> if no special sequence, comment or string
0628: * matches the leading part of the data range given through the
0629: * {@link DataProvider}.
0630: *<br>
0631: * In cases of strings or comments, the return value contains the description
0632: * for the introducing character sequence, <strong>NOT</strong> the whole
0633: * string or comment. The reading of the rest of the string or comment is done
0634: * by the calling {@link de.susebox.jtopas.Tokenizer}.
0635: *
0636: * @param dataProvider the source to get the data range from
0637: * @return a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence,
0638: * comment or string could be detected, <code>null</code> otherwise
0639: * @throws TokenizerException failure while reading more data
0640: * @throws NullPointerException if no {@link DataProvider} is given
0641: */
0642: public TokenizerProperty startsWithSequenceCommentOrString(
0643: DataProvider dataProvider) throws TokenizerException,
0644: NullPointerException {
0645: // we need the longest possible match
0646: synchronized (_sequences) {
0647: TokenizerProperty caseProp = (_sequences[0] != null) ? _sequences[0]
0648: .startsWithSequenceCommentOrString(dataProvider)
0649: : null;
0650:
0651: TokenizerProperty noCaseProp = (_sequences[1] != null) ? _sequences[1]
0652: .startsWithSequenceCommentOrString(dataProvider)
0653: : null;
0654:
0655: if (noCaseProp == null) {
0656: return caseProp;
0657: } else if (caseProp == null) {
0658: return noCaseProp;
0659: } else if (caseProp.getImages()[0].length() >= noCaseProp
0660: .getImages()[0].length()) {
0661: return caseProp;
0662: } else {
0663: return noCaseProp;
0664: }
0665: }
0666: }
0667:
0668: /**
0669: * This method returns the length of the longest special sequence, comment or
0670: * string prefix that is known to this <code>SequenceHandler</code>. When
0671: * calling {@link #startsWithSequenceCommentOrString}, the passed {@link DataProvider}
0672: * parameter will supply at least this number of characters (see {@link DataProvider#getLength}).
0673: * If less characters are provided, EOF is reached.
0674: *
0675: * @return the number of characters needed in the worst case to identify a
0676: * special sequence
0677: */
0678: public int getSequenceMaxLength() {
0679: int maxLength = 0;
0680:
0681: synchronized (_sequences) {
0682: if (_sequences[0] != null) {
0683: maxLength = _sequences[0].getSequenceMaxLength();
0684: }
0685: if (_sequences[1] != null
0686: && _sequences[1].getSequenceMaxLength() > maxLength) {
0687: maxLength = _sequences[1].getSequenceMaxLength();
0688: }
0689: }
0690: return maxLength;
0691: }
0692:
0693: /**
0694: * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
0695: * for a fast detection if keyword matching must be performed at all. If the method
0696: * returns <code>false</code> time-consuming preparations can be skipped.
0697: *
0698: * @return <code>true</code> if there actually are pattern that can be tested
0699: * for a match, <code>false</code> otherwise.
0700: */
0701: public boolean hasKeywords() {
0702: synchronized (_keywords) {
0703: return (_keywords[0] != null || _keywords[1] != null);
0704: }
0705: }
0706:
0707: /**
0708: * This method checks if the character range given through the
0709: * {@link DataProvider} comprises a keyword.
0710: *
0711: * @param dataProvider the source to get the data from, that are checked
0712: * @return a {@link de.susebox.jtopas.TokenizerProperty} if a keyword could be
0713: * found, <code>null</code> otherwise
0714: * @throws TokenizerException failure while reading more data
0715: * @throws NullPointerException if no {@link DataProvider} is given
0716: */
0717: public TokenizerProperty isKeyword(DataProvider dataProvider)
0718: throws TokenizerException, NullPointerException {
0719: synchronized (_keywords) {
0720: TokenizerProperty prop;
0721:
0722: if (_keywords[0] != null) {
0723: prop = _keywords[0].isKeyword(dataProvider);
0724: } else {
0725: prop = null;
0726: }
0727: if (prop == null && _keywords[1] != null) {
0728: prop = _keywords[1].isKeyword(dataProvider);
0729: }
0730: return prop;
0731: }
0732: }
0733:
0734: /**
0735: * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
0736: * for a fast detection if pattern matching must be performed at all. If the method
0737: * returns <code>false</code> time-consuming preparations can be skipped.
0738: *
0739: * @return <code>true</code> if there actually are pattern that can be tested
0740: * for a match, <code>false</code> otherwise.
0741: */
0742: public boolean hasPattern() {
0743: synchronized (_patterns) {
0744: return (_patterns.size() > 0);
0745: }
0746: }
0747:
0748: /**
0749: * This method checks if the start of a character range given through the
0750: * {@link DataProvider} matches a pattern.
0751: *
0752: * @param dataProvider the source to get the data from
0753: * @return a {@link PatternHandler.Result} object or <code>null</code> if no
0754: * match was found
0755: * @throws TokenizerException generic exception
0756: * @throws NullPointerException if no {@link DataProvider} is given
0757: */
0758: public PatternHandler.Result matches(DataProvider dataProvider)
0759: throws TokenizerException, NullPointerException {
0760: synchronized (_patterns) {
0761: int longestMatch = 0;
0762: PatternHandler.Result bestResult = null;
0763:
0764: // only get the string if pattern are available
0765: for (int index = 0; index < _patterns.size(); ++index) {
0766: PatternMatcher data = (PatternMatcher) _patterns
0767: .get(index);
0768: PatternHandler.Result result = data
0769: .matches(dataProvider);
0770:
0771: if (result != null) {
0772: if (bestResult == null
0773: || bestResult.getLengthOfMatch() < result
0774: .getLengthOfMatch()) {
0775: bestResult = result;
0776: }
0777: }
0778: }
0779:
0780: // return the best result
0781: return bestResult;
0782: }
0783: }
0784:
0785: //---------------------------------------------------------------------------
0786: // Implementation
0787: //
0788:
0789: /**
0790: * Registering a pattern with an associated object. The method assumes that the
0791: * given pattern property has been checked for not being null, having a non-empty
0792: * pattern image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
0793: * See the method description in {@link AbstractTokenizerProperties}.
0794: *
0795: * @param patternProp the regular expression to be added
0796: * @return the replaced pattern property or <code>null</code>
0797: * @throws IllegalArgumentException if pattern matching is not available
0798: */
0799: protected TokenizerProperty addPattern(TokenizerProperty patternProp)
0800: throws IllegalArgumentException {
0801: // construct the pattern
0802: PatternMatcher data = null;
0803: String pattern = patternProp.getImages()[0];
0804:
0805: try {
0806: data = new PatternMatcher(patternProp, getParseFlags());
0807: } catch (Throwable ex) {
0808: throw new ExtIllegalArgumentException(ex,
0809: "Pattern matching is not available (use JDK 1.4 or above).");
0810: }
0811:
0812: // Register pattern. First search for existing one
0813: for (int index = 0; index < _patterns.size(); ++index) {
0814: PatternMatcher oldData = (PatternMatcher) _patterns
0815: .get(index);
0816: TokenizerProperty oldProp = oldData.getProperty();
0817:
0818: if (oldProp.getImages()[0].equals(pattern)) {
0819: _patterns.set(index, data);
0820: return oldProp;
0821: }
0822: }
0823:
0824: // not found -> register new pattern
0825: _patterns.add(data);
0826: return null;
0827: }
0828:
0829: /**
0830: * Registering a keyword property. The method assumes that the given keyword
0831: * property has been checked for not being null, having a non-empty keyword
0832: * image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
0833: *
0834: * @param keywordProp keyword property to register
0835: * @return the replaced keyword property or <code>null</code>
0836: */
0837: protected TokenizerProperty addKeyword(TokenizerProperty keywordProp) {
0838: // case-sensitive keyword?
0839: boolean noCase = isFlagSet(keywordProp, Flags.F_NO_CASE);
0840: int arrayIdx = noCase ? 1 : 0;
0841:
0842: // first keyword?
0843: if (_keywords[arrayIdx] == null) {
0844: if (noCase) {
0845: _keywords[arrayIdx] = new NoCaseSequenceStore(true);
0846: } else {
0847: _keywords[arrayIdx] = new SequenceStore(true);
0848: }
0849: }
0850:
0851: // add / replace property
0852: return _keywords[arrayIdx].addKeyword(keywordProp);
0853: }
0854:
0855: /**
0856: * This method adds or replaces strings, comments and ordinary special sequences.
0857: * The method assumes that the given special sequence property has been checked
0858: * for not being null, having non-empty images and normalized flags
0859: * ({@link AbstractTokenizerProperties#normalizeFlags}).
0860: *
0861: * @param property the description of the new sequence
0862: * @return the replaced special sequence property or <code>null</code>
0863: */
0864: protected TokenizerProperty addSpecialSequence(
0865: TokenizerProperty property) {
0866: // case-sensitive sequence?
0867: boolean noCase = isFlagSet(property, Flags.F_NO_CASE);
0868: int arrayIdx = noCase ? 1 : 0;
0869:
0870: // first special sequence?
0871: if (_sequences[arrayIdx] == null) {
0872: if (noCase) {
0873: _sequences[arrayIdx] = new NoCaseSequenceStore(false);
0874: } else {
0875: _sequences[arrayIdx] = new SequenceStore(false);
0876: }
0877: }
0878:
0879: // add / replace property
0880: return _sequences[arrayIdx].addSpecialSequence(property);
0881: }
0882:
0883: /**
0884: * Set or removes the flags corresponding to type and case-sensitivity from the
0885: * character flags tables.
0886: *
0887: * @param set the character set to handle (may contain ranges)
0888: * @param type token type fro the characters ({@link Token#WHITESPACE} or {@link Token#SEPARATOR})
0889: * @param setIt if <code>true</code> the approbriate flags will be set, otherwise removed
0890: */
0891: private void putCharSet(String set, int type, boolean setIt) {
0892: // which flags ?
0893: int charFlags = 0;
0894:
0895: switch (type) {
0896: case Token.WHITESPACE:
0897: charFlags = CHARFLAG_WHITESPACE;
0898: break;
0899: case Token.SEPARATOR:
0900: charFlags = CHARFLAG_SEPARATOR;
0901: break;
0902: }
0903:
0904: // analyze the given set
0905: int length = (set != null) ? set.length() : 0;
0906: char start, end, setChar;
0907:
0908: for (int ii = 0; ii < length; ++ii) {
0909: setChar = set.charAt(ii);
0910:
0911: switch (setChar) {
0912: case '-':
0913: start = (ii > 0) ? set.charAt(ii - 1) : 0;
0914: end = (ii < length - 1) ? set.charAt(ii + 1) : 0xFFFF;
0915: ii += 2;
0916: break;
0917:
0918: case '\\':
0919: setChar = (ii + 1 >= length) ? 0 : set.charAt(ii + 1);
0920: ii++;
0921: /* no break */
0922:
0923: default:
0924: start = end = setChar;
0925: }
0926:
0927: // put flags
0928: for (char index = start; index <= end; ++index) {
0929: char currChar = index;
0930:
0931: do {
0932: if (currChar < _charFlags.length) {
0933: // one-byte characters
0934: if (setIt) {
0935: _charFlags[currChar] |= charFlags;
0936: } else {
0937: _charFlags[currChar] &= ~charFlags;
0938: }
0939:
0940: } else {
0941: // longer characters
0942: Integer key = new Integer(currChar);
0943: Integer extFlags = (Integer) _extCharFlags
0944: .get(key);
0945:
0946: if (setIt) {
0947: extFlags = new Integer(extFlags.intValue()
0948: | charFlags);
0949: } else {
0950: extFlags = new Integer(extFlags.intValue()
0951: & ~charFlags);
0952: }
0953: _extCharFlags.put(key, extFlags);
0954: }
0955:
0956: // settings must be also done for the upper/lowercase variant
0957: if (Character.isLowerCase(currChar)) {
0958: currChar = Character.toUpperCase(currChar);
0959: } else if (Character.isUpperCase(currChar)) {
0960: currChar = Character.toLowerCase(currChar);
0961: }
0962: } while ((_flags & Flags.F_NO_CASE) != 0
0963: && currChar != index);
0964: }
0965: }
0966: }
0967:
0968: //---------------------------------------------------------------------------
0969: // Class members
0970: //
0971:
0972: /**
0973: * Bit flag marking a character as whitespace in the character flag tables.
0974: */
0975: public static final int CHARFLAG_WHITESPACE = 1;
0976:
0977: /**
0978: * Bit flag marking a character as separator in the character flag tables.
0979: */
0980: public static final int CHARFLAG_SEPARATOR = 2;
0981:
0982: //---------------------------------------------------------------------------
0983: // Members
0984: //
0985:
0986: /**
0987: * Array with the whitespace and separator flags of the characters 0..255, indexed by character code.
0988: */
0989: protected int _charFlags[] = new int[256];
0990:
0991: /**
0992: * Map with flags for character codes of 256 and above ({@link Integer} character code to {@link Integer} flags).
0993: */
0994: protected HashMap _extCharFlags = new HashMap();
0995:
0996: /**
0997: * current whitespace characters including character ranges.
0998: */
0999: protected String _whitespacesCase = DEFAULT_WHITESPACES;
1000:
1001: /**
1002: * current whitespace characters including character ranges. Case is ignored.
1003: */
1004: protected String _whitespacesNoCase = "";
1005:
1006: /**
1007: * current separator characters including character ranges.
1008: */
1009: protected String _separatorsCase = DEFAULT_SEPARATORS;
1010:
1011: /**
1012: * current separator characters including character ranges. Case is ignored.
1013: */
1014: protected String _separatorsNoCase = "";
1015:
1016: /**
1017: * The first element is the {@link de.susebox.jtopas.impl.SequenceStore} for
1018: * the case-sensitive sequences, the second is for the case-insensitive ones.
1019: */
1020: protected SequenceStore[] _sequences = new SequenceStore[2];
1021:
1022: /**
1023: * Like the array {@link #_sequences} this two-element array contains two
1024: * {@link de.susebox.jtopas.impl.SequenceStore}, the first for the case-sensitive
1025: * keywords, the second for the case-insensitive ones. Elements are created on demand.
1026: */
1027: protected SequenceStore[] _keywords = new SequenceStore[2];
1028:
1029: /**
1030: * List of the patterns; its elements are read as {@link de.susebox.jtopas.impl.PatternMatcher} instances.
1031: */
1032: protected ArrayList _patterns = new ArrayList();
1033:
1034: /**
1035: * Which regular expression parser to use
1036: */
1037: private Class _patternClass = null;
1038:
1039: /**
1040: * A buffer used for pattern matching
1041: */
1042: private StringBuffer _foundMatch = new StringBuffer();
1043: }
1044:
1045: //---------------------------------------------------------------------------
1046: // inner classes
1047: //
1048:
1049: /**
1050: * Instances of this inner class are returned when a call to
1051: * {@link TokenizerProperties#getProperties}.
1052: * Each element of the enumeration contains a {@link TokenizerProperty} element.
1053: */
1054: final class FullIterator implements Iterator {
1055:
1056: /**
1057: * constructor taking the calling {@link TokenizerProperties} object to retrieve
1058: * the members holding {@link TokenizerProperty} elements which are iterated by
1059: * this <code>FullIterator</code> instance.
1060: *
1061: * @param caseSensitiveMap map with properties where case matters
1062: * @param caseSensitiveMap map with properties where case doesn't matter
1063: */
1064: public FullIterator(StandardTokenizerProperties parent) {
1065: _parent = parent;
1066:
1067: // create list of iterators
1068: _iterators = new Object[3];
1069: _iterators[0] = new SpecialSequencesIterator(parent,
1070: parent._keywords, Token.KEYWORD);
1071: _iterators[1] = new SpecialSequencesIterator(parent,
1072: parent._sequences, 0);
1073: _iterators[2] = new PatternIterator(parent);
1074: _currIndex = 0;
1075: }
1076:
1077: /**
1078: * Test wether there is another element in the iterated set or not. See
1079: * {@link java.util.Iterator} for details.
1080: *
1081: * @return <code>true</code>if another call to {@link #next} will return an object,
1082: * <code>false</code> otherwise
1083: */
1084: public boolean hasNext() {
1085: synchronized (this ) {
1086: while (_currIndex < _iterators.length) {
1087: Iterator iter = (Iterator) _iterators[_currIndex];
1088:
1089: if (iter.hasNext()) {
1090: return true;
1091: }
1092: _currIndex++;
1093: }
1094: return false;
1095: }
1096: }
1097:
1098: /**
1099: * Retrieve the next element in the iterated set. See {@link java.util.Iterator}
1100: * for details.
1101: *
1102: * @return the next element or <code>null</code> if there is none
1103: */
1104: public Object next() {
1105: if (hasNext()) {
1106: synchronized (this ) {
1107: Iterator iter = (Iterator) _iterators[_currIndex];
1108: return iter.next();
1109: }
1110: } else {
1111: return null;
1112: }
1113: }
1114:
1115: /**
1116: * Retrieve the next element in the iterated set. See {@link java.util.Iterator}
1117: * for details.
1118: *
1119: * @return the next element or <code>null</code> if there is none
1120: */
1121: public void remove() {
1122: if (_currIndex < _iterators.length) {
1123: Iterator iter = (Iterator) _iterators[_currIndex];
1124: iter.remove();
1125: }
1126: }
1127:
1128: // members
1129: private StandardTokenizerProperties _parent = null;
1130: private Object[] _iterators = null;
1131: private int _currIndex = -1;
1132: }
1133:
1134: /**
1135: * Instances of this inner class are returned when a call to {@link TokenizerProperties#getKeywords}
1136: * or {@link TokenizerProperties#getPatterns}.
1137: * Each element of the enumeration contains a {@link TokenizerProperty} element,
1138: * that in turn has the keyword or a pattern with its companion
1139: */
1140: final class MapIterator implements Iterator {
1141:
1142: /**
1143: * constructor taking the a case-sensitive and a case-insensitive {@link java.util.Map}
1144: * which are iterated by this <code>MapIterator</code> instance.
1145: *
1146: * @param caseSensitiveMap map with properties where case matters
1147: * @param caseSensitiveMap map with properties where case doesn't matter
1148: */
1149: public MapIterator(StandardTokenizerProperties parent,
1150: Map caseSensitiveMap, Map caseInsensitiveMap) {
1151: synchronized (this ) {
1152: _parent = parent;
1153: if (caseSensitiveMap != null) {
1154: _iterators[0] = caseSensitiveMap.values().iterator();
1155: }
1156: if (caseInsensitiveMap != null) {
1157: _iterators[1] = caseInsensitiveMap.values().iterator();
1158: }
1159: }
1160: }
1161:
1162: /**
1163: * the well known method from the {@link java.util.Iterator} interface.
1164: *
1165: * @return <code>true</code> if there are more {@link TokenizerProperty}
1166: * elements, <code>false</code> otherwise
1167: */
1168: public boolean hasNext() {
1169: // check the current array
1170: synchronized (_iterators) {
1171: if (_iterators[0] != null) {
1172: if (_iterators[0].hasNext()) {
1173: return true;
1174: } else {
1175: _iterators[0] = null;
1176: }
1177: }
1178: if (_iterators[1] != null) {
1179: if (_iterators[1].hasNext()) {
1180: return true;
1181: } else {
1182: _iterators[1] = null;
1183: }
1184: }
1185: return false;
1186: }
1187: }
1188:
1189: /**
1190: * Retrieve the next {@link TokenizerProperty} in this enumeration.
1191: *
1192: * @return the next keyword as a <code>TokenizerProperty</code>
1193: * @throws NoSuchElementException if there is no more element in this iterator
1194: */
1195: public Object next() {
1196: if (!hasNext()) {
1197: throw new NoSuchElementException();
1198: }
1199:
1200: synchronized (this ) {
1201: if (_iterators[0] != null) {
1202: _currentData = (TokenizerProperty) _iterators[0].next();
1203: } else {
1204: _currentData = (TokenizerProperty) _iterators[1].next();
1205: }
1206: return _currentData;
1207: }
1208: }
1209:
1210: /**
1211: * This method is similar to {@link Tokenizer#removeKeyword}.
1212: *
1213: * @throws IllegalStateExcpetion if {@link #next} has not been called before or
1214: * <code>remove</code> has been called already after the last <code>next</code>.
1215: */
1216: public void remove() {
1217: synchronized (this ) {
1218: // if current element is not set
1219: if (_currentData == null) {
1220: throw new IllegalStateException();
1221: }
1222:
1223: if (_iterators[0] != null) {
1224: _iterators[0].remove();
1225: } else {
1226: _iterators[1].remove();
1227: }
1228: _parent.notifyListeners(new TokenizerPropertyEvent(
1229: TokenizerPropertyEvent.PROPERTY_REMOVED,
1230: _currentData));
1231: _currentData = null;
1232: }
1233: }
1234:
1235: // members
1236: private StandardTokenizerProperties _parent = null;
1237: private Iterator[] _iterators = new Iterator[2];
1238: private TokenizerProperty _currentData = null;
1239: }
1240:
1241: /**
1242: * Iterator for comments, strings and special sequences.
1243: * Instances of this inner class are returned when a call to one of the methods
1244: *<ul><li>
1245: * {@link #getBlockComments}
1246: *</li><li>
1247: * {@link #getLineComments}
1248: *</li><li>
1249: * {@link #getStrings}
1250: *</li><li>
1251: * {@link #getSpecialSequences}
1252: *</li></ul>
1253: * is done. Each element of the enumeration contains a {@link TokenizerProperty}
1254: * element, that in turn has the comment, special sequence etc. together with
1255: * its companion
1256: */
1257: final class SpecialSequencesIterator implements Iterator {
1258:
1259: /**
1260: * constructor taking the calling <code>Tokenizer</code> and the type of the
1261: * {@link TokenizerProperty}. If the type is 0 then special sequences, line and
1262: * block comments are returned in one iterator
1263: *
1264: * @param parent the calling tokenizer
1265: * @param stores which array of {@link de.susebox.jtopas.impl.SequenceStore} to use
1266: * @param type type of the <code>TokenizerProperty</code>
1267: */
1268: public SpecialSequencesIterator(StandardTokenizerProperties parent,
1269: SequenceStore[] stores, int type) {
1270: _type = type;
1271: _parent = parent;
1272: _stores = stores;
1273: }
1274:
1275: /**
1276: * the well known method from the {@link java.util.Iterator} interface.
1277: *
1278: * @return <code>true</code> if there are more {@link TokenizerProperty}
1279: * elements, <code>false</code> otherwise
1280: */
1281: public boolean hasNext() {
1282: synchronized (this ) {
1283: if (_currentIterator != null && _currentIterator.hasNext()) {
1284: return true;
1285: }
1286:
1287: while (_stores != null && ++_currentIndex < _stores.length) {
1288: if (_stores[_currentIndex] != null) {
1289: _currentIterator = _stores[_currentIndex]
1290: .getSpecialSequences(_type);
1291: if (_currentIterator.hasNext()) {
1292: return true;
1293: }
1294: }
1295: }
1296: return false;
1297: }
1298: }
1299:
1300: /**
1301: * Retrieve the next {@link TokenizerProperty} in this enumeration.
1302: *
1303: * @return a {@link TokenizerProperty} of the desired type or <code>null</code>
1304: * @throws NoSuchElementException if there is no more element in this iterator
1305: */
1306: public Object next() throws NoSuchElementException {
1307: synchronized (this ) {
1308: if (!hasNext()) {
1309: throw new NoSuchElementException();
1310: }
1311: _currentElement = (TokenizerProperty) _currentIterator
1312: .next();
1313: return _currentElement;
1314: }
1315: }
1316:
1317: /**
1318: * Remove the current special sequence entry from the collection. This is an
1319: * alternative to {@link Tokenizer#removeSpecialSequence}.
1320: *
1321: * @throws IllegalStateExcpetion if {@link #next} has not been called before or
1322: * <code>remove</code> has been called already after the last <code>next</code>.
1323: */
1324: public void remove() throws IllegalStateException {
1325: synchronized (this ) {
1326: // if current element is not set
1327: if (_currentElement == null) {
1328: throw new IllegalStateException();
1329: }
1330:
1331: // remove current element
1332: try {
1333: _currentIterator.remove();
1334: _parent.notifyListeners(new TokenizerPropertyEvent(
1335: TokenizerPropertyEvent.PROPERTY_REMOVED,
1336: _currentElement));
1337: _currentElement = null;
1338: } catch (Exception ex) {
1339: throw new ExtRuntimeException(ex,
1340: "While trying to remove current element of a SpecialSequencesIterator.");
1341: }
1342: }
1343: }
1344:
1345: // members
1346: private StandardTokenizerProperties _parent = null;
1347: private SequenceStore[] _stores = null;
1348: private TokenizerProperty _currentElement = null;
1349: private Iterator _currentIterator = null;
1350: private int _currentIndex = -1;
1351: private int _type = Token.UNKNOWN;
1352: }
1353:
1354: /**
1355: * An {@link java.util.Iterator} for pattern.
1356: */
1357: final class PatternIterator implements Iterator {
1358: /**
1359: * constructor taking the calling {@link TokenizerProperties} object.
1360: *
1361: * @param parent the caller
1362: */
1363: public PatternIterator(StandardTokenizerProperties parent) {
1364: _parent = parent;
1365: synchronized (parent._patterns) {
1366: _iterator = parent._patterns.iterator();
1367: }
1368: }
1369:
1370: /**
1371: * the well known method from the {@link java.util.Iterator} interface.
1372: *
1373: * @return <code>true</code> if there are more {@link TokenizerProperty}
1374: * elements, <code>false</code> otherwise
1375: */
1376: public boolean hasNext() {
1377: return _iterator.hasNext();
1378: }
1379:
1380: /**
1381: * Retrieve the next {@link TokenizerProperty} in this enumeration.
1382: *
1383: * @return the next keyword as a <code>TokenizerProperty</code>
1384: * @throws NoSuchElementException if there is no more element in this iterator
1385: */
1386: public Object next() throws NoSuchElementException {
1387: synchronized (this ) {
1388: _currentData = (PatternMatcher) _iterator.next();
1389: return _currentData.getProperty();
1390: }
1391: }
1392:
1393: /**
1394: * This method is similar to {@link Tokenizer#removeKeyword}
1395: */
1396: public void remove() {
1397: synchronized (this ) {
1398: _iterator.remove();
1399: _parent.notifyListeners(new TokenizerPropertyEvent(
1400: TokenizerPropertyEvent.PROPERTY_REMOVED,
1401: _currentData.getProperty()));
1402: }
1403: }
1404:
1405: // members
1406: private StandardTokenizerProperties _parent = null;
1407: private Iterator _iterator = null;
1408: private PatternMatcher _currentData = null;
1409: }
|