001: /*
002: * StandardSequenceHandler.java: simple implementation of SequenceHandler
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas.spi;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import java.util.Iterator;
037:
038: import de.susebox.jtopas.TokenizerProperty;
039: import de.susebox.jtopas.TokenizerProperties;
040: import de.susebox.jtopas.TokenizerException;
041:
042: //-----------------------------------------------------------------------------
043: // Class StandardSequenceHandler
044: //
045:
046: /**<p>
047: * Simple implementation of the {@link SequenceHandler} interface. This class
048: * works only with the {@link de.susebox.jtopas.TokenizerProperties} interface
049: * methods and is aware of changes in these properties. It does not cache any
050: * information and is therefore a more or less slow way to handle special sequences.
051: *</p><p>
052: * This class is a bridge between arbitrary {@link de.susebox.jtopas.Tokenizer}
053: * implementations using the SPI interface {@link SequenceHandler} and any
054: * {@link de.susebox.jtopas.TokenizerProperties} implementation that does not
055: * implement the <code>SequenceHandler</code> interface itself.
056: *</p>
057: *
058: * @see SequenceHandler
059: * @see de.susebox.jtopas.Tokenizer
060: * @see de.susebox.jtopas.TokenizerProperties
061: * @author Heiko Blau
062: */
063: public class StandardSequenceHandler implements SequenceHandler {
064:
065: /**
066: * The constructor takes the {@link de.susebox.jtopas.TokenizerProperties}
067: * that provide the special sequences.
068: *
069: * @param props the {@link de.susebox.jtopas.TokenizerProperties} to take the
070: * sequences from
071: */
072: public StandardSequenceHandler(TokenizerProperties props) {
073: _properties = props;
074: }
075:
076: /**
077: * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
078: * for a fast detection if special sequence checking must be performed at all.
079: * If the method returns <code>false</code> time-consuming preparations can be
080: * skipped.
081: *
082: * @return <code>true</code> if there actually are pattern that can be tested
083: * for a match, <code>false</code> otherwise.
084: */
085: public boolean hasSequenceCommentOrString() {
086: if (_properties != null) {
087: return _properties.getSpecialSequences().hasNext();
088: } else {
089: return false;
090: }
091: }
092:
093: /**
094: * This method checks if a given range of data starts with a special sequence,
095: * a comment or a string. These three types of token are tested together since
096: * both comment and string prefixes are ordinary special sequences. Only the
097: * actions preformed <strong>after</strong> a string or comment has been detected,
098: * are different.
099: *<br>
100: * The method returns <code>null</code> if no special sequence, comment or string
101: * could matches the the leading part of the data range given through the
102: * {@link DataProvider}.
103: *
104: * @param dataProvider the source to get the data range from
105: * @return a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence,
106: * comment or string could be detected, <code>null</code> otherwise
107: * @throws NullPointerException if no {@link DataProvider} is given
108: */
109: public TokenizerProperty startsWithSequenceCommentOrString(
110: DataProvider dataProvider) throws NullPointerException {
111: TokenizerProperty prop = null;
112:
113: if (_properties != null) {
114: String data = dataProvider.toString();
115:
116: prop = getLongestMatch(data, _properties
117: .getSpecialSequences(), prop);
118: prop = getLongestMatch(data, _properties.getLineComments(),
119: prop);
120: prop = getLongestMatch(data,
121: _properties.getBlockComments(), prop);
122: prop = getLongestMatch(data, _properties.getStrings(), prop);
123: }
124: return prop;
125: }
126:
127: /**
128: * This method returns the length of the longest special sequence, comment or
129: * string prefix that is known to this <code>SequenceHandler</code>. When
130: * calling {@link #startsWithSequenceCommentOrString}, the passed {@link DataProvider}
131: * parameter will supply at least this number of characters (see {@link DataProvider#getLength}).
132: * If less characters are provided, EOF is reached.
133: *<br>
134: * The method is an easy approach to the problem of how to provide more data
135: * in case a test runs out of characters. The invoking {@link de.susebox.jtopas.Tokenizer}
136: * (represented by the given {@link DataProvider}) can supply enough data for
137: * the {@link #startsWithSequenceCommentOrString} method.
138: *
139: * @return the number of characters needed in the worst case to identify a
140: * special sequence
141: */
142: public int getSequenceMaxLength() {
143: int maxLength = 0;
144:
145: if (_properties != null) {
146: maxLength = getSequenceMaxLength(_properties
147: .getSpecialSequences(), maxLength);
148: maxLength = getSequenceMaxLength(_properties
149: .getLineComments(), maxLength);
150: maxLength = getSequenceMaxLength(_properties
151: .getBlockComments(), maxLength);
152: maxLength = getSequenceMaxLength(_properties.getStrings(),
153: maxLength);
154: }
155: return maxLength;
156: }
157:
158: /**
159: * Retrieving the maximum length of a {@link TokenizerProperty} from an
160: * {@link java.util.Iterator}.
161: *
162: * @param iter a initialized {@link java.util.Iterator} to walk through
163: * @param currentMax the currently known maximum length
164: * @return the maximum length of the {@link TokenizerProperty} images in the
165: * iterator
166: */
167: private int getSequenceMaxLength(Iterator iter, int currentMax) {
168: while (iter.hasNext()) {
169: TokenizerProperty prop = (TokenizerProperty) iter.next();
170: int len = prop.getImages()[0].length();
171:
172: if (len > currentMax) {
173: currentMax = len;
174: }
175: }
176: return currentMax;
177: }
178:
179: /**
180: * Retrieving the longest {@link TokenizerProperty} that matches the start of
181: * the given string.
182: *
183: * @param data check the start of this string
184: * @param iter a initialized {@link java.util.Iterator} to walk through
185: * @param currentMatch the currently known longest match
186: * @return the longest matching {@link TokenizerProperty} or <code>null</code>
187: */
188: private TokenizerProperty getLongestMatch(String data,
189: Iterator iter, TokenizerProperty currentMatch) {
190: int currentMax = (currentMatch != null) ? currentMatch
191: .getImages()[0].length() : 0;
192: TokenizerProperty retProp = currentMatch;
193:
194: while (iter.hasNext()) {
195: TokenizerProperty prop = (TokenizerProperty) iter.next();
196: int len = prop.getImages()[0].length();
197:
198: if (len > currentMax) {
199: currentMax = len;
200: retProp = prop;
201: }
202: }
203: return retProp;
204: }
205:
206: //---------------------------------------------------------------------------
207: // Members
208: //
209:
210: /**
211: * The {@link TokenizerProperties} that provide the sequences and the
212: * control flags.
213: */
214: private TokenizerProperties _properties = null;
215: }
|