001: /*
002: * StandardTokenizer.java: core class for lexical parser.
003: *
004: * Copyright (C) 2001 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import de.susebox.java.lang.ExtIndexOutOfBoundsException;
037: import de.susebox.jtopas.spi.DataProvider;
038:
039: //-----------------------------------------------------------------------------
040: // Class StandardTokenizer
041: //
042:
043: /**<p>
044: * This is the mainstream {@link Tokenizer}. It implements the {@link Tokenizer}
045: * interface in a straightforward approach without too specialized parse
046: * optimizations.
047: * </p><p>
048: * Beside the {@link Tokenizer} interface, the class <code>StandardTokenizer</code>
049: * provides some basic features for cascading (nested) tokenizers. Consider the usual
050: * HTML pages found today in the WWW. Most of them are a mixture of regular HTML,
051: * cascading style sheets (CSS) and embedded JavaScript. These different languages
052: * use different syntaxes, so one needs various tokenizers on the same input stream.
053: *</p><p>
054: * This {@link Tokenizer} implementation is not synchronized. Take care when using
055: * with multiple threads.
056: *</p>
057: *
058: * @see Tokenizer
059: * @see TokenizerProperties
060: * @author Heiko Blau
061: */
062: public class StandardTokenizer extends AbstractTokenizer implements
063: Tokenizer, TokenizerPropertyListener {
064: //---------------------------------------------------------------------------
065: // Constructors
066: //
067:
/**
 * Creates a tokenizer without an explicitly supplied {@link TokenizerProperties}
 * instance. The constructor performs no configuration of its own; the initial
 * settings are whatever the base class constructor establishes.
 *<br>
 * As documented for this class, the resulting control flags are appropriate
 * for C/C++ and Java: found token images are copied, neither line nor column
 * information is provided and nested comments are not allowed. The tokenizer
 * uses {@link TokenizerProperties#DEFAULT_WHITESPACES} and
 * {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and separator
 * handling.
 */
public StandardTokenizer() {
  super();
}
079:
/**
 * Constructs a <code>StandardTokenizer</code> that is backed by the given
 * {@link TokenizerProperties} instance.
 *
 * @param properties  the {@link TokenizerProperties} object containing the
 *                    settings for the tokenizing process
 */
public StandardTokenizer(TokenizerProperties properties) {
  super();
  // call the base class implementation directly, bypassing any override
  super.setTokenizerProperties(properties);
}
090:
091: //---------------------------------------------------------------------------
092: // Methods of the Tokenizer interface
093: //
094:
095: /**
096: * This method returns the absolute offset in characters to the start of the
097: * parsed stream. See the method description in {@link Tokenizer}.
098: *
099: * @return the absolute offset of the current text window in characters from
100: * the start of the data source of the Tokenizer
101: * @see #getReadPosition
102: */
103: public int getRangeStart() {
104: return _rangeStart;
105: }
106:
107: /**
108: * Additionally to the common behaviour implemented in
109: * {@link #de.susebox.jtopas.AbstractTokenizer#setSource}, this method ajusts
110: * the state speicific to the <code>StandardTokenizer</code> class.
111: *
112: * @param source a {@link TokenizerSource} to read data from
113: */
114: public void setSource(TokenizerSource source) {
115: super .setSource(source);
116: _hasBeenRead = false;
117: _rangeStart = 0;
118: try {
119: _charSequenceTokenizerSource = (CharSequenceTokenizerSource) getSource();
120: _dataProvider = new StringDataProvider(
121: _charSequenceTokenizerSource, 0, 0);
122: } catch (ClassCastException ex) {
123: _charSequenceTokenizerSource = null;
124: _dataProvider = new CharArrayDataProvider(_inputBuffer, 0,
125: 0);
126: }
127: }
128:
129: /**
130: * Closing this tokenizer frees resources.
131: */
132: public void close() {
133: _inputBuffer = null;
134: _rangeStart = 0;
135: _hasBeenRead = false;
136: _charSequenceTokenizerSource = null;
137: _dataProvider = null;
138: super .close();
139: }
140:
141: //---------------------------------------------------------------------------
142: // Implementation
143: //
144:
145: /**
146: * Implements the abstract method of the base class.
147: *
148: * @param startPos position in the input data
149: * @param length number of characters
150: */
151: protected DataProvider getDataProvider(int startPos, int length) {
152: _dataProvider.setDataRange(startPos - getRangeStart(), length);
153: return _dataProvider;
154: }
155:
156: /**
157: * This method organizes the input buffer. It moves the current text window if
158: * nessecary or allocates more space, if data should be kept completely (see the
159: * {@link TokenizerProperties#F_KEEP_DATA} flag).
160: * Its main purpose is to call the {@link TokenizerSource#read} method.
161: *
162: * @return number of read bytes or -1 if an end-of-file condition occured
163: * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
164: * method
165: */
166: protected int readMoreData() throws TokenizerException {
167: if (_charSequenceTokenizerSource != null) {
168: // new CharSequenceTokenizerSource
169: if (_hasBeenRead
170: || _charSequenceTokenizerSource.length() <= 0) {
171: return -1;
172: } else {
173: _hasBeenRead = true;
174: return _charSequenceTokenizerSource.length();
175: }
176:
177: } else {
178: // no input buffer so far
179: if (_inputBuffer == null) {
180: if (isFlagSet(Flags.F_KEEP_DATA)) {
181: _inputBuffer = new char[LARGE_BUFFER_INITSIZE]; // 64k
182: } else {
183: _inputBuffer = new char[SMALL_BUFFER_INITSIZE]; // 8k
184: }
185: ((CharArrayDataProvider) _dataProvider)
186: .setData(_inputBuffer);
187: }
188:
189: // this is a good moment to move already read data if the write position is
190: // near the end of the buffer and there is a certain space before the current
191: // read position
192: int readPos = getReadPosition() - getRangeStart();
193: int writePos = currentlyAvailable();
194:
195: if (!isFlagSet(Flags.F_KEEP_DATA)) {
196: if ((readPos > _inputBuffer.length / 4)
197: && (writePos > (3 * _inputBuffer.length) / 4)) {
198: reorganizeInputBuffer(_inputBuffer);
199: writePos = currentlyAvailable();
200: }
201: }
202:
203: // if there is no space any more and data couldn't be moved (see above)
204: // we need a new input buffer
205: if (writePos >= _inputBuffer.length) {
206: _inputBuffer = reorganizeInputBuffer(new char[_inputBuffer.length * 2]);
207: writePos = currentlyAvailable();
208: ((CharArrayDataProvider) _dataProvider)
209: .setData(_inputBuffer);
210: }
211:
212: // read data
213: int chars = 0;
214:
215: while (chars == 0) {
216: try {
217: chars = getSource().read(_inputBuffer, writePos,
218: _inputBuffer.length - writePos);
219: } catch (Exception ex) {
220: throw new TokenizerException(ex);
221: }
222: }
223: return chars;
224: }
225: }
226:
227: /**
228: * Move data in the input buffer and adjust various position values.
229: */
230: private char[] reorganizeInputBuffer(char[] newBuffer) {
231: int readPos = getReadPosition() - getRangeStart();
232: int writePos = currentlyAvailable();
233:
234: if (!isFlagSet(Flags.F_KEEP_DATA)) {
235: System.arraycopy(_inputBuffer, readPos, newBuffer, 0,
236: writePos - readPos);
237: _rangeStart += readPos;
238: } else {
239: System.arraycopy(_inputBuffer, 0, newBuffer, 0, writePos);
240: }
241: return newBuffer;
242: }
243:
244: //---------------------------------------------------------------------------
245: // Inner classes
246: //
247:
248: /**
249: * Base class for the various implementations of the
250: * {@link de.susebox.jtopas.spi.DataProvider} interface for the {@link StandardTokenizer}.
251: */
252: private abstract class AbstractDataProvider implements DataProvider {
253:
254: /**
255: * The constructor takes the nessecary parameters for the methods defined
256: * below
257: *
258: * @param startPosition valid data start here
259: * @param length count of characters starting at startPosition
260: */
261: public AbstractDataProvider(int startPosition, int length) {
262: setDataRange(startPosition, length);
263: }
264:
265: /**
266: * Retrieving the position where the data to analyze start in the buffer provided
267: * by {@link #getData}. The calling {@link de.susebox.jtopas.spi.DataMapper}
268: * must not access data prior to this index in the character array.
269: *
270: * @return index in the character array returned by {@link #getData}, where data starts
271: */
272: public int getStartPosition() {
273: return _startPosition;
274: }
275:
276: /**
277: * Retrieving the maximum number of characters in the array provided by {@link getData}
278: * that can be analyzed by the calling {@link de.susebox.jtopas.spi.DataMapper}.
279: *
280: * @param testChar check this character
281: * @return <code>true</code> if the given character is a separator,
282: * <code>false</code> otherwise
283: */
284: public int getLength() {
285: return _length;
286: }
287:
288: /**
289: * Setting the start position and the length in the data buffer of this
290: * instance.
291: *
292: * @param startPosition valid data start here
293: * @param length count of characters starting at startPosition
294: */
295: protected void setDataRange(int startPosition, int length) {
296: _startPosition = startPosition;
297: _length = length;
298: }
299:
300: // Members
301: protected int _startPosition;
302: protected int _length;
303: }
304:
305: /**
306: * Implementation of the {@link de.susebox.jtopas.spi.DataProvider} interface
307: * for the {@link StandardTokenizer}.
308: */
309: private final class CharArrayDataProvider extends
310: AbstractDataProvider implements DataProvider {
311:
312: /**
313: * The constructor takes the nessecary parameters for the methods defined
314: * below
315: */
316: public CharArrayDataProvider(char[] data, int startPosition,
317: int length) {
318: super (startPosition, length);
319: setData(data);
320: _dataAsString = null;
321: }
322:
323: /**
324: * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
325: *
326: * @param index an index between 0 and {@link #getLength}
327: * @return the character at the given position
328: */
329: public char getCharAt(int index) {
330: return _data[_startPosition + index];
331: }
332:
333: /**
334: * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
335: *
336: * @return the character buffer to read data from
337: */
338: public char[] getData() {
339: return _data;
340: }
341:
342: /**
343: * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
344: *
345: * @return a copy of the valid data of this {@link DataProvider}
346: * @see #getData
347: * @see #toString
348: */
349: public char[] getDataCopy() {
350: char[] copy = new char[getLength()];
351:
352: System.arraycopy(_data, getStartPosition(), copy, 0,
353: copy.length);
354: return copy;
355: }
356:
357: /**
358: * Returning the valid data range of this <code>DataProvider</code> as a string.
359: * This method is an alternative to {@link #getDataCopy}.
360: *
361: * @return the string representation of the valid data range
362: */
363: public String toString() {
364: if (_dataAsString == null) {
365: if (_data != null) {
366: _dataAsString = new String(_data, _startPosition,
367: _length);
368: } else {
369: _dataAsString = "";
370: }
371: }
372: return _dataAsString;
373: }
374:
375: /**
376: * Setting the data buffer of this instance.
377: */
378: protected void setData(char[] data) {
379: _data = data;
380: }
381:
382: /**
383: * Setting the start position and the length in the data buffer of this
384: * instance.
385: *
386: * @param startPosition valid data start here
387: * @param length count of characters starting at startPosition
388: */
389: protected void setDataRange(int startPosition, int length) {
390: super .setDataRange(startPosition, length);
391: _dataAsString = null;
392: }
393:
394: // Members
395: private char[] _data;
396: private String _dataAsString;
397: }
398:
399: /**
400: * Implementation of the {@link de.susebox.jtopas.spi.DataProvider}
401: * interface for {@link CharSequenceTokenizerSource} sources.
402: */
403: private final class StringDataProvider extends AbstractDataProvider
404: implements DataProvider {
405:
406: /**
407: * The constructor takes the nessecary parameters for the methods defined
408: * below
409: */
410: public StringDataProvider(CharSequenceTokenizerSource source,
411: int startPosition, int length) {
412: super (startPosition, length);
413: setData(source);
414: }
415:
416: //---------------------------------------------------------------------------
417: // methods of the DataProvider interface
418: //
419:
420: /**
421: * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
422: *
423: * @param index the index of the character starting from {@link #getStartPosition}
424: * @return the character at the given position
425: */
426: public char getCharAt(int index) {
427: return _source.charAt(_startPosition + index);
428: }
429:
430: /**
431: * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
432: *
433: * @return the character buffer to read data from
434: */
435: public char[] getData() {
436: return _source.toString().toCharArray();
437: }
438:
439: /**
440: * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
441: *
442: * @return a copy of the valid data of this {@link DataProvider}
443: * @see #getData
444: * @see #toString
445: */
446: public char[] getDataCopy() {
447: return toString().toCharArray();
448: }
449:
450: /**
451: * Returning the valid data range of this <code>DataProvider</code> as a string.
452: * This method is an alternative to {@link #getDataCopy}.
453: *
454: * @return the string representation of the valid data range
455: */
456: public String toString() {
457: return _source.subSequence(_startPosition,
458: _startPosition + _length).toString();
459: }
460:
461: /**
462: * Setting the data source of this instance.
463: */
464: protected void setData(CharSequenceTokenizerSource source) {
465: _source = source;
466: }
467:
468: // Members
469: private CharSequenceTokenizerSource _source;
470: }
471:
472: //---------------------------------------------------------------------------
473: // Class members
474: //
475:
476: /**
477: * Buffer sizes
478: */
479: private static final int SMALL_BUFFER_INITSIZE = 0x2000; // 8K
480: private static final int LARGE_BUFFER_INITSIZE = 0x10000; // 64K
481:
482: //---------------------------------------------------------------------------
483: // Members
484: //
485:
486: /**
487: * This buffer holds the currently read data. Dont use a buffered reader, since
488: * we do buffering here.
489: */
490: protected char[] _inputBuffer = null;
491:
492: /**
493: * Mapping of index 0 of {@link #_inputBuffer} to the absolute start of the
494: * input stream.
495: */
496: protected int _rangeStart = 0;
497:
498: /**
499: * Flag used in conjunction with the {@link #_charSequenceTokenizerSource}.
500: */
501: protected boolean _hasBeenRead = false;
502:
503: /**
504: * If a {@link CharSequenceTokenizerSource} is used, this member is set to
505: * it.
506: */
507: protected CharSequenceTokenizerSource _charSequenceTokenizerSource = null;
508:
509: /**
510: * The {@link de.susebox.jtopas.spi.DataProvider} instance for this object.
511: * This instance is kept due to a significant performance boost compared with
512: * construction of a <code>DataProvider</code> every time {@link #getDataProvider}
513: * is called.
514: */
515: protected AbstractDataProvider _dataProvider = null;
516: }
|