001: /*
002: * Tokenizer.java: lexical parser interface.
003: *
004: * Copyright (C) 2001 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import de.susebox.jtopas.spi.WhitespaceHandler;
037: import de.susebox.jtopas.spi.SeparatorHandler;
038: import de.susebox.jtopas.spi.KeywordHandler;
039: import de.susebox.jtopas.spi.SequenceHandler;
040: import de.susebox.jtopas.spi.PatternHandler;
041:
042: //-----------------------------------------------------------------------------
043: // Interface Tokenizer
044: //
045:
046: /**<p>
047: * The interface <code>Tokenizer</code> contains setup methods, parse operations
048: * and other getter and setter methods for a tokenizer. A tokenizer splits a
049: * stream of input data into various units like whitespaces, comments, keywords
050: * etc. These units are the tokens that are reflected in the {@link Token} class
051: * of the <code>de.susebox.jtopas</code> package.
052: *</p><p>
053: * A <code>Tokenizer</code> is configured using a {@link TokenizerProperties}
054: * object that contains declarations for whitespaces, separators, comments,
055: * keywords, special sequences and patterns. It is designed to enable a common
056: * approach for parsing texts like program code, annotated documents like HTML
057: * and so on.
058: *</p><p>
 * To detect links in an HTML document, a tokenizer would be invoked like this
060: * (see {@link StandardTokenizerProperties} and {@link StandardTokenizer} for the
061: * classes mentioned here):
062: *<blockquote><pre>
063: *
064: * Vector links = new Vector();
065: * FileReader reader = new FileReader("index.html");
066: * TokenizerProperties props = new StandardTokenizerProperties();
067: * Tokenizer tokenizer = new StandardTokenizer();
068: * Token token;
069: *
 *   props.setParseFlags(TokenizerProperties.F_NO_CASE);
071: * props.setSeparators("=");
072: * props.addString("\"", "\"", "\\");
073: * props.addBlockComment(">", "<");
074: * props.addKeyword("HREF");
075: *
076: * tokenizer.setTokenizerProperties(props);
077: * tokenizer.setSource(new ReaderSource(reader));
078: *
079: * try {
080: * while (tokenizer.hasMoreToken()) {
081: * token = tokenizer.nextToken();
082: * if (token.getType() == Token.KEYWORD) {
083: * tokenizer.nextToken(); // should be the '=' character
 *       links.addElement(tokenizer.nextImage());
085: * }
086: * }
087: * } finally {
088: * tokenizer.close();
089: * reader.close();
090: * }
091: *
092: *</pre></blockquote>
 * This somewhat rough way of finding links should work fine on syntactically
 * correct HTML code. It finds common links as well as mail, ftp links etc. Note
 * the block comment: it starts with the ">" character, which is the closing
 * character of HTML tags, and ends with the "<" character, which opens them.
 * The effect is that all the real text is treated as a comment.
098: *</p><p>
 * To extract the contents of an HTML file, one would write:
100: *<blockquote><pre>
101: *
102: * StringBuffer contents = new StringBuffer(4096);
103: * FileReader reader = new FileReader("index.html");
104: * TokenizerProperties props = new StandardTokenizerProperties();
105: * Tokenizer tokenizer = new StandardTokenizer();
106: * Token token;
107: *
 *   props.setParseFlags(TokenizerProperties.F_NO_CASE);
 *   props.addBlockComment("<", ">");
 *   props.addBlockComment("<HEAD>", "</HEAD>");
 *   props.addBlockComment("<!--", "-->");
112: *
113: * tokenizer.setTokenizerProperties(props);
114: * tokenizer.setSource(new ReaderSource(reader));
115: *
116: * try {
117: * while (tokenizer.hasMoreToken()) {
118: * token = tokenizer.nextToken();
119: * if (token.getType() != Token.BLOCK_COMMENT) {
120: * contents.append(token.getToken());
121: * }
122: * }
123: * } finally {
124: * tokenizer.close();
125: * reader.close();
126: * }
127: *
128: *</pre></blockquote>
 * Here the block comment is the exact opposite of the first example. Now all the
 * HTML tags are skipped. Moreover, we declared the HTML header as a block
 * comment as well - the information from the header is thus skipped altogether.
132: *</p><p>
 * Parsing (tokenizing) follows a well-defined priority scheme. See
134: * {@link #nextToken} for details.
135: *</p><p>
 * NOTE: if a character sequence is registered for two categories of tokenizer
 * properties (e.g. as a line comment starting sequence as well as a special
 * sequence), the category with the higher priority wins (e.g. if the mentioned
 * sequence is found, it is interpreted as a line comment).
140: *</p><p>
 * The tokenizer interface is clearly designed for "readable" data, such as
 * ASCII or Unicode data. Parsing binary data has other characteristics that do not
143: * necessarily fit in a scheme of comments, keywords, strings, identifiers and
144: * operators.
145: *</p><p>
146: * Note that the interface has no methods that handle stream data sources. This
 * is left to the implementations that may have quite different data sources, e.g.
148: * {@link java.io.InputStreamReader}, database queries, string arrays etc. The
149: * interface {@link TokenizerSource} serves as an abstraction of such widely
150: * varying data sources.
151: *</p><p>
152: * The <code>Tokenizer</code> interface partly replaces the older
153: * {@link de.susebox.java.util.Tokenizer} interface which is deprecated.
154: *</p>
155: *
156: * @see Token
157: * @see TokenizerProperties
158: * @author Heiko Blau
159: */
160: public interface Tokenizer {
161:
162: //---------------------------------------------------------------------------
163: // data source
164: //
165:
166: /**
167: * Setting the source of data. This method is usually called during setup of
168: * the <code>Tokenizer</code> but may also be invoked while the tokenizing
 * is in progress. It will reset the tokenizer's input buffer, line and column
170: * counters etc.
171: *<br>
 * It is allowed to pass <code>null</code>: subsequent calls to {@link #hasMoreToken}
 * will return <code>false</code>, while {@link #nextToken} will return
 * an EOF token.
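 *<br>
 * A minimal sketch (it uses the {@link TokenizerSource} implementation
 * <code>ReaderSource</code> from the examples in the interface description;
 * any other implementation works the same way):
 *<blockquote><pre>
 *
 *   // exchange the data source; the tokenizer configuration is kept
 *   tokenizer.setSource(new ReaderSource(new java.io.FileReader("page2.html")));
 *
 *   while (tokenizer.hasMoreToken()) {
 *     System.out.println(tokenizer.nextImage());
 *   }
 *
 *</pre></blockquote>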
175: *
176: * @param source a {@link TokenizerSource} to read data from
177: * @see #getSource
178: */
179: public void setSource(TokenizerSource source);
180:
181: /**
182: * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
183: * method may return <code>null</code> if there is no <code>TokenizerSource</code>
184: * associated with this <code>Tokenizer</code>.
185: *
186: * @return the {@link TokenizerSource} associated with this <code>Tokenizer</code>
187: * @see #setSource
188: */
189: public TokenizerSource getSource();
190:
191: //---------------------------------------------------------------------------
192: // configuration
193: //
194:
195: /**
196: * Setting the tokenizer characteristics. This operation is usually done before
197: * the parse process. A common place is a constructor of a <code>Tokenizer</code>
198: * implementation. If the tokenizer characteristics change during the parse
 * process, they take effect with the next call to {@link #nextToken} or
200: * {@link #nextImage}. Usually, a <code>Tokenizer</code> implementation will
201: * also implement the {@link TokenizerPropertyListener} interface to be notified
202: * about property changes.
203: *<br>
204: * Generally, the <code>Tokenizer</code> implementation should also implement
205: * the {@link de.susebox.jtopas.spi.DataProvider} interface or provide an inner
206: * class that implements the <code>DataProvider</code> interface, while the
207: * {@link TokenizerProperties} implementation should in turn implement the
208: * interfaces
209: *<ul><li>
210: * {@link de.susebox.jtopas.spi.WhitespaceHandler},
211: *</li><li>
212: * {@link de.susebox.jtopas.spi.SeparatorHandler},
213: *</li><li>
214: * {@link de.susebox.jtopas.spi.SequenceHandler},
215: *</li><li>
216: * {@link de.susebox.jtopas.spi.KeywordHandler} and
217: *</li><li>
218: * {@link de.susebox.jtopas.spi.PatternHandler}
219: *</li></ul>
220: * These handler interfaces are collected in the {@link de.susebox.jtopas.spi.DataMapper}
221: * interface.
222: *<br>
 * Although implementing the mentioned interfaces is recommended, it is not
 * mandatory. The exception is {@link de.susebox.jtopas.spi.PatternHandler},
 * which must be implemented by the {@link TokenizerProperties} implementation,
 * since it is not possible for a <code>Tokenizer</code> to interpret a regular
 * expression pattern with only the information provided through the
 * <code>TokenizerProperties</code> interface.
229: *<br>
 * If a <code>Tokenizer</code> implementation chooses to use an exclusively tailored
231: * {@link TokenizerProperties} implementation, it should throw an
232: * {@link java.lang.IllegalArgumentException} if it is not provided with an
233: * instance of that {@link TokenizerProperties} implementation.
234: *<br>
 * If <code>null</code> is passed to the method, it throws a
 * {@link java.lang.NullPointerException}.
237: *
238: * @param props the {@link TokenizerProperties} for this tokenizer
 * @throws NullPointerException if <code>null</code> is passed to the call
240: * @throws IllegalArgumentException if the {@link TokenizerProperties} implementation
241: * of the parameter cannot be used with the implementation of this
242: * <code>Tokenizer</code>
243: * @see #getTokenizerProperties
244: */
245: public void setTokenizerProperties(TokenizerProperties props)
246: throws NullPointerException, IllegalArgumentException;
247:
248: /**
249: * Retrieving the current tokenizer characteristics. The method may return
250: * <code>null</code> if {@link #setTokenizerProperties} has not been called so
251: * far.
252: *
253: * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
254: * @see #setTokenizerProperties
255: */
256: public TokenizerProperties getTokenizerProperties();
257:
258: /**
 * Setting the control flags of this <code>Tokenizer</code>. Use a
260: * combination of the <code>F_...</code> flags declared in {@link TokenizerProperties}
261: * for the parameter. The <code>mask</code> parameter contains a bit mask of
262: * the <code>F_...</code> flags to change.
263: *<br>
264: * The parse flags for a tokenizer can be set through the associated
265: * {@link TokenizerProperties} instance. These global settings take effect in all
266: * <code>Tokenizer</code> instances that use the same <code>TokenizerProperties</code>
267: * object. Flags related to the parsing process can also be set separately
268: * for each tokenizer during runtime. These are the dynamic flags:
269: *<ul><li>
270: * {@link TokenizerProperties#F_RETURN_WHITESPACES} and its sub-flags
271: *</li><li>
272: * {@link TokenizerProperties#F_TOKEN_POS_ONLY}
273: *</li></ul>
 * Other flags can also be set for each tokenizer separately, but should be set
 * before tokenizing starts in order to take effect in a meaningful way:
276: *<ul><li>
277: * {@link TokenizerProperties#F_KEEP_DATA}
278: *</li><li>
279: * {@link TokenizerProperties#F_COUNT_LINES}
280: *</li></ul>
281: * The other flags should only be used on the <code>TokenizerProperties</code>
282: * instance or on single {@link TokenizerProperty} objects and influence all
283: * <code>Tokenizer</code> instances sharing the same <code>TokenizerProperties</code>
284: * object. For instance, using the flag {@link TokenizerProperties#F_NO_CASE}
285: * is an invalid operation on a <code>Tokenizer</code>. It affects the interpretation
286: * of keywords and sequences by the associated <code>TokenizerProperties</code>
287: * instance and, moreover, possibly the storage of these properties.
288: *<br>
289: * This method throws a {@link TokenizerException} if a flag is passed that cannot
290: * be handled by the <code>Tokenizer</code> object itself.
291: *<br>
292: * This method takes precedence over the {@link TokenizerProperties#setParseFlags}
293: * method of the associated <code>TokenizerProperties</code> object. Even if
294: * the global settings of one of the dynamic flags (see above) change after a
 * call to this method, the flags set separately for this tokenizer stay
296: * active.
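 *<br>
 * For instance, a sketch of switching on whitespace tokens for one tokenizer
 * only (the variable <code>tokenizer</code> is assumed to be an already
 * configured instance):
 *<blockquote><pre>
 *
 *   // affects only this tokenizer, not other tokenizers sharing the
 *   // same TokenizerProperties object
 *   tokenizer.changeParseFlags(TokenizerProperties.F_RETURN_WHITESPACES,
 *                              TokenizerProperties.F_RETURN_WHITESPACES);
 *
 *</pre></blockquote>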
297: *
298: * @param flags the parser control flags
299: * @param mask the mask for the flags to set or unset
300: * @throws TokenizerException if one or more of the flags given cannot be honored
301: * @see #getParseFlags
302: */
303: public void changeParseFlags(int flags, int mask)
304: throws TokenizerException;
305:
306: /**
307: * Retrieving the parser control flags. A bitmask containing the <code>F_...</code>
308: * constants is returned. This method returns both the flags that are set
309: * separately for this <code>Tokenizer</code> and the flags set for the
310: * associated {@link TokenizerProperties} object.
311: *
312: * @return the current parser control flags
313: * @see #changeParseFlags
314: */
315: public int getParseFlags();
316:
317: /**
318: * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
 * previously installed one. If <code>null</code> is passed, any installed
 * handler is removed and no keyword support is available.
321: *<br>
322: * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
323: * implement the {@link de.susebox.jtopas.spi.KeywordHandler} interface. If so,
324: * the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
 * instance as its <code>KeywordHandler</code>. A different handler, or one
 * specific to a certain <code>Tokenizer</code> instance, can be set using this
 * method.
327: *
328: * @param handler the (new) {@link de.susebox.jtopas.spi.KeywordHandler} to use
329: * or <code>null</code> to remove it
330: * @see #getKeywordHandler
331: * @see TokenizerProperties#addKeyword
332: */
333: public void setKeywordHandler(
334: de.susebox.jtopas.spi.KeywordHandler handler);
335:
336: /**
337: * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. The
338: * method may return <code>null</code> if there isn't any handler installed.
339: *
340: * @return the currently active {@link de.susebox.jtopas.spi.KeywordHandler}
341: * or <code>null</code>, if keyword support is switched off
342: * @see #setKeywordHandler
343: */
344: public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler();
345:
346: /**
347: * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing
348: * any previously installed one. If <code>null</code> is passed, the tokenizer
349: * will not recognize whitespaces.
350: *<br>
351: * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
352: * implement the {@link de.susebox.jtopas.spi.WhitespaceHandler} interface. If
353: * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
 * instance as its <code>WhitespaceHandler</code>. A different handler, or one
 * specific to a certain <code>Tokenizer</code> instance, can be set using this
 * method.
357: *
358: * @param handler the (new) whitespace handler to use or <code>null</code> to
359: * switch off whitespace handling
360: * @see #getWhitespaceHandler
361: * @see TokenizerProperties#setWhitespaces
362: */
363: public void setWhitespaceHandler(
364: de.susebox.jtopas.spi.WhitespaceHandler handler);
365:
366: /**
367: * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. The
 * method may return <code>null</code> if whitespaces are not recognized.
369: *
 * @return the currently active whitespace handler or <code>null</code>, if
 * whitespaces are not recognized by the tokenizer
372: * @see #setWhitespaceHandler
373: */
374: public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler();
375:
376: /**
377: * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
378: * previously installed <code>SeparatorHandler</code>. If <code>null</code> is
379: * passed, the tokenizer doesn't recognize separators.
380: *<br>
381: * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
382: * implement the {@link de.susebox.jtopas.spi.SeparatorHandler} interface. If
383: * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
 * instance as its <code>SeparatorHandler</code>. A different handler, or one
 * specific to a certain <code>Tokenizer</code> instance, can be set using this
 * method.
387: *
388: * @param handler the (new) separator handler to use or <code>null</code> to
389: * remove it
390: * @see #getSeparatorHandler
391: * @see TokenizerProperties#setSeparators
392: */
393: public void setSeparatorHandler(
394: de.susebox.jtopas.spi.SeparatorHandler handler);
395:
396: /**
397: * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. The
398: * method may return <code>null</code> if there isn't any handler installed.
399: *
400: * @return the currently active {@link de.susebox.jtopas.spi.SeparatorHandler}
401: * or <code>null</code>, if separators aren't recognized by the tokenizer
402: * @see #setSeparatorHandler
403: */
404: public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler();
405:
406: /**
407: * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
408: * previously installed one. If <code>null</code> is passed, the tokenizer will
409: * not recognize line and block comments, strings and special sequences.
410: *<br>
411: * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
412: * implement the {@link de.susebox.jtopas.spi.SequenceHandler} interface. If
413: * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
 * instance as its <code>SequenceHandler</code>. A different handler, or one
 * specific to a certain <code>Tokenizer</code> instance, can be set using this
 * method.
417: *
418: * @param handler the (new) {@link de.susebox.jtopas.spi.SequenceHandler} to
419: * use or <code>null</code> to remove it
420: * @see #getSequenceHandler
421: * @see TokenizerProperties#addSpecialSequence
422: * @see TokenizerProperties#addLineComment
423: * @see TokenizerProperties#addBlockComment
424: * @see TokenizerProperties#addString
425: */
426: public void setSequenceHandler(
427: de.susebox.jtopas.spi.SequenceHandler handler);
428:
429: /**
430: * Retrieving the current {@link de.susebox.jtopas.spi.SequenceHandler}. The
431: * method may return <code>null</code> if there isn't any handler installed.
432: *<br>
433: * A <code>SequenceHandler</code> deals with line and block comments, strings
434: * and special sequences.
435: *
436: * @return the currently active {@link de.susebox.jtopas.spi.SequenceHandler}
 * or <code>null</code>, if none is installed
438: * @see #setSequenceHandler
439: */
440: public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler();
441:
442: /**
443: * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
 * previously installed one. If <code>null</code> is passed, patterns are no
 * longer supported by the tokenizer.
446: *<br>
447: * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
448: * implement the {@link de.susebox.jtopas.spi.PatternHandler} interface. If
449: * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
 * instance as its <code>PatternHandler</code>. A different handler, or one
 * specific to a certain <code>Tokenizer</code> instance, can be set using this
 * method.
453: *
454: * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
455: * use or <code>null</code> to remove it
456: * @see #getPatternHandler
457: * @see TokenizerProperties#addPattern
458: */
459: public void setPatternHandler(
460: de.susebox.jtopas.spi.PatternHandler handler);
461:
462: /**
463: * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. The method
464: * may return <code>null</code> if there isn't any handler installed.
465: *
466: * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
467: * or <code>null</code>, if patterns are not recognized by the tokenizer
468: * @see #setPatternHandler
469: */
470: public de.susebox.jtopas.spi.PatternHandler getPatternHandler();
471:
472: //---------------------------------------------------------------------------
473: // tokenizer operations
474: //
475:
476: /**
 * Check if there are more tokens available. This method will return
 * <code>true</code> until an end-of-file condition is encountered during a
 * call to {@link #nextToken} or {@link #nextImage}.
 *<br>
 * That means the EOF token is returned once; afterwards <code>hasMoreToken</code>
 * will return <code>false</code>. This also implies that the method will return
 * <code>true</code> at least once, even if the input data stream is empty.
485: *<br>
486: * The method can be conveniently used in a while loop.
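 * For instance (a sketch; <code>tokenizer</code> is assumed to be configured
 * and connected to a data source as in the examples of the interface
 * description):
 *<blockquote><pre>
 *
 *   while (tokenizer.hasMoreToken()) {
 *     Token token = tokenizer.nextToken();
 *     System.out.println(token.getType() + ": " + tokenizer.currentImage());
 *   }
 *
 *</pre></blockquote>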
487: *
488: * @return <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
 * will succeed, <code>false</code> otherwise
490: */
491: public boolean hasMoreToken();
492:
493: /**
494: * Retrieving the next {@link Token}. The method works in this order:
495: *<ol><li>
 * Check for an end-of-file condition. If there is such a condition, return
 * the EOF token.
498: *</li><li>
 * Try to collect a sequence of whitespaces. If such a sequence is found,
 * return it if the flag <code>F_RETURN_WHITESPACES</code> is set, or skip
 * the whitespaces otherwise.
502: *</li><li>
 * Check the next characters against all known patterns. A pattern is usually
 * a regular expression as used by {@link java.util.regex.Pattern}, but
 * implementations of {@link de.susebox.jtopas.spi.PatternHandler} may use
 * other pattern syntaxes. Note that patterns are not recognized within
 * "normal" text (see below for a more precise description).
508: *</li><li>
509: * Check the next characters against all known line and block comments. If
 * a line or block comment starting sequence matches, return the comment if the
 * flag <code>F_RETURN_WHITESPACES</code> is set, or skip it otherwise.
 * If comments are returned, they include their starting and ending sequences
 * (the newline in case of a line comment).
514: *</li><li>
 * Check the next characters against all known string starting sequences. If
 * the beginning of a string is identified, return the string up to and
 * including the closing sequence.
518: *</li><li>
 * Check the next characters against all known special sequences, finding the
 * longest possible match. If a special sequence is identified, return it.
522: *</li><li>
 * Check for ordinary separators. If one is found, return it.
524: *</li><li>
 * Check the next characters against all known keywords. If a keyword is
 * identified, return it.
527: *</li><li>
 * Return the text portion up to the next whitespace, comment, special
 * sequence or separator. Note that patterns are not recognized within "normal"
 * text. A pattern match is therefore always preceded by a whitespace, comment,
 * special sequence, separator or another pattern match, or it starts at
 * position 0 of the data.
533: *</li></ol>
534: * The method will return the EOF token as long as {@link #hasMoreToken} returns
535: * <code>false</code>. It will not return <code>null</code> in such conditions.
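 *<br>
 * For example, a sketch of a loop that reacts to some of the token types
 * (<code>tokenizer</code> is assumed to be configured and connected to a
 * data source):
 *<blockquote><pre>
 *
 *   while (tokenizer.hasMoreToken()) {
 *     Token token = tokenizer.nextToken();
 *
 *     switch (token.getType()) {
 *     case Token.KEYWORD:
 *       System.out.println("keyword: " + tokenizer.currentImage());
 *       break;
 *     case Token.LINE_COMMENT:
 *     case Token.BLOCK_COMMENT:
 *       // comments are only returned if F_RETURN_WHITESPACES is set
 *       break;
 *     default:
 *       System.out.println("other token: " + tokenizer.currentImage());
 *     }
 *   }
 *
 *</pre></blockquote>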
536: *
537: * @return found {@link Token} including the EOF token
538: * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
539: * (IOExceptions for instance)
540: * @see #nextImage
541: */
542: public Token nextToken() throws TokenizerException;
543:
544: /**
 * This is a convenience method. It returns only the next token image, without
 * any information about its type or position. It is especially useful if the
 * parse flags for this <code>Tokenizer</code> have the flag
 * {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this method returns
 * a valid string even in that case.
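 *<br>
 * For example (a sketch; <code>tokenizer</code> is assumed to be set up with
 * the {@link TokenizerProperties#F_TOKEN_POS_ONLY} flag):
 *<blockquote><pre>
 *
 *   while (tokenizer.hasMoreToken()) {
 *     // even with F_TOKEN_POS_ONLY set, a valid token image is returned
 *     System.out.println(tokenizer.nextImage());
 *   }
 *
 *</pre></blockquote>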
550: *
551: * @return the token image of the next token
552: * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
553: * (IOExceptions for instance)
554: * @see #nextToken
555: * @see #currentImage
556: */
557: public String nextImage() throws TokenizerException;
558:
559: /**
 * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}
 * or {@link #nextImage}.
562: *<br>
563: * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
564: * rather than returning <code>null</code> if neither {@link #nextToken} nor
565: * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
 * or {@link #setReadPositionAbsolute} have been called after the last call to
567: * <code>nextToken</code> or <code>nextImage</code>.
568: *
569: * @return the {@link Token} retrieved by the last call to {@link #nextToken}.
570: * @throws TokenizerException if the tokenizer has no current token
571: * @see #nextToken
572: * @see #currentImage
573: */
574: public Token currentToken() throws TokenizerException;
575:
576: /**
577: * Convenience method to retrieve only the token image of the {@link Token} that
 * would be returned by {@link #currentToken}. This is an especially useful
579: * method, if the parse flags for this <code>Tokenizer</code> have the
580: * flag {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this method
581: * returns a valid string even in that case.
582: *<br>
583: * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
584: * rather than returning <code>null</code> if neither {@link #nextToken} nor
585: * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
 * or {@link #setReadPositionAbsolute} have been called after the last call to
587: * <code>nextToken</code> or <code>nextImage</code>.
588: *
589: * @return the token image of the current token
590: * @throws TokenizerException if the tokenizer has no current token
591: * @see #currentToken
592: * @see #nextImage
593: */
594: public String currentImage() throws TokenizerException;
595:
596: //---------------------------------------------------------------------------
597: // line and column positions
598: //
599:
600: /**
601: * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
602: * will return the line number starting with 0 in the input stream. The
603: * implementation of the <code>Tokenizer</code> interface can decide which
604: * end-of-line sequences should be recognized. The most flexible approach is
605: * to process the following end-of-line sequences:
606: * <br><ul><li>
 * Carriage Return (ASCII 13, '\r'). This EOL sequence is used on Apple Macintosh systems.
608: * </li><li>
609: * Linefeed (ASCII 10, '\n'). This is the UNIX EOL character.
610: * </li><li>
611: * Carriage Return + Linefeed ("\r\n"). This is used on MS Windows systems.
612: * </li></ul>
 * Another legitimate and in many cases sufficient approach is to use the
 * system property "line.separator".
615: *<br>
616: * Displaying information about lines usually means adding 1 to the zero-based
617: * line number.
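 *<br>
 * For instance (a sketch; line counting must have been switched on before
 * parsing starts, and <code>props</code>, <code>tokenizer</code> and
 * <code>reader</code> are assumed to be set up as in the examples of the
 * interface description):
 *<blockquote><pre>
 *
 *   props.setParseFlags(TokenizerProperties.F_COUNT_LINES);
 *   tokenizer.setTokenizerProperties(props);
 *   tokenizer.setSource(new ReaderSource(reader));
 *
 *   while (tokenizer.hasMoreToken()) {
 *     tokenizer.nextToken();
 *     // line and column numbers are zero-based; add 1 for display
 *     System.out.println("read position now at line " + (tokenizer.getLineNumber() + 1)
 *                      + ", column " + (tokenizer.getColumnNumber() + 1));
 *   }
 *
 *</pre></blockquote>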
618: *
619: * @return the current line number starting with 0 or -1 if no line numbers
620: * are supplied ({@link TokenizerProperties#F_COUNT_LINES} is not set).
621: * @see #getColumnNumber
622: */
623: public int getLineNumber();
624:
625: /**
626: * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
627: * will return the current column position starting with 0 in the input stream.
628: * Displaying information about columns usually means adding 1 to the zero-based
629: * column number.
630: *
 * @return the current column position starting with 0 or -1 if no column numbers
 * are supplied ({@link TokenizerProperties#F_COUNT_LINES} is not set).
634: * @see #getLineNumber
635: */
636: public int getColumnNumber();
637:
638: //---------------------------------------------------------------------------
639: // text range operations
640: //
641:
642: /**
 * This method returns the absolute offset in characters from the start of the
 * parsed stream to the beginning of the currently available text. Together
 * with {@link #currentlyAvailable} it describes the
645: * currently available text "window".
646: *<br>
647: * The position returned by this method and also by {@link #getReadPosition}
648: * are absolute rather than relative in a text buffer to give the tokenizer
649: * the full control of how and when to refill its text buffer.
650: *
651: * @return the absolute offset of the current text window in characters from
652: * the start of the data source of the Tokenizer
653: */
654: public int getRangeStart();
655:
656: /**
657: * Getting the current read offset. This is the absolute position where the
 * next call to <code>nextToken</code> or <code>nextImage</code> will start. It is
 * therefore <b>not</b> the same as the position returned by
660: * {@link Token#getStartPosition} of the current token ({@link #currentToken}).
661: *<br>
662: * It is the starting position of the token returned by the next call to
663: * {@link #nextToken}, if that token is no whitespace or if whitespaces are
664: * returned ({@link TokenizerProperties#F_RETURN_WHITESPACES}).
665: *<br>
666: * The position returned by this method and also by {@link #getRangeStart}
667: * are absolute rather than relative in a text buffer to give the tokenizer
668: * the full control of how and when to refill its text buffer.
669: *
670: * @return the absolute offset in characters from the start of the data source
671: * of the Tokenizer where reading will be continued
672: */
673: public int getReadPosition();
674:
675: /**
 * Retrieving the number of currently available characters. This includes
677: * both characters already parsed by the <code>Tokenizer</code> and characters
678: * still to be analyzed.<br>
679: *
680: * @return number of currently available characters
681: */
682: public int currentlyAvailable();
683:
684: /**
 * Retrieve text from the currently available range. The range given by the
 * start and length parameters must lie between {@link #getRangeStart} and
 * {@link #getRangeStart} + {@link #currentlyAvailable}.
688: *<br>
689: * Example:
 *<blockquote><pre>
691: * int startPos = tokenizer.getReadPosition();
692: * String source;
693: *
694: * while (tokenizer.hasMoreToken()) {
695: * Token token = tokenizer.nextToken();
696: *
697: * switch (token.getType()) {
698: * case Token.LINE_COMMENT:
699: * case Token.BLOCK_COMMENT:
 *       // extract the text between the end of the previous comment and the
 *       // start of this one, then skip past the current comment
 *       source   = tokenizer.getText(startPos, token.getStartPosition() - startPos);
 *       startPos = token.getStartPosition() + token.getLength();
702: * }
703: * }
 *</pre></blockquote>
705: *
706: * @param start position where the text begins
707: * @param length length of the text
 * @return the text beginning at the given position with the given length
709: * @throws IndexOutOfBoundsException if the starting position or the length is
710: * out of the current text window
711: */
712: public String getText(int start, int length)
713: throws IndexOutOfBoundsException;
714:
715: /**
716: * Get a single character from the current text range.
717: *
718: * @param pos position of the required character
719: * @return the character at the specified position
720: * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
721: * in the available text range (text window)
722: */
723: public char getChar(int pos) throws IndexOutOfBoundsException;
724:
725: /**
726: * Try to read more data into the text buffer of the tokenizer. This can be
727: * useful when a method needs to look ahead of the available data or a skip
728: * operation should be performed.
729: *<br>
 * The method returns the same value that an immediately following call to
731: * {@link #currentlyAvailable} would return.
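 *<br>
 * For instance, a sketch of looking one character ahead of the current read
 * position (the exact buffering behaviour depends on the implementation):
 *<blockquote><pre>
 *
 *   int pos = tokenizer.getReadPosition();
 *   int end = tokenizer.getRangeStart() + tokenizer.currentlyAvailable();
 *
 *   if (pos + 1 >= end) {
 *     // try to enlarge the text window
 *     int available = tokenizer.readMore();
 *     end = tokenizer.getRangeStart() + available;
 *   }
 *   if (pos + 1 < end) {
 *     System.out.println("look-ahead character: " + tokenizer.getChar(pos + 1));
 *   }
 *
 *</pre></blockquote>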
732: *
 * @return the number of characters now available
734: * @throws TokenizerException generic exception (list) for all problems that
735: * may occur while reading (IOExceptions for instance)
736: */
737: public int readMore() throws TokenizerException;
738:
739: /**
 * This method sets the tokenizer's current read position to the given absolute
 * read position. It realizes one type of rewind / forward operation. The
 * given position must be inside the interval between {@link #getRangeStart} and
 * {@link #getRangeStart} + {@link #currentlyAvailable} - 1.
744: *<br>
745: * The current read position is the end position of the current token. That means
746: * that the following assertion can be made:
747: *<pre>
748: * Token token1 = tokenizer.nextToken();
749: * tokenizer.setReadPositionAbsolute(tokenizer.getReadPosition() - token1.getLength());
750: * Token token2 = tokenizer.nextToken();
751: * assert(token1.equals(token2));
752: *</pre>
753: *<br>
754: * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
755: * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
756: * if called after a <code>setReadPositionAbsolute</code> without a subsequent
 * call to {@link #nextToken} or {@link #nextImage}.
758: *
759: * @param position absolute position for the next parse operation
760: * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
761: * not in the available text range (text window)
762: * @see #setReadPositionRelative
763: */
764: public void setReadPositionAbsolute(int position)
765: throws IndexOutOfBoundsException;
766:
767: /**
 * This method moves the tokenizer's read position the given number of characters
 * forward (positive value) or backward (negative value), starting from the current
 * read position. It realizes one type of rewind / forward operation. The
 * given offset must be greater than or equal to {@link #getRangeStart} - {@link #getReadPosition}
 * and lower than {@link #currentlyAvailable} - {@link #getReadPosition}.
773: *<br>
774: * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
775: * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
 * if called after a <code>setReadPositionRelative</code> without a subsequent
 * call to {@link #nextToken} or {@link #nextImage}.
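 *<br>
 * For instance, a sketch of pushing the token just read back into the input
 * (assuming whitespaces are skipped, so the read position is still located
 * directly behind that token):
 *<blockquote><pre>
 *
 *   Token token = tokenizer.nextToken();
 *
 *   // rewind by the length of the token; the next call to nextToken
 *   // will return an equal token again
 *   tokenizer.setReadPositionRelative(-token.getLength());
 *
 *</pre></blockquote>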
778: *
779: * @param offset number of characters to move forward (positive offset) or
780: * backward (negative offset)
781: * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
782: * move the read position out of the available text range (text window)
783: * @see #setReadPositionAbsolute
784: */
785: public void setReadPositionRelative(int offset)
786: throws IndexOutOfBoundsException;
787:
788: //---------------------------------------------------------------------------
789: // Cleanup
790: //
791:
792: /**
 * This method is necessary to release memory and remove object references when
 * <code>Tokenizer</code> instances are frequently created for small tasks.
 * Generally, the method shouldn't throw any exceptions. It is also OK to call
 * it more than once.
797: *<br>
 * It is an error to call any other method of the implementing class after
799: * <code>close</code> has been called.
800: */
801: public void close();
802: }
|