001: /*
002: * Token.java: Token for parsers etc.
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the Susebox Java Core Library (Susebox JCL).
007: * The Susebox JCL is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with the Susebox JCL. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas;
032:
033: //-----------------------------------------------------------------------------
034: // Class Token
035: //
036:
037: /**<p>
038: * Instances of this class are created by the classes implementing the
039: * {@link Tokenizer} interface. <code>Token</code> describes a portion of text
040: * according to the settings given to the producing {@link Tokenizer} in form of
* a {@link TokenizerProperties} object. Besides the token type, the token image
* itself, its position in the input stream, line and column position and associated
* information can be obtained from the <code>Token</code> (provided the necessary
* parse flags are set in the tokenizer).
045: *</p><p>
046: * This class replaces the older {@link de.susebox.java.util.Token} which is
047: * deprecated.
048: *</p>
049: *
050: * @author Heiko Blau
051: * @see Tokenizer
052: * @see TokenizerProperties
053: */
054: public class Token {
055:
056: //---------------------------------------------------------------------------
057: // constants (token types)
058: //
059:
060: /**
061: * The token is nothing special (no keyword, no whitespace, etc.).
062: */
063: public static final byte NORMAL = 0;
064:
065: /**
066: * The token is a keyword registered with the used {@link Tokenizer}.
067: */
068: public static final byte KEYWORD = 1;
069:
070: /**
071: * The token is one of the quoted strings known to the {@link Tokenizer}. In Java
072: * this would be for instance a "String" or a 'c' (haracter).
073: */
074: public static final byte STRING = 2;
075:
076: /**
077: * The token matches a pattern. This can be a number od identifier pattern for
078: * instance.
079: */
080: public static final byte PATTERN = 3;
081:
082: /**
083: * Special sequences are characters or character combinations that have a certain
084: * meaning to the parsed language or dialect. In computer languages we have for
085: * instance operators, end-of-statement characters etc.
086: * A companion might have been associated with a special sequence. It probably
087: * contains information important to the user of the <code>Token</code>.
088: */
089: public static final byte SPECIAL_SEQUENCE = 4;
090:
091: /**
092: * Separators are otherwise not remarkable characters. An opening parenthesis
093: * might be nessecary for a syntactically correct text, but without any special
094: * meaning to the compiler, interpreter etc. after it has been detected.
095: */
096: public static final byte SEPARATOR = 5;
097:
098: /**
099: * Whitespaces are portions of the text, that contain one or more characters
100: * that separate the significant parts of the text. Generally, a sequence of
101: * whitespaces is equally represented by one single whitespace character. That
102: * is the difference to separators.
103: */
104: public static final byte WHITESPACE = 6;
105:
106: /**
107: * Although a line comment is - in most cases - actually a whitespace sequence, it
108: * is often nessecary to handle it separately. Syntax hilighting is a thing that
109: * needs to know a line comment.
110: */
111: public static final byte LINE_COMMENT = 7;
112:
113: /**
114: * Block comments are also a special form of a whitespace sequence. See
115: * {@link #LINE_COMMENT} for details.
116: */
117: public static final byte BLOCK_COMMENT = 8;
118:
119: /**
120: * A token of the type <code>EOF</code> is used to indicate an end-of-line condition
121: * on the input stream of the tokenizer.
122: */
123: public static final byte EOF = -1;
124:
125: /**
126: * This is for the leftovers of the lexical analysis of a text.
127: */
128: public static final byte UNKNOWN = -2;
129:
130: //---------------------------------------------------------------------------
131: // Getter- und Setter-Methoden
132: //
133:
134: /**
135: * Setting the type property of the <code>Token</code>. This is one of the constants
136: * defined in this class.
137: *
138: * @param type the token type
139: * @see #getType
140: */
141: public void setType(int type) {
142: _type = type;
143: }
144:
145: /**
146: * Obtaining the type of the <code>Token</code>. This is one of the constants
147: * defined in the <code>Token</code> class.
148: *
149: * @return the token type
150: * @see #setType
151: */
152: public int getType() {
153: return _type;
154: }
155:
156: /**
157: * Setting the token image. Note that some {@link Tokenizer} only fill position
158: * and length information rather than setting the token image. This strategy
159: * might have a tremendous influence on the parse performance and the memory
160: * allocation.
161: *
162: * @param image the token image
163: * @see #getImage
164: */
165: public void setImage(String image) {
166: if ((_image = image) == null) {
167: _length = 0;
168: } else {
169: _length = _image.length();
170: }
171: }
172:
173: /**
174: * Obtaining the token image as a {@link java.lang.String}. Th method returns
175: * <code>null</code> when called on an end-of-file token or if the {@link Tokenizer}
176: * producing this <code>Token</code> object, is configured to return only
177: * position informations (see {@link TokenizerProperties#F_TOKEN_POS_ONLY}).
178: *
179: * @return the token image as a {@link java.lang.String} (<code>null</code> is possible).
180: * @see #setImage
181: */
182: public String getImage() {
183: return _image;
184: }
185:
186: /**
187: * Image parts are substrings of a token image. The operation returns a meaningful
188: * result only, if the flag {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is
189: * set for the <code>TokenizerProperties</code>, the {@link Tokenizer} or the
190: * {@link TokenizerProperty} that "produced" the token. If that flag is not set
191: * the return value is identical to {@link #getImage}.
192: *<br>
193: * Number and contents of the image parts depend on the token type:
194: *<ul><li>
195: * {@link #NORMAL}, {@link #KEYWORD}, {@link #SPECIAL_SEQUENCE},
196: * {@link #SEPARATOR}: These token have one image part that is identical to
197: * the image itself ({@link #getImage}).
198: *</li><li>
199: * {@link #WHITESPACE}: Whitespaces have one image part for each substring
200: * on a single line without any line separators. For whitespace sequences
201: * without line separators there will be one part that is identical to the
202: * image itself ({@link #getImage}). More generally, whitespaces have
203: * <code>separatorCount + 1</code> image parts. For multi-line whitespaces
204: * some or all of these image parts can be empty.
205: *</li><li>
206: * {@link #STRING}: One image part per line containing the characters between
207: * and excluding the string start and end sequences and/or the line
208: * separators, equivalent to the handling of whitespaces. The string escape
209: * sequences are resolved. For instance, the image part of the SQL string
210: * <code>'select ''hello'' from dual'</code> is <code>select 'hello' from dual</code>.
211: * Multiline strings may have empty image parts (if emtpy lines are included
212: * in the string). The string "line1\n" has two image parts: "line1" and the
213: * empty string (since the string ends on a new line). The string "\nline2"
214: * has also two image parts: the empty string and "line2" (since the string
215: * starts on one line and ends on the next).
216: *</li><li>
217: * {@link #PATTERN}: a pattern has image parts according to the groups defined
218: * in the regular expression of the pattern. The {@link java.util.regex.Pattern}
219: * class speaks of "Capturing groups" that are expressions in parentheses.
220: * Image parts are especially important for pattern token, where the access
221: * to parts of the pattern is usually nessecary. For instance, in Java Unicode
222: * characters can be written in form of <code>"\\u[0-9A-Fa-f]{4}"</code>
223: * pattern. For further processing the hexadecimal part must be accessed.
224: * By using the pattern <code>"\\u([0-9A-Fa-f]{4})"</code>, a token containing
225: * the unicode notation <code>"\\u00AC"</code> has the two image parts
226: * <code>"\\u00AC"</code> (capturing group 0) and <code>"00AC"</code>
227: * (capturing group 1).
228: *</li><li>
229: * {@link #LINE_COMMENT}: Line comments have one image part that contains
230: * the substring after the line comment start sequence up to and excluding
231: * the line separator sequence.
232: *</li><li>
233: * {@link #BLOCK_COMMENT}: Like whitespaces and string, block comments have
234: * one image part per line they are spanning. The first part is without the
235: * block comment start sequence, the last without the block comment end
236: * sequence. The line separator sequences are also not included in the parts.
237: *</li><li>
238: * {@link #EOF}: The method returns an empty array.
239: *</li></ul>
240: * The return value is an array of strings rather than an {@link java.util.Enumeration}
241: * or {@link java.util.Iterator}, since it can be used more easily and contains
242: * only one element in a lot if not most cases.
243: *
244: * @return an array of image parts according to the token type if the flag
245: * {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is set or containing
246: * the image itself otherwise ({@link #getImage}).
247: */
248: public String[] getImageParts() {
249: if (_imageParts != null) {
250: return _imageParts;
251: } else {
252: return new String[] { getImage() };
253: }
254: }
255:
256: /**
257: * The counterpart to {@link #getImageParts}. It sets all image parts in one
258: * operation. The method accepts <code>null</code> and empty arrays.
259: *
260: * @param imageParts an array of image parts according to the token type or
261: * <code>null</code>
262: */
263: public void setImageParts(String[] imageParts) {
264: _imageParts = imageParts;
265: }
266:
267: /**
268: * Setting the length of the token. Some {@link Tokenizer} may prefer or may be
269: * configured not to return a token image, but only the position and length
270: * informations. This may save a lot of time whereever only a subset of the found
271: * tokens are actually needed by the user.
272: *<br>
273: * This method is an alternative to {@link #setEndPosition} depending on which
274: * information is at hand or easier to obtain for the {@link Tokenizer} producing
275: * this <code>Token</code>.
276: *<br>
277: * Note that this method is implicitely called by {@link #setImage} and
278: * {@link #setEndPosition}.
279: *
280: * @param length the length of the token
281: * @see #getLength
282: * @see #setEndPosition
283: */
284: public void setLength(int length) {
285: _length = length;
286: }
287:
288: /**
289: * Obtaining the length of the token. Note that some token types have a zero length
290: * (like EOF or UNKNOWN).
291: *
292: * @return the length of the token.
293: * @see #setLength
294: * @see #getEndPosition
295: */
296: public int getLength() {
297: return _length;
298: }
299:
300: /**
301: * Some token may have associated informations for the user of the <code>Token</code>.
302: * A popular thing would be the association of an integer constant to a special
303: * sequence or keyword to be used in fast <code>switch</code> statetents.
304: *
305: * @param companion the associated information for this token
306: */
307: public void setCompanion(Object companion) {
308: _companion = companion;
309: }
310:
311: /**
312: * Obtaining the associated information of the token. Can be <code>null</code>. See
313: * {@link #setCompanion} for details.
314: *
315: * @return the associated information of this token
316: */
317: public Object getCompanion() {
318: return _companion;
319: }
320:
321: /**
322: * Setting the start position of the token relative to the start of the input
323: * stream. For instance, the first character in a file has the start position
324: * 0.
325: *
326: * @param startPosition the position where the token starts in the input stream.
327: * @see #getStartPosition
328: * @see #setEndPosition
329: */
330: public void setStartPosition(int startPosition) {
331: _startPosition = startPosition;
332: }
333:
334: /**
335: * Obtaining the starting position of the token. If not set or not of interest,
336: * -1 is returned.
337: *
338: * @return start position of the token.
339: * @see #setStartPosition
340: * @see #getEndPosition
341: */
342: public int getStartPosition() {
343: return _startPosition;
344: }
345:
346: /**
347: * Setting the end position of the token relative to the start of the input
348: * stream. For instance, the first character in a file has the start position
349: * 0. The character at the given end position is <strong>NOT</code> part of
350: * this <code>Token</code>. This is the same principle as in the
351: * {@link java.lang.String#substring(int, int)} method.
352: *<br>
353: * This method is an alternative to {@link #setLength} depending on which
354: * information is at hand or easier to obtain for the {@link Tokenizer} producing
355: * this <code>Token</code>.
356: *<br>
357: * Note that this method <strong>MUST</strong> be called after {@link #setStartPosition}
358: * since it affects the length of the token. Its effect is in turn eliminated
359: * by calls to {@link #setLength} and {@link #setImage}
360: *
361: * @param endPosition the position where the token ends in the input stream.
362: */
363: public void setEndPosition(int endPosition) {
364: setLength(endPosition - _startPosition);
365: }
366:
367: /**
368: * Obtaining the end position of this token. Note that the return value of this
369: * method is only valid, if {@link #setStartPosition} has been called and one
370: * of the methods {@link #setImage}, {@link #setLength} or {@link #setEndPosition}.
371: *
372: * @return end position of the token.
373: * @see #setEndPosition
374: * @see #setStartPosition
375: * @see #getStartPosition
376: */
377: public int getEndPosition() {
378: return getLength() - getStartPosition();
379: }
380:
381: /**
382: * In {@link Tokenizer}'s counting lines and columns, this method is used to
383: * set the line number where the beginning of the <code>Token</code> was found.
384: * Line numbers start with 0.
385: *
386: * @param lineno line number where the token begins
387: * @see #getStartLine
388: */
389: public void setStartLine(int lineno) {
390: _startLine = lineno;
391: }
392:
393: /**
394: * Obtaining the line number where the <code>Token</code> starts. See also
395: * {@link #setStartLine} for details.<br>
396: * If a tokenizer doesn't count lines and columns, the returned value is -1.
397: *
398: * @return the line number where the token starts or -1, if no line counting is
399: * performed
400: * @see #setStartLine
401: */
402: public int getStartLine() {
403: return _startLine;
404: }
405:
406: /**
407: * In {@link Tokenizer}'s counting lines and columns, this method is used to
408: * set the column number where the beginning of the <code>Token</code> was
409: * found. Column numbers start with 0.
410: *
411: * @param colno number where the token begins
412: * @see #getStartColumn
413: */
414: public void setStartColumn(int colno) {
415: _startColumn = colno;
416: }
417:
418: /**
419: * Obtaining the column number of the <code>Token</code> start. See {@link #setStartColumn}
420: * for details.<br>
421: * If a tokenizer doesn't count lines and columns, the returned value is -1.
422: *
423: * @return the column number where the token starts or -1, if no line counting
424: * is performed
425: * @see #setStartColumn
426: */
427: public int getStartColumn() {
428: return _startColumn;
429: }
430:
431: /**
432: * In {@link Tokenizer}'s counting lines and columns, this method is used to
433: * set the line number where the end of the <code>Token</code> was found.
434: * See {@link #setStartLine} for more.<br>
435: * The end line number is the one there the first character was found that does
436: * <b><i>NOT</i></b> belongs to the token. This approach is choosen in accordance
437: * to the toIndex parameters in {@link java.lang.String#substring(int, int)}.
438: *
439: * @param lineno line number where the token ends
440: */
441: public void setEndLine(int lineno) {
442: _endLine = lineno;
443: }
444:
445: /**
446: * Obtaining the line number where the token ends. See {@link #setEndLine} for
447: * more. If a tokenizer doesn't count lines and columns, the returned value is
448: * -1.
449: *
450: * @return line number where the token ends or -1, if no line counting is
451: * performed
452: * @see #setEndLine
453: */
454: public int getEndLine() {
455: return _endLine;
456: }
457:
458: /**
459: * In {@link Tokenizer}'s counting lines and columns, this method is used to set the
460: * column number where the end of the <code>Token</code> was found.<br>
461: * The end column number is the one of the first character that does
462: * <b><i>NOT</i></b> belongs to the token. This approach is choosen in accordance
463: * to the toIndex parameters in {@link java.lang.String#substring(int, int)}.
464: *
465: * @param colno column number where the token ends
466: */
467: public void setEndColumn(int colno) {
468: _endColumn = colno;
469: }
470:
471: /**
472: * Obtaining the column number where the <code>Token</code> ends. See {@link #setEndColumn}
473: * for more.<br>
474: * If a tokenizer doesn't count lines and columns, the returned value is -1.
475: *
476: * @return column number where the token ends or -1, if no line counting is
477: * performed
478: * @see #setEndColumn
479: */
480: public int getEndColumn() {
481: return _endColumn;
482: }
483:
484: //---------------------------------------------------------------------------
485: // construction
486: //
487:
488: /**
489: * Default constructor.
490: */
491: public Token() {
492: this (UNKNOWN, null, null);
493: }
494:
495: /**
496: * Constructs a token of a given type. Only the type of the token is known but not
497: * its image or positions.
498: *
499: * @param type token type, one of the class constants.
500: */
501: public Token(int type) {
502: this (type, null, null);
503: }
504:
505: /**
506: * Construct a token of a given type with the given image. No position information
507: * is given.
508: *
509: * @param type token type, one of the class constants.
510: * @param image the token image itself
511: */
512: public Token(int type, String image) {
513: this (type, image, null);
514: }
515:
516: /**
517: * Construct a token of a given type with the given image and a companion. This
518: * constructor is most useful for keywords or special sequences.
519: *
520: * @param type token type, one of the class constants.
521: * @param image the token image itself
522: * @param companion an associated information of the token type
523: */
524: public Token(int type, String image, Object companion) {
525: setType(type);
526: setImage(image);
527: setCompanion(companion);
528: setStartPosition(-1);
529: setStartLine(-1);
530: setStartColumn(-1);
531: setEndLine(-1);
532: setEndColumn(-1);
533: setImageParts(null);
534: }
535:
536: //---------------------------------------------------------------------------
537: // overloaded methods
538: //
539:
540: /**
541: * Implementation of the well known method {@link java.lang.Object#equals}.
542: * Note that two token are equal if every member of it is equal. That means
543: * that token retrieved by two different {@link Tokenizer} instances can be
544: * equal.
545: *
546: * @param object the {@link java.lang.Object} to compare
547: * @return <code>true</code> if two token are equal, <code>false</code>
548: * otherwise
549: */
550: public boolean equals(Object object) {
551: // Test on intentical objects and incompatible classes
552: if (object == null) {
553: return false;
554: } else if (object == this ) {
555: return true;
556: } else if (object.getClass() != getClass()) {
557: return false;
558: }
559:
560: // real check
561: Token other = (Token) object;
562:
563: if (getType() != other.getType()) {
564: return false;
565: } else if (getStartPosition() != other.getStartPosition()) {
566: return false;
567: } else if (getLength() != other.getLength()) {
568: return false;
569: } else if (getStartLine() != other.getStartLine()) {
570: return false;
571: } else if (getStartColumn() != other.getStartColumn()) {
572: return false;
573: } else if (getEndLine() != other.getEndLine()) {
574: return false;
575: } else if (getEndColumn() != other.getEndColumn()) {
576: return false;
577: } else if ((getCompanion() == null && other.getCompanion() != null)
578: || (getCompanion() != null && !getCompanion().equals(
579: other.getCompanion()))) {
580: return false;
581: } else if ((getImage() == null && other.getImage() != null)
582: || (getImage() != null && !getImage().equals(
583: other.getImage()))) {
584: return false;
585: }
586: return true;
587: }
588:
589: /**
590: * Implementation of the well known method {@link java.lang.Object#toString}.
591: *
592: * @return string representation of this object
593: */
594: public String toString() {
595: StringBuffer buffer = new StringBuffer();
596:
597: // Type
598: buffer.append("Type ");
599: buffer.append(Token.getTypeName(getType()));
600:
601: // Image
602: if (getType() != EOF) {
603: buffer.append(": ");
604: if (getImage() != null) {
605: buffer.append('"');
606: buffer.append(getImage());
607: buffer.append('"');
608: } else {
609: buffer.append("no image, length ");
610: buffer.append(getLength());
611: }
612: }
613: return buffer.toString();
614: }
615:
616: /**
617: * Getting a type name for displaying. The methode never fails even if the
618: * given type is unknown.
619: *
620: * @param type one of the Token type constants
621: * @return a string representation of the given type constant
622: */
623: public static String getTypeName(int type) {
624: switch (type) {
625: case NORMAL:
626: return "NORMAL";
627: case KEYWORD:
628: return "KEYWORD";
629: case STRING:
630: return "STRING";
631: case PATTERN:
632: return "PATTERN";
633: case SPECIAL_SEQUENCE:
634: return "SPECIAL_SEQUENCE";
635: case SEPARATOR:
636: return "SEPARATOR";
637: case WHITESPACE:
638: return "WHITESPACE";
639: case LINE_COMMENT:
640: return "LINE_COMMENT";
641: case BLOCK_COMMENT:
642: return "BLOCK_COMMENT";
643: case EOF:
644: return "EOF";
645: default:
646: return "UNKNOWN";
647: }
648: }
649:
650: //---------------------------------------------------------------------------
651: // members
652: //
653:
654: /**
655: * The token type. Usually one of the constants {@link #NORMAL}, {@link #EOF} etc.
656: *
657: * @see #getType
658: * @see #setType
659: */
660: protected int _type;
661:
662: /**
663: * The string representing the token. This member might not be present if a
664: * {@link Tokenizer} is configured not to return token images.
665: *
666: * @see #getImage
667: * @see #setImage
668: */
669: protected String _image;
670:
671: /**
672: * The length of the string representing the token..
673: *
674: * @see #getLength
675: * @see #setLength
676: */
677: protected int _length;
678:
679: /**
680: * An information associated with the token. For instance, keywords can be
681: * distinguished using different companions for each keyword
682: *
683: * @see #getCompanion
684: * @see #setCompanion
685: * @see TokenizerProperties#addKeyword
686: */
687: protected Object _companion;
688:
689: /**
690: * The absolute position where the token starts in the source of data.
691: *
692: * @see #getStartPosition
693: * @see #setStartPosition
694: */
695: protected int _startPosition;
696:
697: /**
698: * The line where the token starts in the source of data. This member may not
699: * be set if a {@link Tokenizer} is configured not to return token line and
700: * column (see {@link TokenizerProperties#F_COUNT_LINES}).
701: *
702: * @see #getStartLine
703: * @see #setStartLine
704: */
705: protected int _startLine;
706:
707: /**
708: * The column where the token starts in the source of data. This member may not
709: * be set if a {@link Tokenizer} is configured not to return token line and
710: * column (see {@link TokenizerProperties#F_COUNT_LINES}).
711: *
712: * @see #getStartColumn
713: * @see #setStartColumn
714: */
715: protected int _startColumn;
716:
717: /**
718: * The line where the token ends in the source of data. This member may not
719: * be set if a {@link Tokenizer} is configured not to return token line and
720: * column (see {@link TokenizerProperties#F_COUNT_LINES}).
721: *
722: * @see #getEndLine
723: * @see #setEndLine
724: */
725: protected int _endLine;
726:
727: /**
728: * The column where the token ends in the source of data. This member may not
729: * be set if a {@link Tokenizer} is configured not to return token line and
730: * column (see {@link TokenizerProperties#F_COUNT_LINES}).
731: *
732: * @see #getEndColumn
733: * @see #setEndColumn
734: */
735: protected int _endColumn;
736:
737: /**
738: * Array with the image parts. See {@link #getImageParts} for details.
739: */
740: protected String[] _imageParts;
741: }
|