001: /*
002: * This file is part of a syntax highlighting package
003: * Copyright (C) 2002 Stephen Ostermiller
004: * http://ostermiller.org/contact.pl?regarding=Syntax+Highlighting
005: *
006: * This program is free software; you can redistribute it and/or modify
007: * it under the terms of the GNU General Public License as published by
008: * the Free Software Foundation; either version 2 of the License, or
009: * (at your option) any later version.
010: *
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: * GNU General Public License for more details.
015: *
016: * See COPYING.TXT for details.
017: */
018: package workbench.sql.formatter;
019:
import java.util.Locale;
import java.util.regex.Pattern;
021:
022: /**
023: * A SQLToken is a token that is returned by a lexer that is lexing an SQL
024: * source file. It has several attributes describing the token:
025: * The type of token, the text of the token, the line number on which it
026: * occurred, the number of characters into the input at which it started, and
027: * similarly, the number of characters into the input at which it ended. <br>
028: */
029: public class SQLToken extends Token {
030: /**
031: * A reserved word (keyword)
032: */
033: public final static int RESERVED_WORD = 0x100;
034:
035: /**
036: * A variable, name, or other identifier
037: */
038: public final static int IDENTIFIER = 0x200;
039:
040: /**
041: * A string literal
042: */
043: public final static int LITERAL_STRING = 0x300;
044: /**
045: * A bit-string
046: */
047: public final static int LITERAL_BIT_STRING = 0x310;
048: /**
049: * An integer
050: */
051: public final static int LITERAL_INTEGER = 0x320;
052: /**
053: * A floating point
054: */
055: public final static int LITERAL_FLOAT = 0x330;
056:
057: /**
058: * A separator
059: */
060: public final static int SEPARATOR = 0x400;
061:
062: /**
063: * An operator
064: */
065: public final static int OPERATOR = 0x500;
066:
067: /**
068: * C style comment, (except possibly nested)
069: */
070: public final static int COMMENT_TRADITIONAL = 0xD00;
071:
072: /**
073: * a -- to end of line comment.
074: */
075: public final static int COMMENT_END_OF_LINE = 0xD10;
076:
077: /**
078: * White space
079: */
080: public final static int WHITE_SPACE = 0xE00;
081:
082: /**
083: * An error
084: */
085: public final static int ERROR = 0xF00;
086: /**
087: * An comment start embedded in an operator
088: */
089: public final static int ERROR_UNCLOSED_COMMENT = 0xF02;
090: /**
091: * An comment start embedded in an operator
092: */
093: public final static int ERROR_UNCLOSED_STRING = 0xF03;
094: /**
095: * An comment start embedded in an operator
096: */
097: public final static int ERROR_UNCLOSED_BIT_STRING = 0xF04;
098: /**
099: * An comment start embedded in an operator
100: */
101: public final static int ERROR_BAD_BIT_STRING = 0xF05;
102:
103: private final int ID;
104: private String contents;
105: private int lineNumber;
106: private int charBegin;
107: private int charEnd;
108: private int state;
109:
110: private static final Pattern WHITESPACE = Pattern
111: .compile("[ \t\r\n]+");
112:
113: /**
114: * Create a new token.
115: * The constructor is typically called by the lexer
116: *
117: * @param ID the id number of the token
118: * @param contents A string representing the text of the token
119: * @param lineNumber the line number of the input on which this token started
120: * @param charBegin the offset into the input in characters at which this token started
121: * @param charEnd the offset into the input in characters at which this token ended
122: */
123: public SQLToken(int ID, String contents, int lineNumber,
124: int charBegin, int charEnd) {
125: this (ID, contents, lineNumber, charBegin, charEnd,
126: Token.UNDEFINED_STATE);
127: }
128:
129: /**
130: * Create a new token.
131: * The constructor is typically called by the lexer
132: *
133: * @param ID the id number of the token
134: * @param text A string representing the text of the token
135: * @param lineNumber the line number of the input on which this token started
136: * @param charBegin the offset into the input in characters at which this token started
137: * @param charEnd the offset into the input in characters at which this token ended
138: * @param state the state the tokenizer is in after returning this token.
139: */
140: public SQLToken(int ID, String text, int lineNumber, int charBegin,
141: int charEnd, int state) {
142: this .ID = ID;
143: this .contents = text;
144: this .lineNumber = lineNumber;
145: this .charBegin = charBegin;
146: this .charEnd = charEnd;
147: this .state = state;
148: }
149:
150: /**
151: * Get an integer representing the state the tokenizer is in after
152: * returning this token.
153: * Those who are interested in incremental tokenizing for performance
154: * reasons will want to use this method to figure out where the tokenizer
155: * may be restarted. The tokenizer starts in Token.INITIAL_STATE, so
156: * any time that it reports that it has returned to this state, the
157: * tokenizer may be restarted from there.
158: */
159: public int getState() {
160: return state;
161: }
162:
163: /**
164: * get the ID number of this token
165: *
166: * @return the id number of the token
167: */
168: public int getID() {
169: return ID;
170: }
171:
172: /**
173: * Returned an uparsed version of the contents of this
174: * token. To get a
175: */
176: public String getText() {
177: return this .contents;
178: }
179:
180: /**
181: * Get the contents of this token. Reserved words (keywords)
182: * will be returned in upper case and with multiple whitespaces
183: * replaced by a single whitespace to make comparisons easier.
184: * "is Null" will be returned as "IS NULL".
185: * To get the real text from the underlying SQL, use getText().
186: * For all tokens where isReservedWord() == false getText and
187: * getContents() will return exactly the same.
188: *
189: * @return A string representing the text of the token
190: * @see #getText()
191: */
192: public String getContents() {
193: if (this .isReservedWord()) {
194: return WHITESPACE.matcher(contents).replaceAll(" ")
195: .toUpperCase();
196: } else {
197: return this .contents;
198: }
199: }
200:
201: /**
202: * get the line number of the input on which this token started
203: *
204: * @return the line number of the input on which this token started
205: */
206: public int getLineNumber() {
207: return lineNumber;
208: }
209:
210: /**
211: * get the offset into the input in characters at which this token started
212: *
213: * @return the offset into the input in characters at which this token started
214: */
215: public int getCharBegin() {
216: return charBegin;
217: }
218:
219: /**
220: * get the offset into the input in characters at which this token ended
221: *
222: * @return the offset into the input in characters at which this token ended
223: */
224: public int getCharEnd() {
225: return charEnd;
226: }
227:
228: /**
229: * Checks this token to see if it is a reserved word.
230: *
231: * @return true if this token is a reserved word, false otherwise
232: */
233: public boolean isReservedWord() {
234: return ((ID >> 8) == 0x1);
235: }
236:
237: public boolean isIntegerLiteral() {
238: return (ID == LITERAL_INTEGER);
239: }
240:
241: public boolean isNumberLiteral() {
242: return (ID == LITERAL_INTEGER) || (ID == LITERAL_FLOAT);
243: }
244:
245: /**
246: * Checks this token to see if it is an identifier.
247: *
248: * @return true if this token is an identifier, false otherwise
249: */
250: public boolean isIdentifier() {
251: return ((ID >> 8) == 0x2);
252: }
253:
254: /**
255: * Checks this token to see if it is a literal.
256: *
257: * @return true if this token is a literal, false otherwise
258: */
259: public boolean isLiteral() {
260: return ((ID >> 8) == 0x3);
261: }
262:
263: /**
264: * Checks this token to see if it is a Separator.
265: *
266: * @return true if this token is a Separator, false otherwise
267: */
268: public boolean isSeparator() {
269: return ((ID >> 8) == 0x4);
270: }
271:
272: /**
273: * Checks this token to see if it is a Operator.
274: *
275: * @return true if this token is a Operator, false otherwise
276: */
277: public boolean isOperator() {
278: return ((ID >> 8) == 0x5);
279: }
280:
281: /**
282: * Checks this token to see if it is a comment.
283: *
284: * @return true if this token is a comment, false otherwise
285: */
286: public boolean isComment() {
287: return ((ID >> 8) == 0xD);
288: }
289:
290: /**
291: * Checks this token to see if it is White Space.
292: * Usually tabs, line breaks, form feed, spaces, etc.
293: *
294: * @return true if this token is White Space, false otherwise
295: */
296: public boolean isWhiteSpace() {
297: return ((ID >> 8) == 0xE);
298: }
299:
300: /**
301: * Checks this token to see if it is an Error.
302: * Unfinished comments, numbers that are too big, unclosed strings, etc.
303: *
304: * @return true if this token is an Error, false otherwise
305: */
306: public boolean isError() {
307: return ((ID >> 8) == 0xF);
308: }
309:
310: /**
311: * A description of this token. The description should
312: * be appropriate for syntax highlighting. For example
313: * "comment" is returned for a comment.
314: *
315: * @return a description of this token.
316: */
317: public String getDescription() {
318: if (isReservedWord()) {
319: return ("reservedWord");
320: } else if (isIdentifier()) {
321: return ("identifier");
322: } else if (isLiteral()) {
323: return ("literal");
324: } else if (isSeparator()) {
325: return ("separator");
326: } else if (isOperator()) {
327: return ("operator");
328: } else if (isComment()) {
329: return ("comment");
330: } else if (isWhiteSpace()) {
331: return ("whitespace");
332: } else if (isError()) {
333: return ("error");
334: } else {
335: return ("unknown");
336: }
337: }
338:
339: /**
340: * get a String that explains the error, if this token is an error.
341: *
342: * @return a String that explains the error, if this token is an error, null otherwise.
343: */
344: public String errorString() {
345: String s;
346: if (isError()) {
347: s = "Error on line " + lineNumber + ": ";
348: switch (ID) {
349: case ERROR:
350: s += "Unexpected token: " + contents;
351: break;
352: case ERROR_UNCLOSED_COMMENT:
353: s += "Unclosed comment: " + contents;
354: break;
355: case ERROR_UNCLOSED_STRING:
356: s += "Unclosed string literal: " + contents;
357: break;
358: case ERROR_UNCLOSED_BIT_STRING:
359: s += "Unclosed bit-string literal: " + contents;
360: break;
361: case ERROR_BAD_BIT_STRING:
362: s += "Bit-strings can only contain 0 and 1: "
363: + contents;
364: break;
365: }
366:
367: } else {
368: s = null;
369: }
370: return (s);
371: }
372:
373: /**
374: * get a representation of this token as a human readable string.
375: * The format of this string is subject to change and should only be used
376: * for debugging purposes.
377: *
378: * @return a string representation of this token
379: */
380: public String toString() {
381: return ("Token #" + Integer.toHexString(ID) + ": "
382: + getDescription() + " Line " + lineNumber + " from "
383: + charBegin + " to " + charEnd + " : " + contents);
384: }
385:
386: }
|