001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package java.io;
019:
020: /**
021: * StreamTokenizer takes a stream and a set of tokens and parses them one at a
022: * time. The different types of tokens that can be found are numbers,
023: * identifiers, quoted strings, and different comment styles.
024: */
public class StreamTokenizer {
    /**
     * Contains a number if the current token is a number (<code>ttype</code>
     * is <code>TT_NUMBER</code>).
     */
    public double nval;

    /**
     * Contains a string if the current token is a word (<code>ttype</code>
     * is <code>TT_WORD</code>) or a quoted string (<code>ttype</code> is the
     * quote character). It is reset to <code>null</code> each time a new
     * token is scanned.
     */
    public String sval;

    /**
     * The constant representing end of stream.
     */
    public static final int TT_EOF = -1;

    /**
     * The constant representing end of line.
     */
    public static final int TT_EOL = '\n';

    /**
     * The constant representing a number token.
     */
    public static final int TT_NUMBER = -2;

    /**
     * The constant representing a word token.
     */
    public static final int TT_WORD = -3;

    /**
     * Internal representation of unknown state.
     */
    private static final int TT_UNKNOWN = -4;

    /**
     * After calling <code>nextToken</code>, the field <code>ttype</code>
     * contains the type of token that has been read. When a single character
     * is read, its integer value is used. For a quoted string, the value is
     * the quote character. If not one of those, then it is one of the
     * following:
     * <UL>
     * <LI> <code>TT_WORD</code> - the token is a word.</LI>
     * <LI> <code>TT_NUMBER</code> - the token is a number.</LI>
     * <LI> <code>TT_EOL</code> - the end of line has been reached. Depends on
     * whether <code>eolIsSignificant</code> is <code>true</code>.</LI>
     * <LI> <code>TT_EOF</code> - the end of the stream has been reached.</LI>
     * </UL>
     */
    public int ttype = TT_UNKNOWN;

    /**
     * Internal character meanings, indexed by character value (0 to 255).
     * 0 implies an ordinary character; otherwise a combination of the
     * TOKEN_ flags below.
     */
    private final byte[] tokenTypes = new byte[256];

    private static final byte TOKEN_COMMENT = 1;

    private static final byte TOKEN_QUOTE = 2;

    private static final byte TOKEN_WHITE = 4;

    private static final byte TOKEN_WORD = 8;

    private static final byte TOKEN_DIGIT = 16;

    /**
     * Value of <code>peekChar</code> meaning that no character is buffered
     * and the next one has to be read from the stream.
     */
    private static final int NEED_CHAR = -2;

    private int lineNumber = 1;

    private boolean forceLowercase;

    private boolean isEOLSignificant;

    private boolean slashStarComments;

    private boolean slashSlashComments;

    private boolean pushBackToken;

    /**
     * Set when a significant '\r' has just been reported as TT_EOL, so that
     * a directly following '\n' can be treated as part of the same line end.
     */
    private boolean lastCr;

    /* One of these will have the stream */
    private InputStream inStream;

    private Reader inReader;

    /**
     * The character read ahead by the previous call of nextToken, -1 for end
     * of stream, or NEED_CHAR when nothing is buffered.
     */
    private int peekChar = NEED_CHAR;

    /**
     * Private constructor to initialize the default values according to the
     * specification.
     */
    private StreamTokenizer() {
        /*
         * All byte values 'A' through 'Z', 'a' through 'z', and 0xA0 through
         * 0xFF are considered to be alphabetic.
         */
        wordChars('A', 'Z');
        wordChars('a', 'z');
        wordChars(160, 255);
        /*
         * All byte values 0x00 through 0x20 are considered to be white
         * space.
         */
        whitespaceChars(0, 32);
        /*
         * '/' is a comment character. Single quote and double quote are
         * string quote characters.
         */
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        /*
         * Numbers are parsed.
         */
        parseNumbers();
        /*
         * Ends of lines are treated as white space, not as separate tokens.
         * C-style and C++-style comments are not recognized. These are the
         * field defaults, so nothing has to be set for them here.
         */
    }

    /**
     * Construct a new StreamTokenizer on the InputStream <code>is</code>.
     * This usage of this method should be replaced with the constructor
     * which takes a Reader.
     *
     * @param is
     *            The InputStream to parse tokens on.
     * @throws NullPointerException
     *             if <code>is</code> is <code>null</code>.
     *
     * @deprecated Use StreamTokenizer(Reader)
     */
    @Deprecated
    public StreamTokenizer(InputStream is) {
        this();
        if (is == null) {
            throw new NullPointerException();
        }
        inStream = is;
    }

    /**
     * Construct a new StreamTokenizer on the Reader <code>r</code>.
     * Initialize the default state per specification.
     * <UL>
     * <LI>All byte values 'A' through 'Z', 'a' through 'z', and 0xA0
     * through 0xFF are considered to be alphabetic.</LI>
     * <LI>All byte values 0x00 through 0x20 are considered to be white
     * space. '/' is a comment character.</LI>
     * <LI>Single quote and double quote are string quote characters.</LI>
     * <LI>Numbers are parsed.</LI>
     * <LI>Ends of lines are considered to be white space rather than separate
     * tokens.</LI>
     * <LI>C-style and C++-style comments are not recognized.</LI>
     * </UL>
     *
     * @param r
     *            The Reader to parse tokens on.
     * @throws NullPointerException
     *             if <code>r</code> is <code>null</code>.
     */
    public StreamTokenizer(Reader r) {
        this();
        if (r == null) {
            throw new NullPointerException();
        }
        inReader = r;
    }

    /**
     * Set the character <code>ch</code> to be regarded as a comment
     * character. Any previous meaning of the character is replaced. Values
     * outside the range 0 to 255 are ignored.
     *
     * @param ch
     *            The character to be considered a comment character.
     */
    public void commentChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_COMMENT;
        }
    }

    /**
     * Set a boolean indicating whether or not end of line is significant and
     * should be returned as <code>TT_EOL</code> in <code>ttype</code>.
     *
     * @param flag
     *            <code>true</code> if EOL is significant, <code>false</code>
     *            otherwise.
     */
    public void eolIsSignificant(boolean flag) {
        isEOLSignificant = flag;
    }

    /**
     * Answer the current line number.
     *
     * @return the current line number.
     */
    public int lineno() {
        return lineNumber;
    }

    /**
     * Set a boolean indicating whether or not word tokens should be
     * lowercased when stored in <code>sval</code>.
     *
     * @param flag
     *            <code>true</code> if <code>sval</code> should be forced
     *            lowercase, <code>false</code> otherwise.
     */
    public void lowerCaseMode(boolean flag) {
        forceLowercase = flag;
    }

    /**
     * Answer the next token type. The type is also stored in
     * <code>ttype</code>; the token value, if any, is stored in
     * <code>nval</code> or <code>sval</code>.
     *
     * @return The type of the next token.
     *
     * @throws IOException
     *             If an IO error occurs while getting the token
     */
    public int nextToken() throws IOException {
        if (pushBackToken) {
            pushBackToken = false;
            if (ttype != TT_UNKNOWN) {
                return ttype;
            }
        }
        sval = null; // Always reset sval to null
        int currentChar = peekChar == NEED_CHAR ? read() : peekChar;

        if (lastCr) {
            /*
             * The previous token was a TT_EOL produced by '\r'. A '\n' right
             * behind it belongs to the same line end and is dropped. The flag
             * is cleared in every case so that it cannot swallow an unrelated
             * '\n' later in the stream.
             */
            lastCr = false;
            if (currentChar == '\n') {
                currentChar = read();
            }
        }
        if (currentChar == -1) {
            return (ttype = TT_EOF);
        }

        byte currentType = typeOf(currentChar);
        while ((currentType & TOKEN_WHITE) != 0) {
            /*
             * Skip over white space until we hit a new line or a real token
             */
            if (currentChar == '\r') {
                lineNumber++;
                if (isEOLSignificant) {
                    lastCr = true;
                    peekChar = NEED_CHAR;
                    return (ttype = TT_EOL);
                }
                if ((currentChar = read()) == '\n') {
                    currentChar = read();
                }
            } else if (currentChar == '\n') {
                lineNumber++;
                if (isEOLSignificant) {
                    peekChar = NEED_CHAR;
                    return (ttype = TT_EOL);
                }
                currentChar = read();
            } else {
                // Advance over this white space character and try again.
                currentChar = read();
            }
            if (currentChar == -1) {
                /*
                 * Drop the buffered white space character; otherwise a
                 * trailing line end would be counted again on every further
                 * call made at the end of the stream.
                 */
                peekChar = NEED_CHAR;
                return (ttype = TT_EOF);
            }
            currentType = typeOf(currentChar);
        }

        /*
         * Check for digits before checking for words since digits can be
         * contained within words.
         */
        if ((currentType & TOKEN_DIGIT) != 0) {
            StringBuilder digits = new StringBuilder(20);
            boolean haveDecimal = false;
            boolean checkJustNegative = currentChar == '-';
            while (true) {
                if (currentChar == '.') {
                    haveDecimal = true;
                }
                digits.append((char) currentChar);
                currentChar = read();
                // Continue on digits, and on the first '.' only.
                if ((currentChar < '0' || currentChar > '9')
                        && (haveDecimal || currentChar != '.')) {
                    break;
                }
            }
            peekChar = currentChar;
            if (checkJustNegative && digits.length() == 1) {
                // Didn't get any other digits other than '-'
                return (ttype = '-');
            }
            try {
                nval = Double.parseDouble(digits.toString());
            } catch (NumberFormatException e) {
                // Reached for a text without any digit, such as "." or "-.";
                // it is reported as the number zero.
                nval = 0;
            }
            return (ttype = TT_NUMBER);
        }
        // Check for words
        if ((currentType & TOKEN_WORD) != 0) {
            StringBuilder word = new StringBuilder(20);
            while (true) {
                word.append((char) currentChar);
                currentChar = read();
                // Characters above 255 always continue a word.
                if (currentChar == -1
                        || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) {
                    break;
                }
            }
            peekChar = currentChar;
            sval = forceLowercase ? word.toString().toLowerCase()
                    : word.toString();
            return (ttype = TT_WORD);
        }
        // Check for quoted character
        if (currentType == TOKEN_QUOTE) {
            int matchQuote = currentChar;
            StringBuilder quoteString = new StringBuilder();
            int peekOne = read();
            // The string ends at the matching quote, a line end or the end
            // of the stream.
            while (peekOne >= 0 && peekOne != matchQuote
                    && peekOne != '\r' && peekOne != '\n') {
                boolean readPeek = true;
                if (peekOne == '\\') {
                    int c1 = read();
                    // Check for quoted octal IE: \377
                    if (c1 <= '7' && c1 >= '0') {
                        int digitValue = c1 - '0';
                        c1 = read();
                        if (c1 > '7' || c1 < '0') {
                            readPeek = false;
                        } else {
                            digitValue = digitValue * 8 + (c1 - '0');
                            c1 = read();
                            // limit the digit value to a byte
                            if (digitValue > 037 || c1 > '7'
                                    || c1 < '0') {
                                readPeek = false;
                            } else {
                                digitValue = digitValue * 8
                                        + (c1 - '0');
                            }
                        }
                        if (!readPeek) {
                            // We've consumed one too many
                            quoteString.append((char) digitValue);
                            peekOne = c1;
                        } else {
                            peekOne = digitValue;
                        }
                    } else {
                        switch (c1) {
                            case 'a':
                                peekOne = 0x7;
                                break;
                            case 'b':
                                peekOne = 0x8;
                                break;
                            case 'f':
                                peekOne = 0xc;
                                break;
                            case 'n':
                                peekOne = 0xA;
                                break;
                            case 'r':
                                peekOne = 0xD;
                                break;
                            case 't':
                                peekOne = 0x9;
                                break;
                            case 'v':
                                peekOne = 0xB;
                                break;
                            default:
                                peekOne = c1;
                        }
                    }
                }
                if (readPeek) {
                    quoteString.append((char) peekOne);
                    peekOne = read();
                }
            }
            if (peekOne == matchQuote) {
                peekOne = read();
            }
            peekChar = peekOne;
            ttype = matchQuote;
            sval = quoteString.toString();
            return ttype;
        }
        // Do comments, both "//" and "/*stuff*/"
        if (currentChar == '/'
                && (slashSlashComments || slashStarComments)) {
            if ((currentChar = read()) == '*' && slashStarComments) {
                int peekOne = read();
                while (true) {
                    currentChar = peekOne;
                    peekOne = read();
                    if (currentChar == -1) {
                        peekChar = -1;
                        return (ttype = TT_EOF);
                    }
                    if (currentChar == '\r') {
                        if (peekOne == '\n') {
                            peekOne = read();
                        }
                        lineNumber++;
                    } else if (currentChar == '\n') {
                        lineNumber++;
                    } else if (currentChar == '*' && peekOne == '/') {
                        peekChar = read();
                        return nextToken();
                    }
                }
            } else if (currentChar == '/' && slashSlashComments) {
                // Skip to EOF or new line then return the next token
                while ((currentChar = read()) >= 0
                        && currentChar != '\r' && currentChar != '\n') {
                    // Intentionally empty
                }
                peekChar = currentChar;
                return nextToken();
            } else if (currentType != TOKEN_COMMENT) {
                // Was just a slash by itself
                peekChar = currentChar;
                return (ttype = '/');
            }
        }
        // Check for comment character
        if (currentType == TOKEN_COMMENT) {
            // Skip to EOF or new line then return the next token
            while ((currentChar = read()) >= 0 && currentChar != '\r'
                    && currentChar != '\n') {
                // Intentionally empty
            }
            peekChar = currentChar;
            return nextToken();
        }

        // An ordinary character is its own token type.
        peekChar = read();
        return (ttype = currentChar);
    }

    /**
     * Answer the internal meaning of the character <code>ch</code>, which
     * must not be negative. Characters above 255 have no table entry and
     * are treated as word characters.
     */
    private byte typeOf(int ch) {
        return ch > 255 ? TOKEN_WORD : tokenTypes[ch];
    }

    /**
     * Set the character <code>ch</code> to be regarded as an ordinary
     * character. Values outside the range 0 to 255 are ignored.
     *
     * @param ch
     *            The character to be considered an ordinary character.
     */
    public void ordinaryChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = 0;
        }
    }

    /**
     * Set the characters ranging from <code>low</code> to <code>hi</code>
     * to be regarded as ordinary characters. The range is clipped to 0
     * through 255.
     *
     * @param low
     *            The starting range for ordinary characters.
     * @param hi
     *            The ending range for ordinary characters.
     */
    public void ordinaryChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Indicate that numbers should be parsed. The characters '0' through
     * '9', '.' and '-' are marked as numeric in addition to any meaning they
     * already have.
     */
    public void parseNumbers() {
        for (int i = '0'; i <= '9'; i++) {
            tokenTypes[i] |= TOKEN_DIGIT;
        }
        tokenTypes['.'] |= TOKEN_DIGIT;
        tokenTypes['-'] |= TOKEN_DIGIT;
    }

    /**
     * Indicate that the current token should be pushed back and returned the
     * next time <code>nextToken()</code> is called.
     */
    public void pushBack() {
        pushBackToken = true;
    }

    /**
     * Set the character <code>ch</code> to be regarded as a quote character.
     * Any previous meaning of the character is replaced. Values outside the
     * range 0 to 255 are ignored.
     *
     * @param ch
     *            The character to be considered a quote character.
     */
    public void quoteChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_QUOTE;
        }
    }

    /**
     * Read the next character from whichever source this tokenizer was
     * constructed on.
     *
     * @return the character read, or -1 at the end of the stream.
     * @throws IOException
     *             If the underlying stream or reader fails.
     */
    private int read() throws IOException {
        // Call the read for the appropriate stream
        if (inStream == null) {
            return inReader.read();
        }
        return inStream.read();
    }

    /**
     * Reset all characters so that they are ordinary.
     */
    public void resetSyntax() {
        for (int i = 0; i < tokenTypes.length; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Set a boolean indicating whether or not slash slash comments should be
     * recognized. The comment ends at a new line.
     *
     * @param flag
     *            <code>true</code> if <code>//</code> should be recognized
     *            as the start of a comment, <code>false</code> otherwise.
     */
    public void slashSlashComments(boolean flag) {
        slashSlashComments = flag;
    }

    /**
     * Set a boolean indicating whether or not slash star comments should be
     * recognized. Slash-star comments cannot be nested and end when a
     * star-slash combination is found.
     *
     * @param flag
     *            <code>true</code> if <code>/*</code> should be recognized
     *            as the start of a comment, <code>false</code> otherwise.
     */
    public void slashStarComments(boolean flag) {
        slashStarComments = flag;
    }

    /**
     * Answer the state of this tokenizer in a readable format.
     *
     * @return The current state of this tokenizer.
     */
    @Override
    public String toString() {
        // Values determined through experimentation
        StringBuilder result = new StringBuilder();
        result.append("Token["); //$NON-NLS-1$
        switch (ttype) {
            case TT_EOF:
                result.append("EOF"); //$NON-NLS-1$
                break;
            case TT_EOL:
                result.append("EOL"); //$NON-NLS-1$
                break;
            case TT_NUMBER:
                result.append("n="); //$NON-NLS-1$
                result.append(nval);
                break;
            case TT_WORD:
                result.append(sval);
                break;
            default:
                // ttype is a public field, so it may hold a value that is
                // not a valid index of the character table.
                boolean isQuote = ttype >= 0 && ttype < tokenTypes.length
                        && tokenTypes[ttype] == TOKEN_QUOTE;
                if (ttype == TT_UNKNOWN || isQuote) {
                    result.append(sval);
                } else {
                    result.append('\'');
                    result.append((char) ttype);
                    result.append('\'');
                }
        }
        result.append("], line "); //$NON-NLS-1$
        result.append(lineNumber);
        return result.toString();
    }

    /**
     * Set the characters ranging from <code>low</code> to <code>hi</code>
     * to be regarded as whitespace characters. Any previous meaning of the
     * characters is replaced. The range is clipped to 0 through 255.
     *
     * @param low
     *            The starting range for whitespace characters.
     * @param hi
     *            The ending range for whitespace characters.
     */
    public void whitespaceChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] = TOKEN_WHITE;
        }
    }

    /**
     * Set the characters ranging from <code>low</code> to <code>hi</code>
     * to be regarded as word characters, in addition to any meaning they
     * already have. The range is clipped to 0 through 255.
     *
     * @param low
     *            The starting range for word characters.
     * @param hi
     *            The ending range for word characters.
     */
    public void wordChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] |= TOKEN_WORD;
        }
    }
}
|