0001: /*
0002: * The contents of this file are subject to the Mozilla Public License
0003: * Version 1.1 (the "License"); you may not use this file except in
0004: * compliance with the License. You may obtain a copy of the License at
0005: * http://www.mozilla.org/MPL/
0006: *
0007: * Software distributed under the License is distributed on an "AS IS"
0008: * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
0009: * License for the specific language governing rights and limitations
0010: * under the License.
0011: *
0012: * The Original Code is iSQL-Viewer, A Mutli-Platform Database Tool.
0013: *
0014: * The Initial Developer of the Original Code is iSQL-Viewer, A Mutli-Platform Database Tool.
0015: * Portions created by Mark A. Kobold are Copyright (C) 2000-2007. All Rights Reserved.
0016: *
0017: * Contributor(s):
0018: * Mark A. Kobold [mkobold <at> isqlviewer <dot> com].
0019: *
0020: * If you didn't download this code from the following link, you should check
0021: * if you aren't using an obsolete version: http://www.isqlviewer.com
0022: */
0023: package org.isqlviewer.sql.processor;
0024:
0025: import java.sql.DatabaseMetaData;
0026: import java.sql.ResultSet;
0027: import java.sql.SQLException;
0028:
0029: import org.isqlviewer.util.StringTokenizer;
0030:
0031: /**
0032: * @author Mark A. Kobold <mkobold at isqlviewer dot com>
0033: * @version 1.0
0034: */
0035: public class SqlProcessor extends AbstractProcessor {
0036:
/**
 * Token-kind code for each ASCII character, indexed by character value. Entries hold
 * {@code TokenType} ordinals narrowed to {@code byte}.
 */
private static final byte[] kind = new byte[128];

/**
 * Token-kind code for non-ASCII characters, indexed by the general category returned from
 * {@link Character#getType(char)}.
 */
private static final byte[] unikind = new byte[31];

/**
 * Number of buffer positions taken up by the character most recently looked at; greater than
 * one when that character was written as a unicode escape.
 */
private int charlength = 1;

/**
 * Buffer index of the second backslash of a backslash pair as recorded by {@code next()};
 * set back to 0 once that backslash has been returned.
 */
private int pair = 0;

/**
 * Creates a processor and populates the character classification tables.
 */
public SqlProcessor() {
    super();
    initKind();
    initUniKind();
}
0048:
0049: public void installServiceKeywords(DatabaseMetaData metaData,
0050: String catalog, String schema) throws SQLException {
0051:
0052: initSymbolTable();
0053: String wordSet = metaData.getSystemFunctions();
0054: StringTokenizer st = new StringTokenizer(wordSet, ",");
0055: while (st.hasMoreTokens()) {
0056: String nextToken = st.nextToken();
0057: lookup(TokenType.FUNCTION, nextToken);
0058: }
0059:
0060: wordSet = metaData.getNumericFunctions();
0061: st = new StringTokenizer(wordSet, ",");
0062: while (st.hasMoreTokens()) {
0063: String nextToken = st.nextToken();
0064: lookup(TokenType.FUNCTION, nextToken);
0065: }
0066:
0067: wordSet = metaData.getStringFunctions();
0068: st = new StringTokenizer(wordSet, ",");
0069: while (st.hasMoreTokens()) {
0070: String nextToken = st.nextToken();
0071: lookup(TokenType.FUNCTION, nextToken);
0072: }
0073:
0074: wordSet = metaData.getTimeDateFunctions();
0075: st = new StringTokenizer(wordSet, ",");
0076: while (st.hasMoreTokens()) {
0077: String nextToken = st.nextToken();
0078: lookup(TokenType.FUNCTION, nextToken);
0079: }
0080:
0081: ResultSet set = metaData.getTables(catalog, schema, null, null);
0082: while (set.next()) {
0083: String tableName = set.getString("TABLE_NAME");
0084: lookup(TokenType.TABLE_NAME, tableName);
0085: }
0086: }
0087:
0088: @Override
0089: protected TextSymbol lookup(TokenType type, String name) {
0090:
0091: if (type != TokenType.IDENTIFIER)
0092: return super .lookup(type, name);
0093: TextSymbol sym = symbolTable.get(name);
0094: if (sym != null)
0095: return sym;
0096: sym = symbolTable.get(name);
0097: if (sym != null)
0098: return sym;
0099: return super .lookup(type, name);
0100: }
0101:
0102: @Override
0103: protected void initSymbolTable() {
0104:
0105: symbolTable.clear();
0106: lookup(TokenType.KEYWORD, "ABSOLUTE");
0107: lookup(TokenType.KEYWORD, "ACTION");
0108: lookup(TokenType.KEYWORD, "ADD");
0109: lookup(TokenType.KEYWORD, "ADMIN");
0110: lookup(TokenType.KEYWORD, "AFTER");
0111: lookup(TokenType.KEYWORD, "AGGREGATE");
0112: lookup(TokenType.KEYWORD, "ALIAS");
0113: lookup(TokenType.KEYWORD, "ALL");
0114: lookup(TokenType.KEYWORD, "ALLOCATE");
0115: lookup(TokenType.KEYWORD, "ALTER");
0116: lookup(TokenType.KEYWORD, "AND");
0117: lookup(TokenType.KEYWORD, "ANY");
0118: lookup(TokenType.KEYWORD, "ARE");
0119: lookup(TokenType.KEYWORD, "ARRAY");
0120: lookup(TokenType.KEYWORD, "AS");
0121: lookup(TokenType.KEYWORD, "ASC");
0122: lookup(TokenType.KEYWORD, "ASSERTION");
0123: lookup(TokenType.KEYWORD, "AT");
0124: lookup(TokenType.KEYWORD, "AUTHORIZATION");
0125: lookup(TokenType.KEYWORD, "BEFORE");
0126: lookup(TokenType.KEYWORD, "BEGIN");
0127: lookup(TokenType.KEYWORD, "BINARY");
0128: lookup(TokenType.KEYWORD, "BIT");
0129: lookup(TokenType.KEYWORD, "BLOB");
0130: lookup(TokenType.KEYWORD, "BOOLEAN");
0131: lookup(TokenType.KEYWORD, "BOTH");
0132: lookup(TokenType.KEYWORD, "BREADTH");
0133: lookup(TokenType.KEYWORD, "BY");
0134: lookup(TokenType.KEYWORD, "CALL");
0135: lookup(TokenType.KEYWORD, "CASCADE");
0136: lookup(TokenType.KEYWORD, "CASCADED");
0137: lookup(TokenType.KEYWORD, "CASE");
0138: lookup(TokenType.KEYWORD, "CAST");
0139: lookup(TokenType.KEYWORD, "CATALOG");
0140: lookup(TokenType.KEYWORD, "CHAR");
0141: lookup(TokenType.KEYWORD, "CHARACTER");
0142: lookup(TokenType.KEYWORD, "CHECK");
0143: lookup(TokenType.KEYWORD, "CLASS");
0144: lookup(TokenType.KEYWORD, "CLOB");
0145: lookup(TokenType.KEYWORD, "CLOSE");
0146: lookup(TokenType.KEYWORD, "COLLATE");
0147: lookup(TokenType.KEYWORD, "COLLATION");
0148: lookup(TokenType.KEYWORD, "COLUMN");
0149: lookup(TokenType.KEYWORD, "COMMIT");
0150: lookup(TokenType.KEYWORD, "COMPLETION");
0151: lookup(TokenType.KEYWORD, "CONDITION");
0152: lookup(TokenType.KEYWORD, "CONNECT");
0153: lookup(TokenType.KEYWORD, "CONNECTION");
0154: lookup(TokenType.KEYWORD, "CONSTRAINT");
0155: lookup(TokenType.KEYWORD, "CONSTRAINTS");
0156: lookup(TokenType.KEYWORD, "CONSTRUCTOR");
0157: lookup(TokenType.KEYWORD, "CONTAINS");
0158: lookup(TokenType.KEYWORD, "CONTINUE");
0159: lookup(TokenType.KEYWORD, "CORRESPONDING");
0160: lookup(TokenType.KEYWORD, "CREATE");
0161: lookup(TokenType.KEYWORD, "CROSS");
0162: lookup(TokenType.KEYWORD, "CUBE");
0163: lookup(TokenType.KEYWORD, "CURRENT");
0164: lookup(TokenType.KEYWORD, "CURRENT_DATE");
0165: lookup(TokenType.KEYWORD, "CURRENT_PATH");
0166: lookup(TokenType.KEYWORD, "CURRENT_ROLE");
0167: lookup(TokenType.KEYWORD, "CURRENT_TIME");
0168: lookup(TokenType.KEYWORD, "CURRENT_TIMESTAMP");
0169: lookup(TokenType.KEYWORD, "CURRENT_USER");
0170: lookup(TokenType.KEYWORD, "CURSOR");
0171: lookup(TokenType.KEYWORD, "CYCLE");
0172: lookup(TokenType.KEYWORD, "DATA");
0173: lookup(TokenType.KEYWORD, "DATALINK");
0174: lookup(TokenType.KEYWORD, "DATE");
0175: lookup(TokenType.KEYWORD, "DAY");
0176: lookup(TokenType.KEYWORD, "DEALLOCATE");
0177: lookup(TokenType.KEYWORD, "DEC");
0178: lookup(TokenType.KEYWORD, "DECIMAL");
0179: lookup(TokenType.KEYWORD, "DECLARE");
0180: lookup(TokenType.KEYWORD, "DEFAULT");
0181: lookup(TokenType.KEYWORD, "DEFERRABLE");
0182: lookup(TokenType.KEYWORD, "DELETE");
0183: lookup(TokenType.KEYWORD, "DEPTH");
0184: lookup(TokenType.KEYWORD, "DEREF");
0185: lookup(TokenType.KEYWORD, "DESC");
0186: lookup(TokenType.KEYWORD, "DESCRIPTOR");
0187: lookup(TokenType.KEYWORD, "DIAGNOSTICS");
0188: lookup(TokenType.KEYWORD, "DICTIONARY");
0189: lookup(TokenType.KEYWORD, "DISCONNECT");
0190: lookup(TokenType.KEYWORD, "DO");
0191: lookup(TokenType.KEYWORD, "DOMAIN");
0192: lookup(TokenType.KEYWORD, "DOUBLE");
0193: lookup(TokenType.KEYWORD, "DROP");
0194: lookup(TokenType.KEYWORD, "END-EXEC");
0195: lookup(TokenType.KEYWORD, "EQUALS");
0196: lookup(TokenType.KEYWORD, "ESCAPE");
0197: lookup(TokenType.KEYWORD, "EXCEPT");
0198: lookup(TokenType.KEYWORD, "EXCEPTION");
0199: lookup(TokenType.KEYWORD, "EXECUTE");
0200: lookup(TokenType.KEYWORD, "EXIT");
0201: lookup(TokenType.KEYWORD, "EXPAND");
0202: lookup(TokenType.KEYWORD, "EXPANDING");
0203: lookup(TokenType.KEYWORD, "FALSE");
0204: lookup(TokenType.KEYWORD, "FIRST");
0205: lookup(TokenType.KEYWORD, "FLOAT");
0206: lookup(TokenType.KEYWORD, "FOR");
0207: lookup(TokenType.KEYWORD, "FOREIGN");
0208: lookup(TokenType.KEYWORD, "FREE");
0209: lookup(TokenType.KEYWORD, "FROM");
0210: lookup(TokenType.KEYWORD, "FUNCTION");
0211: lookup(TokenType.KEYWORD, "GENERAL");
0212: lookup(TokenType.KEYWORD, "GET");
0213: lookup(TokenType.KEYWORD, "GLOBAL");
0214: lookup(TokenType.KEYWORD, "GOTO");
0215: lookup(TokenType.KEYWORD, "GROUP");
0216: lookup(TokenType.KEYWORD, "GROUPING");
0217: lookup(TokenType.KEYWORD, "HANDLER");
0218: lookup(TokenType.KEYWORD, "HASH");
0219: lookup(TokenType.KEYWORD, "HOUR");
0220: lookup(TokenType.KEYWORD, "IDENTITY");
0221: lookup(TokenType.KEYWORD, "IF");
0222: lookup(TokenType.KEYWORD, "IGNORE");
0223: lookup(TokenType.KEYWORD, "IMMEDUATE");
0224: lookup(TokenType.KEYWORD, "IN");
0225: lookup(TokenType.KEYWORD, "INDICATOR");
0226: lookup(TokenType.KEYWORD, "INITIALIZE");
0227: lookup(TokenType.KEYWORD, "INITALLY");
0228: lookup(TokenType.KEYWORD, "INNER");
0229: lookup(TokenType.KEYWORD, "INOUT");
0230: lookup(TokenType.KEYWORD, "INPUT");
0231: lookup(TokenType.KEYWORD, "INSERT");
0232: lookup(TokenType.KEYWORD, "INT");
0233: lookup(TokenType.KEYWORD, "INTEGER");
0234: lookup(TokenType.KEYWORD, "INTERSECT");
0235: lookup(TokenType.KEYWORD, "INTERVAL");
0236: lookup(TokenType.KEYWORD, "INTO");
0237: lookup(TokenType.KEYWORD, "IS");
0238: lookup(TokenType.KEYWORD, "ISOLATION");
0239: lookup(TokenType.KEYWORD, "ITERATE");
0240: lookup(TokenType.KEYWORD, "JOIN");
0241: lookup(TokenType.KEYWORD, "KEY");
0242: lookup(TokenType.KEYWORD, "LANGUAGE");
0243: lookup(TokenType.KEYWORD, "LARGE");
0244: lookup(TokenType.KEYWORD, "LAST");
0245: lookup(TokenType.KEYWORD, "LATERAL");
0246: lookup(TokenType.KEYWORD, "LEADING");
0247: lookup(TokenType.KEYWORD, "LEAVE");
0248: lookup(TokenType.KEYWORD, "LEFT");
0249: lookup(TokenType.KEYWORD, "LESS");
0250: lookup(TokenType.KEYWORD, "LEVEL");
0251: lookup(TokenType.KEYWORD, "LIKE");
0252: lookup(TokenType.KEYWORD, "LIMIT");
0253: lookup(TokenType.KEYWORD, "LOCAL");
0254: lookup(TokenType.KEYWORD, "LOCALTIME");
0255: lookup(TokenType.KEYWORD, "LOCALTIME-");
0256: lookup(TokenType.KEYWORD, "LOCATOR");
0257: lookup(TokenType.KEYWORD, "LOOP");
0258: lookup(TokenType.KEYWORD, "MATCH");
0259: lookup(TokenType.KEYWORD, "MEETS");
0260: lookup(TokenType.KEYWORD, "MINUTE");
0261: lookup(TokenType.KEYWORD, "MODIFIES");
0262: lookup(TokenType.KEYWORD, "MODIFY");
0263: lookup(TokenType.KEYWORD, "MODULE");
0264: lookup(TokenType.KEYWORD, "MONTH");
0265: lookup(TokenType.KEYWORD, "NAMES");
0266: lookup(TokenType.KEYWORD, "NATIONAL");
0267: lookup(TokenType.KEYWORD, "NATURAL");
0268: lookup(TokenType.KEYWORD, "NCHAR");
0269: lookup(TokenType.KEYWORD, "NCLOB");
0270: lookup(TokenType.KEYWORD, "NEW");
0271: lookup(TokenType.KEYWORD, "NEXT");
0272: lookup(TokenType.KEYWORD, "NO");
0273: lookup(TokenType.KEYWORD, "NONE");
0274: lookup(TokenType.KEYWORD, "NORMALIZE");
0275: lookup(TokenType.KEYWORD, "NOT");
0276: lookup(TokenType.KEYWORD, "NULL");
0277: lookup(TokenType.KEYWORD, "NUMERIC");
0278: lookup(TokenType.KEYWORD, "OBJECT");
0279: lookup(TokenType.KEYWORD, "OF");
0280: lookup(TokenType.KEYWORD, "OFF");
0281: lookup(TokenType.KEYWORD, "OLD");
0282: lookup(TokenType.KEYWORD, "ON");
0283: lookup(TokenType.KEYWORD, "ONLY");
0284: lookup(TokenType.KEYWORD, "OPEN");
0285: lookup(TokenType.KEYWORD, "OPERATION");
0286: lookup(TokenType.KEYWORD, "OPTION");
0287: lookup(TokenType.KEYWORD, "OR");
0288: lookup(TokenType.KEYWORD, "ORDER");
0289: lookup(TokenType.KEYWORD, "ORDINALITY");
0290: lookup(TokenType.KEYWORD, "OUT");
0291: lookup(TokenType.KEYWORD, "OUTER");
0292: lookup(TokenType.KEYWORD, "OUTPUT");
0293: lookup(TokenType.KEYWORD, "PAD");
0294: lookup(TokenType.KEYWORD, "PARAMETER");
0295: lookup(TokenType.KEYWORD, "PARAMETERS");
0296: lookup(TokenType.KEYWORD, "PARTIAL");
0297: lookup(TokenType.KEYWORD, "PATH");
0298: lookup(TokenType.KEYWORD, "PERIOD");
0299: lookup(TokenType.KEYWORD, "POSTFIX");
0300: lookup(TokenType.KEYWORD, "PRECEDES");
0301: lookup(TokenType.KEYWORD, "PRECISION");
0302: lookup(TokenType.KEYWORD, "PREFIX");
0303: lookup(TokenType.KEYWORD, "PREORDER");
0304: lookup(TokenType.KEYWORD, "PREPARE");
0305: lookup(TokenType.KEYWORD, "PRESERVE");
0306: lookup(TokenType.KEYWORD, "PRIMARY");
0307: lookup(TokenType.KEYWORD, "PRIOR");
0308: lookup(TokenType.KEYWORD, "PRIVILEGES");
0309: lookup(TokenType.KEYWORD, "PROCEDURE");
0310: lookup(TokenType.KEYWORD, "PUBLIC");
0311: lookup(TokenType.KEYWORD, "READ");
0312: lookup(TokenType.KEYWORD, "READS");
0313: lookup(TokenType.KEYWORD, "REAL");
0314: lookup(TokenType.KEYWORD, "RECURSIVE");
0315: lookup(TokenType.KEYWORD, "REDO");
0316: lookup(TokenType.KEYWORD, "REF");
0317: lookup(TokenType.KEYWORD, "REFRENCES");
0318: lookup(TokenType.KEYWORD, "REFRENCING");
0319: lookup(TokenType.KEYWORD, "RELATIVE");
0320: lookup(TokenType.KEYWORD, "REPEAT");
0321: lookup(TokenType.KEYWORD, "RESIGNAL");
0322: lookup(TokenType.KEYWORD, "RESTRICT");
0323: lookup(TokenType.KEYWORD, "RESULT");
0324: lookup(TokenType.KEYWORD, "RETURN");
0325: lookup(TokenType.KEYWORD, "RETURNS");
0326: lookup(TokenType.KEYWORD, "REVOKE");
0327: lookup(TokenType.KEYWORD, "RIGHT");
0328: lookup(TokenType.KEYWORD, "ROLE");
0329: lookup(TokenType.KEYWORD, "ROLLBACK");
0330: lookup(TokenType.KEYWORD, "ROLLUP");
0331: lookup(TokenType.KEYWORD, "ROUTINE");
0332: lookup(TokenType.KEYWORD, "ROW");
0333: lookup(TokenType.KEYWORD, "ROWS");
0334: lookup(TokenType.KEYWORD, "SAVEPOINT");
0335: lookup(TokenType.KEYWORD, "SCHEMA");
0336: lookup(TokenType.KEYWORD, "SCROLL");
0337: lookup(TokenType.KEYWORD, "SEARCH");
0338: lookup(TokenType.KEYWORD, "SECOND");
0339: lookup(TokenType.KEYWORD, "SECTION");
0340: lookup(TokenType.KEYWORD, "SELECT");
0341: lookup(TokenType.KEYWORD, "SEQUENCE");
0342: lookup(TokenType.KEYWORD, "SESSION");
0343: lookup(TokenType.KEYWORD, "SESSION_USER");
0344: lookup(TokenType.KEYWORD, "SET");
0345: lookup(TokenType.KEYWORD, "SETS");
0346: lookup(TokenType.KEYWORD, "SIGNAL");
0347: lookup(TokenType.KEYWORD, "SIZE");
0348: lookup(TokenType.KEYWORD, "SMALLINT");
0349: lookup(TokenType.KEYWORD, "SPECIFIC");
0350: lookup(TokenType.KEYWORD, "SPECIFICTYPE");
0351: lookup(TokenType.KEYWORD, "SQL");
0352: lookup(TokenType.KEYWORD, "SQLEXCEPTION");
0353: lookup(TokenType.KEYWORD, "SQLSTATE");
0354: lookup(TokenType.KEYWORD, "SQLWARNING");
0355: lookup(TokenType.KEYWORD, "START");
0356: lookup(TokenType.KEYWORD, "STATE");
0357: lookup(TokenType.KEYWORD, "STATIC");
0358: lookup(TokenType.KEYWORD, "STRUCTURE");
0359: lookup(TokenType.KEYWORD, "SUCCEEDS");
0360: lookup(TokenType.KEYWORD, "SUM");
0361: lookup(TokenType.KEYWORD, "SYSTEM_USER");
0362: lookup(TokenType.KEYWORD, "TABLE");
0363: lookup(TokenType.KEYWORD, "TEMPORARY");
0364: lookup(TokenType.KEYWORD, "TERMINATE");
0365: lookup(TokenType.KEYWORD, "THAN");
0366: lookup(TokenType.KEYWORD, "THEN");
0367: lookup(TokenType.KEYWORD, "TIME");
0368: lookup(TokenType.KEYWORD, "TIMESTAMP");
0369: lookup(TokenType.KEYWORD, "TIMEZONE_HOUR");
0370: lookup(TokenType.KEYWORD, "TIMEZONE_MINUTE");
0371: lookup(TokenType.KEYWORD, "TO");
0372: lookup(TokenType.KEYWORD, "TRAILING");
0373: lookup(TokenType.KEYWORD, "TRANSACTION");
0374: lookup(TokenType.KEYWORD, "TRANSLATION");
0375: lookup(TokenType.KEYWORD, "TREAT");
0376: lookup(TokenType.KEYWORD, "TRIGGER");
0377: lookup(TokenType.KEYWORD, "TRUE");
0378: lookup(TokenType.KEYWORD, "UNDER");
0379: lookup(TokenType.KEYWORD, "UNDO");
0380: lookup(TokenType.KEYWORD, "UNION");
0381: lookup(TokenType.KEYWORD, "UNIQUE");
0382: lookup(TokenType.KEYWORD, "UNKNOWN");
0383: lookup(TokenType.KEYWORD, "UNTIL");
0384: lookup(TokenType.KEYWORD, "UPDATE");
0385: lookup(TokenType.KEYWORD, "USAGE");
0386: lookup(TokenType.KEYWORD, "USER");
0387: lookup(TokenType.KEYWORD, "USING");
0388: lookup(TokenType.KEYWORD, "VALUE");
0389: lookup(TokenType.KEYWORD, "VALUES");
0390: lookup(TokenType.KEYWORD, "VARIABLE");
0391: lookup(TokenType.KEYWORD, "VARYING");
0392: lookup(TokenType.KEYWORD, "VIEW");
0393: lookup(TokenType.KEYWORD, "WHEN");
0394: lookup(TokenType.KEYWORD, "WHENEVER");
0395: lookup(TokenType.KEYWORD, "WHERE");
0396: lookup(TokenType.KEYWORD, "WHILE");
0397: lookup(TokenType.KEYWORD, "WITH");
0398: lookup(TokenType.KEYWORD, "WRITE");
0399: lookup(TokenType.KEYWORD, "YEAR");
0400: lookup(TokenType.KEYWORD, "ZONE");
0401:
0402: lookup(TokenType.FUNCTION, "MAX");
0403: lookup(TokenType.FUNCTION, "MIN");
0404: lookup(TokenType.FUNCTION, "AVG");
0405: lookup(TokenType.FUNCTION, "COUNT");
0406: }
0407:
0408: /** Override the read method from the Scanner class. */
0409: @Override
0410: protected TokenType read() {
0411:
0412: TokenType type = TokenType.UNRECOGNIZED;
0413:
0414: if (start >= end)
0415: return TokenType.WHITESPACE;
0416:
0417: switch (state) {
0418: case START_VARIABLE:
0419: type = readVariable(TokenType.START_VARIABLE);
0420: if (type == TokenType.END_COMMENT) {
0421: state = TokenType.WHITESPACE;
0422: } else {
0423: state = TokenType.MID_COMMENT;
0424: }
0425: return type;
0426: case MID_COMMENT:
0427: case END_COMMENT:
0428: type = readComment(TokenType.MID_COMMENT);
0429: if (type == TokenType.END_COMMENT)
0430: state = TokenType.WHITESPACE;
0431: else
0432: state = TokenType.MID_COMMENT;
0433: return type;
0434: default:
0435: char c = buffer[start];
0436: if (c == '\\')
0437: c = next();
0438: if (c < 128)
0439: type = TokenType.forByte(kind[c]);
0440: else
0441: type = TokenType.forByte(unikind[Character.getType(c)]);
0442: switch (type) {
0443: case WHITESPACE:
0444: start = start + charlength;
0445: charlength = 1;
0446: while (start < end) {
0447: c = buffer[start];
0448: if (c == '\\')
0449: c = next();
0450: int k;
0451: if (c < 128)
0452: k = kind[c];
0453: else
0454: k = unikind[Character.getType(c)];
0455: if (k != TokenType.WHITESPACE.ordinal())
0456: break;
0457: start = start + charlength;
0458: charlength = 1;
0459: }
0460: break;
0461: case UNRECOGNIZED:
0462: case BRACKET:
0463: case SEPARATOR:
0464: start = start + charlength;
0465: charlength = 1;
0466: break;
0467: case OPERATOR:
0468: start = start + charlength;
0469: charlength = 1;
0470: type = readOperator(c);
0471: break;
0472: case CHARACTER:
0473: start = start + charlength;
0474: charlength = 1;
0475: type = readCharLiteral();
0476: break;
0477: case STRING:
0478: start = start + charlength;
0479: charlength = 1;
0480: type = readStringLiteral();
0481: break;
0482: case IDENTIFIER:
0483: start = start + charlength;
0484: charlength = 1;
0485: while (start < end) {
0486: c = buffer[start];
0487: if (c == '\\')
0488: c = next();
0489: int k;
0490: if (c < 128)
0491: k = kind[c];
0492: else
0493: k = unikind[Character.getType(c)];
0494: if (k != TokenType.IDENTIFIER.ordinal()
0495: && k != TokenType.NUMBER.ordinal())
0496: break;
0497: start = start + charlength;
0498: charlength = 1;
0499: }
0500: break;
0501: case NUMBER:
0502: start = start + charlength;
0503: charlength = 1;
0504: type = readNumber(c);
0505: break;
0506: case PUNCTUATION:
0507: start = start + charlength;
0508: charlength = 1;
0509: type = readDot();
0510: break;
0511: case VARIABLE:
0512: start = start + charlength;
0513: charlength = 1;
0514: type = readVariable(TokenType.START_VARIABLE);
0515: if (type == TokenType.START_VARIABLE)
0516: state = TokenType.UNRECOGNIZED;
0517: break;
0518: case COMMENT:
0519: start = start + charlength;
0520: charlength = 1;
0521: type = readSlash();
0522: if (type == TokenType.START_COMMENT)
0523: state = TokenType.MID_COMMENT;
0524: break;
0525: default:
0526: break;
0527: }
0528: }
0529: return type;
0530: }
0531:
0532: private TokenType readOperator(char c) {
0533:
0534: if (start >= end)
0535: return TokenType.OPERATOR;
0536: char c2;
0537:
0538: switch (c) {
0539: case '~':
0540: case '?':
0541: case ':':
0542: break;
0543: case '+':
0544: case '-':
0545: case '&':
0546: case '|':
0547: c2 = buffer[start];
0548: if (c2 == '\\')
0549: c2 = next();
0550: if (c2 != c && c2 != '=')
0551: break;
0552: start = start + charlength;
0553: charlength = 1;
0554: break;
0555: case '=':
0556: case '*':
0557: case '!':
0558: case '^':
0559: case '%':
0560: case '/':
0561: c2 = buffer[start];
0562: if (c2 == '\\')
0563: c2 = next();
0564: if (c2 != '=')
0565: break;
0566: start = start + charlength;
0567: charlength = 1;
0568: break;
0569: case '<':
0570: case '>':
0571: c2 = buffer[start];
0572: if (c2 == '\\')
0573: c2 = next();
0574: if (c2 == '=') {
0575: start = start + charlength;
0576: charlength = 1;
0577: } else if (c2 == c) {
0578: start = start + charlength;
0579: charlength = 1;
0580: if (start >= end)
0581: break;
0582: char c3 = buffer[start];
0583: if (c3 == '\\')
0584: c3 = next();
0585: if (c3 == '=') {
0586: start = start + charlength;
0587: charlength = 1;
0588: } else if (c == '>' && c3 == '>') // >>>
0589: {
0590: start = start + charlength;
0591: charlength = 1;
0592: if (start >= end)
0593: break;
0594: char c4 = buffer[start];
0595: if (c4 == '\\')
0596: c4 = next();
0597: if (c4 != '=')
0598: break;
0599: start = start + charlength;
0600: charlength = 1;
0601: }
0602: }
0603: break;
0604: }
0605: return TokenType.OPERATOR;
0606: }
0607:
0608: private TokenType readCharLiteral() {
0609:
0610: if (start >= end)
0611: return bad();
0612: char c = buffer[start];
0613: if (c == '\\')
0614: c = next();
0615:
0616: while (c != '\"') {
0617: switch (c) {
0618: case '\\':
0619: start = start + charlength;
0620: charlength = 1;
0621: boolean ok = readEscapeSequence();
0622: if (!ok)
0623: return bad();
0624: break;
0625: case '\n':
0626: return bad();
0627: default:
0628: start = start + charlength;
0629: charlength = 1;
0630: if (start >= end)
0631: return bad();
0632: break;
0633: }
0634: c = buffer[start];
0635: if (c == '\\')
0636: c = next();
0637: }
0638: if (c != '\"') {
0639: return bad();
0640: }
0641: start = start + charlength;
0642: charlength = 1;
0643: return TokenType.CHARACTER;
0644: }
0645:
0646: private TokenType readStringLiteral() {
0647:
0648: if (start >= end)
0649: return bad();
0650: char c = buffer[start];
0651: if (c == '\\')
0652: c = next();
0653:
0654: while (c != '\'') {
0655: switch (c) {
0656: case '\\':
0657: start = start + charlength;
0658: charlength = 1;
0659: boolean ok = readEscapeSequence();
0660: if (!ok)
0661: return bad();
0662: break;
0663: case '\n':
0664: return bad();
0665: default:
0666: start = start + charlength;
0667: charlength = 1;
0668: if (start >= end)
0669: return bad();
0670: break;
0671: }
0672: c = buffer[start];
0673: if (c == '\\')
0674: c = next();
0675: }
0676: if (c != '\'') {
0677: return bad();
0678: }
0679: start = start + charlength;
0680: charlength = 1;
0681: return TokenType.STRING;
0682: }
0683:
0684: private TokenType readSlash() {
0685:
0686: if (start >= end)
0687: return TokenType.OPERATOR;
0688: char c = buffer[start];
0689: if (c == '\\') {
0690: c = next();
0691: }
0692: if (c == '-' && buffer[start - 1] == '-') {
0693: while (c != '\n') {
0694: start = start + charlength;
0695: charlength = 1;
0696: if (start >= end) {
0697: return TokenType.COMMENT;
0698: }
0699: c = buffer[start];
0700: if (c == '\\') {
0701: c = next();
0702: }
0703: }
0704: start = start + charlength;
0705: charlength = 1;
0706: return TokenType.COMMENT;
0707: } else if (c == '*') {
0708: start = start + charlength;
0709: charlength = 1;
0710: return readComment(TokenType.START_COMMENT);
0711: }
0712: return readOperator('/');
0713: }
0714:
0715: private TokenType readVariable(TokenType type) {
0716:
0717: if (start >= end) {
0718: return type;
0719: }
0720:
0721: char c = buffer[start];
0722: if (c == '\\') {
0723: c = next();
0724: }
0725:
0726: if (c == '{' && buffer[start - 1] == '$') {
0727: while (c != '}') {
0728: start = start + charlength;
0729: charlength = 1;
0730: if (start >= end) {
0731: return bad();
0732: }
0733: c = buffer[start];
0734: if (c == '\\') {
0735: c = next();
0736: }
0737: }
0738: start = start + charlength;
0739: charlength = 1;
0740: return TokenType.VARIABLE;
0741: }
0742: return readOperator('$');
0743: }
0744:
0745: // Read one line of a /*...*/ comment, given the expected type
0746: private TokenType readComment(TokenType type) {
0747:
0748: if (start >= end)
0749: return type;
0750: char c = buffer[start];
0751: if (c == '\\')
0752: c = next();
0753:
0754: while (true) {
0755: while (c != '*' && c != '\n') {
0756: start = start + charlength;
0757: charlength = 1;
0758: if (start >= end) {
0759: return bad();
0760: }
0761: c = buffer[start];
0762: if (c == '\\') {
0763: c = next();
0764: }
0765: }
0766: start = start + charlength;
0767: charlength = 1;
0768: if (c == '\n')
0769: return type;
0770: if (start >= end)
0771: return type;
0772: c = buffer[start];
0773: if (c == '\\')
0774: c = next();
0775: if (c == '/') {
0776: start = start + charlength;
0777: charlength = 1;
0778: if (type == TokenType.START_COMMENT) {
0779: return TokenType.COMMENT;
0780: }
0781: return TokenType.END_COMMENT;
0782: }
0783: }
0784: }
0785:
0786: // Read a number, without checking whether it is out of range
0787: // Doesn't deal with e.g. 0777.9 or 07779f
0788: private TokenType readNumber(final char c) {
0789:
0790: char character = c;
0791: if (character == '0') {
0792: int saveStart = start, saveLength = charlength;
0793: start = start + charlength;
0794: charlength = 1;
0795: if (start >= end)
0796: return TokenType.NUMBER;
0797: char c2 = buffer[start];
0798: if (c2 == '\\')
0799: c2 = next();
0800: switch (c2) {
0801: case 'x':
0802: case 'X':
0803: start = start + charlength;
0804: charlength = 1;
0805: boolean ok = readDigits(16);
0806: if (!ok)
0807: return bad();
0808: readSuffix();
0809: return TokenType.NUMBER;
0810: case 0:
0811: case 1:
0812: case 2:
0813: case 3:
0814: case 4:
0815: case 5:
0816: case 6:
0817: case 7:
0818: readDigits(8);
0819: readSuffix();
0820: return TokenType.NUMBER;
0821: case '.':
0822: case 'e':
0823: case 'E':
0824: start = saveStart;
0825: charlength = saveLength;
0826: break;
0827: case 'f':
0828: case 'F':
0829: case 'd':
0830: case 'D':
0831: start = start + charlength;
0832: charlength = 1;
0833: return TokenType.NUMBER;
0834: case 'l':
0835: case 'L':
0836: start = start + charlength;
0837: charlength = 1;
0838: return TokenType.NUMBER;
0839: }
0840: }
0841: boolean hasDigits = false;
0842: if ('0' <= c && c <= '9') {
0843: hasDigits = true;
0844: readDigits(10);
0845: if (start >= end)
0846: return TokenType.NUMBER;
0847: character = buffer[start];
0848: if (character == '\\')
0849: character = next();
0850: if (character == 'l' || character == 'L') {
0851: start = start + charlength;
0852: charlength = 1;
0853: return TokenType.NUMBER;
0854: }
0855: }
0856: if (character == '.') {
0857: start = start + charlength;
0858: charlength = 1;
0859: if (start >= end)
0860: return TokenType.NUMBER;
0861: character = buffer[start];
0862: if (character == '\\')
0863: character = next();
0864: if ('0' <= c && c <= '9') {
0865: hasDigits = true;
0866: readDigits(10);
0867: if (start >= end)
0868: return TokenType.NUMBER;
0869: character = buffer[start];
0870: if (character == '\\')
0871: character = next();
0872: }
0873: }
0874: if (!hasDigits)
0875: return bad();
0876: switch (c) {
0877: case 'e':
0878: case 'E':
0879: start = start + charlength;
0880: charlength = 1;
0881: if (start >= end)
0882: return bad();
0883: character = buffer[start];
0884: if (character == '\\')
0885: character = next();
0886: if (c == '+' || c == '-') {
0887: start = start + charlength;
0888: charlength = 1;
0889: if (start >= end) {
0890: return bad();
0891: }
0892: character = buffer[start];
0893: if (character == '\\') {
0894: character = next();
0895: }
0896: }
0897: readDigits(10);
0898: break;
0899: case 'f':
0900: case 'F':
0901: case 'd':
0902: case 'D':
0903: start = start + charlength;
0904: charlength = 1;
0905: return TokenType.NUMBER;
0906: }
0907: return TokenType.NUMBER;
0908: }
0909:
0910: private boolean readDigits(int radix) {
0911:
0912: if (start >= end)
0913: return false;
0914: char c = buffer[start];
0915: if (c == '\\')
0916: c = next();
0917: if (Character.digit(c, radix) == -1)
0918: return false;
0919: while (Character.digit(c, radix) != -1) {
0920: start = start + charlength;
0921: charlength = 1;
0922: if (start >= end)
0923: return true;
0924: c = buffer[start];
0925: if (c == '\\')
0926: c = next();
0927: }
0928: return true;
0929: }
0930:
0931: private void readSuffix() {
0932:
0933: if (start >= end)
0934: return;
0935: char c = buffer[start];
0936: if (c == '\\')
0937: c = next();
0938: switch (c) {
0939: case 'f':
0940: case 'F':
0941: case 'd':
0942: case 'D':
0943: case 'l':
0944: case 'L':
0945: start = start + charlength;
0946: charlength = 1;
0947: }
0948: }
0949:
0950: private TokenType readDot() {
0951:
0952: if (start >= end)
0953: return TokenType.SEPARATOR;
0954: char c2 = buffer[start];
0955: if (c2 == '\\')
0956: c2 = next();
0957: if (Character.isDigit(c2)) {
0958: return readNumber('.');
0959: }
0960: if (start + 1 >= end)
0961: return TokenType.SEPARATOR;
0962: if (c2 != '.' || buffer[start + 1] != '.')
0963: return TokenType.SEPARATOR;
0964: start = start + 2;
0965: return TokenType.SEPARATOR;
0966: }
0967:
0968: private boolean readEscapeSequence() {
0969:
0970: if (start >= end)
0971: return false;
0972: char c2 = buffer[start];
0973: if (c2 == '\\')
0974: c2 = next();
0975:
0976: switch (c2) {
0977: case 'b':
0978: case 't':
0979: case 'n':
0980: case 'f':
0981: case 'r':
0982: case '\"':
0983: case '\'':
0984: case '\\':
0985: start = start + charlength;
0986: charlength = 1;
0987: return true;
0988: case '0':
0989: case '1':
0990: case '2':
0991: case '3':
0992: return readOctal(3);
0993: case '4':
0994: case '5':
0995: case '6':
0996: case '7':
0997: return readOctal(2);
0998: default:
0999: return false;
1000: }
1001: }
1002:
1003: private boolean readOctal(int maxlength) {
1004:
1005: if (start >= end)
1006: return false;
1007: char c = buffer[start];
1008: if (c == '\\')
1009: c = next();
1010:
1011: int i, val = 0;
1012: for (i = 0; i < maxlength; i++) {
1013: if (Character.digit(c, 8) != -1) {
1014: val = 8 * val + Character.digit(c, 8);
1015: start = start + charlength;
1016: charlength = 1;
1017: if (start >= end)
1018: break;
1019: c = buffer[start];
1020: if (c == '\\')
1021: c = next();
1022: } else
1023: break;
1024: }
1025: if ((i == 0) || (val > 0xFF))
1026: return false;
1027: return true;
1028: }
1029:
1030: // A malformed or incomplete token has a negative type
1031: private TokenType bad() {
1032:
1033: return TokenType.UNRECOGNIZED;
1034: }
1035:
1036: // Look ahead at the next character or unicode escape.
1037: // For efficiency, replace c = next(); with
1038: // c = buffer[start]; if (c == '\\') c = next();
1039: // To accept the character after looking at it, use:
1040: // start = start + charlength; charlength = 1;
1041:
1042: // Record the number of source code characters used up. To deal with an odd
1043: // or even number of backslashes preceding a unicode escape, whenever a
1044: // second backslash is coming up, mark its position as a pair.
1045:
1046: private char next() {
1047:
1048: if (start >= end)
1049: return 26; // EOF
1050: char c = buffer[start];
1051: if (c != '\\')
1052: return c;
1053: if (start == pair) {
1054: pair = 0;
1055: return '\\';
1056: }
1057: if (start + 1 >= end)
1058: return '\\';
1059:
1060: c = buffer[start + 1];
1061: if (c == '\\')
1062: pair = start + 1;
1063: if (c != 'u')
1064: return '\\';
1065:
1066: int pos = start + 2;
1067: while (pos < end && buffer[pos] == 'u')
1068: pos++;
1069: if (pos + 4 > end) {
1070: charlength = end - start;
1071: return '\0';
1072: }
1073:
1074: c = 0;
1075: for (int j = 0; j < 4; j++) {
1076: int d = Character.digit(buffer[pos + j], 16);
1077: if (d < 0) {
1078: charlength = pos + j - start;
1079: return '\0';
1080: }
1081: c = (char) (c * 16 + d);
1082: }
1083: charlength = pos + 4 - start;
1084: return c;
1085: }
1086:
1087: private void initKind() {
1088:
1089: for (char c = 0; c < 128; c++)
1090: kind[c] = -1;
1091: for (char c = 0; c < 128; c++)
1092: switch (c) {
1093: case 0:
1094: case 1:
1095: case 2:
1096: case 3:
1097: case 4:
1098: case 5:
1099: case 6:
1100: case 7:
1101: case 8:
1102: case 11:
1103: case 13:
1104: case 14:
1105: case 15:
1106: case 16:
1107: case 17:
1108: case 18:
1109: case 19:
1110: case 20:
1111: case 21:
1112: case 22:
1113: case 23:
1114: case 24:
1115: case 25:
1116: case 27:
1117: case 28:
1118: case 29:
1119: case 30:
1120: case 31:
1121: case 127:
1122: case '#':
1123: case '@':
1124: case '`':
1125: case '\\':
1126: kind[c] = (byte) TokenType.UNRECOGNIZED.ordinal();
1127: break;
1128: case '\t':
1129: case '\n':
1130: case ' ':
1131: case '\f':
1132: case 26:
1133: kind[c] = (byte) TokenType.WHITESPACE.ordinal();
1134: break;
1135: case '!':
1136: case '%':
1137: case '&':
1138: case '*':
1139: case '+':
1140: case ':':
1141: case '<':
1142: case '=':
1143: case '>':
1144: case '?':
1145: case '^':
1146: case '|':
1147: case '~':
1148: kind[c] = (byte) TokenType.OPERATOR.ordinal();
1149: break;
1150: case '\'':
1151: kind[c] = (byte) TokenType.STRING.ordinal();
1152: break;
1153: case '\"':
1154: kind[c] = (byte) TokenType.CHARACTER.ordinal();
1155: break;
1156: case '.':
1157: kind[c] = (byte) TokenType.PUNCTUATION.ordinal();
1158: break;
1159: case '/':
1160: case '-':
1161: kind[c] = (byte) TokenType.COMMENT.ordinal();
1162: break;
1163: case 'A':
1164: case 'B':
1165: case 'C':
1166: case 'D':
1167: case 'E':
1168: case 'F':
1169: case 'G':
1170: case 'H':
1171: case 'I':
1172: case 'J':
1173: case 'K':
1174: case 'L':
1175: case 'M':
1176: case 'N':
1177: case 'O':
1178: case 'P':
1179: case 'Q':
1180: case 'R':
1181: case 'S':
1182: case 'T':
1183: case 'U':
1184: case 'V':
1185: case 'W':
1186: case 'X':
1187: case 'Y':
1188: case 'Z':
1189: case '_':
1190: case 'a':
1191: case 'b':
1192: case 'c':
1193: case 'd':
1194: case 'e':
1195: case 'f':
1196: case 'g':
1197: case 'h':
1198: case 'i':
1199: case 'j':
1200: case 'k':
1201: case 'l':
1202: case 'm':
1203: case 'n':
1204: case 'o':
1205: case 'p':
1206: case 'q':
1207: case 'r':
1208: case 's':
1209: case 't':
1210: case 'u':
1211: case 'v':
1212: case 'w':
1213: case 'x':
1214: case 'y':
1215: case 'z':
1216: kind[c] = (byte) TokenType.IDENTIFIER.ordinal();
1217: break;
1218: case '0':
1219: case '1':
1220: case '2':
1221: case '3':
1222: case '4':
1223: case '5':
1224: case '6':
1225: case '7':
1226: case '8':
1227: case '9':
1228: kind[c] = (byte) TokenType.NUMBER.ordinal();
1229: break;
1230: case '(':
1231: case ')':
1232: case '[':
1233: case ']':
1234: case '{':
1235: case '}':
1236: kind[c] = (byte) TokenType.BRACKET.ordinal();
1237: break;
1238: case ',':
1239: case ';':
1240: kind[c] = (byte) TokenType.SEPARATOR.ordinal();
1241: break;
1242: case '$':
1243: kind[c] = (byte) TokenType.VARIABLE.ordinal();
1244: break;
1245: }
1246: for (char c = 0; c < 128; c++)
1247: if (kind[c] == -1)
1248: System.out.println("Char " + ((int) c)
1249: + " hasn't been classified");
1250: }
1251:
1252: private void initUniKind() {
1253:
1254: for (byte b = 0; b < 31; b++)
1255: unikind[b] = -1;
1256: for (byte b = 0; b < 31; b++)
1257: switch (b) {
1258: case Character.UNASSIGNED:
1259: case Character.ENCLOSING_MARK:
1260: case Character.OTHER_NUMBER:
1261: case Character.SPACE_SEPARATOR:
1262: case Character.LINE_SEPARATOR:
1263: case Character.PARAGRAPH_SEPARATOR:
1264: case Character.CONTROL:
1265: case 17: // category 17 is unused
1266: case Character.PRIVATE_USE:
1267: case Character.SURROGATE:
1268: case Character.DASH_PUNCTUATION:
1269: case Character.START_PUNCTUATION:
1270: case Character.END_PUNCTUATION:
1271: case Character.OTHER_PUNCTUATION:
1272: case Character.MATH_SYMBOL:
1273: case Character.MODIFIER_SYMBOL:
1274: case Character.OTHER_SYMBOL:
1275: case Character.INITIAL_QUOTE_PUNCTUATION:
1276: case Character.FINAL_QUOTE_PUNCTUATION:
1277: unikind[b] = (byte) TokenType.UNRECOGNIZED.ordinal();
1278: break;
1279: case Character.UPPERCASE_LETTER:
1280: case Character.LOWERCASE_LETTER:
1281: case Character.TITLECASE_LETTER:
1282: case Character.MODIFIER_LETTER:
1283: case Character.OTHER_LETTER:
1284: case Character.LETTER_NUMBER:
1285: case Character.CONNECTOR_PUNCTUATION: // maybe NUMBER
1286: case Character.CURRENCY_SYMBOL:
1287: // Characters where Other_ID_Start is true
1288: unikind[b] = (byte) TokenType.IDENTIFIER.ordinal();
1289: break;
1290: case Character.NON_SPACING_MARK:
1291: case Character.COMBINING_SPACING_MARK:
1292: case Character.DECIMAL_DIGIT_NUMBER:
1293: case Character.FORMAT:
1294: unikind[b] = (byte) TokenType.NUMBER.ordinal();
1295: break;
1296: }
1297: for (byte b = 0; b < 31; b++)
1298: if (unikind[b] == -1)
1299: System.out.println("Unicode cat " + b
1300: + " hasn't been classified");
1301: }
1302:
1303: }
|