0001: package net.sf.saxon.expr;
0002:
0003: import net.sf.saxon.functions.NormalizeSpace;
0004: import net.sf.saxon.trans.StaticError;
0005:
0006: import java.util.ArrayList;
0007: import java.util.List;
0008:
0009: /**
0010: * Tokenizer for expressions and inputs.
0011: *
0012: * This code was originally derived from James Clark's xt, though it has been greatly modified since.
0013: * See copyright notice at end of file.
0014: */
0015:
0016: public final class Tokenizer {
0017:
0018: public int getState() {
0019: return state;
0020: }
0021:
0022: public void setState(int state) {
0023: this .state = state;
0024: if (state == DEFAULT_STATE) {
0025: // force the followsOperator() test to return true
0026: precedingToken = Token.UNKNOWN;
0027: currentToken = Token.UNKNOWN;
0028: } else if (state == OPERATOR_STATE) {
0029: precedingToken = Token.RPAR;
0030: currentToken = Token.RPAR;
0031: }
0032: }
0033:
0034: private int state = DEFAULT_STATE;
0035: // we may need to make this a stack at some time
0036:
0037: /**
0038: * Initial default state of the Tokenizer
0039: */
0040: public static final int DEFAULT_STATE = 0;
0041:
0042: /**
0043: * State in which a name is NOT to be merged with what comes next, for example "("
0044: */
0045: public static final int BARE_NAME_STATE = 1;
0046:
0047: /**
0048: * State in which the next thing to be read is a SequenceType
0049: */
0050: public static final int SEQUENCE_TYPE_STATE = 2;
0051: /**
0052: * State in which the next thing to be read is an operator
0053: */
0054:
0055: public static final int OPERATOR_STATE = 3;
0056:
0057: /**
0058: * The starting line number (for XPath in XSLT, the line number in the stylesheet)
0059: */
0060: public int startLineNumber;
0061: /**
0062: * The number identifying the most recently read token
0063: */
0064: public int currentToken = Token.EOF;
0065: /**
0066: * The string value of the most recently read token
0067: */
0068: public String currentTokenValue = null;
0069: /**
0070: * The position in the input expression where the current token starts
0071: */
0072: public int currentTokenStartOffset = 0;
0073: /**
0074: * The number of the next token to be returned
0075: */
0076: private int nextToken = Token.EOF;
0077: /**
0078: * The string value of the next token to be returned
0079: */
0080: private String nextTokenValue = null;
0081: /**
0082: * The position in the expression of the start of the next token
0083: */
0084: private int nextTokenStartOffset = 0;
0085: /**
0086: * The string being parsed
0087: */
0088: public String input;
0089: /**
0090: * The current position within the input string
0091: */
0092: public int inputOffset = 0;
0093: /**
0094: * The length of the input string
0095: */
0096: private int inputLength;
0097: /**
0098: * The line number (within the expression) of the current token
0099: */
0100: private int lineNumber = 1;
0101: /**
0102: * The line number (within the expression) of the next token
0103: */
0104: private int nextLineNumber = 1;
0105:
0106: /**
0107: * List containing the positions (offsets in the input string) at which newline characters
0108: * occur
0109: */
0110:
0111: private List newlineOffsets = null;
0112:
0113: /**
0114: * The token number of the token that preceded the current token
0115: */
0116: private int precedingToken = Token.UNKNOWN;
0117:
0118: //public boolean recognizePragmas = false;
0119: //public String lastPragma = null;
0120:
0121: //
0122: // Lexical analyser for expressions, queries, and XSLT patterns
0123: //
0124:
0125: /**
0126: * Prepare a string for tokenization.
0127: * The actual tokens are obtained by calls on next()
0128: *
0129: * @param input the string to be tokenized
0130: * @param start start point within the string
0131: * @param end end point within the string (last character not read):
0132: * -1 means end of string
0133: * @exception net.sf.saxon.trans.StaticError if a lexical error occurs, e.g. unmatched
0134: * string quotes
0135: */
0136: public void tokenize(String input, int start, int end,
0137: int lineNumber) throws StaticError {
0138: nextToken = Token.EOF;
0139: nextTokenValue = null;
0140: nextTokenStartOffset = 0;
0141: inputOffset = start;
0142: this .input = input;
0143: this .startLineNumber = lineNumber;
0144: this .lineNumber = lineNumber;
0145: this .nextLineNumber = lineNumber;
0146: if (end == -1) {
0147: this .inputLength = input.length();
0148: } else {
0149: this .inputLength = end;
0150: }
0151:
0152: // The tokenizer actually reads one token ahead. The raw lexical analysis performed by
0153: // the lookAhead() method does not (in general) distinguish names used as QNames from names
0154: // used for operators, axes, and functions. The next() routine further refines names into the
0155: // correct category, by looking at the following token. In addition, it combines compound tokens
0156: // such as "instance of" and "cast as".
0157:
0158: lookAhead();
0159: next();
0160: }
0161:
0162: //diagnostic version of next(): change real version to realnext()
0163: //
0164: //public void next() throws XPathException {
0165: // realnext();
0166: // System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
0167: //}
0168:
0169: /**
0170: * Get the next token from the input expression. The type of token is returned in the
0171: * currentToken variable, the string value of the token in currentTokenValue.
0172: *
0173: * @exception net.sf.saxon.trans.StaticError if a lexical error is detected
0174: */
0175:
0176: public void next() throws StaticError {
0177: precedingToken = currentToken;
0178: currentToken = nextToken;
0179: currentTokenValue = nextTokenValue;
0180: if (currentTokenValue == null) {
0181: currentTokenValue = "";
0182: }
0183: currentTokenStartOffset = nextTokenStartOffset;
0184: lineNumber = nextLineNumber;
0185:
0186: // disambiguate the current token based on the tokenizer state
0187:
0188: switch (currentToken) {
0189: case Token.NAME:
0190: int optype = getBinaryOp(currentTokenValue);
0191: if (optype != Token.UNKNOWN && !followsOperator()) {
0192: currentToken = optype;
0193: }
0194: break;
0195: case Token.LT:
0196: if (followsOperator()) {
0197: currentToken = Token.TAG;
0198: }
0199: break;
0200: case Token.STAR:
0201: if (!followsOperator()) {
0202: currentToken = Token.MULT;
0203: }
0204: break;
0205: }
0206:
0207: if (currentToken == Token.TAG || currentToken == Token.RCURLY) {
0208: // No lookahead after encountering "<" at the start of an XML-like tag.
0209: // After an RCURLY, the parser must do an explicit lookahead() to continue
0210: // tokenizing; otherwise it can continue with direct character reading
0211: return;
0212: }
0213:
0214: lookAhead();
0215:
0216: if (currentToken == Token.NAME) {
0217: if (state == BARE_NAME_STATE) {
0218: return;
0219: }
0220: switch (nextToken) {
0221: case Token.LPAR:
0222: int op = getBinaryOp(currentTokenValue);
0223: if (op == Token.UNKNOWN) {
0224: currentToken = getFunctionType(currentTokenValue);
0225: lookAhead(); // swallow the "("
0226: } else {
0227: currentToken = op;
0228: }
0229: break;
0230:
0231: case Token.LCURLY:
0232: if (!(state == SEQUENCE_TYPE_STATE)) {
0233: currentToken = Token.KEYWORD_CURLY;
0234: lookAhead(); // swallow the "{"
0235: }
0236: break;
0237:
0238: case Token.COLONCOLON:
0239: lookAhead();
0240: currentToken = Token.AXIS;
0241: break;
0242:
0243: case Token.COLONSTAR:
0244: lookAhead();
0245: currentToken = Token.PREFIX;
0246: break;
0247:
0248: case Token.DOLLAR:
0249: if (currentTokenValue == "for") {
0250: currentToken = Token.FOR;
0251: } else if (currentTokenValue == "some") {
0252: currentToken = Token.SOME;
0253: } else if (currentTokenValue == "every") {
0254: currentToken = Token.EVERY;
0255: } else if (currentTokenValue == "let") {
0256: currentToken = Token.LET;
0257: }
0258: break;
0259:
0260: case Token.NAME:
0261: int candidate = -1;
0262: if (currentTokenValue.equals("element")) {
0263: candidate = Token.ELEMENT_QNAME;
0264: } else if (currentTokenValue.equals("attribute")) {
0265: candidate = Token.ATTRIBUTE_QNAME;
0266: } else if (currentTokenValue
0267: .equals("processing-instruction")) {
0268: candidate = Token.PI_QNAME;
0269: }
0270: if (candidate != -1) {
0271: // <'element' QName '{'> constructor
0272: // <'attribute' QName '{'> constructor
0273: // <'processing-instruction' QName '{'> constructor
0274:
0275: String qname = nextTokenValue;
0276: String saveTokenValue = currentTokenValue;
0277: int savePosition = inputOffset;
0278: lookAhead();
0279: if (nextToken == Token.LCURLY) {
0280: currentToken = candidate;
0281: currentTokenValue = qname;
0282: lookAhead();
0283: return;
0284: } else {
0285: // backtrack (we don't have 2-token lookahead; this is the
0286: // only case where it's needed. So we backtrack instead.)
0287: currentToken = Token.NAME;
0288: currentTokenValue = saveTokenValue;
0289: inputOffset = savePosition;
0290: nextToken = Token.NAME;
0291: nextTokenValue = qname;
0292: }
0293:
0294: }
0295: String composite = currentTokenValue + ' '
0296: + nextTokenValue;
0297: Integer val = (Integer) Token.doubleKeywords
0298: .get(composite);
0299: if (val == null) {
0300: break;
0301: } else {
0302: currentToken = val.intValue();
0303: currentTokenValue = composite;
0304: lookAhead();
0305: return;
0306: }
0307: default:
0308: // no action needed
0309: }
0310: }
0311: }
0312:
0313: /**
0314: * Force the current token to be treated as an operator if possible
0315: */
0316:
0317: public void treatCurrentAsOperator() {
0318: switch (currentToken) {
0319: case Token.NAME:
0320: int optype = getBinaryOp(currentTokenValue);
0321: if (optype != Token.UNKNOWN) {
0322: currentToken = optype;
0323: }
0324: break;
0325: case Token.STAR:
0326: currentToken = Token.MULT;
0327: break;
0328: }
0329: }
0330:
0331: /**
0332: * Look ahead by one token. This method does the real tokenization work.
0333: * The method is normally called internally, but the XQuery parser also
0334: * calls it to resume normal tokenization after dealing with pseudo-XML
0335: * syntax.
0336: * @exception net.sf.saxon.trans.StaticError if a lexical error occurs
0337: */
0338: public void lookAhead() throws StaticError {
0339: precedingToken = nextToken;
0340: nextTokenValue = null;
0341: nextTokenStartOffset = inputOffset;
0342: for (;;) {
0343: if (inputOffset >= inputLength) {
0344: nextToken = Token.EOF;
0345: return;
0346: }
0347: char c = input.charAt(inputOffset++);
0348: switch (c) {
0349: case '/':
0350: if (inputOffset < inputLength
0351: && input.charAt(inputOffset) == '/') {
0352: inputOffset++;
0353: nextToken = Token.SLSL;
0354: return;
0355: }
0356: nextToken = Token.SLASH;
0357: return;
0358: case ':':
0359: if (inputOffset < inputLength) {
0360: if (input.charAt(inputOffset) == ':') {
0361: inputOffset++;
0362: nextToken = Token.COLONCOLON;
0363: return;
0364: } else if (input.charAt(inputOffset) == '=') {
0365: nextToken = Token.ASSIGN;
0366: inputOffset++;
0367: return;
0368: }
0369: }
0370: throw new StaticError(
0371: "Unexpected colon at start of token");
0372: case '@':
0373: nextToken = Token.AT;
0374: return;
0375: case '?':
0376: nextToken = Token.QMARK;
0377: return;
0378: case '[':
0379: nextToken = Token.LSQB;
0380: return;
0381: case ']':
0382: nextToken = Token.RSQB;
0383: return;
0384: case '{':
0385: nextToken = Token.LCURLY;
0386: return;
0387: case '}':
0388: nextToken = Token.RCURLY;
0389: return;
0390: case ';':
0391: nextToken = Token.SEMICOLON;
0392: state = DEFAULT_STATE;
0393: return;
0394: case '(':
0395: if (inputOffset < inputLength
0396: && input.charAt(inputOffset) == '#') {
0397: inputOffset++;
0398: int pragmaStart = inputOffset;
0399: int nestingDepth = 1;
0400: while (nestingDepth > 0
0401: && inputOffset < (inputLength - 1)) {
0402: if (input.charAt(inputOffset) == '\n') {
0403: incrementLineNumber();
0404: } else if (input.charAt(inputOffset) == '#'
0405: && input.charAt(inputOffset + 1) == ')') {
0406: nestingDepth--;
0407: inputOffset++;
0408: } else if (input.charAt(inputOffset) == '('
0409: && input.charAt(inputOffset + 1) == '#') {
0410: nestingDepth++;
0411: inputOffset++;
0412: }
0413: inputOffset++;
0414: }
0415: if (nestingDepth > 0) {
0416: throw new StaticError("Unclosed XQuery pragma");
0417: }
0418: nextToken = Token.PRAGMA;
0419: nextTokenValue = input.substring(pragmaStart,
0420: inputOffset - 2);
0421: return;
0422: }
0423: if (inputOffset < inputLength
0424: && input.charAt(inputOffset) == ':') {
0425: // XPath comment syntax is (: .... :)
0426: // Comments may be nested, and may now be empty
0427: inputOffset++;
0428: int nestingDepth = 1;
0429: while (nestingDepth > 0
0430: && inputOffset < (inputLength - 1)) {
0431: if (input.charAt(inputOffset) == '\n') {
0432: incrementLineNumber();
0433: } else if (input.charAt(inputOffset) == ':'
0434: && input.charAt(inputOffset + 1) == ')') {
0435: // if (input.charAt(inputOffset-2) == '(' &&
0436: // input.charAt(inputOffset-1) == ':') {
0437: // throw new StaticError("Empty XPath comments are not allowed");
0438: // }
0439: nestingDepth--;
0440: inputOffset++;
0441: } else if (input.charAt(inputOffset) == '('
0442: && input.charAt(inputOffset + 1) == ':') {
0443: nestingDepth++;
0444: inputOffset++;
0445: }
0446: inputOffset++;
0447: }
0448: if (nestingDepth > 0) {
0449: throw new StaticError("Unclosed XPath comment");
0450: }
0451: lookAhead();
0452: } else {
0453: nextToken = Token.LPAR;
0454: }
0455: return;
0456: case ')':
0457: nextToken = Token.RPAR;
0458: return;
0459: case '+':
0460: nextToken = Token.PLUS;
0461: return;
0462: case '-':
0463: nextToken = Token.MINUS; // not detected if part of a name
0464: return;
0465: case '=':
0466: nextToken = Token.EQUALS;
0467: return;
0468: case '!':
0469: if (inputOffset < inputLength
0470: && input.charAt(inputOffset) == '=') {
0471: inputOffset++;
0472: nextToken = Token.NE;
0473: return;
0474: }
0475: throw new StaticError("'!' without '='");
0476: case '*':
0477: // disambiguation of MULT and STAR is now done later
0478: //if (followsOperator()) {
0479: if (inputOffset < inputLength
0480: && input.charAt(inputOffset) == ':') {
0481: inputOffset++;
0482: nextToken = Token.SUFFIX;
0483: // we leave the parser to get the following name as a separate
0484: // token, but first check there's no intervening white space
0485: if (inputOffset < inputLength) {
0486: char ahead = input.charAt(inputOffset);
0487: if (" \r\t\n".indexOf(ahead) >= 0) {
0488: throw new StaticError(
0489: "Whitespace is not allowed after '*:'");
0490: }
0491: }
0492: return;
0493: }
0494: nextToken = Token.STAR;
0495: //} else {
0496: // nextToken = MULT;
0497: //}
0498: return;
0499: case ',':
0500: nextToken = Token.COMMA;
0501: return;
0502: case '$':
0503: nextToken = Token.DOLLAR;
0504: return;
0505: case '|':
0506: nextToken = Token.UNION;
0507: return;
0508: case '<':
0509: if (inputOffset < inputLength
0510: && input.charAt(inputOffset) == '=') {
0511: inputOffset++;
0512: nextToken = Token.LE;
0513: return;
0514: }
0515: if (inputOffset < inputLength
0516: && input.charAt(inputOffset) == '<') {
0517: inputOffset++;
0518: nextToken = Token.PRECEDES;
0519: return;
0520: }
0521: nextToken = Token.LT;
0522: return;
0523: case '>':
0524: if (inputOffset < inputLength
0525: && input.charAt(inputOffset) == '=') {
0526: inputOffset++;
0527: nextToken = Token.GE;
0528: return;
0529: }
0530: if (inputOffset < inputLength
0531: && input.charAt(inputOffset) == '>') {
0532: inputOffset++;
0533: nextToken = Token.FOLLOWS;
0534: return;
0535: }
0536: nextToken = Token.GT;
0537: return;
0538: case '.':
0539: if (inputOffset < inputLength
0540: && input.charAt(inputOffset) == '.') {
0541: inputOffset++;
0542: nextToken = Token.DOTDOT;
0543: return;
0544: }
0545: if (inputOffset == inputLength
0546: || input.charAt(inputOffset) < '0'
0547: || input.charAt(inputOffset) > '9') {
0548: nextToken = Token.DOT;
0549: return;
0550: }
0551: // otherwise drop through: we have a number starting with a decimal point
0552: case '0':
0553: case '1':
0554: case '2':
0555: case '3':
0556: case '4':
0557: case '5':
0558: case '6':
0559: case '7':
0560: case '8':
0561: case '9':
0562: // The logic here can return some tokens that are not legitimate numbers,
0563: // for example "23e" or "1.0e+". However, this will only happen if the XPath
0564: // expression as a whole is syntactically incorrect.
0565: // These errors will be caught by the numeric constructor.
0566: boolean allowE = true;
0567: boolean allowSign = false;
0568: boolean allowDot = true;
0569: boolean endOfNum = false;
0570: numloop: while (!endOfNum) {
0571: switch (c) {
0572: case '0':
0573: case '1':
0574: case '2':
0575: case '3':
0576: case '4':
0577: case '5':
0578: case '6':
0579: case '7':
0580: case '8':
0581: case '9':
0582: allowSign = false;
0583: break;
0584: case '.':
0585: if (allowDot) {
0586: allowDot = false;
0587: allowSign = false;
0588: } else {
0589: inputOffset--;
0590: break numloop;
0591: }
0592: break;
0593: case 'E':
0594: case 'e':
0595: if (allowE) {
0596: allowSign = true;
0597: allowE = false;
0598: } else {
0599: inputOffset--;
0600: break numloop;
0601: }
0602: break;
0603: case '+':
0604: case '-':
0605: if (allowSign) {
0606: allowSign = false;
0607: } else {
0608: inputOffset--;
0609: break numloop;
0610: }
0611: break;
0612: default:
0613: if (('a' <= c && c <= 'z') || c > 127) {
0614: // this prevents the famous "10div 3"
0615: throw new StaticError(
0616: "Separator needed after numeric literal");
0617: }
0618: inputOffset--;
0619: break numloop;
0620: }
0621: if (inputOffset >= inputLength)
0622: break;
0623: c = input.charAt(inputOffset++);
0624: }
0625: nextTokenValue = input.substring(nextTokenStartOffset,
0626: inputOffset);
0627: nextToken = Token.NUMBER;
0628: return;
0629: case '"':
0630: case '\'':
0631: nextTokenValue = "";
0632: while (true) {
0633: inputOffset = input.indexOf(c, inputOffset);
0634: if (inputOffset < 0) {
0635: inputOffset = nextTokenStartOffset + 1;
0636: throw new StaticError(
0637: "Unmatched quote in expression");
0638: }
0639: nextTokenValue += input.substring(
0640: nextTokenStartOffset + 1, inputOffset++);
0641: // look for doubled delimiters
0642: if (inputOffset < inputLength
0643: && input.charAt(inputOffset) == c) {
0644: nextTokenValue += c;
0645: nextTokenStartOffset = inputOffset;
0646: inputOffset++;
0647: } else {
0648: break;
0649: }
0650: }
0651:
0652: // maintain line number if there are newlines in the string
0653: if (nextTokenValue.indexOf('\n') >= 0) {
0654: for (int i = 0; i < nextTokenValue.length(); i++) {
0655: if (nextTokenValue.charAt(i) == '\n') {
0656: lineNumber++;
0657: if (newlineOffsets == null) {
0658: newlineOffsets = new ArrayList(20);
0659: }
0660: newlineOffsets.add(new Integer(
0661: nextTokenStartOffset + i));
0662: }
0663: }
0664: }
0665: nextTokenValue = nextTokenValue.intern();
0666: nextToken = Token.STRING_LITERAL;
0667: return;
0668: case '\n':
0669: incrementLineNumber();
0670: // drop through
0671: case ' ':
0672: case '\t':
0673: case '\r':
0674: nextTokenStartOffset = inputOffset;
0675: break;
0676: default:
0677: if (c < 0x80 && !Character.isLetter(c)) {
0678: throw new StaticError("Invalid character '" + c
0679: + "' in expression");
0680: }
0681: /* fall through */
0682: case '_':
0683: loop: for (; inputOffset < inputLength; inputOffset++) {
0684: c = input.charAt(inputOffset);
0685: switch (c) {
0686: case ':':
0687: if (inputOffset + 1 < inputLength) {
0688: char nc = input.charAt(inputOffset + 1);
0689: if (nc == ':') {
0690: nextTokenValue = input.substring(
0691: nextTokenStartOffset,
0692: inputOffset).intern();
0693: nextToken = Token.AXIS;
0694: inputOffset += 2;
0695: return;
0696: } else if (nc == '*') {
0697: nextTokenValue = input.substring(
0698: nextTokenStartOffset,
0699: inputOffset).intern();
0700: nextToken = Token.PREFIX;
0701: inputOffset += 2;
0702: return;
0703: } else if (nc == '=') {
0704: // as in "let $x:=2"
0705: nextTokenValue = input.substring(
0706: nextTokenStartOffset,
0707: inputOffset).intern();
0708: nextToken = Token.NAME;
0709: return;
0710: }
0711: }
0712: break;
0713: case '.':
0714: case '-':
0715: case '_':
0716: break;
0717:
0718: default:
0719: if (c < 0x80 && !Character.isLetterOrDigit(c))
0720: break loop;
0721: break;
0722: }
0723: }
0724: nextTokenValue = input.substring(nextTokenStartOffset,
0725: inputOffset).intern();
0726: nextToken = Token.NAME;
0727: return;
0728: }
0729: }
0730: }
0731:
0732: /**
0733: * Identify a binary operator
0734: *
0735: * @param s String representation of the operator - must be interned
0736: * @return the token number of the operator, or UNKNOWN if it is not a
0737: * known operator
0738: */
0739:
0740: private static int getBinaryOp(String s) {
0741: switch (s.length()) {
0742: case 2:
0743: if (s == "or")
0744: return Token.OR;
0745: if (s == "is")
0746: return Token.IS;
0747: if (s == "to")
0748: return Token.TO;
0749: if (s == "in")
0750: return Token.IN;
0751: if (s == "eq")
0752: return Token.FEQ;
0753: if (s == "ne")
0754: return Token.FNE;
0755: if (s == "gt")
0756: return Token.FGT;
0757: if (s == "ge")
0758: return Token.FGE;
0759: if (s == "lt")
0760: return Token.FLT;
0761: if (s == "le")
0762: return Token.FLE;
0763: break;
0764: case 3:
0765: if (s == "and")
0766: return Token.AND;
0767: if (s == "div")
0768: return Token.DIV;
0769: if (s == "mod")
0770: return Token.MOD;
0771: break;
0772: case 4:
0773: if (s == "idiv")
0774: return Token.IDIV;
0775: if (s == "then")
0776: return Token.THEN;
0777: if (s == "else")
0778: return Token.ELSE;
0779: if (s == "case")
0780: return Token.CASE;
0781: break;
0782: case 5:
0783: if (s == "where")
0784: return Token.WHERE;
0785: if (s == "union")
0786: return Token.UNION;
0787: break;
0788: case 6:
0789: if (s == "except")
0790: return Token.EXCEPT;
0791: if (s == "return")
0792: return Token.RETURN;
0793: break;
0794: case 7:
0795: if (s == "default")
0796: return Token.DEFAULT;
0797: case 9:
0798: if (s == "intersect")
0799: return Token.INTERSECT;
0800: if (s == "satisfies")
0801: return Token.SATISFIES;
0802: break;
0803: }
0804: return Token.UNKNOWN;
0805: }
0806:
0807: /**
0808: * Distinguish nodekind names, "if", and function names, which are all
0809: * followed by a "("
0810: *
0811: * @param s the name - must be interned
0812: * @return the token number
0813: */
0814:
0815: private static int getFunctionType(String s) {
0816: switch (s.length()) {
0817: case 2:
0818: if (s == "if")
0819: return Token.IF;
0820: break;
0821: case 4:
0822: if (s == "node")
0823: return Token.NODEKIND;
0824: if (s == "item")
0825: return Token.NODEKIND;
0826: if (s == "text")
0827: return Token.NODEKIND;
0828: break;
0829: case 7:
0830: if (s == "element")
0831: return Token.NODEKIND;
0832: if (s == "comment")
0833: return Token.NODEKIND;
0834: break;
0835: case 9:
0836: if (s == "attribute")
0837: return Token.NODEKIND;
0838: if (s == "namespace")
0839: return Token.NODEKIND;
0840: break;
0841: case 10:
0842: if (s == "typeswitch")
0843: return Token.TYPESWITCH;
0844: break;
0845: default:
0846: if (s == "document-node")
0847: return Token.NODEKIND;
0848: if (s == "empty-sequence")
0849: return Token.NODEKIND;
0850: if (s == "schema-element")
0851: return Token.NODEKIND;
0852: if (s == "schema-attribute")
0853: return Token.NODEKIND;
0854: if (s == "processing-instruction")
0855: return Token.NODEKIND;
0856:
0857: break;
0858: }
0859: return Token.FUNCTION;
0860: }
0861:
0862: /**
0863: * Test whether the previous token is an operator
0864: * @return true if the previous token is an operator token
0865: */
0866:
0867: private boolean followsOperator() {
0868: return precedingToken <= Token.LAST_OPERATOR;
0869: }
0870:
0871: /**
0872: * Read next character directly. Used by the XQuery parser when parsing pseudo-XML syntax
0873: * @return the next character from the input
0874: * @throws StringIndexOutOfBoundsException if an attempt is made to read beyond
0875: * the end of the string. This will only occur in the event of a syntax error in the
0876: * input.
0877: */
0878:
0879: public char nextChar() throws StringIndexOutOfBoundsException {
0880: char c = input.charAt(inputOffset++);
0881: //c = normalizeLineEnding(c);
0882: if (c == '\n') {
0883: incrementLineNumber();
0884: lineNumber++;
0885: }
0886: return c;
0887: }
0888:
0889: /**
0890: * Normalize line endings according to the rules in XML 1.1.
0891: * @param c the most recently read character. The value of inputOffset must be the immediately following
0892: * character
0893: * @return c the current character after newline normalization
0894: */
0895:
0896: // private char normalizeLineEnding(char c) throws StringIndexOutOfBoundsException {
0897: // switch (c) {
0898: // case '\r':
0899: // if (input.charAt(inputOffset) == '\n' || input.charAt(inputOffset) == 0x85) {
0900: // inputOffset++;
0901: // return '\n';
0902: // } else {
0903: // return '\n';
0904: // }
0905: // case 0x85:
0906: // return '\n';
0907: // case 0x2028:
0908: // return '\n';
0909: // default:
0910: // return c;
0911: // }
0912: // }
0913: /**
0914: * Increment the line number, making a record of where in the input string the newline character occurred.
0915: */
0916:
0917: private void incrementLineNumber() {
0918: nextLineNumber++;
0919: if (newlineOffsets == null) {
0920: newlineOffsets = new ArrayList(20);
0921: }
0922: newlineOffsets.add(new Integer(inputOffset - 1));
0923: }
0924:
0925: /**
0926: * Step back one character. If this steps back to a previous line, adjust the line number.
0927: */
0928:
0929: public void unreadChar() {
0930: if (input.charAt(--inputOffset) == '\n') {
0931: nextLineNumber--;
0932: lineNumber--;
0933: if (newlineOffsets != null) {
0934: newlineOffsets.remove(newlineOffsets.size() - 1);
0935: }
0936: }
0937: }
0938:
0939: /**
0940: * Get the most recently read text (for use in an error message)
0941: */
0942:
0943: public String recentText() {
0944: if (inputOffset > inputLength) {
0945: inputOffset = inputLength;
0946: }
0947: if (inputOffset < 34) {
0948: return input.substring(0, inputOffset);
0949: } else {
0950: return NormalizeSpace.normalize(
0951: "..."
0952: + input.substring(inputOffset - 30,
0953: inputOffset)).toString();
0954: }
0955: }
0956:
0957: /**
0958: * Get the line number of the current token
0959: */
0960:
0961: public int getLineNumber() {
0962: return lineNumber;
0963: }
0964:
0965: /**
0966: * Get the column number of the current token
0967: */
0968:
0969: public int getColumnNumber() {
0970: return (int) (getLineAndColumn(currentTokenStartOffset) & 0x7fffffff);
0971: }
0972:
0973: // --Commented out by Inspection START (16/12/04 14:40):
0974: // /**
0975: // * Get the line and column number of the current token,
0976: // * as a long value with the line number in the top half
0977: // * and the column number in the lower half
0978: // * @return the line and column number, packed together
0979: // */
0980: //
0981: // public long getLineAndColumn() {
0982: // return ((long)getLineNumber()) << 32 | ((long)getColumnNumber());
0983: // }
0984: // --Commented out by Inspection STOP (16/12/04 14:40)
0985:
0986: /**
0987: * Get the line and column number corresponding to a given offset in the input expression,
0988: * as a long value with the line number in the top half
0989: * and the column number in the lower half
0990: * @return the line and column number, packed together
0991: */
0992:
0993: public long getLineAndColumn(int offset) {
0994: if (newlineOffsets == null) {
0995: return ((long) startLineNumber) << 32 | (long) offset;
0996: }
0997: for (int line = newlineOffsets.size() - 1; line >= 0; line--) {
0998: int nloffset = ((Integer) newlineOffsets.get(line))
0999: .intValue();
1000: if (offset > nloffset) {
1001: return ((long) (line + startLineNumber + 1) << 32)
1002: | ((long) (offset - nloffset));
1003: }
1004: }
1005: return ((long) startLineNumber) << 32 | (long) (offset + 1);
1006: }
1007:
1008: public int getLineNumber(int offset) {
1009: return (int) ((getLineAndColumn(offset)) >> 32);
1010: }
1011:
1012: public int getColumnNumber(int offset) {
1013: return (int) ((getLineAndColumn(offset)) & 0x7fffffff);
1014: }
1015:
1016: }
1017:
1018: /*
1019:
1020: The following copyright notice is copied from the licence for xt, from which the
1021: original version of this module was derived:
1022: --------------------------------------------------------------------------------
1023: Copyright (c) 1998, 1999 James Clark
1024:
1025: Permission is hereby granted, free of charge, to any person obtaining
1026: a copy of this software and associated documentation files (the
1027: "Software"), to deal in the Software without restriction, including
1028: without limitation the rights to use, copy, modify, merge, publish,
1029: distribute, sublicense, and/or sell copies of the Software, and to
1030: permit persons to whom the Software is furnished to do so, subject to
1031: the following conditions:
1032:
1033: The above copyright notice and this permission notice shall be included
1034: in all copies or substantial portions of the Software.
1035:
1036: THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
1037: OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
1038: MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
1039: IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
1040: OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
1041: ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
1042: OTHER DEALINGS IN THE SOFTWARE.
1043:
1044: Except as contained in this notice, the name of James Clark shall
1045: not be used in advertising or otherwise to promote the sale, use or
1046: other dealings in this Software without prior written authorization
1047: from James Clark.
1048: ---------------------------------------------------------------------------
1049: */
1050:
1051: //
1052: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
1053: // you may not use this file except in compliance with the License. You may obtain a copy of the
1054: // License at http://www.mozilla.org/MPL/
1055: //
1056: // Software distributed under the License is distributed on an "AS IS" basis,
1057: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
1058: // See the License for the specific language governing rights and limitations under the License.
1059: //
1060: // The Original Code is: all this file, other than the parts developed by James Clark as part of xt.
1061: //
1062: // The Initial Developer of the Original Code is Michael H. Kay.
1063: //
1064: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
1065: //
1066: // Contributor(s): none.
1067: //
|