0001: /*
0002: * The Apache Software License, Version 1.1
0003: *
0004: *
0005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
0006: * reserved.
0007: *
0008: * Redistribution and use in source and binary forms, with or without
0009: * modification, are permitted provided that the following conditions
0010: * are met:
0011: *
0012: * 1. Redistributions of source code must retain the above copyright
0013: * notice, this list of conditions and the following disclaimer.
0014: *
0015: * 2. Redistributions in binary form must reproduce the above copyright
0016: * notice, this list of conditions and the following disclaimer in
0017: * the documentation and/or other materials provided with the
0018: * distribution.
0019: *
0020: * 3. The end-user documentation included with the redistribution,
0021: * if any, must include the following acknowledgment:
0022: * "This product includes software developed by the
0023: * Apache Software Foundation (http://www.apache.org/)."
0024: * Alternately, this acknowledgment may appear in the software itself,
0025: * if and wherever such third-party acknowledgments normally appear.
0026: *
0027: * 4. The names "Xerces" and "Apache Software Foundation" must
0028: * not be used to endorse or promote products derived from this
0029: * software without prior written permission. For written
0030: * permission, please contact apache@apache.org.
0031: *
0032: * 5. Products derived from this software may not be called "Apache",
0033: * nor may "Apache" appear in their name, without prior written
0034: * permission of the Apache Software Foundation.
0035: *
0036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0047: * SUCH DAMAGE.
0048: * ====================================================================
0049: *
0050: * This software consists of voluntary contributions made by many
0051: * individuals on behalf of the Apache Software Foundation and was
0052: * originally based on software copyright (c) 1999, International
0053: * Business Machines, Inc., http://www.apache.org. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.xerces.utils.regex;
0059:
0060: import java.util.Locale;
0061: import java.util.MissingResourceException;
0062: import java.util.ResourceBundle;
0063: import java.util.Vector;
0064:
0065: /**
0066: * A Regular Expression Parser.
0067: */
0068: class RegexParser {
0069: static final int T_CHAR = 0;
0070: static final int T_EOF = 1;
0071: static final int T_OR = 2; // '|'
0072: static final int T_STAR = 3; // '*'
0073: static final int T_PLUS = 4; // '+'
0074: static final int T_QUESTION = 5; // '?'
0075: static final int T_LPAREN = 6; // '('
0076: static final int T_RPAREN = 7; // ')'
0077: static final int T_DOT = 8; // '.'
0078: static final int T_LBRACKET = 9; // '['
0079: static final int T_BACKSOLIDUS = 10; // '\'
0080: static final int T_CARET = 11; // '^'
0081: static final int T_DOLLAR = 12; // '$'
0082: static final int T_LPAREN2 = 13; // '(?:'
0083: static final int T_LOOKAHEAD = 14; // '(?='
0084: static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
0085: static final int T_LOOKBEHIND = 16; // '(?<='
0086: static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
0087: static final int T_INDEPENDENT = 18; // '(?>'
0088: static final int T_SET_OPERATIONS = 19; // '(?['
0089: static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
0090: static final int T_COMMENT = 21; // '(?#'
0091: static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
0092: static final int T_CONDITION = 23; // '(?('
0093: static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
0094:
0095: static class ReferencePosition {
0096: int refNumber;
0097: int position;
0098:
0099: ReferencePosition(int n, int pos) {
0100: this .refNumber = n;
0101: this .position = pos;
0102: }
0103: }
0104:
0105: int offset;
0106: String regex;
0107: int regexlen;
0108: int options;
0109: ResourceBundle resources;
0110: int chardata;
0111: int nexttoken;
0112: static protected final int S_NORMAL = 0;
0113: static protected final int S_INBRACKETS = 1;
0114: static protected final int S_INXBRACKETS = 2;
0115: int context = S_NORMAL;
0116: int parennumber = 1;
0117: boolean hasBackReferences;
0118: Vector references = null;
0119:
/** Creates a parser whose messages are loaded for the default locale. */
public RegexParser() {
    this(Locale.getDefault());
}
0123:
/**
 * Creates a parser whose messages are loaded for the given locale.
 *
 * @param locale locale passed to {@link #setLocale(Locale)}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
0127:
0128: public void setLocale(Locale locale) {
0129: try {
0130: this .resources = ResourceBundle.getBundle(
0131: "org.apache.xerces.utils.regex.message", locale);
0132: } catch (MissingResourceException mre) {
0133: throw new RuntimeException(
0134: "Installation Problem??? Couldn't load messages: "
0135: + mre.getMessage());
0136: }
0137: }
0138:
0139: final ParseException ex(String key, int loc) {
0140: return new ParseException(this .resources.getString(key), loc);
0141: }
0142:
0143: private final boolean isSet(int flag) {
0144: return (this .options & flag) == flag;
0145: }
0146:
0147: synchronized Token parse(String regex, int options)
0148: throws ParseException {
0149:
0150: this .options = options;
0151: this .offset = 0;
0152: this .setContext(S_NORMAL);
0153: this .parennumber = 1;
0154: this .hasBackReferences = false;
0155: this .regex = regex;
0156: if (this .isSet(RegularExpression.EXTENDED_COMMENT))
0157: this .regex = REUtil.stripExtendedComment(this .regex);
0158: this .regexlen = this .regex.length();
0159:
0160: this .next();
0161: Token ret = this .parseRegex();
0162: if (this .offset != this .regexlen)
0163: throw ex("parser.parse.1", this .offset);
0164: if (this .references != null) {
0165: for (int i = 0; i < this .references.size(); i++) {
0166: ReferencePosition position = (ReferencePosition) this .references
0167: .elementAt(i);
0168: if (this .parennumber <= position.refNumber)
0169: throw ex("parser.parse.2", position.position);
0170: }
0171: this .references.removeAllElements();
0172: }
0173: return ret;
0174: }
0175:
0176: /*
0177: public RegularExpression createRegex(String regex, int options) throws ParseException {
0178: Token tok = this.parse(regex, options);
0179: return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
0180: }
0181: */
0182:
0183: protected final void setContext(int con) {
0184: this .context = con;
0185: }
0186:
0187: final int read() {
0188: return this .nexttoken;
0189: }
0190:
0191: final void next() {
0192: if (this .offset >= this .regexlen) {
0193: this .chardata = -1;
0194: this .nexttoken = T_EOF;
0195: return;
0196: }
0197:
0198: int ret;
0199: int ch = this .regex.charAt(this .offset++);
0200: this .chardata = ch;
0201:
0202: if (this .context == S_INBRACKETS) {
0203: // In a character class, this.chardata has one character, that is to say,
0204: // a pair of surrogates is composed and stored to this.chardata.
0205: switch (ch) {
0206: case '\\':
0207: ret = T_BACKSOLIDUS;
0208: if (this .offset >= this .regexlen)
0209: throw ex("parser.next.1", this .offset - 1);
0210: this .chardata = this .regex.charAt(this .offset++);
0211: break;
0212:
0213: case '-':
0214: if (this .isSet(RegularExpression.XMLSCHEMA_MODE)
0215: && this .offset < this .regexlen
0216: && this .regex.charAt(this .offset) == '[') {
0217: this .offset++;
0218: ret = T_XMLSCHEMA_CC_SUBTRACTION;
0219: } else
0220: ret = T_CHAR;
0221: break;
0222:
0223: case '[':
0224: if (!this .isSet(RegularExpression.XMLSCHEMA_MODE)
0225: && this .offset < this .regexlen
0226: && this .regex.charAt(this .offset) == ':') {
0227: this .offset++;
0228: ret = T_POSIX_CHARCLASS_START;
0229: break;
0230: } // Through down
0231: default:
0232: if (REUtil.isHighSurrogate(ch)
0233: && this .offset < this .regexlen) {
0234: int low = this .regex.charAt(this .offset);
0235: if (REUtil.isLowSurrogate(low)) {
0236: this .chardata = REUtil.composeFromSurrogates(
0237: ch, low);
0238: this .offset++;
0239: }
0240: }
0241: ret = T_CHAR;
0242: }
0243: this .nexttoken = ret;
0244: return;
0245: }
0246:
0247: switch (ch) {
0248: case '|':
0249: ret = T_OR;
0250: break;
0251: case '*':
0252: ret = T_STAR;
0253: break;
0254: case '+':
0255: ret = T_PLUS;
0256: break;
0257: case '?':
0258: ret = T_QUESTION;
0259: break;
0260: case ')':
0261: ret = T_RPAREN;
0262: break;
0263: case '.':
0264: ret = T_DOT;
0265: break;
0266: case '[':
0267: ret = T_LBRACKET;
0268: break;
0269: case '^':
0270: ret = T_CARET;
0271: break;
0272: case '$':
0273: ret = T_DOLLAR;
0274: break;
0275: case '(':
0276: ret = T_LPAREN;
0277: if (this .offset >= this .regexlen)
0278: break;
0279: if (this .regex.charAt(this .offset) != '?')
0280: break;
0281: if (++this .offset >= this .regexlen)
0282: throw ex("parser.next.2", this .offset - 1);
0283: ch = this .regex.charAt(this .offset++);
0284: switch (ch) {
0285: case ':':
0286: ret = T_LPAREN2;
0287: break;
0288: case '=':
0289: ret = T_LOOKAHEAD;
0290: break;
0291: case '!':
0292: ret = T_NEGATIVELOOKAHEAD;
0293: break;
0294: case '[':
0295: ret = T_SET_OPERATIONS;
0296: break;
0297: case '>':
0298: ret = T_INDEPENDENT;
0299: break;
0300: case '<':
0301: if (this .offset >= this .regexlen)
0302: throw ex("parser.next.2", this .offset - 3);
0303: ch = this .regex.charAt(this .offset++);
0304: if (ch == '=') {
0305: ret = T_LOOKBEHIND;
0306: } else if (ch == '!') {
0307: ret = T_NEGATIVELOOKBEHIND;
0308: } else
0309: throw ex("parser.next.3", this .offset - 3);
0310: break;
0311: case '#':
0312: while (this .offset < this .regexlen) {
0313: ch = this .regex.charAt(this .offset++);
0314: if (ch == ')')
0315: break;
0316: }
0317: if (ch != ')')
0318: throw ex("parser.next.4", this .offset - 1);
0319: ret = T_COMMENT;
0320: break;
0321: default:
0322: if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch
0323: && ch <= 'Z') {// Options
0324: this .offset--;
0325: ret = T_MODIFIERS;
0326: break;
0327: } else if (ch == '(') { // conditional
0328: ret = T_CONDITION; // this.offsets points the next of '('.
0329: break;
0330: }
0331: throw ex("parser.next.2", this .offset - 2);
0332: }
0333: break;
0334:
0335: case '\\':
0336: ret = T_BACKSOLIDUS;
0337: if (this .offset >= this .regexlen)
0338: throw ex("parser.next.1", this .offset - 1);
0339: this .chardata = this .regex.charAt(this .offset++);
0340: break;
0341:
0342: default:
0343: ret = T_CHAR;
0344: if (REUtil.isHighSurrogate(this .chardata)
0345: && this .offset < this .regexlen)
0346: this .chardata = REUtil
0347: .composeFromSurrogates(this .chardata,
0348: this .regex.charAt(this .offset++));
0349: }
0350: this .nexttoken = ret;
0351: }
0352:
0353: /**
0354: * regex ::= term (`|` term)*
0355: * term ::= factor+
0356: * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0357: * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
0358: * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
0359: * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0360: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
0361: */
0362: Token parseRegex() throws ParseException {
0363: Token tok = this .parseTerm();
0364: Token parent = null;
0365: while (this .read() == T_OR) {
0366: this .next(); // '|'
0367: if (parent == null) {
0368: parent = Token.createUnion();
0369: parent.addChild(tok);
0370: tok = parent;
0371: }
0372: tok.addChild(this .parseTerm());
0373: }
0374:
0375: return tok;
0376: }
0377:
0378: /**
0379: * term ::= factor+
0380: */
0381: Token parseTerm() throws ParseException {
0382: int ch = this .read();
0383: if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
0384: return Token.createEmpty();
0385: } else {
0386: Token tok = this .parseFactor();
0387: Token concat = null;
0388: while ((ch = this .read()) != T_OR && ch != T_RPAREN
0389: && ch != T_EOF) {
0390: if (concat == null) {
0391: concat = Token.createConcat();
0392: concat.addChild(tok);
0393: tok = concat;
0394: }
0395: concat.addChild(this .parseFactor());
0396: //tok = Token.createConcat(tok, this.parseFactor());
0397: }
0398: return tok;
0399: }
0400: }
0401:
0402: // ----------------------------------------------------------------
0403:
0404: Token processCaret() throws ParseException {
0405: this .next();
0406: return Token.token_linebeginning;
0407: }
0408:
0409: Token processDollar() throws ParseException {
0410: this .next();
0411: return Token.token_lineend;
0412: }
0413:
0414: Token processLookahead() throws ParseException {
0415: this .next();
0416: Token tok = Token
0417: .createLook(Token.LOOKAHEAD, this .parseRegex());
0418: if (this .read() != T_RPAREN)
0419: throw ex("parser.factor.1", this .offset - 1);
0420: this .next(); // ')'
0421: return tok;
0422: }
0423:
0424: Token processNegativelookahead() throws ParseException {
0425: this .next();
0426: Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this
0427: .parseRegex());
0428: if (this .read() != T_RPAREN)
0429: throw ex("parser.factor.1", this .offset - 1);
0430: this .next(); // ')'
0431: return tok;
0432: }
0433:
0434: Token processLookbehind() throws ParseException {
0435: this .next();
0436: Token tok = Token.createLook(Token.LOOKBEHIND, this
0437: .parseRegex());
0438: if (this .read() != T_RPAREN)
0439: throw ex("parser.factor.1", this .offset - 1);
0440: this .next(); // ')'
0441: return tok;
0442: }
0443:
0444: Token processNegativelookbehind() throws ParseException {
0445: this .next();
0446: Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this
0447: .parseRegex());
0448: if (this .read() != T_RPAREN)
0449: throw ex("parser.factor.1", this .offset - 1);
0450: this .next(); // ')'
0451: return tok;
0452: }
0453:
0454: Token processBacksolidus_A() throws ParseException {
0455: this .next();
0456: return Token.token_stringbeginning;
0457: }
0458:
0459: Token processBacksolidus_Z() throws ParseException {
0460: this .next();
0461: return Token.token_stringend2;
0462: }
0463:
0464: Token processBacksolidus_z() throws ParseException {
0465: this .next();
0466: return Token.token_stringend;
0467: }
0468:
0469: Token processBacksolidus_b() throws ParseException {
0470: this .next();
0471: return Token.token_wordedge;
0472: }
0473:
0474: Token processBacksolidus_B() throws ParseException {
0475: this .next();
0476: return Token.token_not_wordedge;
0477: }
0478:
0479: Token processBacksolidus_lt() throws ParseException {
0480: this .next();
0481: return Token.token_wordbeginning;
0482: }
0483:
0484: Token processBacksolidus_gt() throws ParseException {
0485: this .next();
0486: return Token.token_wordend;
0487: }
0488:
0489: Token processStar(Token tok) throws ParseException {
0490: this .next();
0491: if (this .read() == T_QUESTION) {
0492: this .next();
0493: return Token.createNGClosure(tok);
0494: } else
0495: return Token.createClosure(tok);
0496: }
0497:
0498: Token processPlus(Token tok) throws ParseException {
0499: // X+ -> XX*
0500: this .next();
0501: if (this .read() == T_QUESTION) {
0502: this .next();
0503: return Token.createConcat(tok, Token.createNGClosure(tok));
0504: } else
0505: return Token.createConcat(tok, Token.createClosure(tok));
0506: }
0507:
0508: Token processQuestion(Token tok) throws ParseException {
0509: // X? -> X|
0510: this .next();
0511: Token par = Token.createUnion();
0512: if (this .read() == T_QUESTION) {
0513: this .next();
0514: par.addChild(Token.createEmpty());
0515: par.addChild(tok);
0516: } else {
0517: par.addChild(tok);
0518: par.addChild(Token.createEmpty());
0519: }
0520: return par;
0521: }
0522:
0523: boolean checkQuestion(int off) {
0524: return off < this .regexlen && this .regex.charAt(off) == '?';
0525: }
0526:
0527: Token processParen() throws ParseException {
0528: this .next();
0529: int p = this .parennumber++;
0530: Token tok = Token.createParen(this .parseRegex(), p);
0531: if (this .read() != T_RPAREN)
0532: throw ex("parser.factor.1", this .offset - 1);
0533: this .next(); // Skips ')'
0534: return tok;
0535: }
0536:
0537: Token processParen2() throws ParseException {
0538: this .next();
0539: Token tok = Token.createParen(this .parseRegex(), 0);
0540: if (this .read() != T_RPAREN)
0541: throw ex("parser.factor.1", this .offset - 1);
0542: this .next(); // Skips ')'
0543: return tok;
0544: }
0545:
0546: Token processCondition() throws ParseException {
0547: // this.offset points the next of '('
0548: if (this .offset + 1 >= this .regexlen)
0549: throw ex("parser.factor.4", this .offset);
0550: // Parses a condition.
0551: int refno = -1;
0552: Token condition = null;
0553: int ch = this .regex.charAt(this .offset);
0554: if ('1' <= ch && ch <= '9') {
0555: refno = ch - '0';
0556: this .hasBackReferences = true;
0557: if (this .references == null)
0558: this .references = new Vector();
0559: this .references.addElement(new ReferencePosition(refno,
0560: this .offset));
0561: this .offset++;
0562: if (this .regex.charAt(this .offset) != ')')
0563: throw ex("parser.factor.1", this .offset);
0564: this .offset++;
0565: } else {
0566: if (ch == '?')
0567: this .offset--; // Points '('.
0568: this .next();
0569: condition = this .parseFactor();
0570: switch (condition.type) {
0571: case Token.LOOKAHEAD:
0572: case Token.NEGATIVELOOKAHEAD:
0573: case Token.LOOKBEHIND:
0574: case Token.NEGATIVELOOKBEHIND:
0575: break;
0576: case Token.ANCHOR:
0577: if (this .read() != T_RPAREN)
0578: throw ex("parser.factor.1", this .offset - 1);
0579: break;
0580: default:
0581: throw ex("parser.factor.5", this .offset);
0582: }
0583: }
0584: // Parses yes/no-patterns.
0585: this .next();
0586: Token yesPattern = this .parseRegex();
0587: Token noPattern = null;
0588: if (yesPattern.type == Token.UNION) {
0589: if (yesPattern.size() != 2)
0590: throw ex("parser.factor.6", this .offset);
0591: noPattern = yesPattern.getChild(1);
0592: yesPattern = yesPattern.getChild(0);
0593: }
0594: if (this .read() != T_RPAREN)
0595: throw ex("parser.factor.1", this .offset - 1);
0596: this .next();
0597: return Token.createCondition(refno, condition, yesPattern,
0598: noPattern);
0599: }
0600:
0601: Token processModifiers() throws ParseException {
0602: // this.offset points the next of '?'.
0603: // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
0604: int add = 0, mask = 0, ch = -1;
0605: while (this .offset < this .regexlen) {
0606: ch = this .regex.charAt(this .offset);
0607: int v = REUtil.getOptionValue(ch);
0608: if (v == 0)
0609: break; // '-' or ':'?
0610: add |= v;
0611: this .offset++;
0612: }
0613: if (this .offset >= this .regexlen)
0614: throw ex("parser.factor.2", this .offset - 1);
0615: if (ch == '-') {
0616: this .offset++;
0617: while (this .offset < this .regexlen) {
0618: ch = this .regex.charAt(this .offset);
0619: int v = REUtil.getOptionValue(ch);
0620: if (v == 0)
0621: break; // ':'?
0622: mask |= v;
0623: this .offset++;
0624: }
0625: if (this .offset >= this .regexlen)
0626: throw ex("parser.factor.2", this .offset - 1);
0627: }
0628: Token tok;
0629: if (ch == ':') {
0630: this .offset++;
0631: this .next();
0632: tok = Token.createModifierGroup(this .parseRegex(), add,
0633: mask);
0634: if (this .read() != T_RPAREN)
0635: throw ex("parser.factor.1", this .offset - 1);
0636: this .next();
0637: } else if (ch == ')') { // such as (?-i)
0638: this .offset++;
0639: this .next();
0640: tok = Token.createModifierGroup(this .parseRegex(), add,
0641: mask);
0642: } else
0643: throw ex("parser.factor.3", this .offset);
0644:
0645: return tok;
0646: }
0647:
0648: Token processIndependent() throws ParseException {
0649: this .next();
0650: Token tok = Token.createLook(Token.INDEPENDENT, this
0651: .parseRegex());
0652: if (this .read() != T_RPAREN)
0653: throw ex("parser.factor.1", this .offset - 1);
0654: this .next(); // Skips ')'
0655: return tok;
0656: }
0657:
0658: Token processBacksolidus_c() throws ParseException {
0659: int ch2; // Must be in 0x0040-0x005f
0660: if (this .offset >= this .regexlen
0661: || ((ch2 = this .regex.charAt(this .offset++)) & 0xffe0) != 0x0040)
0662: throw ex("parser.atom.1", this .offset - 1);
0663: this .next();
0664: return Token.createChar(ch2 - 0x40);
0665: }
0666:
0667: Token processBacksolidus_C() throws ParseException {
0668: throw ex("parser.process.1", this .offset);
0669: }
0670:
0671: Token processBacksolidus_i() throws ParseException {
0672: Token tok = Token.createChar('i');
0673: this .next();
0674: return tok;
0675: }
0676:
0677: Token processBacksolidus_I() throws ParseException {
0678: throw ex("parser.process.1", this .offset);
0679: }
0680:
0681: Token processBacksolidus_g() throws ParseException {
0682: this .next();
0683: return Token.getGraphemePattern();
0684: }
0685:
0686: Token processBacksolidus_X() throws ParseException {
0687: this .next();
0688: return Token.getCombiningCharacterSequence();
0689: }
0690:
0691: Token processBackreference() throws ParseException {
0692: int refnum = this .chardata - '0';
0693: Token tok = Token.createBackReference(refnum);
0694: this .hasBackReferences = true;
0695: if (this .references == null)
0696: this .references = new Vector();
0697: this .references.addElement(new ReferencePosition(refnum,
0698: this .offset - 2));
0699: this .next();
0700: return tok;
0701: }
0702:
0703: // ----------------------------------------------------------------
0704:
0705: /**
0706: * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0707: * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
0708: * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
0709: * | '(?#' [^)]* ')'
0710: * minmax ::= '{' min (',' max?)? '}'
0711: * min ::= [0-9]+
0712: * max ::= [0-9]+
0713: */
0714: Token parseFactor() throws ParseException {
0715: int ch = this .read();
0716: Token tok;
0717: switch (ch) {
0718: case T_CARET:
0719: return this .processCaret();
0720: case T_DOLLAR:
0721: return this .processDollar();
0722: case T_LOOKAHEAD:
0723: return this .processLookahead();
0724: case T_NEGATIVELOOKAHEAD:
0725: return this .processNegativelookahead();
0726: case T_LOOKBEHIND:
0727: return this .processLookbehind();
0728: case T_NEGATIVELOOKBEHIND:
0729: return this .processNegativelookbehind();
0730:
0731: case T_COMMENT:
0732: this .next();
0733: return Token.createEmpty();
0734:
0735: case T_BACKSOLIDUS:
0736: switch (this .chardata) {
0737: case 'A':
0738: return this .processBacksolidus_A();
0739: case 'Z':
0740: return this .processBacksolidus_Z();
0741: case 'z':
0742: return this .processBacksolidus_z();
0743: case 'b':
0744: return this .processBacksolidus_b();
0745: case 'B':
0746: return this .processBacksolidus_B();
0747: case '<':
0748: return this .processBacksolidus_lt();
0749: case '>':
0750: return this .processBacksolidus_gt();
0751: }
0752: // through down
0753: }
0754: tok = this .parseAtom();
0755: ch = this .read();
0756: switch (ch) {
0757: case T_STAR:
0758: return this .processStar(tok);
0759: case T_PLUS:
0760: return this .processPlus(tok);
0761: case T_QUESTION:
0762: return this .processQuestion(tok);
0763: case T_CHAR:
0764: if (this .chardata == '{') {
0765: // this.offset -> next of '{'
0766: int off = this .offset;
0767: int min = 0, max = -1;
0768: if (off >= this .regexlen)
0769: break;
0770: ch = this .regex.charAt(off++);
0771: if (ch < '0' || ch > '9') {
0772: throw new RuntimeException("Invalid quantifier '"
0773: + (char) ch + "' in " + regex);
0774: }
0775: min = ch - '0';
0776: while (off < this .regexlen
0777: && (ch = this .regex.charAt(off++)) >= '0'
0778: && ch <= '9') {
0779: min = min * 10 + ch - '0';
0780: ch = -1;
0781: }
0782: max = min;
0783: if (ch != '}' && ch != ',' && (ch < '0' || ch > '9')) {
0784: throw new RuntimeException("Invalid quantifier '"
0785: + (char) ch + "' in " + regex);
0786: }
0787: //REVISIT: check for invalid quantifiers!
0788: //
0789:
0790: else if (ch == ',') {
0791: if (ch == '}') {
0792: max = -1; // {min,}
0793: } else {
0794: max = ch - '0'; // {min,max}
0795: while (off < this .regexlen
0796: && (ch = this .regex.charAt(off++)) >= '0'
0797: && ch <= '9') {
0798: max = max * 10 + ch - '0';
0799: ch = -1;
0800: }
0801: //if (min > max)
0802: // throw new ParseException("parseFactor(): min > max: "+min+", "+max);
0803:
0804: if (ch != '}' && (ch < '0' || ch > '9')) {
0805: throw new RuntimeException(
0806: "Invalid quantifier '" + (char) ch
0807: + "' in" + regex);
0808: }
0809: }
0810: }
0811: // off -> next of '}'
0812: if (this .checkQuestion(off)) {
0813: tok = Token.createNGClosure(tok);
0814: this .offset = off + 1;
0815: } else {
0816: tok = Token.createClosure(tok);
0817: this .offset = off;
0818: }
0819: tok.setMin(min);
0820: tok.setMax(max);
0821: //System.err.println("CLOSURE: "+min+", "+max);
0822: this .next();
0823: }
0824: }
0825: return tok;
0826: }
0827:
0828: /**
0829: * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0830: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
0831: * | '(?>' regex ')'
0832: * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
0833: */
0834: Token parseAtom() throws ParseException {
0835: int ch = this .read();
0836: Token tok = null;
0837: switch (ch) {
0838: case T_LPAREN:
0839: return this .processParen();
0840: case T_LPAREN2:
0841: return this .processParen2(); // '(?:'
0842: case T_CONDITION:
0843: return this .processCondition(); // '(?('
0844: case T_MODIFIERS:
0845: return this .processModifiers(); // (?modifiers ... )
0846: case T_INDEPENDENT:
0847: return this .processIndependent();
0848: case T_DOT:
0849: this .next(); // Skips '.'
0850: tok = Token.token_dot;
0851: break;
0852:
0853: /**
0854: * char-class ::= '[' ( '^'? range ','?)+ ']'
0855: * range ::= '\d' | '\w' | '\s' | category-block | range-char
0856: * | range-char '-' range-char
0857: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
0858: * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
0859: */
0860: case T_LBRACKET:
0861: return this .parseCharacterClass(true);
0862: case T_SET_OPERATIONS:
0863: return this .parseSetOperations();
0864:
0865: case T_BACKSOLIDUS:
0866: switch (this .chardata) {
0867: case 'd':
0868: case 'D':
0869: case 'w':
0870: case 'W':
0871: case 's':
0872: case 'S':
0873: tok = this .getTokenForShorthand(this .chardata);
0874: this .next();
0875: return tok;
0876:
0877: case 'e':
0878: case 'f':
0879: case 'n':
0880: case 'r':
0881: case 't':
0882: case 'u':
0883: case 'v':
0884: case 'x': {
0885: int ch2 = this .decodeEscaped();
0886: if (ch2 < 0x10000) {
0887: tok = Token.createChar(ch2);
0888: } else {
0889: tok = Token.createString(REUtil
0890: .decomposeToSurrogates(ch2));
0891: }
0892: }
0893: break;
0894:
0895: case 'c':
0896: return this .processBacksolidus_c();
0897: case 'C':
0898: return this .processBacksolidus_C();
0899: case 'i':
0900: return this .processBacksolidus_i();
0901: case 'I':
0902: return this .processBacksolidus_I();
0903: case 'g':
0904: return this .processBacksolidus_g();
0905: case 'X':
0906: return this .processBacksolidus_X();
0907: case '1':
0908: case '2':
0909: case '3':
0910: case '4':
0911: case '5':
0912: case '6':
0913: case '7':
0914: case '8':
0915: case '9':
0916: return this .processBackreference();
0917:
0918: case 'P':
0919: case 'p':
0920: int pstart = this .offset;
0921: tok = processBacksolidus_pP(this .chardata);
0922: if (tok == null)
0923: throw this .ex("parser.atom.5", pstart);
0924: break;
0925:
0926: default:
0927: tok = Token.createChar(this .chardata);
0928: }
0929: this .next();
0930: break;
0931:
0932: case T_CHAR:
0933: tok = Token.createChar(this .chardata);
0934: this .next();
0935: break;
0936:
0937: default:
0938: throw this .ex("parser.atom.4", this .offset - 1);
0939: }
0940: return tok;
0941: }
0942:
0943: protected RangeToken processBacksolidus_pP(int c)
0944: throws ParseException {
0945: boolean positive = c == 'p';
0946: this .next();
0947: if (this .read() != T_CHAR)
0948: throw this .ex("parser.atom.2", this .offset - 1);
0949: RangeToken tok;
0950: switch (this .chardata) {
0951: case 'L': // Letter
0952: tok = Token.getRange("L", positive);
0953: break;
0954: case 'M': // Mark
0955: tok = Token.getRange("M", positive);
0956: break;
0957: case 'N': // Number
0958: tok = Token.getRange("N", positive);
0959: break;
0960: case 'Z': // Separator
0961: tok = Token.getRange("Z", positive);
0962: break;
0963: case 'C': // Other
0964: tok = Token.getRange("C", positive);
0965: break;
0966: case 'P': // Punctuation
0967: tok = Token.getRange("P", positive);
0968: break;
0969: case 'S': // Symbol
0970: tok = Token.getRange("S", positive);
0971: break;
0972: case '{':
0973: // this.offset points the next of '{'.
0974: //pstart = this.offset;
0975: int namestart = this .offset;
0976: int nameend = this .regex.indexOf('}', namestart);
0977: if (nameend < 0)
0978: throw this .ex("parser.atom.3", this .offset);
0979: this .offset = nameend + 1;
0980: tok = Token.getRange(this .regex.substring(namestart,
0981: nameend), positive);
0982: /*
0983: if (this.isSet(RegularExpression.IGNORE_CASE))
0984: tok = RangeToken.createCaseInsensitiveToken(tok);
0985: */
0986: break;
0987:
0988: default:
0989: throw this .ex("parser.atom.2", this .offset - 1);
0990: }
0991: return tok;
0992: }
0993:
0994: int processCIinCharacterClass(RangeToken tok, int c) {
0995: return this .decodeEscaped();
0996: }
0997:
0998: /**
0999: * char-class ::= '[' ( '^'? range ','?)+ ']'
1000: * range ::= '\d' | '\w' | '\s' | category-block | range-char
1001: * | range-char '-' range-char
1002: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
1003: * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
1004: */
1005: protected RangeToken parseCharacterClass(boolean useNrange)
1006: throws ParseException {
1007: this .setContext(S_INBRACKETS);
1008: this .next(); // '['
1009: boolean nrange = false;
1010: RangeToken base = null;
1011: RangeToken tok;
1012: if (this .read() == T_CHAR && this .chardata == '^') {
1013: nrange = true;
1014: this .next(); // '^'
1015: if (useNrange) {
1016: tok = Token.createNRange();
1017: } else {
1018: base = Token.createRange();
1019: base.addRange(0, Token.UTF16_MAX);
1020: tok = Token.createRange();
1021: }
1022: } else {
1023: tok = Token.createRange();
1024: }
1025: int type;
1026: boolean firstloop = true;
1027: while ((type = this .read()) != T_EOF) {
1028: if (type == T_CHAR && this .chardata == ']' && !firstloop)
1029: break;
1030: firstloop = false;
1031: int c = this .chardata;
1032: boolean end = false;
1033: if (type == T_BACKSOLIDUS) {
1034: switch (c) {
1035: case 'd':
1036: case 'D':
1037: case 'w':
1038: case 'W':
1039: case 's':
1040: case 'S':
1041: tok.mergeRanges(this .getTokenForShorthand(c));
1042: end = true;
1043: break;
1044:
1045: case 'i':
1046: case 'I':
1047: case 'c':
1048: case 'C':
1049: c = this .processCIinCharacterClass(tok, c);
1050: if (c < 0)
1051: end = true;
1052: break;
1053:
1054: case 'p':
1055: case 'P':
1056: int pstart = this .offset;
1057: RangeToken tok2 = this .processBacksolidus_pP(c);
1058: if (tok2 == null)
1059: throw this .ex("parser.atom.5", pstart);
1060: tok.mergeRanges(tok2);
1061: end = true;
1062: break;
1063:
1064: default:
1065: c = this .decodeEscaped();
1066: } // \ + c
1067: } // backsolidus
1068: // POSIX Character class such as [:alnum:]
1069: else if (type == T_POSIX_CHARCLASS_START) {
1070: int nameend = this .regex.indexOf(':', this .offset);
1071: if (nameend < 0)
1072: throw this .ex("parser.cc.1", this .offset);
1073: boolean positive = true;
1074: if (this .regex.charAt(this .offset) == '^') {
1075: this .offset++;
1076: positive = false;
1077: }
1078: String name = this .regex
1079: .substring(this .offset, nameend);
1080: RangeToken range = Token.getRange(name, positive);
1081: if (range == null)
1082: throw this .ex("parser.cc.3", this .offset);
1083: tok.mergeRanges(range);
1084: end = true;
1085: if (nameend + 1 >= this .regexlen
1086: || this .regex.charAt(nameend + 1) != ']')
1087: throw this .ex("parser.cc.1", nameend);
1088: this .offset = nameend + 2;
1089: }
1090: this .next();
1091: if (!end) { // if not shorthands...
1092: if (this .read() != T_CHAR || this .chardata != '-') { // Here is no '-'.
1093: tok.addRange(c, c);
1094: } else {
1095: this .next(); // Skips '-'
1096: if ((type = this .read()) == T_EOF)
1097: throw this .ex("parser.cc.2", this .offset);
1098: if (type == T_CHAR && this .chardata == ']') {
1099: tok.addRange(c, c);
1100: tok.addRange('-', '-');
1101: } else {
1102: int rangeend = this .chardata;
1103: if (type == T_BACKSOLIDUS)
1104: rangeend = this .decodeEscaped();
1105: this .next();
1106: tok.addRange(c, rangeend);
1107: }
1108: }
1109: }
1110: if (this .isSet(RegularExpression.SPECIAL_COMMA)
1111: && this .read() == T_CHAR && this .chardata == ',')
1112: this .next();
1113: }
1114: if (this .read() == T_EOF)
1115: throw this .ex("parser.cc.2", this .offset);
1116: if (!useNrange && nrange) {
1117: base.subtractRanges(tok);
1118: tok = base;
1119: }
1120: tok.sortRanges();
1121: tok.compactRanges();
1122: //tok.dumpRanges();
1123: /*
1124: if (this.isSet(RegularExpression.IGNORE_CASE))
1125: tok = RangeToken.createCaseInsensitiveToken(tok);
1126: */
1127: this .setContext(S_NORMAL);
1128: this .next(); // Skips ']'
1129:
1130: return tok;
1131: }
1132:
1133: private RangeToken parseCharacterClass_old(boolean useNrange)
1134: throws ParseException {
1135: this .setContext(S_INBRACKETS);
1136: this .next(); // '['
1137: boolean nrange = false;
1138: RangeToken base = null;
1139: RangeToken tok;
1140: if (this .read() == T_CHAR && this .chardata == '^') {
1141: nrange = true;
1142: this .next(); // '^'
1143: if (useNrange) {
1144: tok = Token.createNRange();
1145: } else {
1146: base = Token.createRange();
1147: base.addRange(0, Token.UTF16_MAX);
1148: tok = Token.createRange();
1149: }
1150: } else {
1151: tok = Token.createRange();
1152: }
1153: int type;
1154: while ((type = this .read()) != T_EOF
1155: && !(type == T_CHAR && this .chardata == ']')) {
1156: int c = this .chardata;
1157: /*
1158: if (type == T_CHAR && c == '^') {
1159: this.next();
1160: type = this.read();
1161: c = this.chardata;
1162: if (type == T_EOF) break;
1163:
1164: nrange = !nrange;
1165: if (nrange)
1166: tok = Token.createRange();
1167: else {
1168: base.subtractRanges(tok);
1169: tok = base;
1170: }
1171: }
1172: */
1173: boolean end = false;
1174: if (type == T_BACKSOLIDUS) {
1175: switch (c) {
1176: case 'd':
1177: case 'D':
1178: case 'w':
1179: case 'W':
1180: case 's':
1181: case 'S':
1182: tok.mergeRanges(this .getTokenForShorthand(c));
1183: end = true;
1184: break;
1185:
1186: case 'i':
1187: case 'I':
1188: case 'c':
1189: case 'C':
1190: c = this .processCIinCharacterClass(tok, c);
1191: if (c < 0)
1192: end = true;
1193: break;
1194:
1195: case 'p':
1196: case 'P':
1197: boolean positive = c == 'p';
1198: int pstart = this .offset;
1199: this .next();
1200: if (this .read() != T_CHAR)
1201: throw ex("parser.atom.2", this .offset - 1);
1202: RangeToken tok2 = null;
1203: switch (this .chardata) {
1204: case 'L': // Letter
1205: tok2 = Token.getRange("L", positive);
1206: break;
1207: case 'M': // Mark
1208: tok2 = Token.getRange("M", positive);
1209: break;
1210: case 'N': // Number
1211: tok2 = Token.getRange("N", positive);
1212: break;
1213: case 'Z': // Separator
1214: tok2 = Token.getRange("Z", positive);
1215: break;
1216: case 'C': // Other
1217: tok2 = Token.getRange("C", positive);
1218: break;
1219: case 'P': // Punctuation
1220: tok2 = Token.getRange("P", positive);
1221: break;
1222: case 'S': // Symbol
1223: tok2 = Token.getRange("S", positive);
1224: break;
1225: case '{':
1226: // this.offset points the next of '{'.
1227: pstart = this .offset;
1228: int namestart = this .offset;
1229: int nameend = this .regex
1230: .indexOf('}', namestart);
1231: if (nameend < 0)
1232: throw ex("parser.atom.3", this .offset);
1233: this .offset = nameend + 1;
1234: tok2 = Token.getRange(this .regex.substring(
1235: namestart, nameend), positive);
1236: break;
1237:
1238: default:
1239: throw ex("parser.atom.2", this .offset - 1);
1240: }
1241: if (tok2 == null)
1242: throw ex("parser.atom.5", pstart);
1243: tok.mergeRanges(tok2);
1244: end = true;
1245: break;
1246:
1247: default:
1248: c = this .decodeEscaped();
1249: } // \ + c
1250: } // backsolidus
1251: // POSIX Character class such as [:alnum:]
1252: else if (type == T_POSIX_CHARCLASS_START) {
1253: int nameend = this .regex.indexOf(':', this .offset);
1254: if (nameend < 0)
1255: throw ex("parser.cc.1", this .offset);
1256: String name = this .regex
1257: .substring(this .offset, nameend);
1258: RangeToken range = Token.getRange(name, true);
1259: if (range == null)
1260: throw ex("parser.cc.3", this .offset);
1261: tok.mergeRanges(range);
1262: end = true;
1263: if (nameend + 1 >= this .regexlen
1264: || this .regex.charAt(nameend + 1) != ']')
1265: throw ex("parser.cc.1", nameend);
1266: this .offset = nameend + 2;
1267: }
1268: this .next();
1269: if (!end) {
1270: if (this .read() != T_CHAR || this .chardata != '-') { // Here is no '-'.
1271: tok.addRange(c, c);
1272: } else {
1273: this .next(); // Skips '-'
1274: if ((type = this .read()) == T_EOF)
1275: throw ex("parser.cc.2", this .offset);
1276: int rangeend = this .chardata;
1277: if (type == T_BACKSOLIDUS)
1278: rangeend = this .decodeEscaped();
1279: this .next();
1280: tok.addRange(c, rangeend);
1281: }
1282: }
1283: if (this .read() == T_CHAR && this .chardata == ',')
1284: this .next();
1285: }
1286: if (this .read() == T_EOF)
1287: throw ex("parser.cc.2", this .offset);
1288: if (!useNrange && nrange) {
1289: base.subtractRanges(tok);
1290: tok = base;
1291: }
1292: tok.sortRanges();
1293: tok.compactRanges();
1294: //tok.dumpRanges();
1295: /*
1296: if (this.isSet(RegularExpression.IGNORE_CASE))
1297: tok = RangeToken.createCaseInsensitiveToken(tok);
1298: */
1299: this .setContext(S_NORMAL);
1300: this .next(); // Skips ']'
1301:
1302: return tok;
1303: }
1304:
1305: /**
1306: * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1307: */
1308: protected RangeToken parseSetOperations() throws ParseException {
1309: RangeToken tok = this .parseCharacterClass(false);
1310: int type;
1311: while ((type = this .read()) != T_RPAREN) {
1312: int ch = this .chardata;
1313: if (type == T_CHAR && (ch == '-' || ch == '&')
1314: || type == T_PLUS) {
1315: this .next();
1316: if (this .read() != T_LBRACKET)
1317: throw ex("parser.ope.1", this .offset - 1);
1318: RangeToken t2 = this .parseCharacterClass(false);
1319: if (type == T_PLUS)
1320: tok.mergeRanges(t2);
1321: else if (ch == '-')
1322: tok.subtractRanges(t2);
1323: else if (ch == '&')
1324: tok.intersectRanges(t2);
1325: else
1326: throw new RuntimeException("ASSERT");
1327: } else {
1328: throw ex("parser.ope.2", this .offset - 1);
1329: }
1330: }
1331: this .next();
1332: return tok;
1333: }
1334:
1335: Token getTokenForShorthand(int ch) {
1336: Token tok;
1337: switch (ch) {
1338: case 'd':
1339: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1340: .getRange("Nd", true)
1341: : Token.token_0to9;
1342: break;
1343: case 'D':
1344: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1345: .getRange("Nd", false)
1346: : Token.token_not_0to9;
1347: break;
1348: case 'w':
1349: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1350: .getRange("IsWord", true)
1351: : Token.token_wordchars;
1352: break;
1353: case 'W':
1354: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1355: .getRange("IsWord", false)
1356: : Token.token_not_wordchars;
1357: break;
1358: case 's':
1359: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1360: .getRange("IsSpace", true)
1361: : Token.token_spaces;
1362: break;
1363: case 'S':
1364: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1365: .getRange("IsSpace", false)
1366: : Token.token_not_spaces;
1367: break;
1368:
1369: default:
1370: throw new RuntimeException(
1371: "Internal Error: shorthands: \\u"
1372: + Integer.toString(ch, 16));
1373: }
1374: return tok;
1375: }
1376:
1377: /**
1378: */
1379: int decodeEscaped() throws ParseException {
1380: if (this .read() != T_BACKSOLIDUS)
1381: throw ex("parser.next.1", this .offset - 1);
1382: int c = this .chardata;
1383: switch (c) {
1384: case 'e':
1385: c = 0x1b;
1386: break; // ESCAPE U+001B
1387: case 'f':
1388: c = '\f';
1389: break; // FORM FEED U+000C
1390: case 'n':
1391: c = '\n';
1392: break; // LINE FEED U+000A
1393: case 'r':
1394: c = '\r';
1395: break; // CRRIAGE RETURN U+000D
1396: case 't':
1397: c = '\t';
1398: break; // HORIZONTAL TABULATION U+0009
1399: //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
1400: case 'x':
1401: this .next();
1402: if (this .read() != T_CHAR)
1403: throw ex("parser.descape.1", this .offset - 1);
1404: if (this .chardata == '{') {
1405: int v1 = 0;
1406: int uv = 0;
1407: do {
1408: this .next();
1409: if (this .read() != T_CHAR)
1410: throw ex("parser.descape.1", this .offset - 1);
1411: if ((v1 = hexChar(this .chardata)) < 0)
1412: break;
1413: if (uv > uv * 16)
1414: throw ex("parser.descape.2", this .offset - 1);
1415: uv = uv * 16 + v1;
1416: } while (true);
1417: if (this .chardata != '}')
1418: throw ex("parser.descape.3", this .offset - 1);
1419: if (uv > Token.UTF16_MAX)
1420: throw ex("parser.descape.4", this .offset - 1);
1421: c = uv;
1422: } else {
1423: int v1 = 0;
1424: if (this .read() != T_CHAR
1425: || (v1 = hexChar(this .chardata)) < 0)
1426: throw ex("parser.descape.1", this .offset - 1);
1427: int uv = v1;
1428: this .next();
1429: if (this .read() != T_CHAR
1430: || (v1 = hexChar(this .chardata)) < 0)
1431: throw ex("parser.descape.1", this .offset - 1);
1432: uv = uv * 16 + v1;
1433: c = uv;
1434: }
1435: break;
1436:
1437: case 'u':
1438: int v1 = 0;
1439: this .next();
1440: if (this .read() != T_CHAR
1441: || (v1 = hexChar(this .chardata)) < 0)
1442: throw ex("parser.descape.1", this .offset - 1);
1443: int uv = v1;
1444: this .next();
1445: if (this .read() != T_CHAR
1446: || (v1 = hexChar(this .chardata)) < 0)
1447: throw ex("parser.descape.1", this .offset - 1);
1448: uv = uv * 16 + v1;
1449: this .next();
1450: if (this .read() != T_CHAR
1451: || (v1 = hexChar(this .chardata)) < 0)
1452: throw ex("parser.descape.1", this .offset - 1);
1453: uv = uv * 16 + v1;
1454: this .next();
1455: if (this .read() != T_CHAR
1456: || (v1 = hexChar(this .chardata)) < 0)
1457: throw ex("parser.descape.1", this .offset - 1);
1458: uv = uv * 16 + v1;
1459: c = uv;
1460: break;
1461:
1462: case 'v':
1463: this .next();
1464: if (this .read() != T_CHAR
1465: || (v1 = hexChar(this .chardata)) < 0)
1466: throw ex("parser.descape.1", this .offset - 1);
1467: uv = v1;
1468: this .next();
1469: if (this .read() != T_CHAR
1470: || (v1 = hexChar(this .chardata)) < 0)
1471: throw ex("parser.descape.1", this .offset - 1);
1472: uv = uv * 16 + v1;
1473: this .next();
1474: if (this .read() != T_CHAR
1475: || (v1 = hexChar(this .chardata)) < 0)
1476: throw ex("parser.descape.1", this .offset - 1);
1477: uv = uv * 16 + v1;
1478: this .next();
1479: if (this .read() != T_CHAR
1480: || (v1 = hexChar(this .chardata)) < 0)
1481: throw ex("parser.descape.1", this .offset - 1);
1482: uv = uv * 16 + v1;
1483: this .next();
1484: if (this .read() != T_CHAR
1485: || (v1 = hexChar(this .chardata)) < 0)
1486: throw ex("parser.descape.1", this .offset - 1);
1487: uv = uv * 16 + v1;
1488: this .next();
1489: if (this .read() != T_CHAR
1490: || (v1 = hexChar(this .chardata)) < 0)
1491: throw ex("parser.descape.1", this .offset - 1);
1492: uv = uv * 16 + v1;
1493: if (uv > Token.UTF16_MAX)
1494: throw ex("parser.descappe.4", this .offset - 1);
1495: c = uv;
1496: break;
1497: case 'A':
1498: case 'Z':
1499: case 'z':
1500: throw ex("parser.descape.5", this .offset - 2);
1501: default:
1502: }
1503: return c;
1504: }
1505:
1506: static private final int hexChar(int ch) {
1507: if (ch < '0')
1508: return -1;
1509: if (ch > 'f')
1510: return -1;
1511: if (ch <= '9')
1512: return ch - '0';
1513: if (ch < 'A')
1514: return -1;
1515: if (ch <= 'F')
1516: return ch - 'A' + 10;
1517: if (ch < 'a')
1518: return -1;
1519: return ch - 'a' + 10;
1520: }
1521: }
|