0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.xerces.impl.xpath.regex;
0019:
0020: import java.util.Locale;
0021: import java.util.MissingResourceException;
0022: import java.util.ResourceBundle;
0023: import java.util.Vector;
0024:
0025: /**
0026: * A Regular Expression Parser.
0027: *
0028: * @xerces.internal
0029: *
0030: * @version $Id: RegexParser.java 469061 2006-10-30 04:16:15Z mrglavas $
0031: */
0032: class RegexParser {
0033: static final int T_CHAR = 0;
0034: static final int T_EOF = 1;
0035: static final int T_OR = 2; // '|'
0036: static final int T_STAR = 3; // '*'
0037: static final int T_PLUS = 4; // '+'
0038: static final int T_QUESTION = 5; // '?'
0039: static final int T_LPAREN = 6; // '('
0040: static final int T_RPAREN = 7; // ')'
0041: static final int T_DOT = 8; // '.'
0042: static final int T_LBRACKET = 9; // '['
0043: static final int T_BACKSOLIDUS = 10; // '\'
0044: static final int T_CARET = 11; // '^'
0045: static final int T_DOLLAR = 12; // '$'
0046: static final int T_LPAREN2 = 13; // '(?:'
0047: static final int T_LOOKAHEAD = 14; // '(?='
0048: static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
0049: static final int T_LOOKBEHIND = 16; // '(?<='
0050: static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
0051: static final int T_INDEPENDENT = 18; // '(?>'
0052: static final int T_SET_OPERATIONS = 19; // '(?['
0053: static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
0054: static final int T_COMMENT = 21; // '(?#'
0055: static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
0056: static final int T_CONDITION = 23; // '(?('
0057: static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
0058:
0059: static class ReferencePosition {
0060: int refNumber;
0061: int position;
0062:
0063: ReferencePosition(int n, int pos) {
0064: this .refNumber = n;
0065: this .position = pos;
0066: }
0067: }
0068:
0069: int offset;
0070: String regex;
0071: int regexlen;
0072: int options;
0073: ResourceBundle resources;
0074: int chardata;
0075: int nexttoken;
0076: static protected final int S_NORMAL = 0;
0077: static protected final int S_INBRACKETS = 1;
0078: static protected final int S_INXBRACKETS = 2;
0079: int context = S_NORMAL;
0080: int parennumber = 1;
0081: boolean hasBackReferences;
0082: Vector references = null;
0083:
0084: public RegexParser() {
0085: this .setLocale(Locale.getDefault());
0086: }
0087:
/**
 * Creates a parser whose messages are loaded for the given locale.
 *
 * @param locale locale passed on to {@link #setLocale(Locale)}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
0091:
0092: public void setLocale(Locale locale) {
0093: try {
0094: this .resources = ResourceBundle.getBundle(
0095: "org.apache.xerces.impl.xpath.regex.message",
0096: locale);
0097: } catch (MissingResourceException mre) {
0098: throw new RuntimeException(
0099: "Installation Problem??? Couldn't load messages: "
0100: + mre.getMessage());
0101: }
0102: }
0103:
0104: final ParseException ex(String key, int loc) {
0105: return new ParseException(this .resources.getString(key), loc);
0106: }
0107:
0108: private final boolean isSet(int flag) {
0109: return (this .options & flag) == flag;
0110: }
0111:
0112: synchronized Token parse(String regex, int options)
0113: throws ParseException {
0114: this .options = options;
0115: this .offset = 0;
0116: this .setContext(S_NORMAL);
0117: this .parennumber = 1;
0118: this .hasBackReferences = false;
0119: this .regex = regex;
0120: if (this .isSet(RegularExpression.EXTENDED_COMMENT))
0121: this .regex = REUtil.stripExtendedComment(this .regex);
0122: this .regexlen = this .regex.length();
0123:
0124: this .next();
0125: Token ret = this .parseRegex();
0126: if (this .offset != this .regexlen)
0127: throw ex("parser.parse.1", this .offset);
0128: if (this .references != null) {
0129: for (int i = 0; i < this .references.size(); i++) {
0130: ReferencePosition position = (ReferencePosition) this .references
0131: .elementAt(i);
0132: if (this .parennumber <= position.refNumber)
0133: throw ex("parser.parse.2", position.position);
0134: }
0135: this .references.removeAllElements();
0136: }
0137: return ret;
0138: }
0139:
0140: /*
0141: public RegularExpression createRegex(String regex, int options) throws ParseException {
0142: Token tok = this.parse(regex, options);
0143: return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
0144: }
0145: */
0146:
0147: protected final void setContext(int con) {
0148: this .context = con;
0149: }
0150:
0151: final int read() {
0152: return this .nexttoken;
0153: }
0154:
0155: final void next() {
0156: if (this .offset >= this .regexlen) {
0157: this .chardata = -1;
0158: this .nexttoken = T_EOF;
0159: return;
0160: }
0161:
0162: int ret;
0163: int ch = this .regex.charAt(this .offset++);
0164: this .chardata = ch;
0165:
0166: if (this .context == S_INBRACKETS) {
0167: // In a character class, this.chardata has one character, that is to say,
0168: // a pair of surrogates is composed and stored to this.chardata.
0169: switch (ch) {
0170: case '\\':
0171: ret = T_BACKSOLIDUS;
0172: if (this .offset >= this .regexlen)
0173: throw ex("parser.next.1", this .offset - 1);
0174: this .chardata = this .regex.charAt(this .offset++);
0175: break;
0176:
0177: case '-':
0178: if (this .isSet(RegularExpression.XMLSCHEMA_MODE)
0179: && this .offset < this .regexlen
0180: && this .regex.charAt(this .offset) == '[') {
0181: this .offset++;
0182: ret = T_XMLSCHEMA_CC_SUBTRACTION;
0183: } else
0184: ret = T_CHAR;
0185: break;
0186:
0187: case '[':
0188: if (!this .isSet(RegularExpression.XMLSCHEMA_MODE)
0189: && this .offset < this .regexlen
0190: && this .regex.charAt(this .offset) == ':') {
0191: this .offset++;
0192: ret = T_POSIX_CHARCLASS_START;
0193: break;
0194: } // Through down
0195: default:
0196: if (REUtil.isHighSurrogate(ch)
0197: && this .offset < this .regexlen) {
0198: int low = this .regex.charAt(this .offset);
0199: if (REUtil.isLowSurrogate(low)) {
0200: this .chardata = REUtil.composeFromSurrogates(
0201: ch, low);
0202: this .offset++;
0203: }
0204: }
0205: ret = T_CHAR;
0206: }
0207: this .nexttoken = ret;
0208: return;
0209: }
0210:
0211: switch (ch) {
0212: case '|':
0213: ret = T_OR;
0214: break;
0215: case '*':
0216: ret = T_STAR;
0217: break;
0218: case '+':
0219: ret = T_PLUS;
0220: break;
0221: case '?':
0222: ret = T_QUESTION;
0223: break;
0224: case ')':
0225: ret = T_RPAREN;
0226: break;
0227: case '.':
0228: ret = T_DOT;
0229: break;
0230: case '[':
0231: ret = T_LBRACKET;
0232: break;
0233: case '^':
0234: if (this .isSet(RegularExpression.XMLSCHEMA_MODE)) {
0235: ret = T_CHAR;
0236: } else {
0237: ret = T_CARET;
0238: }
0239: break;
0240: case '$':
0241: if (this .isSet(RegularExpression.XMLSCHEMA_MODE)) {
0242: ret = T_CHAR;
0243: } else {
0244: ret = T_DOLLAR;
0245: }
0246: break;
0247: case '(':
0248: ret = T_LPAREN;
0249: if (this .offset >= this .regexlen)
0250: break;
0251: if (this .regex.charAt(this .offset) != '?')
0252: break;
0253: if (++this .offset >= this .regexlen)
0254: throw ex("parser.next.2", this .offset - 1);
0255: ch = this .regex.charAt(this .offset++);
0256: switch (ch) {
0257: case ':':
0258: ret = T_LPAREN2;
0259: break;
0260: case '=':
0261: ret = T_LOOKAHEAD;
0262: break;
0263: case '!':
0264: ret = T_NEGATIVELOOKAHEAD;
0265: break;
0266: case '[':
0267: ret = T_SET_OPERATIONS;
0268: break;
0269: case '>':
0270: ret = T_INDEPENDENT;
0271: break;
0272: case '<':
0273: if (this .offset >= this .regexlen)
0274: throw ex("parser.next.2", this .offset - 3);
0275: ch = this .regex.charAt(this .offset++);
0276: if (ch == '=') {
0277: ret = T_LOOKBEHIND;
0278: } else if (ch == '!') {
0279: ret = T_NEGATIVELOOKBEHIND;
0280: } else
0281: throw ex("parser.next.3", this .offset - 3);
0282: break;
0283: case '#':
0284: while (this .offset < this .regexlen) {
0285: ch = this .regex.charAt(this .offset++);
0286: if (ch == ')')
0287: break;
0288: }
0289: if (ch != ')')
0290: throw ex("parser.next.4", this .offset - 1);
0291: ret = T_COMMENT;
0292: break;
0293: default:
0294: if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch
0295: && ch <= 'Z') {// Options
0296: this .offset--;
0297: ret = T_MODIFIERS;
0298: break;
0299: } else if (ch == '(') { // conditional
0300: ret = T_CONDITION; // this.offsets points the next of '('.
0301: break;
0302: }
0303: throw ex("parser.next.2", this .offset - 2);
0304: }
0305: break;
0306:
0307: case '\\':
0308: ret = T_BACKSOLIDUS;
0309: if (this .offset >= this .regexlen)
0310: throw ex("parser.next.1", this .offset - 1);
0311: this .chardata = this .regex.charAt(this .offset++);
0312: break;
0313:
0314: default:
0315: ret = T_CHAR;
0316: }
0317: this .nexttoken = ret;
0318: }
0319:
0320: /**
0321: * regex ::= term (`|` term)*
0322: * term ::= factor+
0323: * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0324: * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
0325: * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
0326: * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0327: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
0328: */
0329: Token parseRegex() throws ParseException {
0330: Token tok = this .parseTerm();
0331: Token parent = null;
0332: while (this .read() == T_OR) {
0333: this .next(); // '|'
0334: if (parent == null) {
0335: parent = Token.createUnion();
0336: parent.addChild(tok);
0337: tok = parent;
0338: }
0339: tok.addChild(this .parseTerm());
0340: }
0341: return tok;
0342: }
0343:
0344: /**
0345: * term ::= factor+
0346: */
0347: Token parseTerm() throws ParseException {
0348: int ch = this .read();
0349: if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
0350: return Token.createEmpty();
0351: } else {
0352: Token tok = this .parseFactor();
0353: Token concat = null;
0354: while ((ch = this .read()) != T_OR && ch != T_RPAREN
0355: && ch != T_EOF) {
0356: if (concat == null) {
0357: concat = Token.createConcat();
0358: concat.addChild(tok);
0359: tok = concat;
0360: }
0361: concat.addChild(this .parseFactor());
0362: //tok = Token.createConcat(tok, this.parseFactor());
0363: }
0364: return tok;
0365: }
0366: }
0367:
0368: // ----------------------------------------------------------------
0369:
0370: Token processCaret() throws ParseException {
0371: this .next();
0372: return Token.token_linebeginning;
0373: }
0374:
0375: Token processDollar() throws ParseException {
0376: this .next();
0377: return Token.token_lineend;
0378: }
0379:
0380: Token processLookahead() throws ParseException {
0381: this .next();
0382: Token tok = Token
0383: .createLook(Token.LOOKAHEAD, this .parseRegex());
0384: if (this .read() != T_RPAREN)
0385: throw ex("parser.factor.1", this .offset - 1);
0386: this .next(); // ')'
0387: return tok;
0388: }
0389:
0390: Token processNegativelookahead() throws ParseException {
0391: this .next();
0392: Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this
0393: .parseRegex());
0394: if (this .read() != T_RPAREN)
0395: throw ex("parser.factor.1", this .offset - 1);
0396: this .next(); // ')'
0397: return tok;
0398: }
0399:
0400: Token processLookbehind() throws ParseException {
0401: this .next();
0402: Token tok = Token.createLook(Token.LOOKBEHIND, this
0403: .parseRegex());
0404: if (this .read() != T_RPAREN)
0405: throw ex("parser.factor.1", this .offset - 1);
0406: this .next(); // ')'
0407: return tok;
0408: }
0409:
0410: Token processNegativelookbehind() throws ParseException {
0411: this .next();
0412: Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this
0413: .parseRegex());
0414: if (this .read() != T_RPAREN)
0415: throw ex("parser.factor.1", this .offset - 1);
0416: this .next(); // ')'
0417: return tok;
0418: }
0419:
0420: Token processBacksolidus_A() throws ParseException {
0421: this .next();
0422: return Token.token_stringbeginning;
0423: }
0424:
0425: Token processBacksolidus_Z() throws ParseException {
0426: this .next();
0427: return Token.token_stringend2;
0428: }
0429:
0430: Token processBacksolidus_z() throws ParseException {
0431: this .next();
0432: return Token.token_stringend;
0433: }
0434:
0435: Token processBacksolidus_b() throws ParseException {
0436: this .next();
0437: return Token.token_wordedge;
0438: }
0439:
0440: Token processBacksolidus_B() throws ParseException {
0441: this .next();
0442: return Token.token_not_wordedge;
0443: }
0444:
0445: Token processBacksolidus_lt() throws ParseException {
0446: this .next();
0447: return Token.token_wordbeginning;
0448: }
0449:
0450: Token processBacksolidus_gt() throws ParseException {
0451: this .next();
0452: return Token.token_wordend;
0453: }
0454:
0455: Token processStar(Token tok) throws ParseException {
0456: this .next();
0457: if (this .read() == T_QUESTION) {
0458: this .next();
0459: return Token.createNGClosure(tok);
0460: } else
0461: return Token.createClosure(tok);
0462: }
0463:
0464: Token processPlus(Token tok) throws ParseException {
0465: // X+ -> XX*
0466: this .next();
0467: if (this .read() == T_QUESTION) {
0468: this .next();
0469: return Token.createConcat(tok, Token.createNGClosure(tok));
0470: } else
0471: return Token.createConcat(tok, Token.createClosure(tok));
0472: }
0473:
0474: Token processQuestion(Token tok) throws ParseException {
0475: // X? -> X|
0476: this .next();
0477: Token par = Token.createUnion();
0478: if (this .read() == T_QUESTION) {
0479: this .next();
0480: par.addChild(Token.createEmpty());
0481: par.addChild(tok);
0482: } else {
0483: par.addChild(tok);
0484: par.addChild(Token.createEmpty());
0485: }
0486: return par;
0487: }
0488:
0489: boolean checkQuestion(int off) {
0490: return off < this .regexlen && this .regex.charAt(off) == '?';
0491: }
0492:
0493: Token processParen() throws ParseException {
0494: this .next();
0495: int p = this .parennumber++;
0496: Token tok = Token.createParen(this .parseRegex(), p);
0497: if (this .read() != T_RPAREN)
0498: throw ex("parser.factor.1", this .offset - 1);
0499: this .next(); // Skips ')'
0500: return tok;
0501: }
0502:
0503: Token processParen2() throws ParseException {
0504: this .next();
0505: Token tok = Token.createParen(this .parseRegex(), 0);
0506: if (this .read() != T_RPAREN)
0507: throw ex("parser.factor.1", this .offset - 1);
0508: this .next(); // Skips ')'
0509: return tok;
0510: }
0511:
0512: Token processCondition() throws ParseException {
0513: // this.offset points the next of '('
0514: if (this .offset + 1 >= this .regexlen)
0515: throw ex("parser.factor.4", this .offset);
0516: // Parses a condition.
0517: int refno = -1;
0518: Token condition = null;
0519: int ch = this .regex.charAt(this .offset);
0520: if ('1' <= ch && ch <= '9') {
0521: refno = ch - '0';
0522: this .hasBackReferences = true;
0523: if (this .references == null)
0524: this .references = new Vector();
0525: this .references.addElement(new ReferencePosition(refno,
0526: this .offset));
0527: this .offset++;
0528: if (this .regex.charAt(this .offset) != ')')
0529: throw ex("parser.factor.1", this .offset);
0530: this .offset++;
0531: } else {
0532: if (ch == '?')
0533: this .offset--; // Points '('.
0534: this .next();
0535: condition = this .parseFactor();
0536: switch (condition.type) {
0537: case Token.LOOKAHEAD:
0538: case Token.NEGATIVELOOKAHEAD:
0539: case Token.LOOKBEHIND:
0540: case Token.NEGATIVELOOKBEHIND:
0541: break;
0542: case Token.ANCHOR:
0543: if (this .read() != T_RPAREN)
0544: throw ex("parser.factor.1", this .offset - 1);
0545: break;
0546: default:
0547: throw ex("parser.factor.5", this .offset);
0548: }
0549: }
0550: // Parses yes/no-patterns.
0551: this .next();
0552: Token yesPattern = this .parseRegex();
0553: Token noPattern = null;
0554: if (yesPattern.type == Token.UNION) {
0555: if (yesPattern.size() != 2)
0556: throw ex("parser.factor.6", this .offset);
0557: noPattern = yesPattern.getChild(1);
0558: yesPattern = yesPattern.getChild(0);
0559: }
0560: if (this .read() != T_RPAREN)
0561: throw ex("parser.factor.1", this .offset - 1);
0562: this .next();
0563: return Token.createCondition(refno, condition, yesPattern,
0564: noPattern);
0565: }
0566:
0567: Token processModifiers() throws ParseException {
0568: // this.offset points the next of '?'.
0569: // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
0570: int add = 0, mask = 0, ch = -1;
0571: while (this .offset < this .regexlen) {
0572: ch = this .regex.charAt(this .offset);
0573: int v = REUtil.getOptionValue(ch);
0574: if (v == 0)
0575: break; // '-' or ':'?
0576: add |= v;
0577: this .offset++;
0578: }
0579: if (this .offset >= this .regexlen)
0580: throw ex("parser.factor.2", this .offset - 1);
0581: if (ch == '-') {
0582: this .offset++;
0583: while (this .offset < this .regexlen) {
0584: ch = this .regex.charAt(this .offset);
0585: int v = REUtil.getOptionValue(ch);
0586: if (v == 0)
0587: break; // ':'?
0588: mask |= v;
0589: this .offset++;
0590: }
0591: if (this .offset >= this .regexlen)
0592: throw ex("parser.factor.2", this .offset - 1);
0593: }
0594: Token tok;
0595: if (ch == ':') {
0596: this .offset++;
0597: this .next();
0598: tok = Token.createModifierGroup(this .parseRegex(), add,
0599: mask);
0600: if (this .read() != T_RPAREN)
0601: throw ex("parser.factor.1", this .offset - 1);
0602: this .next();
0603: } else if (ch == ')') { // such as (?-i)
0604: this .offset++;
0605: this .next();
0606: tok = Token.createModifierGroup(this .parseRegex(), add,
0607: mask);
0608: } else
0609: throw ex("parser.factor.3", this .offset);
0610:
0611: return tok;
0612: }
0613:
0614: Token processIndependent() throws ParseException {
0615: this .next();
0616: Token tok = Token.createLook(Token.INDEPENDENT, this
0617: .parseRegex());
0618: if (this .read() != T_RPAREN)
0619: throw ex("parser.factor.1", this .offset - 1);
0620: this .next(); // Skips ')'
0621: return tok;
0622: }
0623:
0624: Token processBacksolidus_c() throws ParseException {
0625: int ch2; // Must be in 0x0040-0x005f
0626: if (this .offset >= this .regexlen
0627: || ((ch2 = this .regex.charAt(this .offset++)) & 0xffe0) != 0x0040)
0628: throw ex("parser.atom.1", this .offset - 1);
0629: this .next();
0630: return Token.createChar(ch2 - 0x40);
0631: }
0632:
0633: Token processBacksolidus_C() throws ParseException {
0634: throw ex("parser.process.1", this .offset);
0635: }
0636:
0637: Token processBacksolidus_i() throws ParseException {
0638: Token tok = Token.createChar('i');
0639: this .next();
0640: return tok;
0641: }
0642:
0643: Token processBacksolidus_I() throws ParseException {
0644: throw ex("parser.process.1", this .offset);
0645: }
0646:
0647: Token processBacksolidus_g() throws ParseException {
0648: this .next();
0649: return Token.getGraphemePattern();
0650: }
0651:
0652: Token processBacksolidus_X() throws ParseException {
0653: this .next();
0654: return Token.getCombiningCharacterSequence();
0655: }
0656:
0657: Token processBackreference() throws ParseException {
0658: int refnum = this .chardata - '0';
0659: Token tok = Token.createBackReference(refnum);
0660: this .hasBackReferences = true;
0661: if (this .references == null)
0662: this .references = new Vector();
0663: this .references.addElement(new ReferencePosition(refnum,
0664: this .offset - 2));
0665: this .next();
0666: return tok;
0667: }
0668:
0669: // ----------------------------------------------------------------
0670:
0671: /**
0672: * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0673: * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
0674: * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
0675: * | '(?#' [^)]* ')'
0676: * minmax ::= '{' min (',' max?)? '}'
0677: * min ::= [0-9]+
0678: * max ::= [0-9]+
0679: */
0680: Token parseFactor() throws ParseException {
0681: int ch = this .read();
0682: Token tok;
0683: switch (ch) {
0684: case T_CARET:
0685: return this .processCaret();
0686: case T_DOLLAR:
0687: return this .processDollar();
0688: case T_LOOKAHEAD:
0689: return this .processLookahead();
0690: case T_NEGATIVELOOKAHEAD:
0691: return this .processNegativelookahead();
0692: case T_LOOKBEHIND:
0693: return this .processLookbehind();
0694: case T_NEGATIVELOOKBEHIND:
0695: return this .processNegativelookbehind();
0696:
0697: case T_COMMENT:
0698: this .next();
0699: return Token.createEmpty();
0700:
0701: case T_BACKSOLIDUS:
0702: switch (this .chardata) {
0703: case 'A':
0704: return this .processBacksolidus_A();
0705: case 'Z':
0706: return this .processBacksolidus_Z();
0707: case 'z':
0708: return this .processBacksolidus_z();
0709: case 'b':
0710: return this .processBacksolidus_b();
0711: case 'B':
0712: return this .processBacksolidus_B();
0713: case '<':
0714: return this .processBacksolidus_lt();
0715: case '>':
0716: return this .processBacksolidus_gt();
0717: }
0718: // through down
0719: }
0720: tok = this .parseAtom();
0721: ch = this .read();
0722: switch (ch) {
0723: case T_STAR:
0724: return this .processStar(tok);
0725: case T_PLUS:
0726: return this .processPlus(tok);
0727: case T_QUESTION:
0728: return this .processQuestion(tok);
0729: case T_CHAR:
0730: if (this .chardata == '{' && this .offset < this .regexlen) {
0731:
0732: int off = this .offset; // this.offset -> next of '{'
0733: int min = 0, max = -1;
0734:
0735: if ((ch = this .regex.charAt(off++)) >= '0' && ch <= '9') {
0736:
0737: min = ch - '0';
0738: while (off < this .regexlen
0739: && (ch = this .regex.charAt(off++)) >= '0'
0740: && ch <= '9') {
0741: min = min * 10 + ch - '0';
0742: if (min < 0)
0743: throw ex("parser.quantifier.5", this .offset);
0744: }
0745: } else {
0746: throw ex("parser.quantifier.1", this .offset);
0747: }
0748:
0749: max = min;
0750: if (ch == ',') {
0751:
0752: if (off >= this .regexlen) {
0753: throw ex("parser.quantifier.3", this .offset);
0754: } else if ((ch = this .regex.charAt(off++)) >= '0'
0755: && ch <= '9') {
0756:
0757: max = ch - '0'; // {min,max}
0758: while (off < this .regexlen
0759: && (ch = this .regex.charAt(off++)) >= '0'
0760: && ch <= '9') {
0761: max = max * 10 + ch - '0';
0762: if (max < 0)
0763: throw ex("parser.quantifier.5",
0764: this .offset);
0765: }
0766:
0767: if (min > max)
0768: throw ex("parser.quantifier.4", this .offset);
0769: } else { // assume {min,}
0770: max = -1;
0771: }
0772: }
0773:
0774: if (ch != '}')
0775: throw ex("parser.quantifier.2", this .offset);
0776:
0777: if (this .checkQuestion(off)) { // off -> next of '}'
0778: tok = Token.createNGClosure(tok);
0779: this .offset = off + 1;
0780: } else {
0781: tok = Token.createClosure(tok);
0782: this .offset = off;
0783: }
0784:
0785: tok.setMin(min);
0786: tok.setMax(max);
0787: //System.err.println("CLOSURE: "+min+", "+max);
0788: this .next();
0789: }
0790: }
0791: return tok;
0792: }
0793:
0794: /**
0795: * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0796: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
0797: * | '(?>' regex ')'
0798: * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
0799: */
0800: Token parseAtom() throws ParseException {
0801: int ch = this .read();
0802: Token tok = null;
0803: switch (ch) {
0804: case T_LPAREN:
0805: return this .processParen();
0806: case T_LPAREN2:
0807: return this .processParen2(); // '(?:'
0808: case T_CONDITION:
0809: return this .processCondition(); // '(?('
0810: case T_MODIFIERS:
0811: return this .processModifiers(); // (?modifiers ... )
0812: case T_INDEPENDENT:
0813: return this .processIndependent();
0814: case T_DOT:
0815: this .next(); // Skips '.'
0816: tok = Token.token_dot;
0817: break;
0818:
0819: /**
0820: * char-class ::= '[' ( '^'? range ','?)+ ']'
0821: * range ::= '\d' | '\w' | '\s' | category-block | range-char
0822: * | range-char '-' range-char
0823: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
0824: * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
0825: */
0826: case T_LBRACKET:
0827: return this .parseCharacterClass(true);
0828: case T_SET_OPERATIONS:
0829: return this .parseSetOperations();
0830:
0831: case T_BACKSOLIDUS:
0832: switch (this .chardata) {
0833: case 'd':
0834: case 'D':
0835: case 'w':
0836: case 'W':
0837: case 's':
0838: case 'S':
0839: tok = this .getTokenForShorthand(this .chardata);
0840: this .next();
0841: return tok;
0842:
0843: case 'e':
0844: case 'f':
0845: case 'n':
0846: case 'r':
0847: case 't':
0848: case 'u':
0849: case 'v':
0850: case 'x': {
0851: int ch2 = this .decodeEscaped();
0852: if (ch2 < 0x10000) {
0853: tok = Token.createChar(ch2);
0854: } else {
0855: tok = Token.createString(REUtil
0856: .decomposeToSurrogates(ch2));
0857: }
0858: }
0859: break;
0860:
0861: case 'c':
0862: return this .processBacksolidus_c();
0863: case 'C':
0864: return this .processBacksolidus_C();
0865: case 'i':
0866: return this .processBacksolidus_i();
0867: case 'I':
0868: return this .processBacksolidus_I();
0869: case 'g':
0870: return this .processBacksolidus_g();
0871: case 'X':
0872: return this .processBacksolidus_X();
0873: case '1':
0874: case '2':
0875: case '3':
0876: case '4':
0877: case '5':
0878: case '6':
0879: case '7':
0880: case '8':
0881: case '9':
0882: return this .processBackreference();
0883:
0884: case 'P':
0885: case 'p':
0886: int pstart = this .offset;
0887: tok = processBacksolidus_pP(this .chardata);
0888: if (tok == null)
0889: throw this .ex("parser.atom.5", pstart);
0890: break;
0891:
0892: default:
0893: tok = Token.createChar(this .chardata);
0894: }
0895: this .next();
0896: break;
0897:
0898: case T_CHAR:
0899: if (this .chardata == ']' || this .chardata == '{'
0900: || this .chardata == '}')
0901: throw this .ex("parser.atom.4", this .offset - 1);
0902: tok = Token.createChar(this .chardata);
0903: int high = this .chardata;
0904: this .next();
0905: if (REUtil.isHighSurrogate(high) && this .read() == T_CHAR
0906: && REUtil.isLowSurrogate(this .chardata)) {
0907: char[] sur = new char[2];
0908: sur[0] = (char) high;
0909: sur[1] = (char) this .chardata;
0910: tok = Token.createParen(Token.createString(new String(
0911: sur)), 0);
0912: this .next();
0913: }
0914: break;
0915:
0916: default:
0917: throw this .ex("parser.atom.4", this .offset - 1);
0918: }
0919: return tok;
0920: }
0921:
0922: protected RangeToken processBacksolidus_pP(int c)
0923: throws ParseException {
0924:
0925: this .next();
0926: if (this .read() != T_CHAR || this .chardata != '{')
0927: throw this .ex("parser.atom.2", this .offset - 1);
0928:
0929: // handle category escape
0930: boolean positive = c == 'p';
0931: int namestart = this .offset;
0932: int nameend = this .regex.indexOf('}', namestart);
0933:
0934: if (nameend < 0)
0935: throw this .ex("parser.atom.3", this .offset);
0936:
0937: String pname = this .regex.substring(namestart, nameend);
0938: this .offset = nameend + 1;
0939:
0940: return Token.getRange(pname, positive, this
0941: .isSet(RegularExpression.XMLSCHEMA_MODE));
0942: }
0943:
0944: int processCIinCharacterClass(RangeToken tok, int c) {
0945: return this .decodeEscaped();
0946: }
0947:
0948: /**
0949: * char-class ::= '[' ( '^'? range ','?)+ ']'
0950: * range ::= '\d' | '\w' | '\s' | category-block | range-char
0951: * | range-char '-' range-char
0952: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
0953: * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
0954: */
0955: protected RangeToken parseCharacterClass(boolean useNrange)
0956: throws ParseException {
0957: this .setContext(S_INBRACKETS);
0958: this .next(); // '['
0959: boolean nrange = false;
0960: RangeToken base = null;
0961: RangeToken tok;
0962: if (this .read() == T_CHAR && this .chardata == '^') {
0963: nrange = true;
0964: this .next(); // '^'
0965: if (useNrange) {
0966: tok = Token.createNRange();
0967: } else {
0968: base = Token.createRange();
0969: base.addRange(0, Token.UTF16_MAX);
0970: tok = Token.createRange();
0971: }
0972: } else {
0973: tok = Token.createRange();
0974: }
0975: int type;
0976: boolean firstloop = true;
0977: while ((type = this .read()) != T_EOF) {
0978: if (type == T_CHAR && this .chardata == ']' && !firstloop)
0979: break;
0980: firstloop = false;
0981: int c = this .chardata;
0982: boolean end = false;
0983: if (type == T_BACKSOLIDUS) {
0984: switch (c) {
0985: case 'd':
0986: case 'D':
0987: case 'w':
0988: case 'W':
0989: case 's':
0990: case 'S':
0991: tok.mergeRanges(this .getTokenForShorthand(c));
0992: end = true;
0993: break;
0994:
0995: case 'i':
0996: case 'I':
0997: case 'c':
0998: case 'C':
0999: c = this .processCIinCharacterClass(tok, c);
1000: if (c < 0)
1001: end = true;
1002: break;
1003:
1004: case 'p':
1005: case 'P':
1006: int pstart = this .offset;
1007: RangeToken tok2 = this .processBacksolidus_pP(c);
1008: if (tok2 == null)
1009: throw this .ex("parser.atom.5", pstart);
1010: tok.mergeRanges(tok2);
1011: end = true;
1012: break;
1013:
1014: default:
1015: c = this .decodeEscaped();
1016: } // \ + c
1017: } // backsolidus
1018: // POSIX Character class such as [:alnum:]
1019: else if (type == T_POSIX_CHARCLASS_START) {
1020: int nameend = this .regex.indexOf(':', this .offset);
1021: if (nameend < 0)
1022: throw this .ex("parser.cc.1", this .offset);
1023: boolean positive = true;
1024: if (this .regex.charAt(this .offset) == '^') {
1025: this .offset++;
1026: positive = false;
1027: }
1028: String name = this .regex
1029: .substring(this .offset, nameend);
1030: RangeToken range = Token.getRange(name, positive, this
1031: .isSet(RegularExpression.XMLSCHEMA_MODE));
1032: if (range == null)
1033: throw this .ex("parser.cc.3", this .offset);
1034: tok.mergeRanges(range);
1035: end = true;
1036: if (nameend + 1 >= this .regexlen
1037: || this .regex.charAt(nameend + 1) != ']')
1038: throw this .ex("parser.cc.1", nameend);
1039: this .offset = nameend + 2;
1040: }
1041: this .next();
1042: if (!end) { // if not shorthands...
1043: if (this .read() != T_CHAR || this .chardata != '-') { // Here is no '-'.
1044: tok.addRange(c, c);
1045: } else {
1046: this .next(); // Skips '-'
1047: if ((type = this .read()) == T_EOF)
1048: throw this .ex("parser.cc.2", this .offset);
1049: if (type == T_CHAR && this .chardata == ']') {
1050: tok.addRange(c, c);
1051: tok.addRange('-', '-');
1052: } else {
1053: int rangeend = this .chardata;
1054: if (type == T_BACKSOLIDUS)
1055: rangeend = this .decodeEscaped();
1056: this .next();
1057: tok.addRange(c, rangeend);
1058: }
1059: }
1060: }
1061: if (this .isSet(RegularExpression.SPECIAL_COMMA)
1062: && this .read() == T_CHAR && this .chardata == ',')
1063: this .next();
1064: }
1065: if (this .read() == T_EOF)
1066: throw this .ex("parser.cc.2", this .offset);
1067: if (!useNrange && nrange) {
1068: base.subtractRanges(tok);
1069: tok = base;
1070: }
1071: tok.sortRanges();
1072: tok.compactRanges();
1073: //tok.dumpRanges();
1074: /*
1075: if (this.isSet(RegularExpression.IGNORE_CASE))
1076: tok = RangeToken.createCaseInsensitiveToken(tok);
1077: */
1078: this .setContext(S_NORMAL);
1079: this .next(); // Skips ']'
1080:
1081: return tok;
1082: }
1083:
1084: /**
1085: * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1086: */
1087: protected RangeToken parseSetOperations() throws ParseException {
1088: RangeToken tok = this .parseCharacterClass(false);
1089: int type;
1090: while ((type = this .read()) != T_RPAREN) {
1091: int ch = this .chardata;
1092: if (type == T_CHAR && (ch == '-' || ch == '&')
1093: || type == T_PLUS) {
1094: this .next();
1095: if (this .read() != T_LBRACKET)
1096: throw ex("parser.ope.1", this .offset - 1);
1097: RangeToken t2 = this .parseCharacterClass(false);
1098: if (type == T_PLUS)
1099: tok.mergeRanges(t2);
1100: else if (ch == '-')
1101: tok.subtractRanges(t2);
1102: else if (ch == '&')
1103: tok.intersectRanges(t2);
1104: else
1105: throw new RuntimeException("ASSERT");
1106: } else {
1107: throw ex("parser.ope.2", this .offset - 1);
1108: }
1109: }
1110: this .next();
1111: return tok;
1112: }
1113:
1114: Token getTokenForShorthand(int ch) {
1115: Token tok;
1116: switch (ch) {
1117: case 'd':
1118: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1119: .getRange("Nd", true)
1120: : Token.token_0to9;
1121: break;
1122: case 'D':
1123: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1124: .getRange("Nd", false)
1125: : Token.token_not_0to9;
1126: break;
1127: case 'w':
1128: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1129: .getRange("IsWord", true)
1130: : Token.token_wordchars;
1131: break;
1132: case 'W':
1133: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1134: .getRange("IsWord", false)
1135: : Token.token_not_wordchars;
1136: break;
1137: case 's':
1138: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1139: .getRange("IsSpace", true)
1140: : Token.token_spaces;
1141: break;
1142: case 'S':
1143: tok = this .isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token
1144: .getRange("IsSpace", false)
1145: : Token.token_not_spaces;
1146: break;
1147:
1148: default:
1149: throw new RuntimeException(
1150: "Internal Error: shorthands: \\u"
1151: + Integer.toString(ch, 16));
1152: }
1153: return tok;
1154: }
1155:
1156: /**
1157: */
1158: int decodeEscaped() throws ParseException {
1159: if (this .read() != T_BACKSOLIDUS)
1160: throw ex("parser.next.1", this .offset - 1);
1161: int c = this .chardata;
1162: switch (c) {
1163: case 'e':
1164: c = 0x1b;
1165: break; // ESCAPE U+001B
1166: case 'f':
1167: c = '\f';
1168: break; // FORM FEED U+000C
1169: case 'n':
1170: c = '\n';
1171: break; // LINE FEED U+000A
1172: case 'r':
1173: c = '\r';
1174: break; // CRRIAGE RETURN U+000D
1175: case 't':
1176: c = '\t';
1177: break; // HORIZONTAL TABULATION U+0009
1178: //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
1179: case 'x':
1180: this .next();
1181: if (this .read() != T_CHAR)
1182: throw ex("parser.descape.1", this .offset - 1);
1183: if (this .chardata == '{') {
1184: int v1 = 0;
1185: int uv = 0;
1186: do {
1187: this .next();
1188: if (this .read() != T_CHAR)
1189: throw ex("parser.descape.1", this .offset - 1);
1190: if ((v1 = hexChar(this .chardata)) < 0)
1191: break;
1192: if (uv > uv * 16)
1193: throw ex("parser.descape.2", this .offset - 1);
1194: uv = uv * 16 + v1;
1195: } while (true);
1196: if (this .chardata != '}')
1197: throw ex("parser.descape.3", this .offset - 1);
1198: if (uv > Token.UTF16_MAX)
1199: throw ex("parser.descape.4", this .offset - 1);
1200: c = uv;
1201: } else {
1202: int v1 = 0;
1203: if (this .read() != T_CHAR
1204: || (v1 = hexChar(this .chardata)) < 0)
1205: throw ex("parser.descape.1", this .offset - 1);
1206: int uv = v1;
1207: this .next();
1208: if (this .read() != T_CHAR
1209: || (v1 = hexChar(this .chardata)) < 0)
1210: throw ex("parser.descape.1", this .offset - 1);
1211: uv = uv * 16 + v1;
1212: c = uv;
1213: }
1214: break;
1215:
1216: case 'u':
1217: int v1 = 0;
1218: this .next();
1219: if (this .read() != T_CHAR
1220: || (v1 = hexChar(this .chardata)) < 0)
1221: throw ex("parser.descape.1", this .offset - 1);
1222: int uv = v1;
1223: this .next();
1224: if (this .read() != T_CHAR
1225: || (v1 = hexChar(this .chardata)) < 0)
1226: throw ex("parser.descape.1", this .offset - 1);
1227: uv = uv * 16 + v1;
1228: this .next();
1229: if (this .read() != T_CHAR
1230: || (v1 = hexChar(this .chardata)) < 0)
1231: throw ex("parser.descape.1", this .offset - 1);
1232: uv = uv * 16 + v1;
1233: this .next();
1234: if (this .read() != T_CHAR
1235: || (v1 = hexChar(this .chardata)) < 0)
1236: throw ex("parser.descape.1", this .offset - 1);
1237: uv = uv * 16 + v1;
1238: c = uv;
1239: break;
1240:
1241: case 'v':
1242: this .next();
1243: if (this .read() != T_CHAR
1244: || (v1 = hexChar(this .chardata)) < 0)
1245: throw ex("parser.descape.1", this .offset - 1);
1246: uv = v1;
1247: this .next();
1248: if (this .read() != T_CHAR
1249: || (v1 = hexChar(this .chardata)) < 0)
1250: throw ex("parser.descape.1", this .offset - 1);
1251: uv = uv * 16 + v1;
1252: this .next();
1253: if (this .read() != T_CHAR
1254: || (v1 = hexChar(this .chardata)) < 0)
1255: throw ex("parser.descape.1", this .offset - 1);
1256: uv = uv * 16 + v1;
1257: this .next();
1258: if (this .read() != T_CHAR
1259: || (v1 = hexChar(this .chardata)) < 0)
1260: throw ex("parser.descape.1", this .offset - 1);
1261: uv = uv * 16 + v1;
1262: this .next();
1263: if (this .read() != T_CHAR
1264: || (v1 = hexChar(this .chardata)) < 0)
1265: throw ex("parser.descape.1", this .offset - 1);
1266: uv = uv * 16 + v1;
1267: this .next();
1268: if (this .read() != T_CHAR
1269: || (v1 = hexChar(this .chardata)) < 0)
1270: throw ex("parser.descape.1", this .offset - 1);
1271: uv = uv * 16 + v1;
1272: if (uv > Token.UTF16_MAX)
1273: throw ex("parser.descappe.4", this .offset - 1);
1274: c = uv;
1275: break;
1276: case 'A':
1277: case 'Z':
1278: case 'z':
1279: throw ex("parser.descape.5", this .offset - 2);
1280: default:
1281: }
1282: return c;
1283: }
1284:
1285: static private final int hexChar(int ch) {
1286: if (ch < '0')
1287: return -1;
1288: if (ch > 'f')
1289: return -1;
1290: if (ch <= '9')
1291: return ch - '0';
1292: if (ch < 'A')
1293: return -1;
1294: if (ch <= 'F')
1295: return ch - 'A' + 10;
1296: if (ch < 'a')
1297: return -1;
1298: return ch - 'a' + 10;
1299: }
1300: }
|