0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.xerces.impl.xpath.regex;
0019:
0020: import java.util.Vector;
0021: import java.util.Hashtable;
0022:
0023: /**
0024: * This class represents a node in parse tree.
0025: *
0026: * @xerces.internal
0027: *
0028: * @version $Id: Token.java 572108 2007-09-02 18:48:31Z mrglavas $
0029: */
0030: class Token implements java.io.Serializable {
0031:
0032: private static final long serialVersionUID = 8484976002585487481L;
0033:
0034: static final boolean COUNTTOKENS = true;
0035: static int tokens = 0;
0036:
0037: static final int CHAR = 0; // Literal char
0038: static final int DOT = 11; // .
0039: static final int CONCAT = 1; // XY
0040: static final int UNION = 2; // X|Y|Z
0041: static final int CLOSURE = 3; // X*
0042: static final int RANGE = 4; // [a-zA-Z] etc.
0043: static final int NRANGE = 5; // [^a-zA-Z] etc.
0044: static final int PAREN = 6; // (X) or (?:X)
0045: static final int EMPTY = 7; //
0046: static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
0047: static final int NONGREEDYCLOSURE = 9; // *? +?
0048: static final int STRING = 10; // strings
0049: static final int BACKREFERENCE = 12; // back references
0050: static final int LOOKAHEAD = 20; // (?=...)
0051: static final int NEGATIVELOOKAHEAD = 21; // (?!...)
0052: static final int LOOKBEHIND = 22; // (?<=...)
0053: static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
0054: static final int INDEPENDENT = 24; // (?>...)
0055: static final int MODIFIERGROUP = 25; // (?ims-ims:...)
0056: static final int CONDITION = 26; // (?(...)yes|no)
0057:
0058: static final int UTF16_MAX = 0x10ffff;
0059:
0060: final int type;
0061:
0062: static Token token_dot;
0063: static Token token_0to9;
0064: static Token token_wordchars;
0065: static Token token_not_0to9;
0066: static Token token_not_wordchars;
0067: static Token token_spaces;
0068: static Token token_not_spaces;
0069: static Token token_empty;
0070: static Token token_linebeginning;
0071: static Token token_linebeginning2;
0072: static Token token_lineend;
0073: static Token token_stringbeginning;
0074: static Token token_stringend;
0075: static Token token_stringend2;
0076: static Token token_wordedge;
0077: static Token token_not_wordedge;
0078: static Token token_wordbeginning;
0079: static Token token_wordend;
0080: static {
0081: Token.token_empty = new Token(Token.EMPTY);
0082:
0083: Token.token_linebeginning = Token.createAnchor('^');
0084: Token.token_linebeginning2 = Token.createAnchor('@');
0085: Token.token_lineend = Token.createAnchor('$');
0086: Token.token_stringbeginning = Token.createAnchor('A');
0087: Token.token_stringend = Token.createAnchor('z');
0088: Token.token_stringend2 = Token.createAnchor('Z');
0089: Token.token_wordedge = Token.createAnchor('b');
0090: Token.token_not_wordedge = Token.createAnchor('B');
0091: Token.token_wordbeginning = Token.createAnchor('<');
0092: Token.token_wordend = Token.createAnchor('>');
0093:
0094: Token.token_dot = new Token(Token.DOT);
0095:
0096: Token.token_0to9 = Token.createRange();
0097: Token.token_0to9.addRange('0', '9');
0098: Token.token_wordchars = Token.createRange();
0099: Token.token_wordchars.addRange('0', '9');
0100: Token.token_wordchars.addRange('A', 'Z');
0101: Token.token_wordchars.addRange('_', '_');
0102: Token.token_wordchars.addRange('a', 'z');
0103: Token.token_spaces = Token.createRange();
0104: Token.token_spaces.addRange('\t', '\t');
0105: Token.token_spaces.addRange('\n', '\n');
0106: Token.token_spaces.addRange('\f', '\f');
0107: Token.token_spaces.addRange('\r', '\r');
0108: Token.token_spaces.addRange(' ', ' ');
0109:
0110: Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
0111: Token.token_not_wordchars = Token
0112: .complementRanges(Token.token_wordchars);
0113: Token.token_not_spaces = Token
0114: .complementRanges(Token.token_spaces);
0115: }
0116:
0117: static Token.ParenToken createLook(int type, Token child) {
0118: if (COUNTTOKENS)
0119: Token.tokens++;
0120: return new Token.ParenToken(type, child, 0);
0121: }
0122:
0123: static Token.ParenToken createParen(Token child, int pnumber) {
0124: if (COUNTTOKENS)
0125: Token.tokens++;
0126: return new Token.ParenToken(Token.PAREN, child, pnumber);
0127: }
0128:
0129: static Token.ClosureToken createClosure(Token tok) {
0130: if (COUNTTOKENS)
0131: Token.tokens++;
0132: return new Token.ClosureToken(Token.CLOSURE, tok);
0133: }
0134:
0135: static Token.ClosureToken createNGClosure(Token tok) {
0136: if (COUNTTOKENS)
0137: Token.tokens++;
0138: return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
0139: }
0140:
0141: static Token.ConcatToken createConcat(Token tok1, Token tok2) {
0142: if (COUNTTOKENS)
0143: Token.tokens++;
0144: return new Token.ConcatToken(tok1, tok2);
0145: }
0146:
0147: static Token.UnionToken createConcat() {
0148: if (COUNTTOKENS)
0149: Token.tokens++;
0150: return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
0151: }
0152:
0153: static Token.UnionToken createUnion() {
0154: if (COUNTTOKENS)
0155: Token.tokens++;
0156: return new Token.UnionToken(Token.UNION);
0157: }
0158:
0159: static Token createEmpty() {
0160: return Token.token_empty;
0161: }
0162:
0163: static RangeToken createRange() {
0164: if (COUNTTOKENS)
0165: Token.tokens++;
0166: return new RangeToken(Token.RANGE);
0167: }
0168:
0169: static RangeToken createNRange() {
0170: if (COUNTTOKENS)
0171: Token.tokens++;
0172: return new RangeToken(Token.NRANGE);
0173: }
0174:
0175: static Token.CharToken createChar(int ch) {
0176: if (COUNTTOKENS)
0177: Token.tokens++;
0178: return new Token.CharToken(Token.CHAR, ch);
0179: }
0180:
0181: static private Token.CharToken createAnchor(int ch) {
0182: if (COUNTTOKENS)
0183: Token.tokens++;
0184: return new Token.CharToken(Token.ANCHOR, ch);
0185: }
0186:
0187: static Token.StringToken createBackReference(int refno) {
0188: if (COUNTTOKENS)
0189: Token.tokens++;
0190: return new Token.StringToken(Token.BACKREFERENCE, null, refno);
0191: }
0192:
0193: static Token.StringToken createString(String str) {
0194: if (COUNTTOKENS)
0195: Token.tokens++;
0196: return new Token.StringToken(Token.STRING, str, 0);
0197: }
0198:
0199: static Token.ModifierToken createModifierGroup(Token child,
0200: int add, int mask) {
0201: if (COUNTTOKENS)
0202: Token.tokens++;
0203: return new Token.ModifierToken(child, add, mask);
0204: }
0205:
0206: static Token.ConditionToken createCondition(int refno,
0207: Token condition, Token yespat, Token nopat) {
0208: if (COUNTTOKENS)
0209: Token.tokens++;
0210: return new Token.ConditionToken(refno, condition, yespat, nopat);
0211: }
0212:
/**
 * Creates a token of the given node type. The token counter is not
 * touched here; only the create* factories update it.
 *
 * @param type node type stored in {@link #type}
 */
protected Token(int type) {
    this.type = type;
}
0216:
0217: /**
0218: * A number of children.
0219: */
0220: int size() {
0221: return 0;
0222: }
0223:
0224: Token getChild(int index) {
0225: return null;
0226: }
0227:
0228: void addChild(Token tok) {
0229: throw new RuntimeException("Not supported.");
0230: }
0231:
0232: // for RANGE or NRANGE
0233: protected void addRange(int start, int end) {
0234: throw new RuntimeException("Not supported.");
0235: }
0236:
0237: protected void sortRanges() {
0238: throw new RuntimeException("Not supported.");
0239: }
0240:
0241: protected void compactRanges() {
0242: throw new RuntimeException("Not supported.");
0243: }
0244:
0245: protected void mergeRanges(Token tok) {
0246: throw new RuntimeException("Not supported.");
0247: }
0248:
0249: protected void subtractRanges(Token tok) {
0250: throw new RuntimeException("Not supported.");
0251: }
0252:
0253: protected void intersectRanges(Token tok) {
0254: throw new RuntimeException("Not supported.");
0255: }
0256:
0257: static Token complementRanges(Token tok) {
0258: return RangeToken.complementRanges(tok);
0259: }
0260:
0261: void setMin(int min) { // for CLOSURE
0262: }
0263:
0264: void setMax(int max) { // for CLOSURE
0265: }
0266:
0267: int getMin() { // for CLOSURE
0268: return -1;
0269: }
0270:
0271: int getMax() { // for CLOSURE
0272: return -1;
0273: }
0274:
0275: int getReferenceNumber() { // for STRING
0276: return 0;
0277: }
0278:
0279: String getString() { // for STRING
0280: return null;
0281: }
0282:
0283: int getParenNumber() {
0284: return 0;
0285: }
0286:
0287: int getChar() {
0288: return -1;
0289: }
0290:
0291: public String toString() {
0292: return this .toString(0);
0293: }
0294:
0295: public String toString(int options) {
0296: return this .type == Token.DOT ? "." : "";
0297: }
0298:
0299: /**
0300: * How many characters are needed?
0301: */
0302: final int getMinLength() {
0303: switch (this .type) {
0304: case CONCAT:
0305: int sum = 0;
0306: for (int i = 0; i < this .size(); i++)
0307: sum += this .getChild(i).getMinLength();
0308: return sum;
0309:
0310: case CONDITION:
0311: case UNION:
0312: if (this .size() == 0)
0313: return 0;
0314: int ret = this .getChild(0).getMinLength();
0315: for (int i = 1; i < this .size(); i++) {
0316: int min = this .getChild(i).getMinLength();
0317: if (min < ret)
0318: ret = min;
0319: }
0320: return ret;
0321:
0322: case CLOSURE:
0323: case NONGREEDYCLOSURE:
0324: if (this .getMin() >= 0)
0325: return this .getMin() * this .getChild(0).getMinLength();
0326: return 0;
0327:
0328: case EMPTY:
0329: case ANCHOR:
0330: return 0;
0331:
0332: case DOT:
0333: case CHAR:
0334: case RANGE:
0335: case NRANGE:
0336: return 1;
0337:
0338: case INDEPENDENT:
0339: case PAREN:
0340: case MODIFIERGROUP:
0341: return this .getChild(0).getMinLength();
0342:
0343: case BACKREFERENCE:
0344: return 0; // *******
0345:
0346: case STRING:
0347: return this .getString().length();
0348:
0349: case LOOKAHEAD:
0350: case NEGATIVELOOKAHEAD:
0351: case LOOKBEHIND:
0352: case NEGATIVELOOKBEHIND:
0353: return 0; // ***** Really?
0354:
0355: default:
0356: throw new RuntimeException(
0357: "Token#getMinLength(): Invalid Type: " + this .type);
0358: }
0359: }
0360:
0361: final int getMaxLength() {
0362: switch (this .type) {
0363: case CONCAT:
0364: int sum = 0;
0365: for (int i = 0; i < this .size(); i++) {
0366: int d = this .getChild(i).getMaxLength();
0367: if (d < 0)
0368: return -1;
0369: sum += d;
0370: }
0371: return sum;
0372:
0373: case CONDITION:
0374: case UNION:
0375: if (this .size() == 0)
0376: return 0;
0377: int ret = this .getChild(0).getMaxLength();
0378: for (int i = 1; ret >= 0 && i < this .size(); i++) {
0379: int max = this .getChild(i).getMaxLength();
0380: if (max < 0) { // infinity
0381: ret = -1;
0382: break;
0383: }
0384: if (max > ret)
0385: ret = max;
0386: }
0387: return ret;
0388:
0389: case CLOSURE:
0390: case NONGREEDYCLOSURE:
0391: if (this .getMax() >= 0)
0392: // When this.child.getMaxLength() < 0,
0393: // this returns minus value
0394: return this .getMax() * this .getChild(0).getMaxLength();
0395: return -1;
0396:
0397: case EMPTY:
0398: case ANCHOR:
0399: return 0;
0400:
0401: case CHAR:
0402: return 1;
0403: case DOT:
0404: case RANGE:
0405: case NRANGE:
0406: return 2;
0407:
0408: case INDEPENDENT:
0409: case PAREN:
0410: case MODIFIERGROUP:
0411: return this .getChild(0).getMaxLength();
0412:
0413: case BACKREFERENCE:
0414: return -1; // ******
0415:
0416: case STRING:
0417: return this .getString().length();
0418:
0419: case LOOKAHEAD:
0420: case NEGATIVELOOKAHEAD:
0421: case LOOKBEHIND:
0422: case NEGATIVELOOKBEHIND:
0423: return 0; // ***** Really?
0424:
0425: default:
0426: throw new RuntimeException(
0427: "Token#getMaxLength(): Invalid Type: " + this .type);
0428: }
0429: }
0430:
0431: static final int FC_CONTINUE = 0;
0432: static final int FC_TERMINAL = 1;
0433: static final int FC_ANY = 2;
0434:
0435: private static final boolean isSet(int options, int flag) {
0436: return (options & flag) == flag;
0437: }
0438:
0439: final int analyzeFirstCharacter(RangeToken result, int options) {
0440: switch (this .type) {
0441: case CONCAT:
0442: int ret = FC_CONTINUE;
0443: for (int i = 0; i < this .size(); i++)
0444: if ((ret = this .getChild(i).analyzeFirstCharacter(
0445: result, options)) != FC_CONTINUE)
0446: break;
0447: return ret;
0448:
0449: case UNION:
0450: if (this .size() == 0)
0451: return FC_CONTINUE;
0452: /*
0453: * a|b|c -> FC_TERMINAL
0454: * a|.|c -> FC_ANY
0455: * a|b| -> FC_CONTINUE
0456: */
0457: int ret2 = FC_CONTINUE;
0458: boolean hasEmpty = false;
0459: for (int i = 0; i < this .size(); i++) {
0460: ret2 = this .getChild(i).analyzeFirstCharacter(result,
0461: options);
0462: if (ret2 == FC_ANY)
0463: break;
0464: else if (ret2 == FC_CONTINUE)
0465: hasEmpty = true;
0466: }
0467: return hasEmpty ? FC_CONTINUE : ret2;
0468:
0469: case CONDITION:
0470: int ret3 = this .getChild(0).analyzeFirstCharacter(result,
0471: options);
0472: if (this .size() == 1)
0473: return FC_CONTINUE;
0474: if (ret3 == FC_ANY)
0475: return ret3;
0476: int ret4 = this .getChild(1).analyzeFirstCharacter(result,
0477: options);
0478: if (ret4 == FC_ANY)
0479: return ret4;
0480: return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE
0481: : FC_TERMINAL;
0482:
0483: case CLOSURE:
0484: case NONGREEDYCLOSURE:
0485: this .getChild(0).analyzeFirstCharacter(result, options);
0486: return FC_CONTINUE;
0487:
0488: case EMPTY:
0489: case ANCHOR:
0490: return FC_CONTINUE;
0491:
0492: case CHAR:
0493: int ch = this .getChar();
0494: result.addRange(ch, ch);
0495: if (ch < 0x10000
0496: && isSet(options, RegularExpression.IGNORE_CASE)) {
0497: ch = Character.toUpperCase((char) ch);
0498: result.addRange(ch, ch);
0499: ch = Character.toLowerCase((char) ch);
0500: result.addRange(ch, ch);
0501: }
0502: return FC_TERMINAL;
0503:
0504: case DOT:
0505: return FC_ANY;
0506:
0507: case RANGE:
0508: if (isSet(options, RegularExpression.IGNORE_CASE)) {
0509: result.mergeRanges(((RangeToken) this )
0510: .getCaseInsensitiveToken());
0511: } else {
0512: result.mergeRanges(this );
0513: }
0514: return FC_TERMINAL;
0515:
0516: case NRANGE: // ****
0517: if (isSet(options, RegularExpression.IGNORE_CASE)) {
0518: result.mergeRanges(Token
0519: .complementRanges(((RangeToken) this )
0520: .getCaseInsensitiveToken()));
0521: } else {
0522: result.mergeRanges(Token.complementRanges(this ));
0523: }
0524: return FC_TERMINAL;
0525:
0526: case INDEPENDENT:
0527: case PAREN:
0528: return this .getChild(0).analyzeFirstCharacter(result,
0529: options);
0530:
0531: case MODIFIERGROUP:
0532: options |= ((ModifierToken) this ).getOptions();
0533: options &= ~((ModifierToken) this ).getOptionsMask();
0534: return this .getChild(0).analyzeFirstCharacter(result,
0535: options);
0536:
0537: case BACKREFERENCE:
0538: result.addRange(0, UTF16_MAX); // **** We can not optimize.
0539: return FC_ANY;
0540:
0541: case STRING:
0542: int cha = this .getString().charAt(0);
0543: int ch2;
0544: if (REUtil.isHighSurrogate(cha)
0545: && this .getString().length() >= 2
0546: && REUtil.isLowSurrogate((ch2 = this .getString()
0547: .charAt(1))))
0548: cha = REUtil.composeFromSurrogates(cha, ch2);
0549: result.addRange(cha, cha);
0550: if (cha < 0x10000
0551: && isSet(options, RegularExpression.IGNORE_CASE)) {
0552: cha = Character.toUpperCase((char) cha);
0553: result.addRange(cha, cha);
0554: cha = Character.toLowerCase((char) cha);
0555: result.addRange(cha, cha);
0556: }
0557: return FC_TERMINAL;
0558:
0559: case LOOKAHEAD:
0560: case NEGATIVELOOKAHEAD:
0561: case LOOKBEHIND:
0562: case NEGATIVELOOKBEHIND:
0563: return FC_CONTINUE;
0564:
0565: default:
0566: throw new RuntimeException(
0567: "Token#analyzeHeadCharacter(): Invalid Type: "
0568: + this .type);
0569: }
0570: }
0571:
0572: private final boolean isShorterThan(Token tok) {
0573: if (tok == null)
0574: return false;
0575: /*
0576: int mylength;
0577: if (this.type == STRING) mylength = this.getString().length();
0578: else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
0579: else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
0580: int otherlength;
0581: if (tok.type == STRING) otherlength = tok.getString().length();
0582: else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
0583: else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
0584: */
0585: int mylength;
0586: if (this .type == STRING)
0587: mylength = this .getString().length();
0588: else
0589: throw new RuntimeException("Internal Error: Illegal type: "
0590: + this .type);
0591: int otherlength;
0592: if (tok.type == STRING)
0593: otherlength = tok.getString().length();
0594: else
0595: throw new RuntimeException("Internal Error: Illegal type: "
0596: + tok.type);
0597: return mylength < otherlength;
0598: }
0599:
0600: static class FixedStringContainer {
0601: Token token = null;
0602: int options = 0;
0603:
0604: FixedStringContainer() {
0605: }
0606: }
0607:
0608: final void findFixedString(FixedStringContainer container,
0609: int options) {
0610: switch (this .type) {
0611: case CONCAT:
0612: Token prevToken = null;
0613: int prevOptions = 0;
0614: for (int i = 0; i < this .size(); i++) {
0615: this .getChild(i).findFixedString(container, options);
0616: if (prevToken == null
0617: || prevToken.isShorterThan(container.token)) {
0618: prevToken = container.token;
0619: prevOptions = container.options;
0620: }
0621: }
0622: container.token = prevToken;
0623: container.options = prevOptions;
0624: return;
0625:
0626: case UNION:
0627: case CLOSURE:
0628: case NONGREEDYCLOSURE:
0629: case EMPTY:
0630: case ANCHOR:
0631: case RANGE:
0632: case DOT:
0633: case NRANGE:
0634: case BACKREFERENCE:
0635: case LOOKAHEAD:
0636: case NEGATIVELOOKAHEAD:
0637: case LOOKBEHIND:
0638: case NEGATIVELOOKBEHIND:
0639: case CONDITION:
0640: container.token = null;
0641: return;
0642:
0643: case CHAR: // Ignore CHAR tokens.
0644: container.token = null; // **
0645: return; // **
0646:
0647: case STRING:
0648: container.token = this ;
0649: container.options = options;
0650: return;
0651:
0652: case INDEPENDENT:
0653: case PAREN:
0654: this .getChild(0).findFixedString(container, options);
0655: return;
0656:
0657: case MODIFIERGROUP:
0658: options |= ((ModifierToken) this ).getOptions();
0659: options &= ~((ModifierToken) this ).getOptionsMask();
0660: this .getChild(0).findFixedString(container, options);
0661: return;
0662:
0663: default:
0664: throw new RuntimeException(
0665: "Token#findFixedString(): Invalid Type: "
0666: + this .type);
0667: }
0668: }
0669:
0670: boolean match(int ch) {
0671: throw new RuntimeException("NFAArrow#match(): Internal error: "
0672: + this .type);
0673: }
0674:
0675: // ------------------------------------------------------
0676: private final static Hashtable categories = new Hashtable();
0677: private final static Hashtable categories2 = new Hashtable();
0678: private static final String[] categoryNames = { "Cn", "Lu", "Ll",
0679: "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", "Nl", "No", "Zs",
0680: "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", "Pd", "Ps", "Pe",
0681: "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
0682: "Pi", "Pf", // 29, 30
0683: "L", "M", "N", "Z", "C", "P", "S", // 31-37
0684: };
0685:
0686: // Schema Rec. {Datatypes} - Punctuation
0687: static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
0688: static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
0689: static final int CHAR_LETTER = 31;
0690: static final int CHAR_MARK = 32;
0691: static final int CHAR_NUMBER = 33;
0692: static final int CHAR_SEPARATOR = 34;
0693: static final int CHAR_OTHER = 35;
0694: static final int CHAR_PUNCTUATION = 36;
0695: static final int CHAR_SYMBOL = 37;
0696:
// Block names from Unicode 3.1 that are supported by the XML Schema REC.
// The comment in front of each name is the code point range of the block.
// The order matters: getRange() takes the range of entry i from
// blockRanges when i is below NONBMP_BLOCK_START and from
// nonBMPBlockRanges otherwise.
private static final String[] blockNames = {
    /*0000..007F;*/"Basic Latin",
    /*0080..00FF;*/"Latin-1 Supplement",
    /*0100..017F;*/"Latin Extended-A",
    /*0180..024F;*/"Latin Extended-B",
    /*0250..02AF;*/"IPA Extensions",
    /*02B0..02FF;*/"Spacing Modifier Letters",
    /*0300..036F;*/"Combining Diacritical Marks",
    /*0370..03FF;*/"Greek",
    /*0400..04FF;*/"Cyrillic",
    /*0530..058F;*/"Armenian",
    /*0590..05FF;*/"Hebrew",
    /*0600..06FF;*/"Arabic",
    /*0700..074F;*/"Syriac",
    /*0780..07BF;*/"Thaana",
    /*0900..097F;*/"Devanagari",
    /*0980..09FF;*/"Bengali",
    /*0A00..0A7F;*/"Gurmukhi",
    /*0A80..0AFF;*/"Gujarati",
    /*0B00..0B7F;*/"Oriya",
    /*0B80..0BFF;*/"Tamil",
    /*0C00..0C7F;*/"Telugu",
    /*0C80..0CFF;*/"Kannada",
    /*0D00..0D7F;*/"Malayalam",
    /*0D80..0DFF;*/"Sinhala",
    /*0E00..0E7F;*/"Thai",
    /*0E80..0EFF;*/"Lao",
    /*0F00..0FFF;*/"Tibetan",
    /*1000..109F;*/"Myanmar",
    /*10A0..10FF;*/"Georgian",
    /*1100..11FF;*/"Hangul Jamo",
    /*1200..137F;*/"Ethiopic",
    /*13A0..13FF;*/"Cherokee",
    /*1400..167F;*/"Unified Canadian Aboriginal Syllabics",
    /*1680..169F;*/"Ogham",
    /*16A0..16FF;*/"Runic",
    /*1780..17FF;*/"Khmer",
    /*1800..18AF;*/"Mongolian",
    /*1E00..1EFF;*/"Latin Extended Additional",
    /*1F00..1FFF;*/"Greek Extended",
    /*2000..206F;*/"General Punctuation",
    /*2070..209F;*/"Superscripts and Subscripts",
    /*20A0..20CF;*/"Currency Symbols",
    /*20D0..20FF;*/"Combining Marks for Symbols",
    /*2100..214F;*/"Letterlike Symbols",
    /*2150..218F;*/"Number Forms",
    /*2190..21FF;*/"Arrows",
    /*2200..22FF;*/"Mathematical Operators",
    /*2300..23FF;*/"Miscellaneous Technical",
    /*2400..243F;*/"Control Pictures",
    /*2440..245F;*/"Optical Character Recognition",
    /*2460..24FF;*/"Enclosed Alphanumerics",
    /*2500..257F;*/"Box Drawing",
    /*2580..259F;*/"Block Elements",
    /*25A0..25FF;*/"Geometric Shapes",
    /*2600..26FF;*/"Miscellaneous Symbols",
    /*2700..27BF;*/"Dingbats",
    /*2800..28FF;*/"Braille Patterns",
    /*2E80..2EFF;*/"CJK Radicals Supplement",
    /*2F00..2FDF;*/"Kangxi Radicals",
    /*2FF0..2FFF;*/"Ideographic Description Characters",
    /*3000..303F;*/"CJK Symbols and Punctuation",
    /*3040..309F;*/"Hiragana",
    /*30A0..30FF;*/"Katakana",
    /*3100..312F;*/"Bopomofo",
    /*3130..318F;*/"Hangul Compatibility Jamo",
    /*3190..319F;*/"Kanbun",
    /*31A0..31BF;*/"Bopomofo Extended",
    /*3200..32FF;*/"Enclosed CJK Letters and Months",
    /*3300..33FF;*/"CJK Compatibility",
    /*3400..4DB5;*/"CJK Unified Ideographs Extension A",
    /*4E00..9FFF;*/"CJK Unified Ideographs",
    /*A000..A48F;*/"Yi Syllables",
    /*A490..A4CF;*/"Yi Radicals",
    /*AC00..D7A3;*/"Hangul Syllables",
    /*E000..F8FF;*/"Private Use",
    /*F900..FAFF;*/"CJK Compatibility Ideographs",
    /*FB00..FB4F;*/"Alphabetic Presentation Forms",
    /*FB50..FDFF;*/"Arabic Presentation Forms-A",
    /*FE20..FE2F;*/"Combining Half Marks",
    /*FE30..FE4F;*/"CJK Compatibility Forms",
    /*FE50..FE6F;*/"Small Form Variants",
    /*FE70..FEFE;*/"Arabic Presentation Forms-B",
    /*FEFF..FEFF;*/"Specials",
    /*FF00..FFEF;*/"Halfwidth and Fullwidth Forms",
    // The second "Specials" range (FFF0..FFFD) is not listed here;
    // getRange() adds it to the "Specials" entry by hand.
    /*10300..1032F;*/"Old Italic", // index 84, the first non-BMP block
    /*10330..1034F;*/"Gothic",
    /*10400..1044F;*/"Deseret",
    /*1D000..1D0FF;*/"Byzantine Musical Symbols",
    /*1D100..1D1FF;*/"Musical Symbols",
    /*1D400..1D7FF;*/"Mathematical Alphanumeric Symbols",
    /*20000..2A6D6;*/"CJK Unified Ideographs Extension B",
    /*2F800..2FA1F;*/"CJK Compatibility Ideographs Supplement",
    /*E0000..E007F;*/"Tags",
    // The two supplementary "Private Use" ranges are not listed here;
    // getRange() adds them to the "Private Use" entry by hand.

};
// Ranges that have no entry of their own in the tables; getRange() adds
// them by hand to the named block:
//   F0000..FFFFD    "Private Use"
//   100000..10FFFD  "Private Use"
//   FFF0..FFFD      "Specials"
//
// Start and end characters of the BMP blocks, packed into one string as
// 84 consecutive (start, end) pairs. The pair for blockNames[i] is found
// at charAt(i * 2) and charAt(i * 2 + 1).
static final String blockRanges = "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
        + "\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
        + "\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
        + "\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
        + "\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
        + "\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
        + "\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
        + "\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
        + "\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
        + "\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
        + "\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
0811: static final int[] nonBMPBlockRanges = {
0812: 0x10300,
0813: 0x1032F, // 84
0814: 0x10330, 0x1034F, 0x10400, 0x1044F, 0x1D000, 0x1D0FF,
0815: 0x1D100, 0x1D1FF, 0x1D400, 0x1D7FF, 0x20000, 0x2A6D6,
0816: 0x2F800, 0x2FA1F, 0xE0000, 0xE007F };
0817: private static final int NONBMP_BLOCK_START = 84;
0818:
0819: static protected RangeToken getRange(String name, boolean positive) {
0820: if (Token.categories.size() == 0) {
0821: synchronized (Token.categories) {
0822: Token[] ranges = new Token[Token.categoryNames.length];
0823: for (int i = 0; i < ranges.length; i++) {
0824: ranges[i] = Token.createRange();
0825: }
0826: int type;
0827: for (int i = 0; i < 0x10000; i++) {
0828: type = Character.getType((char) i);
0829: if (type == Character.START_PUNCTUATION
0830: || type == Character.END_PUNCTUATION) {
0831: //build table of Pi values
0832: if (i == 0x00AB || i == 0x2018 || i == 0x201B
0833: || i == 0x201C || i == 0x201F
0834: || i == 0x2039) {
0835: type = CHAR_INIT_QUOTE;
0836: }
0837: //build table of Pf values
0838: if (i == 0x00BB || i == 0x2019 || i == 0x201D
0839: || i == 0x203A) {
0840: type = CHAR_FINAL_QUOTE;
0841: }
0842: }
0843: ranges[type].addRange(i, i);
0844: switch (type) {
0845: case Character.UPPERCASE_LETTER:
0846: case Character.LOWERCASE_LETTER:
0847: case Character.TITLECASE_LETTER:
0848: case Character.MODIFIER_LETTER:
0849: case Character.OTHER_LETTER:
0850: type = CHAR_LETTER;
0851: break;
0852: case Character.NON_SPACING_MARK:
0853: case Character.COMBINING_SPACING_MARK:
0854: case Character.ENCLOSING_MARK:
0855: type = CHAR_MARK;
0856: break;
0857: case Character.DECIMAL_DIGIT_NUMBER:
0858: case Character.LETTER_NUMBER:
0859: case Character.OTHER_NUMBER:
0860: type = CHAR_NUMBER;
0861: break;
0862: case Character.SPACE_SEPARATOR:
0863: case Character.LINE_SEPARATOR:
0864: case Character.PARAGRAPH_SEPARATOR:
0865: type = CHAR_SEPARATOR;
0866: break;
0867: case Character.CONTROL:
0868: case Character.FORMAT:
0869: case Character.SURROGATE:
0870: case Character.PRIVATE_USE:
0871: case Character.UNASSIGNED:
0872: type = CHAR_OTHER;
0873: break;
0874: case Character.CONNECTOR_PUNCTUATION:
0875: case Character.DASH_PUNCTUATION:
0876: case Character.START_PUNCTUATION:
0877: case Character.END_PUNCTUATION:
0878: case CHAR_INIT_QUOTE:
0879: case CHAR_FINAL_QUOTE:
0880: case Character.OTHER_PUNCTUATION:
0881: type = CHAR_PUNCTUATION;
0882: break;
0883: case Character.MATH_SYMBOL:
0884: case Character.CURRENCY_SYMBOL:
0885: case Character.MODIFIER_SYMBOL:
0886: case Character.OTHER_SYMBOL:
0887: type = CHAR_SYMBOL;
0888: break;
0889: default:
0890: throw new RuntimeException(
0891: "org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "
0892: + type);
0893: }
0894: ranges[type].addRange(i, i);
0895: } // for all characters
0896: ranges[Character.UNASSIGNED].addRange(0x10000,
0897: Token.UTF16_MAX);
0898:
0899: for (int i = 0; i < ranges.length; i++) {
0900: if (Token.categoryNames[i] != null) {
0901: if (i == Character.UNASSIGNED) { // Unassigned
0902: ranges[i]
0903: .addRange(0x10000, Token.UTF16_MAX);
0904: }
0905: Token.categories.put(Token.categoryNames[i],
0906: ranges[i]);
0907: Token.categories2.put(Token.categoryNames[i],
0908: Token.complementRanges(ranges[i]));
0909: }
0910: }
0911: //REVISIT: do we really need to support block names as in Unicode 3.1
0912: // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
0913: //
0914: StringBuffer buffer = new StringBuffer(50);
0915: for (int i = 0; i < Token.blockNames.length; i++) {
0916: Token r1 = Token.createRange();
0917: int location;
0918: if (i < NONBMP_BLOCK_START) {
0919: location = i * 2;
0920: int rstart = Token.blockRanges.charAt(location);
0921: int rend = Token.blockRanges
0922: .charAt(location + 1);
0923: //DEBUGING
0924: //System.out.println(n+" " +Integer.toHexString(rstart)
0925: // +"-"+ Integer.toHexString(rend));
0926: r1.addRange(rstart, rend);
0927: } else {
0928: location = (i - NONBMP_BLOCK_START) * 2;
0929: r1.addRange(Token.nonBMPBlockRanges[location],
0930: Token.nonBMPBlockRanges[location + 1]);
0931: }
0932: String n = Token.blockNames[i];
0933: if (n.equals("Specials"))
0934: r1.addRange(0xfff0, 0xfffd);
0935: if (n.equals("Private Use")) {
0936: r1.addRange(0xF0000, 0xFFFFD);
0937: r1.addRange(0x100000, 0x10FFFD);
0938: }
0939: Token.categories.put(n, r1);
0940: Token.categories2
0941: .put(n, Token.complementRanges(r1));
0942: buffer.setLength(0);
0943: buffer.append("Is");
0944: if (n.indexOf(' ') >= 0) {
0945: for (int ci = 0; ci < n.length(); ci++)
0946: if (n.charAt(ci) != ' ')
0947: buffer.append((char) n.charAt(ci));
0948: } else {
0949: buffer.append(n);
0950: }
0951: Token.setAlias(buffer.toString(), n, true);
0952: }
0953:
0954: // TR#18 1.2
0955: Token.setAlias("ASSIGNED", "Cn", false);
0956: Token.setAlias("UNASSIGNED", "Cn", true);
0957: Token all = Token.createRange();
0958: all.addRange(0, Token.UTF16_MAX);
0959: Token.categories.put("ALL", all);
0960: Token.categories2.put("ALL", Token
0961: .complementRanges(all));
0962: Token.registerNonXS("ASSIGNED");
0963: Token.registerNonXS("UNASSIGNED");
0964: Token.registerNonXS("ALL");
0965:
0966: Token isalpha = Token.createRange();
0967: isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
0968: isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
0969: isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
0970: Token.categories.put("IsAlpha", isalpha);
0971: Token.categories2.put("IsAlpha", Token
0972: .complementRanges(isalpha));
0973: Token.registerNonXS("IsAlpha");
0974:
0975: Token isalnum = Token.createRange();
0976: isalnum.mergeRanges(isalpha); // Lu Ll Lo
0977: isalnum
0978: .mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
0979: Token.categories.put("IsAlnum", isalnum);
0980: Token.categories2.put("IsAlnum", Token
0981: .complementRanges(isalnum));
0982: Token.registerNonXS("IsAlnum");
0983:
0984: Token isspace = Token.createRange();
0985: isspace.mergeRanges(Token.token_spaces);
0986: isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
0987: Token.categories.put("IsSpace", isspace);
0988: Token.categories2.put("IsSpace", Token
0989: .complementRanges(isspace));
0990: Token.registerNonXS("IsSpace");
0991:
0992: Token isword = Token.createRange();
0993: isword.mergeRanges(isalnum); // Lu Ll Lo Nd
0994: isword.addRange('_', '_');
0995: Token.categories.put("IsWord", isword);
0996: Token.categories2.put("IsWord", Token
0997: .complementRanges(isword));
0998: Token.registerNonXS("IsWord");
0999:
1000: Token isascii = Token.createRange();
1001: isascii.addRange(0, 127);
1002: Token.categories.put("IsASCII", isascii);
1003: Token.categories2.put("IsASCII", Token
1004: .complementRanges(isascii));
1005: Token.registerNonXS("IsASCII");
1006:
1007: Token isnotgraph = Token.createRange();
1008: isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
1009: isnotgraph.addRange(' ', ' ');
1010: Token.categories.put("IsGraph", Token
1011: .complementRanges(isnotgraph));
1012: Token.categories2.put("IsGraph", isnotgraph);
1013: Token.registerNonXS("IsGraph");
1014:
1015: Token isxdigit = Token.createRange();
1016: isxdigit.addRange('0', '9');
1017: isxdigit.addRange('A', 'F');
1018: isxdigit.addRange('a', 'f');
1019: Token.categories.put("IsXDigit", Token
1020: .complementRanges(isxdigit));
1021: Token.categories2.put("IsXDigit", isxdigit);
1022: Token.registerNonXS("IsXDigit");
1023:
1024: Token.setAlias("IsDigit", "Nd", true);
1025: Token.setAlias("IsUpper", "Lu", true);
1026: Token.setAlias("IsLower", "Ll", true);
1027: Token.setAlias("IsCntrl", "C", true);
1028: Token.setAlias("IsPrint", "C", false);
1029: Token.setAlias("IsPunct", "P", true);
1030: Token.registerNonXS("IsDigit");
1031: Token.registerNonXS("IsUpper");
1032: Token.registerNonXS("IsLower");
1033: Token.registerNonXS("IsCntrl");
1034: Token.registerNonXS("IsPrint");
1035: Token.registerNonXS("IsPunct");
1036:
1037: Token.setAlias("alpha", "IsAlpha", true);
1038: Token.setAlias("alnum", "IsAlnum", true);
1039: Token.setAlias("ascii", "IsASCII", true);
1040: Token.setAlias("cntrl", "IsCntrl", true);
1041: Token.setAlias("digit", "IsDigit", true);
1042: Token.setAlias("graph", "IsGraph", true);
1043: Token.setAlias("lower", "IsLower", true);
1044: Token.setAlias("print", "IsPrint", true);
1045: Token.setAlias("punct", "IsPunct", true);
1046: Token.setAlias("space", "IsSpace", true);
1047: Token.setAlias("upper", "IsUpper", true);
1048: Token.setAlias("word", "IsWord", true); // Perl extension
1049: Token.setAlias("xdigit", "IsXDigit", true);
1050: Token.registerNonXS("alpha");
1051: Token.registerNonXS("alnum");
1052: Token.registerNonXS("ascii");
1053: Token.registerNonXS("cntrl");
1054: Token.registerNonXS("digit");
1055: Token.registerNonXS("graph");
1056: Token.registerNonXS("lower");
1057: Token.registerNonXS("print");
1058: Token.registerNonXS("punct");
1059: Token.registerNonXS("space");
1060: Token.registerNonXS("upper");
1061: Token.registerNonXS("word");
1062: Token.registerNonXS("xdigit");
1063: } // synchronized
1064: } // if null
1065: RangeToken tok = positive ? (RangeToken) Token.categories
1066: .get(name) : (RangeToken) Token.categories2.get(name);
1067: //if (tok == null) System.out.println(name);
1068: return tok;
1069: }
1070:
1071: static protected RangeToken getRange(String name, boolean positive,
1072: boolean xs) {
1073: RangeToken range = Token.getRange(name, positive);
1074: if (xs && range != null && Token.isRegisterNonXS(name))
1075: range = null;
1076: return range;
1077: }
1078:
1079: static Hashtable nonxs = null;
1080:
1081: /**
1082: * This method is called by only getRange().
1083: * So this method need not MT-safe.
1084: */
1085: static protected void registerNonXS(String name) {
1086: if (Token.nonxs == null)
1087: Token.nonxs = new Hashtable();
1088: Token.nonxs.put(name, name);
1089: }
1090:
1091: static protected boolean isRegisterNonXS(String name) {
1092: if (Token.nonxs == null)
1093: return false;
1094: //DEBUG
1095: //System.err.println("isRegisterNonXS: "+name);
1096: return Token.nonxs.containsKey(name);
1097: }
1098:
1099: private static void setAlias(String newName, String name,
1100: boolean positive) {
1101: Token t1 = (Token) Token.categories.get(name);
1102: Token t2 = (Token) Token.categories2.get(name);
1103: if (positive) {
1104: Token.categories.put(newName, t1);
1105: Token.categories2.put(newName, t2);
1106: } else {
1107: Token.categories2.put(newName, t1);
1108: Token.categories.put(newName, t2);
1109: }
1110: }
1111:
1112: // ------------------------------------------------------
1113:
1114: static final String viramaString = "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1115: + "\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1116: + "\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1117: + "\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1118: + "\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1119: + "\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1120: + "\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1121: + "\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1122: + "\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1123: + "\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1124: + "\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1125:
1126: static private Token token_grapheme = null;
1127:
1128: static synchronized Token getGraphemePattern() {
1129: if (Token.token_grapheme != null)
1130: return Token.token_grapheme;
1131:
1132: Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1133: base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1134: base_char.subtractRanges(Token.getRange("M", true));
1135: base_char.subtractRanges(Token.getRange("C", true));
1136:
1137: Token virama = Token.createRange();
1138: for (int i = 0; i < Token.viramaString.length(); i++) {
1139: virama.addRange(i, i);
1140: }
1141:
1142: Token combiner_wo_virama = Token.createRange();
1143: combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1144: combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1145: combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1146:
1147: Token left = Token.createUnion(); // base_char?
1148: left.addChild(base_char);
1149: left.addChild(Token.token_empty);
1150:
1151: Token foo = Token.createUnion();
1152: foo.addChild(Token.createConcat(virama, Token.getRange("L",
1153: true)));
1154: foo.addChild(combiner_wo_virama);
1155:
1156: foo = Token.createClosure(foo);
1157:
1158: foo = Token.createConcat(left, foo);
1159:
1160: Token.token_grapheme = foo;
1161: return Token.token_grapheme;
1162: }
1163:
1164: /**
1165: * Combing Character Sequence in Perl 5.6.
1166: */
1167: static private Token token_ccs = null;
1168:
1169: static synchronized Token getCombiningCharacterSequence() {
1170: if (Token.token_ccs != null)
1171: return Token.token_ccs;
1172:
1173: Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1174: foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1175: Token.token_ccs = foo;
1176: return Token.token_ccs;
1177: }
1178:
1179: // ------------------------------------------------------
1180:
1181: // ------------------------------------------------------
1182: /**
1183: * This class represents a node in parse tree.
1184: */
1185: static class StringToken extends Token implements
1186: java.io.Serializable {
1187:
1188: private static final long serialVersionUID = -4614366944218504172L;
1189:
1190: String string;
1191: final int refNumber;
1192:
1193: StringToken(int type, String str, int n) {
1194: super (type);
1195: this .string = str;
1196: this .refNumber = n;
1197: }
1198:
1199: int getReferenceNumber() { // for STRING
1200: return this .refNumber;
1201: }
1202:
1203: String getString() { // for STRING
1204: return this .string;
1205: }
1206:
1207: public String toString(int options) {
1208: if (this .type == BACKREFERENCE)
1209: return "\\" + this .refNumber;
1210: else
1211: return REUtil.quoteMeta(this .string);
1212: }
1213: }
1214:
1215: /**
1216: * This class represents a node in parse tree.
1217: */
1218: static class ConcatToken extends Token implements
1219: java.io.Serializable {
1220:
1221: private static final long serialVersionUID = 8717321425541346381L;
1222:
1223: final Token child;
1224: final Token child2;
1225:
1226: ConcatToken(Token t1, Token t2) {
1227: super (Token.CONCAT);
1228: this .child = t1;
1229: this .child2 = t2;
1230: }
1231:
1232: int size() {
1233: return 2;
1234: }
1235:
1236: Token getChild(int index) {
1237: return index == 0 ? this .child : this .child2;
1238: }
1239:
1240: public String toString(int options) {
1241: String ret;
1242: if (this .child2.type == CLOSURE
1243: && this .child2.getChild(0) == this .child) {
1244: ret = this .child.toString(options) + "+";
1245: } else if (this .child2.type == NONGREEDYCLOSURE
1246: && this .child2.getChild(0) == this .child) {
1247: ret = this .child.toString(options) + "+?";
1248: } else
1249: ret = this .child.toString(options)
1250: + this .child2.toString(options);
1251: return ret;
1252: }
1253: }
1254:
1255: /**
1256: * This class represents a node in parse tree.
1257: */
1258: static class CharToken extends Token implements
1259: java.io.Serializable {
1260:
1261: private static final long serialVersionUID = -4394272816279496989L;
1262:
1263: final int chardata;
1264:
1265: CharToken(int type, int ch) {
1266: super (type);
1267: this .chardata = ch;
1268: }
1269:
1270: int getChar() {
1271: return this .chardata;
1272: }
1273:
1274: public String toString(int options) {
1275: String ret;
1276: switch (this .type) {
1277: case CHAR:
1278: switch (this .chardata) {
1279: case '|':
1280: case '*':
1281: case '+':
1282: case '?':
1283: case '(':
1284: case ')':
1285: case '.':
1286: case '[':
1287: case '{':
1288: case '\\':
1289: ret = "\\" + (char) this .chardata;
1290: break;
1291: case '\f':
1292: ret = "\\f";
1293: break;
1294: case '\n':
1295: ret = "\\n";
1296: break;
1297: case '\r':
1298: ret = "\\r";
1299: break;
1300: case '\t':
1301: ret = "\\t";
1302: break;
1303: case 0x1b:
1304: ret = "\\e";
1305: break;
1306: //case 0x0b: ret = "\\v"; break;
1307: default:
1308: if (this .chardata >= 0x10000) {
1309: String pre = "0"
1310: + Integer.toHexString(this .chardata);
1311: ret = "\\v"
1312: + pre.substring(pre.length() - 6, pre
1313: .length());
1314: } else
1315: ret = "" + (char) this .chardata;
1316: }
1317: break;
1318:
1319: case ANCHOR:
1320: if (this == Token.token_linebeginning
1321: || this == Token.token_lineend)
1322: ret = "" + (char) this .chardata;
1323: else
1324: ret = "\\" + (char) this .chardata;
1325: break;
1326:
1327: default:
1328: ret = null;
1329: }
1330: return ret;
1331: }
1332:
1333: boolean match(int ch) {
1334: if (this .type == CHAR) {
1335: return ch == this .chardata;
1336: } else
1337: throw new RuntimeException(
1338: "NFAArrow#match(): Internal error: "
1339: + this .type);
1340: }
1341: }
1342:
1343: /**
1344: * This class represents a node in parse tree.
1345: */
1346: static class ClosureToken extends Token implements
1347: java.io.Serializable {
1348:
1349: private static final long serialVersionUID = 1308971930673997452L;
1350:
1351: int min;
1352: int max;
1353: final Token child;
1354:
1355: ClosureToken(int type, Token tok) {
1356: super (type);
1357: this .child = tok;
1358: this .setMin(-1);
1359: this .setMax(-1);
1360: }
1361:
1362: int size() {
1363: return 1;
1364: }
1365:
1366: Token getChild(int index) {
1367: return this .child;
1368: }
1369:
1370: final void setMin(int min) {
1371: this .min = min;
1372: }
1373:
1374: final void setMax(int max) {
1375: this .max = max;
1376: }
1377:
1378: final int getMin() {
1379: return this .min;
1380: }
1381:
1382: final int getMax() {
1383: return this .max;
1384: }
1385:
1386: public String toString(int options) {
1387: String ret;
1388: if (this .type == CLOSURE) {
1389: if (this .getMin() < 0 && this .getMax() < 0) {
1390: ret = this .child.toString(options) + "*";
1391: } else if (this .getMin() == this .getMax()) {
1392: ret = this .child.toString(options) + "{"
1393: + this .getMin() + "}";
1394: } else if (this .getMin() >= 0 && this .getMax() >= 0) {
1395: ret = this .child.toString(options) + "{"
1396: + this .getMin() + "," + this .getMax() + "}";
1397: } else if (this .getMin() >= 0 && this .getMax() < 0) {
1398: ret = this .child.toString(options) + "{"
1399: + this .getMin() + ",}";
1400: } else
1401: throw new RuntimeException(
1402: "Token#toString(): CLOSURE "
1403: + this .getMin() + ", "
1404: + this .getMax());
1405: } else {
1406: if (this .getMin() < 0 && this .getMax() < 0) {
1407: ret = this .child.toString(options) + "*?";
1408: } else if (this .getMin() == this .getMax()) {
1409: ret = this .child.toString(options) + "{"
1410: + this .getMin() + "}?";
1411: } else if (this .getMin() >= 0 && this .getMax() >= 0) {
1412: ret = this .child.toString(options) + "{"
1413: + this .getMin() + "," + this .getMax()
1414: + "}?";
1415: } else if (this .getMin() >= 0 && this .getMax() < 0) {
1416: ret = this .child.toString(options) + "{"
1417: + this .getMin() + ",}?";
1418: } else
1419: throw new RuntimeException(
1420: "Token#toString(): NONGREEDYCLOSURE "
1421: + this .getMin() + ", "
1422: + this .getMax());
1423: }
1424: return ret;
1425: }
1426: }
1427:
1428: /**
1429: * This class represents a node in parse tree.
1430: */
1431: static class ParenToken extends Token implements
1432: java.io.Serializable {
1433:
1434: private static final long serialVersionUID = -5938014719827987704L;
1435:
1436: final Token child;
1437: final int parennumber;
1438:
1439: ParenToken(int type, Token tok, int paren) {
1440: super (type);
1441: this .child = tok;
1442: this .parennumber = paren;
1443: }
1444:
1445: int size() {
1446: return 1;
1447: }
1448:
1449: Token getChild(int index) {
1450: return this .child;
1451: }
1452:
1453: int getParenNumber() {
1454: return this .parennumber;
1455: }
1456:
1457: public String toString(int options) {
1458: String ret = null;
1459: switch (this .type) {
1460: case PAREN:
1461: if (this .parennumber == 0) {
1462: ret = "(?:" + this .child.toString(options) + ")";
1463: } else {
1464: ret = "(" + this .child.toString(options) + ")";
1465: }
1466: break;
1467:
1468: case LOOKAHEAD:
1469: ret = "(?=" + this .child.toString(options) + ")";
1470: break;
1471: case NEGATIVELOOKAHEAD:
1472: ret = "(?!" + this .child.toString(options) + ")";
1473: break;
1474: case LOOKBEHIND:
1475: ret = "(?<=" + this .child.toString(options) + ")";
1476: break;
1477: case NEGATIVELOOKBEHIND:
1478: ret = "(?<!" + this .child.toString(options) + ")";
1479: break;
1480: case INDEPENDENT:
1481: ret = "(?>" + this .child.toString(options) + ")";
1482: break;
1483: }
1484: return ret;
1485: }
1486: }
1487:
1488: /**
1489: * (?(condition)yes-pattern|no-pattern)
1490: */
1491: static class ConditionToken extends Token implements
1492: java.io.Serializable {
1493:
1494: private static final long serialVersionUID = 4353765277910594411L;
1495:
1496: final int refNumber;
1497: final Token condition;
1498: final Token yes;
1499: final Token no;
1500:
1501: ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1502: super (Token.CONDITION);
1503: this .refNumber = refno;
1504: this .condition = cond;
1505: this .yes = yespat;
1506: this .no = nopat;
1507: }
1508:
1509: int size() {
1510: return this .no == null ? 1 : 2;
1511: }
1512:
1513: Token getChild(int index) {
1514: if (index == 0)
1515: return this .yes;
1516: if (index == 1)
1517: return this .no;
1518: throw new RuntimeException("Internal Error: " + index);
1519: }
1520:
1521: public String toString(int options) {
1522: String ret;
1523: if (refNumber > 0) {
1524: ret = "(?(" + refNumber + ")";
1525: } else if (this .condition.type == Token.ANCHOR) {
1526: ret = "(?(" + this .condition + ")";
1527: } else {
1528: ret = "(?" + this .condition;
1529: }
1530:
1531: if (this .no == null) {
1532: ret += this .yes + ")";
1533: } else {
1534: ret += this .yes + "|" + this .no + ")";
1535: }
1536: return ret;
1537: }
1538: }
1539:
1540: /**
1541: * (ims-ims: .... )
1542: */
1543: static class ModifierToken extends Token implements
1544: java.io.Serializable {
1545:
1546: private static final long serialVersionUID = -9114536559696480356L;
1547:
1548: final Token child;
1549: final int add;
1550: final int mask;
1551:
1552: ModifierToken(Token tok, int add, int mask) {
1553: super (Token.MODIFIERGROUP);
1554: this .child = tok;
1555: this .add = add;
1556: this .mask = mask;
1557: }
1558:
1559: int size() {
1560: return 1;
1561: }
1562:
1563: Token getChild(int index) {
1564: return this .child;
1565: }
1566:
1567: int getOptions() {
1568: return this .add;
1569: }
1570:
1571: int getOptionsMask() {
1572: return this .mask;
1573: }
1574:
1575: public String toString(int options) {
1576: return "(?"
1577: + (this .add == 0 ? "" : REUtil
1578: .createOptionString(this .add))
1579: + (this .mask == 0 ? "" : REUtil
1580: .createOptionString(this .mask)) + ":"
1581: + this .child.toString(options) + ")";
1582: }
1583: }
1584:
1585: /**
1586: * This class represents a node in parse tree.
1587: * for UNION or CONCAT.
1588: */
1589: static class UnionToken extends Token implements
1590: java.io.Serializable {
1591:
1592: private static final long serialVersionUID = -2568843945989489861L;
1593:
1594: Vector children;
1595:
1596: UnionToken(int type) {
1597: super (type);
1598: }
1599:
1600: void addChild(Token tok) {
1601: if (tok == null)
1602: return;
1603: if (this .children == null)
1604: this .children = new Vector();
1605: if (this .type == UNION) {
1606: this .children.addElement(tok);
1607: return;
1608: }
1609: // This is CONCAT, and new child is CONCAT.
1610: if (tok.type == CONCAT) {
1611: for (int i = 0; i < tok.size(); i++)
1612: this .addChild(tok.getChild(i)); // Recursion
1613: return;
1614: }
1615: int size = this .children.size();
1616: if (size == 0) {
1617: this .children.addElement(tok);
1618: return;
1619: }
1620: Token previous = (Token) this .children.elementAt(size - 1);
1621: if (!((previous.type == CHAR || previous.type == STRING) && (tok.type == CHAR || tok.type == STRING))) {
1622: this .children.addElement(tok);
1623: return;
1624: }
1625:
1626: //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1627:
1628: StringBuffer buffer;
1629: int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString()
1630: .length());
1631: if (previous.type == CHAR) { // Replace previous token by STRING
1632: buffer = new StringBuffer(2 + nextMaxLength);
1633: int ch = previous.getChar();
1634: if (ch >= 0x10000)
1635: buffer.append(REUtil.decomposeToSurrogates(ch));
1636: else
1637: buffer.append((char) ch);
1638: previous = Token.createString(null);
1639: this .children.setElementAt(previous, size - 1);
1640: } else { // STRING
1641: buffer = new StringBuffer(previous.getString().length()
1642: + nextMaxLength);
1643: buffer.append(previous.getString());
1644: }
1645:
1646: if (tok.type == CHAR) {
1647: int ch = tok.getChar();
1648: if (ch >= 0x10000)
1649: buffer.append(REUtil.decomposeToSurrogates(ch));
1650: else
1651: buffer.append((char) ch);
1652: } else {
1653: buffer.append(tok.getString());
1654: }
1655:
1656: ((StringToken) previous).string = new String(buffer);
1657: }
1658:
1659: int size() {
1660: return this .children == null ? 0 : this .children.size();
1661: }
1662:
1663: Token getChild(int index) {
1664: return (Token) this .children.elementAt(index);
1665: }
1666:
1667: public String toString(int options) {
1668: String ret;
1669: if (this .type == CONCAT) {
1670: if (this .children.size() == 2) {
1671: Token ch = this .getChild(0);
1672: Token ch2 = this .getChild(1);
1673: if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1674: ret = ch.toString(options) + "+";
1675: } else if (ch2.type == NONGREEDYCLOSURE
1676: && ch2.getChild(0) == ch) {
1677: ret = ch.toString(options) + "+?";
1678: } else
1679: ret = ch.toString(options)
1680: + ch2.toString(options);
1681: } else {
1682: StringBuffer sb = new StringBuffer();
1683: for (int i = 0; i < this .children.size(); i++) {
1684: sb.append(((Token) this .children.elementAt(i))
1685: .toString(options));
1686: }
1687: ret = new String(sb);
1688: }
1689: return ret;
1690: }
1691: if (this .children.size() == 2
1692: && this .getChild(1).type == EMPTY) {
1693: ret = this .getChild(0).toString(options) + "?";
1694: } else if (this .children.size() == 2
1695: && this .getChild(0).type == EMPTY) {
1696: ret = this .getChild(1).toString(options) + "??";
1697: } else {
1698: StringBuffer sb = new StringBuffer();
1699: sb.append(((Token) this .children.elementAt(0))
1700: .toString(options));
1701: for (int i = 1; i < this .children.size(); i++) {
1702: sb.append((char) '|');
1703: sb.append(((Token) this .children.elementAt(i))
1704: .toString(options));
1705: }
1706: ret = new String(sb);
1707: }
1708: return ret;
1709: }
1710: }
1711: }
|