0001: /*
0002: * The Apache Software License, Version 1.1
0003: *
0004: *
0005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
0006: * reserved.
0007: *
0008: * Redistribution and use in source and binary forms, with or without
0009: * modification, are permitted provided that the following conditions
0010: * are met:
0011: *
0012: * 1. Redistributions of source code must retain the above copyright
0013: * notice, this list of conditions and the following disclaimer.
0014: *
0015: * 2. Redistributions in binary form must reproduce the above copyright
0016: * notice, this list of conditions and the following disclaimer in
0017: * the documentation and/or other materials provided with the
0018: * distribution.
0019: *
0020: * 3. The end-user documentation included with the redistribution,
0021: * if any, must include the following acknowledgment:
0022: * "This product includes software developed by the
0023: * Apache Software Foundation (http://www.apache.org/)."
0024: * Alternately, this acknowledgment may appear in the software itself,
0025: * if and wherever such third-party acknowledgments normally appear.
0026: *
0027: * 4. The names "Xerces" and "Apache Software Foundation" must
0028: * not be used to endorse or promote products derived from this
0029: * software without prior written permission. For written
0030: * permission, please contact apache@apache.org.
0031: *
0032: * 5. Products derived from this software may not be called "Apache",
0033: * nor may "Apache" appear in their name, without prior written
0034: * permission of the Apache Software Foundation.
0035: *
0036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0047: * SUCH DAMAGE.
0048: * ====================================================================
0049: *
0050: * This software consists of voluntary contributions made by many
0051: * individuals on behalf of the Apache Software Foundation and was
0052: * originally based on software copyright (c) 1999, International
0053: * Business Machines, Inc., http://www.apache.org. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.xerces.utils.regex;
0059:
0060: import java.util.Vector;
0061: import java.util.Hashtable;
0062:
0063: /**
0064: * This class represents a node in parse tree.
0065: */
0066: class Token implements java.io.Serializable {
// When true, the counting factory methods of this class increment 'tokens'.
0067: static final boolean COUNTTOKENS = true;
// Number of tokens created so far through the counting factory methods.
0068: static int tokens = 0;
0069:
// Node type tags stored in the 'type' field. The trailing comment on each
// constant shows the regular-expression construct it stands for.
// Note that the numeric values do not follow declaration order (DOT is 11).
0070: static final int CHAR = 0; // Literal char
0071: static final int DOT = 11; // .
0072: static final int CONCAT = 1; // XY
0073: static final int UNION = 2; // X|Y|Z
0074: static final int CLOSURE = 3; // X*
0075: static final int RANGE = 4; // [a-zA-Z] etc.
0076: static final int NRANGE = 5; // [^a-zA-Z] etc.
0077: static final int PAREN = 6; // (X) or (?:X)
0078: static final int EMPTY = 7; //
0079: static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
0080: static final int NONGREEDYCLOSURE = 9; // *? +?
0081: static final int STRING = 10; // strings
0082: static final int BACKREFERENCE = 12; // back references
0083: static final int LOOKAHEAD = 20; // (?=...)
0084: static final int NEGATIVELOOKAHEAD = 21; // (?!...)
0085: static final int LOOKBEHIND = 22; // (?<=...)
0086: static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
0087: static final int INDEPENDENT = 24; // (?>...)
0088: static final int MODIFIERGROUP = 25; // (?ims-ims:...)
0089: static final int CONDITION = 26; // (?(...)yes|no)
0090:
// Largest character value handled by this class; used as the upper bound
// whenever a range has to cover "everything up to the end".
0091: static final int UTF16_MAX = 0x10ffff;
0092:
// Type tag of this node: one of the type constants declared above.
0093: int type;
0094:
// Shared, pre-built tokens. All of them are assigned exactly once in the
// static initializer that follows these declarations.
0095: static protected Token token_dot;
0096: static protected Token token_0to9;
0097: static protected Token token_wordchars;
0098: static protected Token token_not_0to9;
0099: static protected Token token_not_wordchars;
0100: static protected Token token_spaces;
0101: static protected Token token_not_spaces;
0102: static protected Token token_empty;
0103: static protected Token token_linebeginning;
0104: static protected Token token_linebeginning2;
0105: static protected Token token_lineend;
0106: static protected Token token_stringbeginning;
0107: static protected Token token_stringend;
0108: static protected Token token_stringend2;
0109: static protected Token token_wordedge;
0110: static protected Token token_not_wordedge;
0111: static protected Token token_wordbeginning;
0112: static protected Token token_wordend;
0113: static {
0114: Token.token_empty = new Token(Token.EMPTY);
0115:
0116: Token.token_linebeginning = Token.createAnchor('^');
0117: Token.token_linebeginning2 = Token.createAnchor('@');
0118: Token.token_lineend = Token.createAnchor('$');
0119: Token.token_stringbeginning = Token.createAnchor('A');
0120: Token.token_stringend = Token.createAnchor('z');
0121: Token.token_stringend2 = Token.createAnchor('Z');
0122: Token.token_wordedge = Token.createAnchor('b');
0123: Token.token_not_wordedge = Token.createAnchor('B');
0124: Token.token_wordbeginning = Token.createAnchor('<');
0125: Token.token_wordend = Token.createAnchor('>');
0126:
0127: Token.token_dot = new Token(Token.DOT);
0128:
0129: Token.token_0to9 = Token.createRange();
0130: Token.token_0to9.addRange('0', '9');
0131: Token.token_wordchars = Token.createRange();
0132: Token.token_wordchars.addRange('0', '9');
0133: Token.token_wordchars.addRange('A', 'Z');
0134: Token.token_wordchars.addRange('_', '_');
0135: Token.token_wordchars.addRange('a', 'z');
0136: Token.token_spaces = Token.createRange();
0137: Token.token_spaces.addRange('\t', '\t');
0138: Token.token_spaces.addRange('\n', '\n');
0139: Token.token_spaces.addRange('\f', '\f');
0140: Token.token_spaces.addRange('\r', '\r');
0141: Token.token_spaces.addRange(' ', ' ');
0142:
0143: Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
0144: Token.token_not_wordchars = Token
0145: .complementRanges(Token.token_wordchars);
0146: Token.token_not_spaces = Token
0147: .complementRanges(Token.token_spaces);
0148: }
0149:
0150: static Token.ParenToken createLook(int type, Token child) {
0151: if (COUNTTOKENS)
0152: Token.tokens++;
0153: return new Token.ParenToken(type, child, 0);
0154: }
0155:
0156: static Token.ParenToken createParen(Token child, int pnumber) {
0157: if (COUNTTOKENS)
0158: Token.tokens++;
0159: return new Token.ParenToken(Token.PAREN, child, pnumber);
0160: }
0161:
0162: static Token.ClosureToken createClosure(Token tok) {
0163: if (COUNTTOKENS)
0164: Token.tokens++;
0165: return new Token.ClosureToken(Token.CLOSURE, tok);
0166: }
0167:
0168: static Token.ClosureToken createNGClosure(Token tok) {
0169: if (COUNTTOKENS)
0170: Token.tokens++;
0171: return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
0172: }
0173:
0174: static Token.ConcatToken createConcat(Token tok1, Token tok2) {
0175: if (COUNTTOKENS)
0176: Token.tokens++;
0177: return new Token.ConcatToken(tok1, tok2);
0178: }
0179:
0180: static Token.UnionToken createConcat() {
0181: if (COUNTTOKENS)
0182: Token.tokens++;
0183: return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
0184: }
0185:
0186: static Token.UnionToken createUnion() {
0187: if (COUNTTOKENS)
0188: Token.tokens++;
0189: return new Token.UnionToken(Token.UNION);
0190: }
0191:
0192: static Token createEmpty() {
0193: return Token.token_empty;
0194: }
0195:
0196: static RangeToken createRange() {
0197: if (COUNTTOKENS)
0198: Token.tokens++;
0199: return new RangeToken(Token.RANGE);
0200: }
0201:
0202: static RangeToken createNRange() {
0203: if (COUNTTOKENS)
0204: Token.tokens++;
0205: return new RangeToken(Token.NRANGE);
0206: }
0207:
0208: static Token.CharToken createChar(int ch) {
0209: if (COUNTTOKENS)
0210: Token.tokens++;
0211: return new Token.CharToken(Token.CHAR, ch);
0212: }
0213:
0214: static private Token.CharToken createAnchor(int ch) {
0215: if (COUNTTOKENS)
0216: Token.tokens++;
0217: return new Token.CharToken(Token.ANCHOR, ch);
0218: }
0219:
0220: static Token.StringToken createBackReference(int refno) {
0221: if (COUNTTOKENS)
0222: Token.tokens++;
0223: return new Token.StringToken(Token.BACKREFERENCE, null, refno);
0224: }
0225:
0226: static Token.StringToken createString(String str) {
0227: if (COUNTTOKENS)
0228: Token.tokens++;
0229: return new Token.StringToken(Token.STRING, str, 0);
0230: }
0231:
0232: static Token.ModifierToken createModifierGroup(Token child,
0233: int add, int mask) {
0234: if (COUNTTOKENS)
0235: Token.tokens++;
0236: return new Token.ModifierToken(child, add, mask);
0237: }
0238:
0239: static Token.ConditionToken createCondition(int refno,
0240: Token condition, Token yespat, Token nopat) {
0241: if (COUNTTOKENS)
0242: Token.tokens++;
0243: return new Token.ConditionToken(refno, condition, yespat, nopat);
0244: }
0245:
/**
 * Creates a node carrying the given type tag. Unlike the create* factory
 * methods, the constructor itself does not touch the token counter.
 *
 * @param type type tag stored in this node
 */
protected Token(int type) {
    this.type = type;
}
0249:
0250: /**
0251: * A number of children.
0252: */
0253: int size() {
0254: return 0;
0255: }
0256:
0257: Token getChild(int index) {
0258: return null;
0259: }
0260:
0261: void addChild(Token tok) {
0262: throw new RuntimeException("Not supported.");
0263: }
0264:
0265: // for RANGE or NRANGE
0266: protected void addRange(int start, int end) {
0267: throw new RuntimeException("Not supported.");
0268: }
0269:
0270: protected void sortRanges() {
0271: throw new RuntimeException("Not supported.");
0272: }
0273:
0274: protected void compactRanges() {
0275: throw new RuntimeException("Not supported.");
0276: }
0277:
0278: protected void mergeRanges(Token tok) {
0279: throw new RuntimeException("Not supported.");
0280: }
0281:
0282: protected void subtractRanges(Token tok) {
0283: throw new RuntimeException("Not supported.");
0284: }
0285:
0286: protected void intersectRanges(Token tok) {
0287: throw new RuntimeException("Not supported.");
0288: }
0289:
0290: static Token complementRanges(Token tok) {
0291: return RangeToken.complementRanges(tok);
0292: }
0293:
0294: void setMin(int min) { // for CLOSURE
0295: }
0296:
0297: void setMax(int max) { // for CLOSURE
0298: }
0299:
0300: int getMin() { // for CLOSURE
0301: return -1;
0302: }
0303:
0304: int getMax() { // for CLOSURE
0305: return -1;
0306: }
0307:
0308: int getReferenceNumber() { // for STRING
0309: return 0;
0310: }
0311:
0312: String getString() { // for STRING
0313: return null;
0314: }
0315:
0316: int getParenNumber() {
0317: return 0;
0318: }
0319:
0320: int getChar() {
0321: return -1;
0322: }
0323:
0324: public String toString() {
0325: return this .toString(0);
0326: }
0327:
0328: public String toString(int options) {
0329: return this .type == Token.DOT ? "." : "";
0330: }
0331:
0332: /**
0333: * How many characters are needed?
0334: */
0335: final int getMinLength() {
0336: switch (this .type) {
0337: case CONCAT:
0338: int sum = 0;
0339: for (int i = 0; i < this .size(); i++)
0340: sum += this .getChild(i).getMinLength();
0341: return sum;
0342:
0343: case CONDITION:
0344: case UNION:
0345: if (this .size() == 0)
0346: return 0;
0347: int ret = this .getChild(0).getMinLength();
0348: for (int i = 1; i < this .size(); i++) {
0349: int min = this .getChild(i).getMinLength();
0350: if (min < ret)
0351: ret = min;
0352: }
0353: return ret;
0354:
0355: case CLOSURE:
0356: case NONGREEDYCLOSURE:
0357: if (this .getMin() >= 0)
0358: return this .getMin() * this .getChild(0).getMinLength();
0359: return 0;
0360:
0361: case EMPTY:
0362: case ANCHOR:
0363: return 0;
0364:
0365: case DOT:
0366: case CHAR:
0367: case RANGE:
0368: case NRANGE:
0369: return 1;
0370:
0371: case INDEPENDENT:
0372: case PAREN:
0373: case MODIFIERGROUP:
0374: return this .getChild(0).getMinLength();
0375:
0376: case BACKREFERENCE:
0377: return 0; // *******
0378:
0379: case STRING:
0380: return this .getString().length();
0381:
0382: case LOOKAHEAD:
0383: case NEGATIVELOOKAHEAD:
0384: case LOOKBEHIND:
0385: case NEGATIVELOOKBEHIND:
0386: return 0; // ***** Really?
0387:
0388: default:
0389: throw new RuntimeException(
0390: "Token#getMinLength(): Invalid Type: " + this .type);
0391: }
0392: }
0393:
0394: final int getMaxLength() {
0395: switch (this .type) {
0396: case CONCAT:
0397: int sum = 0;
0398: for (int i = 0; i < this .size(); i++) {
0399: int d = this .getChild(i).getMaxLength();
0400: if (d < 0)
0401: return -1;
0402: sum += d;
0403: }
0404: return sum;
0405:
0406: case CONDITION:
0407: case UNION:
0408: if (this .size() == 0)
0409: return 0;
0410: int ret = this .getChild(0).getMaxLength();
0411: for (int i = 1; ret >= 0 && i < this .size(); i++) {
0412: int max = this .getChild(i).getMaxLength();
0413: if (max < 0) { // infinity
0414: ret = -1;
0415: break;
0416: }
0417: if (max > ret)
0418: ret = max;
0419: }
0420: return ret;
0421:
0422: case CLOSURE:
0423: case NONGREEDYCLOSURE:
0424: if (this .getMax() >= 0)
0425: // When this.child.getMaxLength() < 0,
0426: // this returns minus value
0427: return this .getMax() * this .getChild(0).getMaxLength();
0428: return -1;
0429:
0430: case EMPTY:
0431: case ANCHOR:
0432: return 0;
0433:
0434: case CHAR:
0435: return 1;
0436: case DOT:
0437: case RANGE:
0438: case NRANGE:
0439: return 2;
0440:
0441: case INDEPENDENT:
0442: case PAREN:
0443: case MODIFIERGROUP:
0444: return this .getChild(0).getMaxLength();
0445:
0446: case BACKREFERENCE:
0447: return -1; // ******
0448:
0449: case STRING:
0450: return this .getString().length();
0451:
0452: case LOOKAHEAD:
0453: case NEGATIVELOOKAHEAD:
0454: case LOOKBEHIND:
0455: case NEGATIVELOOKBEHIND:
0456: return 0; // ***** Really?
0457:
0458: default:
0459: throw new RuntimeException(
0460: "Token#getMaxLength(): Invalid Type: " + this .type);
0461: }
0462: }
0463:
// Result codes of analyzeFirstCharacter.
// FC_CONTINUE: the analysis has to go on with whatever follows this node
// (returned, for example, for EMPTY and ANCHOR nodes).
0464: static final int FC_CONTINUE = 0;
// FC_TERMINAL: the possible first characters were added to the result set
// and nothing after this node needs to be inspected.
0465: static final int FC_TERMINAL = 1;
// FC_ANY: any character may come first (returned for BACKREFERENCE after
// adding the full range 0..UTF16_MAX).
0466: static final int FC_ANY = 2;
0467:
0468: private static final boolean isSet(int options, int flag) {
0469: return (options & flag) == flag;
0470: }
0471:
0472: final int analyzeFirstCharacter(RangeToken result, int options) {
0473: switch (this .type) {
0474: case CONCAT:
0475: int ret = FC_CONTINUE;
0476: for (int i = 0; i < this .size(); i++)
0477: if ((ret = this .getChild(i).analyzeFirstCharacter(
0478: result, options)) != FC_CONTINUE)
0479: break;
0480: return ret;
0481:
0482: case UNION:
0483: if (this .size() == 0)
0484: return FC_CONTINUE;
0485: /*
0486: * a|b|c -> FC_TERMINAL
0487: * a|.|c -> FC_ANY
0488: * a|b| -> FC_CONTINUE
0489: */
0490: int ret2 = FC_CONTINUE;
0491: boolean hasEmpty = false;
0492: for (int i = 0; i < this .size(); i++) {
0493: ret2 = this .getChild(i).analyzeFirstCharacter(result,
0494: options);
0495: if (ret2 == FC_ANY)
0496: break;
0497: else if (ret2 == FC_CONTINUE)
0498: hasEmpty = true;
0499: }
0500: return hasEmpty ? FC_CONTINUE : ret2;
0501:
0502: case CONDITION:
0503: int ret3 = this .getChild(0).analyzeFirstCharacter(result,
0504: options);
0505: if (this .size() == 1)
0506: return FC_CONTINUE;
0507: if (ret3 == FC_ANY)
0508: return ret3;
0509: int ret4 = this .getChild(1).analyzeFirstCharacter(result,
0510: options);
0511: if (ret4 == FC_ANY)
0512: return ret4;
0513: return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE
0514: : FC_TERMINAL;
0515:
0516: case CLOSURE:
0517: case NONGREEDYCLOSURE:
0518: this .getChild(0).analyzeFirstCharacter(result, options);
0519: return FC_CONTINUE;
0520:
0521: case EMPTY:
0522: case ANCHOR:
0523: return FC_CONTINUE;
0524:
0525: case CHAR:
0526: int ch = this .getChar();
0527: result.addRange(ch, ch);
0528: if (ch < 0x10000
0529: && isSet(options, RegularExpression.IGNORE_CASE)) {
0530: ch = Character.toUpperCase((char) ch);
0531: result.addRange(ch, ch);
0532: ch = Character.toLowerCase((char) ch);
0533: result.addRange(ch, ch);
0534: }
0535: return FC_TERMINAL;
0536:
0537: case DOT: // ****
0538: if (isSet(options, RegularExpression.SINGLE_LINE)) {
0539: return FC_CONTINUE; // **** We can not optimize.
0540: } else {
0541: return FC_CONTINUE;
0542: /*
0543: result.addRange(0, RegularExpression.LINE_FEED-1);
0544: result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
0545: result.addRange(RegularExpression.CARRIAGE_RETURN+1,
0546: RegularExpression.LINE_SEPARATOR-1);
0547: result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
0548: return 1;
0549: */
0550: }
0551:
0552: case RANGE:
0553: if (isSet(options, RegularExpression.IGNORE_CASE)) {
0554: result.mergeRanges(((RangeToken) this )
0555: .getCaseInsensitiveToken());
0556: } else {
0557: result.mergeRanges(this );
0558: }
0559: return FC_TERMINAL;
0560:
0561: case NRANGE: // ****
0562: if (isSet(options, RegularExpression.IGNORE_CASE)) {
0563: result.mergeRanges(Token
0564: .complementRanges(((RangeToken) this )
0565: .getCaseInsensitiveToken()));
0566: } else {
0567: result.mergeRanges(Token.complementRanges(this ));
0568: }
0569: return FC_TERMINAL;
0570:
0571: case INDEPENDENT:
0572: case PAREN:
0573: return this .getChild(0).analyzeFirstCharacter(result,
0574: options);
0575:
0576: case MODIFIERGROUP:
0577: options |= ((ModifierToken) this ).getOptions();
0578: options &= ~((ModifierToken) this ).getOptionsMask();
0579: return this .getChild(0).analyzeFirstCharacter(result,
0580: options);
0581:
0582: case BACKREFERENCE:
0583: result.addRange(0, UTF16_MAX); // **** We can not optimize.
0584: return FC_ANY;
0585:
0586: case STRING:
0587: int cha = this .getString().charAt(0);
0588: int ch2;
0589: if (REUtil.isHighSurrogate(cha)
0590: && this .getString().length() >= 2
0591: && REUtil.isLowSurrogate((ch2 = this .getString()
0592: .charAt(1))))
0593: cha = REUtil.composeFromSurrogates(cha, ch2);
0594: result.addRange(cha, cha);
0595: if (cha < 0x10000
0596: && isSet(options, RegularExpression.IGNORE_CASE)) {
0597: cha = Character.toUpperCase((char) cha);
0598: result.addRange(cha, cha);
0599: cha = Character.toLowerCase((char) cha);
0600: result.addRange(cha, cha);
0601: }
0602: return FC_TERMINAL;
0603:
0604: case LOOKAHEAD:
0605: case NEGATIVELOOKAHEAD:
0606: case LOOKBEHIND:
0607: case NEGATIVELOOKBEHIND:
0608: return FC_CONTINUE;
0609:
0610: default:
0611: throw new RuntimeException(
0612: "Token#analyzeHeadCharacter(): Invalid Type: "
0613: + this .type);
0614: }
0615: }
0616:
0617: private final boolean isShorterThan(Token tok) {
0618: if (tok == null)
0619: return false;
0620: /*
0621: int mylength;
0622: if (this.type == STRING) mylength = this.getString().length();
0623: else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
0624: else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
0625: int otherlength;
0626: if (tok.type == STRING) otherlength = tok.getString().length();
0627: else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
0628: else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
0629: */
0630: int mylength;
0631: if (this .type == STRING)
0632: mylength = this .getString().length();
0633: else
0634: throw new RuntimeException("Internal Error: Illegal type: "
0635: + this .type);
0636: int otherlength;
0637: if (tok.type == STRING)
0638: otherlength = tok.getString().length();
0639: else
0640: throw new RuntimeException("Internal Error: Illegal type: "
0641: + tok.type);
0642: return mylength < otherlength;
0643: }
0644:
0645: static class FixedStringContainer {
0646: Token token = null;
0647: int options = 0;
0648:
0649: FixedStringContainer() {
0650: }
0651: }
0652:
0653: final void findFixedString(FixedStringContainer container,
0654: int options) {
0655: switch (this .type) {
0656: case CONCAT:
0657: Token prevToken = null;
0658: int prevOptions = 0;
0659: for (int i = 0; i < this .size(); i++) {
0660: this .getChild(i).findFixedString(container, options);
0661: if (prevToken == null
0662: || prevToken.isShorterThan(container.token)) {
0663: prevToken = container.token;
0664: prevOptions = container.options;
0665: }
0666: }
0667: container.token = prevToken;
0668: container.options = prevOptions;
0669: return;
0670:
0671: case UNION:
0672: case CLOSURE:
0673: case NONGREEDYCLOSURE:
0674: case EMPTY:
0675: case ANCHOR:
0676: case RANGE:
0677: case DOT:
0678: case NRANGE:
0679: case BACKREFERENCE:
0680: case LOOKAHEAD:
0681: case NEGATIVELOOKAHEAD:
0682: case LOOKBEHIND:
0683: case NEGATIVELOOKBEHIND:
0684: case CONDITION:
0685: container.token = null;
0686: return;
0687:
0688: case CHAR: // Ignore CHAR tokens.
0689: container.token = null; // **
0690: return; // **
0691:
0692: case STRING:
0693: container.token = this ;
0694: container.options = options;
0695: return;
0696:
0697: case INDEPENDENT:
0698: case PAREN:
0699: this .getChild(0).findFixedString(container, options);
0700: return;
0701:
0702: case MODIFIERGROUP:
0703: options |= ((ModifierToken) this ).getOptions();
0704: options &= ~((ModifierToken) this ).getOptionsMask();
0705: this .getChild(0).findFixedString(container, options);
0706: return;
0707:
0708: default:
0709: throw new RuntimeException(
0710: "Token#findFixedString(): Invalid Type: "
0711: + this .type);
0712: }
0713: }
0714:
0715: boolean match(int ch) {
0716: throw new RuntimeException("NFAArrow#match(): Internal error: "
0717: + this .type);
0718: }
0719:
0720: // ------------------------------------------------------
// Name -> range token for every known category and block name.
// getRange fills this table the first time it finds it empty.
0721: static protected Hashtable categories = new Hashtable();
// Same keys as 'categories', but each value is the complement of the
// corresponding range. Created by getRange together with 'categories'.
0722: static protected Hashtable categories2 = null;
// Category names indexed by the value returned by Character.getType(),
// followed by the extra indexes declared below (29-37). The null entry
// (index 17) is skipped when the tables are built.
0723: static final String[] categoryNames = { "Cn", "Lu", "Ll", "Lt",
0724: "Lm", "Lo", "Mn", "Me", "Mc", "Nd", "Nl", "No", "Zs", "Zl",
0725: "Zp", "Cc", "Cf", null, "Co", "Cs", "Pd", "Ps", "Pe", "Pc",
0726: "Po", "Sm", "Sc", "Sk", "So", // 28
0727: "Pi", "Pf", // 29, 30
0728: "L", "M", "N", "Z", "C", "P", "S", // 31-37
0729: };
0730:
// Indexes into categoryNames for the entries that getRange assigns itself.
0731: // Schema Rec. {Datatypes} - Punctuation
0732: static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
0733: static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
// One-letter groups; getRange adds each character to its group in
// addition to its specific category.
0734: static final int CHAR_LETTER = 31;
0735: static final int CHAR_MARK = 32;
0736: static final int CHAR_NUMBER = 33;
0737: static final int CHAR_SEPARATOR = 34;
0738: static final int CHAR_OTHER = 35;
0739: static final int CHAR_PUNCTUATION = 36;
0740: static final int CHAR_SYMBOL = 37;
0741:
0742: // Block names from Unicode 3.1 that are supported by the XML Schema REC.
// Entry i of this array belongs to the pair of characters at positions 2*i
// and 2*i+1 of blockRanges (that is how getRange reads them), so the two
// tables must be kept in the same order. The comment in front of each name
// shows the intended range of that block.
0743: static final String[] blockNames = {
0744: /*0000..007F;*/"Basic Latin",
0745: /*0080..00FF;*/"Latin-1 Supplement",
0746: /*0100..017F;*/"Latin Extended-A",
0747: /*0180..024F;*/"Latin Extended-B",
0748: /*0250..02AF;*/"IPA Extensions",
0749: /*02B0..02FF;*/"Spacing Modifier Letters",
0750: /*0300..036F;*/"Combining Diacritical Marks",
0751: /*0370..03FF;*/"Greek",
0752: /*0400..04FF;*/"Cyrillic",
0753: /*0530..058F;*/"Armenian",
0754: /*0590..05FF;*/"Hebrew",
0755: /*0600..06FF;*/"Arabic",
0756: /*0700..074F;*/"Syriac",
0757: /*0780..07BF;*/"Thaana",
0758: /*0900..097F;*/"Devanagari",
0759: /*0980..09FF;*/"Bengali",
0760: /*0A00..0A7F;*/"Gurmukhi",
0761: /*0A80..0AFF;*/"Gujarati",
0762: /*0B00..0B7F;*/"Oriya",
0763: /*0B80..0BFF;*/"Tamil",
0764: /*0C00..0C7F;*/"Telugu",
0765: /*0C80..0CFF;*/"Kannada",
0766: /*0D00..0D7F;*/"Malayalam",
0767: /*0D80..0DFF;*/"Sinhala",
0768: /*0E00..0E7F;*/"Thai",
0769: /*0E80..0EFF;*/"Lao",
0770: /*0F00..0FFF;*/"Tibetan",
0771: /*1000..109F;*/"Myanmar",
0772: /*10A0..10FF;*/"Georgian",
0773: /*1100..11FF;*/"Hangul Jamo",
0774: /*1200..137F;*/"Ethiopic",
0775: /*13A0..13FF;*/"Cherokee",
0776: /*1400..167F;*/"Unified Canadian Aboriginal Syllabics",
0777: /*1680..169F;*/"Ogham",
0778: /*16A0..16FF;*/"Runic",
0779: /*1780..17FF;*/"Khmer",
0780: /*1800..18AF;*/"Mongolian",
0781: /*1E00..1EFF;*/"Latin Extended Additional",
0782: /*1F00..1FFF;*/"Greek Extended",
0783: /*2000..206F;*/"General Punctuation",
0784: /*2070..209F;*/"Superscripts and Subscripts",
0785: /*20A0..20CF;*/"Currency Symbols",
0786: /*20D0..20FF;*/"Combining Marks for Symbols",
0787: /*2100..214F;*/"Letterlike Symbols",
0788: /*2150..218F;*/"Number Forms",
0789: /*2190..21FF;*/"Arrows",
0790: /*2200..22FF;*/"Mathematical Operators",
0791: /*2300..23FF;*/"Miscellaneous Technical",
0792: /*2400..243F;*/"Control Pictures",
0793: /*2440..245F;*/"Optical Character Recognition",
0794: /*2460..24FF;*/"Enclosed Alphanumerics",
0795: /*2500..257F;*/"Box Drawing",
0796: /*2580..259F;*/"Block Elements",
0797: /*25A0..25FF;*/"Geometric Shapes",
0798: /*2600..26FF;*/"Miscellaneous Symbols",
0799: /*2700..27BF;*/"Dingbats",
0800: /*2800..28FF;*/"Braille Patterns",
0801: /*2E80..2EFF;*/"CJK Radicals Supplement",
0802: /*2F00..2FDF;*/"Kangxi Radicals",
0803: /*2FF0..2FFF;*/"Ideographic Description Characters",
0804: /*3000..303F;*/"CJK Symbols and Punctuation",
0805: /*3040..309F;*/"Hiragana",
0806: /*30A0..30FF;*/"Katakana",
0807: /*3100..312F;*/"Bopomofo",
0808: /*3130..318F;*/"Hangul Compatibility Jamo",
0809: /*3190..319F;*/"Kanbun",
0810: /*31A0..31BF;*/"Bopomofo Extended",
0811: /*3200..32FF;*/"Enclosed CJK Letters and Months",
0812: /*3300..33FF;*/"CJK Compatibility",
0813: /*3400..4DB5;*/"CJK Unified Ideographs Extension A",
0814: /*4E00..9FFF;*/"CJK Unified Ideographs",
0815: /*A000..A48F;*/"Yi Syllables",
0816: /*A490..A4CF;*/"Yi Radicals",
0817: /*AC00..D7A3;*/"Hangul Syllables",
0818: /*D800..DB7F;*/"High Surrogates",
0819: /*DB80..DBFF;*/"High Private Use Surrogates",
0820: /*DC00..DFFF;*/"Low Surrogates",
0821: /*E000..F8FF;*/"Private Use",
0822: /*F900..FAFF;*/"CJK Compatibility Ideographs",
0823: /*FB00..FB4F;*/"Alphabetic Presentation Forms",
0824: /*FB50..FDFF;*/"Arabic Presentation Forms-A",
0825: /*FE20..FE2F;*/"Combining Half Marks",
0826: /*FE30..FE4F;*/"CJK Compatibility Forms",
0827: /*FE50..FE6F;*/"Small Form Variants",
0828: /*FE70..FEFE;*/"Arabic Presentation Forms-B",
0829: /*FEFF..FEFF;*/"Specials",
0830: /*FF00..FFEF;*/"Halfwidth and Fullwidth Forms",
0831: // the second "Specials" range (FFF0..FFFD) is missing here; getRange adds it manually
0832: /*10300..1032F;*/"Old Italic",
0833: /*10330..1034F;*/"Gothic",
0834: /*10400..1044F;*/"Deseret",
0835: /*1D000..1D0FF;*/"Byzantine Musical Symbols",
0836: /*1D100..1D1FF;*/"Musical Symbols",
0837: /*1D400..1D7FF;*/"Mathematical Alphanumeric Symbols",
0838: /*20000..2A6D6;*/"CJK Unified Ideographs Extension B",
0839: /*2F800..2FA1F;*/"CJK Compatibility Ideographs Supplement",
0840: /*E0000..E007F;*/"Tags",
0841: // the two supplementary "Private Use" ranges are missing here; getRange adds them manually
0842:
0843: };
0844: // Ranges that getRange ADDS MANUALLY because they are not in the table:
0845: //F0000..FFFFD; "Private Use",
0846: //100000..10FFFD; "Private Use"
0847: //FFF0..FFFD; "Specials",
// Start/end pairs for blockNames: getRange reads the range of entry i as
// charAt(2*i) and charAt(2*i + 1) of this string.
//
// NOTE(review): a Java Unicode escape consumes exactly four hex digits, so
// each five-digit entry near the end of this literal (the blocks from
// "Old Italic" onwards, all above FFFF) decodes to TWO chars: a BMP
// character followed by a plain digit or letter. There is also a stray
// letter 's' after the 1D0FF entry. As a result the positions read for
// those blocks do not hold the ranges shown in the blockNames comments.
// A single char cannot hold a value above FFFF, so this cannot be repaired
// inside the literal alone; getRange would have to read those bounds from
// an int table instead. The literal is left untouched here.
0848: static final String blockRanges = "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
0849: + "\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
0850: + "\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
0851: + "\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
0852: + "\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
0853: + "\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
0854: + "\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
0855: + "\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
0856: + "\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
0857: + "\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
0858: + "\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F"
0859: + "\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F";
0860:
0861: static protected RangeToken getRange(String name, boolean positive) {
0862: if (Token.categories.size() == 0) {
0863: synchronized (Token.categories) {
0864: Token[] ranges = new Token[Token.categoryNames.length];
0865: for (int i = 0; i < ranges.length; i++) {
0866: ranges[i] = Token.createRange();
0867: }
0868: int type;
0869: for (int i = 0; i < 0x10000; i++) {
0870: type = Character.getType((char) i);
0871: if (type == Character.START_PUNCTUATION
0872: || type == Character.END_PUNCTUATION) {
0873: //build table of Pi values
0874: if (i == 0x00AB || i == 0x2018 || i == 0x201B
0875: || i == 0x201C || i == 0x201F
0876: || i == 0x2039) {
0877: type = CHAR_INIT_QUOTE;
0878: }
0879: //build table of Pf values
0880: if (i == 0x00BB || i == 0x2019 || i == 0x201D
0881: || i == 0x203A) {
0882: type = CHAR_FINAL_QUOTE;
0883: }
0884: }
0885: ranges[type].addRange(i, i);
0886: switch (type) {
0887: case Character.UPPERCASE_LETTER:
0888: case Character.LOWERCASE_LETTER:
0889: case Character.TITLECASE_LETTER:
0890: case Character.MODIFIER_LETTER:
0891: case Character.OTHER_LETTER:
0892: type = CHAR_LETTER;
0893: break;
0894: case Character.NON_SPACING_MARK:
0895: case Character.COMBINING_SPACING_MARK:
0896: case Character.ENCLOSING_MARK:
0897: type = CHAR_MARK;
0898: break;
0899: case Character.DECIMAL_DIGIT_NUMBER:
0900: case Character.LETTER_NUMBER:
0901: case Character.OTHER_NUMBER:
0902: type = CHAR_NUMBER;
0903: break;
0904: case Character.SPACE_SEPARATOR:
0905: case Character.LINE_SEPARATOR:
0906: case Character.PARAGRAPH_SEPARATOR:
0907: type = CHAR_SEPARATOR;
0908: break;
0909: case Character.CONTROL:
0910: case Character.FORMAT:
0911: case Character.SURROGATE:
0912: case Character.PRIVATE_USE:
0913: case Character.UNASSIGNED:
0914: type = CHAR_OTHER;
0915: break;
0916: case Character.CONNECTOR_PUNCTUATION:
0917: case Character.DASH_PUNCTUATION:
0918: case Character.START_PUNCTUATION:
0919: case Character.END_PUNCTUATION:
0920: case CHAR_INIT_QUOTE:
0921: case CHAR_FINAL_QUOTE:
0922: case Character.OTHER_PUNCTUATION:
0923: type = CHAR_PUNCTUATION;
0924: break;
0925: case Character.MATH_SYMBOL:
0926: case Character.CURRENCY_SYMBOL:
0927: case Character.MODIFIER_SYMBOL:
0928: case Character.OTHER_SYMBOL:
0929: type = CHAR_SYMBOL;
0930: break;
0931: default:
0932: throw new RuntimeException(
0933: "org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "
0934: + type);
0935: }
0936: ranges[type].addRange(i, i);
0937: } // for all characters
0938: ranges[Character.UNASSIGNED].addRange(0x10000,
0939: Token.UTF16_MAX);
0940:
0941: Token.categories2 = new Hashtable();
0942: for (int i = 0; i < ranges.length; i++) {
0943: if (Token.categoryNames[i] != null) {
0944: if (i == Character.UNASSIGNED) { // Unassigned
0945: ranges[i]
0946: .addRange(0x10000, Token.UTF16_MAX);
0947: }
0948: Token.categories.put(Token.categoryNames[i],
0949: ranges[i]);
0950: Token.categories2.put(Token.categoryNames[i],
0951: Token.complementRanges(ranges[i]));
0952: }
0953: }
0954: //REVISIT: do we really need to support block names as in Unicode 3.1
0955: // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
0956: //
0957: StringBuffer buffer = new StringBuffer(50);
0958: int location = 0;
0959: for (int i = 0; i < Token.blockNames.length; i++) {
0960: Token r1 = Token.createRange();
0961: location = i * 2;
0962: int rstart = Token.blockRanges.charAt(location);
0963: int rend = Token.blockRanges.charAt(location + 1);
0964: String n = Token.blockNames[i];
0965: //DEBUGING
0966: //System.out.println(n+" " +Integer.toHexString(rstart)
0967: // +"-"+ Integer.toHexString(rend));
0968: r1.addRange(rstart, rend);
0969: if (n.equals("Specials"))
0970: r1.addRange(0xfff0, 0xfffd);
0971: if (n.equals("Private Use")) {
0972: r1.addRange(0xF0000, 0xFFFFD);
0973: r1.addRange(0x100000, 0x10FFFD);
0974: }
0975: Token.categories.put(n, r1);
0976: Token.categories2
0977: .put(n, Token.complementRanges(r1));
0978: buffer.setLength(0);
0979: buffer.append("Is");
0980: if (n.indexOf(' ') >= 0) {
0981: for (int ci = 0; ci < n.length(); ci++)
0982: if (n.charAt(ci) != ' ')
0983: buffer.append((char) n.charAt(ci));
0984: } else {
0985: buffer.append(n);
0986: }
0987: Token.setAlias(buffer.toString(), n, true);
0988: }
0989:
0990: // REVISIT: remove this code later
0991: // the following does not match the XML Schema definition
0992: // for Regular Expressions
0993:
0994: /*
0995: // TR#18 1.2
0996: Token.setAlias("ASSIGNED", "Cn", false);
0997: Token.setAlias("UNASSIGNED", "Cn", true);
0998: Token all = Token.createRange();
0999: all.addRange(0, Token.UTF16_MAX);
1000: Token.categories.put("ALL", all);
1001: Token.categories2.put("ALL", Token.complementRanges(all));
1002: */
1003:
1004: /*
1005: Token isalpha = Token.createRange();
1006: isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
1007: isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
1008: isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
1009: Token.categories.put("IsAlpha", isalpha);
1010: Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
1011:
1012: Token isalnum = Token.createRange();
1013: isalnum.mergeRanges(isalpha); // Lu Ll Lo
1014: isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
1015: Token.categories.put("IsAlnum", isalnum);
1016: Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
1017:
1018: Token isspace = Token.createRange();
1019: isspace.mergeRanges(Token.token_spaces);
1020: isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
1021: Token.categories.put("IsSpace", isspace);
1022: Token.categories2.put("IsSpace", Token.complementRanges(isspace));
1023:
1024: Token isword = Token.createRange();
1025: isword.mergeRanges(isalnum); // Lu Ll Lo Nd
1026: isword.addRange('_', '_');
1027: Token.categories.put("IsWord", isword);
1028: Token.categories2.put("IsWord", Token.complementRanges(isword));
1029:
1030: Token isascii = Token.createRange();
1031: isascii.addRange(0, 127);
1032: Token.categories.put("IsASCII", isascii);
1033: Token.categories2.put("IsASCII", Token.complementRanges(isascii));
1034:
1035: Token isnotgraph = Token.createRange();
1036: isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
1037: isnotgraph.addRange(' ', ' ');
1038: Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
1039: Token.categories2.put("IsGraph", isnotgraph);
1040:
1041: Token isxdigit = Token.createRange();
1042: isxdigit.addRange('0', '9');
1043: isxdigit.addRange('A', 'F');
1044: isxdigit.addRange('a', 'f');
1045: Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
1046: Token.categories2.put("IsXDigit", isxdigit);
1047:
1048: Token.setAlias("IsDigit", "Nd", true);
1049: Token.setAlias("IsUpper", "Lu", true);
1050: Token.setAlias("IsLower", "Ll", true);
1051: Token.setAlias("IsCntrl", "C", true);
1052: Token.setAlias("IsPrint", "C", false);
1053: Token.setAlias("IsPunct", "P", true);
1054:
1055: Token.setAlias("alpha", "IsAlpha", true);
1056: Token.setAlias("alnum", "IsAlnum", true);
1057: Token.setAlias("ascii", "IsASCII", true);
1058: Token.setAlias("cntrl", "IsCntrl", true);
1059: Token.setAlias("digit", "IsDigit", true);
1060: Token.setAlias("graph", "IsGraph", true);
1061: Token.setAlias("lower", "IsLower", true);
1062: Token.setAlias("print", "IsPrint", true);
1063: Token.setAlias("punct", "IsPunct", true);
1064: Token.setAlias("space", "IsSpace", true);
1065: Token.setAlias("upper", "IsUpper", true);
1066: Token.setAlias("word", "IsWord", true); // Perl extension
1067: Token.setAlias("xdigit", "IsXDigit", true);
1068: */
1069: } // synchronized
1070: } // if null
1071: RangeToken tok = positive ? (RangeToken) Token.categories
1072: .get(name) : (RangeToken) Token.categories2.get(name);
1073: if (tok == null)
1074: System.out.println(name);
1075: return tok;
1076: }
1077:
1078: private static void setAlias(String newName, String name,
1079: boolean positive) {
1080: Token t1 = (Token) Token.categories.get(name);
1081: Token t2 = (Token) Token.categories2.get(name);
1082: if (positive) {
1083: Token.categories.put(newName, t1);
1084: Token.categories2.put(newName, t2);
1085: } else {
1086: Token.categories2.put(newName, t1);
1087: Token.categories.put(newName, t2);
1088: }
1089: }
1090:
1091: // ------------------------------------------------------
1092:
1093: static final String viramaString = "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1094: + "\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1095: + "\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1096: + "\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1097: + "\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1098: + "\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1099: + "\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1100: + "\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1101: + "\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1102: + "\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1103: + "\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1104:
1105: static private Token token_grapheme = null;
1106:
1107: static synchronized protected Token getGraphemePattern() {
1108: if (Token.token_grapheme != null)
1109: return Token.token_grapheme;
1110:
1111: Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1112: base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1113: base_char.subtractRanges(Token.getRange("M", true));
1114: base_char.subtractRanges(Token.getRange("C", true));
1115:
1116: Token virama = Token.createRange();
1117: for (int i = 0; i < Token.viramaString.length(); i++) {
1118: int ch = viramaString.charAt(i);
1119: virama.addRange(i, i);
1120: }
1121:
1122: Token combiner_wo_virama = Token.createRange();
1123: combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1124: combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1125: combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1126:
1127: Token left = Token.createUnion(); // base_char?
1128: left.addChild(base_char);
1129: left.addChild(Token.token_empty);
1130:
1131: Token foo = Token.createUnion();
1132: foo.addChild(Token.createConcat(virama, Token.getRange("L",
1133: true)));
1134: foo.addChild(combiner_wo_virama);
1135:
1136: foo = Token.createClosure(foo);
1137:
1138: foo = Token.createConcat(left, foo);
1139:
1140: Token.token_grapheme = foo;
1141: return Token.token_grapheme;
1142: }
1143:
1144: /**
1145: * Combing Character Sequence in Perl 5.6.
1146: */
1147: static private Token token_ccs = null;
1148:
1149: static synchronized protected Token getCombiningCharacterSequence() {
1150: if (Token.token_ccs != null)
1151: return Token.token_ccs;
1152:
1153: Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1154: foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1155: Token.token_ccs = foo;
1156: return Token.token_ccs;
1157: }
1158:
1159: // ------------------------------------------------------
1160:
1161: // ------------------------------------------------------
1162: /**
1163: * This class represents a node in parse tree.
1164: */
1165: static class StringToken extends Token implements
1166: java.io.Serializable {
1167: String string;
1168: int refNumber;
1169:
1170: StringToken(int type, String str, int n) {
1171: super (type);
1172: this .string = str;
1173: this .refNumber = n;
1174: }
1175:
1176: int getReferenceNumber() { // for STRING
1177: return this .refNumber;
1178: }
1179:
1180: String getString() { // for STRING
1181: return this .string;
1182: }
1183:
1184: public String toString(int options) {
1185: if (this .type == BACKREFERENCE)
1186: return "\\" + this .refNumber;
1187: else
1188: return REUtil.quoteMeta(this .string);
1189: }
1190: }
1191:
1192: /**
1193: * This class represents a node in parse tree.
1194: */
1195: static class ConcatToken extends Token implements
1196: java.io.Serializable {
1197: Token child;
1198: Token child2;
1199:
1200: ConcatToken(Token t1, Token t2) {
1201: super (Token.CONCAT);
1202: this .child = t1;
1203: this .child2 = t2;
1204: }
1205:
1206: int size() {
1207: return 2;
1208: }
1209:
1210: Token getChild(int index) {
1211: return index == 0 ? this .child : this .child2;
1212: }
1213:
1214: public String toString(int options) {
1215: String ret;
1216: if (this .child2.type == CLOSURE
1217: && this .child2.getChild(0) == this .child) {
1218: ret = this .child.toString(options) + "+";
1219: } else if (this .child2.type == NONGREEDYCLOSURE
1220: && this .child2.getChild(0) == this .child) {
1221: ret = this .child.toString(options) + "+?";
1222: } else
1223: ret = this .child.toString(options)
1224: + this .child2.toString(options);
1225: return ret;
1226: }
1227: }
1228:
1229: /**
1230: * This class represents a node in parse tree.
1231: */
1232: static class CharToken extends Token implements
1233: java.io.Serializable {
1234: int chardata;
1235:
1236: CharToken(int type, int ch) {
1237: super (type);
1238: this .chardata = ch;
1239: }
1240:
1241: int getChar() {
1242: return this .chardata;
1243: }
1244:
1245: public String toString(int options) {
1246: String ret;
1247: switch (this .type) {
1248: case CHAR:
1249: switch (this .chardata) {
1250: case '|':
1251: case '*':
1252: case '+':
1253: case '?':
1254: case '(':
1255: case ')':
1256: case '.':
1257: case '[':
1258: case '{':
1259: case '\\':
1260: ret = "\\" + (char) this .chardata;
1261: break;
1262: case '\f':
1263: ret = "\\f";
1264: break;
1265: case '\n':
1266: ret = "\\n";
1267: break;
1268: case '\r':
1269: ret = "\\r";
1270: break;
1271: case '\t':
1272: ret = "\\t";
1273: break;
1274: case 0x1b:
1275: ret = "\\e";
1276: break;
1277: //case 0x0b: ret = "\\v"; break;
1278: default:
1279: if (this .chardata >= 0x10000) {
1280: String pre = "0"
1281: + Integer.toHexString(this .chardata);
1282: ret = "\\v"
1283: + pre.substring(pre.length() - 6, pre
1284: .length());
1285: } else
1286: ret = "" + (char) this .chardata;
1287: }
1288: break;
1289:
1290: case ANCHOR:
1291: if (this == Token.token_linebeginning
1292: || this == Token.token_lineend)
1293: ret = "" + (char) this .chardata;
1294: else
1295: ret = "\\" + (char) this .chardata;
1296: break;
1297:
1298: default:
1299: ret = null;
1300: }
1301: return ret;
1302: }
1303:
1304: boolean match(int ch) {
1305: if (this .type == CHAR) {
1306: return ch == this .chardata;
1307: } else
1308: throw new RuntimeException(
1309: "NFAArrow#match(): Internal error: "
1310: + this .type);
1311: }
1312: }
1313:
1314: /**
1315: * This class represents a node in parse tree.
1316: */
1317: static class ClosureToken extends Token implements
1318: java.io.Serializable {
1319: int min;
1320: int max;
1321: Token child;
1322:
1323: ClosureToken(int type, Token tok) {
1324: super (type);
1325: this .child = tok;
1326: this .setMin(-1);
1327: this .setMax(-1);
1328: }
1329:
1330: int size() {
1331: return 1;
1332: }
1333:
1334: Token getChild(int index) {
1335: return this .child;
1336: }
1337:
1338: final void setMin(int min) {
1339: this .min = min;
1340: }
1341:
1342: final void setMax(int max) {
1343: this .max = max;
1344: }
1345:
1346: final int getMin() {
1347: return this .min;
1348: }
1349:
1350: final int getMax() {
1351: return this .max;
1352: }
1353:
1354: public String toString(int options) {
1355: String ret;
1356: if (this .type == CLOSURE) {
1357: if (this .getMin() < 0 && this .getMax() < 0) {
1358: ret = this .child.toString(options) + "*";
1359: } else if (this .getMin() == this .getMax()) {
1360: ret = this .child.toString(options) + "{"
1361: + this .getMin() + "}";
1362: } else if (this .getMin() >= 0 && this .getMax() >= 0) {
1363: ret = this .child.toString(options) + "{"
1364: + this .getMin() + "," + this .getMax() + "}";
1365: } else if (this .getMin() >= 0 && this .getMax() < 0) {
1366: ret = this .child.toString(options) + "{"
1367: + this .getMin() + ",}";
1368: } else
1369: throw new RuntimeException(
1370: "Token#toString(): CLOSURE "
1371: + this .getMin() + ", "
1372: + this .getMax());
1373: } else {
1374: if (this .getMin() < 0 && this .getMax() < 0) {
1375: ret = this .child.toString(options) + "*?";
1376: } else if (this .getMin() == this .getMax()) {
1377: ret = this .child.toString(options) + "{"
1378: + this .getMin() + "}?";
1379: } else if (this .getMin() >= 0 && this .getMax() >= 0) {
1380: ret = this .child.toString(options) + "{"
1381: + this .getMin() + "," + this .getMax()
1382: + "}?";
1383: } else if (this .getMin() >= 0 && this .getMax() < 0) {
1384: ret = this .child.toString(options) + "{"
1385: + this .getMin() + ",}?";
1386: } else
1387: throw new RuntimeException(
1388: "Token#toString(): NONGREEDYCLOSURE "
1389: + this .getMin() + ", "
1390: + this .getMax());
1391: }
1392: return ret;
1393: }
1394: }
1395:
1396: /**
1397: * This class represents a node in parse tree.
1398: */
1399: static class ParenToken extends Token implements
1400: java.io.Serializable {
1401: Token child;
1402: int parennumber;
1403:
1404: ParenToken(int type, Token tok, int paren) {
1405: super (type);
1406: this .child = tok;
1407: this .parennumber = paren;
1408: }
1409:
1410: int size() {
1411: return 1;
1412: }
1413:
1414: Token getChild(int index) {
1415: return this .child;
1416: }
1417:
1418: int getParenNumber() {
1419: return this .parennumber;
1420: }
1421:
1422: public String toString(int options) {
1423: String ret = null;
1424: switch (this .type) {
1425: case PAREN:
1426: if (this .parennumber == 0) {
1427: ret = "(?:" + this .child.toString(options) + ")";
1428: } else {
1429: ret = "(" + this .child.toString(options) + ")";
1430: }
1431: break;
1432:
1433: case LOOKAHEAD:
1434: ret = "(?=" + this .child.toString(options) + ")";
1435: break;
1436: case NEGATIVELOOKAHEAD:
1437: ret = "(?!" + this .child.toString(options) + ")";
1438: break;
1439: case LOOKBEHIND:
1440: ret = "(?<=" + this .child.toString(options) + ")";
1441: break;
1442: case NEGATIVELOOKBEHIND:
1443: ret = "(?<!" + this .child.toString(options) + ")";
1444: break;
1445: case INDEPENDENT:
1446: ret = "(?>" + this .child.toString(options) + ")";
1447: break;
1448: }
1449: return ret;
1450: }
1451: }
1452:
1453: /**
1454: * (?(condition)yes-pattern|no-pattern)
1455: */
1456: static class ConditionToken extends Token implements
1457: java.io.Serializable {
1458: int refNumber;
1459: Token condition;
1460: Token yes;
1461: Token no;
1462:
1463: ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1464: super (Token.CONDITION);
1465: this .refNumber = refno;
1466: this .condition = cond;
1467: this .yes = yespat;
1468: this .no = nopat;
1469: }
1470:
1471: int size() {
1472: return this .no == null ? 1 : 2;
1473: }
1474:
1475: Token getChild(int index) {
1476: if (index == 0)
1477: return this .yes;
1478: if (index == 1)
1479: return this .no;
1480: throw new RuntimeException("Internal Error: " + index);
1481: }
1482:
1483: public String toString(int options) {
1484: String ret;
1485: if (refNumber > 0) {
1486: ret = "(?(" + refNumber + ")";
1487: } else if (this .condition.type == Token.ANCHOR) {
1488: ret = "(?(" + this .condition + ")";
1489: } else {
1490: ret = "(?" + this .condition;
1491: }
1492:
1493: if (this .no == null) {
1494: ret += this .yes + ")";
1495: } else {
1496: ret += this .yes + "|" + this .no + ")";
1497: }
1498: return ret;
1499: }
1500: }
1501:
1502: /**
1503: * (ims-ims: .... )
1504: */
1505: static class ModifierToken extends Token implements
1506: java.io.Serializable {
1507: Token child;
1508: int add;
1509: int mask;
1510:
1511: ModifierToken(Token tok, int add, int mask) {
1512: super (Token.MODIFIERGROUP);
1513: this .child = tok;
1514: this .add = add;
1515: this .mask = mask;
1516: }
1517:
1518: int size() {
1519: return 1;
1520: }
1521:
1522: Token getChild(int index) {
1523: return this .child;
1524: }
1525:
1526: int getOptions() {
1527: return this .add;
1528: }
1529:
1530: int getOptionsMask() {
1531: return this .mask;
1532: }
1533:
1534: public String toString(int options) {
1535: return "(?"
1536: + (this .add == 0 ? "" : REUtil
1537: .createOptionString(this .add))
1538: + (this .mask == 0 ? "" : REUtil
1539: .createOptionString(this .mask)) + ":"
1540: + this .child.toString(options) + ")";
1541: }
1542: }
1543:
1544: /**
1545: * This class represents a node in parse tree.
1546: * for UNION or CONCAT.
1547: */
1548: static class UnionToken extends Token implements
1549: java.io.Serializable {
1550: Vector children;
1551:
1552: UnionToken(int type) {
1553: super (type);
1554: }
1555:
1556: void addChild(Token tok) {
1557: if (tok == null)
1558: return;
1559: if (this .children == null)
1560: this .children = new Vector();
1561: if (this .type == UNION) {
1562: this .children.addElement(tok);
1563: return;
1564: }
1565: // This is CONCAT, and new child is CONCAT.
1566: if (tok.type == CONCAT) {
1567: for (int i = 0; i < tok.size(); i++)
1568: this .addChild(tok.getChild(i)); // Recursion
1569: return;
1570: }
1571: int size = this .children.size();
1572: if (size == 0) {
1573: this .children.addElement(tok);
1574: return;
1575: }
1576: Token previous = (Token) this .children.elementAt(size - 1);
1577: if (!((previous.type == CHAR || previous.type == STRING) && (tok.type == CHAR || tok.type == STRING))) {
1578: this .children.addElement(tok);
1579: return;
1580: }
1581:
1582: //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1583:
1584: StringBuffer buffer;
1585: int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString()
1586: .length());
1587: if (previous.type == CHAR) { // Replace previous token by STRING
1588: buffer = new StringBuffer(2 + nextMaxLength);
1589: int ch = previous.getChar();
1590: if (ch >= 0x10000)
1591: buffer.append(REUtil.decomposeToSurrogates(ch));
1592: else
1593: buffer.append((char) ch);
1594: previous = Token.createString(null);
1595: this .children.setElementAt(previous, size - 1);
1596: } else { // STRING
1597: buffer = new StringBuffer(previous.getString().length()
1598: + nextMaxLength);
1599: buffer.append(previous.getString());
1600: }
1601:
1602: if (tok.type == CHAR) {
1603: int ch = tok.getChar();
1604: if (ch >= 0x10000)
1605: buffer.append(REUtil.decomposeToSurrogates(ch));
1606: else
1607: buffer.append((char) ch);
1608: } else {
1609: buffer.append(tok.getString());
1610: }
1611:
1612: ((StringToken) previous).string = new String(buffer);
1613: }
1614:
1615: int size() {
1616: return this .children == null ? 0 : this .children.size();
1617: }
1618:
1619: Token getChild(int index) {
1620: return (Token) this .children.elementAt(index);
1621: }
1622:
1623: public String toString(int options) {
1624: String ret;
1625: if (this .type == CONCAT) {
1626: if (this .children.size() == 2) {
1627: Token ch = this .getChild(0);
1628: Token ch2 = this .getChild(1);
1629: if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1630: ret = ch.toString(options) + "+";
1631: } else if (ch2.type == NONGREEDYCLOSURE
1632: && ch2.getChild(0) == ch) {
1633: ret = ch.toString(options) + "+?";
1634: } else
1635: ret = ch.toString(options)
1636: + ch2.toString(options);
1637: } else {
1638: StringBuffer sb = new StringBuffer();
1639: for (int i = 0; i < this .children.size(); i++) {
1640: sb.append(((Token) this .children.elementAt(i))
1641: .toString(options));
1642: }
1643: ret = new String(sb);
1644: }
1645: return ret;
1646: }
1647: if (this .children.size() == 2
1648: && this .getChild(1).type == EMPTY) {
1649: ret = this .getChild(0).toString(options) + "?";
1650: } else if (this .children.size() == 2
1651: && this .getChild(0).type == EMPTY) {
1652: ret = this .getChild(1).toString(options) + "??";
1653: } else {
1654: StringBuffer sb = new StringBuffer();
1655: sb.append(((Token) this .children.elementAt(0))
1656: .toString(options));
1657: for (int i = 1; i < this .children.size(); i++) {
1658: sb.append((char) '|');
1659: sb.append(((Token) this .children.elementAt(i))
1660: .toString(options));
1661: }
1662: ret = new String(sb);
1663: }
1664: return ret;
1665: }
1666: }
1667: }
|