0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: /**
0019: * @author Nikolay A. Kuznetsov
0020: * @version $Revision: 1.21.2.2 $
0021: */package java.util.regex;
0022:
0023: import java.util.MissingResourceException;
0024:
0025: import org.apache.harmony.regex.internal.nls.Messages;
0026:
0027: /**
0028: * The purpose of this class is to break given pattern into RE tokens;
0029: *
0030: * @author Nikolay A. Kuznetsov
0031: * @version $Revision: 1.21.2.2 $
0032: */
0033: class Lexer {
0034:
0035: public static final int CHAR_DOLLAR = 0xe0000000 | '$';
0036:
0037: public static final int CHAR_RIGHT_PARENTHESIS = 0xe0000000 | ')';
0038:
0039: public static final int CHAR_LEFT_SQUARE_BRACKET = 0xe0000000 | '[';
0040:
0041: public static final int CHAR_RIGHT_SQUARE_BRACKET = 0xe0000000 | ']';
0042:
0043: public static final int CHAR_CARET = 0xe0000000 | '^';
0044:
0045: public static final int CHAR_VERTICAL_BAR = 0xe0000000 | '|';
0046:
0047: public static final int CHAR_AMPERSAND = 0xe0000000 | '&';
0048:
0049: public static final int CHAR_HYPHEN = 0xe0000000 | '-';
0050:
0051: public static final int CHAR_DOT = 0xe0000000 | '.';
0052:
0053: public static final int QMOD_GREEDY = 0xe0000000;
0054:
0055: public static final int QMOD_RELUCTANT = 0xc0000000;
0056:
0057: public static final int QMOD_POSSESSIVE = 0x80000000;
0058:
0059: public static final int QUANT_STAR = QMOD_GREEDY | '*';
0060:
0061: public static final int QUANT_STAR_P = QMOD_POSSESSIVE | '*';
0062:
0063: public static final int QUANT_STAR_R = QMOD_RELUCTANT | '*';
0064:
0065: public static final int QUANT_PLUS = QMOD_GREEDY | '+';
0066:
0067: public static final int QUANT_PLUS_P = QMOD_POSSESSIVE | '+';
0068:
0069: public static final int QUANT_PLUS_R = QMOD_RELUCTANT | '+';
0070:
0071: public static final int QUANT_ALT = QMOD_GREEDY | '?';
0072:
0073: public static final int QUANT_ALT_P = QMOD_POSSESSIVE | '?';
0074:
0075: public static final int QUANT_ALT_R = QMOD_RELUCTANT | '?';
0076:
0077: public static final int QUANT_COMP = QMOD_GREEDY | '{';
0078:
0079: public static final int QUANT_COMP_P = QMOD_POSSESSIVE | '{';
0080:
0081: public static final int QUANT_COMP_R = QMOD_RELUCTANT | '{';
0082:
0083: public static final int CHAR_LEFT_PARENTHESIS = 0x80000000 | '(';
0084:
0085: public static final int CHAR_NONCAP_GROUP = 0xc0000000 | '(';
0086:
0087: public static final int CHAR_POS_LOOKAHEAD = 0xe0000000 | '(';
0088:
0089: public static final int CHAR_NEG_LOOKAHEAD = 0xf0000000 | '(';
0090:
0091: public static final int CHAR_POS_LOOKBEHIND = 0xf8000000 | '(';
0092:
0093: public static final int CHAR_NEG_LOOKBEHIND = 0xfc000000 | '(';
0094:
0095: public static final int CHAR_ATOMIC_GROUP = 0xfe000000 | '(';
0096:
0097: public static final int CHAR_FLAGS = 0xff000000 | '(';
0098:
0099: public static final int CHAR_START_OF_INPUT = 0x80000000 | 'A';
0100:
0101: public static final int CHAR_WORD_BOUND = 0x80000000 | 'b';
0102:
0103: public static final int CHAR_NONWORD_BOUND = 0x80000000 | 'B';
0104:
0105: public static final int CHAR_PREVIOUS_MATCH = 0x80000000 | 'G';
0106:
0107: public static final int CHAR_END_OF_INPUT = 0x80000000 | 'z';
0108:
0109: public static final int CHAR_END_OF_LINE = 0x80000000 | 'Z';
0110:
0111: public static final int MODE_PATTERN = 1 << 0;
0112:
0113: public static final int MODE_RANGE = 1 << 1;
0114:
0115: public static final int MODE_ESCAPE = 1 << 2;
0116:
0117: //maximum length of decomposition
0118: static final int MAX_DECOMPOSITION_LENGTH = 4;
0119:
0120: /*
0121: * maximum length of Hangul decomposition
0122: * note that MAX_HANGUL_DECOMPOSITION_LENGTH <= MAX_DECOMPOSITION_LENGTH
0123: */
0124: static final int MAX_HANGUL_DECOMPOSITION_LENGTH = 3;
0125:
0126: /*
0127: * Following constants are needed for Hangul canonical decomposition.
0128: * Hangul decomposition algorithm and constants are taken according
0129: * to description at http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
0130: * "3.12 Conjoining Jamo Behavior"
0131: */
0132: static final int SBase = 0xAC00;
0133:
0134: static final int LBase = 0x1100;
0135:
0136: static final int VBase = 0x1161;
0137:
0138: static final int TBase = 0x11A7;
0139:
0140: static final int SCount = 11172;
0141:
0142: static final int LCount = 19;
0143:
0144: static final int VCount = 21;
0145:
0146: static final int TCount = 28;
0147:
0148: static final int NCount = 588;
0149:
0150: //table that contains canonical decomposition mappings
0151: private static IntArrHash decompTable = null;
0152:
0153: //table that contains canonical combining classes
0154: private static IntHash canonClassesTable = null;
0155:
0156: private static int canonClassesTableSize;
0157:
0158: /*
0159: * Table that contains information about Unicode codepoints with
0160: * single codepoint decomposition
0161: */
0162: private static IntHash singleDecompTable = null;
0163:
0164: private static int singleDecompTableSize;
0165:
0166: private char[] pattern = null;
0167:
0168: private int flags = 0;
0169:
0170: private int mode = 1;
0171:
0172: // when in literal mode, this field will save the previous one
0173: private int saved_mode = 0;
0174:
0175: // previous char read
0176: private int lookBack;
0177:
0178: //current character read
0179: private int ch;
0180:
0181: //next character
0182: private int lookAhead;
0183:
0184: //index of last char in pattern plus one
0185: private int patternFullLength = 0;
0186:
0187: // cur special token
0188: private SpecialToken curST = null;
0189:
0190: // next special token
0191: private SpecialToken lookAheadST = null;
0192:
0193: // cur char being processed
0194: private int index = 0;
0195:
0196: // previous non-whitespace character index;
0197: private int prevNW = 0;
0198:
0199: // cur token start index
0200: private int curToc = 0;
0201:
0202: // look ahead token index
0203: private int lookAheadToc = 0;
0204:
0205: // original string representing pattern
0206: private String orig = null;
0207:
0208: public Lexer(String pattern, int flags) {
0209: orig = pattern;
0210: if ((flags & Pattern.LITERAL) > 0) {
0211: pattern = Pattern.quote(pattern);
0212: } else if ((flags & Pattern.CANON_EQ) > 0) {
0213: pattern = Lexer.normalize(pattern);
0214: }
0215:
0216: this .pattern = new char[pattern.length() + 2];
0217: System.arraycopy(pattern.toCharArray(), 0, this .pattern, 0,
0218: pattern.length());
0219: this .pattern[this .pattern.length - 1] = 0;
0220: this .pattern[this .pattern.length - 2] = 0;
0221: patternFullLength = this .pattern.length;
0222: this .flags = flags;
0223: // read first two tokens;
0224: movePointer();
0225: movePointer();
0226:
0227: }
0228:
0229: /**
0230: * Returns current character w/o reading next one; if there are no more
0231: * characters returns 0;
0232: *
0233: * @return current character;
0234: */
0235: public int peek() {
0236: return ch;
0237: }
0238:
0239: /**
0240: * Set the Lexer to PATTERN or RANGE mode; Lexer interpret character two
0241: * different ways in parser or range modes.
0242: *
0243: * @param mode
0244: * Lexer.PATTERN or Lexer.RANGE
0245: */
0246: public void setMode(int mode) {
0247: if (mode > 0 && mode < 3) {
0248: this .mode = mode;
0249: }
0250:
0251: if (mode == Lexer.MODE_PATTERN) {
0252: reread();
0253: }
0254: }
0255:
0256: /**
0257: * Restores flags for Lexer
0258: *
0259: * @param flags
0260: */
0261: public void restoreFlags(int flags) {
0262: this .flags = flags;
0263: lookAhead = ch;
0264: lookAheadST = curST;
0265:
0266: //curToc is an index of closing bracket )
0267: index = curToc + 1;
0268: lookAheadToc = curToc;
0269: movePointer();
0270: }
0271:
0272: public SpecialToken peekSpecial() {
0273: return curST;
0274: }
0275:
0276: /**
0277: * Returns true, if current token is special, i.e. quantifier, or other
0278: * compound token.
0279: *
0280: * @return - true if current token is special, false otherwise.
0281: */
0282: public boolean isSpecial() {
0283: return curST != null;
0284: }
0285:
0286: public boolean isQuantifier() {
0287: return isSpecial()
0288: && curST.getType() == SpecialToken.TOK_QUANTIFIER;
0289: }
0290:
0291: public boolean isNextSpecial() {
0292: return lookAheadST != null;
0293: }
0294:
0295: /**
0296: * Returns current character and moves string index to the next one;
0297: *
0298: */
0299: public int next() {
0300: movePointer();
0301: return lookBack;
0302: }
0303:
0304: /**
0305: * Returns current special token and moves string index to the next one;
0306: */
0307: public SpecialToken nextSpecial() {
0308: SpecialToken res = curST;
0309: movePointer();
0310: return res;
0311: }
0312:
0313: /**
0314: * Returns nest symbol read.
0315: */
0316: public int lookAhead() {
0317: return lookAhead;
0318: }
0319:
0320: /**
0321: * Returns previous character.
0322: */
0323: public int back() {
0324: return lookBack;
0325: }
0326:
0327: /**
0328: * Normalize given expression.
0329: *
0330: * @param input - expression to normalize
0331: * @return normalized expression.
0332: */
0333: static String normalize(String input) {
0334: char[] inputChars = input.toCharArray();
0335: int inputLength = inputChars.length;
0336: int resCodePointsIndex = 0;
0337: int inputCodePointsIndex = 0;
0338: int decompHangulIndex = 0;
0339:
0340: //codePoints of input
0341: int[] inputCodePoints = new int[inputLength];
0342:
0343: //result of canonical decomposition of input
0344: int[] resCodePoints = new int[inputLength
0345: * MAX_DECOMPOSITION_LENGTH];
0346:
0347: //current symbol's codepoint
0348: int ch;
0349:
0350: //current symbol's decomposition
0351: int[] decomp;
0352:
0353: //result of canonical and Hangul decomposition of input
0354: int[] decompHangul;
0355:
0356: //result of canonical decomposition of input in UTF-16 encoding
0357: StringBuffer result = new StringBuffer();
0358:
0359: decompTable = HashDecompositions.getHashDecompositions();
0360: canonClassesTable = CanClasses.getHashCanClasses();
0361: canonClassesTableSize = canonClassesTable.size;
0362: singleDecompTable = SingleDecompositions
0363: .getHashSingleDecompositions();
0364: singleDecompTableSize = singleDecompTable.size;
0365:
0366: for (int i = 0; i < inputLength; i += Character.charCount(ch)) {
0367: ch = Character.codePointAt(inputChars, i);
0368: inputCodePoints[inputCodePointsIndex++] = ch;
0369: }
0370:
0371: /*
0372: * Canonical decomposition based on mappings in decompTable
0373: */
0374: for (int i = 0; i < inputCodePointsIndex; i++) {
0375: ch = inputCodePoints[i];
0376:
0377: decomp = Lexer.getDecomposition(ch);
0378: if (decomp == null) {
0379: resCodePoints[resCodePointsIndex++] = ch;
0380: } else {
0381: int curSymbDecompLength = decomp.length;
0382:
0383: for (int j = 0; j < curSymbDecompLength; j++) {
0384: resCodePoints[resCodePointsIndex++] = decomp[j];
0385: }
0386: }
0387: }
0388:
0389: /*
0390: * Canonical ordering.
0391: * See http://www.unicode.org/reports/tr15/#Decomposition for
0392: * details
0393: */
0394: resCodePoints = Lexer.getCanonicalOrder(resCodePoints,
0395: resCodePointsIndex);
0396:
0397: /*
0398: * Decomposition for Hangul syllables.
0399: * See http://www.unicode.org/reports/tr15/#Hangul for
0400: * details
0401: */
0402: decompHangul = new int[resCodePoints.length];
0403:
0404: for (int i = 0; i < resCodePointsIndex; i++) {
0405: int curSymb = resCodePoints[i];
0406:
0407: decomp = getHangulDecomposition(curSymb);
0408: if (decomp == null) {
0409: decompHangul[decompHangulIndex++] = curSymb;
0410: } else {
0411:
0412: /*
0413: * Note that Hangul decompositions have length that is
0414: * equal 2 or 3.
0415: */
0416: decompHangul[decompHangulIndex++] = decomp[0];
0417: decompHangul[decompHangulIndex++] = decomp[1];
0418: if (decomp.length == 3) {
0419: decompHangul[decompHangulIndex++] = decomp[2];
0420: }
0421: }
0422: }
0423:
0424: /*
0425: * Translating into UTF-16 encoding
0426: */
0427: for (int i = 0; i < decompHangulIndex; i++) {
0428: result.append(Character.toChars(decompHangul[i]));
0429: }
0430:
0431: return result.toString();
0432: }
0433:
0434: /**
0435: * Rearrange codepoints according
0436: * to canonical order.
0437: *
0438: * @param inputInts - array that contains Unicode codepoints
0439: * @param length - index of last Unicode codepoint plus 1
0440: *
0441: * @return array that contains rearranged codepoints.
0442: */
0443: static int[] getCanonicalOrder(int[] inputInts, int length) {
0444: int inputLength = (length < inputInts.length) ? length
0445: : inputInts.length;
0446:
0447: /*
0448: * Simple bubble-sort algorithm.
0449: * Note that many codepoints have 0
0450: * canonical class, so this algorithm works
0451: * almost lineary in overwhelming majority
0452: * of cases. This is due to specific of Unicode
0453: * combining classes and codepoints.
0454: */
0455: for (int i = 1; i < inputLength; i++) {
0456: int j = i - 1;
0457: int iCanonicalClass = getCanonicalClass(inputInts[i]);
0458: int ch;
0459:
0460: if (iCanonicalClass == 0) {
0461: continue;
0462: }
0463:
0464: while (j > -1) {
0465: if (getCanonicalClass(inputInts[j]) > iCanonicalClass) {
0466: j = j - 1;
0467: } else {
0468: break;
0469: }
0470: }
0471:
0472: ch = inputInts[i];
0473: for (int k = i; k > j + 1; k--) {
0474: inputInts[k] = inputInts[k - 1];
0475: }
0476: inputInts[j + 1] = ch;
0477: }
0478:
0479: return inputInts;
0480: }
0481:
0482: /**
0483: * Reread current character, may be require if previous token changes mode
0484: * to one with different character interpretation.
0485: *
0486: */
0487: private void reread() {
0488: lookAhead = ch;
0489: lookAheadST = curST;
0490: index = lookAheadToc;
0491: lookAheadToc = curToc;
0492: movePointer();
0493: }
0494:
0495: /**
0496: * Moves pointer one position right; save current character to lookBack;
0497: * lookAhead to current one and finally read one more to lookAhead;
0498: */
0499: private void movePointer() {
0500: // swap pointers
0501: lookBack = ch;
0502: ch = lookAhead;
0503: curST = lookAheadST;
0504: curToc = lookAheadToc;
0505: lookAheadToc = index;
0506: boolean reread;
0507: do {
0508: reread = false;
0509: // read next character analyze it and construct token:
0510: // //
0511:
0512: lookAhead = (index < pattern.length) ? nextCodePoint() : 0;
0513: lookAheadST = null;
0514:
0515: if (mode == Lexer.MODE_ESCAPE) {
0516: if (lookAhead == '\\') {
0517:
0518: //need not care about supplementary codepoints here
0519: lookAhead = (index < pattern.length) ? pattern[nextIndex()]
0520: : 0;
0521:
0522: switch (lookAhead) {
0523: case 'E': {
0524: mode = saved_mode;
0525:
0526: lookAhead = (index <= pattern.length - 2) ? nextCodePoint()
0527: : 0;
0528: break;
0529: }
0530:
0531: default: {
0532: lookAhead = '\\';
0533: index = prevNW;
0534: return;
0535: }
0536: }
0537: } else {
0538: return;
0539: }
0540: }
0541:
0542: if (lookAhead == '\\') {
0543:
0544: lookAhead = (index < pattern.length - 2) ? nextCodePoint()
0545: : -1;
0546: switch (lookAhead) {
0547: case -1:
0548: throw new PatternSyntaxException(
0549: Messages.getString("regex.10"), this .toString(), index); //$NON-NLS-1$
0550: case 'P':
0551: case 'p': {
0552: String cs = parseCharClassName();
0553: boolean negative = false;
0554:
0555: if (lookAhead == 'P')
0556: negative = true;
0557: ;
0558: try {
0559: lookAheadST = AbstractCharClass
0560: .getPredefinedClass(cs, negative);
0561: } catch (MissingResourceException mre) {
0562: throw new PatternSyntaxException(Messages
0563: .getString("regex.11" //$NON-NLS-1$
0564: , cs), this .toString(), index);
0565: }
0566: lookAhead = 0;
0567: break;
0568: }
0569:
0570: case 'w':
0571: case 's':
0572: case 'd':
0573: case 'W':
0574: case 'S':
0575: case 'D': {
0576: lookAheadST = CharClass.getPredefinedClass(
0577: new String(pattern, prevNW, 1), false);
0578: lookAhead = 0;
0579: break;
0580: }
0581:
0582: case 'Q': {
0583: saved_mode = mode;
0584: mode = Lexer.MODE_ESCAPE;
0585: reread = true;
0586: break;
0587: }
0588:
0589: case 't':
0590: lookAhead = '\t';
0591: break;
0592: case 'n':
0593: lookAhead = '\n';
0594: break;
0595: case 'r':
0596: lookAhead = '\r';
0597: break;
0598: case 'f':
0599: lookAhead = '\f';
0600: break;
0601: case 'a':
0602: lookAhead = '\u0007';
0603: break;
0604: case 'e':
0605: lookAhead = '\u001B';
0606: break;
0607:
0608: case '1':
0609: case '2':
0610: case '3':
0611: case '4':
0612: case '5':
0613: case '6':
0614: case '7':
0615: case '8':
0616: case '9': {
0617: if (mode == Lexer.MODE_PATTERN) {
0618: lookAhead = 0x80000000 | lookAhead;
0619: }
0620: break;
0621: }
0622:
0623: case '0':
0624: lookAhead = readOctals();
0625: break;
0626: case 'x':
0627: lookAhead = readHex("hexadecimal", 2); //$NON-NLS-1$
0628: break;
0629: case 'u':
0630: lookAhead = readHex("Unicode", 4); //$NON-NLS-1$
0631: break;
0632:
0633: case 'b':
0634: lookAhead = CHAR_WORD_BOUND;
0635: break;
0636: case 'B':
0637: lookAhead = CHAR_NONWORD_BOUND;
0638: break;
0639: case 'A':
0640: lookAhead = CHAR_START_OF_INPUT;
0641: break;
0642: case 'G':
0643: lookAhead = CHAR_PREVIOUS_MATCH;
0644: break;
0645: case 'Z':
0646: lookAhead = CHAR_END_OF_LINE;
0647: break;
0648: case 'z':
0649: lookAhead = CHAR_END_OF_INPUT;
0650: break;
0651: case 'c': {
0652: if (index < pattern.length - 2) {
0653:
0654: //need not care about supplementary codepoints here
0655: lookAhead = (pattern[nextIndex()] & 0x1f);
0656: break;
0657: } else {
0658: throw new PatternSyntaxException(Messages
0659: .getString("regex.12") //$NON-NLS-1$
0660: , this .toString(), index);
0661: }
0662: }
0663: case 'C':
0664: case 'E':
0665: case 'F':
0666: case 'H':
0667: case 'I':
0668: case 'J':
0669: case 'K':
0670: case 'L':
0671: case 'M':
0672: case 'N':
0673: case 'O':
0674: case 'R':
0675: case 'T':
0676: case 'U':
0677: case 'V':
0678: case 'X':
0679: case 'Y':
0680: case 'g':
0681: case 'h':
0682: case 'i':
0683: case 'j':
0684: case 'k':
0685: case 'l':
0686: case 'm':
0687: case 'o':
0688: case 'q':
0689: case 'y':
0690: throw new PatternSyntaxException(Messages
0691: .getString("regex.13") //$NON-NLS-1$
0692: , this .toString(), index);
0693:
0694: default:
0695: break;
0696: }
0697: } else if (mode == Lexer.MODE_PATTERN) {
0698: switch (lookAhead) {
0699: case '+':
0700: case '*':
0701: case '?': {
0702: char mod = (index < pattern.length) ? pattern[index]
0703: : '*';
0704: switch (mod) {
0705: case '+': {
0706: lookAhead = lookAhead | Lexer.QMOD_POSSESSIVE;
0707: nextIndex();
0708: break;
0709: }
0710: case '?': {
0711: lookAhead = lookAhead | Lexer.QMOD_RELUCTANT;
0712: nextIndex();
0713: break;
0714: }
0715: default: {
0716: lookAhead = lookAhead | Lexer.QMOD_GREEDY;
0717: break;
0718: }
0719: }
0720:
0721: break;
0722: }
0723:
0724: case '{': {
0725: lookAheadST = processQuantifier(lookAhead);
0726: break;
0727: }
0728:
0729: case '$':
0730: lookAhead = CHAR_DOLLAR;
0731: break;
0732: case '(': {
0733: if (pattern[index] == '?') {
0734: nextIndex();
0735: char nonCap = pattern[index];
0736: boolean behind = false;
0737: do {
0738: if (!behind) {
0739: switch (nonCap) {
0740: case '!':
0741: lookAhead = CHAR_NEG_LOOKAHEAD;
0742: nextIndex();
0743: break;
0744: case '=':
0745: lookAhead = CHAR_POS_LOOKAHEAD;
0746: nextIndex();
0747: break;
0748: case '>':
0749: lookAhead = CHAR_ATOMIC_GROUP;
0750: nextIndex();
0751: break;
0752: case '<': {
0753: nextIndex();
0754: nonCap = pattern[index];
0755: behind = true;
0756: break;
0757: }
0758: default: {
0759: lookAhead = readFlags();
0760:
0761: /*
0762: * We return res = res | 1 << 8
0763: * from readFlags() if we read
0764: * (?idmsux-idmsux)
0765: */
0766: if (lookAhead >= 256) {
0767:
0768: //Erase auxiliary bit
0769: lookAhead = (lookAhead & 0xff);
0770: flags = lookAhead;
0771: lookAhead = lookAhead << 16;
0772: lookAhead = CHAR_FLAGS
0773: | lookAhead;
0774: } else {
0775: flags = lookAhead;
0776: lookAhead = lookAhead << 16;
0777: lookAhead = CHAR_NONCAP_GROUP
0778: | lookAhead;
0779: }
0780: break;
0781: }
0782: }
0783: } else {
0784: behind = false;
0785: switch (nonCap) {
0786: case '!':
0787: lookAhead = CHAR_NEG_LOOKBEHIND;
0788: nextIndex();
0789: break;
0790: case '=':
0791: lookAhead = CHAR_POS_LOOKBEHIND;
0792: nextIndex();
0793: break;
0794: default:
0795: throw new PatternSyntaxException(
0796: Messages
0797: .getString("regex.14") //$NON-NLS-1$
0798: , this .toString(), index);
0799: }
0800: }
0801: } while (behind);
0802: } else {
0803: lookAhead = CHAR_LEFT_PARENTHESIS;
0804: }
0805: break;
0806: }
0807:
0808: case ')':
0809: lookAhead = CHAR_RIGHT_PARENTHESIS;
0810: break;
0811: case '[': {
0812: lookAhead = CHAR_LEFT_SQUARE_BRACKET;
0813: setMode(Lexer.MODE_RANGE);
0814: break;
0815: }
0816: case ']': {
0817: if (mode == Lexer.MODE_RANGE) {
0818: lookAhead = CHAR_RIGHT_SQUARE_BRACKET;
0819: }
0820: break;
0821: }
0822: case '^':
0823: lookAhead = CHAR_CARET;
0824: break;
0825: case '|':
0826: lookAhead = CHAR_VERTICAL_BAR;
0827: break;
0828: case '.':
0829: lookAhead = CHAR_DOT;
0830: break;
0831: default:
0832: break;
0833: }
0834: } else if (mode == Lexer.MODE_RANGE) {
0835: switch (lookAhead) {
0836: case '[':
0837: lookAhead = CHAR_LEFT_SQUARE_BRACKET;
0838: break;
0839: case ']':
0840: lookAhead = CHAR_RIGHT_SQUARE_BRACKET;
0841: break;
0842: case '^':
0843: lookAhead = CHAR_CARET;
0844: break;
0845: case '&':
0846: lookAhead = CHAR_AMPERSAND;
0847: break;
0848: case '-':
0849: lookAhead = CHAR_HYPHEN;
0850: break;
0851: default:
0852: break;
0853: }
0854: }
0855: } while (reread);
0856: }
0857:
0858: /**
0859: * Parse character classes names and verifies correction of the syntax;
0860: */
0861: private String parseCharClassName() {
0862: StringBuffer sb = new StringBuffer(10);
0863: if (index < pattern.length - 2) {
0864: // one symbol family
0865: if (pattern[index] != '{') {
0866: return "Is" + new String(pattern, nextIndex(), 1); //$NON-NLS-1$
0867: }
0868:
0869: nextIndex();
0870: char ch = 0;
0871: while (index < pattern.length - 2
0872: && (ch = pattern[nextIndex()]) != '}') {
0873: sb.append((char) ch);
0874: }
0875: if (ch != '}')
0876: throw new PatternSyntaxException(Messages
0877: .getString("regex.15"), this //$NON-NLS-1$
0878: .toString(), index);
0879: }
0880:
0881: if (sb.length() == 0)
0882: throw new PatternSyntaxException(Messages
0883: .getString("regex.16"), this .toString(), //$NON-NLS-1$
0884: index);
0885:
0886: String res = sb.toString();
0887: if (res.length() == 1)
0888: return "Is" + res; //$NON-NLS-1$
0889: return (res.length() > 3 && (res.startsWith("Is") || res //$NON-NLS-1$
0890: .startsWith("In"))) ? res.substring(2) : res; //$NON-NLS-1$
0891: }
0892:
0893: /**
0894: * Process given character in assumption that it's quantifier.
0895: */
0896: private Quantifier processQuantifier(int ch) {
0897: StringBuffer sb = new StringBuffer(4);
0898: int min = -1;
0899: int max = Integer.MAX_VALUE;
0900: while (index < pattern.length
0901: && (ch = pattern[nextIndex()]) != '}') {
0902: if (ch == ',' && min < 0) {
0903: try {
0904: min = Integer.parseInt(sb.toString(), 10);
0905: sb.delete(0, sb.length());
0906: } catch (NumberFormatException nfe) {
0907: throw new PatternSyntaxException(Messages
0908: .getString("regex.17"), this //$NON-NLS-1$
0909: .toString(), index);
0910: }
0911: } else {
0912: sb.append((char) ch);
0913: }
0914: }
0915: if (ch != '}') {
0916: throw new PatternSyntaxException(Messages
0917: .getString("regex.17"), //$NON-NLS-1$
0918: this .toString(), index);
0919: }
0920: if (sb.length() > 0) {
0921: try {
0922: max = Integer.parseInt(sb.toString(), 10);
0923: if (min < 0)
0924: min = max;
0925: } catch (NumberFormatException nfe) {
0926: throw new PatternSyntaxException(Messages
0927: .getString("regex.17"), this //$NON-NLS-1$
0928: .toString(), index);
0929: }
0930: } else if (min < 0) {
0931: throw new PatternSyntaxException(Messages
0932: .getString("regex.17"), //$NON-NLS-1$
0933: this .toString(), index);
0934: }
0935: if ((min | max | max - min) < 0) {
0936: throw new PatternSyntaxException(Messages
0937: .getString("regex.17"), //$NON-NLS-1$
0938: this .toString(), index);
0939: }
0940:
0941: char mod = (index < pattern.length) ? pattern[index] : '*';
0942:
0943: switch (mod) {
0944: case '+':
0945: lookAhead = Lexer.QUANT_COMP_P;
0946: nextIndex();
0947: break;
0948: case '?':
0949: lookAhead = Lexer.QUANT_COMP_R;
0950: nextIndex();
0951: break;
0952: default:
0953: lookAhead = Lexer.QUANT_COMP;
0954: break;
0955: }
0956: return new Quantifier(min, max);
0957: }
0958:
0959: public String toString() {
0960: return orig;
0961: }
0962:
0963: /**
0964: * Checks if there are any characters in the pattern.
0965: *
0966: * @return true if there are no more characters in the pattern.
0967: */
0968: public boolean isEmpty() {
0969: return ch == 0 && lookAhead == 0 && index == patternFullLength
0970: && !isSpecial();
0971: }
0972:
0973: /**
0974: * Returns true if current character is plain token.
0975: */
0976: public static boolean isLetter(int ch) {
0977:
0978: //all supplementary codepoints have integer value that is >= 0;
0979: return ch >= 0;
0980: }
0981:
0982: /**
0983: * Return true if current character is letter, false otherwise; This is
0984: * shortcut to static method isLetter to check the current character.
0985: *
0986: * @return true if current character is letter, false otherwise
0987: */
0988: public boolean isLetter() {
0989: return !isEmpty() && !isSpecial() && isLetter(ch);
0990: }
0991:
0992: /*
0993: * Note that Character class methods
0994: * isHighSurrogate(), isLowSurrogate()
0995: * take char parameter while we need an int
0996: * parameter without truncation to char value
0997: */
0998: public boolean isHighSurrogate() {
0999: return (ch <= 0xDBFF) && (ch >= 0xD800);
1000: }
1001:
1002: public boolean isLowSurrogate() {
1003: return (ch <= 0xDFFF) && (ch >= 0xDC00);
1004: }
1005:
1006: public static boolean isHighSurrogate(int ch) {
1007: return (ch <= 0xDBFF) && (ch >= 0xD800);
1008: }
1009:
1010: public static boolean isLowSurrogate(int ch) {
1011: return (ch <= 0xDFFF) && (ch >= 0xDC00);
1012: }
1013:
1014: /**
1015: * Process hexadecimal integer.
1016: */
1017: private int readHex(String radixName, int max) {
1018: StringBuffer st = new StringBuffer(max);
1019: int length = pattern.length - 2;
1020: int i;
1021: for (i = 0; i < max && index < length; i++) {
1022: st.append((char) pattern[nextIndex()]);
1023: }
1024: if (i == max) {
1025: try {
1026: return Integer.parseInt(st.toString(), 16);
1027: } catch (NumberFormatException nfe) {
1028: }
1029: }
1030:
1031: throw new PatternSyntaxException(Messages.getString(
1032: "regex.18", radixName) //$NON-NLS-1$
1033: , this .toString(), index);
1034: }
1035:
1036: /**
1037: * Process octal integer.
1038: */
1039: private int readOctals() {
1040: char ch;
1041: int max = 3;
1042: int i = 1;
1043: int first;
1044: int res;
1045: int length = pattern.length - 2;
1046:
1047: switch (first = Character.digit((ch = pattern[index]), 8)) {
1048: case -1:
1049: throw new PatternSyntaxException(Messages
1050: .getString("regex.19") //$NON-NLS-1$
1051: , this .toString(), index);
1052: default: {
1053: if (first > 3)
1054: max--;
1055: nextIndex();
1056: res = first;
1057: }
1058: }
1059:
1060: while (i < max
1061: && index < length
1062: && (first = Character.digit((ch = pattern[index]), 8)) >= 0) {
1063: res = res * 8 + first;
1064: nextIndex();
1065: i++;
1066: }
1067:
1068: return res;
1069: }
1070:
1071: /**
1072: * Process expression flags given with (?idmsux-idmsux)
1073: */
1074: private int readFlags() {
1075: char ch;
1076: boolean pos = true;
1077: int res = flags;
1078:
1079: while (index < pattern.length) {
1080: ch = pattern[index];
1081: switch (ch) {
1082: case '-':
1083: if (!pos) {
1084: throw new PatternSyntaxException(Messages
1085: .getString("regex.1A") //$NON-NLS-1$
1086: , this .toString(), index);
1087: }
1088: pos = false;
1089: break;
1090:
1091: case 'i':
1092: res = pos ? res | Pattern.CASE_INSENSITIVE
1093: : (res ^ Pattern.CASE_INSENSITIVE) & res;
1094: break;
1095:
1096: case 'd':
1097: res = pos ? res | Pattern.UNIX_LINES
1098: : (res ^ Pattern.UNIX_LINES) & res;
1099: break;
1100:
1101: case 'm':
1102: res = pos ? res | Pattern.MULTILINE
1103: : (res ^ Pattern.MULTILINE) & res;
1104: break;
1105:
1106: case 's':
1107: res = pos ? res | Pattern.DOTALL
1108: : (res ^ Pattern.DOTALL) & res;
1109: break;
1110:
1111: case 'u':
1112: res = pos ? res | Pattern.UNICODE_CASE
1113: : (res ^ Pattern.UNICODE_CASE) & res;
1114: break;
1115:
1116: case 'x':
1117: res = pos ? res | Pattern.COMMENTS
1118: : (res ^ Pattern.COMMENTS) & res;
1119: break;
1120:
1121: case ':':
1122: nextIndex();
1123: return res;
1124:
1125: case ')':
1126: nextIndex();
1127: return res | (1 << 8);
1128:
1129: default:
1130: // ignore invalid flags (HARMONY-2127)
1131: }
1132: nextIndex();
1133: }
1134: throw new PatternSyntaxException(
1135: Messages.getString("regex.1A"), //$NON-NLS-1$
1136: this .toString(), index);
1137: }
1138:
1139: /**
1140: * Returns next character index to read and moves pointer to the next one.
1141: * If comments flag is on this method will skip comments and whitespaces.
1142: *
1143: * The following actions are equivalent if comments flag is off ch =
1144: * pattern[index++] == ch = pattern[nextIndex]
1145: *
1146: * @return next character index to read.
1147: */
1148: private int nextIndex() {
1149: prevNW = index;
1150: if ((flags & Pattern.COMMENTS) != 0) {
1151: skipComments();
1152: } else {
1153: index++;
1154: }
1155: return prevNW;
1156: }
1157:
1158: /**
1159: * Skips comments and whitespaces
1160: */
1161: private int skipComments() {
1162: int length = pattern.length - 2;
1163: index++;
1164: do {
1165: while (index < length
1166: && Character.isWhitespace(pattern[index]))
1167: index++;
1168: if (index < length && pattern[index] == '#') {
1169: index++;
1170: while (index < length
1171: && !isLineSeparator(pattern[index]))
1172: index++;
1173: } else
1174: return index;
1175: } while (true);
1176: }
1177:
1178: private boolean isLineSeparator(int ch) {
1179: return (ch == '\n' || ch == '\r' || ch == '\u0085' || (ch | 1) == '\u2029');
1180: }
1181:
1182: /**
1183: * Gets decomposition for given codepoint from
1184: * decomposition mappings table.
1185: *
1186: * @param ch - Unicode codepoint
1187: * @return array of codepoints that is a canonical
1188: * decomposition of ch.
1189: */
1190: static int[] getDecomposition(int ch) {
1191: return decompTable.get(ch);
1192: }
1193:
1194: /**
1195: * Gets decomposition for given Hangul syllable.
1196: * This is an implementation of Hangul decomposition algorithm
1197: * according to http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
1198: * "3.12 Conjoining Jamo Behavior".
1199: *
1200: * @param ch - given Hangul syllable
1201: * @return canonical decomposition of ch.
1202: */
1203: static int[] getHangulDecomposition(int ch) {
1204: int SIndex = ch - SBase;
1205:
1206: if (SIndex < 0 || SIndex >= SCount) {
1207: return null;
1208: } else {
1209: int L = LBase + SIndex / NCount;
1210: int V = VBase + (SIndex % NCount) / TCount;
1211: int T = SIndex % TCount;
1212: int decomp[];
1213:
1214: if (T == 0) {
1215: decomp = new int[] { L, V };
1216: } else {
1217: T = TBase + T;
1218: decomp = new int[] { L, V, T };
1219: }
1220: return decomp;
1221: }
1222: }
1223:
1224: /**
1225: * Gets canonical class for given codepoint from
1226: * decomposition mappings table.
1227: *
1228: * @param - ch Unicode codepoint
1229: * @return canonical class for given Unicode codepoint
1230: * that is represented by ch.
1231: */
1232: static int getCanonicalClass(int ch) {
1233: int canClass = canonClassesTable.get(ch);
1234:
1235: return (canClass == canonClassesTableSize) ? 0 : canClass;
1236: }
1237:
1238: /**
1239: * Tests if given codepoint is a canonical decomposition of another
1240: * codepoint.
1241: *
1242: * @param ch - codepoint to test
1243: * @return true if ch is a decomposition.
1244: */
1245: static boolean hasSingleCodepointDecomposition(int ch) {
1246: int hasSingleDecomp = singleDecompTable.get(ch);
1247:
1248: /*
1249: * singleDecompTable doesn't contain ch
1250: * == (hasSingleDecomp == singleDecompTableSize)
1251: */
1252: return (hasSingleDecomp == singleDecompTableSize) ? false
1253: : true;
1254: }
1255:
1256: /**
1257: * Tests if given codepoint has canonical decomposition
1258: * and given codepoint's canonical class is not 0.
1259: *
1260: * @param ch - codepoint to test
1261: * @return true if canonical class is not 0 and ch has a decomposition.
1262: */
1263: static boolean hasDecompositionNonNullCanClass(int ch) {
1264: return ch == 0x0340 | ch == 0x0341 | ch == 0x0343
1265: | ch == 0x0344;
1266: }
1267:
1268: private int nextCodePoint() {
1269: char high = pattern[nextIndex()];
1270:
1271: if (Character.isHighSurrogate(high)) {
1272:
1273: //low and high char may be delimited by spaces
1274: int lowExpectedIndex = prevNW + 1;
1275:
1276: if (lowExpectedIndex < pattern.length) {
1277: char low = pattern[lowExpectedIndex];
1278: if (Character.isLowSurrogate(low)) {
1279: nextIndex();
1280: return Character.toCodePoint(high, low);
1281: }
1282: }
1283: }
1284:
1285: return (int) high;
1286: }
1287:
1288: /**
1289: * Tests Unicode codepoint if it is a boundary
1290: * of decomposed Unicode codepoint.
1291: *
1292: * @param ch - Unicode codepoint to test
1293: * @return true if given codepoint is a boundary.
1294: */
1295: static boolean isDecomposedCharBoundary(int ch) {
1296: int canClass = canonClassesTable.get(ch);
1297:
1298: //Lexer.getCanonicalClass(ch) == 0
1299: boolean isBoundary = (canClass == canonClassesTableSize);
1300:
1301: return isBoundary;
1302: }
1303:
1304: /**
1305: * Returns the curr. character index.
1306: */
1307: public int getIndex() {
1308: return curToc;
1309: }
1310: }
|