0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: /**
0019: * @author Nikolay A. Kuznetsov
0020: * @version $Revision: 1.36.2.2 $
0021: */package java.util.regex;
0022:
0023: import java.io.Serializable;
0024:
0025: import java.util.ArrayList;
0026:
0027: import org.apache.harmony.regex.internal.nls.Messages;
0028:
/**
 * Pattern implements a compiler for regular expressions as defined by the J2SE
 * specification. The regular expression syntax is largely similar to the syntax
 * defined by Perl 5 but has both omissions and extensions. A formal and
 * complete definition of the regular expression syntax is not provided by the
 * J2SE specification.
 */
0037: public final class Pattern implements Serializable {
0038:
0039: private static final long serialVersionUID = 5073258162644648461L;
0040:
0041: static final boolean _DEBUG_ = false;
0042:
0043: /**
0044: * @com.intel.drl.spec_ref
0045: */
0046: public static final int UNIX_LINES = 1 << 0;
0047:
0048: /**
0049: * @com.intel.drl.spec_ref
0050: */
0051: public static final int CASE_INSENSITIVE = 1 << 1;
0052:
0053: /**
0054: * @com.intel.drl.spec_ref
0055: */
0056: public static final int COMMENTS = 1 << 2;
0057:
0058: /**
0059: * @com.intel.drl.spec_ref
0060: */
0061: public static final int MULTILINE = 1 << 3;
0062:
0063: /**
0064: * @com.intel.drl.spec_ref
0065: */
0066: public static final int LITERAL = 1 << 4;
0067:
0068: /**
0069: * @com.intel.drl.spec_ref
0070: */
0071: public static final int DOTALL = 1 << 5;
0072:
0073: /**
0074: * @com.intel.drl.spec_ref
0075: */
0076: public static final int UNICODE_CASE = 1 << 6;
0077:
0078: /**
0079: * @com.intel.drl.spec_ref
0080: */
0081: public static final int CANON_EQ = 1 << 7;
0082:
0083: static final int BACK_REF_NUMBER = 10;
0084:
0085: /**
0086: * Bit mask that includes all defined match flags
0087: */
0088: static final int flagsBitMask = Pattern.UNIX_LINES
0089: | Pattern.CASE_INSENSITIVE | Pattern.COMMENTS
0090: | Pattern.MULTILINE | Pattern.DOTALL | Pattern.UNICODE_CASE
0091: | Pattern.CANON_EQ;
0092:
0093: /**
0094: * Current <code>pattern</code> to be compiled;
0095: */
0096: private transient Lexer lexemes = null;
0097:
0098: /**
0099: * Pattern compile flags;
0100: */
0101: private int flags = 0;
0102:
0103: private String pattern = null;
0104:
0105: /*
0106: * All backreferences that may be used in pattern.
0107: */
0108: transient private FSet backRefs[] = new FSet[BACK_REF_NUMBER];
0109:
0110: /*
0111: * Is true if backreferenced sets replacement is needed
0112: */
0113: transient private boolean needsBackRefReplacement = false;
0114:
0115: transient private int groupIndex = -1;
0116:
0117: transient private int globalGroupIndex = -1;
0118:
0119: transient private int compCount = -1;
0120:
0121: transient private int consCount = -1;
0122:
0123: transient AbstractSet start = null;
0124:
0125: /**
0126: * Create a matcher for this pattern and a given input character sequence
0127: *
0128: * @param cs
0129: * The input character sequence
0130: * @return A new matcher
0131: */
0132: public Matcher matcher(CharSequence cs) {
0133: return new Matcher(this , cs);
0134: }
0135:
0136: /**
0137: * Split an input string using the pattern as a token separator.
0138: *
0139: * @param input
0140: * Input sequence to tokenize
0141: * @param limit
0142: * If positive, the maximum number of tokens to return. If
0143: * negative, an indefinite number of tokens are returned. If
0144: * zero, an indefinite number of tokens are returned but trailing
0145: * empty tokens are excluded.
0146: * @return A sequence of tokens split out of the input string.
0147: */
0148: public String[] split(CharSequence input, int limit) {
0149: ArrayList res = new ArrayList();
0150: Matcher mat = matcher(input);
0151: int index = 0;
0152: int curPos = 0;
0153:
0154: if (input.length() == 0) {
0155: return new String[] { "" }; //$NON-NLS-1$
0156: } else {
0157: while (mat.find() && (index + 1 < limit || limit <= 0)) {
0158: res.add(input.subSequence(curPos, mat.start())
0159: .toString());
0160: curPos = mat.end();
0161: index++;
0162: }
0163:
0164: res.add(input.subSequence(curPos, input.length())
0165: .toString());
0166: index++;
0167:
0168: /*
0169: * discard trailing empty strings
0170: */
0171: if (limit == 0) {
0172: while (--index >= 0
0173: && res.get(index).toString().length() == 0) {
0174: res.remove(index);
0175: }
0176: }
0177: }
0178: return (String[]) res
0179: .toArray(new String[index >= 0 ? index : 0]);
0180: }
0181:
0182: /**
0183: * @com.intel.drl.spec_ref
0184: */
0185: public String[] split(CharSequence input) {
0186: return split(input, 0);
0187: }
0188:
0189: /**
0190: * Returns the pattern string passed to the compile method
0191: *
0192: * @return A string representation of the pattern
0193: */
0194: public String pattern() {
0195: return lexemes.toString();
0196: }
0197:
0198: /**
0199: * Return a textual representation of the pattern.
0200: *
0201: * @return The regular expression string
0202: */
0203: public String toString() {
0204: return this .pattern();
0205: }
0206:
0207: /**
0208: * Return the mask of flags used to compile the pattern
0209: *
0210: * @return A mask of flags used to compile the pattern.
0211: */
0212: public int flags() {
0213: return this .flags;
0214: }
0215:
0216: /**
0217: * Return a compiled pattern corresponding to the input regular expression
0218: * string.
0219: *
0220: * The input <code>flags</code> is a mask of the following flags:
0221: * <dl>
0222: * <dt><code>UNIX_LINES</code> (0x0001)
0223: * <dd>Enables UNIX lines mode where only \n is recognized as a line
0224: * terminator. The default setting of this flag is <em>off</em> indicating
0225: * that all of the following character sequences are recognized as line
0226: * terminators: \n, \r, \r\n, NEL (\u0085), \u2028 and \u2029.
0227: * <dt><code>CASE_INSENSITIVE</code> (0x0002)
0228: * <dd>Directs matching to be done in a way that ignores differences in
0229: * case. If input character sequences are encoded in character sets other
0230: * than ASCII, then the UNICODE_CASE must also be set to enable Unicode case
0231: * detection.
0232: * <dt><code>UNICODE_CASE</code> (0x0040)
0233: * <dd>Enables Unicode case folding if used in conjunction with the
0234: * <code>CASE_INSENSITIVE</code> flag. If <code>CASE_INSENSITIVE</code>
0235: * is not set, then this flag has no effect.
0236: * <dt><code>COMMENTS</code> (0x0004)
0237: * <dd>Directs the pattern compiler to ignore whitespace and comments in
0238: * the pattern. Whitespace consists of sequences including only these
0239: * characters: SP (\u0020), HT (\t or \u0009), LF (\n or ), VT
0240: * (\u000b), FF (\f or \u000c), and CR (\r or ). A comment is any
0241: * sequence of characters beginning with the "#" (\u0023) character and
0242: * ending in a LF character.
0243: * <dt><code>MULTILINE</code> (0x0008)
0244: * <dd>Turns on multiple line mode for matching of character sequences. By
0245: * default, this mode is off so that the character "^" (\u005e) matches
0246: * the beginning of the entire input sequence and the character "$"
0247: * (\u0024) matches the end of the input character sequence. In multiple
0248: * line mode, the character "^" matches any character in the input sequence
0249: * which immediately follows a line terminator and the character "$" matches
0250: * any character in the input sequence which immediately precedes a line
0251: * terminator.
0252: * <dt><code>DOTALL</code> (0x0020)
0253: * <dd>Enables the DOT (".") character in regular expressions to match line
0254: * terminators. By default, line terminators are not matched by DOT.
0255: * <dt><code>CANON_EQ</code> (0x0080)
0256: * <dd>Enables matching of character sequences which are canonically
0257: * equivalent according to the Unicode standard. Canonical equivalence is
0258: * described here: http://www.unicode.org/reports/tr15/. By default,
0259: * canonical equivalence is not detected while matching.
0260: * </dl>
0261: *
0262: * @param regex
0263: * A regular expression string.
0264: * @param flags
0265: * A set of flags to control the compilation of the pattern.
0266: * @return A compiled pattern
0267: * @throws PatternSyntaxException
0268: * If the input regular expression does not match the required
0269: * grammar.
0270: */
0271: public static Pattern compile(String regex, int flags)
0272: throws PatternSyntaxException {
0273:
0274: if ((flags != 0) && ((flags | flagsBitMask) != flagsBitMask)) {
0275:
0276: throw new IllegalArgumentException(Messages
0277: .getString("regex.1C"));
0278: }
0279:
0280: AbstractSet.counter = 1;
0281:
0282: return new Pattern().compileImpl(regex, flags);
0283: }
0284:
0285: /**
0286: *
0287: * @param pattern -
0288: * Regular expression to be compiled
0289: * @param flags -
0290: * The bit mask including CASE_INSENSITIVE, MULTILINE, DOTALL,
0291: * UNICODE_CASE, and CANON_EQ
0292: *
0293: * @return Compiled pattern
0294: */
0295: private Pattern compileImpl(String regex, int flags)
0296: throws PatternSyntaxException {
0297: this .lexemes = new Lexer(regex, flags);
0298: this .flags = flags;
0299: this .pattern = regex;
0300:
0301: start = processExpression(-1, this .flags, null);
0302: if (!lexemes.isEmpty()) {
0303: throw new PatternSyntaxException(Messages
0304: .getString("regex.08"), lexemes.toString(), //$NON-NLS-1$
0305: lexemes.getIndex());
0306: }
0307: finalizeCompile();
0308: return this ;
0309: }
0310:
0311: /**
0312: * A->(a|)+
0313: */
0314: private AbstractSet processAlternations(AbstractSet last) {
0315: CharClass auxRange = new CharClass(
0316: hasFlag(Pattern.CASE_INSENSITIVE),
0317: hasFlag(Pattern.UNICODE_CASE));
0318: while (!lexemes.isEmpty()
0319: && lexemes.isLetter()
0320: && (lexemes.lookAhead() == 0
0321: || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
0322: .lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS)) {
0323: auxRange.add(lexemes.next());
0324: if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
0325: lexemes.next();
0326: }
0327: AbstractSet rangeSet = processRangeSet(auxRange);
0328: rangeSet.setNext(last);
0329:
0330: return rangeSet;
0331: }
0332:
/**
 * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)*
 *
 * Parses a whole (sub)expression up to the matching right parenthesis or
 * the end of the pattern, collecting one child set per '|'-separated
 * branch.
 *
 * @param ch the token that opened this group (e.g.
 *        Lexer.CHAR_NONCAP_GROUP), or -1 for the top-level expression
 * @param newFlags the match flags in effect inside this group
 * @param last the set every branch must eventually link to; null only for
 *        the top-level call
 * @return the compiled set representing this expression
 */
private AbstractSet processExpression(int ch, int newFlags,
        AbstractSet last) {
    ArrayList children = new ArrayList();
    AbstractSet child;
    int saveFlags = flags;
    FSet fSet;
    boolean saveChangedFlags = false;

    if (newFlags != flags) {
        flags = newFlags;
    }

    // Choose the group-terminating FSet according to the construct kind.
    switch (ch) {
    case Lexer.CHAR_NONCAP_GROUP:
        fSet = new NonCapFSet(++consCount);
        break;

    case Lexer.CHAR_POS_LOOKAHEAD:
        /* falls through */

    case Lexer.CHAR_NEG_LOOKAHEAD:
        fSet = new AheadFSet();
        break;

    case Lexer.CHAR_POS_LOOKBEHIND:
        /* falls through */

    case Lexer.CHAR_NEG_LOOKBEHIND:
        fSet = new BehindFSet(++consCount);
        break;

    case Lexer.CHAR_ATOMIC_GROUP:
        fSet = new AtomicFSet(++consCount);
        break;

    default:
        // Capturing group; the top-level expression is group 0.
        globalGroupIndex++;
        if (last == null) {

            // expr = new StartSet();
            fSet = new FinalSet();
            // Top level: inline flag changes stay in effect afterwards.
            saveChangedFlags = true;
        } else {

            // expr = new JointSet(globalGroupIndex);
            fSet = new FSet(globalGroupIndex);
        }
        if (globalGroupIndex > -1 && globalGroupIndex < 10) {
            // Record the first BACK_REF_NUMBER groups so back references
            // can be resolved later.
            backRefs[globalGroupIndex] = fSet;
        }
        break;
    }

    // Collect the '|'-separated branches of this expression.
    do {
        if (lexemes.isLetter()
                && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) {
            // Letter alternation like "a|b|c" compiles to a char class.
            child = processAlternations(fSet);
        } else if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
            // An empty branch, e.g. the middle of "a||b".
            child = new EmptySet(fSet);
            lexemes.next();
        } else {
            child = processSubExpression(fSet);
            if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
                lexemes.next();
            }
        }
        if (child != null) {

            //expr.addChild(child);
            children.add(child);
        }
    } while (!(lexemes.isEmpty() || (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS)));

    // A trailing '|' (e.g. "a|") contributes a final empty branch.
    if (lexemes.back() == Lexer.CHAR_VERTICAL_BAR) {
        children.add(new EmptySet(fSet));
    }

    // Restore the caller's flags unless this is the top level.
    if (flags != saveFlags && !saveChangedFlags) {
        flags = saveFlags;
        lexemes.restoreFlags(flags);
    }

    // Wrap the collected branches in the node type for this construct.
    switch (ch) {
    case Lexer.CHAR_NONCAP_GROUP:
        return new NonCapJointSet(children, fSet);

    case Lexer.CHAR_POS_LOOKAHEAD:
        return new PositiveLookAhead(children, fSet);

    case Lexer.CHAR_NEG_LOOKAHEAD:
        return new NegativeLookAhead(children, fSet);

    case Lexer.CHAR_POS_LOOKBEHIND:
        return new PositiveLookBehind(children, fSet);

    case Lexer.CHAR_NEG_LOOKBEHIND:
        return new NegativeLookBehind(children, fSet);

    case Lexer.CHAR_ATOMIC_GROUP:
        return new AtomicJointSet(children, fSet);

    default:
        switch (children.size()) {
        case 0:
            return new EmptySet(fSet);

        case 1:
            // Single branch: skip the alternation wrapper entirely.
            return new SingleSet((AbstractSet) children.get(0),
                    fSet);

        default:
            return new JointSet(children, fSet);
        }
    }
}
0451:
/**
 * T->a+
 *
 * Greedily collects a run of plain literal characters into a single
 * sequence set so it can be matched as one substring. The lookahead
 * conditions appear intended to stop the run before a character that may
 * carry its own quantifier, so the quantifier binds to that single
 * character rather than the whole sequence -- TODO confirm against the
 * Lexer's token model.
 */
private AbstractSet processSequence(AbstractSet last) {
    StringBuffer substring = new StringBuffer();

    while (!lexemes.isEmpty()
            && lexemes.isLetter()
            && !lexemes.isHighSurrogate()
            && !lexemes.isLowSurrogate()
            && ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end
            // of
            // pattern
                    || (!lexemes.isNextSpecial() && Lexer
                            .isLetter(lexemes.lookAhead()))
                    || lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS
                    // NOTE(review): the mask presumably strips embedded
                    // flag bits from a '(' token (see processTerminal) --
                    // confirm.
                    || (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS
                    || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR
                    || lexemes.lookAhead() == Lexer.CHAR_DOLLAR)) {
        int ch = lexemes.next();

        if (Character.isSupplementaryCodePoint(ch)) {
            // Supplementary code points are appended as surrogate pairs.
            substring.append(Character.toChars(ch));
        } else {
            substring.append((char) ch);
        }
    }
    // Pick the sequence implementation matching the case-handling flags.
    if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
        return new SequenceSet(substring);
    } else if (!hasFlag(Pattern.UNICODE_CASE)) {
        return new CISequenceSet(substring);
    } else {
        return new UCISequenceSet(substring);
    }
}
0487:
/**
 * D->a
 *
 * Reads one "character" in canonical-equivalence mode (CANON_EQ): either
 * a decomposed Hangul syllable (LV or LVT), a lone jamo L, a decomposed
 * codepoint sequence, or an ordinary single codepoint.
 */
private AbstractSet processDecomposedChar(AbstractSet last) {
    int[] codePoints = new int[Lexer.MAX_DECOMPOSITION_LENGTH];
    char[] codePointsHangul;
    int readCodePoints = 0;
    int curSymb = -1;
    int curSymbIndex = -1;

    if (!lexemes.isEmpty() && lexemes.isLetter()) {
        curSymb = lexemes.next();
        codePoints[readCodePoints] = curSymb;
        // Offset from the first Hangul leading consonant (jamo L).
        curSymbIndex = curSymb - Lexer.LBase;
    }

    /*
     * We process decomposed Hangul syllable LV or LVT or process jamo L.
     * See http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf
     * "3.12 Conjoining Jamo Behavior"
     */
    if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.LCount)) {
        codePointsHangul = new char[Lexer.MAX_HANGUL_DECOMPOSITION_LENGTH];
        codePointsHangul[readCodePoints++] = (char) curSymb;

        // Is the next symbol a vowel jamo (V)?
        curSymb = lexemes.peek();
        curSymbIndex = curSymb - Lexer.VBase;
        if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.VCount)) {
            codePointsHangul[readCodePoints++] = (char) curSymb;
            lexemes.next();
            // Is a trailing consonant jamo (T) present as well?
            curSymb = lexemes.peek();
            curSymbIndex = curSymb - Lexer.TBase;
            if ((curSymbIndex >= 0)
                    && (curSymbIndex < Lexer.TCount)) {
                codePointsHangul[readCodePoints++] = (char) curSymb;
                lexemes.next();

                //LVT syllable
                return new HangulDecomposedCharSet(
                        codePointsHangul, 3);
            } else {

                //LV syllable
                return new HangulDecomposedCharSet(
                        codePointsHangul, 2);
            }
        } else {

            //L jamo
            if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                return new CharSet(codePointsHangul[0]);
            } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                return new CICharSet(codePointsHangul[0]);
            } else {
                return new UCICharSet(codePointsHangul[0]);
            }
        }

    /*
     * We process single codepoint or decomposed codepoint.
     * We collect decomposed codepoint and obtain
     * one DecomposedCharSet.
     */
    } else {
        readCodePoints++;

        // Absorb combining codepoints up to the decomposition limit.
        while ((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH)
                && !lexemes.isEmpty() && lexemes.isLetter()
                && !Lexer.isDecomposedCharBoundary(lexemes.peek())) {
            codePoints[readCodePoints++] = lexemes.next();
        }

        /*
         * We have read an ordinary symbol.
         */
        if (readCodePoints == 1
                && !Lexer
                        .hasSingleCodepointDecomposition(codePoints[0])) {
            return processCharSet(codePoints[0]);
        } else {
            // Pick the decomposed-set flavor matching the case flags.
            if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                return new DecomposedCharSet(codePoints,
                        readCodePoints);
            } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                return new CIDecomposedCharSet(codePoints,
                        readCodePoints);
            } else {
                return new UCIDecomposedCharSet(codePoints,
                        readCodePoints);
            }
        }
    }
}
0581:
/**
 * S->BS; S->QS; S->Q; B->a+
 *
 * Parses one sub-expression: a literal sequence, a quantified term, or an
 * empty set, then recursively parses and chains the remainder of the
 * sub-expression after it. Returns null when there is nothing to parse.
 */
private AbstractSet processSubExpression(AbstractSet last) {
    AbstractSet cur;
    if (lexemes.isLetter() && !lexemes.isNextSpecial()
            && Lexer.isLetter(lexemes.lookAhead())) {
        if (hasFlag(Pattern.CANON_EQ)) {
            // Canonical equivalence: read characters in decomposed form.
            cur = processDecomposedChar(last);
            if (!lexemes.isEmpty()

            /* && !pattern.isQuantifier() */
            && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet)
                    && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR
                    && !lexemes.isLetter()) {
                cur = processQuantifier(last, cur);
            }
        } else if (lexemes.isHighSurrogate()
                || lexemes.isLowSurrogate()) {
            // Lone surrogates are handled as single terminals.
            AbstractSet term = processTerminal(last);
            cur = processQuantifier(last, term);
        } else {
            // A plain run of letters becomes one sequence set.
            cur = processSequence(last);
        }
    } else if (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS) {
        if (last instanceof FinalSet) {
            // ')' at top level has no matching '('.
            throw new PatternSyntaxException(Messages
                    .getString("regex.09"), lexemes.toString(), //$NON-NLS-1$
                    lexemes.getIndex());
        } else {
            cur = new EmptySet(last);
        }
    } else {
        AbstractSet term = processTerminal(last);
        cur = processQuantifier(last, term);
    }

    if (!lexemes.isEmpty()
    // && !pattern.isQuantifier()
            && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet)
            && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) {
        // There is more to parse: chain the rest after 'cur'.
        AbstractSet next = processSubExpression(last);
        if (cur instanceof LeafQuantifierSet
        // TODO create personal UnifiedQuantifierSet for composite
        // quantifiers
        // to take into account Quantifier counters
        // ////
                && !(cur instanceof CompositeQuantifierSet)
                && !(cur instanceof GroupQuantifierSet)
                && !(cur instanceof AltQuantifierSet)
                && !next.first(((LeafQuantifierSet) cur)
                        .getInnerSet())) {
            // Plain leaf quantifier whose inner set cannot begin 'next':
            // upgrade to the faster unified form.
            cur = new UnifiedQuantifierSet((LeafQuantifierSet) cur);
        }
        if (((char) next.getType()) == '+') {
            // For a '+' node the entry point is its inner set.
            cur.setNext(((LeafQuantifierSet) next).getInnerSet());
        } else {
            cur.setNext(next);
        }
    } else if (cur != null) {
        cur.setNext(last);
    } else {
        return null;
    }

    // Likewise, expose the inner set as the entry point of a '+' node.
    if (((char) cur.getType()) == '+') {
        return ((QuantifierSet) cur).getInnerSet();
    } else {
        return cur;
    }
}
0653:
/**
 * Q->T(*|+|?...) also do some optimizations.
 *
 * Wraps an already-parsed term with the quantifier (if any) that follows
 * it in the pattern. Group-like (non-leaf) terms get *GroupQuantifierSet
 * wrappers; plain LeafSet terms get the cheaper leaf variants. If no
 * quantifier follows, the term is returned unchanged.
 */
private AbstractSet processQuantifier(AbstractSet last,
        AbstractSet term) {
    int quant = lexemes.peek();

    if (term != null && !(term instanceof LeafSet)) {
        // Quantifier applied to a group-like (non-leaf) term.
        switch (quant) {
        case Lexer.QUANT_STAR:
        case Lexer.QUANT_PLUS: {
            QuantifierSet q;

            lexemes.next();
            if (term.getType() == AbstractSet.TYPE_DOTSET) {
                // '.' gets dedicated quantifier sets for speed.
                if (!hasFlag(Pattern.DOTALL)) {
                    q = new DotQuantifierSet(term, last, quant,
                            AbstractLineTerminator
                                    .getInstance(flags));
                } else {
                    q = new DotAllQuantifierSet(term, last, quant);
                }
            } else {
                q = new GroupQuantifierSet(term, last, quant);
            }
            term.setNext(q);
            return q;
        }

        case Lexer.QUANT_STAR_R:
        case Lexer.QUANT_PLUS_R: {
            // Reluctant (lazy) '*?' / '+?'.
            lexemes.next();
            GroupQuantifierSet q = new ReluctantGroupQuantifierSet(
                    term, last, quant);
            term.setNext(q);
            return q;
        }

        case Lexer.QUANT_PLUS_P: {
            lexemes.next();
            // possessive plus will be handled by unique class
            // and should not be postprocessed to point previous set
            // to the inner one.
            // //
            return new PosPlusGroupQuantifierSet(term, last,
                    Lexer.QUANT_STAR_P);
        }

        case Lexer.QUANT_STAR_P: {
            lexemes.next();
            return new PossessiveGroupQuantifierSet(term, last,
                    quant);
        }

        case Lexer.QUANT_ALT: {
            // '?' -- zero or one.
            lexemes.next();
            AltGroupQuantifierSet q = new AltGroupQuantifierSet(
                    term, last, Lexer.QUANT_ALT);
            term.setNext(last);
            return q;
        }

        case Lexer.QUANT_ALT_P: {
            lexemes.next();
            return new PosAltGroupQuantifierSet(term, last,
                    Lexer.QUANT_ALT);
        }

        case Lexer.QUANT_ALT_R: {
            lexemes.next();
            RelAltGroupQuantifierSet q = new RelAltGroupQuantifierSet(
                    term, last, Lexer.QUANT_ALT);
            term.setNext(last);
            return q;
        }

        case Lexer.QUANT_COMP: {
            // Counted quantifier {n,m}; the Quantifier special token
            // carries the bounds.
            CompositeGroupQuantifierSet q = new CompositeGroupQuantifierSet(
                    (Quantifier) lexemes.nextSpecial(), term, last,
                    Lexer.QUANT_ALT, ++compCount);
            term.setNext(q);
            return q;
        }

        case Lexer.QUANT_COMP_P: {
            return new PosCompositeGroupQuantifierSet(
                    (Quantifier) lexemes.nextSpecial(), term, last,
                    Lexer.QUANT_ALT, ++compCount);
        }

        case Lexer.QUANT_COMP_R: {
            RelCompositeGroupQuantifierSet q = new RelCompositeGroupQuantifierSet(
                    (Quantifier) lexemes.nextSpecial(), term, last,
                    Lexer.QUANT_ALT, ++compCount);
            term.setNext(q);
            return q;
        }

        default:
            // No quantifier follows; return the bare term.
            return term;
        }
    } else {
        // Quantifier applied to a single-character (leaf) term.
        LeafSet leaf = null;
        if (term != null)
            leaf = (LeafSet) term;
        // NOTE(review): if term is null and a quantifier token follows,
        // the branches below dereference 'leaf' -- presumably unreachable
        // by construction; confirm with callers.
        switch (quant) {
        case Lexer.QUANT_STAR:
        case Lexer.QUANT_PLUS: {
            lexemes.next();
            LeafQuantifierSet q = new LeafQuantifierSet(leaf, last,
                    quant);
            leaf.setNext(q);
            return q;
        }

        case Lexer.QUANT_STAR_R:
        case Lexer.QUANT_PLUS_R: {
            lexemes.next();
            ReluctantQuantifierSet q = new ReluctantQuantifierSet(
                    leaf, last, quant);
            leaf.setNext(q);
            return q;
        }

        case Lexer.QUANT_PLUS_P:
        case Lexer.QUANT_STAR_P: {
            lexemes.next();
            PossessiveQuantifierSet q = new PossessiveQuantifierSet(
                    leaf, last, quant);
            leaf.setNext(q);
            return q;
        }

        case Lexer.QUANT_ALT: {
            lexemes.next();
            return new AltQuantifierSet(leaf, last, Lexer.QUANT_ALT);
        }

        case Lexer.QUANT_ALT_P: {
            lexemes.next();
            return new PossessiveAltQuantifierSet(leaf, last,
                    Lexer.QUANT_ALT_P);
        }

        case Lexer.QUANT_ALT_R: {
            lexemes.next();
            return new ReluctantAltQuantifierSet(leaf, last,
                    Lexer.QUANT_ALT_R);
        }

        case Lexer.QUANT_COMP: {
            return new CompositeQuantifierSet((Quantifier) lexemes
                    .nextSpecial(), leaf, last, Lexer.QUANT_COMP);
        }

        case Lexer.QUANT_COMP_P: {
            return new PossessiveCompositeQuantifierSet(
                    (Quantifier) lexemes.nextSpecial(), leaf, last,
                    Lexer.QUANT_COMP_P);
        }
        case Lexer.QUANT_COMP_R: {
            return new ReluctantCompositeQuantifierSet(
                    (Quantifier) lexemes.nextSpecial(), leaf, last,
                    Lexer.QUANT_COMP_R);
        }

        default:
            // No quantifier follows; return the bare term.
            return term;
        }
    }
}
0826:
/**
 * T-> letter|[range]|{char-class}|(E)
 *
 * Parses a single terminal of the grammar: a group "(...)", a character
 * class "[...]", an anchor, a boundary, a back reference, a special
 * char-class token, or a plain character. Inline flag tokens "(?i)" etc.
 * only update the flags and loop for the next real terminal.
 */
private AbstractSet processTerminal(AbstractSet last) {
    int ch;
    AbstractSet term = null;
    do {
        ch = lexemes.peek();
        // A '(' token carries embedded flag bits in 0x00ff0000; the mask
        // below compares only the token-identifying bits.
        if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) {
            int newFlags;
            lexemes.next();
            newFlags = (ch & 0x00ff0000) >> 16;
            ch = ch & 0xff00ffff;
            if (ch == Lexer.CHAR_FLAGS) {
                // Inline "(?flags)" -- just switch flags and continue.
                flags = newFlags;
            } else {
                // Embedded flags apply inside "(?flags:...)" only.
                newFlags = (ch == Lexer.CHAR_NONCAP_GROUP) ? newFlags
                        : flags;
                term = processExpression(ch, newFlags, last);
                if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS) {
                    throw new PatternSyntaxException(
                            Messages.getString("regex.0A"), lexemes.toString(), //$NON-NLS-1$
                            lexemes.getIndex());
                }
                lexemes.next();
            }
        } else
            switch (ch) {
            case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
                // Character class "[...]" (optionally negated by '^').
                lexemes.next();
                boolean negative = false;
                if (lexemes.peek() == Lexer.CHAR_CARET) {
                    negative = true;
                    lexemes.next();
                }

                term = processRange(negative, last);
                if (lexemes.peek() != Lexer.CHAR_RIGHT_SQUARE_BRACKET)
                    throw new PatternSyntaxException(
                            Messages.getString("regex.0B"), lexemes.toString(), //$NON-NLS-1$
                            lexemes.getIndex());
                lexemes.setMode(Lexer.MODE_PATTERN);
                lexemes.next();
                break;
            }

            case Lexer.CHAR_DOT: {
                lexemes.next();

                if (!hasFlag(Pattern.DOTALL)) {
                    term = new DotSet(AbstractLineTerminator
                            .getInstance(flags));
                } else {
                    term = new DotAllSet();
                }

                break;
            }

            case Lexer.CHAR_CARET: {
                // '^' anchor (line or input start, per MULTILINE).
                lexemes.next();
                consCount++;
                if (!hasFlag(Pattern.MULTILINE)) {
                    term = new SOLSet();
                } else {
                    term = new MultiLineSOLSet(
                            AbstractLineTerminator
                                    .getInstance(flags));
                }

                break;
            }

            case Lexer.CHAR_DOLLAR: {
                // '$' anchor; variant depends on MULTILINE/UNIX_LINES.
                lexemes.next();
                consCount++;
                if (!hasFlag(Pattern.MULTILINE)) {
                    if (!hasFlag(Pattern.UNIX_LINES)) {
                        term = new EOLSet(consCount);
                    } else {
                        term = new UEOLSet(consCount);
                    }
                } else {
                    if (!hasFlag(Pattern.UNIX_LINES)) {
                        term = new MultiLineEOLSet(consCount);
                    } else {
                        term = new UMultiLineEOLSet(consCount);
                    }
                }

                break;
            }

            case Lexer.CHAR_WORD_BOUND: {
                // "\b"
                lexemes.next();
                term = new WordBoundary(true);
                break;
            }

            case Lexer.CHAR_NONWORD_BOUND: {
                // "\B"
                lexemes.next();
                term = new WordBoundary(false);
                break;
            }

            case Lexer.CHAR_END_OF_INPUT: {
                // "\z"
                lexemes.next();
                term = new EOISet();
                break;
            }

            case Lexer.CHAR_END_OF_LINE: {
                // "\Z"
                lexemes.next();
                term = new EOLSet(++consCount);
                break;
            }

            case Lexer.CHAR_START_OF_INPUT: {
                // "\A"
                lexemes.next();
                term = new SOLSet();
                break;
            }

            case Lexer.CHAR_PREVIOUS_MATCH: {
                // "\G"
                lexemes.next();
                term = new PreviousMatch();
                break;
            }

            // Back references "\1" .. "\9" (high bit marks the token as
            // special).
            case 0x80000000 | '1':
            case 0x80000000 | '2':
            case 0x80000000 | '3':
            case 0x80000000 | '4':
            case 0x80000000 | '5':
            case 0x80000000 | '6':
            case 0x80000000 | '7':
            case 0x80000000 | '8':
            case 0x80000000 | '9': {
                int number = (ch & 0x7FFFFFFF) - '0';
                // The referenced group must already have been parsed.
                if (globalGroupIndex >= number) {
                    lexemes.next();
                    consCount++;
                    if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
                        term = new BackReferenceSet(number,
                                consCount);
                    } else if (!hasFlag(Pattern.UNICODE_CASE)) {
                        term = new CIBackReferenceSet(number,
                                consCount);
                    } else {
                        term = new UCIBackReferenceSet(number,
                                consCount);
                    }
                    (backRefs[number]).isBackReferenced = true;
                    needsBackRefReplacement = true;
                    break;
                } else {
                    throw new PatternSyntaxException(Messages
                            .getString("regex.0C") //$NON-NLS-1$
                            , lexemes.toString(), lexemes
                                    .getIndex());
                }
            }

            case 0: {
                // Token 0: either a special char-class (e.g. \d), an
                // actual NUL character, or end of input.
                AbstractCharClass cc = null;
                if ((cc = (AbstractCharClass) lexemes.peekSpecial()) != null) {
                    term = processRangeSet(cc);
                } else if (!lexemes.isEmpty()) {

                    //ch == 0
                    term = new CharSet((char) ch);
                } else {
                    term = new EmptySet(last);
                    break;
                }
                lexemes.next();
                break;
            }

            default: {
                if (ch >= 0 && !lexemes.isSpecial()) {
                    // Ordinary character (possibly supplementary).
                    term = processCharSet(ch);
                    lexemes.next();
                } else if (ch == Lexer.CHAR_VERTICAL_BAR) {
                    term = new EmptySet(last);
                } else if (ch == Lexer.CHAR_RIGHT_PARENTHESIS) {
                    if (last instanceof FinalSet) {
                        // ')' at top level has no matching '('.
                        throw new PatternSyntaxException(
                                Messages.getString("regex.09"), lexemes.toString(), //$NON-NLS-1$
                                lexemes.getIndex());
                    } else {
                        term = new EmptySet(last);
                    }
                } else {
                    throw new PatternSyntaxException(Messages
                            .getString("regex.0D", //$NON-NLS-1$
                                    (lexemes.isSpecial() ? lexemes
                                            .peekSpecial()
                                            .toString() : Character
                                            .toString((char) ch))),
                            lexemes.toString(), lexemes.getIndex());
                }
            }
            }
    } while (ch == Lexer.CHAR_FLAGS);
    return term;
}
1034:
1035: private AbstractSet processRange(boolean negative, AbstractSet last) {
1036: AbstractCharClass res = processRangeExpression(negative);
1037: AbstractSet rangeSet = processRangeSet(res);
1038: rangeSet.setNext(last);
1039:
1040: return rangeSet;
1041: }
1042:
1043: /**
1044: * process [...] ranges
1045: */
1046: private CharClass processRangeExpression(boolean alt) {
1047: CharClass res = new CharClass(alt,
1048: hasFlag(Pattern.CASE_INSENSITIVE),
1049: hasFlag(Pattern.UNICODE_CASE));
1050: int buffer = -1;
1051: boolean intersection = false;
1052: boolean notClosed = false;
1053: boolean firstInClass = true;
1054:
1055: while (!lexemes.isEmpty()
1056: && (notClosed = (lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET
1057: || firstInClass)) {
1058: switch (lexemes.peek()) {
1059:
1060: case Lexer.CHAR_RIGHT_SQUARE_BRACKET: {
1061: if (buffer >= 0)
1062: res.add(buffer);
1063: buffer = ']';
1064: lexemes.next();
1065: break;
1066: }
1067: case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
1068: if (buffer >= 0) {
1069: res.add(buffer);
1070: buffer = -1;
1071: }
1072: lexemes.next();
1073: boolean negative = false;
1074: if (lexemes.peek() == Lexer.CHAR_CARET) {
1075: lexemes.next();
1076: negative = true;
1077: }
1078:
1079: if (intersection)
1080: res.intersection(processRangeExpression(negative));
1081: else
1082: res.union(processRangeExpression(negative));
1083: intersection = false;
1084: lexemes.next();
1085: break;
1086: }
1087:
1088: case Lexer.CHAR_AMPERSAND: {
1089: if (buffer >= 0)
1090: res.add(buffer);
1091: buffer = lexemes.next();
1092:
1093: /*
1094: * if there is a start for subrange we will do an intersection
1095: * otherwise treat '&' as a normal character
1096: */
1097: if (lexemes.peek() == Lexer.CHAR_AMPERSAND) {
1098: if (lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) {
1099: lexemes.next();
1100: intersection = true;
1101: buffer = -1;
1102: } else {
1103: lexemes.next();
1104: if (firstInClass) {
1105:
1106: //skip "&&" at "[&&...]" or "[^&&...]"
1107: res = processRangeExpression(false);
1108: } else {
1109:
1110: //ignore "&&" at "[X&&]" ending where X != empty string
1111: if (!(lexemes.peek() == Lexer.CHAR_RIGHT_SQUARE_BRACKET)) {
1112: res
1113: .intersection(processRangeExpression(false));
1114: }
1115: }
1116:
1117: }
1118: } else {
1119:
1120: //treat '&' as a normal character
1121: buffer = '&';
1122: }
1123:
1124: break;
1125: }
1126:
1127: case Lexer.CHAR_HYPHEN: {
1128: if (firstInClass
1129: || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
1130: || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET
1131: || buffer < 0) {
1132: // treat hypen as normal character
1133: if (buffer >= 0)
1134: res.add(buffer);
1135: buffer = '-';
1136: lexemes.next();
1137: // range
1138: } else {
1139: lexemes.next();
1140: int cur = lexemes.peek();
1141:
1142: if (!lexemes.isSpecial()
1143: && (cur >= 0
1144: || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
1145: || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) {
1146:
1147: try {
1148: if (!Lexer.isLetter(cur)) {
1149: cur = cur & 0xFFFF;
1150: }
1151: res.add(buffer, cur);
1152: } catch (Exception e) {
1153: throw new PatternSyntaxException(Messages
1154: .getString("regex.0E"), //$NON-NLS-1$
1155: pattern(), lexemes.getIndex());
1156: }
1157: lexemes.next();
1158: buffer = -1;
1159: } else {
1160: throw new PatternSyntaxException(Messages
1161: .getString("regex.0E"), //$NON-NLS-1$
1162: pattern(), lexemes.getIndex());
1163: }
1164: }
1165:
1166: break;
1167: }
1168:
1169: case Lexer.CHAR_CARET: {
1170: if (buffer >= 0)
1171: res.add(buffer);
1172: buffer = '^';
1173: lexemes.next();
1174: break;
1175: }
1176:
1177: case 0: {
1178: if (buffer >= 0)
1179: res.add(buffer);
1180: AbstractCharClass cs = (AbstractCharClass) lexemes
1181: .peekSpecial();
1182: if (cs != null) {
1183: res.add(cs);
1184: buffer = -1;
1185: } else {
1186: buffer = 0;
1187: }
1188:
1189: lexemes.next();
1190: break;
1191: }
1192:
1193: default: {
1194: if (buffer >= 0)
1195: res.add(buffer);
1196: buffer = lexemes.next();
1197: break;
1198: }
1199: }
1200:
1201: firstInClass = false;
1202: }
1203: if (notClosed) {
1204: throw new PatternSyntaxException(Messages
1205: .getString("regex.0F"), //$NON-NLS-1$
1206: pattern(), lexemes.getIndex() - 1);
1207: }
1208: if (buffer >= 0)
1209: res.add(buffer);
1210: return res;
1211: }
1212:
1213: private AbstractSet processCharSet(int ch) {
1214: boolean isSupplCodePoint = Character
1215: .isSupplementaryCodePoint(ch);
1216:
1217: if (hasFlag(Pattern.CASE_INSENSITIVE)) {
1218:
1219: if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
1220: return new CICharSet((char) ch);
1221: } else if (hasFlag(Pattern.UNICODE_CASE) && ch > 128) {
1222: if (isSupplCodePoint) {
1223: return new UCISupplCharSet(ch);
1224: } else if (Lexer.isLowSurrogate(ch)) {
1225:
1226: //we need no UCILowSurrogateCharSet
1227: return new LowSurrogateCharSet((char) ch);
1228: } else if (Lexer.isHighSurrogate(ch)) {
1229:
1230: //we need no UCIHighSurrogateCharSet
1231: return new HighSurrogateCharSet((char) ch);
1232: } else {
1233: return new UCICharSet((char) ch);
1234: }
1235: }
1236: }
1237:
1238: if (isSupplCodePoint) {
1239: return new SupplCharSet(ch);
1240: } else if (Lexer.isLowSurrogate(ch)) {
1241: return new LowSurrogateCharSet((char) ch);
1242: } else if (Lexer.isHighSurrogate(ch)) {
1243: return new HighSurrogateCharSet((char) ch);
1244: } else {
1245: return new CharSet((char) ch);
1246: }
1247: }
1248:
1249: private AbstractSet processRangeSet(AbstractCharClass charClass) {
1250: if (charClass.hasLowHighSurrogates()) {
1251: AbstractCharClass surrogates = charClass.getSurrogates();
1252: LowHighSurrogateRangeSet lowHighSurrRangeSet = new LowHighSurrogateRangeSet(
1253: surrogates);
1254:
1255: if (charClass.mayContainSupplCodepoints()) {
1256: if (!charClass.hasUCI()) {
1257: return new CompositeRangeSet(new SupplRangeSet(
1258: charClass.getWithoutSurrogates()),
1259: lowHighSurrRangeSet);
1260: } else {
1261: return new CompositeRangeSet(new UCISupplRangeSet(
1262: charClass.getWithoutSurrogates()),
1263: lowHighSurrRangeSet);
1264: }
1265: }
1266:
1267: if (!charClass.hasUCI()) {
1268: return new CompositeRangeSet(new RangeSet(charClass
1269: .getWithoutSurrogates()), lowHighSurrRangeSet);
1270: } else {
1271: return new CompositeRangeSet(new UCIRangeSet(charClass
1272: .getWithoutSurrogates()), lowHighSurrRangeSet);
1273: }
1274: }
1275:
1276: if (charClass.mayContainSupplCodepoints()) {
1277: if (!charClass.hasUCI()) {
1278: return new SupplRangeSet(charClass);
1279: } else {
1280: return new UCISupplRangeSet(charClass);
1281: }
1282: }
1283:
1284: if (!charClass.hasUCI()) {
1285: return new RangeSet(charClass);
1286: } else {
1287: return new UCIRangeSet(charClass);
1288: }
1289: }
1290:
1291: /**
1292: * @com.intel.drl.spec_ref
1293: */
1294: public static Pattern compile(String pattern) {
1295: return compile(pattern, 0);
1296: }
1297:
1298: /*
1299: * This method do traverses of
1300: * automata to finish compilation.
1301: */
1302: private void finalizeCompile() {
1303:
1304: /*
1305: * Processing second pass
1306: */
1307: if (needsBackRefReplacement) { //|| needsReason1 || needsReason2) {
1308: start.processSecondPass();
1309: }
1310:
1311: }
1312:
1313: /**
1314: * @com.intel.drl.spec_ref
1315: */
1316: public static boolean matches(String regex, CharSequence input) {
1317: return Pattern.compile(regex).matcher(input).matches();
1318: }
1319:
1320: public static String quote(String s) {
1321: StringBuffer sb = new StringBuffer().append("\\Q"); //$NON-NLS-1$
1322: int apos = 0;
1323: int k;
1324: while ((k = s.indexOf("\\E", apos)) >= 0) { //$NON-NLS-1$
1325: sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q"); //$NON-NLS-1$
1326: apos = k + 2;
1327: }
1328:
1329: return sb.append(s.substring(apos)).append("\\E").toString(); //$NON-NLS-1$
1330: }
1331:
1332: /**
1333: * return number of groups found at compile time
1334: */
1335: int groupCount() {
1336: return globalGroupIndex;
1337: }
1338:
1339: int compCount() {
1340: return this .compCount + 1;
1341: }
1342:
1343: int consCount() {
1344: return this .consCount + 1;
1345: }
1346:
1347: /**
1348: * Returns supplementary character. At this time only for ASCII chars.
1349: */
1350: static char getSupplement(char ch) {
1351: char res = ch;
1352: if (ch >= 'a' && ch <= 'z') {
1353: res -= 32;
1354: } else if (ch >= 'A' && ch <= 'Z') {
1355: res += 32;
1356: }
1357:
1358: return res;
1359: }
1360:
1361: /**
1362: * @return true if pattern has specified flag
1363: */
1364: private boolean hasFlag(int flag) {
1365: return (flags & flag) == flag;
1366: }
1367:
1368: /**
1369: * Dismiss public constructor.
1370: *
1371: */
1372: private Pattern() {
1373: }
1374:
1375: /**
1376: * Serialization support
1377: */
1378: private void readObject(java.io.ObjectInputStream s)
1379: throws java.io.IOException, ClassNotFoundException {
1380: s.defaultReadObject();
1381: AbstractSet.counter = 1;
1382: groupIndex = -1;
1383: globalGroupIndex = -1;
1384: compCount = -1;
1385: consCount = -1;
1386: backRefs = new FSet[BACK_REF_NUMBER];
1387:
1388: compileImpl(pattern, flags);
1389:
1390: }
1391: }
|