0001: /*
0002: * $Id: Perl5Compiler.java,v 1.21 2003/11/07 20:16:25 dfs Exp $
0003: *
0004: * ====================================================================
0005: * The Apache Software License, Version 1.1
0006: *
0007: * Copyright (c) 2000 The Apache Software Foundation. All rights
0008: * reserved.
0009: *
0010: * Redistribution and use in source and binary forms, with or without
0011: * modification, are permitted provided that the following conditions
0012: * are met:
0013: *
0014: * 1. Redistributions of source code must retain the above copyright
0015: * notice, this list of conditions and the following disclaimer.
0016: *
0017: * 2. Redistributions in binary form must reproduce the above copyright
0018: * notice, this list of conditions and the following disclaimer in
0019: * the documentation and/or other materials provided with the
0020: * distribution.
0021: *
0022: * 3. The end-user documentation included with the redistribution,
0023: * if any, must include the following acknowledgment:
0024: * "This product includes software developed by the
0025: * Apache Software Foundation (http://www.apache.org/)."
0026: * Alternately, this acknowledgment may appear in the software itself,
0027: * if and wherever such third-party acknowledgments normally appear.
0028: *
0029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
0030: * must not be used to endorse or promote products derived from this
0031: * software without prior written permission. For written
0032: * permission, please contact apache@apache.org.
0033: *
0034: * 5. Products derived from this software may not be called "Apache"
0035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
0036: * name, without prior written permission of the Apache Software Foundation.
0037: *
0038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0049: * SUCH DAMAGE.
0050: * ====================================================================
0051: *
0052: * This software consists of voluntary contributions made by many
0053: * individuals on behalf of the Apache Software Foundation. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.oro.text.regex;
0059:
0060: import java.util.*;
0061:
0062: /**
0063: * The Perl5Compiler class is used to create compiled regular expressions
0064: * conforming to the Perl5 regular expression syntax. It generates
0065: * Perl5Pattern instances upon compilation to be used in conjunction
0066: * with a Perl5Matcher instance. Please see the user's guide for more
0067: * information about Perl5 regular expressions.
0068: * <p>
0069: * Perl5Compiler and Perl5Matcher are designed with the intent that
0070: * you use a separate instance of each per thread to avoid the overhead
0071: * of both synchronization and concurrent access (e.g., a match that takes
0072: * a long time in one thread will block the progress of another thread with
0073: * a shorter match). If you want to use a single instance of each
0074: * in a concurrent program, you must appropriately protect access to
0075: * the instances with critical sections. If you want to share Perl5Pattern
0076: * instances between concurrently executing instances of Perl5Matcher, you
0077: * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}.
0078: *
0079: * @version @version@
0080: * @since 1.0
0081: * @see PatternCompiler
0082: * @see MalformedPatternException
0083: * @see Perl5Pattern
0084: * @see Perl5Matcher
0085: */
0086:
0087: public final class Perl5Compiler implements PatternCompiler {
// Flag bits that the parse methods report back through their int[] flag
// out-parameters (e.g. retFlags[0]). __WORSTCASE is the "no flags set" value.
0088: private static final int __WORSTCASE = 0, __NONNULL = 0x1,
0089: __SIMPLE = 0x2, __SPSTART = 0x4, __TRYAGAIN = 0x8;
0090:
// Bit values for the pattern modifiers kept in __modifierFlags[0]. Several of
// them are also exposed as the public *_MASK constants of this class.
0091: private static final char __CASE_INSENSITIVE = 0x0001,
0092: __GLOBAL = 0x0002, __KEEP = 0x0004, __MULTILINE = 0x0008,
0093: __SINGLELINE = 0x0010, __EXTENDED = 0x0020,
0094: __READ_ONLY = 0x8000;
0095:
// Digit lookup used by __parseHex: (indexOf(c) & 15) gives the value of a
// lower- or upper-case hex digit. The trailing 'x' sits at index 32, so it is
// accepted by __parseHex and contributes the value 0.
0096: private static final String __HEX_DIGIT = "0123456789abcdef0123456789ABCDEFx";
// Cursor over the pattern text currently being compiled.
0097: private CharStringPointer __input;
// Set to true when __parseAtom emits a backreference (_REF) node.
0098: private boolean __sawBackreference;
// Single-element holder for the active modifier bits (read as __modifierFlags[0]).
0099: private char[] __modifierFlags = { 0 };
0100:
0101: // IMPORTANT: __numParentheses starts out equal to 1 during compilation.
0102: // It is always one greater than the number of parentheses encountered
0103: // so far in the regex. That is because it refers to the number of groups
0104: // to save, and the entire match is always saved (group 0)
0105: private int __numParentheses, __programSize, __cost;
0106:
0107: // When doing the second pass and actually generating code, __programSize
0108: // keeps track of the current offset.
// While __program is null the emit methods only advance __programSize and
// write nothing.
0109: private char[] __program;
0110:
0111: /** Lookup table for POSIX character class names */
0112: private static final HashMap __hashPOSIX;
0113:
// Maps each supported POSIX class name to its OpCode character, boxed as a
// Character. Note that "alnum" maps to _ALNUMC while "word" maps to _ALNUM.
0114: static {
0115: __hashPOSIX = new HashMap();
0116: __hashPOSIX.put("alnum", new Character(OpCode._ALNUMC));
0117: __hashPOSIX.put("word", new Character(OpCode._ALNUM));
0118: __hashPOSIX.put("alpha", new Character(OpCode._ALPHA));
0119: __hashPOSIX.put("blank", new Character(OpCode._BLANK));
0120: __hashPOSIX.put("cntrl", new Character(OpCode._CNTRL));
0121: __hashPOSIX.put("digit", new Character(OpCode._DIGIT));
0122: __hashPOSIX.put("graph", new Character(OpCode._GRAPH));
0123: __hashPOSIX.put("lower", new Character(OpCode._LOWER));
0124: __hashPOSIX.put("print", new Character(OpCode._PRINT));
0125: __hashPOSIX.put("punct", new Character(OpCode._PUNCT));
0126: __hashPOSIX.put("space", new Character(OpCode._SPACE));
0127: __hashPOSIX.put("upper", new Character(OpCode._UPPER));
0128: __hashPOSIX.put("xdigit", new Character(OpCode._XDIGIT));
0129: __hashPOSIX.put("ascii", new Character(OpCode._ASCII));
0130: }
0131:
0132: /**
0133: * The default mask for the {@link #compile compile} methods.
0134: * It is equal to 0.
0135: * The default behavior is for a regular expression to be case sensitive
0136: * and to not specify if it is multiline or singleline. When MULTILINE_MASK
0137: * and SINGLELINE_MASK are not defined, the <b>^</b>, <b>$</b>, and <b>.</b>
0138: * metacharacters are
0139: * interpreted according to the value of isMultiline() in Perl5Matcher.
0140: * The default behavior of Perl5Matcher is to treat the Perl5Pattern
0141: * as though MULTILINE_MASK were enabled. If isMultiline() returns false,
0142: * then the pattern is treated as though SINGLELINE_MASK were set. However,
0143: * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks
0144: * will ALWAYS override whatever behavior is specified by the setMultiline()
0145: * in Perl5Matcher.
0146: */
0147: public static final int DEFAULT_MASK = 0;
0148:
0149: /**
0150: * A mask passed as an option to the {@link #compile compile} methods
0151: * to indicate a compiled regular expression should be case insensitive.
0152: */
0153: public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE;
0154:
0155: /**
0156: * A mask passed as an option to the {@link #compile compile} methods
0157: * to indicate a compiled regular expression should treat input as having
0158: * multiple lines. This option affects the interpretation of
0159: * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
0160: * the <b>^</b> metacharacter matches at the beginning of every line,
0161: * and the <b>$</b> metacharacter matches at the end of every line.
0162: * Additionally the <b> . </b> metacharacter will not match newlines when
0163: * an expression is compiled with <b> MULTILINE_MASK </b>, which is its
0164: * default behavior.
0165: */
0166: public static final int MULTILINE_MASK = __MULTILINE;
0167:
0168: /**
0169: * A mask passed as an option to the {@link #compile compile} methods
0170: * to indicate a compiled regular expression should treat input as being
0171: * a single line. This option affects the interpretation of
0172: * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
0173: * the <b>^</b> metacharacter matches at the beginning of the input,
0174: * and the <b>$</b> metacharacter matches at the end of the input.
0175: * The <b>^</b> and <b>$</b> metacharacters will not match at the beginning
0176: * and end of lines occurring between the beginning and end of the input.
0177: * Additionally, the <b> . </b> metacharacter will match newlines when
0178: * an expression is compiled with <b> SINGLELINE_MASK </b>, unlike its
0179: * default behavior.
0180: */
0181: public static final int SINGLELINE_MASK = __SINGLELINE;
0182:
0183: /**
0184: * A mask passed as an option to the {@link #compile compile} methods
0185: * to indicate a compiled regular expression should be treated as a Perl5
0186: * extended pattern (i.e., a pattern using the <b>/x</b> modifier). This
0187: * option tells the compiler to ignore whitespace that is not backslashed or
0188: * within a character class. It also tells the compiler to treat the
0189: * <b>#</b> character as a metacharacter introducing a comment as in
0190: * Perl. In other words, the <b>#</b> character will comment out any
0191: * text in the regular expression between it and the next newline.
0192: * The intent of this option is to allow you to divide your patterns
0193: * into more readable parts. It is provided to maintain compatibility
0194: * with Perl5 regular expressions, although it will not often
0195: * make sense to use it in Java.
0196: */
0197: public static final int EXTENDED_MASK = __EXTENDED;
0198:
0199: /**
0200: * A mask passed as an option to the {@link #compile compile} methods
0201: * to indicate that the resulting Perl5Pattern should be treated as a
0202: * read only data structure by Perl5Matcher, making it safe to share
0203: * a single Perl5Pattern instance among multiple threads without needing
0204: * synchronization. Without this option, Perl5Matcher reserves the right
0205: * to store heuristic or other information in Perl5Pattern that might
0206: * accelerate future matches. When you use this option, Perl5Matcher will
0207: * not store or modify any information in a Perl5Pattern. Use this option
0208: * when you want to share a Perl5Pattern instance among multiple threads
0209: * using different Perl5Matcher instances.
0210: */
0211: public static final int READ_ONLY_MASK = __READ_ONLY;
0212:
0213: /**
0214: * Given a character string, returns a Perl5 expression that interprets
0215: * each character of the original string literally. In other words, all
0216: * special metacharacters are quoted/escaped. This method is useful for
0217: * converting user input meant for literal interpretation into a safe
0218: * regular expression representing the literal input.
0219: * <p>
0220: * In effect, this method is the analog of the Perl5 quotemeta() builtin
0221: * method.
0222: * <p>
0223: * @param expression The expression to convert.
0224: * @return A String containing a Perl5 regular expression corresponding to
0225: * a literal interpretation of the pattern.
0226: */
0227: public static final String quotemeta(char[] expression) {
0228: int ch;
0229: StringBuffer buffer;
0230:
0231: buffer = new StringBuffer(2 * expression.length);
0232: for (ch = 0; ch < expression.length; ch++) {
0233: if (!OpCode._isWordCharacter(expression[ch]))
0234: buffer.append('\\');
0235: buffer.append(expression[ch]);
0236: }
0237:
0238: return buffer.toString();
0239: }
0240:
0241: /**
0242: * Given a character string, returns a Perl5 expression that interprets
0243: * each character of the original string literally. In other words, all
0244: * special metacharacters are quoted/escaped. This method is useful for
0245: * converting user input meant for literal interpretation into a safe
0246: * regular expression representing the literal input.
0247: * <p>
0248: * In effect, this method is the analog of the Perl5 quotemeta() builtin
0249: * method.
0250: * <p>
0251: * @param pattern The pattern to convert.
0252: * @return A String containing a Perl5 regular expression corresponding to
0253: * a literal interpretation of the pattern.
0254: */
0255: public static final String quotemeta(String expression) {
0256: return quotemeta(expression.toCharArray());
0257: }
0258:
0259: private static boolean __isSimpleRepetitionOp(char ch) {
0260: return (ch == '*' || ch == '+' || ch == '?');
0261: }
0262:
0263: private static boolean __isComplexRepetitionOp(char[] ch, int offset) {
0264: if (offset < ch.length && offset >= 0)
0265: return (ch[offset] == '*' || ch[offset] == '+'
0266: || ch[offset] == '?' || (ch[offset] == '{' && __parseRepetition(
0267: ch, offset)));
0268: return false;
0269: }
0270:
0271: // determines if {\d+,\d*} is the next part of the string
0272: private static boolean __parseRepetition(char[] str, int offset) {
0273: if (str[offset] != '{')
0274: return false;
0275: ++offset;
0276:
0277: if (offset >= str.length || !Character.isDigit(str[offset]))
0278: return false;
0279:
0280: while (offset < str.length && Character.isDigit(str[offset]))
0281: ++offset;
0282:
0283: if (offset < str.length && str[offset] == ',')
0284: ++offset;
0285:
0286: while (offset < str.length && Character.isDigit(str[offset]))
0287: ++offset;
0288:
0289: if (offset >= str.length || str[offset] != '}')
0290: return false;
0291:
0292: return true;
0293: }
0294:
0295: private static int __parseHex(char[] str, int offset,
0296: int maxLength, int[] scanned) {
0297: int val = 0, index;
0298:
0299: scanned[0] = 0;
0300: while (offset < str.length && maxLength-- > 0
0301: && (index = __HEX_DIGIT.indexOf(str[offset])) != -1) {
0302: val <<= 4;
0303: val |= (index & 15);
0304: ++offset;
0305: ++scanned[0];
0306: }
0307:
0308: return val;
0309: }
0310:
0311: private static int __parseOctal(char[] str, int offset,
0312: int maxLength, int[] scanned) {
0313: int val = 0;
0314:
0315: scanned[0] = 0;
0316: while (offset < str.length && maxLength > 0
0317: && str[offset] >= '0' && str[offset] <= '7') {
0318: val <<= 3;
0319: val |= (str[offset] - '0');
0320: --maxLength;
0321: ++offset;
0322: ++scanned[0];
0323: }
0324:
0325: return val;
0326: }
0327:
0328: private static void __setModifierFlag(char[] flags, char ch) {
0329: switch (ch) {
0330: case 'i':
0331: flags[0] |= __CASE_INSENSITIVE;
0332: return;
0333: case 'g':
0334: flags[0] |= __GLOBAL;
0335: return;
0336: case 'o':
0337: flags[0] |= __KEEP;
0338: return;
0339: case 'm':
0340: flags[0] |= __MULTILINE;
0341: return;
0342: case 's':
0343: flags[0] |= __SINGLELINE;
0344: return;
0345: case 'x':
0346: flags[0] |= __EXTENDED;
0347: return;
0348: }
0349: }
0350:
0351: // Emit a specific character code.
0352: private void __emitCode(char code) {
0353:
0354: if (__program != null)
0355: __program[__programSize] = code;
0356:
0357: ++__programSize;
0358: }
0359:
0360: // Emit an operator with no arguments.
0361: // Return an offset into the __program array as a pointer to node.
0362: private int __emitNode(char operator) {
0363: int offset;
0364:
0365: offset = __programSize;
0366:
0367: if (__program == null)
0368: __programSize += 2;
0369: else {
0370: __program[__programSize++] = operator;
0371: __program[__programSize++] = OpCode._NULL_POINTER;
0372: }
0373:
0374: return offset;
0375: }
0376:
0377: // Emit an operator with arguments.
0378: // Return an offset into the __programarray as a pointer to node.
0379: private int __emitArgNode(char operator, char arg) {
0380: int offset;
0381:
0382: offset = __programSize;
0383:
0384: if (__program == null)
0385: __programSize += 3;
0386: else {
0387: __program[__programSize++] = operator;
0388: __program[__programSize++] = OpCode._NULL_POINTER;
0389: __program[__programSize++] = arg;
0390: }
0391:
0392: return offset;
0393: }
0394:
0395: // Insert an operator at a given offset.
0396: private void __programInsertOperator(char operator, int operand) {
0397: int src, dest, offset;
0398:
0399: offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0);
0400:
0401: if (__program == null) {
0402: __programSize += (2 + offset);
0403: return;
0404: }
0405:
0406: src = __programSize;
0407: __programSize += (2 + offset);
0408: dest = __programSize;
0409:
0410: while (src > operand) {
0411: --src;
0412: --dest;
0413: __program[dest] = __program[src];
0414: }
0415:
0416: __program[operand++] = operator;
0417: __program[operand++] = OpCode._NULL_POINTER;
0418:
0419: while (offset-- > 0)
0420: __program[operand++] = OpCode._NULL_POINTER;
0421:
0422: }
0423:
0424: private void __programAddTail(int current, int value) {
0425: int scan, temp, offset;
0426: if (__program == null || current == OpCode._NULL_OFFSET)
0427: return;
0428:
0429: scan = current;
0430:
0431: while (true) {
0432: temp = OpCode._getNext(__program, scan);
0433: if (temp == OpCode._NULL_OFFSET)
0434: break;
0435: scan = temp;
0436: }
0437:
0438: if (__program[scan] == OpCode._BACK)
0439: offset = scan - value;
0440: else
0441: offset = value - scan;
0442:
0443: __program[scan + 1] = (char) offset;
0444: }
0445:
0446: private void __programAddOperatorTail(int current, int value) {
0447: if (__program == null || current == OpCode._NULL_OFFSET
0448: || OpCode._opType[__program[current]] != OpCode._BRANCH)
0449: return;
0450: __programAddTail(OpCode._getNextOperator(current), value);
0451: }
0452:
0453: private char __getNextChar() {
0454: char ret, value;
0455:
0456: ret = __input._postIncrement();
0457:
0458: while (true) {
0459: value = __input._getValue();
0460:
0461: if (value == '(' && __input._getValueRelative(1) == '?'
0462: && __input._getValueRelative(2) == '#') {
0463: // Skip comments
0464: while (value != CharStringPointer._END_OF_STRING
0465: && value != ')')
0466: value = __input._increment();
0467: __input._increment();
0468: continue;
0469: }
0470:
0471: if ((__modifierFlags[0] & __EXTENDED) != 0) {
0472: if (Character.isWhitespace(value)) {
0473: __input._increment();
0474: continue;
0475: } else if (value == '#') {
0476: while (value != CharStringPointer._END_OF_STRING
0477: && value != '\n')
0478: value = __input._increment();
0479: __input._increment();
0480: continue;
0481: }
0482: }
0483:
0484: return ret;
0485: }
0486:
0487: }
0488:
0489: private int __parseAlternation(int[] retFlags)
0490: throws MalformedPatternException {
0491: int chain, offset, latest;
0492: int flags = 0;
0493: char value;
0494:
0495: retFlags[0] = __WORSTCASE;
0496:
0497: offset = __emitNode(OpCode._BRANCH);
0498:
0499: chain = OpCode._NULL_OFFSET;
0500:
0501: if (__input._getOffset() == 0) {
0502: __input._setOffset(-1);
0503: __getNextChar();
0504: } else {
0505: __input._decrement();
0506: __getNextChar();
0507: }
0508:
0509: value = __input._getValue();
0510:
0511: while (value != CharStringPointer._END_OF_STRING
0512: && value != '|' && value != ')') {
0513: flags &= ~__TRYAGAIN;
0514: latest = __parseBranch(retFlags);
0515:
0516: if (latest == OpCode._NULL_OFFSET) {
0517: if ((flags & __TRYAGAIN) != 0) {
0518: value = __input._getValue();
0519: continue;
0520: }
0521: return OpCode._NULL_OFFSET;
0522: }
0523:
0524: retFlags[0] |= (flags & __NONNULL);
0525:
0526: if (chain == OpCode._NULL_OFFSET)
0527: retFlags[0] |= (flags & __SPSTART);
0528: else {
0529: ++__cost;
0530: __programAddTail(chain, latest);
0531: }
0532: chain = latest;
0533: value = __input._getValue();
0534: }
0535:
0536: // If loop was never entered.
0537: if (chain == OpCode._NULL_OFFSET)
0538: __emitNode(OpCode._NOTHING);
0539:
0540: return offset;
0541: }
0542:
// Parses a single atom at the current input position and emits the node(s)
// for it.
//
// The first switch handles the metacharacters: anchors (^ $), '.', a
// character class ('['), a parenthesized group ('('), and the backslash
// escapes that have their own opcode (\A \G \Z \w \W \b \B \s \S \d \D and
// backreferences). Everything else sets doDefault, and the second half of
// the method then collects a run of literal characters into one _EXACTLY
// node.
//
// retFlags[0] is reset to __WORSTCASE and then receives __NONNULL, __SIMPLE,
// __SPSTART and/or __TRYAGAIN as set by the individual cases below.
//
// Returns the offset of the emitted node, or OpCode._NULL_OFFSET when a
// nested __parseExpression returned a null offset or when '|' / ')' is met
// while __TRYAGAIN is set in the local flags.
//
// Throws MalformedPatternException for an unexpected '|' or ')', for a
// quantifier (? + *) that follows nothing, for a backreference to a group
// number that is not below __numParentheses, and for a trailing backslash.
0543: private int __parseAtom(int[] retFlags)
0544: throws MalformedPatternException {
0545: boolean doDefault;
0546: char value;
0547: int offset, flags[] = { 0 };
0548:
0549: retFlags[0] = __WORSTCASE;
0550: doDefault = false;
0551: offset = OpCode._NULL_OFFSET;
0552:
// Loops only when a case below issues "continue tryAgain"; every other path
// leaves through "break tryAgain", a return or a throw.
0553: tryAgain: while (true) {
0554:
0555: value = __input._getValue();
0556:
0557: switch (value) {
0558: case '^':
0559: __getNextChar();
0560: // The order here is important in order to support /ms.
0561: // /m takes precedence over /s for ^ and $, but not for .
0562: if ((__modifierFlags[0] & __MULTILINE) != 0)
0563: offset = __emitNode(OpCode._MBOL);
0564: else if ((__modifierFlags[0] & __SINGLELINE) != 0)
0565: offset = __emitNode(OpCode._SBOL);
0566: else
0567: offset = __emitNode(OpCode._BOL);
0568: break tryAgain;
0569:
0570: case '$':
0571: __getNextChar();
0572: // The order here is important in order to support /ms.
0573: // /m takes precedence over /s for ^ and $, but not for .
0574: if ((__modifierFlags[0] & __MULTILINE) != 0)
0575: offset = __emitNode(OpCode._MEOL);
0576: else if ((__modifierFlags[0] & __SINGLELINE) != 0)
0577: offset = __emitNode(OpCode._SEOL);
0578: else
0579: offset = __emitNode(OpCode._EOL);
0580: break tryAgain;
0581:
0582: case '.':
0583: __getNextChar();
0584: // The order here is important in order to support /ms.
0585: // /m takes precedence over /s for ^ and $, but not for .
0586: if ((__modifierFlags[0] & __SINGLELINE) != 0)
0587: offset = __emitNode(OpCode._SANY);
0588: else
0589: offset = __emitNode(OpCode._ANY);
0590: ++__cost;
0591: retFlags[0] |= (__NONNULL | __SIMPLE);
0592: break tryAgain;
0593:
0594: case '[':
// Step over the '[' with a plain increment (not __getNextChar) before
// handing over to the class parser.
0595: __input._increment();
0596: offset = __parseUnicodeClass();
0597: retFlags[0] |= (__NONNULL | __SIMPLE);
0598: break tryAgain;
0599:
0600: case '(':
0601: __getNextChar();
0602: offset = __parseExpression(true, flags);
0603: if (offset == OpCode._NULL_OFFSET) {
// A null offset together with __TRYAGAIN means no node was produced for the
// group but parsing should resume at the new input position.
0604: if ((flags[0] & __TRYAGAIN) != 0)
0605: continue tryAgain;
0606: return OpCode._NULL_OFFSET;
0607: }
0608: retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART));
0609: break tryAgain;
0610:
0611: case '|':
0612: case ')':
0613: if ((flags[0] & __TRYAGAIN) != 0) {
0614: retFlags[0] |= __TRYAGAIN;
0615: return OpCode._NULL_OFFSET;
0616: }
0617:
0618: throw new MalformedPatternException(
0619: "Error in expression at "
0620: + __input._toString(__input
0621: ._getOffset()));
0622: //break tryAgain;
0623:
0624: case '?':
0625: case '+':
0626: case '*':
0627: throw new MalformedPatternException(
0628: "?+* follows nothing in expression");
0629: //break tryAgain;
0630:
0631: case '\\':
0632: value = __input._increment();
0633:
0634: switch (value) {
0635: case 'A':
0636: offset = __emitNode(OpCode._SBOL);
0637: retFlags[0] |= __SIMPLE;
0638: __getNextChar();
0639: break;
0640: case 'G':
0641: offset = __emitNode(OpCode._GBOL);
0642: retFlags[0] |= __SIMPLE;
0643: __getNextChar();
0644: break;
0645: case 'Z':
0646: offset = __emitNode(OpCode._SEOL);
0647: retFlags[0] |= __SIMPLE;
0648: __getNextChar();
0649: break;
0650: case 'w':
0651: offset = __emitNode(OpCode._ALNUM);
0652: retFlags[0] |= (__NONNULL | __SIMPLE);
0653: __getNextChar();
0654: break;
0655: case 'W':
0656: offset = __emitNode(OpCode._NALNUM);
0657: retFlags[0] |= (__NONNULL | __SIMPLE);
0658: __getNextChar();
0659: break;
0660: case 'b':
0661: offset = __emitNode(OpCode._BOUND);
0662: retFlags[0] |= __SIMPLE;
0663: __getNextChar();
0664: break;
0665: case 'B':
0666: offset = __emitNode(OpCode._NBOUND);
0667: retFlags[0] |= __SIMPLE;
0668: __getNextChar();
0669: break;
0670: case 's':
0671: offset = __emitNode(OpCode._SPACE);
0672: retFlags[0] |= (__NONNULL | __SIMPLE);
0673: __getNextChar();
0674: break;
0675: case 'S':
0676: offset = __emitNode(OpCode._NSPACE);
0677: retFlags[0] |= (__NONNULL | __SIMPLE);
0678: __getNextChar();
0679: break;
0680: case 'd':
0681: offset = __emitNode(OpCode._DIGIT);
0682: retFlags[0] |= (__NONNULL | __SIMPLE);
0683: __getNextChar();
0684: break;
0685: case 'D':
0686: offset = __emitNode(OpCode._NDIGIT);
0687: retFlags[0] |= (__NONNULL | __SIMPLE);
0688: __getNextChar();
0689: break;
// These escapes denote a literal character; they are decoded by the
// literal-run code under doDefault below.
0690: case 'n':
0691: case 'r':
0692: case 't':
0693: case 'f':
0694: case 'e':
0695: case 'a':
0696: case 'x':
0697: case 'c':
0698: case '0':
0699: doDefault = true;
0700: break tryAgain;
0701: case '1':
0702: case '2':
0703: case '3':
0704: case '4':
0705: case '5':
0706: case '6':
0707: case '7':
0708: case '8':
0709: case '9':
0710: int num;
0711: StringBuffer buffer = new StringBuffer(10);
0712:
// Collect all consecutive digits by look-ahead; the input cursor itself is
// not moved here.
0713: num = 0;
0714: value = __input._getValueRelative(num);
0715:
0716: while (Character.isDigit(value)) {
0717: buffer.append(value);
0718: ++num;
0719: value = __input._getValueRelative(num);
0720: }
0721:
0722: try {
0723: num = Integer.parseInt(buffer.toString());
0724: } catch (NumberFormatException e) {
0725: throw new MalformedPatternException(
0726: "Unexpected number format exception. Please report this bug."
0727: + "NumberFormatException message: "
0728: + e.getMessage());
0729: }
0730:
// A multi-digit number that does not name an already opened group is not a
// backreference; it is left to the literal-run code (octal handling) below.
0731: if (num > 9 && num >= __numParentheses) {
0732: doDefault = true;
0733: break tryAgain;
0734: } else {
0735: // A backreference may only occur AFTER its group
0736: if (num >= __numParentheses)
0737: throw new MalformedPatternException(
0738: "Invalid backreference: \\" + num);
0739: __sawBackreference = true;
0740: offset = __emitArgNode(OpCode._REF, (char) num);
0741: retFlags[0] |= __NONNULL;
0742:
// Now actually consume the digits, then back up one and let __getNextChar
// re-advance so trailing comments/whitespace are skipped.
0743: value = __input._getValue();
0744: while (Character.isDigit(value))
0745: value = __input._increment();
0746:
0747: __input._decrement();
0748: __getNextChar();
0749: }
0750: break;
0751: case '\0':
0752: case CharStringPointer._END_OF_STRING:
0753: if (__input._isAtEnd())
0754: throw new MalformedPatternException(
0755: "Trailing \\ in expression.");
0756:
0757: // fall through to default
0758: default:
0759: doDefault = true;
0760: break tryAgain;
0761: }
0762: break tryAgain;
0763:
0764: case '#':
0765: // skip over comments
0766: if ((__modifierFlags[0] & __EXTENDED) != 0) {
0767: while (!__input._isAtEnd()
0768: && __input._getValue() != '\n')
0769: __input._increment();
0770: if (!__input._isAtEnd())
0771: continue tryAgain;
0772: }
0773: // fall through to default
0774: default:
0775: __input._increment();
0776: doDefault = true;
0777: break tryAgain;
0778: }// end master switch
0779: } // end tryAgain
0780:
// Literal run: emit an _EXACTLY node whose payload is bracketed by
// _END_OF_STRING markers and whose operand slot receives the character count
// once the run has been collected.
0781: if (doDefault) {
0782: char ender;
0783: int length, pOffset, maxOffset, lastOffset, numLength[];
0784:
0785: offset = __emitNode(OpCode._EXACTLY);
0786: // Not sure that it's ok to use 0 to mark end.
0787: //__emitCode((char)0);
0788: __emitCode((char) CharStringPointer._END_OF_STRING);
0789:
// Scanning starts one position before the current input offset and uses the
// private index pOffset; the run is capped at 127 characters.
0790: forLoop: for (length = 0, pOffset = __input._getOffset() - 1, maxOffset = __input
0791: ._getLength(); length < 127 && pOffset < maxOffset; ++length) {
0792:
// Remember where this character started so the scan can back up to it if a
// quantifier turns out to follow.
0793: lastOffset = pOffset;
0794: value = __input._getValue(pOffset);
0795:
0796: switch (value) {
// A metacharacter ends the literal run.
0797: case '^':
0798: case '$':
0799: case '.':
0800: case '[':
0801: case '(':
0802: case ')':
0803: case '|':
0804: break forLoop;
0805: case '\\':
0806: value = __input._getValue(++pOffset);
0807:
0808: switch (value) {
// Escapes with their own opcode end the run; pOffset is moved back onto the
// backslash so the first switch above sees the escape next time.
0809: case 'A':
0810: case 'G':
0811: case 'Z':
0812: case 'w':
0813: case 'W':
0814: case 'b':
0815: case 'B':
0816: case 's':
0817: case 'S':
0818: case 'd':
0819: case 'D':
0820: --pOffset;
0821: break forLoop;
0822: case 'n':
0823: ender = '\n';
0824: ++pOffset;
0825: break;
0826: case 'r':
0827: ender = '\r';
0828: ++pOffset;
0829: break;
0830: case 't':
0831: ender = '\t';
0832: ++pOffset;
0833: break;
0834: case 'f':
0835: ender = '\f';
0836: ++pOffset;
0837: break;
0838: case 'e':
0839: ender = '\033';
0840: ++pOffset;
0841: break;
0842: case 'a':
0843: ender = '\007';
0844: ++pOffset;
0845: break;
0846: case 'x':
// \x followed by at most two hex digits.
0847: numLength = new int[1];
0848: ender = (char) __parseHex(__input._array,
0849: ++pOffset, 2, numLength);
0850: pOffset += numLength[0];
0851: break;
0852: case 'c':
// \cX: the character after 'c' is upper-cased if it is lower case and then
// has bit 64 toggled.
0853: ++pOffset;
0854: ender = __input._getValue(pOffset++);
0855: if (Character.isLowerCase(ender))
0856: ender = Character.toUpperCase(ender);
0857: ender ^= 64;
0858: break;
0859: case '0':
0860: case '1':
0861: case '2':
0862: case '3':
0863: case '4':
0864: case '5':
0865: case '6':
0866: case '7':
0867: case '8':
0868: case '9':
// Decide between an octal escape and a backreference: a leading '0' is
// always octal; otherwise a multi-digit number is octal only when it is not
// below __numParentheses. A single non-zero digit is never octal here.
0869: boolean doOctal = false;
0870: value = __input._getValue(pOffset);
0871:
0872: if (value == '0')
0873: doOctal = true;
0874: value = __input._getValue(pOffset + 1);
0875:
0876: if (Character.isDigit(value)) {
0877: int num;
0878: StringBuffer buffer = new StringBuffer(10);
0879:
0880: num = pOffset;
0881: value = __input._getValue(num);
0882:
0883: while (Character.isDigit(value)) {
0884: buffer.append(value);
0885: ++num;
0886: value = __input._getValue(num);
0887: }
0888:
0889: try {
0890: num = Integer.parseInt(buffer
0891: .toString());
0892: } catch (NumberFormatException e) {
0893: throw new MalformedPatternException(
0894: "Unexpected number format exception. Please report this bug."
0895: + "NumberFormatException message: "
0896: + e.getMessage());
0897: }
0898:
0899: if (!doOctal)
0900: doOctal = (num >= __numParentheses);
0901: }
0902:
0903: if (doOctal) {
// At most three octal digits are consumed.
0904: numLength = new int[1];
0905: ender = (char) __parseOctal(__input._array,
0906: pOffset, 3, numLength);
0907: pOffset += numLength[0];
0908: } else {
// Backreference: end the run with pOffset back on the backslash.
0909: --pOffset;
0910: break forLoop;
0911: }
0912: break;
0913: case CharStringPointer._END_OF_STRING:
0914: case '\0':
0915: if (pOffset >= maxOffset)
0916: throw new MalformedPatternException(
0917: "Trailing \\ in expression.");
0918: // fall through to default
0919: default:
// Any other escaped character stands for itself.
0920: ender = __input._getValue(pOffset++);
0921: break;
0922: } // end backslash switch
0923: break;
0924: case '#':
0925: if ((__modifierFlags[0] & __EXTENDED) != 0) {
0926: while (pOffset < maxOffset
0927: && __input._getValue(pOffset) != '\n')
0928: ++pOffset;
0929: }
0930: // fall through to whitespace handling
0931: case ' ':
0932: case '\t':
0933: case '\n':
0934: case '\r':
0935: case '\f':
0936: case '\013':
0937: if ((__modifierFlags[0] & __EXTENDED) != 0) {
// Skipped character: --length cancels the ++length of the for loop so the
// skipped character is not counted.
0938: ++pOffset;
0939: --length;
0940: continue;
0941: }
0942: // fall through to default
0943: default:
0944: ender = __input._getValue(pOffset++);
0945: break;
0946:
0947: } // end master switch
0948:
// Under case-insensitive compilation, upper-case literal characters are
// stored in lower case.
0949: if ((__modifierFlags[0] & __CASE_INSENSITIVE) != 0
0950: && Character.isUpperCase(ender))
0951: ender = Character.toLowerCase(ender);
0952:
// A quantifier follows this character. If earlier characters are already in
// the run, back up so this character is left out of it; otherwise emit it as
// a one-character run. Either way the run ends here.
0953: if (pOffset < maxOffset
0954: && __isComplexRepetitionOp(__input._array,
0955: pOffset)) {
0956: if (length > 0)
0957: pOffset = lastOffset;
0958: else {
0959: ++length;
0960: __emitCode(ender);
0961: }
0962: break;
0963: }
0964:
0965: __emitCode(ender);
0966:
0967: } // end for loop
0968:
// Re-synchronize the input cursor with pOffset; going through __getNextChar
// also skips any comments/whitespace that follow.
0969: __input._setOffset(pOffset - 1);
0970: __getNextChar();
0971:
0972: if (length < 0)
0973: throw new MalformedPatternException(
0974: "Unexpected compilation failure. Please report this bug!");
0975: if (length > 0)
0976: retFlags[0] |= __NONNULL;
0977: if (length == 1)
0978: retFlags[0] |= __SIMPLE;
// Store the run length in the node's operand slot (only when a program
// buffer exists).
0979: if (__program != null)
0980: __program[OpCode._getOperand(offset)] = (char) length;
0981: //__emitCode('\0'); // debug
0982: __emitCode(CharStringPointer._END_OF_STRING);
0983: }
0984:
0985: return offset;
0986: }
0987:
0988: // These are the original 8-bit character class handling methods.
0989: // We don't want to delete them just yet only to have to dig it out
0990: // of revision control later.
0991: /*
0992: // Set the bits in a character class. Only recognizes ascii.
0993: private void __setCharacterClassBits(char[] bits, int offset, char deflt,
0994: char ch)
0995: {
0996: if(__program== null || ch >= 256)
0997: return;
0998: ch &= 0xffff;
0999:
1000: if(deflt == 0) {
1001: bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
1002: } else {
1003: bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf));
1004: }
1005: }
1006:
1007: private int __parseCharacterClass() throws MalformedPatternException {
1008: boolean range = false, skipTest;
1009: char clss, deflt, lastclss = Character.MAX_VALUE;
1010: int offset, bits, numLength[] = { 0 };
1011:
1012: offset = __emitNode(OpCode._ANYOF);
1013:
1014: if(__input._getValue() == '^') {
1015: ++__cost;
1016: __input._increment();
1017: deflt = 0;
1018: } else {
1019: deflt = 0xffff;
1020: }
1021:
1022: bits = __programSize;
1023: for(clss = 0; clss < 16; clss++)
1024: __emitCode(deflt);
1025:
1026: clss = __input._getValue();
1027:
1028: if(clss == ']' || clss == '-')
1029: skipTest = true;
1030: else
1031: skipTest = false;
1032:
1033: while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
1034: || skipTest) {
1035: // It sucks, but we have to make this assignment every time
1036: skipTest = false;
1037: __input._increment();
1038: if(clss == '\\') {
1039: clss = __input._postIncrement();
1040:
1041: switch(clss){
1042: case 'w':
1043: for(clss = 0; clss < 256; clss++)
1044: if(OpCode._isWordCharacter(clss))
1045: __setCharacterClassBits(__program, bits, deflt, clss);
1046: lastclss = Character.MAX_VALUE;
1047: continue;
1048: case 'W':
1049: for(clss = 0; clss < 256; clss++)
1050: if(!OpCode._isWordCharacter(clss))
1051: __setCharacterClassBits(__program, bits, deflt, clss);
1052: lastclss = Character.MAX_VALUE;
1053: continue;
1054: case 's':
1055: for(clss = 0; clss < 256; clss++)
1056: if(Character.isWhitespace(clss))
1057: __setCharacterClassBits(__program, bits, deflt, clss);
1058: lastclss = Character.MAX_VALUE;
1059: continue;
1060: case 'S':
1061: for(clss = 0; clss < 256; clss++)
1062: if(!Character.isWhitespace(clss))
1063: __setCharacterClassBits(__program, bits, deflt, clss);
1064: lastclss = Character.MAX_VALUE;
1065: continue;
1066: case 'd':
1067: for(clss = '0'; clss <= '9'; clss++)
1068: __setCharacterClassBits(__program, bits, deflt, clss);
1069: lastclss = Character.MAX_VALUE;
1070: continue;
1071: case 'D':
1072: for(clss = 0; clss < '0'; clss++)
1073: __setCharacterClassBits(__program, bits, deflt, clss);
1074: for(clss = (char)('9' + 1); clss < 256; clss++)
1075: __setCharacterClassBits(__program, bits, deflt, clss);
1076: lastclss = Character.MAX_VALUE;
1077: continue;
1078: case 'n':
1079: clss = '\n';
1080: break;
1081: case 'r':
1082: clss = '\r';
1083: break;
1084: case 't':
1085: clss = '\t';
1086: break;
1087: case 'f':
1088: clss = '\f';
1089: break;
1090: case 'b':
1091: clss = '\b';
1092: break;
1093: case 'e':
1094: clss = '\033';
1095: break;
1096: case 'a':
1097: clss = '\007';
1098: break;
1099: case 'x':
1100: clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
1101: numLength);
1102: __input._increment(numLength[0]);
1103: break;
1104: case 'c':
1105: clss = __input._postIncrement();
1106: if(Character.isLowerCase(clss))
1107: clss = Character.toUpperCase(clss);
1108: clss ^= 64;
1109: break;
1110: case '0': case '1': case '2': case '3': case '4':
1111: case '5': case '6': case '7': case '8': case '9':
1112: clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
1113: 3, numLength);
1114: __input._increment(numLength[0] - 1);
1115: break;
1116: }
1117: }
1118:
1119: if(range) {
1120: if(lastclss > clss)
1121: throw new MalformedPatternException(
1122: "Invalid [] range in expression.");
1123: range = false;
1124: } else {
1125: lastclss = clss;
1126:
1127: if(__input._getValue() == '-' &&
1128: __input._getOffset() + 1 < __input._getLength() &&
1129: __input._getValueRelative(1) != ']') {
1130: __input._increment();
1131: range = true;
1132: continue;
1133: }
1134: }
1135:
1136: while(lastclss <= clss) {
1137: __setCharacterClassBits(__program, bits, deflt, lastclss);
1138: if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
1139: Character.isUpperCase(lastclss))
1140: __setCharacterClassBits(__program, bits, deflt,
1141: Character.toLowerCase(lastclss));
1142:
1143: ++lastclss;
1144: }
1145:
1146: lastclss = clss;
1147: }
1148:
1149: if(__input._getValue() != ']')
1150: throw new MalformedPatternException("Unmatched [] in expression.");
1151:
1152: __getNextChar();
1153:
1154: return offset;
1155: }
1156: */
1157:
1158: private int __parseUnicodeClass() throws MalformedPatternException {
1159: boolean range = false, skipTest;
1160: char clss, lastclss = Character.MAX_VALUE;
1161:
1162: int offset, numLength[] = { 0 };
1163: boolean negFlag[] = { false };
1164: boolean opcodeFlag; /* clss isn't character when this flag true. */
1165:
1166: if (__input._getValue() == '^') {
1167: offset = __emitNode(OpCode._NANYOFUN);
1168: __input._increment();
1169: } else {
1170: offset = __emitNode(OpCode._ANYOFUN);
1171: }
1172:
1173: clss = __input._getValue();
1174:
1175: if (clss == ']' || clss == '-')
1176: skipTest = true;
1177: else
1178: skipTest = false;
1179:
1180: while ((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
1181: || skipTest) {
1182: // It sucks, but we have to make this assignment every time
1183: skipTest = false;
1184: opcodeFlag = false;
1185: __input._increment();
1186:
1187: if (clss == '\\' || clss == '[') {
1188: if (clss == '\\') {
1189: /* character is escaped */
1190: clss = __input._postIncrement();
1191: } else {
1192: /* try POSIX expression */
1193: char posixOpCode = __parsePOSIX(negFlag);
1194: if (posixOpCode != 0) {
1195: opcodeFlag = true;
1196: clss = posixOpCode;
1197: }
1198: }
1199: if (opcodeFlag != true) {
1200: switch (clss) {
1201: case 'w':
1202: opcodeFlag = true;
1203: clss = OpCode._ALNUM;
1204: lastclss = Character.MAX_VALUE;
1205: break;
1206: case 'W':
1207: opcodeFlag = true;
1208: clss = OpCode._NALNUM;
1209: lastclss = Character.MAX_VALUE;
1210: break;
1211: case 's':
1212: opcodeFlag = true;
1213: clss = OpCode._SPACE;
1214: lastclss = Character.MAX_VALUE;
1215: break;
1216: case 'S':
1217: opcodeFlag = true;
1218: clss = OpCode._NSPACE;
1219: lastclss = Character.MAX_VALUE;
1220: break;
1221: case 'd':
1222: opcodeFlag = true;
1223: clss = OpCode._DIGIT;
1224: lastclss = Character.MAX_VALUE;
1225: break;
1226: case 'D':
1227: opcodeFlag = true;
1228: clss = OpCode._NDIGIT;
1229: lastclss = Character.MAX_VALUE;
1230: break;
1231: case 'n':
1232: clss = '\n';
1233: break;
1234: case 'r':
1235: clss = '\r';
1236: break;
1237: case 't':
1238: clss = '\t';
1239: break;
1240: case 'f':
1241: clss = '\f';
1242: break;
1243: case 'b':
1244: clss = '\b';
1245: break;
1246: case 'e':
1247: clss = '\033';
1248: break;
1249: case 'a':
1250: clss = '\007';
1251: break;
1252: case 'x':
1253: clss = (char) __parseHex(__input._array,
1254: __input._getOffset(), 2, numLength);
1255: __input._increment(numLength[0]);
1256: break;
1257: case 'c':
1258: clss = __input._postIncrement();
1259: if (Character.isLowerCase(clss))
1260: clss = Character.toUpperCase(clss);
1261: clss ^= 64;
1262: break;
1263: case '0':
1264: case '1':
1265: case '2':
1266: case '3':
1267: case '4':
1268: case '5':
1269: case '6':
1270: case '7':
1271: case '8':
1272: case '9':
1273: clss = (char) __parseOctal(__input._array,
1274: __input._getOffset() - 1, 3, numLength);
1275: __input._increment(numLength[0] - 1);
1276: break;
1277: default:
1278: break;
1279: }
1280: }
1281: }
1282:
1283: if (range) {
1284: if (lastclss > clss)
1285: throw new MalformedPatternException(
1286: "Invalid [] range in expression.");
1287: range = false;
1288: } else {
1289: lastclss = clss;
1290:
1291: if (opcodeFlag == false
1292: && __input._getValue() == '-'
1293: && __input._getOffset() + 1 < __input
1294: ._getLength()
1295: && __input._getValueRelative(1) != ']') {
1296: __input._increment();
1297: range = true;
1298: continue;
1299: }
1300: }
1301:
1302: if (lastclss == clss) {
1303: if (opcodeFlag == true) {
1304: if (negFlag[0] == false)
1305: __emitCode(OpCode._OPCODE);
1306: else
1307: __emitCode(OpCode._NOPCODE);
1308: } else
1309: __emitCode(OpCode._ONECHAR);
1310:
1311: __emitCode(clss);
1312:
1313: if ((__modifierFlags[0] & __CASE_INSENSITIVE) != 0
1314: && Character.isUpperCase(clss)
1315: && Character.isUpperCase(lastclss)) {
1316: __programSize--;
1317: __emitCode(Character.toLowerCase(clss));
1318: }
1319: }
1320:
1321: if (lastclss < clss) {
1322: __emitCode(OpCode._RANGE);
1323: __emitCode(lastclss);
1324: __emitCode(clss);
1325:
1326: if ((__modifierFlags[0] & __CASE_INSENSITIVE) != 0
1327: && Character.isUpperCase(clss)
1328: && Character.isUpperCase(lastclss)) {
1329: __programSize -= 2;
1330: __emitCode(Character.toLowerCase(lastclss));
1331: __emitCode(Character.toLowerCase(clss));
1332:
1333: }
1334:
1335: lastclss = Character.MAX_VALUE;
1336: range = false;
1337: }
1338:
1339: lastclss = clss;
1340: }
1341:
1342: if (__input._getValue() != ']')
1343: throw new MalformedPatternException(
1344: "Unmatched [] in expression.");
1345:
1346: __getNextChar();
1347: __emitCode(OpCode._END);
1348:
1349: return offset;
1350: }
1351:
1352: /**
1353: * Parse POSIX epxression like [:foo:].
1354: *
1355: * @return OpCode. return 0 when fail parsing POSIX expression.
1356: */
1357: private char __parsePOSIX(boolean negFlag[])
1358: throws MalformedPatternException {
1359: int offset = __input._getOffset();
1360: int len = __input._getLength();
1361: int pos = offset;
1362: char value = __input._getValue(pos++);
1363: StringBuffer buf;
1364: Object opcode;
1365:
1366: if (value != ':')
1367: return 0;
1368: if (__input._getValue(pos) == '^') {
1369: negFlag[0] = true;
1370: pos++;
1371: } else {
1372: negFlag[0] = false;
1373: }
1374:
1375: buf = new StringBuffer();
1376:
1377: try {
1378: while ((value = __input._getValue(pos++)) != ':'
1379: && pos < len) {
1380: buf.append(value);
1381: }
1382: } catch (Exception e) {
1383: return 0;
1384: }
1385:
1386: if (__input._getValue(pos++) != ']') {
1387: return 0;
1388: }
1389:
1390: opcode = __hashPOSIX.get(buf.toString());
1391:
1392: if (opcode == null)
1393: return 0;
1394:
1395: __input._setOffset(pos);
1396:
1397: return ((Character) opcode).charValue();
1398: }
1399:
1400: private int __parseBranch(int[] retFlags)
1401: throws MalformedPatternException {
1402: boolean nestCheck = false, handleRepetition = false;
1403: int offset, next, min, max, flags[] = { 0 };
1404: char operator, value;
1405:
1406: min = 0;
1407: max = Character.MAX_VALUE;
1408: offset = __parseAtom(flags);
1409:
1410: if (offset == OpCode._NULL_OFFSET) {
1411: if ((flags[0] & __TRYAGAIN) != 0)
1412: retFlags[0] |= __TRYAGAIN;
1413: return OpCode._NULL_OFFSET;
1414: }
1415:
1416: operator = __input._getValue();
1417:
1418: if (operator == '(' && __input._getValueRelative(1) == '?'
1419: && __input._getValueRelative(2) == '#') {
1420: while (operator != CharStringPointer._END_OF_STRING
1421: && operator != ')')
1422: operator = __input._increment();
1423:
1424: if (operator != CharStringPointer._END_OF_STRING) {
1425: __getNextChar();
1426: operator = __input._getValue();
1427: }
1428: }
1429:
1430: if (operator == '{'
1431: && __parseRepetition(__input._array, __input
1432: ._getOffset())) {
1433: int maxOffset, pos;
1434:
1435: next = __input._getOffset() + 1;
1436: pos = maxOffset = __input._getLength();
1437:
1438: value = __input._getValue(next);
1439:
1440: while (Character.isDigit(value) || value == ',') {
1441: if (value == ',') {
1442: if (pos != maxOffset)
1443: break;
1444: else
1445: pos = next;
1446: }
1447: ++next;
1448: value = __input._getValue(next);
1449: }
1450:
1451: if (value == '}') {
1452: int num;
1453: StringBuffer buffer = new StringBuffer(10);
1454:
1455: if (pos == maxOffset)
1456: pos = next;
1457: __input._increment();
1458:
1459: num = __input._getOffset();
1460: value = __input._getValue(num);
1461:
1462: while (Character.isDigit(value)) {
1463: buffer.append(value);
1464: ++num;
1465: value = __input._getValue(num);
1466: }
1467:
1468: try {
1469: min = Integer.parseInt(buffer.toString());
1470: } catch (NumberFormatException e) {
1471: throw new MalformedPatternException(
1472: "Unexpected number format exception. Please report this bug."
1473: + "NumberFormatException message: "
1474: + e.getMessage());
1475: }
1476:
1477: value = __input._getValue(pos);
1478: if (value == ',')
1479: ++pos;
1480: else
1481: pos = __input._getOffset();
1482:
1483: num = pos;
1484: buffer = new StringBuffer(10);
1485:
1486: value = __input._getValue(num);
1487:
1488: while (Character.isDigit(value)) {
1489: buffer.append(value);
1490: ++num;
1491: value = __input._getValue(num);
1492: }
1493:
1494: try {
1495: if (num != pos)
1496: max = Integer.parseInt(buffer.toString());
1497: } catch (NumberFormatException e) {
1498: throw new MalformedPatternException(
1499: "Unexpected number format exception. Please report this bug."
1500: + "NumberFormatException message: "
1501: + e.getMessage());
1502: }
1503:
1504: if (max == 0 && __input._getValue(pos) != '0')
1505: max = Character.MAX_VALUE;
1506: __input._setOffset(next);
1507: __getNextChar();
1508:
1509: nestCheck = true;
1510: handleRepetition = true;
1511: }
1512: }
1513:
1514: if (!nestCheck) {
1515: handleRepetition = false;
1516:
1517: if (!__isSimpleRepetitionOp(operator)) {
1518: retFlags[0] = flags[0];
1519: return offset;
1520: }
1521:
1522: __getNextChar();
1523:
1524: retFlags[0] = ((operator != '+') ? (__WORSTCASE | __SPSTART)
1525: : (__WORSTCASE | __NONNULL));
1526:
1527: if (operator == '*' && ((flags[0] & __SIMPLE) != 0)) {
1528: __programInsertOperator(OpCode._STAR, offset);
1529: __cost += 4;
1530: } else if (operator == '*') {
1531: min = 0;
1532: handleRepetition = true;
1533: } else if (operator == '+' && (flags[0] & __SIMPLE) != 0) {
1534: __programInsertOperator(OpCode._PLUS, offset);
1535: __cost += 3;
1536: } else if (operator == '+') {
1537: min = 1;
1538: handleRepetition = true;
1539: } else if (operator == '?') {
1540: min = 0;
1541: max = 1;
1542: handleRepetition = true;
1543: }
1544: }
1545:
1546: if (handleRepetition) {
1547:
1548: // handle repetition
1549: if ((flags[0] & __SIMPLE) != 0) {
1550: __cost += ((2 + __cost) / 2);
1551: __programInsertOperator(OpCode._CURLY, offset);
1552: } else {
1553: __cost += (4 + __cost);
1554: __programAddTail(offset, __emitNode(OpCode._WHILEM));
1555: __programInsertOperator(OpCode._CURLYX, offset);
1556: __programAddTail(offset, __emitNode(OpCode._NOTHING));
1557: }
1558:
1559: if (min > 0)
1560: retFlags[0] = (__WORSTCASE | __NONNULL);
1561:
1562: if (max != 0 && max < min)
1563: throw new MalformedPatternException(
1564: "Invalid interval {" + min + "," + max + "}");
1565:
1566: if (__program != null) {
1567: __program[offset + 2] = (char) min;
1568: __program[offset + 3] = (char) max;
1569: }
1570: }
1571:
1572: if (__input._getValue() == '?') {
1573: __getNextChar();
1574: __programInsertOperator(OpCode._MINMOD, offset);
1575: __programAddTail(offset, offset + 2);
1576: }
1577:
1578: if (__isComplexRepetitionOp(__input._array, __input
1579: ._getOffset()))
1580: throw new MalformedPatternException(
1581: "Nested repetitions *?+ in expression");
1582:
1583: return offset;
1584: }
1585:
1586: private int __parseExpression(boolean isParenthesized,
1587: int[] hintFlags) throws MalformedPatternException {
1588: char value, paren;
1589: char[] modifierFlags, posFlags = { 0 }, negFlags = { 0 };
1590: int nodeOffset = OpCode._NULL_OFFSET, parenthesisNum = 0, br, ender;
1591: int[] flags = { 0 };
1592: ;
1593: String modifiers = "iogmsx-";
1594:
1595: modifierFlags = posFlags;
1596: // Initially we assume expression doesn't match null string.
1597: hintFlags[0] = __NONNULL;
1598:
1599: if (isParenthesized) {
1600: paren = 1;
1601: if (__input._getValue() == '?') {
1602: __input._increment();
1603: paren = value = __input._postIncrement();
1604:
1605: switch (value) {
1606: case ':':
1607: case '=':
1608: case '!':
1609: break;
1610: case '#':
1611: value = __input._getValue();
1612: while (value != CharStringPointer._END_OF_STRING
1613: && value != ')')
1614: value = __input._increment();
1615: if (value != ')')
1616: throw new MalformedPatternException(
1617: "Sequence (?#... not terminated");
1618: __getNextChar();
1619: hintFlags[0] = __TRYAGAIN;
1620: return OpCode._NULL_OFFSET;
1621: default:
1622: __input._decrement();
1623: value = __input._getValue();
1624: while (value != CharStringPointer._END_OF_STRING
1625: && modifiers.indexOf(value) != -1) {
1626: if (value == '-')
1627: modifierFlags = negFlags;
1628: else
1629: __setModifierFlag(modifierFlags, value);
1630: value = __input._increment();
1631: }
1632: __modifierFlags[0] |= posFlags[0];
1633: __modifierFlags[0] &= ~negFlags[0];
1634:
1635: if (value != ')')
1636: throw new MalformedPatternException(
1637: "Sequence (?" + value
1638: + "...) not recognized");
1639: __getNextChar();
1640: hintFlags[0] = __TRYAGAIN;
1641: return OpCode._NULL_OFFSET;
1642: }
1643: } else {
1644: parenthesisNum = __numParentheses;
1645: ++__numParentheses;
1646: nodeOffset = __emitArgNode(OpCode._OPEN,
1647: (char) parenthesisNum);
1648: }
1649: } else
1650: paren = 0;
1651:
1652: br = __parseAlternation(flags);
1653:
1654: if (br == OpCode._NULL_OFFSET)
1655: return OpCode._NULL_OFFSET;
1656:
1657: if (nodeOffset != OpCode._NULL_OFFSET)
1658: __programAddTail(nodeOffset, br);
1659: else
1660: nodeOffset = br;
1661:
1662: if ((flags[0] & __NONNULL) == 0)
1663: hintFlags[0] &= ~__NONNULL;
1664:
1665: hintFlags[0] |= (flags[0] & __SPSTART);
1666:
1667: while (__input._getValue() == '|') {
1668: __getNextChar();
1669: br = __parseAlternation(flags);
1670:
1671: if (br == OpCode._NULL_OFFSET)
1672: return OpCode._NULL_OFFSET;
1673:
1674: __programAddTail(nodeOffset, br);
1675:
1676: if ((flags[0] & __NONNULL) == 0)
1677: hintFlags[0] &= ~__NONNULL;
1678:
1679: hintFlags[0] |= (flags[0] & __SPSTART);
1680: }
1681:
1682: switch (paren) {
1683: case ':':
1684: ender = __emitNode(OpCode._NOTHING);
1685: break;
1686: case 1:
1687: ender = __emitArgNode(OpCode._CLOSE, (char) parenthesisNum);
1688: break;
1689: case '=':
1690: case '!':
1691: ender = __emitNode(OpCode._SUCCEED);
1692: hintFlags[0] &= ~__NONNULL;
1693: break;
1694: case 0:
1695: default:
1696: ender = __emitNode(OpCode._END);
1697: break;
1698: }
1699:
1700: __programAddTail(nodeOffset, ender);
1701:
1702: for (br = nodeOffset; br != OpCode._NULL_OFFSET; br = OpCode
1703: ._getNext(__program, br))
1704: __programAddOperatorTail(br, ender);
1705:
1706: if (paren == '=') {
1707: __programInsertOperator(OpCode._IFMATCH, nodeOffset);
1708: __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1709: } else if (paren == '!') {
1710: __programInsertOperator(OpCode._UNLESSM, nodeOffset);
1711: __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1712: }
1713:
1714: if (paren != 0
1715: && (__input._isAtEnd() || __getNextChar() != ')')) {
1716: throw new MalformedPatternException(
1717: "Unmatched parentheses.");
1718: } else if (paren == 0 && !__input._isAtEnd()) {
1719: if (__input._getValue() == ')')
1720: throw new MalformedPatternException(
1721: "Unmatched parentheses.");
1722: else
1723: // Should never happen.
1724: throw new MalformedPatternException(
1725: "Unreached characters at end of expression. Please report this bug!");
1726: }
1727:
1728: return nodeOffset;
1729: }
1730:
/**
 * Compiles a Perl5 regular expression into a Perl5Pattern instance that
 * can be used by a Perl5Matcher object to perform pattern matching.
 * Please see the user's guide for more information about Perl5 regular
 * expressions.
 * <p>
 * The pattern is parsed twice: once with no program array, after which
 * the array is allocated with the accumulated program size, and once
 * more to fill it.  The finished program is then scanned to derive the
 * matching hints stored in the returned pattern (start string, start
 * class, anchoring, must-string and minimum length).
 * <p>
 * @param pattern  A Perl5 regular expression to compile.
 * @param options  A set of flags giving the compiler instructions on
 *                 how to treat the regular expression.  The flags
 *                 are a logical OR of any number of the five <b>MASK</b>
 *                 constants.  For example:
 *                 <pre>
 * regex =
 *   compiler.compile(pattern, Perl5Compiler.
 *                    CASE_INSENSITIVE_MASK |
 *                    Perl5Compiler.MULTILINE_MASK);
 *                 </pre>
 *                  This says to compile the pattern so that it treats
 *                  input as consisting of multiple lines and to perform
 *                  matches in a case insensitive manner.
 * @return A Pattern instance constituting the compiled regular expression.
 *         This instance will always be a Perl5Pattern and can be reliably
 *         cast to a Perl5Pattern.
 * @exception MalformedPatternException  If the compiled expression
 *  is not a valid Perl5 regular expression.
 */
public Pattern compile(char[] pattern, int options)
        throws MalformedPatternException {
    int[] flags = { 0 };
    int caseInsensitive, scan;
    Perl5Pattern regexp;
    String mustString, startString;

    int first;
    boolean sawOpen = false, sawPlus = false;

    StringBuffer lastLongest, longest;
    int length, minLength = 0, curBack, back, backmost;

    __input = new CharStringPointer(pattern);

    // Reset the per-compilation state kept in instance fields.
    caseInsensitive = options & __CASE_INSENSITIVE;
    __modifierFlags[0] = (char) options;
    __sawBackreference = false;
    __numParentheses = 1;
    __programSize = 0;
    __cost = 0;
    __program = null;

    // Pass one: __program is null here, so this parse validates the
    // pattern and accumulates __programSize.
    __emitCode((char) 0);
    if (__parseExpression(false, flags) == OpCode._NULL_OFFSET)
        throw new MalformedPatternException(
                "Unknown compilation error.");

    if (__programSize >= Character.MAX_VALUE - 1)
        throw new MalformedPatternException(
                "Expression is too large.");

    __program = new char[__programSize];
    regexp = new Perl5Pattern();

    regexp._program = __program;
    regexp._expression = new String(pattern);

    // Pass two: rewind the input and parse again, now with a program
    // array in place.
    // NOTE(review): __modifierFlags[0] is not re-initialised from options
    // here, so modifiers changed by inline (?...) groups during pass one
    // are already in effect when pass two starts -- confirm this is
    // intended before changing it.
    __input._setOffset(0);

    __numParentheses = 1;
    __programSize = 0;
    __cost = 0;

    __emitCode((char) 0);
    if (__parseExpression(false, flags) == OpCode._NULL_OFFSET)
        throw new MalformedPatternException(
                "Unknown compilation error.");

    // Inline modifiers may have changed the flag, so re-read it.
    caseInsensitive = __modifierFlags[0] & __CASE_INSENSITIVE;

    regexp._isExpensive = (__cost >= 10);
    regexp._startClassOffset = OpCode._NULL_OFFSET;
    regexp._anchor = 0;
    regexp._back = -1;
    regexp._options = options;
    regexp._startString = null;
    regexp._mustString = null;
    mustString = null;
    startString = null;

    // Offset 0 holds the char emitted by __emitCode((char) 0) above, so
    // the first node is at offset 1.  The hints below are only computed
    // when the node following the first one is END.
    scan = 1;
    if (__program[OpCode._getNext(__program, scan)] == OpCode._END) {
        boolean doItAgain; // loop flag for the start-hint search below
        char op;

        first = scan = OpCode._getNextOperator(scan);
        op = __program[first];

        // Step over leading OPEN, lone BRANCH, PLUS, MINMOD and CURLY
        // (with a first argument > 0) operators, remembering whether an
        // OPEN or a PLUS was seen on the way.
        while ((op == OpCode._OPEN && (sawOpen = true))
                || (op == OpCode._BRANCH && __program[OpCode
                        ._getNext(__program, first)] != OpCode._BRANCH)
                || op == OpCode._PLUS
                || op == OpCode._MINMOD
                || (OpCode._opType[op] == OpCode._CURLY && OpCode
                        ._getArg1(__program, first) > 0)) {
            if (op == OpCode._PLUS)
                sawPlus = true;
            else
                first += OpCode._operandLength[op];

            first = OpCode._getNextOperator(first);
            op = __program[first];
        }

        doItAgain = true;

        // Classify the first significant operator: a literal start
        // string, a start class, or an anchor.  Anchors (and a STAR over
        // an ANY-type node after an anchor) are recorded and the search
        // continues with the operator behind them.
        while (doItAgain) {
            doItAgain = false;
            op = __program[first];

            if (op == OpCode._EXACTLY) {
                startString = new String(__program, OpCode
                        ._getOperand(first + 1), __program[OpCode
                        ._getOperand(first)]);

            } else if (OpCode
                    ._isInArray(op, OpCode._opLengthOne, 2))
                regexp._startClassOffset = first;
            else if (op == OpCode._BOUND || op == OpCode._NBOUND)
                regexp._startClassOffset = first;
            else if (OpCode._opType[op] == OpCode._BOL) {
                if (op == OpCode._BOL)
                    regexp._anchor = Perl5Pattern._OPT_ANCH_BOL;
                else if (op == OpCode._MBOL)
                    regexp._anchor = Perl5Pattern._OPT_ANCH_MBOL;
                else
                    regexp._anchor = Perl5Pattern._OPT_ANCH;
                first = OpCode._getNextOperator(first);
                doItAgain = true;
                continue;
            } else if (op == OpCode._STAR
                    && OpCode._opType[__program[OpCode
                            ._getNextOperator(first)]] == OpCode._ANY
                    && (regexp._anchor & Perl5Pattern._OPT_ANCH) != 0) {
                regexp._anchor = Perl5Pattern._OPT_ANCH
                        | Perl5Pattern._OPT_IMPLICIT;
                first = OpCode._getNextOperator(first);
                doItAgain = true;
                continue;
            }
        } // end while do it again

        if (sawPlus && (!sawOpen || !__sawBackreference))
            regexp._anchor |= Perl5Pattern._OPT_SKIP;

        // Walk the node chain collecting the longest run of literal text
        // (candidate must-string) and the minimum match length.
        lastLongest = new StringBuffer();
        longest = new StringBuffer();
        length = 0;
        minLength = 0;
        curBack = 0;
        back = 0;
        backmost = 0;

        while (scan > 0 && (op = __program[scan]) != OpCode._END) {

            if (op == OpCode._BRANCH) {
                if (__program[OpCode._getNext(__program, scan)] == OpCode._BRANCH) {
                    // Several alternatives: skip them all.  -30000 looks
                    // like a "position unknown" sentinel for curBack --
                    // TODO confirm.
                    curBack = -30000;
                    while (__program[scan] == OpCode._BRANCH)
                        scan = OpCode._getNext(__program, scan);
                } else
                    scan = OpCode._getNextOperator(scan);
                continue;
            }

            if (op == OpCode._UNLESSM) {
                curBack = -30000;
                scan = OpCode._getNext(__program, scan);
                continue;
            }

            if (op == OpCode._EXACTLY) {
                int temp;

                first = scan;
                // Step over any CLOSE nodes that follow the literal.
                while (__program[(temp = OpCode._getNext(__program,
                        scan))] == OpCode._CLOSE)
                    scan = temp;

                minLength += __program[OpCode._getOperand(first)];

                // temp now holds the length of this literal.
                temp = __program[OpCode._getOperand(first)];

                if (curBack - back == length) {
                    // Literal directly continues the current run.
                    lastLongest.append(new String(__program, OpCode
                            ._getOperand(first) + 1, temp));
                    length += temp;
                    curBack += temp;
                    first = OpCode._getNext(__program, scan);
                } else if (temp >= (length + (curBack >= 0 ? 1 : 0))) {
                    // Literal starts a new run that replaces the current
                    // one.
                    length = temp;
                    lastLongest = new StringBuffer(new String(
                            __program,
                            OpCode._getOperand(first) + 1, temp));
                    back = curBack;
                    curBack += length;
                    first = OpCode._getNext(__program, scan);
                } else
                    curBack += temp;
            } else if (OpCode._isInArray(op,
                    OpCode._opLengthVaries, 0)) {
                // Variable-length operator: ends the current run.
                curBack = -30000;
                length = 0;

                if (lastLongest.length() > longest.length()) {
                    longest = lastLongest;
                    backmost = back;
                }

                lastLongest = new StringBuffer();

                if (op == OpCode._PLUS
                        && OpCode._isInArray(__program[OpCode
                                ._getNextOperator(scan)],
                                OpCode._opLengthOne, 0))
                    ++minLength;
                else if (OpCode._opType[op] == OpCode._CURLY
                        && OpCode._isInArray(__program[OpCode
                                ._getNextOperator(scan) + 2],
                                OpCode._opLengthOne, 0))
                    minLength += OpCode._getArg1(__program, scan);
            } else if (OpCode
                    ._isInArray(op, OpCode._opLengthOne, 0)) {
                // Operator in _opLengthOne: counts one towards the
                // minimum length and ends the current run.
                ++curBack;
                ++minLength;
                length = 0;
                if (lastLongest.length() > longest.length()) {
                    longest = lastLongest;
                    backmost = back;
                }
                lastLongest = new StringBuffer();
            }

            scan = OpCode._getNext(__program, scan);
        } // end while

        // Let the final run compete as well; it gets a bonus of one when
        // the node at 'first' is of EOL type.
        if (lastLongest.length()
                + ((OpCode._opType[__program[first]] == OpCode._EOL) ? 1
                        : 0) > longest.length()) {
            longest = lastLongest;
            backmost = back;
        } else
            lastLongest = new StringBuffer();

        // A must-string is only recorded when no start string was found.
        if (longest.length() > 0 && startString == null) {
            mustString = longest.toString();
            if (backmost < 0)
                backmost = -1;
            regexp._back = backmost;

            /*

            if(longest.length() >
            (((caseInsensitive & __CASE_INSENSITIVE) != 0 ||
            OpCode._opType[__program[first]] == OpCode._EOL)
            ? 1 : 0))
            */
        } else
            longest = null;
    } // end if

    regexp._isCaseInsensitive = ((caseInsensitive & __CASE_INSENSITIVE) != 0);
    regexp._numParentheses = __numParentheses - 1;
    regexp._minLength = minLength;

    if (mustString != null) {
        regexp._mustString = mustString.toCharArray();
        regexp._mustUtility = 100;
    }

    if (startString != null)
        regexp._startString = startString.toCharArray();

    return regexp;
}
2013:
2014: /**
2015: * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
2016: * <p>
2017: * @param pattern A regular expression to compile.
2018: * @return A Pattern instance constituting the compiled regular expression.
2019: * This instance will always be a Perl5Pattern and can be reliably
2020: * casted to a Perl5Pattern.
2021: * @exception MalformedPatternException If the compiled expression
2022: * is not a valid Perl5 regular expression.
2023: */
2024: public Pattern compile(char[] pattern)
2025: throws MalformedPatternException {
2026: return compile(pattern, DEFAULT_MASK);
2027: }
2028:
2029: /**
2030: * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
2031: * <p>
2032: * @param pattern A regular expression to compile.
2033: * @return A Pattern instance constituting the compiled regular expression.
2034: * This instance will always be a Perl5Pattern and can be reliably
2035: * casted to a Perl5Pattern.
2036: * @exception MalformedPatternException If the compiled expression
2037: * is not a valid Perl5 regular expression.
2038: */
2039: public Pattern compile(String pattern)
2040: throws MalformedPatternException {
2041: return compile(pattern.toCharArray(), DEFAULT_MASK);
2042: }
2043:
2044: /**
2045: * Compiles a Perl5 regular expression into a Perl5Pattern instance that
2046: * can be used by a Perl5Matcher object to perform pattern matching.
2047: * Please see the user's guide for more information about Perl5 regular
2048: * expressions.
2049: * <p>
2050: * @param pattern A Perl5 regular expression to compile.
2051: * @param options A set of flags giving the compiler instructions on
2052: * how to treat the regular expression. The flags
2053: * are a logical OR of any number of the five <b>MASK</b>
2054: * constants. For example:
2055: * <pre>
2056: * regex =
2057: * compiler.compile("^\\w+\\d+$",
2058: * Perl5Compiler.CASE_INSENSITIVE_MASK |
2059: * Perl5Compiler.MULTILINE_MASK);
2060: * </pre>
2061: * This says to compile the pattern so that it treats
2062: * input as consisting of multiple lines and to perform
2063: * matches in a case insensitive manner.
2064: * @return A Pattern instance constituting the compiled regular expression.
2065: * This instance will always be a Perl5Pattern and can be reliably
2066: * casted to a Perl5Pattern.
2067: * @exception MalformedPatternException If the compiled expression
2068: * is not a valid Perl5 regular expression.
2069: */
2070: public Pattern compile(String pattern, int options)
2071: throws MalformedPatternException {
2072: return compile(pattern.toCharArray(), options);
2073: }
2074:
2075: }
|