0001: /* ====================================================================
0002: * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
0003: *
0004: * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
0005: *
0006: * Redistribution and use in source and binary forms, with or without
0007: * modification, are permitted provided that the following conditions
0008: * are met:
0009: *
0010: * 1. Redistributions of source code must retain the above copyright
0011: * notice, this list of conditions and the following disclaimer.
0012: *
0013: * 2. Redistributions in binary form must reproduce the above copyright
0014: * notice, this list of conditions and the following disclaimer in
0015: * the documentation and/or other materials provided with the
0016: * distribution.
0017: *
0018: * 3. The end-user documentation included with the redistribution,
0019: * if any, must include the following acknowledgment:
0020: * "This product includes software developed by Jcorporate Ltd.
0021: * (http://www.jcorporate.com/)."
0022: * Alternately, this acknowledgment may appear in the software itself,
0023: * if and wherever such third-party acknowledgments normally appear.
0024: *
0025: * 4. "Jcorporate" and product names such as "Expresso" must
0026: * not be used to endorse or promote products derived from this
0027: * software without prior written permission. For written permission,
0028: * please contact info@jcorporate.com.
0029: *
0030: * 5. Products derived from this software may not be called "Expresso",
0031: * or other Jcorporate product names; nor may "Expresso" or other
0032: * Jcorporate product names appear in their name, without prior
0033: * written permission of Jcorporate Ltd.
0034: *
0035: * 6. No product derived from this software may compete in the same
0036: * market space, i.e. framework, without prior written permission
0037: * of Jcorporate Ltd. For written permission, please contact
0038: * partners@jcorporate.com.
0039: *
0040: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0041: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0042: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0043: * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
0044: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0045: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
0046: * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0047: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0048: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0049: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0050: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0051: * SUCH DAMAGE.
0052: * ====================================================================
0053: *
0054: * This software consists of voluntary contributions made by many
0055: * individuals on behalf of the Jcorporate Ltd. Contributions back
0056: * to the project(s) are encouraged when you make modifications.
0057: * Please send them to support@jcorporate.com. For more information
0058: * on Jcorporate Ltd. and its products, please see
0059: * <http://www.jcorporate.com/>.
0060: *
0061: * Portions of this software are based upon other open source
0062: * products and are subject to their respective licenses.
0063: */
0064:
0065: package com.jcorporate.expresso.ext.regexp;
0066:
0067: /*
0068: * ====================================================================
0069: *
0070: * The Apache Software License, Version 1.1
0071: *
0072: * Copyright (c) 1999 The Apache Software Foundation. All rights
0073: * reserved.
0074: *
0075: * Redistribution and use in source and binary forms, with or without
0076: * modification, are permitted provided that the following conditions
0077: * are met:
0078: *
0079: * 1. Redistributions of source code must retain the above copyright
0080: * notice, this list of conditions and the following disclaimer.
0081: *
0082: * 2. Redistributions in binary form must reproduce the above copyright
0083: * notice, this list of conditions and the following disclaimer in
0084: * the documentation and/or other materials provided with the
0085: * distribution.
0086: *
0087: * 3. The end-user documentation included with the redistribution, if
0088: * any, must include the following acknowlegement:
0089: * "This product includes software developed by the
0090: * Apache Software Foundation (http://www.apache.org/)."
0091: * Alternately, this acknowlegement may appear in the software itself,
0092: * if and wherever such third-party acknowlegements normally appear.
0093: *
0094: * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
0095: * Foundation" must not be used to endorse or promote products derived
0096: * from this software without prior written permission. For written
0097: * permission, please contact apache@apache.org.
0098: *
0099: * 5. Products derived from this software may not be called "Apache"
0100: * nor may "Apache" appear in their names without prior written
0101: * permission of the Apache Group.
0102: *
0103: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0104: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0105: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0106: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0107: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0108: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0109: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0110: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0111: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0112: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0113: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0114: * SUCH DAMAGE.
0115: * ====================================================================
0116: *
0117: * This software consists of voluntary contributions made by many
0118: * individuals on behalf of the Apache Software Foundation. For more
0119: * information on the Apache Software Foundation, please see
0120: * <http://www.apache.org/>.
0121: *
0122: */
0123:
0124: import java.util.Vector;
0125:
0126: /**
0127: * RE is an efficient, lightweight regular expression evaluator/matcher class.
0128: * Regular expressions are pattern descriptions which enable sophisticated matching of
0129: * strings. In addition to being able to match a string against a pattern, you
0130: * can also extract parts of the match. This is especially useful in text parsing!
0131: * Details on the syntax of regular expression patterns are given below.
0132: * <p/>
0133: * <p/>
0134: * <p/>
0135: * To compile a regular expression (RE), you can simply construct an RE matcher
0136: * object from the string specification of the pattern, like this:
0137: * <p/>
0138: * <pre>
0139: * <p/>
0140: * RE r = new RE("a*b");
0141: * <p/>
0142: * </pre>
0143: * <p/>
0144: * <p/>
0145: * <p/>
0146: * Once you have done this, you can call either of the RE.match methods to
0147: * perform matching on a String. For example:
0148: * <p/>
0149: * <pre>
0150: * <p/>
0151: * boolean matched = r.match("aaaab");
0152: * <p/>
0153: * </pre>
0154: * <p/>
0155: * will cause the boolean matched to be set to true because the
0156: * pattern "a*b" matches the string "aaaab".
0157: * <p/>
0158: * <p/>
0159: * If you were interested in the <i>number</i> of a's which matched the first
0160: * part of our example expression, you could change the expression to
0161: * "(a*)b". Then when you compiled the expression and matched it against
0162: * something like "xaaaab", you would get results like this:
0163: * <p/>
0164: * <pre>
0165: * <p/>
0166: * RE r = new RE("(a*)b"); // Compile expression
0167: * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
0168: * <p/>
0169: * <br>
0170: * <p/>
0171: * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
0172: * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
0173: * <p/>
0174: * <br>
0175: * <p/>
0176: * int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
0177: * int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
0178: * int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
0179: * <p/>
0180: * <br>
0181: * <p/>
0182: * int startInside = getParenStart(1); // startInside will be index 1
0183: * int endInside = getParenEnd(1); // endInside will be index 5
0184: * int lenInside = getParenLength(1); // lenInside will be 4
0185: * <p/>
0186: * </pre>
0187: * <p/>
0188: * You can also refer to the contents of a parenthesized expression within
0189: * a regular expression itself. This is called a 'backreference'. The first
0190: * backreference in a regular expression is denoted by \1, the second by \2
0191: * and so on. So the expression:
0192: * <p/>
0193: * <pre>
0194: * <p/>
0195: * ([0-9]+)=\1
0196: * <p/>
0197: * </pre>
0198: * <p/>
0199: * will match any string of the form n=n (like 0=0 or 2=2).
0200: * <p/>
0201: * <p/>
0202: * <p/>
0203: * The full regular expression syntax accepted by RE is described here:
0204: * <p/>
0205: * <pre>
0206: * <p/>
0207: * <br>
0208: * <p/>
0209: * <b><font face=times roman>Characters</font></b>
0210: * <p/>
0211: * <br>
0212: * <p/>
0213: * <i>unicodeChar</i> Matches any identical unicode character
0214: * \ Used to quote a meta-character (like '*')
0215: * \\ Matches a single '\' character
0216: * \0nnn Matches a given octal character
0217: * \xhh Matches a given 8-bit hexadecimal character
0218: * \\uhhhh Matches a given 16-bit hexadecimal character
0219: * \t Matches an ASCII tab character
0220: * \n Matches an ASCII newline character
0221: * \r Matches an ASCII return character
0222: * \f Matches an ASCII form feed character
0223: * <p/>
0224: * <br>
0225: * <p/>
0226: * <b><font face=times roman>Character Classes</font></b>
0227: * <p/>
0228: * <br>
0229: * <p/>
0230: * [abc] Simple character class
0231: * [a-zA-Z] Character class with ranges
0232: * [^abc] Negated character class
0233: * <p/>
0234: * <br>
0235: * <p/>
0236: * <b><font face=times roman>Standard POSIX Character Classes</font></b>
0237: * <p/>
0238: * <br>
0239: * <p/>
0240: * [:alnum:] Alphanumeric characters.
0241: * [:alpha:] Alphabetic characters.
0242: * [:blank:] Space and tab characters.
0243: * [:cntrl:] Control characters.
0244: * [:digit:] Numeric characters.
0245: * [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
0246: * [:lower:] Lower-case alphabetic characters.
0247: * [:print:] Printable characters (characters that are not control characters.)
0248: * [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
0249: * [:space:] Space characters (such as space, tab, and formfeed, to name a few).
0250: * [:upper:] Upper-case alphabetic characters.
0251: * [:xdigit:] Characters that are hexadecimal digits.
0252: * <p/>
0253: * <br>
0254: * <p/>
0255: * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
0256: * <p/>
0257: * <br>
0258: * <p/>
0259: * [:javastart:] Start of a Java identifier
0260: * [:javapart:] Part of a Java identifier
0261: * <p/>
0262: * <br>
0263: * <p/>
0264: * <b><font face=times roman>Predefined Classes</font></b>
0265: * <p/>
0266: * <br>
0267: * <p/>
0268: * . Matches any character other than newline
0269: * \w Matches a "word" character (alphanumeric plus "_")
0270: * \W Matches a non-word character
0271: * \s Matches a whitespace character
0272: * \S Matches a non-whitespace character
0273: * \d Matches a digit character
0274: * \D Matches a non-digit character
0275: * <p/>
0276: * <br>
0277: * <p/>
0278: * <b><font face=times roman>Boundary Matchers</font></b>
0279: * <p/>
0280: * <br>
0281: * <p/>
0282: * ^ Matches only at the beginning of a line
0283: * $ Matches only at the end of a line
0284: * \b Matches only at a word boundary
0285: * \B Matches only at a non-word boundary
0286: * <p/>
0287: * <br>
0288: * <p/>
0289: * <b><font face=times roman>Greedy Closures</font></b>
0290: * <p/>
0291: * <br>
0292: * <p/>
0293: * A* Matches A 0 or more times (greedy)
0294: * A+ Matches A 1 or more times (greedy)
0295: * A? Matches A 1 or 0 times (greedy)
0296: * A{n} Matches A exactly n times (greedy)
0297: * A{n,} Matches A at least n times (greedy)
0298: * A{n,m} Matches A at least n but not more than m times (greedy)
0299: * <p/>
0300: * <br>
0301: * <p/>
0302: * <b><font face=times roman>Reluctant Closures</font></b>
0303: * <p/>
0304: * <br>
0305: * <p/>
0306: * A*? Matches A 0 or more times (reluctant)
0307: * A+? Matches A 1 or more times (reluctant)
0308: * A?? Matches A 0 or 1 times (reluctant)
0309: * <p/>
0310: * <br>
0311: * <p/>
0312: * <b><font face=times roman>Logical Operators</font></b>
0313: * <p/>
0314: * <br>
0315: * <p/>
0316: * AB Matches A followed by B
0317: * A|B Matches either A or B
0318: * (A) Used for subexpression grouping
0319: * <p/>
0320: * <br>
0321: * <p/>
0322: * <b><font face=times roman>Backreferences</font></b>
0323: * <p/>
0324: * <br>
0325: * <p/>
0326: * \1 Backreference to 1st parenthesized subexpression
0327: * \2 Backreference to 2nd parenthesized subexpression
0328: * \3 Backreference to 3rd parenthesized subexpression
0329: * \4 Backreference to 4th parenthesized subexpression
0330: * \5 Backreference to 5th parenthesized subexpression
0331: * \6 Backreference to 6th parenthesized subexpression
0332: * \7 Backreference to 7th parenthesized subexpression
0333: * \8 Backreference to 8th parenthesized subexpression
0334: * \9 Backreference to 9th parenthesized subexpression
0335: * <p/>
0336: * <br>
0337: * <p/>
0338: * </pre>
0339: * <p/>
0340: * <p/>
0341: * <p/>
0342: * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
0343: * match as many elements of the string as possible without causing the overall
0344: * match to fail. If you want a closure to be reluctant (non-greedy), you can
0345: * simply follow it with a '?'. A reluctant closure will match as few elements
0346: * of the string as possible when finding matches. {m,n} closures don't currently
0347: * support reluctancy.
0348: * <p/>
0349: * <p/>
0350: * <p/>
0351: * RE runs programs compiled by the RECompiler class. But the RE matcher class
0352: * does not include the actual regular expression compiler for reasons of
0353: * efficiency. In fact, if you want to pre-compile one or more regular expressions,
0354: * the 'recompile' class can be invoked from the command line to produce compiled
0355: * output like this:
0356: * <p/>
0357: * <pre>
0358: * <p/>
0359: * // Pre-compiled regular expression "a*b"
0360: * char[] re1Instructions =
0361: * {
0362: * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
0363: * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
0364: * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
0365: * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
0366: * 0x0000,
0367: * };
0368: * <p/>
0369: * <br>
0370: * <p/>
0371: * REProgram re1 = new REProgram(re1Instructions);
0372: * <p/>
0373: * </pre>
0374: * <p/>
0375: * You can then construct a regular expression matcher (RE) object from the pre-compiled
0376: * expression re1 and thus avoid the overhead of compiling the expression at runtime.
0377: * If you require more dynamic regular expressions, you can construct a single RECompiler
0378: * object and re-use it to compile each expression. Similarly, you can change the
0379: * program run by a given matcher object at any time. However, RE and RECompiler are
0380: * not threadsafe (for efficiency reasons, and because requiring thread safety in this
0381: * class is deemed to be a rare requirement), so you will need to construct a separate
0382: * compiler or matcher object for each thread (unless you do thread synchronization
0383: * yourself).
0384: * <p/>
0385: * </pre>
0386: * <br><p><br>
0387: * <p/>
0388: * <font color=red>
0389: * <i>ISSUES:</i>
0390: * <p/>
0391: * <ul>
0392: * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
0393: * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
0394: * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
0395: * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
0396: * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
0397: * <li>Should RE support character iterators (for backwards RE matching!)?
0398: * <li>Should RE support reluctant {m,n} closures (does anyone care)?
0399: * <li>Not *all* possibilities are considered for greediness when backreferences
0400: * are involved (as POSIX suggests should be the case). The POSIX RE
0401: * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
0402: * of acdacaa where \1 is "a". This is not the case in this RE package,
0403: * and actually Perl doesn't go to this extent either! Until someone
0404: * actually complains about this, I'm not sure it's worth "fixing".
0405: * If it ever is fixed, test #137 in RETest.txt should be updated.
0406: * </ul>
0407: * <p/>
0408: * </font>
0409: *
0410: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0411: * @version $Id: RE.java,v 1.9 2004/11/18 02:03:28 lhamel Exp $
0412: * @see RECompiler
0413: * @deprecated since v5.6, use jakarta oro
0414: */
0415: public class RE {
0416:
0417: /**
0418: * Specifies normal, case-sensitive matching behaviour.
0419: */
0420: public static final int MATCH_NORMAL = 0x0000;
0421:
0422: /**
0423: * Flag to indicate that matching should be case-independent (folded)
0424: */
0425: public static final int MATCH_CASEINDEPENDENT = 0x0001;
0426:
0427: /**
0428: * Newlines should match as BOL/EOL (^ and $)
0429: */
0430: public static final int MATCH_MULTILINE = 0x0002;
0431:
0432: /**
0433: * *********************************************
0434: * *
0435: * The format of a node in a program is: *
0436: * *
0437: * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
0438: * *
0439: * char OPCODE - instruction *
0440: * char OPDATA - modifying data *
0441: * char OPNEXT - next node (relative offset) *
0442: * *
0443: * **********************************************
0444: */
0445: // Opcode Char Opdata/Operand Meaning
0446: // ---------- ---------- --------------- --------------------------------------------------
0447: static final char OP_END = 'E'; // end of program
0448: static final char OP_BOL = '^'; // match only if at beginning of line
0449: static final char OP_EOL = '$'; // match only if at end of line
0450: static final char OP_ANY = '.'; // match any single character except newline
0451: static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
0452: static final char OP_BRANCH = '|'; // node match this alternative or the next one
0453: static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
0454: static final char OP_STAR = '*'; // node kleene closure
0455: static final char OP_PLUS = '+'; // node positive closure
0456: static final char OP_MAYBE = '?'; // node optional closure
0457: static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
0458: static final char OP_OPEN = '('; // number nth opening paren
0459: static final char OP_CLOSE = ')'; // number nth closing paren
0460: static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
0461: static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
0462: static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
0463: static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
0464: static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
0465: static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
0466: static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
0467:
0468: // Escape codes
0469: static final char E_ALNUM = 'w'; // Alphanumeric
0470: static final char E_NALNUM = 'W'; // Non-alphanumeric
0471: static final char E_BOUND = 'b'; // Word boundary
0472: static final char E_NBOUND = 'B'; // Non-word boundary
0473: static final char E_SPACE = 's'; // Whitespace
0474: static final char E_NSPACE = 'S'; // Non-whitespace
0475: static final char E_DIGIT = 'd'; // Digit
0476: static final char E_NDIGIT = 'D'; // Non-digit
0477:
0478: // Posix character classes
0479: static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
0480: static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
0481: static final char POSIX_CLASS_BLANK = 'b'; // Blanks
0482: static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
0483: static final char POSIX_CLASS_DIGIT = 'd'; // Digits
0484: static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
0485: static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
0486: static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
0487: static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
0488: static final char POSIX_CLASS_SPACE = 's'; // Spaces
0489: static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
0490: static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
0491: static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
0492: static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
0493:
0494: // Limits
0495: static final int maxNode = 65536; // Maximum number of nodes in a program
0496: static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
0497:
0498: // Node layout constants
0499: static final int offsetOpcode = 0; // Opcode offset (first character)
0500: static final int offsetOpdata = 1; // Opdata offset (second char)
0501: static final int offsetNext = 2; // Next index offset (third char)
0502: static final int nodeSize = 3; // Node size (in chars)
0503:
0504: /**
0505: * Line Separator
0506: */
0507: static final String NEWLINE = System.getProperty("line.separator");
0508:
0509: // State of current program
0510: REProgram program; // Compiled regular expression 'program'
0511: CharacterIterator search; // The string being matched against
0512: int idx; // Current index in string being searched
0513: int matchFlags; // Match behaviour flags
0514:
0515: // Parenthesized subexpressions
0516: int parenCount; // Number of subexpressions matched (num open parens + 1)
0517: int start0; // Cache of start[0]
0518: int end0; // Cache of start[0]
0519: int start1; // Cache of start[1]
0520: int end1; // Cache of start[1]
0521: int start2; // Cache of start[2]
0522: int end2; // Cache of start[2]
0523: int[] startn; // Lazy-alloced array of sub-expression starts
0524: int[] endn; // Lazy-alloced array of sub-expression ends
0525:
0526: // Backreferences
0527: int[] startBackref; // Lazy-alloced array of backref starts
0528: int[] endBackref; // Lazy-alloced array of backref ends
0529:
0530: /**
0531: * Flag bit that indicates that subst should replace all occurrences of this
0532: * regular expression.
0533: */
0534: public static final int REPLACE_ALL = 0x0000;
0535:
0536: /**
0537: * Flag bit that indicates that subst should only replace the first occurrence
0538: * of this regular expression.
0539: */
0540: public static final int REPLACE_FIRSTONLY = 0x0001;
0541:
0542: /**
0543: * Constructs a regular expression matcher with no initial program.
0544: * This is likely to be an uncommon practice, but is still supported.
0545: */
0546: public RE() {
0547: this ((REProgram) null, MATCH_NORMAL);
0548: }
0549:
0550: /**
0551: * Construct a matcher for a pre-compiled regular expression from program
0552: * (bytecode) data.
0553: *
0554: * @param program Compiled regular expression program
0555: * @see RECompiler
0556: */
0557: public RE(REProgram program) {
0558: this (program, MATCH_NORMAL);
0559: }
0560:
0561: /**
0562: * Construct a matcher for a pre-compiled regular expression from program
0563: * (bytecode) data. Permits special flags to be passed in to modify matching
0564: * behaviour.
0565: *
0566: * @param program Compiled regular expression program (see RECompiler and/or recompile)
0567: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0568: * <p/>
0569: * <pre>
0570: * <p/>
0571: * MATCH_NORMAL // Normal (case-sensitive) matching
0572: * MATCH_CASEINDEPENDENT // Case folded comparisons
0573: * MATCH_MULTILINE // Newline matches as BOL/EOL
0574: * <p/>
0575: * </pre>
0576: * @see RECompiler
0577: * @see REProgram
0578: */
0579: public RE(REProgram program, int matchFlags) {
0580: setProgram(program);
0581: setMatchFlags(matchFlags);
0582: }
0583:
0584: /**
0585: * Constructs a regular expression matcher from a String by compiling it
0586: * using a new instance of RECompiler. If you will be compiling many
0587: * expressions, you may prefer to use a single RECompiler object instead.
0588: *
0589: * @param pattern The regular expression pattern to compile.
0590: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0591: * @see RECompiler
0592: */
0593: public RE(String pattern) throws RESyntaxException {
0594: this (pattern, MATCH_NORMAL);
0595: }
0596:
0597: /**
0598: * Constructs a regular expression matcher from a String by compiling it
0599: * using a new instance of RECompiler. If you will be compiling many
0600: * expressions, you may prefer to use a single RECompiler object instead.
0601: *
0602: * @param pattern The regular expression pattern to compile.
0603: * @param matchFlags The matching style
0604: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0605: * @see RECompiler
0606: */
0607: public RE(String pattern, int matchFlags) throws RESyntaxException {
0608: this (new RECompiler().compile(pattern));
0609: setMatchFlags(matchFlags);
0610: }
0611:
0612: /**
0613: * Performs lazy allocation of subexpression arrays
0614: */
0615: private final void allocParens() {
0616:
0617: // Allocate arrays for subexpressions
0618: startn = new int[maxParen];
0619: endn = new int[maxParen];
0620:
0621: // Set sub-expression pointers to invalid values
0622: for (int i = 0; i < maxParen; i++) {
0623: startn[i] = -1;
0624: endn[i] = -1;
0625: }
0626: }
0627:
0628: /**
0629: * Returns the current match behaviour flags.
0630: *
0631: * @return Current match behaviour flags (RE.MATCH_*).
0632: * <p/>
0633: * <pre>
0634: * <p/>
0635: * MATCH_NORMAL // Normal (case-sensitive) matching
0636: * MATCH_CASEINDEPENDENT // Case folded comparisons
0637: * MATCH_MULTILINE // Newline matches as BOL/EOL
0638: * <p/>
0639: * </pre>
0640: * @see #setMatchFlags
0641: */
0642: public int getMatchFlags() {
0643: return matchFlags;
0644: }
0645:
0646: /**
0647: * Gets the contents of a parenthesized subexpression after a successful match.
0648: *
0649: * @param which Nesting level of subexpression
0650: * @return String
0651: */
0652: public String getParen(int which) {
0653: if (which < parenCount) {
0654: return search.substring(getParenStart(which),
0655: getParenEnd(which));
0656: }
0657:
0658: return null;
0659: }
0660:
0661: /**
0662: * Returns the number of parenthesized subexpressions available after a successful match.
0663: *
0664: * @return Number of available parenthesized subexpressions
0665: */
0666: public int getParenCount() {
0667: return parenCount;
0668: }
0669:
0670: /**
0671: * Returns the end index of a given paren level.
0672: *
0673: * @param which Nesting level of subexpression
0674: * @return String index
0675: */
0676: public final int getParenEnd(int which) {
0677: if (which < parenCount) {
0678: switch (which) {
0679: case 0:
0680: return end0;
0681:
0682: case 1:
0683: return end1;
0684:
0685: case 2:
0686: return end2;
0687:
0688: default:
0689:
0690: if (endn == null) {
0691: allocParens();
0692: }
0693:
0694: return endn[which];
0695: }
0696: }
0697:
0698: return -1;
0699: }
0700:
0701: /**
0702: * Returns the length of a given paren level.
0703: *
0704: * @param which Nesting level of subexpression
0705: * @return Number of characters in the parenthesized subexpression
0706: */
0707: public final int getParenLength(int which) {
0708: if (which < parenCount) {
0709: return getParenEnd(which) - getParenStart(which);
0710: }
0711:
0712: return -1;
0713: }
0714:
0715: /**
0716: * Returns the start index of a given paren level.
0717: *
0718: * @param which Nesting level of subexpression
0719: * @return String index
0720: */
0721: public final int getParenStart(int which) {
0722: if (which < parenCount) {
0723: switch (which) {
0724: case 0:
0725: return start0;
0726:
0727: case 1:
0728: return start1;
0729:
0730: case 2:
0731: return start2;
0732:
0733: default:
0734:
0735: if (startn == null) {
0736: allocParens();
0737: }
0738:
0739: return startn[which];
0740: }
0741: }
0742:
0743: return -1;
0744: }
0745:
0746: /**
0747: * Returns the current regular expression program in use by this matcher object.
0748: *
0749: * @return Regular expression program
0750: * @see #setProgram
0751: */
0752: public REProgram getProgram() {
0753: return program;
0754: }
0755:
0756: /**
0757: * Returns an array of Strings, whose toString representation matches a regular
0758: * expression. This method works like the Perl function of the same name. Given
0759: * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
0760: * aaaab], the array of Strings returned by grep would be [aab, aaaab].
0761: *
0762: * @param search Array of Objects to search
0763: * @return Array of Objects whose toString value matches this regular expression.
0764: */
0765: public String[] grep(Object[] search) {
0766:
0767: // Create new vector to hold return items
0768: Vector v = new Vector();
0769:
0770: // Traverse array of objects
0771: for (int i = 0; i < search.length; i++) {
0772:
0773: // Get next object as a string
0774: String s = search[i].toString();
0775:
0776: // If it matches this regexp, add it to the list
0777: if (match(s)) {
0778: v.addElement(s);
0779: }
0780: }
0781:
0782: // Return vector as an array of strings
0783: String[] ret = new String[v.size()];
0784: v.copyInto(ret);
0785:
0786: return ret;
0787: }
0788:
0789: /**
0790: * Throws an Error representing an internal error condition probably resulting
0791: * from a bug in the regular expression compiler (or possibly data corruption).
0792: * In practice, this should be very rare.
0793: *
0794: * @param s Error description
0795: */
0796: protected void internalError(String s) throws Error {
0797: throw new Error("RE internal error: " + s);
0798: }
0799:
0800: /**
0801: * @return true if at the i-th position in the 'search' a newline ends
0802: */
0803: private boolean isNewline(int i) {
0804: if (i < NEWLINE.length() - 1) {
0805: return false;
0806: }
0807: if (search.charAt(i) == '\n') {
0808: return true;
0809: }
0810: for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
0811: if (NEWLINE.charAt(j) != search.charAt(i)) {
0812: return false;
0813: }
0814: }
0815:
0816: return true;
0817: }
0818:
0819: /**
0820: * Matches the current regular expression program against a character array,
0821: * starting at a given index.
0822: *
0823: * @param search String to match against
0824: * @param i Index to start searching at
0825: * @return True if string matched
0826: */
0827: public boolean match(CharacterIterator search, int i) {
0828:
0829: // There is no compiled program to search with!
0830: if (program == null) {
0831:
0832: // This should be uncommon enough to be an error case rather
0833: // than an exception (which would have to be handled everywhere)
0834: internalError("No RE program to run!");
0835: }
0836:
0837: // Save string to search
0838: this .search = search;
0839:
0840: // Can we optimize the search by looking for a prefix string?
0841: if (program.prefix == null) {
0842:
0843: // Unprefixed matching must try for a match at each character
0844: for (; !search.isEnd(i - 1); i++) {
0845:
0846: // Try a match at index i
0847: if (matchAt(i)) {
0848: return true;
0849: }
0850: }
0851:
0852: return false;
0853: } else {
0854:
0855: // Prefix-anchored matching is possible
0856: char[] prefix = program.prefix;
0857:
0858: for (; !search.isEnd(i + prefix.length - 1); i++) {
0859:
0860: // If the first character of the prefix matches
0861: if (search.charAt(i) == prefix[0]) {
0862:
0863: // Save first character position
0864: int firstChar = i++;
0865: int k;
0866:
0867: for (k = 1; k < prefix.length;) {
0868:
0869: // If there's a mismatch of any character in the prefix, give up
0870: if (search.charAt(i++) != prefix[k++]) {
0871: break;
0872: }
0873: }
0874: // See if the whole prefix string matched
0875: if (k == prefix.length) {
0876:
0877: // We matched the full prefix at firstChar, so try it
0878: if (matchAt(firstChar)) {
0879: return true;
0880: }
0881: }
0882:
0883: // Match failed, reset i to continue the search
0884: i = firstChar;
0885: }
0886: }
0887:
0888: return false;
0889: }
0890: }
0891:
0892: /**
0893: * Matches the current regular expression program against a String.
0894: *
0895: * @param search String to match against
0896: * @return True if string matched
0897: */
0898: public boolean match(String search) {
0899: return match(search, 0);
0900: }
0901:
0902: /**
0903: * Matches the current regular expression program against a character array,
0904: * starting at a given index.
0905: *
0906: * @param search String to match against
0907: * @param i Index to start searching at
0908: * @return True if string matched
0909: */
0910: public boolean match(String search, int i) {
0911: return match(new StringCharacterIterator(search), i);
0912: }
0913:
0914: /**
0915: * Match the current regular expression program against the current
0916: * input string, starting at index i of the input string. This method
0917: * is only meant for internal use.
0918: *
0919: * @param i The input string index to start matching at
0920: * @return True if the input matched the expression
0921: */
0922: protected boolean matchAt(int i) {
0923:
0924: // Initialize start pointer, paren cache and paren count
0925: start0 = -1;
0926: end0 = -1;
0927: start1 = -1;
0928: end1 = -1;
0929: start2 = -1;
0930: end2 = -1;
0931: startn = null;
0932: endn = null;
0933: parenCount = 1;
0934: setParenStart(0, i);
0935:
0936: // Allocate backref arrays (unless optimizations indicate otherwise)
0937: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0938: startBackref = new int[maxParen];
0939: endBackref = new int[maxParen];
0940: }
0941:
0942: // Match against string
0943: int idx;
0944:
0945: if ((idx = matchNodes(0, maxNode, i)) != -1) {
0946: setParenEnd(0, idx);
0947:
0948: return true;
0949: }
0950:
0951: // Didn't match
0952: parenCount = 0;
0953:
0954: return false;
0955: }
0956:
0957: /**
0958: * Try to match a string against a subset of nodes in the program
0959: *
0960: * @param firstNode Node to start at in program
0961: * @param lastNode Last valid node (used for matching a subexpression without
0962: * matching the rest of the program as well).
0963: * @param idxStart Starting position in character array
0964: * @return Final input array index if match succeeded. -1 if not.
0965: */
0966: protected int matchNodes(int firstNode, int lastNode, int idxStart) {
0967:
0968: // Our current place in the string
0969: int idx = idxStart;
0970:
0971: // Loop while node is valid
0972: int next;
0973: int opcode;
0974: int opdata;
0975: int idxNew;
0976: char[] instruction = program.instruction;
0977:
0978: for (int node = firstNode; node < lastNode;) {
0979: opcode = instruction[node + offsetOpcode];
0980: next = node + (short) instruction[node + offsetNext];
0981: opdata = instruction[node + offsetOpdata];
0982:
0983: switch (opcode) {
0984: case OP_RELUCTANTMAYBE: {
0985: int once = 0;
0986:
0987: do {
0988:
0989: // Try to match the rest without using the reluctant subexpr
0990: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0991: return idxNew;
0992: }
0993: } while ((once++ == 0)
0994: && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
0995:
0996: return -1;
0997: }
0998: case OP_RELUCTANTPLUS:
0999: while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) {
1000:
1001: // Try to match the rest without using the reluctant subexpr
1002: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1003: return idxNew;
1004: }
1005: }
1006:
1007: return -1;
1008:
1009: case OP_RELUCTANTSTAR:
1010: do {
1011:
1012: // Try to match the rest without using the reluctant subexpr
1013: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1014: return idxNew;
1015: }
1016: } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
1017:
1018: return -1;
1019:
1020: case OP_OPEN:
1021:
1022: // Match subexpression
1023: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1024: startBackref[opdata] = idx;
1025: }
1026: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1027:
1028: // Increase valid paren count
1029: if ((opdata + 1) > parenCount) {
1030: parenCount = opdata + 1;
1031: }
1032: // Don't set paren if already set later on
1033: if (getParenStart(opdata) == -1) {
1034: setParenStart(opdata, idx);
1035: }
1036: }
1037:
1038: return idxNew;
1039:
1040: case OP_CLOSE:
1041:
1042: // Done matching subexpression
1043: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1044: endBackref[opdata] = idx;
1045: }
1046: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1047:
1048: // Increase valid paren count
1049: if ((opdata + 1) > parenCount) {
1050: parenCount = opdata + 1;
1051: }
1052: // Don't set paren if already set later on
1053: if (getParenEnd(opdata) == -1) {
1054: setParenEnd(opdata, idx);
1055: }
1056: }
1057:
1058: return idxNew;
1059:
1060: case OP_BACKREF: {
1061:
1062: // Get the start and end of the backref
1063: int s = startBackref[opdata];
1064: int e = endBackref[opdata];
1065:
1066: // We don't know the backref yet
1067: if (s == -1 || e == -1) {
1068: return -1;
1069: }
1070: // The backref is empty size
1071: if (s == e) {
1072: break;
1073: }
1074:
1075: // Get the length of the backref
1076: int l = e - s;
1077:
1078: // If there's not enough input left, give up.
1079: if (search.isEnd(idx + l)) {
1080: return -1;
1081: }
1082: // Case fold the backref?
1083: if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1084:
1085: // Compare backref to input, case-folding as we go
1086: for (int i = 0; i < l; i++) {
1087: if (Character.toLowerCase(search.charAt(idx++)) != Character
1088: .toLowerCase(search.charAt(s + i))) {
1089: return -1;
1090: }
1091: }
1092: } else {
1093:
1094: // Compare backref to input
1095: for (int i = 0; i < l; i++) {
1096: if (search.charAt(idx++) != search
1097: .charAt(s + i)) {
1098: return -1;
1099: }
1100: }
1101: }
1102: }
1103:
1104: break;
1105:
1106: case OP_BOL:
1107:
1108: // Fail if we're not at the start of the string
1109: if (idx != 0) {
1110:
1111: // If we're multiline matching, we could still be at the start of a line
1112: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1113:
1114: // If not at start of line, give up
1115: if (idx <= 0 || !isNewline(idx - 1)) {
1116: return -1;
1117: } else {
1118: break;
1119: }
1120: }
1121:
1122: return -1;
1123: }
1124:
1125: break;
1126:
1127: case OP_EOL:
1128:
1129: // If we're not at the end of string
1130: if (!search.isEnd(0) && !search.isEnd(idx)) {
1131:
1132: // If we're multi-line matching
1133: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1134:
1135: // Give up if we're not at the end of a line
1136: if (!isNewline(idx)) {
1137: return -1;
1138: } else {
1139: break;
1140: }
1141: }
1142:
1143: return -1;
1144: }
1145:
1146: break;
1147:
1148: case OP_ESCAPE:
1149:
1150: // Which escape?
1151: switch (opdata) {
1152:
1153: // Word boundary match
1154: case E_NBOUND:
1155: case E_BOUND: {
1156: char cLast = ((idx == getParenStart(0)) ? '\n'
1157: : search.charAt(idx - 1));
1158: char cNext = ((search.isEnd(idx)) ? '\n' : search
1159: .charAt(idx));
1160:
1161: if ((Character.isLetterOrDigit(cLast) == Character
1162: .isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
1163: return -1;
1164: }
1165: }
1166:
1167: break;
1168:
1169: // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1170: case E_ALNUM:
1171: case E_NALNUM:
1172: case E_DIGIT:
1173: case E_NDIGIT:
1174: case E_SPACE:
1175: case E_NSPACE:
1176:
1177: // Give up if out of input
1178: if (search.isEnd(idx)) {
1179: return -1;
1180: }
1181: // Switch on escape
1182: switch (opdata) {
1183: case E_ALNUM:
1184: case E_NALNUM:
1185: if (!(Character.isLetterOrDigit(search
1186: .charAt(idx)) == (opdata == E_ALNUM))) {
1187: return -1;
1188: }
1189:
1190: break;
1191:
1192: case E_DIGIT:
1193: case E_NDIGIT:
1194: if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) {
1195: return -1;
1196: }
1197:
1198: break;
1199:
1200: case E_SPACE:
1201: case E_NSPACE:
1202: if (!(Character
1203: .isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) {
1204: return -1;
1205: }
1206:
1207: break;
1208: }
1209:
1210: idx++;
1211: break;
1212:
1213: default:
1214: internalError("Unrecognized escape '" + opdata
1215: + "'");
1216: }
1217:
1218: break;
1219:
1220: case OP_ANY:
1221:
1222: // Match anything but a newline
1223: if (search.isEnd(idx) || search.charAt(idx++) == '\n') {
1224: return -1;
1225: }
1226:
1227: break;
1228:
1229: case OP_ATOM: {
1230:
1231: // Match an atom value
1232: if (search.isEnd(idx)) {
1233: return -1;
1234: }
1235:
1236: // Get length of atom and starting index
1237: int lenAtom = opdata;
1238: int startAtom = node + nodeSize;
1239:
1240: // Give up if not enough input remains to have a match
1241: if (search.isEnd(lenAtom + idx - 1)) {
1242: return -1;
1243: }
1244: // Match atom differently depending on casefolding flag
1245: if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1246: for (int i = 0; i < lenAtom; i++) {
1247: if (Character.toLowerCase(search.charAt(idx++)) != Character
1248: .toLowerCase(instruction[startAtom + i])) {
1249: return -1;
1250: }
1251: }
1252: } else {
1253: for (int i = 0; i < lenAtom; i++) {
1254: if (search.charAt(idx++) != instruction[startAtom
1255: + i]) {
1256: return -1;
1257: }
1258: }
1259: }
1260: }
1261:
1262: break;
1263:
1264: case OP_POSIXCLASS: {
1265:
1266: // Out of input?
1267: if (search.isEnd(idx)) {
1268: return -1;
1269: }
1270: switch (opdata) {
1271: case POSIX_CLASS_ALNUM:
1272: if (!Character.isLetterOrDigit(search.charAt(idx))) {
1273: return -1;
1274: }
1275:
1276: break;
1277:
1278: case POSIX_CLASS_ALPHA:
1279: if (!Character.isLetter(search.charAt(idx))) {
1280: return -1;
1281: }
1282:
1283: break;
1284:
1285: case POSIX_CLASS_DIGIT:
1286: if (!Character.isDigit(search.charAt(idx))) {
1287: return -1;
1288: }
1289:
1290: break;
1291:
1292: case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1293: if (!Character.isSpaceChar(search.charAt(idx))) {
1294: return -1;
1295: }
1296:
1297: break;
1298:
1299: case POSIX_CLASS_SPACE:
1300: if (!Character.isWhitespace(search.charAt(idx))) {
1301: return -1;
1302: }
1303:
1304: break;
1305:
1306: case POSIX_CLASS_CNTRL:
1307: if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1308: return -1;
1309: }
1310:
1311: break;
1312:
1313: case POSIX_CLASS_GRAPH: // JWL - bugbug???
1314: switch (Character.getType(search.charAt(idx))) {
1315: case Character.MATH_SYMBOL:
1316: case Character.CURRENCY_SYMBOL:
1317: case Character.MODIFIER_SYMBOL:
1318: case Character.OTHER_SYMBOL:
1319: break;
1320:
1321: default:
1322: return -1;
1323: }
1324:
1325: break;
1326:
1327: case POSIX_CLASS_LOWER:
1328: if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1329: return -1;
1330: }
1331:
1332: break;
1333:
1334: case POSIX_CLASS_UPPER:
1335: if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1336: return -1;
1337: }
1338:
1339: break;
1340:
1341: case POSIX_CLASS_PRINT:
1342: if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1343: return -1;
1344: }
1345:
1346: break;
1347:
1348: case POSIX_CLASS_PUNCT: {
1349: int type = Character.getType(search.charAt(idx));
1350:
1351: switch (type) {
1352: case Character.DASH_PUNCTUATION:
1353: case Character.START_PUNCTUATION:
1354: case Character.END_PUNCTUATION:
1355: case Character.CONNECTOR_PUNCTUATION:
1356: case Character.OTHER_PUNCTUATION:
1357: break;
1358:
1359: default:
1360: return -1;
1361: }
1362: }
1363:
1364: break;
1365:
1366: case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1367: {
1368: boolean isXDigit = ((search.charAt(idx) >= '0' && search
1369: .charAt(idx) <= '9')
1370: || (search.charAt(idx) >= 'a' && search
1371: .charAt(idx) <= 'f') || (search
1372: .charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1373:
1374: if (!isXDigit) {
1375: return -1;
1376: }
1377: }
1378:
1379: break;
1380:
1381: case POSIX_CLASS_JSTART:
1382: if (!Character.isJavaIdentifierStart(search
1383: .charAt(idx))) {
1384: return -1;
1385: }
1386:
1387: break;
1388:
1389: case POSIX_CLASS_JPART:
1390: if (!Character.isJavaIdentifierPart(search
1391: .charAt(idx))) {
1392: return -1;
1393: }
1394:
1395: break;
1396:
1397: default:
1398: internalError("Bad posix class");
1399: break;
1400: }
1401:
1402: // Matched.
1403: idx++;
1404: }
1405:
1406: break;
1407:
1408: case OP_ANYOF: {
1409:
1410: // Out of input?
1411: if (search.isEnd(idx)) {
1412: return -1;
1413: }
1414:
1415: // Get character to match against character class and maybe casefold
1416: char c = search.charAt(idx);
1417: boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1418:
1419: if (caseFold) {
1420: c = Character.toLowerCase(c);
1421: }
1422:
1423: // Loop through character class checking our match character
1424: int idxRange = node + nodeSize;
1425: int idxEnd = idxRange + (opdata * 2);
1426: boolean match = false;
1427:
1428: for (int i = idxRange; i < idxEnd;) {
1429:
1430: // Get start, end and match characters
1431: char s = instruction[i++];
1432: char e = instruction[i++];
1433:
1434: // Fold ends of range and match character
1435: if (caseFold) {
1436: s = Character.toLowerCase(s);
1437: e = Character.toLowerCase(e);
1438: }
1439: // If the match character is in range, break out
1440: if (c >= s && c <= e) {
1441: match = true;
1442: break;
1443: }
1444: }
1445: // Fail if we didn't match the character class
1446: if (!match) {
1447: return -1;
1448: }
1449:
1450: idx++;
1451: }
1452:
1453: break;
1454:
1455: case OP_BRANCH: {
1456:
1457: // Check for choices
1458: if (instruction[next + offsetOpcode] != OP_BRANCH) {
1459:
1460: // If there aren't any other choices, just evaluate this branch.
1461: node += nodeSize;
1462: continue;
1463: }
1464:
1465: // Try all available branches
1466: short nextBranch;
1467:
1468: do {
1469:
1470: // Try matching the branch against the string
1471: if ((idxNew = matchNodes(node + nodeSize, maxNode,
1472: idx)) != -1) {
1473: return idxNew;
1474: }
1475:
1476: // Go to next branch (if any)
1477: nextBranch = (short) instruction[node + offsetNext];
1478: node += nextBranch;
1479: } while (nextBranch != 0
1480: && (instruction[node + offsetOpcode] == OP_BRANCH));
1481: // Failed to match any branch!
1482: return -1;
1483: }
1484: case OP_NOTHING:
1485: case OP_GOTO:
1486:
1487: // Just advance to the next node without doing anything
1488: break;
1489:
1490: case OP_END:
1491:
1492: // Match has succeeded!
1493: setParenEnd(0, idx);
1494:
1495: return idx;
1496:
1497: default:
1498:
1499: // Corrupt program
1500: internalError("Invalid opcode '" + opcode + "'");
1501: }
1502:
1503: // Advance to the next node in the program
1504: node = next;
1505: }
1506:
1507: // We "should" never end up here
1508: internalError("Corrupt program");
1509:
1510: return -1;
1511: }
1512:
1513: /**
1514: * Sets match behaviour flags which alter the way RE does matching.
1515: *
1516: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
1517: * <p/>
1518: * <pre>
1519: * <p/>
1520: * MATCH_NORMAL // Normal (case-sensitive) matching
1521: * MATCH_CASEINDEPENDENT // Case folded comparisons
1522: * MATCH_MULTILINE // Newline matches as BOL/EOL
1523: * <p/>
1524: * </pre>
1525: */
1526: public void setMatchFlags(int matchFlags) {
1527: this .matchFlags = matchFlags;
1528: }
1529:
1530: /**
1531: * Sets the end of a paren level
1532: *
1533: * @param which Which paren level
1534: * @param i Index in input array
1535: */
1536: protected final void setParenEnd(int which, int i) {
1537: if (which < parenCount) {
1538: switch (which) {
1539: case 0:
1540: end0 = i;
1541: break;
1542:
1543: case 1:
1544: end1 = i;
1545: break;
1546:
1547: case 2:
1548: end2 = i;
1549: break;
1550:
1551: default:
1552:
1553: if (endn == null) {
1554: allocParens();
1555: }
1556:
1557: endn[which] = i;
1558: break;
1559: }
1560: }
1561: }
1562:
1563: /**
1564: * Sets the start of a paren level
1565: *
1566: * @param which Which paren level
1567: * @param i Index in input array
1568: */
1569: protected final void setParenStart(int which, int i) {
1570: if (which < parenCount) {
1571: switch (which) {
1572: case 0:
1573: start0 = i;
1574: break;
1575:
1576: case 1:
1577: start1 = i;
1578: break;
1579:
1580: case 2:
1581: start2 = i;
1582: break;
1583:
1584: default:
1585:
1586: if (startn == null) {
1587: allocParens();
1588: }
1589:
1590: startn[which] = i;
1591: break;
1592: }
1593: }
1594: }
1595:
1596: /**
1597: * Sets the current regular expression program used by this matcher object.
1598: *
1599: * @param program Regular expression program compiled by RECompiler.
1600: * @see RECompiler
1601: * @see REProgram
1602: */
1603: public void setProgram(REProgram program) {
1604: this .program = program;
1605: }
1606:
1607: /**
1608: * Converts a 'simplified' regular expression to a full regular expression
1609: *
1610: * @param pattern The pattern to convert
1611: * @return The full regular expression
1612: */
1613: public static String simplePatternToFullRegularExpression(
1614: String pattern) {
1615: StringBuffer buf = new StringBuffer();
1616:
1617: for (int i = 0; i < pattern.length(); i++) {
1618: char c = pattern.charAt(i);
1619:
1620: switch (c) {
1621: case '*':
1622: buf.append(".*");
1623: break;
1624:
1625: case '.':
1626: case '[':
1627: case ']':
1628: case '\\':
1629: case '+':
1630: case '?':
1631: case '{':
1632: case '}':
1633: case '$':
1634: case '^':
1635: case '|':
1636: case '(':
1637: case ')':
1638: buf.append('\\');
1639:
1640: default:
1641: buf.append(c);
1642: break;
1643: }
1644: }
1645:
1646: return buf.toString();
1647: }
1648:
1649: /**
1650: * Splits a string into an array of strings on regular expression boundaries.
1651: * This function works the same way as the Perl function of the same name.
1652: * Given a regular expression of "[ab]+" and a string to split of
1653: * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1654: * "[xyzzy, yyz, 123]".
1655: *
1656: * @param s String to split on this regular exression
1657: * @return Array of strings
1658: */
1659: public String[] split(String s) {
1660:
1661: // Create new vector
1662: Vector v = new Vector();
1663:
1664: // Start at position 0 and search the whole string
1665: int pos = 0;
1666: int len = s.length();
1667:
1668: // Try a match at each position
1669: while (pos < len && match(s, pos)) {
1670:
1671: // Get start of match
1672: int start = getParenStart(0);
1673:
1674: // Get end of match
1675: int newpos = getParenEnd(0);
1676:
1677: // Check if no progress was made
1678: if (newpos == pos) {
1679: v.addElement(s.substring(pos, start + 1));
1680: newpos++;
1681: } else {
1682: v.addElement(s.substring(pos, start));
1683: }
1684:
1685: // Move to new position
1686: pos = newpos;
1687: }
1688:
1689: // Push remainder if it's not empty
1690: String remainder = s.substring(pos);
1691:
1692: if (remainder.length() != 0) {
1693: v.addElement(remainder);
1694: }
1695:
1696: // Return vector as an array of strings
1697: String[] ret = new String[v.size()];
1698: v.copyInto(ret);
1699:
1700: return ret;
1701: }
1702:
1703: /**
1704: * Substitutes a string for this regular expression in another string.
1705: * This method works like the Perl function of the same name.
1706: * Given a regular expression of "a*b", a String to substituteIn of
1707: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1708: * resulting String returned by subst would be "-foo-garply-wacky-".
1709: *
1710: * @param substituteIn String to substitute within
1711: * @param substitution String to substitute for all matches of this regular expression.
1712: * @return The string substituteIn with zero or more occurrences of the current
1713: * regular expression replaced with the substitution String (if this regular
1714: * expression object doesn't match at any position, the original String is returned
1715: * unchanged).
1716: */
1717: public String subst(String substituteIn, String substitution) {
1718: return subst(substituteIn, substitution, REPLACE_ALL);
1719: }
1720:
1721: /**
1722: * Substitutes a string for this regular expression in another string.
1723: * This method works like the Perl function of the same name.
1724: * Given a regular expression of "a*b", a String to substituteIn of
1725: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1726: * resulting String returned by subst would be "-foo-garply-wacky-".
1727: *
1728: * @param substituteIn String to substitute within
1729: * @param substitution String to substitute for matches of this regular expression
1730: * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1731: * flag bit is set, only the first occurrence of this regular expression is replaced.
1732: * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1733: * replaced.
1734: * @return The string substituteIn with zero or more occurrences of the current
1735: * regular expression replaced with the substitution String (if this regular
1736: * expression object doesn't match at any position, the original String is returned
1737: * unchanged).
1738: */
1739: public String subst(String substituteIn, String substitution,
1740: int flags) {
1741:
1742: // String to return
1743: StringBuffer ret = new StringBuffer();
1744:
1745: // Start at position 0 and search the whole string
1746: int pos = 0;
1747: int len = substituteIn.length();
1748:
1749: // Try a match at each position
1750: while (pos < len && match(substituteIn, pos)) {
1751:
1752: // Append string before match
1753: ret.append(substituteIn.substring(pos, getParenStart(0)));
1754:
1755: // Append substitution
1756: ret.append(substitution);
1757:
1758: // Move forward, skipping past match
1759: int newpos = getParenEnd(0);
1760:
1761: // We always want to make progress!
1762: if (newpos == pos) {
1763: newpos++;
1764: }
1765:
1766: // Try new position
1767: pos = newpos;
1768:
1769: // Break out if we're only supposed to replace one occurrence
1770: if ((flags & REPLACE_FIRSTONLY) != 0) {
1771: break;
1772: }
1773: }
1774: // If there's remaining input, append it
1775: if (pos < len) {
1776: ret.append(substituteIn.substring(pos));
1777: }
1778:
1779: // Return string buffer as string
1780: return ret.toString();
1781: }
1782: }
|