0001: package org.apache.regexp;
0002:
0003: /*
0004: * ====================================================================
0005: *
0006: * The Apache Software License, Version 1.1
0007: *
0008: * Copyright (c) 1999 The Apache Software Foundation. All rights
0009: * reserved.
0010: *
0011: * Redistribution and use in source and binary forms, with or without
0012: * modification, are permitted provided that the following conditions
0013: * are met:
0014: *
0015: * 1. Redistributions of source code must retain the above copyright
0016: * notice, this list of conditions and the following disclaimer.
0017: *
0018: * 2. Redistributions in binary form must reproduce the above copyright
0019: * notice, this list of conditions and the following disclaimer in
0020: * the documentation and/or other materials provided with the
0021: * distribution.
0022: *
0023: * 3. The end-user documentation included with the redistribution, if
0024: * any, must include the following acknowlegement:
0025: * "This product includes software developed by the
0026: * Apache Software Foundation (http://www.apache.org/)."
0027: * Alternately, this acknowlegement may appear in the software itself,
0028: * if and wherever such third-party acknowlegements normally appear.
0029: *
0030: * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
0031: * Foundation" must not be used to endorse or promote products derived
0032: * from this software without prior written permission. For written
0033: * permission, please contact apache@apache.org.
0034: *
0035: * 5. Products derived from this software may not be called "Apache"
0036: * nor may "Apache" appear in their names without prior written
0037: * permission of the Apache Group.
0038: *
0039: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0040: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0041: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0042: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0043: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0044: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0045: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0046: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0047: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0048: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0049: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0050: * SUCH DAMAGE.
0051: * ====================================================================
0052: *
0053: * This software consists of voluntary contributions made by many
0054: * individuals on behalf of the Apache Software Foundation. For more
0055: * information on the Apache Software Foundation, please see
0056: * <http://www.apache.org/>.
0057: *
0058: */
0059:
0060: import java.util.Vector;
0061:
0062: /**
0063: * RE is an efficient, lightweight regular expression evaluator/matcher class.
0064: * Regular expressions are pattern descriptions which enable sophisticated matching of
0065: * strings. In addition to being able to match a string against a pattern, you
0066: * can also extract parts of the match. This is especially useful in text parsing!
0067: * Details on the syntax of regular expression patterns are given below.
0068: *
0069: * <p>
0070: *
0071: * To compile a regular expression (RE), you can simply construct an RE matcher
0072: * object from the string specification of the pattern, like this:
0073: *
0074: * <pre>
0075: *
0076: * RE r = new RE("a*b");
0077: *
0078: * </pre>
0079: *
0080: * <p>
0081: *
0082: * Once you have done this, you can call either of the RE.match methods to
0083: * perform matching on a String. For example:
0084: *
0085: * <pre>
0086: *
0087: * boolean matched = r.match("aaaab");
0088: *
0089: * </pre>
0090: *
0091: * will cause the boolean matched to be set to true because the
0092: * pattern "a*b" matches the string "aaaab".
0093: *
0094: * <p>
0095: * If you were interested in the <i>number</i> of a's which matched the first
0096: * part of our example expression, you could change the expression to
0097: * "(a*)b". Then when you compiled the expression and matched it against
0098: * something like "xaaaab", you would get results like this:
0099: *
0100: * <pre>
0101: *
0102: * RE r = new RE("(a*)b"); // Compile expression
0103: * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
0104: *
0105: * <br>
0106: *
0107: * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
0108: * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
0109: *
0110: * <br>
0111: *
0112: * int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
0113: * int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
0114: * int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
0115: *
0116: * <br>
0117: *
0118: * int startInside = getParenStart(1); // startInside will be index 1
0119: * int endInside = getParenEnd(1); // endInside will be index 5
0120: * int lenInside = getParenLength(1); // lenInside will be 4
0121: *
0122: * </pre>
0123: *
0124: * You can also refer to the contents of a parenthesized expression within
0125: * a regular expression itself. This is called a 'backreference'. The first
0126: * backreference in a regular expression is denoted by \1, the second by \2
0127: * and so on. So the expression:
0128: *
0129: * <pre>
0130: *
0131: * ([0-9]+)=\1
0132: *
0133: * </pre>
0134: *
0135: * will match any string of the form n=n (like 0=0 or 2=2).
0136: *
0137: * <p>
0138: *
0139: * The full regular expression syntax accepted by RE is described here:
0140: *
0141: * <pre>
0142: *
0143: * <br>
0144: *
0145: * <b><font face=times roman>Characters</font></b>
0146: *
0147: * <br>
0148: *
0149: * <i>unicodeChar</i> Matches any identical unicode character
0150: * \ Used to quote a meta-character (like '*')
0151: * \\ Matches a single '\' character
0152: * \0nnn Matches a given octal character
0153: * \xhh Matches a given 8-bit hexadecimal character
0154: * \\uhhhh Matches a given 16-bit hexadecimal character
0155: * \t Matches an ASCII tab character
0156: * \n Matches an ASCII newline character
0157: * \r Matches an ASCII return character
0158: * \f Matches an ASCII form feed character
0159: *
0160: * <br>
0161: *
0162: * <b><font face=times roman>Character Classes</font></b>
0163: *
0164: * <br>
0165: *
0166: * [abc] Simple character class
0167: * [a-zA-Z] Character class with ranges
0168: * [^abc] Negated character class
0169: *
0170: * <br>
0171: *
0172: * <b><font face=times roman>Standard POSIX Character Classes</font></b>
0173: *
0174: * <br>
0175: *
0176: * [:alnum:] Alphanumeric characters.
0177: * [:alpha:] Alphabetic characters.
0178: * [:blank:] Space and tab characters.
0179: * [:cntrl:] Control characters.
0180: * [:digit:] Numeric characters.
0181: * [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
0182: * [:lower:] Lower-case alphabetic characters.
0183: * [:print:] Printable characters (characters that are not control characters.)
0184: * [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
0185: * [:space:] Space characters (such as space, tab, and formfeed, to name a few).
0186: * [:upper:] Upper-case alphabetic characters.
0187: * [:xdigit:] Characters that are hexadecimal digits.
0188: *
0189: * <br>
0190: *
0191: * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
0192: *
0193: * <br>
0194: *
0195: * [:javastart:] Start of a Java identifier
0196: * [:javapart:] Part of a Java identifier
0197: *
0198: * <br>
0199: *
0200: * <b><font face=times roman>Predefined Classes</font></b>
0201: *
0202: * <br>
0203: *
0204: * . Matches any character other than newline
0205: * \w Matches a "word" character (alphanumeric plus "_")
0206: * \W Matches a non-word character
0207: * \s Matches a whitespace character
0208: * \S Matches a non-whitespace character
0209: * \d Matches a digit character
0210: * \D Matches a non-digit character
0211: *
0212: * <br>
0213: *
0214: * <b><font face=times roman>Boundary Matchers</font></b>
0215: *
0216: * <br>
0217: *
0218: * ^ Matches only at the beginning of a line
0219: * $ Matches only at the end of a line
0220: * \b Matches only at a word boundary
0221: * \B Matches only at a non-word boundary
0222: *
0223: * <br>
0224: *
0225: * <b><font face=times roman>Greedy Closures</font></b>
0226: *
0227: * <br>
0228: *
0229: * A* Matches A 0 or more times (greedy)
0230: * A+ Matches A 1 or more times (greedy)
0231: * A? Matches A 1 or 0 times (greedy)
0232: * A{n} Matches A exactly n times (greedy)
0233: * A{n,} Matches A at least n times (greedy)
0234: * A{n,m} Matches A at least n but not more than m times (greedy)
0235: *
0236: * <br>
0237: *
0238: * <b><font face=times roman>Reluctant Closures</font></b>
0239: *
0240: * <br>
0241: *
0242: * A*? Matches A 0 or more times (reluctant)
0243: * A+? Matches A 1 or more times (reluctant)
0244: * A?? Matches A 0 or 1 times (reluctant)
0245: *
0246: * <br>
0247: *
0248: * <b><font face=times roman>Logical Operators</font></b>
0249: *
0250: * <br>
0251: *
0252: * AB Matches A followed by B
0253: * A|B Matches either A or B
0254: * (A) Used for subexpression grouping
0255: *
0256: * <br>
0257: *
0258: * <b><font face=times roman>Backreferences</font></b>
0259: *
0260: * <br>
0261: *
0262: * \1 Backreference to 1st parenthesized subexpression
0263: * \2 Backreference to 2nd parenthesized subexpression
0264: * \3 Backreference to 3rd parenthesized subexpression
0265: * \4 Backreference to 4th parenthesized subexpression
0266: * \5 Backreference to 5th parenthesized subexpression
0267: * \6 Backreference to 6th parenthesized subexpression
0268: * \7 Backreference to 7th parenthesized subexpression
0269: * \8 Backreference to 8th parenthesized subexpression
0270: * \9 Backreference to 9th parenthesized subexpression
0271: *
0272: * <br>
0273: *
0274: * </pre>
0275: *
0276: * <p>
0277: *
0278: * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
0279: * match as many elements of the string as possible without causing the overall
0280: * match to fail. If you want a closure to be reluctant (non-greedy), you can
0281: * simply follow it with a '?'. A reluctant closure will match as few elements
0282: * of the string as possible when finding matches. {m,n} closures don't currently
0283: * support reluctancy.
0284: *
0285: * <p>
0286: *
0287: * RE runs programs compiled by the RECompiler class. But the RE matcher class
0288: * does not include the actual regular expression compiler for reasons of
0289: * efficiency. In fact, if you want to pre-compile one or more regular expressions,
0290: * the 'recompile' class can be invoked from the command line to produce compiled
0291: * output like this:
0292: *
0293: * <pre>
0294: *
0295: * // Pre-compiled regular expression "a*b"
0296: * char[] re1Instructions =
0297: * {
0298: * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
0299: * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
0300: * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
0301: * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
0302: * 0x0000,
0303: * };
0304: *
0305: * <br>
0306: *
0307: * REProgram re1 = new REProgram(re1Instructions);
0308: *
0309: * </pre>
0310: *
0311: * You can then construct a regular expression matcher (RE) object from the pre-compiled
0312: * expression re1 and thus avoid the overhead of compiling the expression at runtime.
0313: * If you require more dynamic regular expressions, you can construct a single RECompiler
0314: * object and re-use it to compile each expression. Similarly, you can change the
0315: * program run by a given matcher object at any time. However, RE and RECompiler are
0316: * not threadsafe (for efficiency reasons, and because requiring thread safety in this
0317: * class is deemed to be a rare requirement), so you will need to construct a separate
0318: * compiler or matcher object for each thread (unless you do thread synchronization
0319: * yourself).
0320: *
0321: * </pre>
0322: * <br><p><br>
0323: *
0324: * <font color=red>
0325: * <i>ISSUES:</i>
0326: *
0327: * <ul>
0328: * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
0329: * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
0330: * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
0331: * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
0332: * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
0333: * <li>Should RE support character iterators (for backwards RE matching!)?
0334: * <li>Should RE support reluctant {m,n} closures (does anyone care)?
0335: * <li>Not *all* possibilities are considered for greediness when backreferences
0336: * are involved (as POSIX suggests should be the case). The POSIX RE
0337: * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
0338: * of acdacaa where \1 is "a". This is not the case in this RE package,
0339: * and actually Perl doesn't go to this extent either! Until someone
0340: * actually complains about this, I'm not sure it's worth "fixing".
0341: * If it ever is fixed, test #137 in RETest.txt should be updated.
0342: * </ul>
0343: *
0344: * </font>
0345: *
0346: * @see recompile
0347: * @see RECompiler
0348: *
0349: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0350: * @version $Id: RE.java,v 1.1.1.1 2002/01/31 03:14:36 rcm Exp $
0351: */
0352: public class RE {
0353: /**
0354: * Specifies normal, case-sensitive matching behaviour.
0355: */
0356: public static final int MATCH_NORMAL = 0x0000;
0357:
0358: /**
0359: * Flag to indicate that matching should be case-independent (folded)
0360: */
0361: public static final int MATCH_CASEINDEPENDENT = 0x0001;
0362:
0363: /**
0364: * Newlines should match as BOL/EOL (^ and $)
0365: */
0366: public static final int MATCH_MULTILINE = 0x0002;
0367:
0368: /**
0369: * Consider all input a single body of text - newlines are matched by .
0370: */
0371: public static final int MATCH_SINGLELINE = 0x0004;
0372:
0373: /************************************************
0374: * *
0375: * The format of a node in a program is: *
0376: * *
0377: * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
0378: * *
0379: * char OPCODE - instruction *
0380: * char OPDATA - modifying data *
0381: * char OPNEXT - next node (relative offset) *
0382: * *
0383: ************************************************/
0384:
0385: // Opcode Char Opdata/Operand Meaning
0386: // ---------- ---------- --------------- --------------------------------------------------
0387: static final char OP_END = 'E'; // end of program
0388: static final char OP_BOL = '^'; // match only if at beginning of line
0389: static final char OP_EOL = '$'; // match only if at end of line
0390: static final char OP_ANY = '.'; // match any single character except newline
0391: static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
0392: static final char OP_BRANCH = '|'; // node match this alternative or the next one
0393: static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
0394: static final char OP_STAR = '*'; // node kleene closure
0395: static final char OP_PLUS = '+'; // node positive closure
0396: static final char OP_MAYBE = '?'; // node optional closure
0397: static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
0398: static final char OP_OPEN = '('; // number nth opening paren
0399: static final char OP_CLOSE = ')'; // number nth closing paren
0400: static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
0401: static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
0402: static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
0403: static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
0404: static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
0405: static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
0406: static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
0407:
0408: // Escape codes
0409: static final char E_ALNUM = 'w'; // Alphanumeric
0410: static final char E_NALNUM = 'W'; // Non-alphanumeric
0411: static final char E_BOUND = 'b'; // Word boundary
0412: static final char E_NBOUND = 'B'; // Non-word boundary
0413: static final char E_SPACE = 's'; // Whitespace
0414: static final char E_NSPACE = 'S'; // Non-whitespace
0415: static final char E_DIGIT = 'd'; // Digit
0416: static final char E_NDIGIT = 'D'; // Non-digit
0417:
0418: // Posix character classes
0419: static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
0420: static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
0421: static final char POSIX_CLASS_BLANK = 'b'; // Blanks
0422: static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
0423: static final char POSIX_CLASS_DIGIT = 'd'; // Digits
0424: static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
0425: static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
0426: static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
0427: static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
0428: static final char POSIX_CLASS_SPACE = 's'; // Spaces
0429: static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
0430: static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
0431: static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
0432: static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
0433:
0434: // Limits
0435: static final int maxNode = 65536; // Maximum number of nodes in a program
0436: static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
0437:
0438: // Node layout constants
0439: static final int offsetOpcode = 0; // Opcode offset (first character)
0440: static final int offsetOpdata = 1; // Opdata offset (second char)
0441: static final int offsetNext = 2; // Next index offset (third char)
0442: static final int nodeSize = 3; // Node size (in chars)
0443:
0444: /** Line Separator */
0445: static final String NEWLINE = System.getProperty("line.separator");
0446:
0447: // State of current program
0448: REProgram program; // Compiled regular expression 'program'
0449: CharacterIterator search; // The string being matched against
0450: int idx; // Current index in string being searched
0451: int matchFlags; // Match behaviour flags
0452:
0453: // Parenthesized subexpressions
0454: int parenCount; // Number of subexpressions matched (num open parens + 1)
0455: int start0; // Cache of start[0]
0456: int end0; // Cache of start[0]
0457: int start1; // Cache of start[1]
0458: int end1; // Cache of start[1]
0459: int start2; // Cache of start[2]
0460: int end2; // Cache of start[2]
0461: int[] startn; // Lazy-alloced array of sub-expression starts
0462: int[] endn; // Lazy-alloced array of sub-expression ends
0463:
0464: // Backreferences
0465: int[] startBackref; // Lazy-alloced array of backref starts
0466: int[] endBackref; // Lazy-alloced array of backref ends
0467:
0468: /**
0469: * Constructs a regular expression matcher from a String by compiling it
0470: * using a new instance of RECompiler. If you will be compiling many
0471: * expressions, you may prefer to use a single RECompiler object instead.
0472: * @param pattern The regular expression pattern to compile.
0473: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0474: * @see RECompiler
0475: * @see recompile
0476: */
0477: public RE(String pattern) throws RESyntaxException {
0478: this (pattern, MATCH_NORMAL);
0479: }
0480:
0481: /**
0482: * Constructs a regular expression matcher from a String by compiling it
0483: * using a new instance of RECompiler. If you will be compiling many
0484: * expressions, you may prefer to use a single RECompiler object instead.
0485: * @param pattern The regular expression pattern to compile.
0486: * @param matchFlags The matching style
0487: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0488: * @see RECompiler
0489: * @see recompile
0490: */
0491: public RE(String pattern, int matchFlags) throws RESyntaxException {
0492: this (new RECompiler().compile(pattern));
0493: setMatchFlags(matchFlags);
0494: }
0495:
0496: /**
0497: * Construct a matcher for a pre-compiled regular expression from program
0498: * (bytecode) data. Permits special flags to be passed in to modify matching
0499: * behaviour.
0500: * @param program Compiled regular expression program (see RECompiler and/or recompile)
0501: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0502: *
0503: * <pre>
0504: *
0505: * MATCH_NORMAL // Normal (case-sensitive) matching
0506: * MATCH_CASEINDEPENDENT // Case folded comparisons
0507: * MATCH_MULTILINE // Newline matches as BOL/EOL
0508: *
0509: * </pre>
0510: *
0511: * @see RECompiler
0512: * @see REProgram
0513: * @see recompile
0514: */
0515: public RE(REProgram program, int matchFlags) {
0516: setProgram(program);
0517: setMatchFlags(matchFlags);
0518: }
0519:
0520: /**
0521: * Construct a matcher for a pre-compiled regular expression from program
0522: * (bytecode) data.
0523: * @param program Compiled regular expression program
0524: * @see RECompiler
0525: * @see recompile
0526: */
0527: public RE(REProgram program) {
0528: this (program, MATCH_NORMAL);
0529: }
0530:
0531: /**
0532: * Constructs a regular expression matcher with no initial program.
0533: * This is likely to be an uncommon practice, but is still supported.
0534: */
0535: public RE() {
0536: this ((REProgram) null, MATCH_NORMAL);
0537: }
0538:
0539: /**
0540: * Converts a 'simplified' regular expression to a full regular expression
0541: * @param pattern The pattern to convert
0542: * @return The full regular expression
0543: */
0544: public static String simplePatternToFullRegularExpression(
0545: String pattern) {
0546: StringBuffer buf = new StringBuffer();
0547: for (int i = 0; i < pattern.length(); i++) {
0548: char c = pattern.charAt(i);
0549: switch (c) {
0550: case '*':
0551: buf.append(".*");
0552: break;
0553:
0554: case '.':
0555: case '[':
0556: case ']':
0557: case '\\':
0558: case '+':
0559: case '?':
0560: case '{':
0561: case '}':
0562: case '$':
0563: case '^':
0564: case '|':
0565: case '(':
0566: case ')':
0567: buf.append('\\');
0568: default:
0569: buf.append(c);
0570: break;
0571: }
0572: }
0573: return buf.toString();
0574: }
0575:
0576: /**
0577: * Sets match behaviour flags which alter the way RE does matching.
0578: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0579: *
0580: * <pre>
0581: *
0582: * MATCH_NORMAL // Normal (case-sensitive) matching
0583: * MATCH_CASEINDEPENDENT // Case folded comparisons
0584: * MATCH_MULTILINE // Newline matches as BOL/EOL
0585: *
0586: * </pre>
0587: *
0588: */
0589: public void setMatchFlags(int matchFlags) {
0590: this .matchFlags = matchFlags;
0591: }
0592:
0593: /**
0594: * Returns the current match behaviour flags.
0595: * @return Current match behaviour flags (RE.MATCH_*).
0596: *
0597: * <pre>
0598: *
0599: * MATCH_NORMAL // Normal (case-sensitive) matching
0600: * MATCH_CASEINDEPENDENT // Case folded comparisons
0601: * MATCH_MULTILINE // Newline matches as BOL/EOL
0602: *
0603: * </pre>
0604: *
0605: * @see #setMatchFlags
0606: *
0607: */
0608: public int getMatchFlags() {
0609: return matchFlags;
0610: }
0611:
0612: /**
0613: * Sets the current regular expression program used by this matcher object.
0614: * @param program Regular expression program compiled by RECompiler.
0615: * @see RECompiler
0616: * @see REProgram
0617: * @see recompile
0618: */
0619: public void setProgram(REProgram program) {
0620: this .program = program;
0621: }
0622:
0623: /**
0624: * Returns the current regular expression program in use by this matcher object.
0625: * @return Regular expression program
0626: * @see #setProgram
0627: */
0628: public REProgram getProgram() {
0629: return program;
0630: }
0631:
0632: /**
0633: * Returns the number of parenthesized subexpressions available after a successful match.
0634: * @return Number of available parenthesized subexpressions
0635: */
0636: public int getParenCount() {
0637: return parenCount;
0638: }
0639:
0640: /**
0641: * Gets the contents of a parenthesized subexpression after a successful match.
0642: * @param which Nesting level of subexpression
0643: * @return String
0644: */
0645: public String getParen(int which) {
0646: int start;
0647: if (which < parenCount && (start = getParenStart(which)) >= 0) {
0648: return search.substring(start, getParenEnd(which));
0649: }
0650: return null;
0651: }
0652:
0653: /**
0654: * Returns the start index of a given paren level.
0655: * @param which Nesting level of subexpression
0656: * @return String index
0657: */
0658: public final int getParenStart(int which) {
0659: if (which < parenCount) {
0660: switch (which) {
0661: case 0:
0662: return start0;
0663:
0664: case 1:
0665: return start1;
0666:
0667: case 2:
0668: return start2;
0669:
0670: default:
0671: if (startn == null) {
0672: allocParens();
0673: }
0674: return startn[which];
0675: }
0676: }
0677: return -1;
0678: }
0679:
0680: /**
0681: * Returns the end index of a given paren level.
0682: * @param which Nesting level of subexpression
0683: * @return String index
0684: */
0685: public final int getParenEnd(int which) {
0686: if (which < parenCount) {
0687: switch (which) {
0688: case 0:
0689: return end0;
0690:
0691: case 1:
0692: return end1;
0693:
0694: case 2:
0695: return end2;
0696:
0697: default:
0698: if (endn == null) {
0699: allocParens();
0700: }
0701: return endn[which];
0702: }
0703: }
0704: return -1;
0705: }
0706:
0707: /**
0708: * Returns the length of a given paren level.
0709: * @param which Nesting level of subexpression
0710: * @return Number of characters in the parenthesized subexpression
0711: */
0712: public final int getParenLength(int which) {
0713: if (which < parenCount) {
0714: return getParenEnd(which) - getParenStart(which);
0715: }
0716: return -1;
0717: }
0718:
0719: /**
0720: * Sets the start of a paren level
0721: * @param which Which paren level
0722: * @param i Index in input array
0723: */
0724: protected final void setParenStart(int which, int i) {
0725: if (which < parenCount) {
0726: switch (which) {
0727: case 0:
0728: start0 = i;
0729: break;
0730:
0731: case 1:
0732: start1 = i;
0733: break;
0734:
0735: case 2:
0736: start2 = i;
0737: break;
0738:
0739: default:
0740: if (startn == null) {
0741: allocParens();
0742: }
0743: startn[which] = i;
0744: break;
0745: }
0746: }
0747: }
0748:
0749: /**
0750: * Sets the end of a paren level
0751: * @param which Which paren level
0752: * @param i Index in input array
0753: */
0754: protected final void setParenEnd(int which, int i) {
0755: if (which < parenCount) {
0756: switch (which) {
0757: case 0:
0758: end0 = i;
0759: break;
0760:
0761: case 1:
0762: end1 = i;
0763: break;
0764:
0765: case 2:
0766: end2 = i;
0767: break;
0768:
0769: default:
0770: if (endn == null) {
0771: allocParens();
0772: }
0773: endn[which] = i;
0774: break;
0775: }
0776: }
0777: }
0778:
0779: /**
0780: * Throws an Error representing an internal error condition probably resulting
0781: * from a bug in the regular expression compiler (or possibly data corruption).
0782: * In practice, this should be very rare.
0783: * @param s Error description
0784: */
0785: protected void internalError(String s) throws Error {
0786: throw new Error("RE internal error: " + s);
0787: }
0788:
0789: /**
0790: * Performs lazy allocation of subexpression arrays
0791: */
0792: private final void allocParens() {
0793: // Allocate arrays for subexpressions
0794: startn = new int[maxParen];
0795: endn = new int[maxParen];
0796:
0797: // Set sub-expression pointers to invalid values
0798: for (int i = 0; i < maxParen; i++) {
0799: startn[i] = -1;
0800: endn[i] = -1;
0801: }
0802: }
0803:
0804: /**
0805: * Try to match a string against a subset of nodes in the program
0806: * @param firstNode Node to start at in program
0807: * @param lastNode Last valid node (used for matching a subexpression without
0808: * matching the rest of the program as well).
0809: * @param idxStart Starting position in character array
0810: * @return Final input array index if match succeeded. -1 if not.
0811: */
0812: protected int matchNodes(int firstNode, int lastNode, int idxStart) {
0813: // Our current place in the string
0814: int idx = idxStart;
0815:
0816: // Loop while node is valid
0817: int next, opcode, opdata;
0818: int idxNew;
0819: char[] instruction = program.instruction;
0820: for (int node = firstNode; node < lastNode;) {
0821: opcode = instruction[node + offsetOpcode];
0822: next = node + (short) instruction[node + offsetNext];
0823: opdata = instruction[node + offsetOpdata];
0824:
0825: switch (opcode) {
0826: case OP_RELUCTANTMAYBE: {
0827: int once = 0;
0828: do {
0829: // Try to match the rest without using the reluctant subexpr
0830: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0831: return idxNew;
0832: }
0833: } while ((once++ == 0)
0834: && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
0835: return -1;
0836: }
0837:
0838: case OP_RELUCTANTPLUS:
0839: while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) {
0840: // Try to match the rest without using the reluctant subexpr
0841: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0842: return idxNew;
0843: }
0844: }
0845: return -1;
0846:
0847: case OP_RELUCTANTSTAR:
0848: do {
0849: // Try to match the rest without using the reluctant subexpr
0850: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0851: return idxNew;
0852: }
0853: } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
0854: return -1;
0855:
0856: case OP_OPEN:
0857:
0858: // Match subexpression
0859: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0860: startBackref[opdata] = idx;
0861: }
0862: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0863: // Increase valid paren count
0864: if ((opdata + 1) > parenCount) {
0865: parenCount = opdata + 1;
0866: }
0867:
0868: // Don't set paren if already set later on
0869: if (getParenStart(opdata) == -1) {
0870: setParenStart(opdata, idx);
0871: }
0872: }
0873: return idxNew;
0874:
0875: case OP_CLOSE:
0876:
0877: // Done matching subexpression
0878: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0879: endBackref[opdata] = idx;
0880: }
0881: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0882: // Increase valid paren count
0883: if ((opdata + 1) > parenCount) {
0884: parenCount = opdata + 1;
0885: }
0886:
0887: // Don't set paren if already set later on
0888: if (getParenEnd(opdata) == -1) {
0889: setParenEnd(opdata, idx);
0890: }
0891: }
0892: return idxNew;
0893:
0894: case OP_BACKREF: {
0895: // Get the start and end of the backref
0896: int s = startBackref[opdata];
0897: int e = endBackref[opdata];
0898:
0899: // We don't know the backref yet
0900: if (s == -1 || e == -1) {
0901: return -1;
0902: }
0903:
0904: // The backref is empty size
0905: if (s == e) {
0906: break;
0907: }
0908:
0909: // Get the length of the backref
0910: int l = e - s;
0911:
0912: // If there's not enough input left, give up.
0913: if (search.isEnd(idx + l - 1)) {
0914: return -1;
0915: }
0916:
0917: // Case fold the backref?
0918: if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
0919: // Compare backref to input, case-folding as we go
0920: for (int i = 0; i < l; i++) {
0921: if (Character.toLowerCase(search.charAt(idx++)) != Character
0922: .toLowerCase(search.charAt(s + i))) {
0923: return -1;
0924: }
0925: }
0926: } else {
0927: // Compare backref to input
0928: for (int i = 0; i < l; i++) {
0929: if (search.charAt(idx++) != search
0930: .charAt(s + i)) {
0931: return -1;
0932: }
0933: }
0934: }
0935: }
0936: break;
0937:
0938: case OP_BOL:
0939:
0940: // Fail if we're not at the start of the string
0941: if (idx != 0) {
0942: // If we're multiline matching, we could still be at the start of a line
0943: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0944: // If not at start of line, give up
0945: if (idx <= 0 || !isNewline(idx - 1)) {
0946: return -1;
0947: } else {
0948: break;
0949: }
0950: }
0951: return -1;
0952: }
0953: break;
0954:
0955: case OP_EOL:
0956:
0957: // If we're not at the end of string
0958: if (!search.isEnd(0) && !search.isEnd(idx)) {
0959: // If we're multi-line matching
0960: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0961: // Give up if we're not at the end of a line
0962: if (!isNewline(idx)) {
0963: return -1;
0964: } else {
0965: break;
0966: }
0967: }
0968: return -1;
0969: }
0970: break;
0971:
0972: case OP_ESCAPE:
0973:
0974: // Which escape?
0975: switch (opdata) {
0976: // Word boundary match
0977: case E_NBOUND:
0978: case E_BOUND: {
0979: char cLast = ((idx == getParenStart(0)) ? '\n'
0980: : search.charAt(idx - 1));
0981: char cNext = ((search.isEnd(idx)) ? '\n' : search
0982: .charAt(idx));
0983: if ((Character.isLetterOrDigit(cLast) == Character
0984: .isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
0985: return -1;
0986: }
0987: }
0988: break;
0989:
0990: // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
0991: case E_ALNUM:
0992: case E_NALNUM:
0993: case E_DIGIT:
0994: case E_NDIGIT:
0995: case E_SPACE:
0996: case E_NSPACE:
0997:
0998: // Give up if out of input
0999: if (search.isEnd(idx)) {
1000: return -1;
1001: }
1002:
1003: // Switch on escape
1004: switch (opdata) {
1005: case E_ALNUM:
1006: case E_NALNUM:
1007: if (!(Character.isLetterOrDigit(search
1008: .charAt(idx)) == (opdata == E_ALNUM))) {
1009: return -1;
1010: }
1011: break;
1012:
1013: case E_DIGIT:
1014: case E_NDIGIT:
1015: if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) {
1016: return -1;
1017: }
1018: break;
1019:
1020: case E_SPACE:
1021: case E_NSPACE:
1022: if (!(Character
1023: .isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) {
1024: return -1;
1025: }
1026: break;
1027: }
1028: idx++;
1029: break;
1030:
1031: default:
1032: internalError("Unrecognized escape '" + opdata
1033: + "'");
1034: }
1035: break;
1036:
1037: case OP_ANY:
1038:
1039: if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1040: // Match anything
1041: if (search.isEnd(idx)) {
1042: return -1;
1043: }
1044: idx++;
1045: break;
1046: } else {
1047: // Match anything but a newline
1048: if (search.isEnd(idx)
1049: || search.charAt(idx++) == '\n') {
1050: return -1;
1051: }
1052: break;
1053: }
1054:
1055: case OP_ATOM: {
1056: // Match an atom value
1057: if (search.isEnd(idx)) {
1058: return -1;
1059: }
1060:
1061: // Get length of atom and starting index
1062: int lenAtom = opdata;
1063: int startAtom = node + nodeSize;
1064:
1065: // Give up if not enough input remains to have a match
1066: if (search.isEnd(lenAtom + idx - 1)) {
1067: return -1;
1068: }
1069:
1070: // Match atom differently depending on casefolding flag
1071: if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1072: for (int i = 0; i < lenAtom; i++) {
1073: if (Character.toLowerCase(search.charAt(idx++)) != Character
1074: .toLowerCase(instruction[startAtom + i])) {
1075: return -1;
1076: }
1077: }
1078: } else {
1079: for (int i = 0; i < lenAtom; i++) {
1080: if (search.charAt(idx++) != instruction[startAtom
1081: + i]) {
1082: return -1;
1083: }
1084: }
1085: }
1086: }
1087: break;
1088:
1089: case OP_POSIXCLASS: {
1090: // Out of input?
1091: if (search.isEnd(idx)) {
1092: return -1;
1093: }
1094:
1095: switch (opdata) {
1096: case POSIX_CLASS_ALNUM:
1097: if (!Character.isLetterOrDigit(search.charAt(idx))) {
1098: return -1;
1099: }
1100: break;
1101:
1102: case POSIX_CLASS_ALPHA:
1103: if (!Character.isLetter(search.charAt(idx))) {
1104: return -1;
1105: }
1106: break;
1107:
1108: case POSIX_CLASS_DIGIT:
1109: if (!Character.isDigit(search.charAt(idx))) {
1110: return -1;
1111: }
1112: break;
1113:
1114: case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1115: if (!Character.isSpaceChar(search.charAt(idx))) {
1116: return -1;
1117: }
1118: break;
1119:
1120: case POSIX_CLASS_SPACE:
1121: if (!Character.isWhitespace(search.charAt(idx))) {
1122: return -1;
1123: }
1124: break;
1125:
1126: case POSIX_CLASS_CNTRL:
1127: if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1128: return -1;
1129: }
1130: break;
1131:
1132: case POSIX_CLASS_GRAPH: // JWL - bugbug???
1133: switch (Character.getType(search.charAt(idx))) {
1134: case Character.MATH_SYMBOL:
1135: case Character.CURRENCY_SYMBOL:
1136: case Character.MODIFIER_SYMBOL:
1137: case Character.OTHER_SYMBOL:
1138: break;
1139:
1140: default:
1141: return -1;
1142: }
1143: break;
1144:
1145: case POSIX_CLASS_LOWER:
1146: if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1147: return -1;
1148: }
1149: break;
1150:
1151: case POSIX_CLASS_UPPER:
1152: if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1153: return -1;
1154: }
1155: break;
1156:
1157: case POSIX_CLASS_PRINT:
1158: if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1159: return -1;
1160: }
1161: break;
1162:
1163: case POSIX_CLASS_PUNCT: {
1164: int type = Character.getType(search.charAt(idx));
1165: switch (type) {
1166: case Character.DASH_PUNCTUATION:
1167: case Character.START_PUNCTUATION:
1168: case Character.END_PUNCTUATION:
1169: case Character.CONNECTOR_PUNCTUATION:
1170: case Character.OTHER_PUNCTUATION:
1171: break;
1172:
1173: default:
1174: return -1;
1175: }
1176: }
1177: break;
1178:
1179: case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1180: {
1181: boolean isXDigit = ((search.charAt(idx) >= '0' && search
1182: .charAt(idx) <= '9')
1183: || (search.charAt(idx) >= 'a' && search
1184: .charAt(idx) <= 'f') || (search
1185: .charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1186: if (!isXDigit) {
1187: return -1;
1188: }
1189: }
1190: break;
1191:
1192: case POSIX_CLASS_JSTART:
1193: if (!Character.isJavaIdentifierStart(search
1194: .charAt(idx))) {
1195: return -1;
1196: }
1197: break;
1198:
1199: case POSIX_CLASS_JPART:
1200: if (!Character.isJavaIdentifierPart(search
1201: .charAt(idx))) {
1202: return -1;
1203: }
1204: break;
1205:
1206: default:
1207: internalError("Bad posix class");
1208: break;
1209: }
1210:
1211: // Matched.
1212: idx++;
1213: }
1214: break;
1215:
1216: case OP_ANYOF: {
1217: // Out of input?
1218: if (search.isEnd(idx)) {
1219: return -1;
1220: }
1221:
1222: // Get character to match against character class and maybe casefold
1223: char c = search.charAt(idx);
1224: boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1225: if (caseFold) {
1226: c = Character.toLowerCase(c);
1227: }
1228:
1229: // Loop through character class checking our match character
1230: int idxRange = node + nodeSize;
1231: int idxEnd = idxRange + (opdata * 2);
1232: boolean match = false;
1233: for (int i = idxRange; i < idxEnd;) {
1234: // Get start, end and match characters
1235: char s = instruction[i++];
1236: char e = instruction[i++];
1237:
1238: // Fold ends of range and match character
1239: if (caseFold) {
1240: s = Character.toLowerCase(s);
1241: e = Character.toLowerCase(e);
1242: }
1243:
1244: // If the match character is in range, break out
1245: if (c >= s && c <= e) {
1246: match = true;
1247: break;
1248: }
1249: }
1250:
1251: // Fail if we didn't match the character class
1252: if (!match) {
1253: return -1;
1254: }
1255: idx++;
1256: }
1257: break;
1258:
1259: case OP_BRANCH: {
1260: // Check for choices
1261: if (instruction[next + offsetOpcode] != OP_BRANCH) {
1262: // If there aren't any other choices, just evaluate this branch.
1263: node += nodeSize;
1264: continue;
1265: }
1266:
1267: // Try all available branches
1268: short nextBranch;
1269: do {
1270: // Try matching the branch against the string
1271: if ((idxNew = matchNodes(node + nodeSize, maxNode,
1272: idx)) != -1) {
1273: return idxNew;
1274: }
1275:
1276: // Go to next branch (if any)
1277: nextBranch = (short) instruction[node + offsetNext];
1278: node += nextBranch;
1279: } while (nextBranch != 0
1280: && (instruction[node + offsetOpcode] == OP_BRANCH));
1281:
1282: // Failed to match any branch!
1283: return -1;
1284: }
1285:
1286: case OP_NOTHING:
1287: case OP_GOTO:
1288:
1289: // Just advance to the next node without doing anything
1290: break;
1291:
1292: case OP_END:
1293:
1294: // Match has succeeded!
1295: setParenEnd(0, idx);
1296: return idx;
1297:
1298: default:
1299:
1300: // Corrupt program
1301: internalError("Invalid opcode '" + opcode + "'");
1302: }
1303:
1304: // Advance to the next node in the program
1305: node = next;
1306: }
1307:
1308: // We "should" never end up here
1309: internalError("Corrupt program");
1310: return -1;
1311: }
1312:
1313: /**
1314: * Match the current regular expression program against the current
1315: * input string, starting at index i of the input string. This method
1316: * is only meant for internal use.
1317: * @param i The input string index to start matching at
1318: * @return True if the input matched the expression
1319: */
1320: protected boolean matchAt(int i) {
1321: // Initialize start pointer, paren cache and paren count
1322: start0 = -1;
1323: end0 = -1;
1324: start1 = -1;
1325: end1 = -1;
1326: start2 = -1;
1327: end2 = -1;
1328: startn = null;
1329: endn = null;
1330: parenCount = 1;
1331: setParenStart(0, i);
1332:
1333: // Allocate backref arrays (unless optimizations indicate otherwise)
1334: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1335: startBackref = new int[maxParen];
1336: endBackref = new int[maxParen];
1337: }
1338:
1339: // Match against string
1340: int idx;
1341: if ((idx = matchNodes(0, maxNode, i)) != -1) {
1342: setParenEnd(0, idx);
1343: return true;
1344: }
1345:
1346: // Didn't match
1347: parenCount = 0;
1348: return false;
1349: }
1350:
1351: /**
1352: * Matches the current regular expression program against a character array,
1353: * starting at a given index.
1354: * @param search String to match against
1355: * @param i Index to start searching at
1356: * @return True if string matched
1357: */
1358: public boolean match(String search, int i) {
1359: return match(new StringCharacterIterator(search), i);
1360: }
1361:
1362: /**
1363: * Matches the current regular expression program against a character array,
1364: * starting at a given index.
1365: * @param search String to match against
1366: * @param i Index to start searching at
1367: * @return True if string matched
1368: */
1369: public boolean match(CharacterIterator search, int i) {
1370: // There is no compiled program to search with!
1371: if (program == null) {
1372: // This should be uncommon enough to be an error case rather
1373: // than an exception (which would have to be handled everywhere)
1374: internalError("No RE program to run!");
1375: }
1376:
1377: // Save string to search
1378: this .search = search;
1379:
1380: // Can we optimize the search by looking for a prefix string?
1381: if (program.prefix == null) {
1382: // Unprefixed matching must try for a match at each character
1383: for (; !search.isEnd(i - 1); i++) {
1384: // Try a match at index i
1385: if (matchAt(i)) {
1386: return true;
1387: }
1388: }
1389: return false;
1390: } else {
1391: // Prefix-anchored matching is possible
1392: boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1393: char[] prefix = program.prefix;
1394: for (; !search.isEnd(i + prefix.length - 1); i++) {
1395: // If the first character of the prefix matches
1396: boolean match = false;
1397: if (caseIndependent)
1398: match = Character.toLowerCase(search.charAt(i)) == Character
1399: .toLowerCase(prefix[0]);
1400: else
1401: match = search.charAt(i) == prefix[0];
1402: if (match) {
1403: // Save first character position
1404: int firstChar = i++;
1405: int k;
1406: for (k = 1; k < prefix.length;) {
1407: // If there's a mismatch of any character in the prefix, give up
1408: if (caseIndependent)
1409: match = Character.toLowerCase(search
1410: .charAt(i++)) == Character
1411: .toLowerCase(prefix[k++]);
1412: else
1413: match = search.charAt(i++) == prefix[k++];
1414: if (!match) {
1415: break;
1416: }
1417: }
1418:
1419: // See if the whole prefix string matched
1420: if (k == prefix.length) {
1421: // We matched the full prefix at firstChar, so try it
1422: if (matchAt(firstChar)) {
1423: return true;
1424: }
1425: }
1426:
1427: // Match failed, reset i to continue the search
1428: i = firstChar;
1429: }
1430: }
1431: return false;
1432: }
1433: }
1434:
1435: /**
1436: * Matches the current regular expression program against a String.
1437: * @param search String to match against
1438: * @return True if string matched
1439: */
1440: public boolean match(String search) {
1441: return match(search, 0);
1442: }
1443:
1444: /**
1445: * Splits a string into an array of strings on regular expression boundaries.
1446: * This function works the same way as the Perl function of the same name.
1447: * Given a regular expression of "[ab]+" and a string to split of
1448: * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1449: * "[xyzzy, yyz, 123]".
1450: * @param s String to split on this regular exression
1451: * @return Array of strings
1452: */
1453: public String[] split(String s) {
1454: // Create new vector
1455: Vector v = new Vector();
1456:
1457: // Start at position 0 and search the whole string
1458: int pos = 0;
1459: int len = s.length();
1460:
1461: // Try a match at each position
1462: while (pos < len && match(s, pos)) {
1463: // Get start of match
1464: int start = getParenStart(0);
1465:
1466: // Get end of match
1467: int newpos = getParenEnd(0);
1468:
1469: // Check if no progress was made
1470: if (newpos == pos) {
1471: v.addElement(s.substring(pos, start + 1));
1472: newpos++;
1473: } else {
1474: v.addElement(s.substring(pos, start));
1475: }
1476:
1477: // Move to new position
1478: pos = newpos;
1479: }
1480:
1481: // Push remainder if it's not empty
1482: String remainder = s.substring(pos);
1483: if (remainder.length() != 0) {
1484: v.addElement(remainder);
1485: }
1486:
1487: // Return vector as an array of strings
1488: String[] ret = new String[v.size()];
1489: v.copyInto(ret);
1490: return ret;
1491: }
1492:
1493: /**
1494: * Flag bit that indicates that subst should replace all occurrences of this
1495: * regular expression.
1496: */
1497: public static final int REPLACE_ALL = 0x0000;
1498:
1499: /**
1500: * Flag bit that indicates that subst should only replace the first occurrence
1501: * of this regular expression.
1502: */
1503: public static final int REPLACE_FIRSTONLY = 0x0001;
1504:
1505: /**
1506: * Substitutes a string for this regular expression in another string.
1507: * This method works like the Perl function of the same name.
1508: * Given a regular expression of "a*b", a String to substituteIn of
1509: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1510: * resulting String returned by subst would be "-foo-garply-wacky-".
1511: * @param substituteIn String to substitute within
1512: * @param substitution String to substitute for all matches of this regular expression.
1513: * @return The string substituteIn with zero or more occurrences of the current
1514: * regular expression replaced with the substitution String (if this regular
1515: * expression object doesn't match at any position, the original String is returned
1516: * unchanged).
1517: */
1518: public String subst(String substituteIn, String substitution) {
1519: return subst(substituteIn, substitution, REPLACE_ALL);
1520: }
1521:
1522: /**
1523: * Substitutes a string for this regular expression in another string.
1524: * This method works like the Perl function of the same name.
1525: * Given a regular expression of "a*b", a String to substituteIn of
1526: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1527: * resulting String returned by subst would be "-foo-garply-wacky-".
1528: * @param substituteIn String to substitute within
1529: * @param substitution String to substitute for matches of this regular expression
1530: * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1531: * flag bit is set, only the first occurrence of this regular expression is replaced.
1532: * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1533: * replaced.
1534: * @return The string substituteIn with zero or more occurrences of the current
1535: * regular expression replaced with the substitution String (if this regular
1536: * expression object doesn't match at any position, the original String is returned
1537: * unchanged).
1538: */
1539: public String subst(String substituteIn, String substitution,
1540: int flags) {
1541: // String to return
1542: StringBuffer ret = new StringBuffer();
1543:
1544: // Start at position 0 and search the whole string
1545: int pos = 0;
1546: int len = substituteIn.length();
1547:
1548: // Try a match at each position
1549: while (pos < len && match(substituteIn, pos)) {
1550: // Append string before match
1551: ret.append(substituteIn.substring(pos, getParenStart(0)));
1552:
1553: // Append substitution
1554: ret.append(substitution);
1555:
1556: // Move forward, skipping past match
1557: int newpos = getParenEnd(0);
1558:
1559: // We always want to make progress!
1560: if (newpos == pos) {
1561: newpos++;
1562: }
1563:
1564: // Try new position
1565: pos = newpos;
1566:
1567: // Break out if we're only supposed to replace one occurrence
1568: if ((flags & REPLACE_FIRSTONLY) != 0) {
1569: break;
1570: }
1571: }
1572:
1573: // If there's remaining input, append it
1574: if (pos < len) {
1575: ret.append(substituteIn.substring(pos));
1576: }
1577:
1578: // Return string buffer as string
1579: return ret.toString();
1580: }
1581:
1582: /**
1583: * Returns an array of Strings, whose toString representation matches a regular
1584: * expression. This method works like the Perl function of the same name. Given
1585: * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1586: * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1587: * @param search Array of Objects to search
1588: * @return Array of Objects whose toString value matches this regular expression.
1589: */
1590: public String[] grep(Object[] search) {
1591: // Create new vector to hold return items
1592: Vector v = new Vector();
1593:
1594: // Traverse array of objects
1595: for (int i = 0; i < search.length; i++) {
1596: // Get next object as a string
1597: String s = search[i].toString();
1598:
1599: // If it matches this regexp, add it to the list
1600: if (match(s)) {
1601: v.addElement(s);
1602: }
1603: }
1604:
1605: // Return vector as an array of strings
1606: String[] ret = new String[v.size()];
1607: v.copyInto(ret);
1608: return ret;
1609: }
1610:
1611: /** @return true if at the i-th position in the 'search' a newline ends */
1612: private boolean isNewline(int i) {
1613:
1614: if (i < NEWLINE.length() - 1) {
1615: return false;
1616: }
1617:
1618: if (search.charAt(i) == '\n') {
1619: return true;
1620: }
1621:
1622: for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
1623: if (NEWLINE.charAt(j) != search.charAt(i)) {
1624: return false;
1625: }
1626: }
1627: return true;
1628: }
1629: }
|