0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.regexp;
0019:
0020: import java.io.Serializable;
0021: import java.util.Vector;
0022:
0023: /**
0024: * RE is an efficient, lightweight regular expression evaluator/matcher
0025: * class. Regular expressions are pattern descriptions which enable
0026: * sophisticated matching of strings. In addition to being able to
0027: * match a string against a pattern, you can also extract parts of the
0028: * match. This is especially useful in text parsing! Details on the
0029: * syntax of regular expression patterns are given below.
0030: *
0031: * <p>
0032: * To compile a regular expression (RE), you can simply construct an RE
0033: * matcher object from the string specification of the pattern, like this:
0034: *
0035: * <pre>
0036: * RE r = new RE("a*b");
0037: * </pre>
0038: *
0039: * <p>
0040: * Once you have done this, you can call either of the RE.match methods to
0041: * perform matching on a String. For example:
0042: *
0043: * <pre>
0044: * boolean matched = r.match("aaaab");
0045: * </pre>
0046: *
0047: * will cause the boolean matched to be set to true because the
0048: * pattern "a*b" matches the string "aaaab".
0049: *
0050: * <p>
0051: * If you were interested in the <i>number</i> of a's which matched the
0052: * first part of our example expression, you could change the expression to
0053: * "(a*)b". Then when you compiled the expression and matched it against
0054: * something like "xaaaab", you would get results like this:
0055: *
0056: * <pre>
0057: * RE r = new RE("(a*)b"); // Compile expression
0058: * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
0059: *
0060: * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
0061: * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
0062: *
0063: * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
0064: * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
0065: * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
0066: *
0067: * int startInside = r.getParenStart(1); // startInside will be index 1
0068: * int endInside = r.getParenEnd(1); // endInside will be index 5
0069: * int lenInside = r.getParenLength(1); // lenInside will be 4
0070: * </pre>
0071: *
0072: * You can also refer to the contents of a parenthesized expression
0073: * within a regular expression itself. This is called a
0074: * 'backreference'. The first backreference in a regular expression is
0075: * denoted by \1, the second by \2 and so on. So the expression:
0076: *
0077: * <pre>
0078: * ([0-9]+)=\1
0079: * </pre>
0080: *
0081: * will match any string of the form n=n (like 0=0 or 2=2).
0082: *
0083: * <p>
0084: * The full regular expression syntax accepted by RE is described here:
0085: *
0086: * <pre>
0087: *
0088: * <b><font face=times roman>Characters</font></b>
0089: *
0090: * <i>unicodeChar</i> Matches any identical unicode character
0091: * \ Used to quote a meta-character (like '*')
0092: * \\ Matches a single '\' character
0093: * \0nnn Matches a given octal character
0094: * \xhh Matches a given 8-bit hexadecimal character
0095: * \\uhhhh Matches a given 16-bit hexadecimal character
0096: * \t Matches an ASCII tab character
0097: * \n Matches an ASCII newline character
0098: * \r Matches an ASCII return character
0099: * \f Matches an ASCII form feed character
0100: *
0101: *
0102: * <b><font face=times roman>Character Classes</font></b>
0103: *
0104: * [abc] Simple character class
0105: * [a-zA-Z] Character class with ranges
0106: * [^abc] Negated character class
0107: * </pre>
0108: *
0109: * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts
0110: * from zero" or "ends with last character".
0111: * <br>
0112: * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
0113: * [-] means "all characters".
0114: *
0115: * <pre>
0116: *
0117: * <b><font face=times roman>Standard POSIX Character Classes</font></b>
0118: *
0119: * [:alnum:] Alphanumeric characters.
0120: * [:alpha:] Alphabetic characters.
0121: * [:blank:] Space and tab characters.
0122: * [:cntrl:] Control characters.
0123: * [:digit:] Numeric characters.
0124: * [:graph:] Characters that are printable and are also visible.
0125: * (A space is printable, but not visible, while an
0126: * `a' is both.)
0127: * [:lower:] Lower-case alphabetic characters.
0128: * [:print:] Printable characters (characters that are not
0129: * control characters.)
0130: * [:punct:] Punctuation characters (characters that are not letter,
0131: * digits, control characters, or space characters).
0132: * [:space:] Space characters (such as space, tab, and formfeed,
0133: * to name a few).
0134: * [:upper:] Upper-case alphabetic characters.
0135: * [:xdigit:] Characters that are hexadecimal digits.
0136: *
0137: *
0138: * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
0139: *
0140: * [:javastart:] Start of a Java identifier
0141: * [:javapart:] Part of a Java identifier
0142: *
0143: *
0144: * <b><font face=times roman>Predefined Classes</font></b>
0145: *
0146: * . Matches any character other than newline
0147: * \w Matches a "word" character (alphanumeric plus "_")
0148: * \W Matches a non-word character
0149: * \s Matches a whitespace character
0150: * \S Matches a non-whitespace character
0151: * \d Matches a digit character
0152: * \D Matches a non-digit character
0153: *
0154: *
0155: * <b><font face=times roman>Boundary Matchers</font></b>
0156: *
0157: * ^ Matches only at the beginning of a line
0158: * $ Matches only at the end of a line
0159: * \b Matches only at a word boundary
0160: * \B Matches only at a non-word boundary
0161: *
0162: *
0163: * <b><font face=times roman>Greedy Closures</font></b>
0164: *
0165: * A* Matches A 0 or more times (greedy)
0166: * A+ Matches A 1 or more times (greedy)
0167: * A? Matches A 1 or 0 times (greedy)
0168: * A{n} Matches A exactly n times (greedy)
0169: * A{n,} Matches A at least n times (greedy)
0170: * A{n,m} Matches A at least n but not more than m times (greedy)
0171: *
0172: *
0173: * <b><font face=times roman>Reluctant Closures</font></b>
0174: *
0175: * A*? Matches A 0 or more times (reluctant)
0176: * A+? Matches A 1 or more times (reluctant)
0177: * A?? Matches A 0 or 1 times (reluctant)
0178: *
0179: *
0180: * <b><font face=times roman>Logical Operators</font></b>
0181: *
0182: * AB Matches A followed by B
0183: * A|B Matches either A or B
0184: * (A) Used for subexpression grouping
0185: * (?:A) Used for subexpression clustering (just like grouping but
0186: * no backrefs)
0187: *
0188: *
0189: * <b><font face=times roman>Backreferences</font></b>
0190: *
0191: * \1 Backreference to 1st parenthesized subexpression
0192: * \2 Backreference to 2nd parenthesized subexpression
0193: * \3 Backreference to 3rd parenthesized subexpression
0194: * \4 Backreference to 4th parenthesized subexpression
0195: * \5 Backreference to 5th parenthesized subexpression
0196: * \6 Backreference to 6th parenthesized subexpression
0197: * \7 Backreference to 7th parenthesized subexpression
0198: * \8 Backreference to 8th parenthesized subexpression
0199: * \9 Backreference to 9th parenthesized subexpression
0200: * </pre>
0201: *
0202: * <p>
0203: * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
0204: * that they match as many elements of the string as possible without
0205: * causing the overall match to fail. If you want a closure to be
0206: * reluctant (non-greedy), you can simply follow it with a '?'. A
0207: * reluctant closure will match as few elements of the string as
0208: * possible when finding matches. {m,n} closures don't currently
0209: * support reluctancy.
0210: *
0211: * <p>
0212: * <b><font face="times roman">Line terminators</font></b>
0213: * <br>
0214: * A line terminator is a one- or two-character sequence that marks
0215: * the end of a line of the input character sequence. The following
0216: * are recognized as line terminators:
0217: * <ul>
0218: * <li>A newline (line feed) character ('\n'),</li>
0219: * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
0220: * <li>A standalone carriage-return character ('\r'),</li>
0221: * <li>A next-line character ('\u0085'),</li>
0222: * <li>A line-separator character ('\u2028'), or</li>
0223: * <li>A paragraph-separator character ('\u2029).</li>
0224: * </ul>
0225: *
0226: * <p>
0227: * RE runs programs compiled by the RECompiler class. But the RE
0228: * matcher class does not include the actual regular expression compiler
0229: * for reasons of efficiency. In fact, if you want to pre-compile one
0230: * or more regular expressions, the 'recompile' class can be invoked
0231: * from the command line to produce compiled output like this:
0232: *
0233: * <pre>
0234: * // Pre-compiled regular expression "a*b"
0235: * char[] re1Instructions =
0236: * {
0237: * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
0238: * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
0239: * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
0240: * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
0241: * 0x0000,
0242: * };
0243: *
0244: *
0245: * REProgram re1 = new REProgram(re1Instructions);
0246: * </pre>
0247: *
0248: * You can then construct a regular expression matcher (RE) object from
0249: * the pre-compiled expression re1 and thus avoid the overhead of
0250: * compiling the expression at runtime. If you require more dynamic
0251: * regular expressions, you can construct a single RECompiler object and
0252: * re-use it to compile each expression. Similarly, you can change the
0253: * program run by a given matcher object at any time. However, RE and
0254: * RECompiler are not threadsafe (for efficiency reasons, and because
0255: * requiring thread safety in this class is deemed to be a rare
0256: * requirement), so you will need to construct a separate compiler or
0257: * matcher object for each thread (unless you do thread synchronization
0258: * yourself). Once expression compiled into the REProgram object, REProgram
0259: * can be safely shared across multiple threads and RE objects.
0260: *
0261: * <br><p><br>
0262: *
0263: * <font color="red">
0264: * <i>ISSUES:</i>
0265: *
0266: * <ul>
0267: * <li>com.weusours.util.re is not currently compatible with all
0268: * standard POSIX regcomp flags</li>
0269: * <li>com.weusours.util.re does not support POSIX equivalence classes
0270: * ([=foo=] syntax) (I18N/locale issue)</li>
0271: * <li>com.weusours.util.re does not support nested POSIX character
0272: * classes (definitely should, but not completely trivial)</li>
0273: * <li>com.weusours.util.re Does not support POSIX character collation
0274: * concepts ([.foo.] syntax) (I18N/locale issue)</li>
0275: * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
0276: * <li>Should RE support character iterators (for backwards RE matching!)?</li>
0277: * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
0278: * <li>Not *all* possibilities are considered for greediness when backreferences
0279: * are involved (as POSIX suggests should be the case). The POSIX RE
0280: * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
0281: * of acdacaa where \1 is "a". This is not the case in this RE package,
0282: * and actually Perl doesn't go to this extent either! Until someone
0283: * actually complains about this, I'm not sure it's worth "fixing".
0284: * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
0285: * </ul>
0286: *
0287: * </font>
0288: *
0289: * @see recompile
0290: * @see RECompiler
0291: *
0292: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0293: * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a>
0294: * @version $Id: RE.java 518156 2007-03-14 14:31:26Z vgritsenko $
0295: */
0296: public class RE implements Serializable {
0297: /**
0298: * Specifies normal, case-sensitive matching behaviour.
0299: */
0300: public static final int MATCH_NORMAL = 0x0000;
0301:
0302: /**
0303: * Flag to indicate that matching should be case-independent (folded)
0304: */
0305: public static final int MATCH_CASEINDEPENDENT = 0x0001;
0306:
0307: /**
0308: * Newlines should match as BOL/EOL (^ and $)
0309: */
0310: public static final int MATCH_MULTILINE = 0x0002;
0311:
0312: /**
0313: * Consider all input a single body of text - newlines are matched by .
0314: */
0315: public static final int MATCH_SINGLELINE = 0x0004;
0316:
0317: /************************************************
0318: * *
0319: * The format of a node in a program is: *
0320: * *
0321: * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
0322: * *
0323: * char OPCODE - instruction *
0324: * char OPDATA - modifying data *
0325: * char OPNEXT - next node (relative offset) *
0326: * *
0327: ************************************************/
0328:
0329: // Opcode Char Opdata/Operand Meaning
0330: // ---------- ---------- --------------- --------------------------------------------------
0331: static final char OP_END = 'E'; // end of program
0332: static final char OP_BOL = '^'; // match only if at beginning of line
0333: static final char OP_EOL = '$'; // match only if at end of line
0334: static final char OP_ANY = '.'; // match any single character except newline
0335: static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
0336: static final char OP_BRANCH = '|'; // node match this alternative or the next one
0337: static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
0338: static final char OP_STAR = '*'; // node kleene closure
0339: static final char OP_PLUS = '+'; // node positive closure
0340: static final char OP_MAYBE = '?'; // node optional closure
0341: static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
0342: static final char OP_OPEN = '('; // number nth opening paren
0343: static final char OP_OPEN_CLUSTER = '<'; // opening cluster
0344: static final char OP_CLOSE = ')'; // number nth closing paren
0345: static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
0346: static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
0347: static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
0348: static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
0349: static final char OP_CONTINUE = 'C'; // continue to the following command (ignore next)
0350: static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
0351: static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
0352: static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
0353: static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
0354:
0355: // Escape codes
0356: static final char E_ALNUM = 'w'; // Alphanumeric
0357: static final char E_NALNUM = 'W'; // Non-alphanumeric
0358: static final char E_BOUND = 'b'; // Word boundary
0359: static final char E_NBOUND = 'B'; // Non-word boundary
0360: static final char E_SPACE = 's'; // Whitespace
0361: static final char E_NSPACE = 'S'; // Non-whitespace
0362: static final char E_DIGIT = 'd'; // Digit
0363: static final char E_NDIGIT = 'D'; // Non-digit
0364:
0365: // Posix character classes
0366: static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
0367: static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
0368: static final char POSIX_CLASS_BLANK = 'b'; // Blanks
0369: static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
0370: static final char POSIX_CLASS_DIGIT = 'd'; // Digits
0371: static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
0372: static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
0373: static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
0374: static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
0375: static final char POSIX_CLASS_SPACE = 's'; // Spaces
0376: static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
0377: static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
0378: static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
0379: static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
0380:
0381: // Limits
0382: static final int maxNode = 65536; // Maximum number of nodes in a program
0383: static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
0384:
0385: // Node layout constants
0386: static final int offsetOpcode = 0; // Opcode offset (first character)
0387: static final int offsetOpdata = 1; // Opdata offset (second char)
0388: static final int offsetNext = 2; // Next index offset (third char)
0389: static final int nodeSize = 3; // Node size (in chars)
0390:
0391: // State of current program
0392: REProgram program; // Compiled regular expression 'program'
0393: transient CharacterIterator search; // The string being matched against
0394: int matchFlags; // Match behaviour flags
0395: int maxParen = MAX_PAREN;
0396:
0397: // Parenthesized subexpressions
0398: transient int parenCount; // Number of subexpressions matched (num open parens + 1)
0399: transient int start0; // Cache of start[0]
0400: transient int end0; // Cache of start[0]
0401: transient int start1; // Cache of start[1]
0402: transient int end1; // Cache of start[1]
0403: transient int start2; // Cache of start[2]
0404: transient int end2; // Cache of start[2]
0405: transient int[] startn; // Lazy-alloced array of sub-expression starts
0406: transient int[] endn; // Lazy-alloced array of sub-expression ends
0407:
0408: // Backreferences
0409: transient int[] startBackref; // Lazy-alloced array of backref starts
0410: transient int[] endBackref; // Lazy-alloced array of backref ends
0411:
0412: /**
0413: * Constructs a regular expression matcher from a String by compiling it
0414: * using a new instance of RECompiler. If you will be compiling many
0415: * expressions, you may prefer to use a single RECompiler object instead.
0416: *
0417: * @param pattern The regular expression pattern to compile.
0418: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0419: * @see RECompiler
0420: * @see recompile
0421: */
0422: public RE(String pattern) throws RESyntaxException {
0423: this (pattern, MATCH_NORMAL);
0424: }
0425:
0426: /**
0427: * Constructs a regular expression matcher from a String by compiling it
0428: * using a new instance of RECompiler. If you will be compiling many
0429: * expressions, you may prefer to use a single RECompiler object instead.
0430: *
0431: * @param pattern The regular expression pattern to compile.
0432: * @param matchFlags The matching style
0433: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0434: * @see RECompiler
0435: * @see recompile
0436: */
0437: public RE(String pattern, int matchFlags) throws RESyntaxException {
0438: this (new RECompiler().compile(pattern), matchFlags);
0439: }
0440:
0441: /**
0442: * Construct a matcher for a pre-compiled regular expression from program
0443: * (bytecode) data. Permits special flags to be passed in to modify matching
0444: * behaviour.
0445: *
0446: * @param program Compiled regular expression program (see RECompiler and/or recompile)
0447: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0448: *
0449: * <pre>
0450: * MATCH_NORMAL // Normal (case-sensitive) matching
0451: * MATCH_CASEINDEPENDENT // Case folded comparisons
0452: * MATCH_MULTILINE // Newline matches as BOL/EOL
0453: * </pre>
0454: *
0455: * @see RECompiler
0456: * @see REProgram
0457: * @see recompile
0458: */
0459: public RE(REProgram program, int matchFlags) {
0460: setProgram(program);
0461: setMatchFlags(matchFlags);
0462: }
0463:
0464: /**
0465: * Construct a matcher for a pre-compiled regular expression from program
0466: * (bytecode) data.
0467: *
0468: * @param program Compiled regular expression program
0469: * @see RECompiler
0470: * @see recompile
0471: */
0472: public RE(REProgram program) {
0473: this (program, MATCH_NORMAL);
0474: }
0475:
0476: /**
0477: * Constructs a regular expression matcher with no initial program.
0478: * This is likely to be an uncommon practice, but is still supported.
0479: */
0480: public RE() {
0481: this ((REProgram) null, MATCH_NORMAL);
0482: }
0483:
0484: /**
0485: * Converts a 'simplified' regular expression to a full regular expression
0486: *
0487: * @param pattern The pattern to convert
0488: * @return The full regular expression
0489: */
0490: public static String simplePatternToFullRegularExpression(
0491: String pattern) {
0492: StringBuffer buf = new StringBuffer();
0493: for (int i = 0; i < pattern.length(); i++) {
0494: char c = pattern.charAt(i);
0495: switch (c) {
0496: case '*':
0497: buf.append(".*");
0498: break;
0499:
0500: case '.':
0501: case '[':
0502: case ']':
0503: case '\\':
0504: case '+':
0505: case '?':
0506: case '{':
0507: case '}':
0508: case '$':
0509: case '^':
0510: case '|':
0511: case '(':
0512: case ')':
0513: buf.append('\\');
0514: default:
0515: buf.append(c);
0516: break;
0517: }
0518: }
0519: return buf.toString();
0520: }
0521:
0522: /**
0523: * Sets match behaviour flags which alter the way RE does matching.
0524: * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
0525: *
0526: * <pre>
0527: * MATCH_NORMAL // Normal (case-sensitive) matching
0528: * MATCH_CASEINDEPENDENT // Case folded comparisons
0529: * MATCH_MULTILINE // Newline matches as BOL/EOL
0530: * </pre>
0531: */
0532: public void setMatchFlags(int matchFlags) {
0533: this .matchFlags = matchFlags;
0534: }
0535:
0536: /**
0537: * Returns the current match behaviour flags.
0538: * @return Current match behaviour flags (RE.MATCH_*).
0539: *
0540: * <pre>
0541: * MATCH_NORMAL // Normal (case-sensitive) matching
0542: * MATCH_CASEINDEPENDENT // Case folded comparisons
0543: * MATCH_MULTILINE // Newline matches as BOL/EOL
0544: * </pre>
0545: *
0546: * @see #setMatchFlags
0547: */
0548: public int getMatchFlags() {
0549: return matchFlags;
0550: }
0551:
0552: /**
0553: * Sets the current regular expression program used by this matcher object.
0554: *
0555: * @param program Regular expression program compiled by RECompiler.
0556: * @see RECompiler
0557: * @see REProgram
0558: * @see recompile
0559: */
0560: public void setProgram(REProgram program) {
0561: this .program = program;
0562: if (program != null && program.maxParens != -1) {
0563: this .maxParen = program.maxParens;
0564: } else {
0565: this .maxParen = MAX_PAREN;
0566: }
0567: }
0568:
0569: /**
0570: * Returns the current regular expression program in use by this matcher object.
0571: *
0572: * @return Regular expression program
0573: * @see #setProgram
0574: */
0575: public REProgram getProgram() {
0576: return program;
0577: }
0578:
0579: /**
0580: * Returns the number of parenthesized subexpressions available after a successful match.
0581: *
0582: * @return Number of available parenthesized subexpressions
0583: */
0584: public int getParenCount() {
0585: return parenCount;
0586: }
0587:
0588: /**
0589: * Gets the contents of a parenthesized subexpression after a successful match.
0590: *
0591: * @param which Nesting level of subexpression
0592: * @return String
0593: */
0594: public String getParen(int which) {
0595: int start;
0596: if (which < parenCount && (start = getParenStart(which)) >= 0) {
0597: return search.substring(start, getParenEnd(which));
0598: }
0599: return null;
0600: }
0601:
0602: /**
0603: * Returns the start index of a given paren level.
0604: *
0605: * @param which Nesting level of subexpression
0606: * @return String index
0607: */
0608: public final int getParenStart(int which) {
0609: if (which < parenCount) {
0610: switch (which) {
0611: case 0:
0612: return start0;
0613:
0614: case 1:
0615: return start1;
0616:
0617: case 2:
0618: return start2;
0619:
0620: default:
0621: if (startn == null) {
0622: allocParens();
0623: }
0624: return startn[which];
0625: }
0626: }
0627: return -1;
0628: }
0629:
0630: /**
0631: * Returns the end index of a given paren level.
0632: *
0633: * @param which Nesting level of subexpression
0634: * @return String index
0635: */
0636: public final int getParenEnd(int which) {
0637: if (which < parenCount) {
0638: switch (which) {
0639: case 0:
0640: return end0;
0641:
0642: case 1:
0643: return end1;
0644:
0645: case 2:
0646: return end2;
0647:
0648: default:
0649: if (endn == null) {
0650: allocParens();
0651: }
0652: return endn[which];
0653: }
0654: }
0655: return -1;
0656: }
0657:
0658: /**
0659: * Returns the length of a given paren level.
0660: *
0661: * @param which Nesting level of subexpression
0662: * @return Number of characters in the parenthesized subexpression
0663: */
0664: public final int getParenLength(int which) {
0665: if (which < parenCount) {
0666: return getParenEnd(which) - getParenStart(which);
0667: }
0668: return -1;
0669: }
0670:
0671: /**
0672: * Sets the start of a paren level
0673: *
0674: * @param which Which paren level
0675: * @param i Index in input array
0676: */
0677: protected final void setParenStart(int which, int i) {
0678: if (which < parenCount) {
0679: switch (which) {
0680: case 0:
0681: start0 = i;
0682: break;
0683:
0684: case 1:
0685: start1 = i;
0686: break;
0687:
0688: case 2:
0689: start2 = i;
0690: break;
0691:
0692: default:
0693: if (startn == null) {
0694: allocParens();
0695: }
0696: startn[which] = i;
0697: break;
0698: }
0699: }
0700: }
0701:
0702: /**
0703: * Sets the end of a paren level
0704: *
0705: * @param which Which paren level
0706: * @param i Index in input array
0707: */
0708: protected final void setParenEnd(int which, int i) {
0709: if (which < parenCount) {
0710: switch (which) {
0711: case 0:
0712: end0 = i;
0713: break;
0714:
0715: case 1:
0716: end1 = i;
0717: break;
0718:
0719: case 2:
0720: end2 = i;
0721: break;
0722:
0723: default:
0724: if (endn == null) {
0725: allocParens();
0726: }
0727: endn[which] = i;
0728: break;
0729: }
0730: }
0731: }
0732:
0733: /**
0734: * Throws an Error representing an internal error condition probably resulting
0735: * from a bug in the regular expression compiler (or possibly data corruption).
0736: * In practice, this should be very rare.
0737: *
0738: * @param s Error description
0739: */
0740: protected void internalError(String s) throws Error {
0741: throw new Error("RE internal error: " + s);
0742: }
0743:
0744: /**
0745: * Performs lazy allocation of subexpression arrays
0746: */
0747: private void allocParens() {
0748: // Allocate arrays for subexpressions
0749: startn = new int[maxParen];
0750: endn = new int[maxParen];
0751:
0752: // Set sub-expression pointers to invalid values
0753: for (int i = 0; i < maxParen; i++) {
0754: startn[i] = -1;
0755: endn[i] = -1;
0756: }
0757: }
0758:
0759: /**
0760: * Try to match a string against a subset of nodes in the program
0761: *
0762: * @param firstNode Node to start at in program
0763: * @param lastNode Last valid node (used for matching a subexpression without
0764: * matching the rest of the program as well).
0765: * @param idxStart Starting position in character array
0766: * @return Final input array index if match succeeded. -1 if not.
0767: */
0768: protected int matchNodes(int firstNode, int lastNode, int idxStart) {
0769: // Our current place in the string
0770: int idx = idxStart;
0771:
0772: // Loop while node is valid
0773: int next, opcode, opdata;
0774: int idxNew;
0775: char[] instruction = program.instruction;
0776: for (int node = firstNode; node < lastNode;) {
0777: opcode = instruction[node /* + offsetOpcode */];
0778: next = node + (short) instruction[node + offsetNext];
0779: opdata = instruction[node + offsetOpdata];
0780:
0781: switch (opcode) {
0782: case OP_MAYBE:
0783: case OP_STAR: {
0784: // Try to match the following subexpr. If it matches:
0785: // MAYBE: Continues matching rest of the expression
0786: // STAR: Points back here to repeat subexpr matching
0787: if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) {
0788: return idxNew;
0789: }
0790:
0791: // If failed, just continue with the rest of expression
0792: break;
0793: }
0794:
0795: case OP_PLUS: {
0796: // Try to match the subexpr again (and again (and ...
0797: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0798: return idxNew;
0799: }
0800:
0801: // If failed, just continue with the rest of expression
0802: // Rest is located at the next pointer of the next instruction
0803: // (which must be OP_CONTINUE)
0804: node = next + (short) instruction[next + offsetNext];
0805: continue;
0806: }
0807:
0808: case OP_RELUCTANTMAYBE:
0809: case OP_RELUCTANTSTAR: {
0810: // Try to match the rest without using the reluctant subexpr
0811: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0812: return idxNew;
0813: }
0814:
0815: // Try reluctant subexpr. If it matches:
0816: // RELUCTANTMAYBE: Continues matching rest of the expression
0817: // RELUCTANTSTAR: Points back here to repeat reluctant star matching
0818: return matchNodes(node + nodeSize, next, idx);
0819: }
0820:
0821: case OP_RELUCTANTPLUS: {
0822: // Continue matching the rest without using the reluctant subexpr
0823: if ((idxNew = matchNodes(next
0824: + (short) instruction[next + offsetNext],
0825: maxNode, idx)) != -1) {
0826: return idxNew;
0827: }
0828:
0829: // Try to match subexpression again
0830: break;
0831: }
0832:
0833: case OP_OPEN:
0834:
0835: // Match subexpression
0836: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0837: startBackref[opdata] = idx;
0838: }
0839: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0840: // Increase valid paren count
0841: if (opdata >= parenCount) {
0842: parenCount = opdata + 1;
0843: }
0844:
0845: // Don't set paren if already set later on
0846: if (getParenStart(opdata) == -1) {
0847: setParenStart(opdata, idx);
0848: }
0849: }
0850: return idxNew;
0851:
0852: case OP_CLOSE:
0853:
0854: // Done matching subexpression
0855: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
0856: endBackref[opdata] = idx;
0857: }
0858: if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
0859: // Increase valid paren count
0860: if (opdata >= parenCount) {
0861: parenCount = opdata + 1;
0862: }
0863:
0864: // Don't set paren if already set later on
0865: if (getParenEnd(opdata) == -1) {
0866: setParenEnd(opdata, idx);
0867: }
0868: }
0869: return idxNew;
0870:
0871: case OP_BACKREF: {
0872: // Get the start and end of the backref
0873: int s = startBackref[opdata];
0874: int e = endBackref[opdata];
0875:
0876: // We don't know the backref yet
0877: if (s == -1 || e == -1) {
0878: return -1;
0879: }
0880:
0881: // The backref is empty size
0882: if (s == e) {
0883: break;
0884: }
0885:
0886: // Get the length of the backref
0887: int l = e - s;
0888:
0889: // If there's not enough input left, give up.
0890: if (search.isEnd(idx + l - 1)) {
0891: return -1;
0892: }
0893:
0894: // Case fold the backref?
0895: final boolean caseFold = ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
0896: // Compare backref to input
0897: for (int i = 0; i < l; i++) {
0898: if (compareChars(search.charAt(idx++), search
0899: .charAt(s + i), caseFold) != 0) {
0900: return -1;
0901: }
0902: }
0903: }
0904: break;
0905:
0906: case OP_BOL:
0907:
0908: // Fail if we're not at the start of the string
0909: if (idx != 0) {
0910: // If we're multiline matching, we could still be at the start of a line
0911: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0912: // Continue if at the start of a line
0913: if (isNewline(idx - 1)) {
0914: break;
0915: }
0916: }
0917: return -1;
0918: }
0919: break;
0920:
0921: case OP_EOL:
0922:
0923: // If we're not at the end of string
0924: if (!search.isEnd(0) && !search.isEnd(idx)) {
0925: // If we're multi-line matching
0926: if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
0927: // Continue if we're at the end of a line
0928: if (isNewline(idx)) {
0929: break;
0930: }
0931: }
0932: return -1;
0933: }
0934: break;
0935:
0936: case OP_ESCAPE:
0937:
0938: // Which escape?
0939: switch (opdata) {
0940: // Word boundary match
0941: case E_NBOUND:
0942: case E_BOUND: {
0943: char cLast = ((idx == 0) ? '\n' : search
0944: .charAt(idx - 1));
0945: char cNext = ((search.isEnd(idx)) ? '\n' : search
0946: .charAt(idx));
0947: if ((Character.isLetterOrDigit(cLast) == Character
0948: .isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
0949: return -1;
0950: }
0951: }
0952: break;
0953:
0954: // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
0955: case E_ALNUM:
0956: case E_NALNUM:
0957: case E_DIGIT:
0958: case E_NDIGIT:
0959: case E_SPACE:
0960: case E_NSPACE:
0961:
0962: // Give up if out of input
0963: if (search.isEnd(idx)) {
0964: return -1;
0965: }
0966:
0967: char c = search.charAt(idx);
0968:
0969: // Switch on escape
0970: switch (opdata) {
0971: case E_ALNUM:
0972: case E_NALNUM:
0973: if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) {
0974: return -1;
0975: }
0976: break;
0977:
0978: case E_DIGIT:
0979: case E_NDIGIT:
0980: if (!(Character.isDigit(c) == (opdata == E_DIGIT))) {
0981: return -1;
0982: }
0983: break;
0984:
0985: case E_SPACE:
0986: case E_NSPACE:
0987: if (!(Character.isWhitespace(c) == (opdata == E_SPACE))) {
0988: return -1;
0989: }
0990: break;
0991: }
0992: idx++;
0993: break;
0994:
0995: default:
0996: internalError("Unrecognized escape '" + opdata
0997: + "'");
0998: }
0999: break;
1000:
1001: case OP_ANY:
1002:
1003: if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1004: // Match anything
1005: if (search.isEnd(idx)) {
1006: return -1;
1007: }
1008: } else {
1009: // Match anything but a newline
1010: if (search.isEnd(idx) || isNewline(idx)) {
1011: return -1;
1012: }
1013: }
1014: idx++;
1015: break;
1016:
1017: case OP_ATOM: {
1018: // Match an atom value
1019: if (search.isEnd(idx)) {
1020: return -1;
1021: }
1022:
1023: // Get length of atom and starting index
1024: // int lenAtom = opdata;
1025: int startAtom = node + nodeSize;
1026:
1027: // Give up if not enough input remains to have a match
1028: if (search.isEnd(opdata + idx - 1)) {
1029: return -1;
1030: }
1031:
1032: // Match atom differently depending on casefolding flag
1033: final boolean caseFold = ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1034:
1035: for (int i = 0; i < opdata; i++) {
1036: if (compareChars(search.charAt(idx++),
1037: instruction[startAtom + i], caseFold) != 0) {
1038: return -1;
1039: }
1040: }
1041: }
1042: break;
1043:
1044: case OP_POSIXCLASS: {
1045: // Out of input?
1046: if (search.isEnd(idx)) {
1047: return -1;
1048: }
1049:
1050: switch (opdata) {
1051: case POSIX_CLASS_ALNUM:
1052: if (!Character.isLetterOrDigit(search.charAt(idx))) {
1053: return -1;
1054: }
1055: break;
1056:
1057: case POSIX_CLASS_ALPHA:
1058: if (!Character.isLetter(search.charAt(idx))) {
1059: return -1;
1060: }
1061: break;
1062:
1063: case POSIX_CLASS_DIGIT:
1064: if (!Character.isDigit(search.charAt(idx))) {
1065: return -1;
1066: }
1067: break;
1068:
1069: case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1070: if (!Character.isSpaceChar(search.charAt(idx))) {
1071: return -1;
1072: }
1073: break;
1074:
1075: case POSIX_CLASS_SPACE:
1076: if (!Character.isWhitespace(search.charAt(idx))) {
1077: return -1;
1078: }
1079: break;
1080:
1081: case POSIX_CLASS_CNTRL:
1082: if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1083: return -1;
1084: }
1085: break;
1086:
1087: case POSIX_CLASS_GRAPH: // JWL - bugbug???
1088: switch (Character.getType(search.charAt(idx))) {
1089: case Character.MATH_SYMBOL:
1090: case Character.CURRENCY_SYMBOL:
1091: case Character.MODIFIER_SYMBOL:
1092: case Character.OTHER_SYMBOL:
1093: break;
1094:
1095: default:
1096: return -1;
1097: }
1098: break;
1099:
1100: case POSIX_CLASS_LOWER:
1101: if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1102: return -1;
1103: }
1104: break;
1105:
1106: case POSIX_CLASS_UPPER:
1107: if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1108: return -1;
1109: }
1110: break;
1111:
1112: case POSIX_CLASS_PRINT:
1113: if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1114: return -1;
1115: }
1116: break;
1117:
1118: case POSIX_CLASS_PUNCT: {
1119: int type = Character.getType(search.charAt(idx));
1120: switch (type) {
1121: case Character.DASH_PUNCTUATION:
1122: case Character.START_PUNCTUATION:
1123: case Character.END_PUNCTUATION:
1124: case Character.CONNECTOR_PUNCTUATION:
1125: case Character.OTHER_PUNCTUATION:
1126: break;
1127:
1128: default:
1129: return -1;
1130: }
1131: }
1132: break;
1133:
1134: case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1135: {
1136: boolean isXDigit = ((search.charAt(idx) >= '0' && search
1137: .charAt(idx) <= '9')
1138: || (search.charAt(idx) >= 'a' && search
1139: .charAt(idx) <= 'f') || (search
1140: .charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1141: if (!isXDigit) {
1142: return -1;
1143: }
1144: }
1145: break;
1146:
1147: case POSIX_CLASS_JSTART:
1148: if (!Character.isJavaIdentifierStart(search
1149: .charAt(idx))) {
1150: return -1;
1151: }
1152: break;
1153:
1154: case POSIX_CLASS_JPART:
1155: if (!Character.isJavaIdentifierPart(search
1156: .charAt(idx))) {
1157: return -1;
1158: }
1159: break;
1160:
1161: default:
1162: internalError("Bad posix class");
1163: break;
1164: }
1165:
1166: // Matched.
1167: idx++;
1168: }
1169: break;
1170:
1171: case OP_ANYOF: {
1172: // Out of input?
1173: if (search.isEnd(idx)) {
1174: return -1;
1175: }
1176:
1177: // Get character to match against character class and maybe casefold
1178: char c = search.charAt(idx);
1179: boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1180: // Loop through character class checking our match character
1181: int idxRange = node + nodeSize;
1182: int idxEnd = idxRange + (opdata * 2);
1183: boolean match = false;
1184: for (int i = idxRange; !match && i < idxEnd;) {
1185: // Get start, end and match characters
1186: char s = instruction[i++];
1187: char e = instruction[i++];
1188:
1189: match = ((compareChars(c, s, caseFold) >= 0) && (compareChars(
1190: c, e, caseFold) <= 0));
1191: }
1192:
1193: // Fail if we didn't match the character class
1194: if (!match) {
1195: return -1;
1196: }
1197: idx++;
1198: }
1199: break;
1200:
1201: case OP_BRANCH: {
1202: // Check for choices
1203: // FIXME Dead code - only reason to keep is backward compat with pre-compiled exprs. Remove?
1204: if (instruction[next /* + offsetOpcode */] != OP_BRANCH) {
1205: // If there aren't any other choices, just evaluate this branch.
1206: node += nodeSize;
1207: continue;
1208: }
1209:
1210: // Try all available branches
1211: int nextBranch;
1212: do {
1213: // Try matching the branch against the string
1214: if ((idxNew = matchNodes(node + nodeSize, maxNode,
1215: idx)) != -1) {
1216: return idxNew;
1217: }
1218:
1219: // Go to next branch (if any)
1220: nextBranch = (short) instruction[node + offsetNext];
1221: node += nextBranch;
1222: } while (nextBranch != 0
1223: && (instruction[node /* + offsetOpcode */] == OP_BRANCH));
1224:
1225: // Failed to match any branch!
1226: return -1;
1227: }
1228:
1229: case OP_OPEN_CLUSTER:
1230: case OP_CLOSE_CLUSTER:
1231: // starting or ending the matching of a subexpression which has no backref.
1232:
1233: case OP_NOTHING:
1234: case OP_GOTO:
1235:
1236: // Just advance to the next node without doing anything
1237: break;
1238:
1239: case OP_CONTINUE:
1240:
1241: // Advance to the following node
1242: node += nodeSize;
1243: continue;
1244:
1245: case OP_END:
1246:
1247: // Match has succeeded!
1248: setParenEnd(0, idx);
1249: return idx;
1250:
1251: default:
1252:
1253: // Corrupt program
1254: internalError("Invalid opcode '" + opcode + "'");
1255: }
1256:
1257: // Advance to the next node in the program
1258: node = next;
1259: }
1260:
1261: // We "should" never end up here
1262: internalError("Corrupt program");
1263: return -1;
1264: }
1265:
1266: /**
1267: * Match the current regular expression program against the current
1268: * input string, starting at index i of the input string. This method
1269: * is only meant for internal use.
1270: *
1271: * @param i The input string index to start matching at
1272: * @return True if the input matched the expression
1273: */
1274: protected boolean matchAt(int i) {
1275: // Initialize start pointer, paren cache and paren count
1276: start0 = -1;
1277: end0 = -1;
1278: start1 = -1;
1279: end1 = -1;
1280: start2 = -1;
1281: end2 = -1;
1282: startn = null;
1283: endn = null;
1284: parenCount = 1;
1285: setParenStart(0, i);
1286:
1287: // Allocate backref arrays (unless optimizations indicate otherwise)
1288: if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1289: startBackref = new int[maxParen];
1290: endBackref = new int[maxParen];
1291: }
1292:
1293: // Match against string
1294: int idx;
1295: if ((idx = matchNodes(0, maxNode, i)) != -1) {
1296: setParenEnd(0, idx);
1297: return true;
1298: }
1299:
1300: // Didn't match
1301: parenCount = 0;
1302: return false;
1303: }
1304:
1305: /**
1306: * Matches the current regular expression program against a character array,
1307: * starting at a given index.
1308: *
1309: * @param search String to match against
1310: * @param i Index to start searching at
1311: * @return True if string matched
1312: */
1313: public boolean match(String search, int i) {
1314: return match(new StringCharacterIterator(search), i);
1315: }
1316:
1317: /**
1318: * Matches the current regular expression program against a character array,
1319: * starting at a given index.
1320: *
1321: * @param search String to match against
1322: * @param i Index to start searching at
1323: * @return True if string matched
1324: */
1325: public boolean match(CharacterIterator search, int i) {
1326: // There is no compiled program to search with!
1327: if (program == null) {
1328: // This should be uncommon enough to be an error case rather
1329: // than an exception (which would have to be handled everywhere)
1330: internalError("No RE program to run!");
1331: }
1332:
1333: // Save string to search
1334: this .search = search;
1335:
1336: // Can we optimize the search by looking for new lines?
1337: if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL) {
1338: // Non multi-line matching with BOL: Must match at '0' index
1339: if ((matchFlags & MATCH_MULTILINE) == 0) {
1340: return i == 0 && matchAt(i);
1341: }
1342:
1343: // Multi-line matching with BOL: Seek to next line
1344: for (; !search.isEnd(i); i++) {
1345: // Skip if we are at the beginning of the line
1346: if (isNewline(i)) {
1347: continue;
1348: }
1349:
1350: // Match at the beginning of the line
1351: if (matchAt(i)) {
1352: return true;
1353: }
1354:
1355: // Skip to the end of line
1356: for (; !search.isEnd(i); i++) {
1357: if (isNewline(i)) {
1358: break;
1359: }
1360: }
1361: }
1362:
1363: return false;
1364: }
1365:
1366: // Can we optimize the search by looking for a prefix string?
1367: if (program.prefix == null) {
1368: // Unprefixed matching must try for a match at each character
1369: for (; !search.isEnd(i - 1); i++) {
1370: // Try a match at index i
1371: if (matchAt(i)) {
1372: return true;
1373: }
1374: }
1375: return false;
1376: } else {
1377: // Prefix-anchored matching is possible
1378: boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1379: char[] prefix = program.prefix;
1380: for (; !search.isEnd(i + prefix.length - 1); i++) {
1381: int j = i;
1382: int k = 0;
1383:
1384: boolean match;
1385: do {
1386: // If there's a mismatch of any character in the prefix, give up
1387: match = (compareChars(search.charAt(j++),
1388: prefix[k++], caseIndependent) == 0);
1389: } while (match && k < prefix.length);
1390:
1391: // See if the whole prefix string matched
1392: if (k == prefix.length) {
1393: // We matched the full prefix at firstChar, so try it
1394: if (matchAt(i)) {
1395: return true;
1396: }
1397: }
1398: }
1399: return false;
1400: }
1401: }
1402:
1403: /**
1404: * Matches the current regular expression program against a String.
1405: *
1406: * @param search String to match against
1407: * @return True if string matched
1408: */
1409: public boolean match(String search) {
1410: return match(search, 0);
1411: }
1412:
1413: /**
1414: * Splits a string into an array of strings on regular expression boundaries.
1415: * This function works the same way as the Perl function of the same name.
1416: * Given a regular expression of "[ab]+" and a string to split of
1417: * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1418: * "[xyzzy, yyz, 123]".
1419: *
1420: * <p>Please note that the first string in the resulting array may be an empty
1421: * string. This happens when the very first character of input string is
1422: * matched by the pattern.
1423: *
1424: * @param s String to split on this regular exression
1425: * @return Array of strings
1426: */
1427: public String[] split(String s) {
1428: // Create new vector
1429: Vector v = new Vector();
1430:
1431: // Start at position 0 and search the whole string
1432: int pos = 0;
1433: int len = s.length();
1434:
1435: // Try a match at each position
1436: while (pos < len && match(s, pos)) {
1437: // Get start of match
1438: int start = getParenStart(0);
1439:
1440: // Get end of match
1441: int newpos = getParenEnd(0);
1442:
1443: // Check if no progress was made
1444: if (newpos == pos) {
1445: v.addElement(s.substring(pos, start + 1));
1446: newpos++;
1447: } else {
1448: v.addElement(s.substring(pos, start));
1449: }
1450:
1451: // Move to new position
1452: pos = newpos;
1453: }
1454:
1455: // Push remainder if it's not empty
1456: String remainder = s.substring(pos);
1457: if (remainder.length() != 0) {
1458: v.addElement(remainder);
1459: }
1460:
1461: // Return vector as an array of strings
1462: String[] ret = new String[v.size()];
1463: v.copyInto(ret);
1464: return ret;
1465: }
1466:
1467: /**
1468: * Flag bit that indicates that subst should replace all occurrences of this
1469: * regular expression.
1470: */
1471: public static final int REPLACE_ALL = 0x0000;
1472:
1473: /**
1474: * Flag bit that indicates that subst should only replace the first occurrence
1475: * of this regular expression.
1476: */
1477: public static final int REPLACE_FIRSTONLY = 0x0001;
1478:
1479: /**
1480: * Flag bit that indicates that subst should replace backreferences
1481: */
1482: public static final int REPLACE_BACKREFERENCES = 0x0002;
1483:
1484: /**
1485: * Substitutes a string for this regular expression in another string.
1486: * This method works like the Perl function of the same name.
1487: * Given a regular expression of "a*b", a String to substituteIn of
1488: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1489: * resulting String returned by subst would be "-foo-garply-wacky-".
1490: *
1491: * @param substituteIn String to substitute within
1492: * @param substitution String to substitute for all matches of this regular expression.
1493: * @return The string substituteIn with zero or more occurrences of the current
1494: * regular expression replaced with the substitution String (if this regular
1495: * expression object doesn't match at any position, the original String is returned
1496: * unchanged).
1497: */
1498: public String subst(String substituteIn, String substitution) {
1499: return subst(substituteIn, substitution, REPLACE_ALL);
1500: }
1501:
1502: /**
1503: * Substitutes a string for this regular expression in another string.
1504: * This method works like the Perl function of the same name.
1505: * Given a regular expression of "a*b", a String to substituteIn of
1506: * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1507: * resulting String returned by subst would be "-foo-garply-wacky-".
1508: * <p>
1509: * It is also possible to reference the contents of a parenthesized expression
1510: * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1511: * a String to substituteIn of "visit us: http://www.apache.org!" and the
1512: * substitution String "<a href=\"$0\">$0</a>", the resulting String
1513: * returned by subst would be
1514: * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!".
1515: * <p>
1516: * <i>Note:</i> $0 represents the whole match.
1517: *
1518: * @param substituteIn String to substitute within
1519: * @param substitution String to substitute for matches of this regular expression
1520: * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1521: * flag bit is set, only the first occurrence of this regular expression is replaced.
1522: * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1523: * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1524: * be processed.
1525: * @return The string substituteIn with zero or more occurrences of the current
1526: * regular expression replaced with the substitution String (if this regular
1527: * expression object doesn't match at any position, the original String is returned
1528: * unchanged).
1529: */
1530: public String subst(String substituteIn, String substitution,
1531: int flags) {
1532: // String to return
1533: StringBuffer ret = new StringBuffer();
1534:
1535: // Start at position 0 and search the whole string
1536: int pos = 0;
1537: int len = substituteIn.length();
1538:
1539: // Try a match at each position
1540: while (pos < len && match(substituteIn, pos)) {
1541: // Append string before match
1542: ret.append(substituteIn.substring(pos, getParenStart(0)));
1543:
1544: if ((flags & REPLACE_BACKREFERENCES) != 0) {
1545: // Process backreferences
1546: int lCurrentPosition = 0;
1547: int lLastPosition = -2;
1548: int lLength = substitution.length();
1549:
1550: while ((lCurrentPosition = substitution.indexOf("$",
1551: lCurrentPosition)) >= 0) {
1552: if ((lCurrentPosition == 0 || substitution
1553: .charAt(lCurrentPosition - 1) != '\\')
1554: && lCurrentPosition + 1 < lLength) {
1555: char c = substitution
1556: .charAt(lCurrentPosition + 1);
1557: if (c >= '0' && c <= '9') {
1558: // Append everything between the last and the current $ sign
1559: ret.append(substitution
1560: .substring(lLastPosition + 2,
1561: lCurrentPosition));
1562:
1563: // Append the parenthesized expression, if present
1564: String val = getParen(c - '0');
1565: if (val != null) {
1566: ret.append(val);
1567: }
1568: lLastPosition = lCurrentPosition;
1569: }
1570: }
1571:
1572: // Move forward, skipping past match
1573: lCurrentPosition++;
1574: }
1575:
1576: // Append everything after the last $ sign
1577: ret.append(substitution.substring(lLastPosition + 2,
1578: lLength));
1579: } else {
1580: // Append substitution without processing backreferences
1581: ret.append(substitution);
1582: }
1583:
1584: // Move forward, skipping past match
1585: int newpos = getParenEnd(0);
1586:
1587: // We always want to make progress!
1588: if (newpos == pos) {
1589: newpos++;
1590: }
1591:
1592: // Try new position
1593: pos = newpos;
1594:
1595: // Break out if we're only supposed to replace one occurrence
1596: if ((flags & REPLACE_FIRSTONLY) != 0) {
1597: break;
1598: }
1599: }
1600:
1601: // If there's remaining input, append it
1602: if (pos < len) {
1603: ret.append(substituteIn.substring(pos));
1604: }
1605:
1606: // Return string buffer as string
1607: return ret.toString();
1608: }
1609:
1610: /**
1611: * Returns an array of Strings, whose toString representation matches a regular
1612: * expression. This method works like the Perl function of the same name. Given
1613: * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1614: * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1615: *
1616: * @param search Array of Objects to search
1617: * @return Array of Strings whose toString() value matches this regular expression.
1618: */
1619: public String[] grep(Object[] search) {
1620: // Create new vector to hold return items
1621: Vector v = new Vector();
1622:
1623: // Traverse array of objects
1624: for (int i = 0; i < search.length; i++) {
1625: // Get next object as a string
1626: String s = search[i].toString();
1627:
1628: // If it matches this regexp, add it to the list
1629: if (match(s)) {
1630: v.addElement(s);
1631: }
1632: }
1633:
1634: // Return vector as an array of strings
1635: String[] ret = new String[v.size()];
1636: v.copyInto(ret);
1637: return ret;
1638: }
1639:
1640: /**
1641: * @return true if character at i-th position in the <code>search</code> string is a newline
1642: */
1643: private boolean isNewline(int i) {
1644: char nextChar = search.charAt(i);
1645:
1646: return nextChar == '\n' || nextChar == '\r'
1647: || nextChar == '\u0085' || nextChar == '\u2028'
1648: || nextChar == '\u2029';
1649: }
1650:
1651: /**
1652: * Compares two characters.
1653: *
1654: * @param c1 first character to compare.
1655: * @param c2 second character to compare.
1656: * @param caseIndependent whether comparision is case insensitive or not.
1657: * @return negative, 0, or positive integer as the first character
1658: * less than, equal to, or greater then the second.
1659: */
1660: private int compareChars(char c1, char c2, boolean caseIndependent) {
1661: if (caseIndependent) {
1662: c1 = Character.toLowerCase(c1);
1663: c2 = Character.toLowerCase(c2);
1664: }
1665: return ((int) c1 - (int) c2);
1666: }
1667: }
|