0001: /*
0002: * gnu/regexp/RE.java
0003: * Copyright (C) 1998-2001 Wes Biggs
0004: *
0005: * This library is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Lesser General Public License as published
0007: * by the Free Software Foundation; either version 2.1 of the License, or
0008: * (at your option) any later version.
0009: *
0010: * This library is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Lesser General Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Lesser General Public License
0016: * along with this program; if not, write to the Free Software
0017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0018: */
0019:
0020: package gnu.regexp;
0021:
0022: import java.io.InputStream;
0023: import java.io.Reader;
0024: import java.io.Serializable;
0025: import java.util.Locale;
0026: import java.util.PropertyResourceBundle;
0027: import java.util.ResourceBundle;
0028: import java.util.Vector;
0029:
/**
 * Mutable pair of ints. The pattern parser in RE uses one instance to
 * carry the two bounds of an interval operator such as {x,y}.
 */
class IntPair implements Serializable {
    /** First value of the pair (read as the interval minimum by RE). */
    public int first;

    /** Second value of the pair (read as the interval maximum by RE). */
    public int second;
}
0033:
/**
 * One parsed unit of a pattern: a character together with a flag saying
 * whether it was preceded by a backslash.
 */
class CharUnit implements Serializable {
    /** True when the character was escaped with a backslash. */
    public boolean bk;

    /** The character itself (the one after the backslash when escaped). */
    public char ch;
}
0038:
0039: /**
0040: * RE provides the user interface for compiling and matching regular
0041: * expressions.
0042: * <P>
0043: * A regular expression object (class RE) is compiled by constructing it
0044: * from a String, StringBuffer or character array, with optional
0045: * compilation flags (below)
0046: * and an optional syntax specification (see RESyntax; if not specified,
0047: * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
0048: * <P>
0049: * Once compiled, a regular expression object is reusable as well as
0050: * threadsafe: multiple threads can use the RE instance simultaneously
0051: * to match against different input text.
0052: * <P>
0053: * Various methods attempt to match input text against a compiled
0054: * regular expression. These methods are:
0055: * <LI><code>isMatch</code>: returns true if the input text in its
0056: * entirety matches the regular expression pattern.
0057: * <LI><code>getMatch</code>: returns the first match found in the
0058: * input text, or null if no match is found.
0059: * <LI><code>getAllMatches</code>: returns an array of all
0060: * non-overlapping matches found in the input text. If no matches are
0061: * found, the array is zero-length.
0062: * <LI><code>substitute</code>: substitute the first occurrence of the
0063: * pattern in the input text with a replacement string (which may
0064: * include metacharacters $0-$9, see REMatch.substituteInto).
0065: * <LI><code>substituteAll</code>: same as above, but repeat for each
0066: * match before returning.
0067: * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
0068: * object that allows iteration over the matches (see
0069: * REMatchEnumeration for some reasons why you may want to do this
0070: * instead of using <code>getAllMatches</code>).
0071: * <P>
0072: *
0073: * These methods all have similar argument lists. The input can be a
0074: * String, a character array, a StringBuffer, a Reader or an
0075: * InputStream of some sort. Note that when using a Reader or
0076: * InputStream, the stream read position cannot be guaranteed after
0077: * attempting a match (this is not a bug, but a consequence of the way
0078: * regular expressions work). Using an REMatchEnumeration can
0079: * eliminate most positioning problems.
0080: *
0081: * <P>
0082: *
0083: * The optional index argument specifies the offset from the beginning
0084: * of the text at which the search should start (see the descriptions
0085: * of some of the execution flags for how this can affect positional
0086: * pattern operators). For a Reader or InputStream, this means an
0087: * offset from the current read position, so subsequent calls with the
0088: * same index argument on a Reader or an InputStream will not
0089: * necessarily access the same position on the stream, whereas
0090: * repeated searches at a given index in a fixed string will return
0091: * consistent results.
0092: *
0093: * <P>
0094: * You can optionally affect the execution environment by using a
0095: * combination of execution flags (constants listed below).
0096: *
0097: * <P>
0098: * All operations on a regular expression are performed in a
0099: * thread-safe manner.
0100: *
0101: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
0102: * @version 1.1.5-dev, to be released
0103: */
0104:
0105: public class RE extends REToken {
0106: // This String will be returned by getVersion()
0107: private static final String VERSION = "1.1.5-dev";
0108:
0109: // The localized strings are kept in a separate file
0110: private static ResourceBundle messages = PropertyResourceBundle
0111: .getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
0112:
0113: // These are, respectively, the first and last tokens in our linked list
0114: // If there is only one token, firstToken == lastToken
0115: private REToken firstToken, lastToken;
0116:
0117: // This is the number of subexpressions in this regular expression,
0118: // with a minimum value of zero. Returned by getNumSubs()
0119: private int numSubs;
0120:
0121: /** Minimum length, in characters, of any possible match. */
0122: private int minimumLength;
0123:
/**
 * Compilation flag (bit value 2). Do not differentiate case. Subsequent
 * searches using this RE will be case insensitive.
 */
public static final int REG_ICASE = 2;

/**
 * Compilation flag (bit value 4). The match-any-character operator (dot)
 * will match a newline character. When set this overrides the syntax
 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
 * the "/s" operator in Perl.
 */
public static final int REG_DOT_NEWLINE = 4;

/**
 * Compilation flag (bit value 8). Use multiline mode. In this mode, the
 * ^ and $ anchors will match based on newlines within the input. This is
 * equivalent to the "/m" operator in Perl.
 */
public static final int REG_MULTILINE = 8;

/**
 * Execution flag (bit value 16).
 * The match-beginning operator (^) will not match at the beginning
 * of the input string. Useful for matching on a substring when you
 * know the context of the input is such that position zero of the
 * input to the match test is not actually position zero of the text.
 * <P>
 * This example demonstrates the results of various ways of matching on
 * a substring.
 * <P>
 * <CODE>
 * String s = "food bar fool";<BR>
 * RE exp = new RE("^foo.");<BR>
 * REMatch m0 = exp.getMatch(s);<BR>
 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 * REMatch m3 = exp.getMatch(s,8); <BR>
 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
 * <P>
 * // Results:<BR>
 * // m0.toString(): "food"<BR>
 * // m1.toString(): "fool"<BR>
 * // m2.toString(): null<BR>
 * // m3.toString(): null<BR>
 * // m4.toString(): "fool"<BR>
 * </CODE>
 */
public static final int REG_NOTBOL = 16;

/**
 * Execution flag (bit value 32).
 * The match-end operator ($) does not match at the end
 * of the input string. Useful for matching on substrings.
 */
public static final int REG_NOTEOL = 32;

/**
 * Execution flag (bit value 64).
 * When a match method is invoked that starts matching at a non-zero
 * index into the input, treat the input as if it begins at the index
 * given. The effect of this flag is that the engine does not "see"
 * any text in the input before the given index. This is useful so
 * that the match-beginning operator (^) matches not at position 0
 * in the input string, but at the position the search started at
 * (based on the index input given to the getMatch function). See
 * the example under REG_NOTBOL. It also affects the use of the \<
 * and \b operators.
 */
public static final int REG_ANCHORINDEX = 64;

/**
 * Execution flag (bit value 128).
 * The substitute and substituteAll methods will not attempt to
 * interpolate occurrences of $1-$9 in the replacement text with
 * the corresponding subexpressions. For example, you may want to
 * replace all matches of "one dollar" with "$1".
 */
public static final int REG_NO_INTERPOLATE = 128;
0203:
0204: /** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
    // VERSION is the private package-version constant of this class.
    return VERSION;
}
0208:
0209: // Retrieves a message from the ResourceBundle
static final String getLocalizedMessage(String key) {
    // Look the key up in the shared "gnu/regexp/MessagesBundle" bundle.
    // ResourceBundle.getString throws MissingResourceException when the
    // key is not present.
    return messages.getString(key);
}
0213:
0214: /**
0215: * Constructs a regular expression pattern buffer without any compilation
0216: * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
0217: *
0218: * @param pattern A regular expression pattern, in the form of a String,
0219: * StringBuffer or char[]. Other input types will be converted to
0220: * strings using the toString() method.
0221: * @exception REException The input pattern could not be parsed.
0222: * @exception NullPointerException The pattern was null.
0223: */
public RE(Object pattern) throws REException {
    // Delegate to the private five-argument constructor with no
    // compilation flags, Perl 5 syntax, myIndex 0 and nextSub 0.
    this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0227:
0228: /**
0229: * Constructs a regular expression pattern buffer using the specified
0230: * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
0231: *
0232: * @param pattern A regular expression pattern, in the form of a String,
0233: * StringBuffer, or char[]. Other input types will be converted to
0234: * strings using the toString() method.
0235: * @param cflags The logical OR of any combination of the compilation flags listed above.
0236: * @exception REException The input pattern could not be parsed.
0237: * @exception NullPointerException The pattern was null.
0238: */
public RE(Object pattern, int cflags) throws REException {
    // Delegate to the private five-argument constructor with the caller's
    // flags, Perl 5 syntax, myIndex 0 and nextSub 0.
    this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0242:
0243: /**
0244: * Constructs a regular expression pattern buffer using the specified
0245: * compilation flags and regular expression syntax.
0246: *
0247: * @param pattern A regular expression pattern, in the form of a String,
0248: * StringBuffer, or char[]. Other input types will be converted to
0249: * strings using the toString() method.
0250: * @param cflags The logical OR of any combination of the compilation flags listed above.
0251: * @param syntax The type of regular expression syntax to use.
0252: * @exception REException The input pattern could not be parsed.
0253: * @exception NullPointerException The pattern was null.
0254: */
public RE(Object pattern, int cflags, RESyntax syntax)
        throws REException {
    // Delegate to the private five-argument constructor with the caller's
    // flags and syntax, myIndex 0 and nextSub 0.
    this (pattern, cflags, syntax, 0, 0);
}
0259:
0260: // internal constructor used for alternation
0261: private RE(REToken first, REToken last, int subs, int subIndex,
0262: int minLength) {
0263: super (subIndex);
0264: firstToken = first;
0265: lastToken = last;
0266: numSubs = subs;
0267: minimumLength = minLength;
0268: addToken(new RETokenEndSub(subIndex));
0269: }
0270:
private RE(Object patternObj, int cflags, RESyntax syntax,
        int myIndex, int nextSub) throws REException {
    super (myIndex); // Subexpression index of this token.
    // All parsing happens in initialize(), which is protected.
    // NOTE(review): a subclass override would run before that subclass's
    // own constructor body -- confirm no subclass relies on its own state there.
    initialize(patternObj, cflags, syntax, myIndex, nextSub);
}
0276:
0277: // For use by subclasses
protected RE() {
    // Subexpression index 0; no pattern is parsed here, so firstToken and
    // lastToken stay null until initialize() is called.
    super (0);
}
0281:
0282: // The meat of construction
0283: protected void initialize(Object patternObj, int cflags,
0284: RESyntax syntax, int myIndex, int nextSub)
0285: throws REException {
0286: char[] pattern;
0287: if (patternObj instanceof String) {
0288: pattern = ((String) patternObj).toCharArray();
0289: } else if (patternObj instanceof char[]) {
0290: pattern = (char[]) patternObj;
0291: } else if (patternObj instanceof StringBuffer) {
0292: pattern = new char[((StringBuffer) patternObj).length()];
0293: ((StringBuffer) patternObj).getChars(0, pattern.length,
0294: pattern, 0);
0295: } else {
0296: pattern = patternObj.toString().toCharArray();
0297: }
0298:
0299: int pLength = pattern.length;
0300:
0301: numSubs = 0; // Number of subexpressions in this token.
0302: Vector branches = null;
0303:
0304: // linked list of tokens (sort of -- some closed loops can exist)
0305: firstToken = lastToken = null;
0306:
0307: // Precalculate these so we don't pay for the math every time we
0308: // need to access them.
0309: boolean insens = ((cflags & REG_ICASE) > 0);
0310:
0311: // Parse pattern into tokens. Does anyone know if it's more efficient
0312: // to use char[] than a String.charAt()? I'm assuming so.
0313:
0314: // index tracks the position in the char array
0315: int index = 0;
0316:
0317: // this will be the current parse character (pattern[index])
0318: CharUnit unit = new CharUnit();
0319:
0320: // This is used for {x,y} calculations
0321: IntPair minMax = new IntPair();
0322:
0323: // Buffer a token so we can create a TokenRepeated, etc.
0324: REToken currentToken = null;
0325: char ch;
0326:
0327: while (index < pLength) {
0328: // read the next character unit (including backslash escapes)
0329: index = getCharUnit(pattern, index, unit);
0330:
0331: // ALTERNATION OPERATOR
0332: // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
0333: // not available if RE_LIMITED_OPS is set
0334:
0335: // TODO: the '\n' literal here should be a test against REToken.newline,
0336: // which unfortunately may be more than a single character.
0337: if (((unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk)) || (syntax
0338: .get(RESyntax.RE_NEWLINE_ALT)
0339: && (unit.ch == '\n') && !unit.bk))
0340: && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
0341: // make everything up to here be a branch. create vector if nec.
0342: addToken(currentToken);
0343: RE theBranch = new RE(firstToken, lastToken, numSubs,
0344: subIndex, minimumLength);
0345: minimumLength = 0;
0346: if (branches == null) {
0347: branches = new Vector();
0348: }
0349: branches.addElement(theBranch);
0350: firstToken = lastToken = currentToken = null;
0351: }
0352:
0353: // INTERVAL OPERATOR:
0354: // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
0355: // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
0356: //
0357: // OPEN QUESTION:
0358: // what is proper interpretation of '{' at start of string?
0359:
0360: else if ((unit.ch == '{')
0361: && syntax.get(RESyntax.RE_INTERVALS)
0362: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
0363: int newIndex = getMinMax(pattern, index, minMax, syntax);
0364: if (newIndex > index) {
0365: if (minMax.first > minMax.second)
0366: throw new REException(
0367: getLocalizedMessage("interval.order"),
0368: REException.REG_BADRPT, newIndex);
0369: if (currentToken == null)
0370: throw new REException(
0371: getLocalizedMessage("repeat.no.token"),
0372: REException.REG_BADRPT, newIndex);
0373: if (currentToken instanceof RETokenRepeated)
0374: throw new REException(
0375: getLocalizedMessage("repeat.chained"),
0376: REException.REG_BADRPT, newIndex);
0377: if (currentToken instanceof RETokenWordBoundary
0378: || currentToken instanceof RETokenWordBoundary)
0379: throw new REException(
0380: getLocalizedMessage("repeat.assertion"),
0381: REException.REG_BADRPT, newIndex);
0382: if ((currentToken.getMinimumLength() == 0)
0383: && (minMax.second == Integer.MAX_VALUE))
0384: throw new REException(
0385: getLocalizedMessage("repeat.empty.token"),
0386: REException.REG_BADRPT, newIndex);
0387: index = newIndex;
0388: currentToken = setRepeated(currentToken,
0389: minMax.first, minMax.second, index);
0390: } else {
0391: addToken(currentToken);
0392: currentToken = new RETokenChar(subIndex, unit.ch,
0393: insens);
0394: }
0395: }
0396:
0397: // LIST OPERATOR:
0398: // [...] | [^...]
0399:
0400: else if ((unit.ch == '[') && !unit.bk) {
0401: Vector options = new Vector();
0402: boolean negative = false;
0403: char lastChar = 0;
0404: if (index == pLength)
0405: throw new REException(
0406: getLocalizedMessage("unmatched.bracket"),
0407: REException.REG_EBRACK, index);
0408:
0409: // Check for initial caret, negation
0410: if ((ch = pattern[index]) == '^') {
0411: negative = true;
0412: if (++index == pLength)
0413: throw new REException(
0414: getLocalizedMessage("class.no.end"),
0415: REException.REG_EBRACK, index);
0416: ch = pattern[index];
0417: }
0418:
0419: // Check for leading right bracket literal
0420: if (ch == ']') {
0421: lastChar = ch;
0422: if (++index == pLength)
0423: throw new REException(
0424: getLocalizedMessage("class.no.end"),
0425: REException.REG_EBRACK, index);
0426: }
0427:
0428: while ((ch = pattern[index++]) != ']') {
0429: if ((ch == '-') && (lastChar != 0)) {
0430: if (index == pLength)
0431: throw new REException(
0432: getLocalizedMessage("class.no.end"),
0433: REException.REG_EBRACK, index);
0434: if ((ch = pattern[index]) == ']') {
0435: options.addElement(new RETokenChar(
0436: subIndex, lastChar, insens));
0437: lastChar = '-';
0438: } else {
0439: options.addElement(new RETokenRange(
0440: subIndex, lastChar, ch, insens));
0441: lastChar = 0;
0442: index++;
0443: }
0444: } else if ((ch == '\\')
0445: && syntax
0446: .get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
0447: if (index == pLength)
0448: throw new REException(
0449: getLocalizedMessage("class.no.end"),
0450: REException.REG_EBRACK, index);
0451: int posixID = -1;
0452: boolean negate = false;
0453: char asciiEsc = 0;
0454: if (("dswDSW".indexOf(pattern[index]) != -1)
0455: && syntax
0456: .get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
0457: switch (pattern[index]) {
0458: case 'D':
0459: negate = true;
0460: case 'd':
0461: posixID = RETokenPOSIX.DIGIT;
0462: break;
0463: case 'S':
0464: negate = true;
0465: case 's':
0466: posixID = RETokenPOSIX.SPACE;
0467: break;
0468: case 'W':
0469: negate = true;
0470: case 'w':
0471: posixID = RETokenPOSIX.ALNUM;
0472: break;
0473: }
0474: } else if ("nrt".indexOf(pattern[index]) != -1) {
0475: switch (pattern[index]) {
0476: case 'n':
0477: asciiEsc = '\n';
0478: break;
0479: case 't':
0480: asciiEsc = '\t';
0481: break;
0482: case 'r':
0483: asciiEsc = '\r';
0484: break;
0485: }
0486: }
0487: if (lastChar != 0)
0488: options.addElement(new RETokenChar(
0489: subIndex, lastChar, insens));
0490:
0491: if (posixID != -1) {
0492: options.addElement(new RETokenPOSIX(
0493: subIndex, posixID, insens, negate));
0494: } else if (asciiEsc != 0) {
0495: lastChar = asciiEsc;
0496: } else {
0497: lastChar = pattern[index];
0498: }
0499: ++index;
0500: } else if ((ch == '[')
0501: && (syntax.get(RESyntax.RE_CHAR_CLASSES))
0502: && (index < pLength)
0503: && (pattern[index] == ':')) {
0504: StringBuffer posixSet = new StringBuffer();
0505: index = getPosixSet(pattern, index + 1,
0506: posixSet);
0507: int posixId = RETokenPOSIX.intValue(posixSet
0508: .toString());
0509: if (posixId != -1)
0510: options.addElement(new RETokenPOSIX(
0511: subIndex, posixId, insens, false));
0512: } else {
0513: if (lastChar != 0)
0514: options.addElement(new RETokenChar(
0515: subIndex, lastChar, insens));
0516: lastChar = ch;
0517: }
0518: if (index == pLength)
0519: throw new REException(
0520: getLocalizedMessage("class.no.end"),
0521: REException.REG_EBRACK, index);
0522: } // while in list
0523: // Out of list, index is one past ']'
0524:
0525: if (lastChar != 0)
0526: options.addElement(new RETokenChar(subIndex,
0527: lastChar, insens));
0528:
0529: // Create a new RETokenOneOf
0530: addToken(currentToken);
0531: options.trimToSize();
0532: currentToken = new RETokenOneOf(subIndex, options,
0533: negative);
0534: }
0535:
0536: // SUBEXPRESSIONS
0537: // (...) | \(...\) depending on RE_NO_BK_PARENS
0538:
0539: else if ((unit.ch == '(')
0540: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
0541: boolean pure = false;
0542: boolean comment = false;
0543: boolean lookAhead = false;
0544: boolean negativelh = false;
0545: if ((index + 1 < pLength) && (pattern[index] == '?')) {
0546: switch (pattern[index + 1]) {
0547: case '!':
0548: if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
0549: pure = true;
0550: negativelh = true;
0551: lookAhead = true;
0552: index += 2;
0553: }
0554: break;
0555: case '=':
0556: if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
0557: pure = true;
0558: lookAhead = true;
0559: index += 2;
0560: }
0561: break;
0562: case ':':
0563: if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
0564: pure = true;
0565: index += 2;
0566: }
0567: break;
0568: case '#':
0569: if (syntax.get(RESyntax.RE_COMMENTS)) {
0570: comment = true;
0571: }
0572: break;
0573: default:
0574: throw new REException(
0575: getLocalizedMessage("repeat.no.token"),
0576: REException.REG_BADRPT, index);
0577: }
0578: }
0579:
0580: if (index >= pLength) {
0581: throw new REException(
0582: getLocalizedMessage("unmatched.paren"),
0583: REException.REG_ESUBREG, index);
0584: }
0585:
0586: // find end of subexpression
0587: int endIndex = index;
0588: int nextIndex = index;
0589: int nested = 0;
0590:
0591: while (((nextIndex = getCharUnit(pattern, endIndex,
0592: unit)) > 0)
0593: && !(nested == 0 && (unit.ch == ')') && (syntax
0594: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)))
0595: if ((endIndex = nextIndex) >= pLength)
0596: throw new REException(
0597: getLocalizedMessage("subexpr.no.end"),
0598: REException.REG_ESUBREG, nextIndex);
0599: else if (unit.ch == '('
0600: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0601: nested++;
0602: else if (unit.ch == ')'
0603: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0604: nested--;
0605:
0606: // endIndex is now position at a ')','\)'
0607: // nextIndex is end of string or position after ')' or '\)'
0608:
0609: if (comment)
0610: index = nextIndex;
0611: else { // not a comment
0612: // create RE subexpression as token.
0613: addToken(currentToken);
0614: if (!pure) {
0615: numSubs++;
0616: }
0617:
0618: int useIndex = (pure || lookAhead) ? 0 : nextSub
0619: + numSubs;
0620: currentToken = new RE(String.valueOf(pattern,
0621: index, endIndex - index).toCharArray(),
0622: cflags, syntax, useIndex, nextSub + numSubs);
0623: numSubs += ((RE) currentToken).getNumSubs();
0624:
0625: if (lookAhead) {
0626: currentToken = new RETokenLookAhead(
0627: currentToken, negativelh);
0628: }
0629:
0630: index = nextIndex;
0631: } // not a comment
0632: } // subexpression
0633:
0634: // UNMATCHED RIGHT PAREN
0635: // ) or \) throw exception if
0636: // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0637: else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0638: && ((unit.ch == ')') && (syntax
0639: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
0640: throw new REException(
0641: getLocalizedMessage("unmatched.paren"),
0642: REException.REG_EPAREN, index);
0643: }
0644:
0645: // START OF LINE OPERATOR
0646: // ^
0647:
0648: else if ((unit.ch == '^') && !unit.bk) {
0649: addToken(currentToken);
0650: currentToken = null;
0651: addToken(new RETokenStart(subIndex,
0652: ((cflags & REG_MULTILINE) > 0) ? syntax
0653: .getLineSeparator() : null));
0654: }
0655:
0656: // END OF LINE OPERATOR
0657: // $
0658:
0659: else if ((unit.ch == '$') && !unit.bk) {
0660: addToken(currentToken);
0661: currentToken = null;
0662: addToken(new RETokenEnd(subIndex,
0663: ((cflags & REG_MULTILINE) > 0) ? syntax
0664: .getLineSeparator() : null));
0665: }
0666:
0667: // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
0668: // .
0669:
0670: else if ((unit.ch == '.') && !unit.bk) {
0671: addToken(currentToken);
0672: currentToken = new RETokenAny(subIndex, syntax
0673: .get(RESyntax.RE_DOT_NEWLINE)
0674: || ((cflags & REG_DOT_NEWLINE) > 0), syntax
0675: .get(RESyntax.RE_DOT_NOT_NULL));
0676: }
0677:
0678: // ZERO-OR-MORE REPEAT OPERATOR
0679: // *
0680:
0681: else if ((unit.ch == '*') && !unit.bk) {
0682: if (currentToken == null)
0683: throw new REException(
0684: getLocalizedMessage("repeat.no.token"),
0685: REException.REG_BADRPT, index);
0686: if (currentToken instanceof RETokenRepeated)
0687: throw new REException(
0688: getLocalizedMessage("repeat.chained"),
0689: REException.REG_BADRPT, index);
0690: if (currentToken instanceof RETokenWordBoundary
0691: || currentToken instanceof RETokenWordBoundary)
0692: throw new REException(
0693: getLocalizedMessage("repeat.assertion"),
0694: REException.REG_BADRPT, index);
0695: if (currentToken.getMinimumLength() == 0)
0696: throw new REException(
0697: getLocalizedMessage("repeat.empty.token"),
0698: REException.REG_BADRPT, index);
0699: currentToken = setRepeated(currentToken, 0,
0700: Integer.MAX_VALUE, index);
0701: }
0702:
0703: // ONE-OR-MORE REPEAT OPERATOR
0704: // + | \+ depending on RE_BK_PLUS_QM
0705: // not available if RE_LIMITED_OPS is set
0706:
0707: else if ((unit.ch == '+')
0708: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0709: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0710: if (currentToken == null)
0711: throw new REException(
0712: getLocalizedMessage("repeat.no.token"),
0713: REException.REG_BADRPT, index);
0714: if (currentToken instanceof RETokenRepeated)
0715: throw new REException(
0716: getLocalizedMessage("repeat.chained"),
0717: REException.REG_BADRPT, index);
0718: if (currentToken instanceof RETokenWordBoundary
0719: || currentToken instanceof RETokenWordBoundary)
0720: throw new REException(
0721: getLocalizedMessage("repeat.assertion"),
0722: REException.REG_BADRPT, index);
0723: if (currentToken.getMinimumLength() == 0)
0724: throw new REException(
0725: getLocalizedMessage("repeat.empty.token"),
0726: REException.REG_BADRPT, index);
0727: currentToken = setRepeated(currentToken, 1,
0728: Integer.MAX_VALUE, index);
0729: }
0730:
0731: // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
0732: // ? | \? depending on RE_BK_PLUS_QM
0733: // not available if RE_LIMITED_OPS is set
0734: // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
0735:
0736: else if ((unit.ch == '?')
0737: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0738: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0739: if (currentToken == null)
0740: throw new REException(
0741: getLocalizedMessage("repeat.no.token"),
0742: REException.REG_BADRPT, index);
0743:
0744: // Check for stingy matching on RETokenRepeated
0745: if (currentToken instanceof RETokenRepeated) {
0746: if (syntax.get(RESyntax.RE_STINGY_OPS)
0747: && !((RETokenRepeated) currentToken)
0748: .isStingy())
0749: ((RETokenRepeated) currentToken).makeStingy();
0750: else
0751: throw new REException(
0752: getLocalizedMessage("repeat.chained"),
0753: REException.REG_BADRPT, index);
0754: } else if (currentToken instanceof RETokenWordBoundary
0755: || currentToken instanceof RETokenWordBoundary)
0756: throw new REException(
0757: getLocalizedMessage("repeat.assertion"),
0758: REException.REG_BADRPT, index);
0759: else
0760: currentToken = setRepeated(currentToken, 0, 1,
0761: index);
0762: }
0763:
0764: // BACKREFERENCE OPERATOR
0765: // \1 \2 ... \9
0766: // not available if RE_NO_BK_REFS is set
0767:
0768: else if (unit.bk && Character.isDigit(unit.ch)
0769: && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
0770: addToken(currentToken);
0771: currentToken = new RETokenBackRef(subIndex, Character
0772: .digit(unit.ch, 10), insens);
0773: }
0774:
0775: // START OF STRING OPERATOR
0776: // \A if RE_STRING_ANCHORS is set
0777:
0778: else if (unit.bk && (unit.ch == 'A')
0779: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0780: addToken(currentToken);
0781: currentToken = new RETokenStart(subIndex, null);
0782: }
0783:
0784: // WORD BREAK OPERATOR
0785: // \b if ????
0786:
0787: else if (unit.bk && (unit.ch == 'b')
0788: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0789: addToken(currentToken);
0790: currentToken = new RETokenWordBoundary(subIndex,
0791: RETokenWordBoundary.BEGIN
0792: | RETokenWordBoundary.END, false);
0793: }
0794:
0795: // WORD BEGIN OPERATOR
0796: // \< if ????
0797: else if (unit.bk && (unit.ch == '<')) {
0798: addToken(currentToken);
0799: currentToken = new RETokenWordBoundary(subIndex,
0800: RETokenWordBoundary.BEGIN, false);
0801: }
0802:
0803: // WORD END OPERATOR
0804: // \> if ????
0805: else if (unit.bk && (unit.ch == '>')) {
0806: addToken(currentToken);
0807: currentToken = new RETokenWordBoundary(subIndex,
0808: RETokenWordBoundary.END, false);
0809: }
0810:
0811: // NON-WORD BREAK OPERATOR
0812: // \B if ????
0813:
0814: else if (unit.bk && (unit.ch == 'B')
0815: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0816: addToken(currentToken);
0817: currentToken = new RETokenWordBoundary(subIndex,
0818: RETokenWordBoundary.BEGIN
0819: | RETokenWordBoundary.END, true);
0820: }
0821:
0822: // DIGIT OPERATOR
0823: // \d if RE_CHAR_CLASS_ESCAPES is set
0824:
0825: else if (unit.bk && (unit.ch == 'd')
0826: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0827: addToken(currentToken);
0828: currentToken = new RETokenPOSIX(subIndex,
0829: RETokenPOSIX.DIGIT, insens, false);
0830: }
0831:
0832: // NON-DIGIT OPERATOR
0833: // \D
0834:
0835: else if (unit.bk && (unit.ch == 'D')
0836: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0837: addToken(currentToken);
0838: currentToken = new RETokenPOSIX(subIndex,
0839: RETokenPOSIX.DIGIT, insens, true);
0840: }
0841:
0842: // NEWLINE ESCAPE
0843: // \n
0844:
0845: else if (unit.bk && (unit.ch == 'n')) {
0846: addToken(currentToken);
0847: currentToken = new RETokenChar(subIndex, '\n', false);
0848: }
0849:
0850: // RETURN ESCAPE
0851: // \r
0852:
0853: else if (unit.bk && (unit.ch == 'r')) {
0854: addToken(currentToken);
0855: currentToken = new RETokenChar(subIndex, '\r', false);
0856: }
0857:
0858: // WHITESPACE OPERATOR
0859: // \s if RE_CHAR_CLASS_ESCAPES is set
0860:
0861: else if (unit.bk && (unit.ch == 's')
0862: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0863: addToken(currentToken);
0864: currentToken = new RETokenPOSIX(subIndex,
0865: RETokenPOSIX.SPACE, insens, false);
0866: }
0867:
0868: // NON-WHITESPACE OPERATOR
0869: // \S
0870:
0871: else if (unit.bk && (unit.ch == 'S')
0872: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0873: addToken(currentToken);
0874: currentToken = new RETokenPOSIX(subIndex,
0875: RETokenPOSIX.SPACE, insens, true);
0876: }
0877:
0878: // TAB ESCAPE
0879: // \t
0880:
0881: else if (unit.bk && (unit.ch == 't')) {
0882: addToken(currentToken);
0883: currentToken = new RETokenChar(subIndex, '\t', false);
0884: }
0885:
0886: // ALPHANUMERIC OPERATOR
0887: // \w
0888:
0889: else if (unit.bk && (unit.ch == 'w')
0890: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0891: addToken(currentToken);
0892: currentToken = new RETokenPOSIX(subIndex,
0893: RETokenPOSIX.ALNUM, insens, false);
0894: }
0895:
0896: // NON-ALPHANUMERIC OPERATOR
0897: // \W
0898:
0899: else if (unit.bk && (unit.ch == 'W')
0900: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0901: addToken(currentToken);
0902: currentToken = new RETokenPOSIX(subIndex,
0903: RETokenPOSIX.ALNUM, insens, true);
0904: }
0905:
0906: // END OF STRING OPERATOR
0907: // \Z
0908:
0909: else if (unit.bk && (unit.ch == 'Z')
0910: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0911: addToken(currentToken);
0912: currentToken = new RETokenEnd(subIndex, null);
0913: }
0914:
0915: // NON-SPECIAL CHARACTER (or escape to make literal)
0916: // c | \* for example
0917:
0918: else { // not a special character
0919: addToken(currentToken);
0920: currentToken = new RETokenChar(subIndex, unit.ch,
0921: insens);
0922: }
0923: } // end while
0924:
0925: // Add final buffered token and an EndSub marker
0926: addToken(currentToken);
0927:
0928: if (branches != null) {
0929: branches.addElement(new RE(firstToken, lastToken, numSubs,
0930: subIndex, minimumLength));
0931: branches.trimToSize(); // compact the Vector
0932: minimumLength = 0;
0933: firstToken = lastToken = null;
0934: addToken(new RETokenOneOf(subIndex, branches, false));
0935: } else
0936: addToken(new RETokenEndSub(subIndex));
0937:
0938: }
0939:
0940: private static int getCharUnit(char[] input, int index,
0941: CharUnit unit) throws REException {
0942: unit.ch = input[index++];
0943: if (unit.bk = (unit.ch == '\\'))
0944: if (index < input.length)
0945: unit.ch = input[index++];
0946: else
0947: throw new REException(
0948: getLocalizedMessage("ends.with.backslash"),
0949: REException.REG_ESCAPE, index);
0950: return index;
0951: }
0952:
0953: /**
0954: * Checks if the regular expression matches the input in its entirety.
0955: *
0956: * @param input The input text.
0957: */
0958: public boolean isMatch(Object input) {
0959: return isMatch(input, 0, 0);
0960: }
0961:
0962: /**
0963: * Checks if the input string, starting from index, is an exact match of
0964: * this regular expression.
0965: *
0966: * @param input The input text.
0967: * @param index The offset index at which the search should be begin.
0968: */
0969: public boolean isMatch(Object input, int index) {
0970: return isMatch(input, index, 0);
0971: }
0972:
0973: /**
0974: * Checks if the input, starting from index and using the specified
0975: * execution flags, is an exact match of this regular expression.
0976: *
0977: * @param input The input text.
0978: * @param index The offset index at which the search should be begin.
0979: * @param eflags The logical OR of any execution flags above.
0980: */
0981: public boolean isMatch(Object input, int index, int eflags) {
0982: return isMatchImpl(makeCharIndexed(input, index), index, eflags);
0983: }
0984:
0985: private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
0986: if (firstToken == null) // Trivial case
0987: return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
0988: REMatch m = new REMatch(numSubs, index, eflags);
0989: if (firstToken.match(input, m)) {
0990: while (m != null) {
0991: if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
0992: return true;
0993: }
0994: m = m.next;
0995: }
0996: }
0997: return false;
0998: }
0999:
1000: /**
1001: * Returns the maximum number of subexpressions in this regular expression.
1002: * If the expression contains branches, the value returned will be the
1003: * maximum subexpressions in any of the branches.
1004: */
1005: public int getNumSubs() {
1006: return numSubs;
1007: }
1008:
1009: // Overrides REToken.setUncle
1010: void setUncle(REToken uncle) {
1011: if (lastToken != null) {
1012: lastToken.setUncle(uncle);
1013: } else
1014: super .setUncle(uncle); // to deal with empty subexpressions
1015: }
1016:
1017: // Overrides REToken.chain
1018:
1019: boolean chain(REToken next) {
1020: super .chain(next);
1021: setUncle(next);
1022: return true;
1023: }
1024:
1025: /**
1026: * Returns the minimum number of characters that could possibly
1027: * constitute a match of this regular expression.
1028: */
1029: public int getMinimumLength() {
1030: return minimumLength;
1031: }
1032:
1033: /**
1034: * Returns an array of all matches found in the input.
1035: *
1036: * If the regular expression allows the empty string to match, it will
1037: * substitute matches at all positions except the end of the input.
1038: *
1039: * @param input The input text.
1040: * @return a non-null (but possibly zero-length) array of matches
1041: */
1042: public REMatch[] getAllMatches(Object input) {
1043: return getAllMatches(input, 0, 0);
1044: }
1045:
1046: /**
1047: * Returns an array of all matches found in the input,
1048: * beginning at the specified index position.
1049: *
1050: * If the regular expression allows the empty string to match, it will
1051: * substitute matches at all positions except the end of the input.
1052: *
1053: * @param input The input text.
1054: * @param index The offset index at which the search should be begin.
1055: * @return a non-null (but possibly zero-length) array of matches
1056: */
1057: public REMatch[] getAllMatches(Object input, int index) {
1058: return getAllMatches(input, index, 0);
1059: }
1060:
1061: /**
1062: * Returns an array of all matches found in the input string,
1063: * beginning at the specified index position and using the specified
1064: * execution flags.
1065: *
1066: * If the regular expression allows the empty string to match, it will
1067: * substitute matches at all positions except the end of the input.
1068: *
1069: * @param input The input text.
1070: * @param index The offset index at which the search should be begin.
1071: * @param eflags The logical OR of any execution flags above.
1072: * @return a non-null (but possibly zero-length) array of matches
1073: */
1074: public REMatch[] getAllMatches(Object input, int index, int eflags) {
1075: return getAllMatchesImpl(makeCharIndexed(input, index), index,
1076: eflags);
1077: }
1078:
1079: // this has been changed since 1.03 to be non-overlapping matches
1080: private REMatch[] getAllMatchesImpl(CharIndexed input, int index,
1081: int eflags) {
1082: Vector all = new Vector();
1083: REMatch m = null;
1084: while ((m = getMatchImpl(input, index, eflags, null)) != null) {
1085: all.addElement(m);
1086: index = m.getEndIndex();
1087: if (m.end[0] == 0) { // handle pathological case of zero-length match
1088: index++;
1089: input.move(1);
1090: } else {
1091: input.move(m.end[0]);
1092: }
1093: if (!input.isValid())
1094: break;
1095: }
1096: REMatch[] mset = new REMatch[all.size()];
1097: all.copyInto(mset);
1098: return mset;
1099: }
1100:
1101: /* Implements abstract method REToken.match() */
1102: boolean match(CharIndexed input, REMatch mymatch) {
1103: if (firstToken == null)
1104: return next(input, mymatch);
1105:
1106: // Note the start of this subexpression
1107: mymatch.start[subIndex] = mymatch.index;
1108:
1109: return firstToken.match(input, mymatch);
1110: }
1111:
1112: /**
1113: * Returns the first match found in the input. If no match is found,
1114: * null is returned.
1115: *
1116: * @param input The input text.
1117: * @return An REMatch instance referencing the match, or null if none.
1118: */
1119: public REMatch getMatch(Object input) {
1120: return getMatch(input, 0, 0);
1121: }
1122:
1123: /**
1124: * Returns the first match found in the input, beginning
1125: * the search at the specified index. If no match is found,
1126: * returns null.
1127: *
1128: * @param input The input text.
1129: * @param index The offset within the text to begin looking for a match.
1130: * @return An REMatch instance referencing the match, or null if none.
1131: */
1132: public REMatch getMatch(Object input, int index) {
1133: return getMatch(input, index, 0);
1134: }
1135:
1136: /**
1137: * Returns the first match found in the input, beginning
1138: * the search at the specified index, and using the specified
1139: * execution flags. If no match is found, returns null.
1140: *
1141: * @param input The input text.
1142: * @param index The offset index at which the search should be begin.
1143: * @param eflags The logical OR of any execution flags above.
1144: * @return An REMatch instance referencing the match, or null if none.
1145: */
1146: public REMatch getMatch(Object input, int index, int eflags) {
1147: return getMatch(input, index, eflags, null);
1148: }
1149:
1150: /**
1151: * Returns the first match found in the input, beginning the search
1152: * at the specified index, and using the specified execution flags.
1153: * If no match is found, returns null. If a StringBuffer is
1154: * provided and is non-null, the contents of the input text from the
1155: * index to the beginning of the match (or to the end of the input,
1156: * if there is no match) are appended to the StringBuffer.
1157: *
1158: * @param input The input text.
1159: * @param index The offset index at which the search should be begin.
1160: * @param eflags The logical OR of any execution flags above.
1161: * @param buffer The StringBuffer to save pre-match text in.
1162: * @return An REMatch instance referencing the match, or null if none. */
1163: public REMatch getMatch(Object input, int index, int eflags,
1164: StringBuffer buffer) {
1165: return getMatchImpl(makeCharIndexed(input, index), index,
1166: eflags, buffer);
1167: }
1168:
1169: REMatch getMatchImpl(CharIndexed input, int anchor, int eflags,
1170: StringBuffer buffer) {
1171: // Create a new REMatch to hold results
1172: REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1173: do {
1174: // Optimization: check if anchor + minimumLength > length
1175: if (minimumLength == 0
1176: || input.charAt(minimumLength - 1) != CharIndexed.OUT_OF_BOUNDS) {
1177: if (match(input, mymatch)) {
1178: // Find longest match of them all to observe leftmost longest
1179: REMatch longest = mymatch;
1180: while ((mymatch = mymatch.next) != null) {
1181: if (mymatch.index > longest.index) {
1182: longest = mymatch;
1183: }
1184: }
1185:
1186: longest.end[0] = longest.index;
1187: longest.finish(input);
1188: return longest;
1189: }
1190: }
1191: mymatch.clear(++anchor);
1192: // Append character to buffer if needed
1193: if (buffer != null
1194: && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1195: buffer.append(input.charAt(0));
1196: }
1197: } while (input.move(1));
1198:
1199: // Special handling at end of input for e.g. "$"
1200: if (minimumLength == 0) {
1201: if (match(input, mymatch)) {
1202: mymatch.finish(input);
1203: return mymatch;
1204: }
1205: }
1206:
1207: return null;
1208: }
1209:
1210: /**
1211: * Returns an REMatchEnumeration that can be used to iterate over the
1212: * matches found in the input text.
1213: *
1214: * @param input The input text.
1215: * @return A non-null REMatchEnumeration instance.
1216: */
1217: public REMatchEnumeration getMatchEnumeration(Object input) {
1218: return getMatchEnumeration(input, 0, 0);
1219: }
1220:
1221: /**
1222: * Returns an REMatchEnumeration that can be used to iterate over the
1223: * matches found in the input text.
1224: *
1225: * @param input The input text.
1226: * @param index The offset index at which the search should be begin.
1227: * @return A non-null REMatchEnumeration instance, with its input cursor
1228: * set to the index position specified.
1229: */
1230: public REMatchEnumeration getMatchEnumeration(Object input,
1231: int index) {
1232: return getMatchEnumeration(input, index, 0);
1233: }
1234:
1235: /**
1236: * Returns an REMatchEnumeration that can be used to iterate over the
1237: * matches found in the input text.
1238: *
1239: * @param input The input text.
1240: * @param index The offset index at which the search should be begin.
1241: * @param eflags The logical OR of any execution flags above.
1242: * @return A non-null REMatchEnumeration instance, with its input cursor
1243: * set to the index position specified.
1244: */
1245: public REMatchEnumeration getMatchEnumeration(Object input,
1246: int index, int eflags) {
1247: return new REMatchEnumeration(this , makeCharIndexed(input,
1248: index), index, eflags);
1249: }
1250:
1251: /**
1252: * Substitutes the replacement text for the first match found in the input.
1253: *
1254: * @param input The input text.
1255: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1256: * @return A String interpolating the substituted text.
1257: * @see REMatch#substituteInto
1258: */
1259: public String substitute(Object input, String replace) {
1260: return substitute(input, replace, 0, 0);
1261: }
1262:
1263: /**
1264: * Substitutes the replacement text for the first match found in the input
1265: * beginning at the specified index position. Specifying an index
1266: * effectively causes the regular expression engine to throw away the
1267: * specified number of characters.
1268: *
1269: * @param input The input text.
1270: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1271: * @param index The offset index at which the search should be begin.
1272: * @return A String containing the substring of the input, starting
1273: * at the index position, and interpolating the substituted text.
1274: * @see REMatch#substituteInto
1275: */
1276: public String substitute(Object input, String replace, int index) {
1277: return substitute(input, replace, index, 0);
1278: }
1279:
1280: /**
1281: * Substitutes the replacement text for the first match found in the input
1282: * string, beginning at the specified index position and using the
1283: * specified execution flags.
1284: *
1285: * @param input The input text.
1286: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1287: * @param index The offset index at which the search should be begin.
1288: * @param eflags The logical OR of any execution flags above.
1289: * @return A String containing the substring of the input, starting
1290: * at the index position, and interpolating the substituted text.
1291: * @see REMatch#substituteInto
1292: */
1293: public String substitute(Object input, String replace, int index,
1294: int eflags) {
1295: return substituteImpl(makeCharIndexed(input, index), replace,
1296: index, eflags);
1297: }
1298:
1299: private String substituteImpl(CharIndexed input, String replace,
1300: int index, int eflags) {
1301: StringBuffer buffer = new StringBuffer();
1302: REMatch m = getMatchImpl(input, index, eflags, buffer);
1303: if (m == null)
1304: return buffer.toString();
1305: buffer.append(((eflags & REG_NO_INTERPOLATE) > 0) ? replace : m
1306: .substituteInto(replace));
1307: if (input.move(m.end[0])) {
1308: do {
1309: buffer.append(input.charAt(0));
1310: } while (input.move(1));
1311: }
1312: return buffer.toString();
1313: }
1314:
1315: /**
1316: * Substitutes the replacement text for each non-overlapping match found
1317: * in the input text.
1318: *
1319: * @param input The input text.
1320: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1321: * @return A String interpolating the substituted text.
1322: * @see REMatch#substituteInto
1323: */
1324: public String substituteAll(Object input, String replace) {
1325: return substituteAll(input, replace, 0, 0);
1326: }
1327:
1328: /**
1329: * Substitutes the replacement text for each non-overlapping match found
1330: * in the input text, starting at the specified index.
1331: *
1332: * If the regular expression allows the empty string to match, it will
1333: * substitute matches at all positions except the end of the input.
1334: *
1335: * @param input The input text.
1336: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1337: * @param index The offset index at which the search should be begin.
1338: * @return A String containing the substring of the input, starting
1339: * at the index position, and interpolating the substituted text.
1340: * @see REMatch#substituteInto
1341: */
1342: public String substituteAll(Object input, String replace, int index) {
1343: return substituteAll(input, replace, index, 0);
1344: }
1345:
1346: /**
1347: * Substitutes the replacement text for each non-overlapping match found
1348: * in the input text, starting at the specified index and using the
1349: * specified execution flags.
1350: *
1351: * @param input The input text.
1352: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1353: * @param index The offset index at which the search should be begin.
1354: * @param eflags The logical OR of any execution flags above.
1355: * @return A String containing the substring of the input, starting
1356: * at the index position, and interpolating the substituted text.
1357: * @see REMatch#substituteInto
1358: */
1359: public String substituteAll(Object input, String replace,
1360: int index, int eflags) {
1361: return substituteAllImpl(makeCharIndexed(input, index),
1362: replace, index, eflags);
1363: }
1364:
1365: private String substituteAllImpl(CharIndexed input, String replace,
1366: int index, int eflags) {
1367: StringBuffer buffer = new StringBuffer();
1368: REMatch m;
1369: while ((m = getMatchImpl(input, index, eflags, buffer)) != null) {
1370: buffer.append(((eflags & REG_NO_INTERPOLATE) > 0) ? replace
1371: : m.substituteInto(replace));
1372: index = m.getEndIndex();
1373: if (m.end[0] == 0) {
1374: char ch = input.charAt(0);
1375: if (ch != CharIndexed.OUT_OF_BOUNDS)
1376: buffer.append(ch);
1377: input.move(1);
1378: } else {
1379: input.move(m.end[0]);
1380: }
1381:
1382: if (!input.isValid())
1383: break;
1384: }
1385: return buffer.toString();
1386: }
1387:
1388: /* Helper function for constructor */
1389: private void addToken(REToken next) {
1390: if (next == null)
1391: return;
1392: minimumLength += next.getMinimumLength();
1393: if (firstToken == null) {
1394: lastToken = firstToken = next;
1395: } else {
1396: // if chain returns false, it "rejected" the token due to
1397: // an optimization, and next was combined with lastToken
1398: if (lastToken.chain(next)) {
1399: lastToken = next;
1400: }
1401: }
1402: }
1403:
1404: private static REToken setRepeated(REToken current, int min,
1405: int max, int index) throws REException {
1406: if (current == null)
1407: throw new REException(
1408: getLocalizedMessage("repeat.no.token"),
1409: REException.REG_BADRPT, index);
1410: return new RETokenRepeated(current.subIndex, current, min, max);
1411: }
1412:
1413: private static int getPosixSet(char[] pattern, int index,
1414: StringBuffer buf) {
1415: // Precondition: pattern[index-1] == ':'
1416: // we will return pos of closing ']'.
1417: int i;
1418: for (i = index; i < (pattern.length - 1); i++) {
1419: if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
1420: return i + 2;
1421: buf.append(pattern[i]);
1422: }
1423: return index; // didn't match up
1424: }
1425:
1426: private int getMinMax(char[] input, int index, IntPair minMax,
1427: RESyntax syntax) throws REException {
1428: // Precondition: input[index-1] == '{', minMax != null
1429:
1430: boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1431: int startIndex = index;
1432: if (index == input.length) {
1433: if (mustMatch)
1434: throw new REException(
1435: getLocalizedMessage("unmatched.brace"),
1436: REException.REG_EBRACE, index);
1437: else
1438: return startIndex;
1439: }
1440:
1441: int min, max = 0;
1442: CharUnit unit = new CharUnit();
1443: StringBuffer buf = new StringBuffer();
1444:
1445: // Read string of digits
1446: do {
1447: index = getCharUnit(input, index, unit);
1448: if (Character.isDigit(unit.ch))
1449: buf.append(unit.ch);
1450: } while ((index != input.length) && Character.isDigit(unit.ch));
1451:
1452: // Check for {} tomfoolery
1453: if (buf.length() == 0) {
1454: if (mustMatch)
1455: throw new REException(
1456: getLocalizedMessage("interval.error"),
1457: REException.REG_EBRACE, index);
1458: else
1459: return startIndex;
1460: }
1461:
1462: min = Integer.parseInt(buf.toString());
1463:
1464: if ((unit.ch == '}')
1465: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1466: max = min;
1467: else if (index == input.length)
1468: if (mustMatch)
1469: throw new REException(
1470: getLocalizedMessage("interval.no.end"),
1471: REException.REG_EBRACE, index);
1472: else
1473: return startIndex;
1474: else if ((unit.ch == ',') && !unit.bk) {
1475: buf = new StringBuffer();
1476: // Read string of digits
1477: while (((index = getCharUnit(input, index, unit)) != input.length)
1478: && Character.isDigit(unit.ch))
1479: buf.append(unit.ch);
1480:
1481: if (!((unit.ch == '}') && (syntax
1482: .get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1483: if (mustMatch)
1484: throw new REException(
1485: getLocalizedMessage("interval.error"),
1486: REException.REG_EBRACE, index);
1487: else
1488: return startIndex;
1489:
1490: // This is the case of {x,}
1491: if (buf.length() == 0)
1492: max = Integer.MAX_VALUE;
1493: else
1494: max = Integer.parseInt(buf.toString());
1495: } else if (mustMatch)
1496: throw new REException(
1497: getLocalizedMessage("interval.error"),
1498: REException.REG_EBRACE, index);
1499: else
1500: return startIndex;
1501:
1502: // We know min and max now, and they are valid.
1503:
1504: minMax.first = min;
1505: minMax.second = max;
1506:
1507: // return the index following the '}'
1508: return index;
1509: }
1510:
1511: /**
1512: * Return a human readable form of the compiled regular expression,
1513: * useful for debugging.
1514: */
1515: public String toString() {
1516: StringBuffer sb = new StringBuffer();
1517: dump(sb);
1518: return sb.toString();
1519: }
1520:
1521: void dump(StringBuffer os) {
1522: os.append('(');
1523: if (subIndex == 0)
1524: os.append("?:");
1525: if (firstToken != null)
1526: firstToken.dumpAll(os);
1527: os.append(')');
1528: }
1529:
1530: // Cast input appropriately or throw exception
1531: private static CharIndexed makeCharIndexed(Object input, int index) {
1532: // We could let a String fall through to final input, but since
1533: // it's the most likely input type, we check it first.
1534: if (input instanceof String)
1535: return new CharIndexedString((String) input, index);
1536: else if (input instanceof char[])
1537: return new CharIndexedCharArray((char[]) input, index);
1538: else if (input instanceof StringBuffer)
1539: return new CharIndexedStringBuffer((StringBuffer) input,
1540: index);
1541: else if (input instanceof InputStream)
1542: return new CharIndexedInputStream((InputStream) input,
1543: index);
1544: else if (input instanceof Reader)
1545: return new CharIndexedReader((Reader) input, index);
1546: else if (input instanceof CharIndexed)
1547: return (CharIndexed) input; // do we lose index info?
1548: else
1549: return new CharIndexedString(input.toString(), index);
1550: }
1551: }
|