0001: /*
0002: * gnu/regexp/RE.java
0003: * Copyright (C) 1998-2001 Wes Biggs
0004: *
0005: * This library is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Lesser General Public License as published
0007: * by the Free Software Foundation; either version 2.1 of the License, or
0008: * (at your option) any later version.
0009: *
0010: * This library is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Lesser General Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Lesser General Public License
0016: * along with this program; if not, write to the Free Software
0017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0018: */
0019:
0020: package gnu.regexp;
0021:
0022: import java.io.InputStream;
0023: import java.io.Reader;
0024: import java.io.Serializable;
0025: import java.util.Locale;
0026: import java.util.PropertyResourceBundle;
0027: import java.util.ResourceBundle;
0028: import java.util.Vector;
0029:
/**
 * Mutable holder for two ints. The pattern parser uses one instance to
 * carry the lower and upper bound of an interval operator.
 */
class IntPair implements Serializable {
    /** First value of the pair. */
    public int first;

    /** Second value of the pair. */
    public int second;
}
0033:
/**
 * One logical pattern character as produced by the parser: the character
 * itself plus a marker telling whether a backslash preceded it.
 */
class CharUnit implements Serializable {
    /** The character that was read (the one after the backslash, if any). */
    public char ch;

    /** True when the character was introduced by a backslash. */
    public boolean bk;
}
0038:
0039: /**
0040: * RE provides the user interface for compiling and matching regular
0041: * expressions.
0042: * <P>
0043: * A regular expression object (class RE) is compiled by constructing it
0044: * from a String, StringBuffer or character array, with optional
0045: * compilation flags (below)
0046: * and an optional syntax specification (see RESyntax; if not specified,
0047: * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
0048: * <P>
0049: * Various methods attempt to match input text against a compiled
0050: * regular expression. These methods are:
* <UL>
* <LI><code>isMatch</code>: returns true if the input text in its entirety
* matches the regular expression pattern.
* <LI><code>getMatch</code>: returns the first match found in the input text,
* or null if no match is found.
* <LI><code>getAllMatches</code>: returns an array of all non-overlapping
* matches found in the input text. If no matches are found, the array is
* zero-length.
* <LI><code>substitute</code>: substitutes the first occurrence of the pattern
* in the input text with a replacement string (which may include
* metacharacters $0-$9, see REMatch.substituteInto).
* <LI><code>substituteAll</code>: same as above, but repeats for each match
* before returning.
* <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
* that allows iteration over the matches (see REMatchEnumeration for some
* reasons why you may want to do this instead of using <code>getAllMatches</code>).
* </UL>
0066: * <P>
0067: *
0068: * These methods all have similar argument lists. The input can be a
0069: * String, a character array, a StringBuffer, a Reader or an
0070: * InputStream of some sort. Note that when using a Reader or
0071: * InputStream, the stream read position cannot be guaranteed after
0072: * attempting a match (this is not a bug, but a consequence of the way
0073: * regular expressions work). Using an REMatchEnumeration can
0074: * eliminate most positioning problems.
0075: *
0076: * <P>
0077: *
0078: * The optional index argument specifies the offset from the beginning
0079: * of the text at which the search should start (see the descriptions
0080: * of some of the execution flags for how this can affect positional
0081: * pattern operators). For a Reader or InputStream, this means an
0082: * offset from the current read position, so subsequent calls with the
0083: * same index argument on a Reader or an InputStream will not
0084: * necessarily access the same position on the stream, whereas
0085: * repeated searches at a given index in a fixed string will return
0086: * consistent results.
0087: *
0088: * <P>
0089: * You can optionally affect the execution environment by using a
0090: * combination of execution flags (constants listed below).
0091: *
0092: * <P>
0093: * All operations on a regular expression are performed in a
0094: * thread-safe manner.
0095: *
0096: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
0097: * @version 1.1.4-dev, to be released
0098: */
0099:
0100: public class RE extends REToken {
0101: // This String will be returned by getVersion()
0102: private static final String VERSION = "1.1.4-dev";
0103:
0104: // The localized strings are kept in a separate file
0105: private static ResourceBundle messages = PropertyResourceBundle
0106: .getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
0107:
0108: // These are, respectively, the first and last tokens in our linked list
0109: // If there is only one token, firstToken == lastToken
0110: private REToken firstToken, lastToken;
0111:
0112: // This is the number of subexpressions in this regular expression,
0113: // with a minimum value of zero. Returned by getNumSubs()
0114: private int numSubs;
0115:
0116: /** Minimum length, in characters, of any possible match. */
0117: private int minimumLength;
0118:
0119: /**
0120: * Compilation flag. Do not differentiate case. Subsequent
0121: * searches using this RE will be case insensitive.
0122: */
0123: public static final int REG_ICASE = 2;
0124:
0125: /**
0126: * Compilation flag. The match-any-character operator (dot)
0127: * will match a newline character. When set this overrides the syntax
0128: * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
0129: * the "/s" operator in Perl.
0130: */
0131: public static final int REG_DOT_NEWLINE = 4;
0132:
0133: /**
0134: * Compilation flag. Use multiline mode. In this mode, the ^ and $
0135: * anchors will match based on newlines within the input. This is
0136: * equivalent to the "/m" operator in Perl.
0137: */
0138: public static final int REG_MULTILINE = 8;
0139:
0140: /**
0141: * Execution flag.
0142: * The match-beginning operator (^) will not match at the beginning
0143: * of the input string. Useful for matching on a substring when you
0144: * know the context of the input is such that position zero of the
0145: * input to the match test is not actually position zero of the text.
0146: * <P>
0147: * This example demonstrates the results of various ways of matching on
0148: * a substring.
0149: * <P>
0150: * <CODE>
0151: * String s = "food bar fool";<BR>
0152: * RE exp = new RE("^foo.");<BR>
0153: * REMatch m0 = exp.getMatch(s);<BR>
0154: * REMatch m1 = exp.getMatch(s.substring(8));<BR>
0155: * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
0156: * REMatch m3 = exp.getMatch(s,8); <BR>
0157: * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
0158: * <P>
0159: * // Results:<BR>
0160: * // m0 = "food"<BR>
0161: * // m1 = "fool"<BR>
0162: * // m2 = null<BR>
0163: * // m3 = null<BR>
0164: * // m4 = "fool"<BR>
0165: * </CODE>
0166: */
0167: public static final int REG_NOTBOL = 16;
0168:
0169: /**
0170: * Execution flag.
0171: * The match-end operator ($) does not match at the end
0172: * of the input string. Useful for matching on substrings.
0173: */
0174: public static final int REG_NOTEOL = 32;
0175:
0176: /**
0177: * Execution flag.
0178: * When a match method is invoked that starts matching at a non-zero
0179: * index into the input, treat the input as if it begins at the index
0180: * given. The effect of this flag is that the engine does not "see"
0181: * any text in the input before the given index. This is useful so
0182: * that the match-beginning operator (^) matches not at position 0
0183: * in the input string, but at the position the search started at
0184: * (based on the index input given to the getMatch function). See
0185: * the example under REG_NOTBOL. It also affects the use of the \<
0186: * and \b operators.
0187: */
0188: public static final int REG_ANCHORINDEX = 64;
0189:
0190: /**
0191: * Execution flag.
0192: * The substitute and substituteAll methods will not attempt to
0193: * interpolate occurrences of $1-$9 in the replacement text with
0194: * the corresponding subexpressions. For example, you may want to
0195: * replace all matches of "one dollar" with "$1".
0196: */
0197: public static final int REG_NO_INTERPOLATE = 128;
0198:
0199: /** Returns a string representing the version of the gnu.regexp package. */
0200: public static final String version() {
0201: return VERSION;
0202: }
0203:
0204: // Retrieves a message from the ResourceBundle
0205: static final String getLocalizedMessage(String key) {
0206: return messages.getString(key);
0207: }
0208:
/**
 * Compiles a pattern with no compilation flags and the default syntax
 * (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern The regular expression, given as a String, StringBuffer
 *   or char[]. Any other object is converted with its toString() method.
 * @exception REException The pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE(Object pattern) throws REException {
    this(pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0222:
/**
 * Compiles a pattern with the given compilation flags and the default
 * syntax (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern The regular expression, given as a String, StringBuffer
 *   or char[]. Any other object is converted with its toString() method.
 * @param cflags Any of the compilation flags of this class, OR-ed together.
 * @exception REException The pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE(Object pattern, int cflags) throws REException {
    this(pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0237:
/**
 * Compiles a pattern with the given compilation flags, interpreting it
 * according to the given regular expression syntax.
 *
 * @param pattern The regular expression, given as a String, StringBuffer
 *   or char[]. Any other object is converted with its toString() method.
 * @param cflags Any of the compilation flags of this class, OR-ed together.
 * @param syntax The regular expression syntax to parse the pattern with.
 * @exception REException The pattern could not be parsed.
 * @exception NullPointerException The pattern was null.
 */
public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
    this(pattern, cflags, syntax, 0, 0);
}
0254:
/**
 * Internal constructor used for alternation: wraps an already parsed
 * token list as one branch and appends an end-of-subexpression marker.
 *
 * @param first first token of the branch
 * @param last last token of the branch
 * @param subs subexpression count recorded for the branch
 * @param subIndex subexpression index handed to the superclass and to
 *   the appended RETokenEndSub
 * @param minLength minimum match length recorded for the branch
 */
private RE(REToken first, REToken last, int subs, int subIndex, int minLength) {
    super(subIndex);
    this.numSubs = subs;
    this.minimumLength = minLength;
    this.firstToken = first;
    this.lastToken = last;
    // The marker is appended only after the list and counters are in place.
    addToken(new RETokenEndSub(subIndex));
}
0265:
/**
 * Constructor that all public constructors, and the parser itself for
 * nested subexpressions, funnel into. The actual parsing happens in
 * initialize().
 *
 * @param patternObj the pattern (String, StringBuffer, char[] or other)
 * @param cflags compilation flags
 * @param syntax syntax to parse the pattern with
 * @param myIndex subexpression index of this token
 * @param nextSub value passed through to initialize()
 * @exception REException The pattern could not be parsed.
 */
private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)
        throws REException {
    super(myIndex);
    initialize(patternObj, cflags, syntax, myIndex, nextSub);
}
0271:
/**
 * Creates an empty expression with subexpression index 0. Intended for
 * subclasses; no pattern is parsed here.
 */
protected RE() {
    super(0);
}
0276:
0277: // The meat of construction
0278: protected void initialize(Object patternObj, int cflags,
0279: RESyntax syntax, int myIndex, int nextSub)
0280: throws REException {
0281: char[] pattern;
0282: if (patternObj instanceof String) {
0283: pattern = ((String) patternObj).toCharArray();
0284: } else if (patternObj instanceof char[]) {
0285: pattern = (char[]) patternObj;
0286: } else if (patternObj instanceof StringBuffer) {
0287: pattern = new char[((StringBuffer) patternObj).length()];
0288: ((StringBuffer) patternObj).getChars(0, pattern.length,
0289: pattern, 0);
0290: } else {
0291: pattern = patternObj.toString().toCharArray();
0292: }
0293:
0294: int pLength = pattern.length;
0295:
0296: numSubs = 0; // Number of subexpressions in this token.
0297: Vector branches = null;
0298:
0299: // linked list of tokens (sort of -- some closed loops can exist)
0300: firstToken = lastToken = null;
0301:
0302: // Precalculate these so we don't pay for the math every time we
0303: // need to access them.
0304: boolean insens = ((cflags & REG_ICASE) > 0);
0305:
0306: // Parse pattern into tokens. Does anyone know if it's more efficient
0307: // to use char[] than a String.charAt()? I'm assuming so.
0308:
0309: // index tracks the position in the char array
0310: int index = 0;
0311:
0312: // this will be the current parse character (pattern[index])
0313: CharUnit unit = new CharUnit();
0314:
0315: // This is used for {x,y} calculations
0316: IntPair minMax = new IntPair();
0317:
0318: // Buffer a token so we can create a TokenRepeated, etc.
0319: REToken currentToken = null;
0320: char ch;
0321:
0322: while (index < pLength) {
0323: // read the next character unit (including backslash escapes)
0324: index = getCharUnit(pattern, index, unit);
0325:
0326: // ALTERNATION OPERATOR
0327: // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
0328: // not available if RE_LIMITED_OPS is set
0329:
0330: // TODO: the '\n' literal here should be a test against REToken.newline,
0331: // which unfortunately may be more than a single character.
0332: if (((unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk)) || (syntax
0333: .get(RESyntax.RE_NEWLINE_ALT)
0334: && (unit.ch == '\n') && !unit.bk))
0335: && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
0336: // make everything up to here be a branch. create vector if nec.
0337: addToken(currentToken);
0338: RE theBranch = new RE(firstToken, lastToken, numSubs,
0339: subIndex, minimumLength);
0340: minimumLength = 0;
0341: if (branches == null) {
0342: branches = new Vector();
0343: }
0344: branches.addElement(theBranch);
0345: firstToken = lastToken = currentToken = null;
0346: }
0347:
0348: // INTERVAL OPERATOR:
0349: // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
0350: // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
0351: //
0352: // OPEN QUESTION:
0353: // what is proper interpretation of '{' at start of string?
0354:
0355: else if ((unit.ch == '{')
0356: && syntax.get(RESyntax.RE_INTERVALS)
0357: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
0358: int newIndex = getMinMax(pattern, index, minMax, syntax);
0359: if (newIndex > index) {
0360: if (minMax.first > minMax.second)
0361: throw new REException(
0362: getLocalizedMessage("interval.order"),
0363: REException.REG_BADRPT, newIndex);
0364: if (currentToken == null)
0365: throw new REException(
0366: getLocalizedMessage("repeat.no.token"),
0367: REException.REG_BADRPT, newIndex);
0368: if (currentToken instanceof RETokenRepeated)
0369: throw new REException(
0370: getLocalizedMessage("repeat.chained"),
0371: REException.REG_BADRPT, newIndex);
0372: if (currentToken instanceof RETokenWordBoundary
0373: || currentToken instanceof RETokenWordBoundary)
0374: throw new REException(
0375: getLocalizedMessage("repeat.assertion"),
0376: REException.REG_BADRPT, newIndex);
0377: if ((currentToken.getMinimumLength() == 0)
0378: && (minMax.second == Integer.MAX_VALUE))
0379: throw new REException(
0380: getLocalizedMessage("repeat.empty.token"),
0381: REException.REG_BADRPT, newIndex);
0382: index = newIndex;
0383: currentToken = setRepeated(currentToken,
0384: minMax.first, minMax.second, index);
0385: } else {
0386: addToken(currentToken);
0387: currentToken = new RETokenChar(subIndex, unit.ch,
0388: insens);
0389: }
0390: }
0391:
0392: // LIST OPERATOR:
0393: // [...] | [^...]
0394:
0395: else if ((unit.ch == '[') && !unit.bk) {
0396: Vector options = new Vector();
0397: boolean negative = false;
0398: char lastChar = 0;
0399: if (index == pLength)
0400: throw new REException(
0401: getLocalizedMessage("unmatched.bracket"),
0402: REException.REG_EBRACK, index);
0403:
0404: // Check for initial caret, negation
0405: if ((ch = pattern[index]) == '^') {
0406: negative = true;
0407: if (++index == pLength)
0408: throw new REException(
0409: getLocalizedMessage("class.no.end"),
0410: REException.REG_EBRACK, index);
0411: ch = pattern[index];
0412: }
0413:
0414: // Check for leading right bracket literal
0415: if (ch == ']') {
0416: lastChar = ch;
0417: if (++index == pLength)
0418: throw new REException(
0419: getLocalizedMessage("class.no.end"),
0420: REException.REG_EBRACK, index);
0421: }
0422:
0423: while ((ch = pattern[index++]) != ']') {
0424: if ((ch == '-') && (lastChar != 0)) {
0425: if (index == pLength)
0426: throw new REException(
0427: getLocalizedMessage("class.no.end"),
0428: REException.REG_EBRACK, index);
0429: if ((ch = pattern[index]) == ']') {
0430: options.addElement(new RETokenChar(
0431: subIndex, lastChar, insens));
0432: lastChar = '-';
0433: } else {
0434: options.addElement(new RETokenRange(
0435: subIndex, lastChar, ch, insens));
0436: lastChar = 0;
0437: index++;
0438: }
0439: } else if ((ch == '\\')
0440: && syntax
0441: .get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
0442: if (index == pLength)
0443: throw new REException(
0444: getLocalizedMessage("class.no.end"),
0445: REException.REG_EBRACK, index);
0446: int posixID = -1;
0447: boolean negate = false;
0448: char asciiEsc = 0;
0449: if (("dswDSW".indexOf(pattern[index]) != -1)
0450: && syntax
0451: .get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
0452: switch (pattern[index]) {
0453: case 'D':
0454: negate = true;
0455: case 'd':
0456: posixID = RETokenPOSIX.DIGIT;
0457: break;
0458: case 'S':
0459: negate = true;
0460: case 's':
0461: posixID = RETokenPOSIX.SPACE;
0462: break;
0463: case 'W':
0464: negate = true;
0465: case 'w':
0466: posixID = RETokenPOSIX.ALNUM;
0467: break;
0468: }
0469: } else if ("nrt".indexOf(pattern[index]) != -1) {
0470: switch (pattern[index]) {
0471: case 'n':
0472: asciiEsc = '\n';
0473: break;
0474: case 't':
0475: asciiEsc = '\t';
0476: break;
0477: case 'r':
0478: asciiEsc = '\r';
0479: break;
0480: }
0481: }
0482: if (lastChar != 0)
0483: options.addElement(new RETokenChar(
0484: subIndex, lastChar, insens));
0485:
0486: if (posixID != -1) {
0487: options.addElement(new RETokenPOSIX(
0488: subIndex, posixID, insens, negate));
0489: } else if (asciiEsc != 0) {
0490: lastChar = asciiEsc;
0491: } else {
0492: lastChar = pattern[index];
0493: }
0494: ++index;
0495: } else if ((ch == '[')
0496: && (syntax.get(RESyntax.RE_CHAR_CLASSES))
0497: && (index < pLength)
0498: && (pattern[index] == ':')) {
0499: StringBuffer posixSet = new StringBuffer();
0500: index = getPosixSet(pattern, index + 1,
0501: posixSet);
0502: int posixId = RETokenPOSIX.intValue(posixSet
0503: .toString());
0504: if (posixId != -1)
0505: options.addElement(new RETokenPOSIX(
0506: subIndex, posixId, insens, false));
0507: } else {
0508: if (lastChar != 0)
0509: options.addElement(new RETokenChar(
0510: subIndex, lastChar, insens));
0511: lastChar = ch;
0512: }
0513: if (index == pLength)
0514: throw new REException(
0515: getLocalizedMessage("class.no.end"),
0516: REException.REG_EBRACK, index);
0517: } // while in list
0518: // Out of list, index is one past ']'
0519:
0520: if (lastChar != 0)
0521: options.addElement(new RETokenChar(subIndex,
0522: lastChar, insens));
0523:
0524: // Create a new RETokenOneOf
0525: addToken(currentToken);
0526: options.trimToSize();
0527: currentToken = new RETokenOneOf(subIndex, options,
0528: negative);
0529: }
0530:
0531: // SUBEXPRESSIONS
0532: // (...) | \(...\) depending on RE_NO_BK_PARENS
0533:
0534: else if ((unit.ch == '(')
0535: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
0536: boolean pure = false;
0537: boolean comment = false;
0538: boolean lookAhead = false;
0539: boolean negativelh = false;
0540: if ((index + 1 < pLength) && (pattern[index] == '?')) {
0541: switch (pattern[index + 1]) {
0542: case '!':
0543: if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
0544: pure = true;
0545: negativelh = true;
0546: lookAhead = true;
0547: index += 2;
0548: }
0549: break;
0550: case '=':
0551: if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
0552: pure = true;
0553: lookAhead = true;
0554: index += 2;
0555: }
0556: break;
0557: case ':':
0558: if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
0559: pure = true;
0560: index += 2;
0561: }
0562: break;
0563: case '#':
0564: if (syntax.get(RESyntax.RE_COMMENTS)) {
0565: comment = true;
0566: }
0567: break;
0568: default:
0569: throw new REException(
0570: getLocalizedMessage("repeat.no.token"),
0571: REException.REG_BADRPT, index);
0572: }
0573: }
0574:
0575: if (index >= pLength) {
0576: throw new REException(
0577: getLocalizedMessage("unmatched.paren"),
0578: REException.REG_ESUBREG, index);
0579: }
0580:
0581: // find end of subexpression
0582: int endIndex = index;
0583: int nextIndex = index;
0584: int nested = 0;
0585:
0586: while (((nextIndex = getCharUnit(pattern, endIndex,
0587: unit)) > 0)
0588: && !(nested == 0 && (unit.ch == ')') && (syntax
0589: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)))
0590: if ((endIndex = nextIndex) >= pLength)
0591: throw new REException(
0592: getLocalizedMessage("subexpr.no.end"),
0593: REException.REG_ESUBREG, nextIndex);
0594: else if (unit.ch == '('
0595: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0596: nested++;
0597: else if (unit.ch == ')'
0598: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0599: nested--;
0600:
0601: // endIndex is now position at a ')','\)'
0602: // nextIndex is end of string or position after ')' or '\)'
0603:
0604: if (comment)
0605: index = nextIndex;
0606: else { // not a comment
0607: // create RE subexpression as token.
0608: addToken(currentToken);
0609: if (!pure) {
0610: numSubs++;
0611: }
0612:
0613: int useIndex = (pure || lookAhead) ? 0 : nextSub
0614: + numSubs;
0615: currentToken = new RE(String.valueOf(pattern,
0616: index, endIndex - index).toCharArray(),
0617: cflags, syntax, useIndex, nextSub + numSubs);
0618: numSubs += ((RE) currentToken).getNumSubs();
0619:
0620: if (lookAhead) {
0621: currentToken = new RETokenLookAhead(
0622: currentToken, negativelh);
0623: }
0624:
0625: index = nextIndex;
0626: } // not a comment
0627: } // subexpression
0628:
0629: // UNMATCHED RIGHT PAREN
0630: // ) or \) throw exception if
0631: // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0632: else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0633: && ((unit.ch == ')') && (syntax
0634: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
0635: throw new REException(
0636: getLocalizedMessage("unmatched.paren"),
0637: REException.REG_EPAREN, index);
0638: }
0639:
0640: // START OF LINE OPERATOR
0641: // ^
0642:
0643: else if ((unit.ch == '^') && !unit.bk) {
0644: addToken(currentToken);
0645: currentToken = null;
0646: addToken(new RETokenStart(subIndex,
0647: ((cflags & REG_MULTILINE) > 0) ? syntax
0648: .getLineSeparator() : null));
0649: }
0650:
0651: // END OF LINE OPERATOR
0652: // $
0653:
0654: else if ((unit.ch == '$') && !unit.bk) {
0655: addToken(currentToken);
0656: currentToken = null;
0657: addToken(new RETokenEnd(subIndex,
0658: ((cflags & REG_MULTILINE) > 0) ? syntax
0659: .getLineSeparator() : null));
0660: }
0661:
0662: // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
0663: // .
0664:
0665: else if ((unit.ch == '.') && !unit.bk) {
0666: addToken(currentToken);
0667: currentToken = new RETokenAny(subIndex, syntax
0668: .get(RESyntax.RE_DOT_NEWLINE)
0669: || ((cflags & REG_DOT_NEWLINE) > 0), syntax
0670: .get(RESyntax.RE_DOT_NOT_NULL));
0671: }
0672:
0673: // ZERO-OR-MORE REPEAT OPERATOR
0674: // *
0675:
0676: else if ((unit.ch == '*') && !unit.bk) {
0677: if (currentToken == null)
0678: throw new REException(
0679: getLocalizedMessage("repeat.no.token"),
0680: REException.REG_BADRPT, index);
0681: if (currentToken instanceof RETokenRepeated)
0682: throw new REException(
0683: getLocalizedMessage("repeat.chained"),
0684: REException.REG_BADRPT, index);
0685: if (currentToken instanceof RETokenWordBoundary
0686: || currentToken instanceof RETokenWordBoundary)
0687: throw new REException(
0688: getLocalizedMessage("repeat.assertion"),
0689: REException.REG_BADRPT, index);
0690: if (currentToken.getMinimumLength() == 0)
0691: throw new REException(
0692: getLocalizedMessage("repeat.empty.token"),
0693: REException.REG_BADRPT, index);
0694: currentToken = setRepeated(currentToken, 0,
0695: Integer.MAX_VALUE, index);
0696: }
0697:
0698: // ONE-OR-MORE REPEAT OPERATOR
0699: // + | \+ depending on RE_BK_PLUS_QM
0700: // not available if RE_LIMITED_OPS is set
0701:
0702: else if ((unit.ch == '+')
0703: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0704: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0705: if (currentToken == null)
0706: throw new REException(
0707: getLocalizedMessage("repeat.no.token"),
0708: REException.REG_BADRPT, index);
0709: if (currentToken instanceof RETokenRepeated)
0710: throw new REException(
0711: getLocalizedMessage("repeat.chained"),
0712: REException.REG_BADRPT, index);
0713: if (currentToken instanceof RETokenWordBoundary
0714: || currentToken instanceof RETokenWordBoundary)
0715: throw new REException(
0716: getLocalizedMessage("repeat.assertion"),
0717: REException.REG_BADRPT, index);
0718: if (currentToken.getMinimumLength() == 0)
0719: throw new REException(
0720: getLocalizedMessage("repeat.empty.token"),
0721: REException.REG_BADRPT, index);
0722: currentToken = setRepeated(currentToken, 1,
0723: Integer.MAX_VALUE, index);
0724: }
0725:
0726: // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
0727: // ? | \? depending on RE_BK_PLUS_QM
0728: // not available if RE_LIMITED_OPS is set
0729: // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
0730:
0731: else if ((unit.ch == '?')
0732: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0733: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0734: if (currentToken == null)
0735: throw new REException(
0736: getLocalizedMessage("repeat.no.token"),
0737: REException.REG_BADRPT, index);
0738:
0739: // Check for stingy matching on RETokenRepeated
0740: if (currentToken instanceof RETokenRepeated) {
0741: if (syntax.get(RESyntax.RE_STINGY_OPS)
0742: && !((RETokenRepeated) currentToken)
0743: .isStingy())
0744: ((RETokenRepeated) currentToken).makeStingy();
0745: else
0746: throw new REException(
0747: getLocalizedMessage("repeat.chained"),
0748: REException.REG_BADRPT, index);
0749: } else if (currentToken instanceof RETokenWordBoundary
0750: || currentToken instanceof RETokenWordBoundary)
0751: throw new REException(
0752: getLocalizedMessage("repeat.assertion"),
0753: REException.REG_BADRPT, index);
0754: else
0755: currentToken = setRepeated(currentToken, 0, 1,
0756: index);
0757: }
0758:
0759: // BACKREFERENCE OPERATOR
0760: // \1 \2 ... \9
0761: // not available if RE_NO_BK_REFS is set
0762:
0763: else if (unit.bk && Character.isDigit(unit.ch)
0764: && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
0765: addToken(currentToken);
0766: currentToken = new RETokenBackRef(subIndex, Character
0767: .digit(unit.ch, 10), insens);
0768: }
0769:
0770: // START OF STRING OPERATOR
0771: // \A if RE_STRING_ANCHORS is set
0772:
0773: else if (unit.bk && (unit.ch == 'A')
0774: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0775: addToken(currentToken);
0776: currentToken = new RETokenStart(subIndex, null);
0777: }
0778:
0779: // WORD BREAK OPERATOR
0780: // \b if ????
0781:
0782: else if (unit.bk && (unit.ch == 'b')
0783: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0784: addToken(currentToken);
0785: currentToken = new RETokenWordBoundary(subIndex,
0786: RETokenWordBoundary.BEGIN
0787: | RETokenWordBoundary.END, false);
0788: }
0789:
0790: // WORD BEGIN OPERATOR
0791: // \< if ????
0792: else if (unit.bk && (unit.ch == '<')) {
0793: addToken(currentToken);
0794: currentToken = new RETokenWordBoundary(subIndex,
0795: RETokenWordBoundary.BEGIN, false);
0796: }
0797:
0798: // WORD END OPERATOR
0799: // \> if ????
0800: else if (unit.bk && (unit.ch == '>')) {
0801: addToken(currentToken);
0802: currentToken = new RETokenWordBoundary(subIndex,
0803: RETokenWordBoundary.END, false);
0804: }
0805:
0806: // NON-WORD BREAK OPERATOR
0807: // \B if ????
0808:
0809: else if (unit.bk && (unit.ch == 'B')
0810: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0811: addToken(currentToken);
0812: currentToken = new RETokenWordBoundary(subIndex,
0813: RETokenWordBoundary.BEGIN
0814: | RETokenWordBoundary.END, true);
0815: }
0816:
0817: // DIGIT OPERATOR
0818: // \d if RE_CHAR_CLASS_ESCAPES is set
0819:
0820: else if (unit.bk && (unit.ch == 'd')
0821: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0822: addToken(currentToken);
0823: currentToken = new RETokenPOSIX(subIndex,
0824: RETokenPOSIX.DIGIT, insens, false);
0825: }
0826:
0827: // NON-DIGIT OPERATOR
0828: // \D
0829:
0830: else if (unit.bk && (unit.ch == 'D')
0831: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0832: addToken(currentToken);
0833: currentToken = new RETokenPOSIX(subIndex,
0834: RETokenPOSIX.DIGIT, insens, true);
0835: }
0836:
0837: // NEWLINE ESCAPE
0838: // \n
0839:
0840: else if (unit.bk && (unit.ch == 'n')) {
0841: addToken(currentToken);
0842: currentToken = new RETokenChar(subIndex, '\n', false);
0843: }
0844:
0845: // RETURN ESCAPE
0846: // \r
0847:
0848: else if (unit.bk && (unit.ch == 'r')) {
0849: addToken(currentToken);
0850: currentToken = new RETokenChar(subIndex, '\r', false);
0851: }
0852:
0853: // WHITESPACE OPERATOR
0854: // \s if RE_CHAR_CLASS_ESCAPES is set
0855:
0856: else if (unit.bk && (unit.ch == 's')
0857: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0858: addToken(currentToken);
0859: currentToken = new RETokenPOSIX(subIndex,
0860: RETokenPOSIX.SPACE, insens, false);
0861: }
0862:
0863: // NON-WHITESPACE OPERATOR
0864: // \S
0865:
0866: else if (unit.bk && (unit.ch == 'S')
0867: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0868: addToken(currentToken);
0869: currentToken = new RETokenPOSIX(subIndex,
0870: RETokenPOSIX.SPACE, insens, true);
0871: }
0872:
0873: // TAB ESCAPE
0874: // \t
0875:
0876: else if (unit.bk && (unit.ch == 't')) {
0877: addToken(currentToken);
0878: currentToken = new RETokenChar(subIndex, '\t', false);
0879: }
0880:
0881: // ALPHANUMERIC OPERATOR
0882: // \w
0883:
0884: else if (unit.bk && (unit.ch == 'w')
0885: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0886: addToken(currentToken);
0887: currentToken = new RETokenPOSIX(subIndex,
0888: RETokenPOSIX.ALNUM, insens, false);
0889: }
0890:
0891: // NON-ALPHANUMERIC OPERATOR
0892: // \W
0893:
0894: else if (unit.bk && (unit.ch == 'W')
0895: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0896: addToken(currentToken);
0897: currentToken = new RETokenPOSIX(subIndex,
0898: RETokenPOSIX.ALNUM, insens, true);
0899: }
0900:
0901: // END OF STRING OPERATOR
0902: // \Z
0903:
0904: else if (unit.bk && (unit.ch == 'Z')
0905: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0906: addToken(currentToken);
0907: currentToken = new RETokenEnd(subIndex, null);
0908: }
0909:
0910: // NON-SPECIAL CHARACTER (or escape to make literal)
0911: // c | \* for example
0912:
0913: else { // not a special character
0914: addToken(currentToken);
0915: currentToken = new RETokenChar(subIndex, unit.ch,
0916: insens);
0917: }
0918: } // end while
0919:
0920: // Add final buffered token and an EndSub marker
0921: addToken(currentToken);
0922:
0923: if (branches != null) {
0924: branches.addElement(new RE(firstToken, lastToken, numSubs,
0925: subIndex, minimumLength));
0926: branches.trimToSize(); // compact the Vector
0927: minimumLength = 0;
0928: firstToken = lastToken = null;
0929: addToken(new RETokenOneOf(subIndex, branches, false));
0930: } else
0931: addToken(new RETokenEndSub(subIndex));
0932:
0933: }
0934:
0935: private static int getCharUnit(char[] input, int index,
0936: CharUnit unit) throws REException {
0937: unit.ch = input[index++];
0938: if (unit.bk = (unit.ch == '\\'))
0939: if (index < input.length)
0940: unit.ch = input[index++];
0941: else
0942: throw new REException(
0943: getLocalizedMessage("ends.with.backslash"),
0944: REException.REG_ESCAPE, index);
0945: return index;
0946: }
0947:
/**
 * Checks if the regular expression matches the input in its entirety.
 * Delegates to <code>isMatch(input, 0, 0)</code>, i.e. starts at
 * index 0 with no execution flags.
 *
 * @param input The input text.
 * @return The result of <code>isMatch(input, 0, 0)</code>.
 */
public boolean isMatch(Object input) {
    return isMatch(input, 0, 0);
}
0956:
/**
 * Checks if the input string, starting from index, is an exact match of
 * this regular expression. Delegates to
 * <code>isMatch(input, index, 0)</code> (no execution flags).
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @return The result of <code>isMatch(input, index, 0)</code>.
 */
public boolean isMatch(Object input, int index) {
    return isMatch(input, index, 0);
}
0967:
/**
 * Checks if the input, starting from index and using the specified
 * execution flags, is an exact match of this regular expression.
 * The input object is wrapped with <code>makeCharIndexed</code> and the
 * work is done by <code>isMatchImpl</code>.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return The result of <code>isMatchImpl</code> for the wrapped input.
 */
public boolean isMatch(Object input, int index, int eflags) {
    return isMatchImpl(makeCharIndexed(input, index), index, eflags);
}
0979:
0980: private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
0981: if (firstToken == null) // Trivial case
0982: return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
0983: REMatch m = new REMatch(numSubs, index, eflags);
0984: if (firstToken.match(input, m)) {
0985: while (m != null) {
0986: if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
0987: return true;
0988: }
0989: m = m.next;
0990: }
0991: }
0992: return false;
0993: }
0994:
/**
 * Returns the maximum number of subexpressions in this regular expression.
 * If the expression contains branches, the value returned will be the
 * maximum subexpressions in any of the branches.
 *
 * @return The value of the <code>numSubs</code> field.
 */
public int getNumSubs() {
    return numSubs;
}
1003:
1004: // Overrides REToken.setUncle
1005: void setUncle(REToken uncle) {
1006: if (lastToken != null) {
1007: lastToken.setUncle(uncle);
1008: } else
1009: super .setUncle(uncle); // to deal with empty subexpressions
1010: }
1011:
// Overrides REToken.chain
//
// Chains next after this token through the superclass, then passes the
// same token to setUncle(). Always returns true; addToken() in this
// class treats a true result as "next was appended".

boolean chain(REToken next) {
    super .chain(next);
    setUncle(next);
    return true;
}
1019:
/**
 * Returns the minimum number of characters that could possibly
 * constitute a match of this regular expression.
 *
 * @return The value of the <code>minimumLength</code> field, which
 *         <code>addToken</code> accumulates from each token's own
 *         minimum length.
 */
public int getMinimumLength() {
    return minimumLength;
}
1027:
/**
 * Returns an array of all matches found in the input.
 * Delegates to <code>getAllMatches(input, 0, 0)</code>.
 *
 * If the regular expression allows the empty string to match, it will
 * return matches at all positions except the end of the input.
 *
 * @param input The input text.
 * @return a non-null (but possibly zero-length) array of matches
 */
public REMatch[] getAllMatches(Object input) {
    return getAllMatches(input, 0, 0);
}
1040:
/**
 * Returns an array of all matches found in the input,
 * beginning at the specified index position.
 * Delegates to <code>getAllMatches(input, index, 0)</code>.
 *
 * If the regular expression allows the empty string to match, it will
 * return matches at all positions except the end of the input.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @return a non-null (but possibly zero-length) array of matches
 */
public REMatch[] getAllMatches(Object input, int index) {
    return getAllMatches(input, index, 0);
}
1055:
/**
 * Returns an array of all matches found in the input string,
 * beginning at the specified index position and using the specified
 * execution flags. The input object is wrapped with
 * <code>makeCharIndexed</code> and the work is done by
 * <code>getAllMatchesImpl</code>.
 *
 * If the regular expression allows the empty string to match, it will
 * return matches at all positions except the end of the input.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return a non-null (but possibly zero-length) array of matches
 */
public REMatch[] getAllMatches(Object input, int index, int eflags) {
    return getAllMatchesImpl(makeCharIndexed(input, index), index,
            eflags);
}
1073:
1074: // this has been changed since 1.03 to be non-overlapping matches
1075: private REMatch[] getAllMatchesImpl(CharIndexed input, int index,
1076: int eflags) {
1077: Vector all = new Vector();
1078: REMatch m = null;
1079: while ((m = getMatchImpl(input, index, eflags, null)) != null) {
1080: all.addElement(m);
1081: index = m.getEndIndex();
1082: if (m.end[0] == 0) { // handle pathological case of zero-length match
1083: index++;
1084: input.move(1);
1085: } else {
1086: input.move(m.end[0]);
1087: }
1088: if (!input.isValid())
1089: break;
1090: }
1091: REMatch[] mset = new REMatch[all.size()];
1092: all.copyInto(mset);
1093: return mset;
1094: }
1095:
1096: /* Implements abstract method REToken.match() */
1097: boolean match(CharIndexed input, REMatch mymatch) {
1098: if (firstToken == null)
1099: return next(input, mymatch);
1100:
1101: // Note the start of this subexpression
1102: mymatch.start[subIndex] = mymatch.index;
1103:
1104: return firstToken.match(input, mymatch);
1105: }
1106:
/**
 * Returns the first match found in the input. If no match is found,
 * null is returned. Delegates to <code>getMatch(input, 0, 0)</code>.
 *
 * @param input The input text.
 * @return An REMatch instance referencing the match, or null if none.
 */
public REMatch getMatch(Object input) {
    return getMatch(input, 0, 0);
}
1117:
/**
 * Returns the first match found in the input, beginning
 * the search at the specified index. If no match is found,
 * returns null. Delegates to <code>getMatch(input, index, 0)</code>.
 *
 * @param input The input text.
 * @param index The offset within the text to begin looking for a match.
 * @return An REMatch instance referencing the match, or null if none.
 */
public REMatch getMatch(Object input, int index) {
    return getMatch(input, index, 0);
}
1130:
/**
 * Returns the first match found in the input, beginning
 * the search at the specified index, and using the specified
 * execution flags. If no match is found, returns null.
 * Delegates to <code>getMatch(input, index, eflags, null)</code>, so
 * no pre-match text is collected.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return An REMatch instance referencing the match, or null if none.
 */
public REMatch getMatch(Object input, int index, int eflags) {
    return getMatch(input, index, eflags, null);
}
1144:
/**
 * Returns the first match found in the input, beginning the search
 * at the specified index, and using the specified execution flags.
 * If no match is found, returns null. If a StringBuffer is
 * provided and is non-null, the contents of the input text from the
 * index to the beginning of the match (or to the end of the input,
 * if there is no match) are appended to the StringBuffer.
 * The input object is wrapped with <code>makeCharIndexed</code> and the
 * work is done by <code>getMatchImpl</code>.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @param buffer The StringBuffer to save pre-match text in; may be null.
 * @return An REMatch instance referencing the match, or null if none.
 */
public REMatch getMatch(Object input, int index, int eflags,
        StringBuffer buffer) {
    return getMatchImpl(makeCharIndexed(input, index), index,
            eflags, buffer);
}
1163:
1164: REMatch getMatchImpl(CharIndexed input, int anchor, int eflags,
1165: StringBuffer buffer) {
1166: // Create a new REMatch to hold results
1167: REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1168: do {
1169: // Optimization: check if anchor + minimumLength > length
1170: if (minimumLength == 0
1171: || input.charAt(minimumLength - 1) != CharIndexed.OUT_OF_BOUNDS) {
1172: if (match(input, mymatch)) {
1173: // Find longest match of them all to observe leftmost longest
1174: REMatch longest = mymatch;
1175: while ((mymatch = mymatch.next) != null) {
1176: if (mymatch.index > longest.index) {
1177: longest = mymatch;
1178: }
1179: }
1180:
1181: longest.end[0] = longest.index;
1182: longest.finish(input);
1183: return longest;
1184: }
1185: }
1186: mymatch.clear(++anchor);
1187: // Append character to buffer if needed
1188: if (buffer != null
1189: && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1190: buffer.append(input.charAt(0));
1191: }
1192: } while (input.move(1));
1193:
1194: return null;
1195: }
1196:
/**
 * Returns an REMatchEnumeration that can be used to iterate over the
 * matches found in the input text.
 * Delegates to <code>getMatchEnumeration(input, 0, 0)</code>.
 *
 * @param input The input text.
 * @return A non-null REMatchEnumeration instance.
 */
public REMatchEnumeration getMatchEnumeration(Object input) {
    return getMatchEnumeration(input, 0, 0);
}
1207:
/**
 * Returns an REMatchEnumeration that can be used to iterate over the
 * matches found in the input text.
 * Delegates to <code>getMatchEnumeration(input, index, 0)</code>.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @return A non-null REMatchEnumeration instance, with its input cursor
 *         set to the index position specified.
 */
public REMatchEnumeration getMatchEnumeration(Object input,
        int index) {
    return getMatchEnumeration(input, index, 0);
}
1221:
/**
 * Returns an REMatchEnumeration that can be used to iterate over the
 * matches found in the input text. A new enumeration is constructed
 * from this expression, the input wrapped by
 * <code>makeCharIndexed</code>, the index and the flags.
 *
 * @param input The input text.
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return A non-null REMatchEnumeration instance, with its input cursor
 *         set to the index position specified.
 */
public REMatchEnumeration getMatchEnumeration(Object input,
        int index, int eflags) {
    return new REMatchEnumeration(this , makeCharIndexed(input,
            index), index, eflags);
}
1237:
/**
 * Substitutes the replacement text for the first match found in the input.
 * Delegates to <code>substitute(input, replace, 0, 0)</code>.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @return A String interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substitute(Object input, String replace) {
    return substitute(input, replace, 0, 0);
}
1249:
/**
 * Substitutes the replacement text for the first match found in the input
 * beginning at the specified index position. Specifying an index
 * effectively causes the regular expression engine to throw away the
 * specified number of characters.
 * Delegates to <code>substitute(input, replace, index, 0)</code>.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @param index The offset index at which the search should begin.
 * @return A String containing the substring of the input, starting
 *         at the index position, and interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substitute(Object input, String replace, int index) {
    return substitute(input, replace, index, 0);
}
1266:
/**
 * Substitutes the replacement text for the first match found in the input
 * string, beginning at the specified index position and using the
 * specified execution flags. The input object is wrapped with
 * <code>makeCharIndexed</code> and the work is done by
 * <code>substituteImpl</code>, which uses the replacement text
 * literally when <code>REG_NO_INTERPOLATE</code> is set in eflags.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return A String containing the substring of the input, starting
 *         at the index position, and interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substitute(Object input, String replace, int index,
        int eflags) {
    return substituteImpl(makeCharIndexed(input, index), replace,
            index, eflags);
}
1285:
1286: private String substituteImpl(CharIndexed input, String replace,
1287: int index, int eflags) {
1288: StringBuffer buffer = new StringBuffer();
1289: REMatch m = getMatchImpl(input, index, eflags, buffer);
1290: if (m == null)
1291: return buffer.toString();
1292: buffer.append(((eflags & REG_NO_INTERPOLATE) > 0) ? replace : m
1293: .substituteInto(replace));
1294: if (input.move(m.end[0])) {
1295: do {
1296: buffer.append(input.charAt(0));
1297: } while (input.move(1));
1298: }
1299: return buffer.toString();
1300: }
1301:
/**
 * Substitutes the replacement text for each non-overlapping match found
 * in the input text.
 * Delegates to <code>substituteAll(input, replace, 0, 0)</code>.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @return A String interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substituteAll(Object input, String replace) {
    return substituteAll(input, replace, 0, 0);
}
1314:
/**
 * Substitutes the replacement text for each non-overlapping match found
 * in the input text, starting at the specified index.
 * Delegates to <code>substituteAll(input, replace, index, 0)</code>.
 *
 * If the regular expression allows the empty string to match, it will
 * substitute matches at all positions except the end of the input.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @param index The offset index at which the search should begin.
 * @return A String containing the substring of the input, starting
 *         at the index position, and interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substituteAll(Object input, String replace, int index) {
    return substituteAll(input, replace, index, 0);
}
1332:
/**
 * Substitutes the replacement text for each non-overlapping match found
 * in the input text, starting at the specified index and using the
 * specified execution flags. The input object is wrapped with
 * <code>makeCharIndexed</code> and the work is done by
 * <code>substituteAllImpl</code>, which uses the replacement text
 * literally when <code>REG_NO_INTERPOLATE</code> is set in eflags.
 *
 * @param input The input text.
 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
 * @param index The offset index at which the search should begin.
 * @param eflags The logical OR of any execution flags above.
 * @return A String containing the substring of the input, starting
 *         at the index position, and interpolating the substituted text.
 * @see REMatch#substituteInto
 */
public String substituteAll(Object input, String replace,
        int index, int eflags) {
    return substituteAllImpl(makeCharIndexed(input, index),
            replace, index, eflags);
}
1351:
1352: private String substituteAllImpl(CharIndexed input, String replace,
1353: int index, int eflags) {
1354: StringBuffer buffer = new StringBuffer();
1355: REMatch m;
1356: while ((m = getMatchImpl(input, index, eflags, buffer)) != null) {
1357: buffer.append(((eflags & REG_NO_INTERPOLATE) > 0) ? replace
1358: : m.substituteInto(replace));
1359: index = m.getEndIndex();
1360: if (m.end[0] == 0) {
1361: char ch = input.charAt(0);
1362: if (ch != CharIndexed.OUT_OF_BOUNDS)
1363: buffer.append(ch);
1364: input.move(1);
1365: } else {
1366: input.move(m.end[0]);
1367: }
1368:
1369: if (!input.isValid())
1370: break;
1371: }
1372: return buffer.toString();
1373: }
1374:
1375: /* Helper function for constructor */
1376: private void addToken(REToken next) {
1377: if (next == null)
1378: return;
1379: minimumLength += next.getMinimumLength();
1380: if (firstToken == null) {
1381: lastToken = firstToken = next;
1382: } else {
1383: // if chain returns false, it "rejected" the token due to
1384: // an optimization, and next was combined with lastToken
1385: if (lastToken.chain(next)) {
1386: lastToken = next;
1387: }
1388: }
1389: }
1390:
1391: private static REToken setRepeated(REToken current, int min,
1392: int max, int index) throws REException {
1393: if (current == null)
1394: throw new REException(
1395: getLocalizedMessage("repeat.no.token"),
1396: REException.REG_BADRPT, index);
1397: return new RETokenRepeated(current.subIndex, current, min, max);
1398: }
1399:
1400: private static int getPosixSet(char[] pattern, int index,
1401: StringBuffer buf) {
1402: // Precondition: pattern[index-1] == ':'
1403: // we will return pos of closing ']'.
1404: int i;
1405: for (i = index; i < (pattern.length - 1); i++) {
1406: if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
1407: return i + 2;
1408: buf.append(pattern[i]);
1409: }
1410: return index; // didn't match up
1411: }
1412:
1413: private int getMinMax(char[] input, int index, IntPair minMax,
1414: RESyntax syntax) throws REException {
1415: // Precondition: input[index-1] == '{', minMax != null
1416:
1417: boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1418: int startIndex = index;
1419: if (index == input.length) {
1420: if (mustMatch)
1421: throw new REException(
1422: getLocalizedMessage("unmatched.brace"),
1423: REException.REG_EBRACE, index);
1424: else
1425: return startIndex;
1426: }
1427:
1428: int min, max = 0;
1429: CharUnit unit = new CharUnit();
1430: StringBuffer buf = new StringBuffer();
1431:
1432: // Read string of digits
1433: do {
1434: index = getCharUnit(input, index, unit);
1435: if (Character.isDigit(unit.ch))
1436: buf.append(unit.ch);
1437: } while ((index != input.length) && Character.isDigit(unit.ch));
1438:
1439: // Check for {} tomfoolery
1440: if (buf.length() == 0) {
1441: if (mustMatch)
1442: throw new REException(
1443: getLocalizedMessage("interval.error"),
1444: REException.REG_EBRACE, index);
1445: else
1446: return startIndex;
1447: }
1448:
1449: min = Integer.parseInt(buf.toString());
1450:
1451: if ((unit.ch == '}')
1452: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1453: max = min;
1454: else if (index == input.length)
1455: if (mustMatch)
1456: throw new REException(
1457: getLocalizedMessage("interval.no.end"),
1458: REException.REG_EBRACE, index);
1459: else
1460: return startIndex;
1461: else if ((unit.ch == ',') && !unit.bk) {
1462: buf = new StringBuffer();
1463: // Read string of digits
1464: while (((index = getCharUnit(input, index, unit)) != input.length)
1465: && Character.isDigit(unit.ch))
1466: buf.append(unit.ch);
1467:
1468: if (!((unit.ch == '}') && (syntax
1469: .get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1470: if (mustMatch)
1471: throw new REException(
1472: getLocalizedMessage("interval.error"),
1473: REException.REG_EBRACE, index);
1474: else
1475: return startIndex;
1476:
1477: // This is the case of {x,}
1478: if (buf.length() == 0)
1479: max = Integer.MAX_VALUE;
1480: else
1481: max = Integer.parseInt(buf.toString());
1482: } else if (mustMatch)
1483: throw new REException(
1484: getLocalizedMessage("interval.error"),
1485: REException.REG_EBRACE, index);
1486: else
1487: return startIndex;
1488:
1489: // We know min and max now, and they are valid.
1490:
1491: minMax.first = min;
1492: minMax.second = max;
1493:
1494: // return the index following the '}'
1495: return index;
1496: }
1497:
/**
 * Return a human readable form of the compiled regular expression,
 * useful for debugging.
 *
 * @return The text written by <code>dump</code> into a fresh
 *         StringBuffer.
 */
public String toString() {
    StringBuffer sb = new StringBuffer();
    dump(sb);
    return sb.toString();
}
1507:
1508: void dump(StringBuffer os) {
1509: os.append('(');
1510: if (subIndex == 0)
1511: os.append("?:");
1512: if (firstToken != null)
1513: firstToken.dumpAll(os);
1514: os.append(')');
1515: }
1516:
1517: // Cast input appropriately or throw exception
1518: private static CharIndexed makeCharIndexed(Object input, int index) {
1519: // We could let a String fall through to final input, but since
1520: // it's the most likely input type, we check it first.
1521: if (input instanceof String)
1522: return new CharIndexedString((String) input, index);
1523: else if (input instanceof char[])
1524: return new CharIndexedCharArray((char[]) input, index);
1525: else if (input instanceof StringBuffer)
1526: return new CharIndexedStringBuffer((StringBuffer) input,
1527: index);
1528: else if (input instanceof InputStream)
1529: return new CharIndexedInputStream((InputStream) input,
1530: index);
1531: else if (input instanceof Reader)
1532: return new CharIndexedReader((Reader) input, index);
1533: else if (input instanceof CharIndexed)
1534: return (CharIndexed) input; // do we lose index info?
1535: else
1536: return new CharIndexedString(input.toString(), index);
1537: }
1538: }
|