0001: /*
0002: * gnu/regexp/RE.java
0003: * Copyright (C) 1998 Wes Biggs
0004: *
0005: * This library is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Library General Public License as published
0007: * by the Free Software Foundation; either version 2 of the License, or
0008: * (at your option) any later version.
0009: *
0010: * This library is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Library General Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Library General Public License
0016: * along with this program; if not, write to the Free Software
0017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0018: */
0019:
0020: package gnu.regexp;
0021:
0022: import java.io.InputStream;
0023: import java.util.Vector;
0024:
/**
 * Mutable pair of ints. The RE constructor allocates one instance and hands
 * it to getMinMax to receive the bounds of an interval operator, then reads
 * <code>first</code> as the minimum and <code>second</code> as the maximum
 * repeat count.
 */
class IntPair {
    /** First value of the pair. */
    public int first;

    /** Second value of the pair. */
    public int second;
}
0028:
/**
 * One parsed unit of a pattern: a character plus a flag telling whether it
 * was written with a preceding backslash. Filled in by RE.getCharUnit.
 */
class CharUnit {
    /** The character itself (the one after the backslash when escaped). */
    public char ch;

    /** True when the character was preceded by a backslash in the pattern. */
    public boolean bk;
}
0033:
0034: /**
0035: * RE provides the user interface for compiling and matching regular
0036: * expressions.
0037: * <P>
0038: * A regular expression object (class RE) is compiled by constructing it
0039: * from a String, StringBuffer or character array, with optional
0040: * compilation flags (below)
0041: * and an optional syntax specification (see RESyntax; if not specified,
0042: * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
0043: * <P>
0044: * Various methods attempt to match input text against a compiled
0045: * regular expression. These methods are:
0046: * <LI><code>isMatch</code>: returns true if the input text in its entirety
0047: * matches the regular expression pattern.
0048: * <LI><code>getMatch</code>: returns the first match found in the input text,
0049: * or null if no match is found.
0050: * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
0051: * matches found in the input text. If no matches are found, the array is
0052: * zero-length.
0053: * <LI><code>substitute</code>: substitute the first occurrence of the pattern
0054: * in the input text with a replacement string (which may include
0055: * metacharacters $0-$9, see REMatch.substituteInto).
0056: * <LI><code>substituteAll</code>: same as above, but repeat for each match
0057: * before returning.
0058: * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
0059: * that allows iteration over the matches (see REMatchEnumeration for some
0060: * reasons why you may want to do this instead of using <code>getAllMatches</code>).
0061: * <P>
0062: * These methods all have similar argument lists. The input can be a
0063: * String, a character array, a StringBuffer or an InputStream of some sort.
0064: * Note that
0065: * when using an InputStream, the stream read position cannot be guaranteed
0066: * after attempting a match (this is not a bug, but a consequence of the way
0067: * regular expressions work). Using an REMatchEnumeration can eliminate most
0068: * positioning problems.
0069: * <P>
0070: * The optional index argument specifies the offset from the beginning of the
0071: * text at which the search should start (see the descriptions of some of
0072: * the execution flags for how this can affect positional pattern operators).
0073: * For an InputStream, this means an offset from the current read position,
0074: * so subsequent calls with the same index argument on an InputStream will not
0075: * necessarily be accessing the same position on the stream, whereas repeated
0076: * searches at a given index in a fixed string will return consistent
0077: * results.
0078: * <P>
0079: * You can optionally affect the execution environment by using a
0080: * combination of execution flags (constants listed below).
0081: *
0082: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
0083: * @version 1.0.8, 21 March 1999
0084: */
0085:
0086: public class RE extends REToken {
0087: // This String will be returned by getVersion()
0088: private static final String s_version = "1.0.8";
0089:
0090: // These are, respectively, the first and last tokens in our linked list
0091: // If there is only one token, firstToken == lastToken
0092: private REToken firstToken, lastToken;
0093:
0094: // This is the number of subexpressions in this regular expression,
0095: // with a minimum value of zero. Returned by getNumSubs()
0096: private int m_numSubs;
0097:
0098: /**
0099: * Compilation flag. Do not differentiate case. Subsequent
0100: * searches using this RE will be case insensitive.
0101: */
0102: public static final int REG_ICASE = 2;
0103:
0104: /**
0105: * Compilation flag. The match-any-character operator (dot)
0106: * will match a newline character. When set this overrides the syntax
0107: * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
0108: * the "/s" operator in Perl.
0109: */
0110: public static final int REG_DOT_NEWLINE = 4;
0111:
0112: /**
0113: * Compilation flag. Use multiline mode. In this mode, the ^ and $
0114: * anchors will match based on newlines within the input. This is
0115: * equivalent to the "/m" operator in Perl.
0116: */
0117: public static final int REG_MULTILINE = 8;
0118:
0119: /**
0120: * Execution flag.
0121: * The match-beginning operator (^) will not match at the beginning
0122: * of the input string. Useful for matching on a substring when you
0123: * know the context of the input is such that position zero of the
0124: * input to the match test is not actually position zero of the text.
0125: * <P>
0126: * This example demonstrates the results of various ways of matching on
0127: * a substring.
0128: * <P>
0129: * <CODE>
0130: * String s = "food bar fool";<BR>
0131: * RE exp = new RE("^foo.");<BR>
0132: * REMatch m0 = exp.getMatch(s);<BR>
0133: * REMatch m1 = exp.getMatch(s.substring(8));<BR>
0134: * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
0135: * REMatch m3 = exp.getMatch(s,8); <BR>
0136: * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
0137: * <P>
0138: * // Results:<BR>
0139: * // m0 = "food"<BR>
0140: * // m1 = "fool"<BR>
0141: * // m2 = null<BR>
0142: * // m3 = null<BR>
0143: * // m4 = "fool"<BR>
0144: * </CODE>
0145: */
0146: public static final int REG_NOTBOL = 16;
0147:
0148: /**
0149: * Execution flag.
0150: * The match-end operator ($) does not match at the end
0151: * of the input string. Useful for matching on substrings.
0152: */
0153: public static final int REG_NOTEOL = 32;
0154:
0155: /**
0156: * Execution flag.
0157: * The match-beginning operator (^) matches not at position 0
0158: * in the input string, but at the position the search started at
0159: * (based on the index input given to the getMatch function). See
0160: * the example under REG_NOTBOL.
0161: */
0162: public static final int REG_ANCHORINDEX = 64;
0163:
0164: /** Returns a string representing the version of the gnu.regexp package. */
0165: public static final String version() {
0166: return s_version;
0167: }
0168:
/**
 * Compiles a pattern with no compilation flags and the default syntax
 * (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer or char[].
 * @exception REException The input pattern could not be parsed.
 * @exception IllegalArgumentException The pattern was not a String,
 *   StringBuffer or char[] (this includes a null pattern, which fails
 *   every type test in the implementing constructor).
 */
public RE(Object pattern) throws REException {
    this(pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0183:
/**
 * Compiles a pattern with the given compilation flags and the default
 * syntax (RESyntax.RE_SYNTAX_PERL5).
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer, or char[].
 * @param cflags The logical OR of any combination of the compilation
 *   flags (REG_ICASE, REG_DOT_NEWLINE, REG_MULTILINE).
 * @exception REException The input pattern could not be parsed.
 * @exception IllegalArgumentException The pattern was not a String,
 *   StringBuffer or char[] (this includes a null pattern, which fails
 *   every type test in the implementing constructor).
 */
public RE(Object pattern, int cflags) throws REException {
    this(pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
}
0199:
/**
 * Compiles a pattern with the given compilation flags and regular
 * expression syntax.
 *
 * @param pattern A regular expression pattern, in the form of a String,
 *   StringBuffer, or char[].
 * @param cflags The logical OR of any combination of the compilation
 *   flags (REG_ICASE, REG_DOT_NEWLINE, REG_MULTILINE).
 * @param syntax The type of regular expression syntax to use.
 * @exception REException The input pattern could not be parsed.
 * @exception IllegalArgumentException The pattern was not a String,
 *   StringBuffer or char[] (this includes a null pattern, which fails
 *   every type test in the implementing constructor).
 */
public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
    this(pattern, cflags, syntax, 0, 0);
}
0217:
// Internal constructor used for alternation: wraps an already-built run of
// tokens (one branch) in an RE without parsing anything.
//   f_first / f_last : first and last token of the branch (both may be null)
//   f_subs           : subexpression count recorded for the branch
//   f_subIndex       : subexpression index handed to the REToken constructor
private RE(REToken f_first, REToken f_last, int f_subs, int f_subIndex) {
    super(f_subIndex);
    this.m_numSubs = f_subs;
    this.firstToken = f_first;
    this.lastToken = f_last;
}
0226:
0227: // Actual constructor implementation
0228: private RE(Object patternObj, int cflags, RESyntax syntax,
0229: int myIndex, int nextSub) throws REException {
0230: super (myIndex); // Subexpression index of this token.
0231: char[] pattern;
0232: if (patternObj instanceof String) {
0233: pattern = ((String) patternObj).toCharArray();
0234: } else if (patternObj instanceof char[]) {
0235: pattern = (char[]) patternObj;
0236: } else if (patternObj instanceof StringBuffer) {
0237: pattern = new char[((StringBuffer) patternObj).length()];
0238: ((StringBuffer) patternObj).getChars(0, pattern.length,
0239: pattern, 0);
0240: } else
0241: throw new IllegalArgumentException(
0242: "Invalid class for pattern");
0243:
0244: int pLength = pattern.length;
0245:
0246: m_numSubs = 0; // Number of subexpressions in this token.
0247: Vector branches = null;
0248:
0249: // linked list of tokens (sort of -- some closed loops can exist)
0250: firstToken = lastToken = null;
0251:
0252: // Precalculate these so we don't pay for the math every time we
0253: // need to access them.
0254: boolean insens = ((cflags & REG_ICASE) > 0);
0255:
0256: // Parse pattern into tokens. Does anyone know if it's more efficient
0257: // to use char[] than a String.charAt()? I'm assuming so.
0258:
0259: // index tracks the position in the char array
0260: int index = 0;
0261:
0262: // this will be the current parse character (pattern[index])
0263: CharUnit unit = new CharUnit();
0264:
0265: // This is used for {x,y} calculations
0266: IntPair minMax = new IntPair();
0267:
0268: // Buffer a token so we can create a TokenRepeated, etc.
0269: REToken currentToken = null;
0270: char ch;
0271:
0272: while (index < pLength) {
0273: // read the next character unit (including backslash escapes)
0274: index = getCharUnit(pattern, index, unit);
0275:
0276: // ALTERNATION OPERATOR
0277: // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
0278: // not available if RE_LIMITED_OPS is set
0279:
0280: // TODO: the '\n' literal here should be a test against REToken.newline,
0281: // which unfortunately may be more than a single character.
0282: if (((unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk)) || (syntax
0283: .get(RESyntax.RE_NEWLINE_ALT)
0284: && (unit.ch == '\n') && !unit.bk))
0285: && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
0286: // make everything up to here be a branch. create vector if nec.
0287: if (branches == null)
0288: branches = new Vector();
0289: addToken(currentToken);
0290: branches.addElement(new RE(firstToken, lastToken,
0291: m_numSubs, m_subIndex));
0292: firstToken = lastToken = currentToken = null;
0293: }
0294:
0295: // INTERVAL OPERATOR:
0296: // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
0297: // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
0298: //
0299: // OPEN QUESTION:
0300: // what is proper interpretation of '{' at start of string?
0301:
0302: else if ((unit.ch == '{')
0303: && syntax.get(RESyntax.RE_INTERVALS)
0304: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
0305: if (currentToken == null)
0306: throw new REException("{ without preceding token",
0307: REException.REG_EBRACE, index);
0308:
0309: index = getMinMax(pattern, index, minMax, syntax);
0310: if ((currentToken.getMinimumLength() == 0)
0311: && (minMax.second == Integer.MAX_VALUE))
0312: throw new REException(
0313: "repeated argument may be empty",
0314: REException.REG_BADRPT, index);
0315: currentToken = setRepeated(currentToken, minMax.first,
0316: minMax.second, index);
0317: }
0318:
0319: // LIST OPERATOR:
0320: // [...] | [^...]
0321:
0322: else if ((unit.ch == '[') && !unit.bk) {
0323: Vector options = new Vector();
0324: boolean negative = false;
0325: char lastChar = 0;
0326: if (index == pLength)
0327: throw new REException("unmatched [",
0328: REException.REG_EBRACK, index);
0329:
0330: // Check for initial caret, negation
0331: if ((ch = pattern[index]) == '^') {
0332: negative = true;
0333: if (++index == pLength)
0334: throw new REException("no end of list",
0335: REException.REG_EBRACK, index);
0336: ch = pattern[index];
0337: }
0338:
0339: // Check for leading right bracket literal
0340: if (ch == ']') {
0341: lastChar = ch;
0342: if (++index == pLength)
0343: throw new REException("no end of list",
0344: REException.REG_EBRACK, index);
0345: }
0346:
0347: while ((ch = pattern[index++]) != ']') {
0348: if ((ch == '-') && (lastChar != 0)) {
0349: if (index == pLength)
0350: throw new REException("no end of list",
0351: REException.REG_EBRACK, index);
0352: if ((ch = pattern[index]) == ']') {
0353: options.addElement(new RETokenChar(
0354: m_subIndex, lastChar, insens));
0355: lastChar = '-';
0356: } else {
0357: options.addElement(new RETokenRange(
0358: m_subIndex, lastChar, ch, insens));
0359: lastChar = 0;
0360: index++;
0361: }
0362: } else if ((ch == '\\')
0363: && syntax
0364: .get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
0365: if (index == pLength)
0366: throw new REException("no end of list",
0367: REException.REG_EBRACK, index);
0368: int posixID = -1;
0369: boolean negate = false;
0370: if (syntax
0371: .get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
0372: switch (pattern[index]) {
0373: case 'D':
0374: negate = true;
0375: case 'd':
0376: posixID = RETokenPOSIX.DIGIT;
0377: break;
0378: case 'S':
0379: negate = true;
0380: case 's':
0381: posixID = RETokenPOSIX.SPACE;
0382: break;
0383: case 'W':
0384: negate = true;
0385: case 'w':
0386: posixID = RETokenPOSIX.ALNUM;
0387: break;
0388: }
0389: }
0390: if (lastChar != 0)
0391: options.addElement(new RETokenChar(
0392: m_subIndex, lastChar, insens));
0393:
0394: if (posixID != -1) {
0395: options
0396: .addElement(new RETokenPOSIX(
0397: m_subIndex, posixID,
0398: insens, negate));
0399: } else {
0400: lastChar = pattern[index];
0401: }
0402: ++index;
0403: } else if ((ch == '[')
0404: && (syntax.get(RESyntax.RE_CHAR_CLASSES))
0405: && (pattern[index] == ':')) {
0406: StringBuffer posixSet = new StringBuffer();
0407: index = getPosixSet(pattern, index + 1,
0408: posixSet);
0409: int posixId = RETokenPOSIX.intValue(posixSet
0410: .toString());
0411: if (posixId != -1)
0412: options
0413: .addElement(new RETokenPOSIX(
0414: m_subIndex, posixId,
0415: insens, false));
0416: } else {
0417: if (lastChar != 0)
0418: options.addElement(new RETokenChar(
0419: m_subIndex, lastChar, insens));
0420: lastChar = ch;
0421: }
0422: if (index == pLength)
0423: throw new REException("no end of list",
0424: REException.REG_EBRACK, index);
0425: } // while in list
0426: // Out of list, index is one past ']'
0427:
0428: if (lastChar != 0)
0429: options.addElement(new RETokenChar(m_subIndex,
0430: lastChar, insens));
0431:
0432: // Create a new RETokenOneOf
0433: addToken(currentToken);
0434: options.trimToSize();
0435: currentToken = new RETokenOneOf(m_subIndex, options,
0436: negative);
0437: }
0438:
0439: // SUBEXPRESSIONS
0440: // (...) | \(...\) depending on RE_NO_BK_PARENS
0441:
0442: else if ((unit.ch == '(')
0443: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
0444: boolean pure = false;
0445: boolean comment = false;
0446: if ((index + 1 < pLength) && (pattern[index] == '?')) {
0447: switch (pattern[index + 1]) {
0448: case ':':
0449: if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
0450: pure = true;
0451: index += 2;
0452: }
0453: break;
0454: case '#':
0455: if (syntax.get(RESyntax.RE_COMMENTS)) {
0456: comment = true;
0457: }
0458: break;
0459: }
0460: }
0461:
0462: // find end of subexpression
0463: int endIndex = index;
0464: int nextIndex = index;
0465: int nested = 0;
0466:
0467: while (((nextIndex = getCharUnit(pattern, endIndex,
0468: unit)) > 0)
0469: && !(nested == 0 && (unit.ch == ')') && (syntax
0470: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)))
0471: if ((endIndex = nextIndex) >= pLength)
0472: throw new REException(
0473: "no end of subexpression",
0474: REException.REG_ESUBREG, index - 1);
0475: else if (unit.ch == '('
0476: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0477: nested++;
0478: else if (unit.ch == ')'
0479: && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
0480: nested--;
0481:
0482: // endIndex is now position at a ')','\)'
0483: // nextIndex is end of string or position after ')' or '\)'
0484:
0485: if (comment)
0486: index = nextIndex;
0487: else { // not a comment
0488: // create RE subexpression as token.
0489: addToken(currentToken);
0490: if (!pure) {
0491: nextSub++;
0492: m_numSubs++;
0493: }
0494:
0495: int useIndex = pure ? 0 : nextSub;
0496:
0497: currentToken = new RE(String.valueOf(pattern,
0498: index, endIndex - index).toCharArray(),
0499: cflags, syntax, useIndex, nextSub);
0500: nextSub += ((RE) currentToken).getNumSubs();
0501: m_numSubs += ((RE) currentToken).getNumSubs();
0502: index = nextIndex;
0503: } // not a comment
0504: } // subexpression
0505:
0506: // UNMATCHED RIGHT PAREN
0507: // ) or \)? need to implement throw exception if
0508: // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0509: else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
0510: && ((unit.ch == ')') && (syntax
0511: .get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
0512: throw new REException("unmatched right paren",
0513: REException.REG_EPAREN, index);
0514: }
0515:
0516: // START OF LINE OPERATOR
0517: // ^
0518:
0519: else if ((unit.ch == '^') && !unit.bk) {
0520: addToken(currentToken);
0521: currentToken = null;
0522: addToken(new RETokenStart(m_subIndex,
0523: (cflags & REG_MULTILINE) > 0));
0524: }
0525:
0526: // END OF LINE OPERATOR
0527: // $
0528:
0529: else if ((unit.ch == '$') && !unit.bk) {
0530: addToken(currentToken);
0531: currentToken = null;
0532: addToken(new RETokenEnd(m_subIndex,
0533: (cflags & REG_MULTILINE) > 0));
0534: }
0535:
0536: // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
0537: // .
0538:
0539: else if ((unit.ch == '.') && !unit.bk) {
0540: addToken(currentToken);
0541: currentToken = new RETokenAny(m_subIndex, syntax
0542: .get(RESyntax.RE_DOT_NEWLINE)
0543: || ((cflags & REG_DOT_NEWLINE) > 0), syntax
0544: .get(RESyntax.RE_DOT_NOT_NULL));
0545: }
0546:
0547: // ZERO-OR-MORE REPEAT OPERATOR
0548: // *
0549:
0550: else if ((unit.ch == '*') && !unit.bk) {
0551: if ((currentToken == null)
0552: || (currentToken.getMinimumLength() == 0))
0553: throw new REException(
0554: "repeated argument may be empty",
0555: REException.REG_BADRPT, index);
0556: currentToken = setRepeated(currentToken, 0,
0557: Integer.MAX_VALUE, index);
0558: }
0559:
0560: // ONE-OR-MORE REPEAT OPERATOR
0561: // + | \+ depending on RE_BK_PLUS_QM
0562: // not available if RE_LIMITED_OPS is set
0563:
0564: else if ((unit.ch == '+')
0565: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0566: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0567: if ((currentToken == null)
0568: || (currentToken.getMinimumLength() == 0))
0569: throw new REException(
0570: "repeated argument may be empty",
0571: REException.REG_BADRPT, index);
0572: currentToken = setRepeated(currentToken, 1,
0573: Integer.MAX_VALUE, index);
0574: }
0575:
0576: // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
0577: // ? | \? depending on RE_BK_PLUS_QM
0578: // not available if RE_LIMITED_OPS is set
0579: // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
0580:
0581: else if ((unit.ch == '?')
0582: && !syntax.get(RESyntax.RE_LIMITED_OPS)
0583: && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
0584: if (currentToken == null)
0585: throw new REException("? without preceding token",
0586: REException.REG_BADRPT, index);
0587:
0588: // Check for stingy matching on RETokenRepeated
0589: if ((currentToken instanceof RETokenRepeated)
0590: && (syntax.get(RESyntax.RE_STINGY_OPS)))
0591: ((RETokenRepeated) currentToken).makeStingy();
0592: else
0593: currentToken = setRepeated(currentToken, 0, 1,
0594: index);
0595: }
0596:
0597: // BACKREFERENCE OPERATOR
0598: // \1 \2 \3 \4 ...
0599: // not available if RE_NO_BK_REFS is set
0600:
0601: else if (unit.bk && Character.isDigit(unit.ch)
0602: && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
0603: addToken(currentToken);
0604: currentToken = new RETokenBackRef(m_subIndex, Character
0605: .digit(unit.ch, 10), insens);
0606: }
0607:
0608: // START OF STRING OPERATOR
0609: // \A if RE_STRING_ANCHORS is set
0610:
0611: else if (unit.bk && (unit.ch == 'A')
0612: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0613: addToken(currentToken);
0614: currentToken = new RETokenStart(m_subIndex, false);
0615: }
0616:
0617: // DIGIT OPERATOR
0618: // \d if RE_CHAR_CLASS_ESCAPES is set
0619:
0620: else if (unit.bk && (unit.ch == 'd')
0621: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0622: addToken(currentToken);
0623: currentToken = new RETokenPOSIX(m_subIndex,
0624: RETokenPOSIX.DIGIT, insens, false);
0625: }
0626:
0627: // NON-DIGIT OPERATOR
0628: // \D
0629:
0630: else if (unit.bk && (unit.ch == 'D')
0631: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0632: addToken(currentToken);
0633: currentToken = new RETokenPOSIX(m_subIndex,
0634: RETokenPOSIX.DIGIT, insens, true);
0635: }
0636:
0637: // NEWLINE ESCAPE
0638: // \n
0639:
0640: else if (unit.bk && (unit.ch == 'n')) {
0641: addToken(currentToken);
0642: currentToken = new RETokenChar(m_subIndex, '\n', false);
0643: }
0644:
0645: // RETURN ESCAPE
0646: // \r
0647:
0648: else if (unit.bk && (unit.ch == 'r')) {
0649: addToken(currentToken);
0650: currentToken = new RETokenChar(m_subIndex, '\r', false);
0651: }
0652:
0653: // WHITESPACE OPERATOR
0654: // \s if RE_CHAR_CLASS_ESCAPES is set
0655:
0656: else if (unit.bk && (unit.ch == 's')
0657: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0658: addToken(currentToken);
0659: currentToken = new RETokenPOSIX(m_subIndex,
0660: RETokenPOSIX.SPACE, insens, false);
0661: }
0662:
0663: // NON-WHITESPACE OPERATOR
0664: // \S
0665:
0666: else if (unit.bk && (unit.ch == 'S')
0667: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0668: addToken(currentToken);
0669: currentToken = new RETokenPOSIX(m_subIndex,
0670: RETokenPOSIX.SPACE, insens, true);
0671: }
0672:
0673: // TAB ESCAPE
0674: // \t
0675:
0676: else if (unit.bk && (unit.ch == 't')) {
0677: addToken(currentToken);
0678: currentToken = new RETokenChar(m_subIndex, '\t', false);
0679: }
0680:
0681: // ALPHANUMERIC OPERATOR
0682: // \w
0683:
0684: else if (unit.bk && (unit.ch == 'w')
0685: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0686: addToken(currentToken);
0687: currentToken = new RETokenPOSIX(m_subIndex,
0688: RETokenPOSIX.ALNUM, insens, false);
0689: }
0690:
0691: // NON-ALPHANUMERIC OPERATOR
0692: // \W
0693:
0694: else if (unit.bk && (unit.ch == 'W')
0695: && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
0696: addToken(currentToken);
0697: currentToken = new RETokenPOSIX(m_subIndex,
0698: RETokenPOSIX.ALNUM, insens, true);
0699: }
0700:
0701: // END OF STRING OPERATOR
0702: // \Z
0703:
0704: else if (unit.bk && (unit.ch == 'Z')
0705: && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
0706: addToken(currentToken);
0707: currentToken = new RETokenEnd(m_subIndex, false);
0708: }
0709:
0710: // NON-SPECIAL CHARACTER (or escape to make literal)
0711: // c | \* for example
0712:
0713: else { // not a special character
0714: addToken(currentToken);
0715: currentToken = new RETokenChar(m_subIndex, unit.ch,
0716: insens);
0717: }
0718: } // end while
0719:
0720: // Add final buffered token if applicable
0721: addToken(currentToken);
0722:
0723: if (branches != null) {
0724: branches.addElement(new RE(firstToken, lastToken,
0725: m_numSubs, m_subIndex));
0726: branches.trimToSize(); // compact the Vector
0727: firstToken = lastToken = new RETokenOneOf(m_subIndex,
0728: branches, false);
0729: }
0730: }
0731:
0732: private static int getCharUnit(char[] input, int index,
0733: CharUnit unit) throws REException {
0734: unit.ch = input[index++];
0735: if (unit.bk = (unit.ch == '\\'))
0736: if (index < input.length)
0737: unit.ch = input[index++];
0738: else
0739: throw new REException("\\ at end of pattern.",
0740: REException.REG_ESCAPE, index);
0741: return index;
0742: }
0743:
0744: /**
0745: * Checks if the input in its entirety is an exact match of
0746: * this regular expression.
0747: *
0748: * @param input The input text.
0749: * @exception IllegalArgumentException The input text was not a String, char[], or InputStream.
0750: */
0751: public boolean isMatch(Object input) {
0752: return isMatch(input, 0, 0);
0753: }
0754:
0755: /**
0756: * Checks if the input string, starting from index, is an exact match of
0757: * this regular expression.
0758: *
0759: * @param input The input text.
0760: * @param index The offset index at which the search should be begin.
0761: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0762: */
0763: public boolean isMatch(Object input, int index) {
0764: return isMatch(input, index, 0);
0765: }
0766:
0767: /**
0768: * Checks if the input, starting from index and using the specified
0769: * execution flags, is an exact match of this regular expression.
0770: *
0771: * @param input The input text.
0772: * @param index The offset index at which the search should be begin.
0773: * @param eflags The logical OR of any execution flags above.
0774: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0775: */
0776: public boolean isMatch(Object input, int index, int eflags) {
0777: return isMatchImpl(makeCharIndexed(input, index), index, eflags);
0778: }
0779:
0780: private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
0781: if (firstToken == null) // Trivial case
0782: return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
0783: int[] i = firstToken.match(input, 0, eflags, new REMatch(
0784: m_numSubs, index));
0785: return (i != null)
0786: && (input.charAt(i[0]) == CharIndexed.OUT_OF_BOUNDS);
0787: }
0788:
0789: /**
0790: * Returns the maximum number of subexpressions in this regular expression.
0791: * If the expression contains branches, the value returned will be the
0792: * maximum subexpressions in any of the branches.
0793: */
0794: public int getNumSubs() {
0795: return m_numSubs;
0796: }
0797:
0798: // Overrides REToken.setUncle
0799: void setUncle(REToken f_uncle) {
0800: lastToken.setUncle(f_uncle);
0801: }
0802:
0803: // Overrides REToken.chain
0804: boolean chain(REToken f_next) {
0805: super .chain(f_next);
0806: if (lastToken != null)
0807: lastToken.setUncle(f_next);
0808: return true;
0809: }
0810:
0811: /**
0812: * Returns the minimum number of characters that could possibly
0813: * constitute a match of this regular expression.
0814: */
0815: public int getMinimumLength() {
0816: int min = 0;
0817: REToken t = firstToken;
0818: if (t == null)
0819: return 0;
0820: do {
0821: min += t.getMinimumLength();
0822: } while ((t = t.m_next) != null);
0823: return min;
0824: }
0825:
0826: /**
0827: * Returns an array of all matches found in the input.
0828: *
0829: * @param input The input text.
0830: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0831: */
0832: public REMatch[] getAllMatches(Object input) {
0833: return getAllMatches(input, 0, 0);
0834: }
0835:
0836: /**
0837: * Returns an array of all matches found in the input,
0838: * beginning at the specified index position.
0839: *
0840: * @param input The input text.
0841: * @param index The offset index at which the search should be begin.
0842: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0843: */
0844: public REMatch[] getAllMatches(Object input, int index) {
0845: return getAllMatches(input, index, 0);
0846: }
0847:
0848: /**
0849: * Returns an array of all matches found in the input string,
0850: * beginning at the specified index position and using the specified
0851: * execution flags.
0852: *
0853: * @param input The input text.
0854: * @param index The offset index at which the search should be begin.
0855: * @param eflags The logical OR of any execution flags above.
0856: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0857: */
0858: public REMatch[] getAllMatches(Object input, int index, int eflags) {
0859: return getAllMatchesImpl(makeCharIndexed(input, index), index,
0860: eflags);
0861: }
0862:
0863: // this has been changed since 1.03 to be non-overlapping matches
0864: private REMatch[] getAllMatchesImpl(CharIndexed input, int index,
0865: int eflags) {
0866: Vector all = new Vector();
0867: REMatch m = null;
0868: while ((m = getMatchImpl(input, index, eflags, null)) != null) {
0869: all.addElement(m);
0870: index = m.getEndIndex();
0871: if (m.end[0] == 0) { // handle pathological case of zero-length match
0872: index++;
0873: input.move(1);
0874: } else {
0875: input.move(m.end[0]);
0876: }
0877: }
0878: REMatch[] mset = new REMatch[all.size()];
0879: all.copyInto(mset);
0880: return mset;
0881: }
0882:
0883: /* Implements abstract method REToken.match() */
0884: int[] match(CharIndexed input, int index, int eflags,
0885: REMatch mymatch) {
0886: if (firstToken == null)
0887: return new int[] { index }; // Trivial case
0888: /*
0889: if ((mymatch.start[m_subIndex] == -1)
0890: || (mymatch.start[m_subIndex] > index))
0891: */
0892: int oldstart = mymatch.start[m_subIndex];
0893: mymatch.start[m_subIndex] = index;
0894: int[] newIndex = firstToken
0895: .match(input, index, eflags, mymatch);
0896: if (newIndex == null) {
0897: mymatch.start[m_subIndex] = oldstart;
0898: } else {
0899: // If this match succeeded, then whole rest of string is good,
0900: // and newIndex[0] is the end of the match AT THIS LEVEL
0901:
0902: // We need to make list of all possible nexts.
0903: int[] doables = new int[0];
0904: int[] this Result;
0905: for (int i = 0; i < newIndex.length; i++) {
0906: this Result = next(input, newIndex[i], eflags, mymatch);
0907: if (this Result != null) {
0908: int[] temp = new int[doables.length
0909: + this Result.length];
0910: System.arraycopy(doables, 0, temp, 0,
0911: doables.length);
0912: for (int j = 0; j < this Result.length; j++) {
0913: temp[doables.length + j] = this Result[j];
0914: }
0915: doables = temp;
0916: }
0917: }
0918: return (doables.length == 0) ? null : doables;
0919: }
0920: return null;
0921: }
0922:
0923: /**
0924: * Returns the first match found in the input.
0925: *
0926: * @param input The input text.
0927: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0928: */
0929: public REMatch getMatch(Object input) {
0930: return getMatch(input, 0, 0);
0931: }
0932:
0933: /**
0934: * Returns the first match found in the input, beginning
0935: * the search at the specified index.
0936: *
0937: * @param input The input text.
0938: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0939: */
0940: public REMatch getMatch(Object input, int index) {
0941: return getMatch(input, index, 0);
0942: }
0943:
0944: /**
0945: * Returns the first match found in the input, beginning
0946: * the search at the specified index, and using the specified
0947: * execution flags. If no match is found, returns null.
0948: *
0949: * @param input The input text.
0950: * @param index The offset index at which the search should be begin.
0951: * @param eflags The logical OR of any execution flags above.
0952: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0953: */
0954: public REMatch getMatch(Object input, int index, int eflags) {
0955: return getMatch(input, index, eflags, null);
0956: }
0957:
0958: /**
0959: * Returns the first match found in the input, beginning
0960: * the search at the specified index, and using the specified
0961: * execution flags. If no match is found, returns null. If a StringBuffer
0962: * is provided and is non-null, the contents of the input text from the index to the
0963: * beginning of the match (or to the end of the input, if there is no match)
0964: * are appended to the StringBuffer.
0965: *
0966: * @param input The input text.
0967: * @param index The offset index at which the search should be begin.
0968: * @param eflags The logical OR of any execution flags above.
0969: * @param buffer The StringBuffer to save pre-match text in.
0970: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
0971: */
0972: public REMatch getMatch(Object input, int index, int eflags,
0973: StringBuffer buffer) {
0974: return getMatchImpl(makeCharIndexed(input, index), index,
0975: eflags, buffer);
0976: }
0977:
0978: REMatch getMatchImpl(CharIndexed input, int index, int eflags,
0979: StringBuffer buffer) {
0980: // check if input is at a valid position
0981: if (!input.isValid())
0982: return null;
0983: REMatch mymatch = new REMatch(m_numSubs, index);
0984: do {
0985: int[] result = match(input, 0, eflags, mymatch);
0986: if (result != null) {
0987: mymatch.end[0] = result[0]; // may break leftmost longest
0988: mymatch.finish(input);
0989: return mymatch;
0990: }
0991: mymatch.clear(++index);
0992: if (buffer != null)
0993: buffer.append(input.charAt(0));
0994: } while (input.move(1));
0995:
0996: return null;
0997: }
0998:
0999: /**
1000: * Returns an REMatchEnumeration that can be used to iterate over the
1001: * matches found in the input text.
1002: *
1003: * @param input The input text.
1004: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1005: */
1006: public REMatchEnumeration getMatchEnumeration(Object input) {
1007: return getMatchEnumeration(input, 0, 0);
1008: }
1009:
1010: /**
1011: * Returns an REMatchEnumeration that can be used to iterate over the
1012: * matches found in the input text.
1013: *
1014: * @param input The input text.
1015: * @param index The offset index at which the search should be begin.
1016: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1017: */
1018: public REMatchEnumeration getMatchEnumeration(Object input,
1019: int index) {
1020: return getMatchEnumeration(input, index, 0);
1021: }
1022:
1023: /**
1024: * Returns an REMatchEnumeration that can be used to iterate over the
1025: * matches found in the input text.
1026: *
1027: * @param input The input text.
1028: * @param index The offset index at which the search should be begin.
1029: * @param eflags The logical OR of any execution flags above.
1030: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1031: */
1032: public REMatchEnumeration getMatchEnumeration(Object input,
1033: int index, int eflags) {
1034: return new REMatchEnumeration(this , makeCharIndexed(input,
1035: index), index, eflags);
1036: }
1037:
1038: /**
1039: * Substitutes the replacement text for the first match found in the input.
1040: *
1041: * @param input The input text.
1042: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1043: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1044: */
1045: public String substitute(Object input, String replace) {
1046: return substitute(input, replace, 0, 0);
1047: }
1048:
1049: /**
1050: * Substitutes the replacement text for the first match found in the input
1051: * beginning at the specified index position.
1052: *
1053: * @param input The input text.
1054: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1055: * @param index The offset index at which the search should be begin.
1056: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1057: */
1058: public String substitute(Object input, String replace, int index) {
1059: return substitute(input, replace, index, 0);
1060: }
1061:
1062: /**
1063: * Substitutes the replacement text for the first match found in the input
1064: * string, beginning at the specified index position and using the
1065: * specified execution flags.
1066: *
1067: * @param input The input text.
1068: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1069: * @param index The offset index at which the search should be begin.
1070: * @param eflags The logical OR of any execution flags above.
1071: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1072: */
1073: public String substitute(Object input, String replace, int index,
1074: int eflags) {
1075: return substituteImpl(makeCharIndexed(input, index), replace,
1076: index, eflags);
1077: }
1078:
1079: private String substituteImpl(CharIndexed input, String replace,
1080: int index, int eflags) {
1081: StringBuffer buffer = new StringBuffer();
1082: REMatch m = getMatchImpl(input, index, eflags, buffer);
1083: if (m == null)
1084: return buffer.toString();
1085: buffer.append(m.substituteInto(replace));
1086: if (input.move(m.end[0])) {
1087: do {
1088: buffer.append(input.charAt(0));
1089: } while (input.move(1));
1090: }
1091: return buffer.toString();
1092: }
1093:
1094: /**
1095: * Substitutes the replacement text for each non-overlapping match found
1096: * in the input text.
1097: *
1098: * @param input The input text.
1099: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1100: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1101: */
1102: public String substituteAll(Object input, String replace) {
1103: return substituteAll(input, replace, 0, 0);
1104: }
1105:
1106: /**
1107: * Substitutes the replacement text for each non-overlapping match found
1108: * in the input text, starting at the specified index.
1109: *
1110: * @param input The input text.
1111: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1112: * @param index The offset index at which the search should be begin.
1113: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1114: */
1115: public String substituteAll(Object input, String replace, int index) {
1116: return substituteAll(input, replace, index, 0);
1117: }
1118:
1119: /**
1120: * Substitutes the replacement text for each non-overlapping match found
1121: * in the input text, starting at the specified index and using the
1122: * specified execution flags.
1123: *
1124: * @param input The input text.
1125: * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1126: * @param index The offset index at which the search should be begin.
1127: * @param eflags The logical OR of any execution flags above.
1128: * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1129: */
1130: public String substituteAll(Object input, String replace,
1131: int index, int eflags) {
1132: return substituteAllImpl(makeCharIndexed(input, index),
1133: replace, index, eflags);
1134: }
1135:
1136: private String substituteAllImpl(CharIndexed input, String replace,
1137: int index, int eflags) {
1138: StringBuffer buffer = new StringBuffer();
1139: REMatch m;
1140: while ((m = getMatchImpl(input, index, eflags, buffer)) != null) {
1141: buffer.append(m.substituteInto(replace));
1142: index = m.getEndIndex();
1143: if (m.end[0] == 0) {
1144: char ch = input.charAt(0);
1145: if (ch != CharIndexed.OUT_OF_BOUNDS)
1146: buffer.append(ch);
1147: input.move(1);
1148: } else {
1149: input.move(m.end[0]);
1150: }
1151: }
1152: return buffer.toString();
1153: }
1154:
1155: /* Helper function for constructor */
1156: private void addToken(REToken next) {
1157: if (next == null)
1158: return;
1159: if (firstToken == null)
1160: lastToken = firstToken = next;
1161: else
1162: // if chain returns false, it "rejected" the token due to
1163: // an optimization, and next was combined with lastToken
1164: if (lastToken.chain(next))
1165: lastToken = next;
1166: }
1167:
1168: private static REToken setRepeated(REToken current, int min,
1169: int max, int index) throws REException {
1170: if (current == null)
1171: throw new REException("repeat preceding token",
1172: REException.REG_BADRPT, index);
1173: return new RETokenRepeated(current.m_subIndex, current, min,
1174: max);
1175: }
1176:
1177: private static int getPosixSet(char[] pattern, int index,
1178: StringBuffer buf) {
1179: // Precondition: pattern[index-1] == ':'
1180: // we will return pos of closing ']'.
1181: int i;
1182: for (i = index; i < (pattern.length - 1); i++) {
1183: if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
1184: return i + 2;
1185: buf.append(pattern[i]);
1186: }
1187: return index; // didn't match up
1188: }
1189:
1190: private int getMinMax(char[] input, int index, IntPair minMax,
1191: RESyntax syntax) throws REException {
1192: // Precondition: input[index-1] == '{', minMax != null
1193:
1194: if (index == input.length)
1195: throw new REException("no matching brace",
1196: REException.REG_EBRACE, index);
1197:
1198: int min, max = 0;
1199: CharUnit unit = new CharUnit();
1200: StringBuffer buf = new StringBuffer();
1201:
1202: // Read string of digits
1203: while (((index = getCharUnit(input, index, unit)) != input.length)
1204: && Character.isDigit(unit.ch))
1205: buf.append(unit.ch);
1206:
1207: // Check for {} tomfoolery
1208: if (buf.length() == 0)
1209: throw new REException("bad brace construct",
1210: REException.REG_EBRACE, index);
1211:
1212: min = Integer.parseInt(buf.toString());
1213:
1214: if ((unit.ch == '}')
1215: && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1216: max = min;
1217: else if ((unit.ch == ',') && !unit.bk) {
1218: buf = new StringBuffer();
1219: // Read string of digits
1220: while (((index = getCharUnit(input, index, unit)) != input.length)
1221: && Character.isDigit(unit.ch))
1222: buf.append(unit.ch);
1223:
1224: if (!((unit.ch == '}') && (syntax
1225: .get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1226: throw new REException("expected end of interval",
1227: REException.REG_EBRACE, index);
1228:
1229: // This is the case of {x,}
1230: if (buf.length() == 0)
1231: max = Integer.MAX_VALUE;
1232: else
1233: max = Integer.parseInt(buf.toString());
1234: } else
1235: throw new REException(
1236: "invalid character in brace expression",
1237: REException.REG_EBRACE, index);
1238:
1239: // We know min and max now, and they are valid.
1240:
1241: minMax.first = min;
1242: minMax.second = max;
1243:
1244: // return the index following the '}'
1245: return index;
1246: }
1247:
1248: /**
1249: * Return a human readable form of the compiled regular expression,
1250: * useful for debugging.
1251: */
1252: public String toString() {
1253: StringBuffer sb = new StringBuffer();
1254: dump(sb);
1255: return sb.toString();
1256: }
1257:
1258: void dump(StringBuffer os) {
1259: os.append('(');
1260: if (m_subIndex == 0)
1261: os.append("?:");
1262: if (firstToken != null)
1263: firstToken.dumpAll(os);
1264: os.append(')');
1265: }
1266:
1267: // Cast input appropriately or throw exception
1268: private static CharIndexed makeCharIndexed(Object input, int index) {
1269: if (input instanceof String)
1270: return new CharIndexedString((String) input, index);
1271: else if (input instanceof char[])
1272: return new CharIndexedCharArray((char[]) input, index);
1273: else if (input instanceof StringBuffer)
1274: return new CharIndexedStringBuffer((StringBuffer) input,
1275: index);
1276: else if (input instanceof InputStream)
1277: return new CharIndexedInputStream((InputStream) input,
1278: index);
1279: else
1280: throw new IllegalArgumentException(
1281: "Invalid class for input text");
1282: }
1283: }
|