0001: /*
0002: * Regexp.java
0003: *
0004: * Copyright (c) 1999 Sun Microsystems, Inc.
0005: *
0006: * See the file "license.terms" for information on usage and
0007: * redistribution of this file, and for a DISCLAIMER OF ALL
0008: * WARRANTIES.
0009: *
0010: * SCCS: %Z% %M% %I% %E% %U%
0011: */
0012:
0013: package sunlabs.brazil.util.regexp;
0014:
0015: import java.util.*;
0016: import java.io.*;
0017:
0018: /**
0019: * The <code>Regexp</code> class can be used to match a pattern against a
0020: * string and optionally replace the matched parts with new strings.
0021: * <p>
0022: * Regular expressions were implemented by translating Henry Spencer's
0023: * regular expression package for <a href="http://www.scriptics.com">tcl8.0</a>.
0024: * Much of the description below is copied verbatim from the tcl8.0 regsub
0025: * manual entry.
0026: * <hr>
0027: * REGULAR EXPRESSIONS
0028: * <p>
0029: * A regular expression is zero or more <code>branches</code>, separated by
0030: * "|". It matches anything that matches one of the branches.
0031: * <p>
0032: * A branch is zero or more <code>pieces</code>, concatenated.
0033: * It matches a match for the first piece, followed by a match for the
0034: * second piece, etc.
0035: * <p>
0036: * A piece is an <code>atom</code>, possibly followed by "*", "+", or
0037: * "?". <ul>
0038: * <li> An atom followed by "*" matches a sequence of 0 or more matches of
0039: * the atom.
0040: * <li> An atom followed by "+" matches a sequence of 1 or more matches of
0041: * the atom.
0042: * <li> An atom followed by "?" matches either 0 or 1 matches of the atom.
0043: * </ul>
0044: * <p>
0045: * An atom is <ul>
0046: * <li> a regular expression in parentheses (matching a match for the
0047: * regular expression)
0048: * <li> a <code>range</code> (see below)
0049: * <li> "." (matching any single character)
0050: * <li> "^" (matching the null string at the beginning of the input string)
0051: * <li> "$" (matching the null string at the end of the input string)
0052: * <li> a "\" followed by a single character (matching that character)
0053: * <li> a single character with no other significance (matching that
0054: * character).
0055: * </ul>
0056: * <p>
0057: * A <code>range</code> is a sequence of characters enclosed in "[]".
0058: * The range normally matches any single character from the sequence.
0059: * If the sequence begins with "^", the range matches any single character
0060: * <b>not</b> from the rest of the sequence.
0061: * If two characters in the sequence are separated by "-", this is shorthand
0062: * for the full list of characters between them (e.g. "[0-9]" matches any
0063: * decimal digit). To include a literal "]" in the sequence, make it the
0064: * first character (following a possible "^"). To include a literal "-",
0065: * make it the first or last character.
0066: * <p>
0067: * In general there may be more than one way to match a regular expression
0068: * to an input string. For example, consider the command
0069: * <pre>
0070: * String[] match = new String[2];
0071: * Regexp.match("(a*)b*", "aabaaabb", match);
0072: * </pre>
0073: * Considering only the rules given so far, <code>match[0]</code> and
0074: * <code>match[1]</code> could end up with the values <ul>
0075: * <li> "aabb" and "aa"
0076: * <li> "aaab" and "aaa"
0077: * <li> "ab" and "a"
0078: * </ul>
0079: * or any of several other combinations. To resolve this potential ambiguity,
0080: * Regexp chooses among alternatives using the rule "first then longest".
0081: * In other words, it considers the possible matches in order working
0082: * from left to right across the input string and the pattern, and it
0083: * attempts to match longer pieces of the input string before shorter
0084: * ones. More specifically, the following rules apply in decreasing
0085: * order of priority: <ol>
0086: * <li> If a regular expression could match two different parts of an input
0087: * string then it will match the one that begins earliest.
0088: * <li> If a regular expression contains "|" operators then the
0089: * leftmost matching sub-expression is chosen.
0090: * <li> In "*", "+", and "?" constructs, longer matches are chosen in
0091: * preference to shorter ones.
0092: * <li>
0093: * In sequences of expression components the components are considered
0094: * from left to right.
0095: * </ol>
0096: * <p>
0097: * In the example from above, "(a*)b*" therefore matches exactly "aab"; the
0098: * "(a*)" portion of the pattern is matched first and it consumes the leading
0099: * "aa", then the "b*" portion of the pattern consumes the next "b". Or,
0100: * consider the following example:
0101: * <pre>
0102: * String match = new String[3];
0103: * Regexp.match("(ab|a)(b*)c", "abc", match);
0104: * </pre>
0105: * After this command, <code>match[0]</code> will be "abc",
0106: * <code>match[1]</code> will be "ab", and <code>match[2]</code> will be an
0107: * empty string.
0108: * Rule 4 specifies that the "(ab|a)" component gets first shot at the input
0109: * string and Rule 2 specifies that the "ab" sub-expression
0110: * is checked before the "a" sub-expression.
0111: * Thus the "b" has already been claimed before the "(b*)"
0112: * component is checked and therefore "(b*)" must match an empty string.
0113: * <hr>
0114: * <a name=regsub></a>
0115: * REGULAR EXPRESSION SUBSTITUTION
0116: * <p>
0117: * Regular expression substitution matches a string against a regular
0118: * expression, transforming the string by replacing the matched region(s)
0119: * with new substring(s).
0120: * <p>
0121: * What gets substituted into the result is controlled by a
0122: * <code>subspec</code>. The subspec is a formatting string that specifies
0123: * what portions of the matched region should be substituted into the
0124: * result.
0125: * <ul>
0126: * <li> "&" or "\0" is replaced with a copy of the entire matched region.
0127: * <li> "\<code>n</code>", where <code>n</code> is a digit from 1 to 9,
0128: * is replaced with a copy of the <code>n</code><i>th</i> subexpression.
0129: * <li> "\&" or "\\" are replaced with just "&" or "\" to escape their
0130: * special meaning.
0131: * <li> any other character is passed through.
0132: * </ul>
0133: * In the above, strings like "\2" represents the two characters
0134: * <code>backslash</code> and "2", not the Unicode character 0002.
0135: * <hr>
0136: * Here is an example of how to use Regexp
0137: * <pre>
0138: *
0139: * public static void
0140: * main(String[] args)
0141: * throws Exception
0142: * {
0143: * Regexp re;
0144: * String[] matches;
0145: * String s;
0146: *
0147: * /*
0148: * * A regular expression to match the first line of a HTTP request.
0149: * *
0150: * * 1. ^ - starting at the beginning of the line
0151: * * 2. ([A-Z]+) - match and remember some upper case characters
0152: * * 3. [ \t]+ - skip blank space
0153: * * 4. ([^ \t]*) - match and remember up to the next blank space
0154: * * 5. [ \t]+ - skip more blank space
0155: * * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1
0156: * * 7. $ - end of string - no chars left.
0157: * */
0158: *
0159: * s = "GET http://a.b.com:1234/index.html HTTP/1.1";
0160: *
0161: * re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$");
0162: * matches = new String[4];
0163: * if (re.match(s, matches)) {
0164: * System.out.println("METHOD " + matches[1]);
0165: * System.out.println("URL " + matches[2]);
0166: * System.out.println("VERSION " + matches[3]);
0167: * }
0168: *
0169: * /*
0170: * * A regular expression to extract some simple comma-separated data,
0171: * * reorder some of the columns, and discard column 2.
0172: * */
0173: *
0174: * s = "abc,def,ghi,klm,nop,pqr";
0175: *
0176: * re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)");
0177: * System.out.println(re.sub(s, "\\3,\\1,\\4"));
0178: * }
0179: * </pre>
0180: *
0181: * @author Colin Stevens (colin.stevens@sun.com)
0182: * @version 1.7, 99/10/14
0183: * @see Regsub
0184: */
0185:
0186: public class Regexp {
0187: public static void main(String[] args) throws Exception {
0188: if ((args.length == 2) && (args[0].equals("compile"))) {
0189: System.out.println(new Regexp(args[1]));
0190: } else if ((args.length == 3) && (args[0].equals("match"))) {
0191: Regexp r = new Regexp(args[1]);
0192: String[] substrs = new String[r.subspecs()];
0193: boolean match = r.match(args[2], substrs);
0194: System.out.println("match:\t" + match);
0195: for (int i = 0; i < substrs.length; i++) {
0196: System.out.println((i + 1) + ":\t" + substrs[i]);
0197: }
0198: } else if ((args.length == 4) && (args[0].equals("sub"))) {
0199: Regexp r = new Regexp(args[1]);
0200: System.out.println(r.subAll(args[2], args[3]));
0201: } else {
0202: System.out.println("usage:");
0203: System.out.println("\tRegexp match <pattern> <string>");
0204: System.out
0205: .println("\tRegexp sub <pattern> <string> <subspec>");
0206: System.out.println("\tRegexp compile <pattern>");
0207: }
0208: }
0209:
0210: /*
0211: * Structure for regexp "program". This is essentially a linear encoding
0212: * of a nondeterministic finite-state machine (aka syntax charts or
0213: * "railroad normal form" in parsing technology). Each node is an opcode
0214: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
0215: * all nodes except BRANCH implement concatenation; a "next" pointer with
0216: * a BRANCH on both ends of it is connecting two alternatives. (Here we
0217: * have one of the subtle syntax dependencies: an individual BRANCH (as
0218: * opposed to a collection of them) is never concatenated with anything
0219: * because of operator precedence.) The operand of some types of node is
0220: * a literal string; for others, it is a node leading into a sub-FSM. In
0221: * particular, the operand of a BRANCH node is the first node of the branch.
0222: * (NB this is *not* a tree structure: the tail of the branch connects
0223: * to the thing following the set of BRANCHes.) The opcodes are:
0224: */
0225:
0226: static final int NSUBEXP = 100;
0227:
0228: /* definition number opnd? meaning */
0229:
0230: static final char END = 0; /* no End of program. */
0231: static final char BOL = 1; /* no Match "" at beginning of line. */
0232: static final char EOL = 2; /* no Match "" at end of line. */
0233: static final char ANY = 3; /* no Match any one character. */
0234: static final char ANYOF = 4; /* str Match any character in this string. */
0235: static final char ANYBUT = 5; /* str Match any character not in this string. */
0236: static final char BRANCH = 6; /* node Match this alternative, or the next... */
0237: static final char BACK = 7; /* no Match "", "next" ptr points backward. */
0238: static final char EXACTLY = 8; /* str Match this string. */
0239: static final char NOTHING = 9; /* no Match empty string. */
0240: static final char STAR = 10; /* node Match this (simple) thing 0 or more times. */
0241: static final char PLUS = 11; /* node Match this (simple) thing 1 or more times. */
0242: static final char OPEN = 20; /* no Mark this point in input as start of #n. */
0243: /* OPEN+1 is number 1, etc. */
0244: static final char CLOSE = (char) (OPEN + NSUBEXP);
0245: /* no Analogous to OPEN. */
0246: static final String[] opnames = { "END", "BOL", "EOL", "ANY",
0247: "ANYOF", "ANYBUT", "BRANCH", "BACK", "EXACTLY", "NOTHING",
0248: "STAR", "PLUS" };
0249:
0250: /*
0251: * A node is one char of opcode followed by one char of "next" pointer.
0252: * The value is a positive offset from the opcode of the node containing
0253: * it. An operand, if any, simply follows the node. (Note that much of
0254: * the code generation knows about this implicit relationship.)
0255: *
0256: * Opcode notes:
0257: *
0258: * BRANCH The set of branches constituting a single choice are hooked
0259: * together with their "next" pointers, since precedence prevents
0260: * anything being concatenated to any individual branch. The
0261: * "next" pointer of the last BRANCH in a choice points to the
0262: * thing following the whole choice. This is also where the
0263: * final "next" pointer of each individual branch points; each
0264: * branch starts with the operand node of a BRANCH node.
0265: *
0266: * ANYOF, ANYBUT, EXACTLY
0267: * The format of a string operand is one char of length
0268: * followed by the characters making up the string.
0269: *
0270: * BACK Normal "next" pointers all implicitly point forward; BACK
0271: * exists to make loop structures possible.
0272: *
0273: * STAR, PLUS
0274: * '?', and complex '*' and '+' are implemented as circular
0275: * BRANCH structures using BACK. Simple cases (one character
0276: * per match) are implemented with STAR and PLUS for speed
0277: * and to minimize recursive plunges.
0278: *
0279: * OPENn, CLOSEn
0280: * are numbered at compile time.
0281: */
0282:
0283: /**
0284: * The bytecodes making up the regexp program.
0285: */
0286: char[] program;
0287:
0288: /**
0289: * Whether the regexp matching should be case insensitive.
0290: */
0291: boolean ignoreCase;
0292:
0293: /**
0294: * The number of parenthesized subexpressions in the regexp pattern,
0295: * plus 1 for the match of the whole pattern itself.
0296: */
0297: int npar;
0298:
0299: /**
0300: * <code>true</code> if the pattern must match the beginning of the
0301: * string, so we don't have to waste time matching against all possible
0302: * starting locations in the string.
0303: */
0304: boolean anchored;
0305:
0306: int startChar;
0307: String must;
0308:
0309: /**
0310: * Compiles a new Regexp object from the given regular expression
0311: * pattern.
0312: * <p>
0313: * It takes a certain amount of time to parse and validate a regular
0314: * expression pattern before it can be used to perform matches
0315: * or substitutions. If the caller caches the new Regexp object, that
0316: * parsing time will be saved because the same Regexp can be used with
0317: * respect to many different strings.
0318: *
0319: * @param pat
0320: * The string holding the regular expression pattern.
0321: *
0322: * @throws IllegalArgumentException if the pattern is malformed.
0323: * The detail message for the exception will be set to a
0324: * string indicating how the pattern was malformed.
0325: */
0326: public Regexp(String pat) throws IllegalArgumentException {
0327: compile(pat);
0328: }
0329:
0330: /**
0331: * Compiles a new Regexp object from the given regular expression
0332: * pattern.
0333: *
0334: * @param pat
0335: * The string holding the regular expression pattern.
0336: *
0337: * @param ignoreCase
0338: * If <code>true</code> then this regular expression will
0339: * do case-insensitive matching. If <code>false</code>, then
0340: * the matches are case-sensitive. Regular expressions
0341: * generated by <code>Regexp(String)</code> are case-sensitive.
0342: *
0343: * @throws IllegalArgumentException if the pattern is malformed.
0344: * The detail message for the exception will be set to a
0345: * string indicating how the pattern was malformed.
0346: */
0347: public Regexp(String pat, boolean ignoreCase)
0348: throws IllegalArgumentException {
0349: this .ignoreCase = ignoreCase;
0350: if (ignoreCase) {
0351: pat = pat.toLowerCase();
0352: }
0353: compile(pat);
0354: }
0355:
0356: /**
0357: * Returns the number of parenthesized subexpressions in this regular
0358: * expression, plus one more for this expression itself.
0359: *
0360: * @return The number.
0361: */
0362: public int subspecs() {
0363: return npar;
0364: }
0365:
0366: /**
0367: * Matches the given string against this regular expression.
0368: *
0369: * @param str
0370: * The string to match.
0371: *
0372: * @return The substring of <code>str</code> that matched the entire
0373: * regular expression, or <code>null</code> if the string did not
0374: * match this regular expression.
0375: */
0376: public String match(String str) {
0377: Match m = exec(str, 0, 0);
0378:
0379: if (m == null) {
0380: return null;
0381: }
0382: return str.substring(m.indices[0], m.indices[1]);
0383: }
0384:
0385: /**
0386: * Matches the given string against this regular expression, and computes
0387: * the set of substrings that matched the parenthesized subexpressions.
0388: * <p>
0389: * <code>substrs[0]</code> is set to the range of <code>str</code>
0390: * that matched the entire regular expression.
0391: * <p>
0392: * <code>substrs[1]</code> is set to the range of <code>str</code>
0393: * that matched the first (leftmost) parenthesized subexpression.
0394: * <code>substrs[n]</code> is set to the range that matched the
0395: * <code>n</code><i>th</i> subexpression, and so on.
0396: * <p>
0397: * If subexpression <code>n</code> did not match, then
0398: * <code>substrs[n]</code> is set to <code>null</code>. Not to
0399: * be confused with "", which is a valid value for a
0400: * subexpression that matched 0 characters.
0401: * <p>
0402: * The length that the caller should use when allocating the
0403: * <code>substr</code> array is the return value of
0404: * <code>Regexp.subspecs</code>. The array
0405: * can be shorter (in which case not all the information will
0406: * be returned), or longer (in which case the remainder of the
0407: * elements are initialized to <code>null</code>), or
0408: * <code>null</code> (to ignore the subexpressions).
0409: *
0410: * @param str
0411: * The string to match.
0412: *
0413: * @param substrs
0414: * An array of strings allocated by the caller, and filled in
0415: * with information about the portions of <code>str</code> that
0416: * matched the regular expression. May be <code>null</code>.
0417: *
0418: * @return <code>true</code> if <code>str</code> that matched this
0419: * regular expression, <code>false</code> otherwise.
0420: * If <code>false</code> is returned, then the contents of
0421: * <code>substrs</code> are unchanged.
0422: *
0423: * @see #subspecs
0424: */
0425: public boolean match(String str, String[] substrs) {
0426: Match m = exec(str, 0, 0);
0427:
0428: if (m == null) {
0429: return false;
0430: }
0431: if (substrs != null) {
0432: int max = Math.min(substrs.length, npar);
0433: int i;
0434: int j = 0;
0435: for (i = 0; i < max; i++) {
0436: int start = m.indices[j++];
0437: int end = m.indices[j++];
0438: if (start < 0) {
0439: substrs[i] = null;
0440: } else {
0441: substrs[i] = str.substring(start, end);
0442: }
0443: }
0444: for (; i < substrs.length; i++) {
0445: substrs[i] = null;
0446: }
0447: }
0448: return true;
0449: }
0450:
0451: /**
0452: * Matches the given string against this regular expression, and computes
0453: * the set of substrings that matched the parenthesized subexpressions.
0454: * <p>
0455: * For the indices specified below, the range extends from the character
0456: * at the starting index up to, but not including, the character at the
0457: * ending index.
0458: * <p>
0459: * <code>indices[0]</code> and <code>indices[1]</code> are set to
0460: * starting and ending indices of the range of <code>str</code>
0461: * that matched the entire regular expression.
0462: * <p>
0463: * <code>indices[2]</code> and <code>indices[3]</code> are set to the
0464: * starting and ending indices of the range of <code>str</code> that
0465: * matched the first (leftmost) parenthesized subexpression.
0466: * <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
0467: * are set to the range that matched the <code>n</code><i>th</i>
0468: * subexpression, and so on.
0469: * <p>
0470: * If subexpression <code>n</code> did not match, then
0471: * <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
0472: * are both set to <code>-1</code>.
0473: * <p>
0474: * The length that the caller should use when allocating the
0475: * <code>indices</code> array is twice the return value of
0476: * <code>Regexp.subspecs</code>. The array
0477: * can be shorter (in which case not all the information will
0478: * be returned), or longer (in which case the remainder of the
0479: * elements are initialized to <code>-1</code>), or
0480: * <code>null</code> (to ignore the subexpressions).
0481: *
0482: * @param str
0483: * The string to match.
0484: *
0485: * @param indices
0486: * An array of integers allocated by the caller, and filled in
0487: * with information about the portions of <code>str</code> that
0488: * matched all the parts of the regular expression.
0489: * May be <code>null</code>.
0490: *
0491: * @return <code>true</code> if the string matched the regular expression,
0492: * <code>false</code> otherwise. If <code>false</code> is
0493: * returned, then the contents of <code>indices</code> are
0494: * unchanged.
0495: *
0496: * @see #subspecs
0497: */
0498: public boolean match(String str, int[] indices) {
0499: Match m = exec(str, 0, 0);
0500:
0501: if (m == null) {
0502: return false;
0503: }
0504: if (indices != null) {
0505: int max = Math.min(indices.length, npar * 2);
0506: System.arraycopy(m.indices, 0, indices, 0, max);
0507:
0508: for (int i = max; i < indices.length; i++) {
0509: indices[i] = -1;
0510: }
0511: }
0512: return true;
0513: }
0514:
0515: /**
0516: * Matches a string against a regular expression and replaces the first
0517: * match with the string generated from the substitution parameter.
0518: *
0519: * @param str
0520: * The string to match against this regular expression.
0521: *
0522: * @param subspec
0523: * The substitution parameter, described in <a href=#regsub>
0524: * REGULAR EXPRESSION SUBSTITUTION</a>.
0525: *
0526: * @return The string formed by replacing the first match in
0527: * <code>str</code> with the string generated from
0528: * <code>subspec</code>. If no matches were found, then
0529: * the return value is <code>null</code>.
0530: */
0531: public String sub(String str, String subspec) {
0532: Regsub rs = new Regsub(this , str);
0533: if (rs.nextMatch()) {
0534: StringBuffer sb = new StringBuffer(rs.skipped());
0535: applySubspec(rs, subspec, sb);
0536: sb.append(rs.rest());
0537:
0538: return sb.toString();
0539: } else {
0540: return null;
0541: }
0542: }
0543:
0544: /**
0545: * Matches a string against a regular expression and replaces all
0546: * matches with the string generated from the substitution parameter.
0547: * After each substutition is done, the portions of the string already
0548: * examined, including the newly substituted region, are <b>not</b> checked
0549: * again for new matches -- only the rest of the string is examined.
0550: *
0551: * @param str
0552: * The string to match against this regular expression.
0553: *
0554: * @param subspec
0555: * The substitution parameter, described in <a href=#regsub>
0556: * REGULAR EXPRESSION SUBSTITUTION</a>.
0557: *
0558: * @return The string formed by replacing all the matches in
0559: * <code>str</code> with the strings generated from
0560: * <code>subspec</code>. If no matches were found, then
0561: * the return value is a copy of <code>str</code>.
0562: */
0563: public String subAll(String str, String subspec) {
0564: return sub(str, new SubspecFilter(subspec, true));
0565: }
0566:
0567: /**
0568: * Utility method to give access to the standard substitution algorithm
0569: * used by <code>sub</code> and <code>subAll</code>. Appends to the
0570: * string buffer the string generated by applying the substitution
0571: * parameter to the matched region.
0572: *
0573: * @param rs
0574: * Information about the matched region.
0575: *
0576: * @param subspec
0577: * The substitution parameter.
0578: *
0579: * @param sb
0580: * StringBuffer to which the generated string is appended.
0581: */
0582: public static void applySubspec(Regsub rs, String subspec,
0583: StringBuffer sb) {
0584: try {
0585: int len = subspec.length();
0586: for (int i = 0; i < len; i++) {
0587: char ch = subspec.charAt(i);
0588: switch (ch) {
0589: case '&': {
0590: sb.append(rs.matched());
0591: break;
0592: }
0593: case '\\': {
0594: i++;
0595: ch = subspec.charAt(i);
0596: if ((ch >= '0') && (ch <= '9')) {
0597: String match = rs.submatch(ch - '0');
0598: if (match != null) {
0599: sb.append(match);
0600: }
0601: break;
0602: }
0603: // fall through.
0604: }
0605: default: {
0606: sb.append(ch);
0607: }
0608: }
0609: }
0610: } catch (IndexOutOfBoundsException e) {
0611: /*
0612: * Ignore malformed substitution pattern.
0613: * Return string matched so far.
0614: */
0615: }
0616: }
0617:
0618: public String sub(String str, Filter rf) {
0619: Regsub rs = new Regsub(this , str);
0620: if (rs.nextMatch() == false) {
0621: return str;
0622: }
0623:
0624: StringBuffer sb = new StringBuffer();
0625: do {
0626: sb.append(rs.skipped());
0627: if (rf.filter(rs, sb) == false) {
0628: break;
0629: }
0630: } while (rs.nextMatch());
0631: sb.append(rs.rest());
0632: return sb.toString();
0633: }
0634:
0635: /**
0636: * This interface is used by the <code>Regexp</code> class to generate
0637: * the replacement string for each pattern match found in the source
0638: * string.
0639: *
0640: * @author Colin Stevens (colin.stevens@sun.com)
0641: * @version 1.7, 99/10/14
0642: */
0643: public interface Filter {
0644: /**
0645: * Given the current state of the match, generate the replacement
0646: * string. This method will be called for each match found in
0647: * the source string, unless this filter decides not to handle any
0648: * more matches.
0649: * <p>
0650: * The implementation can use whatever rules it chooses
0651: * to generate the replacement string. For example, here is an
0652: * example of a filter that replaces the first <b>5</b>
0653: * occurrences of "%XX" in a string with the ASCII character
0654: * represented by the hex digits "XX":
0655: * <pre>
0656: * String str = ...;
0657: *
0658: * Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
0659: *
0660: * Regexp.Filter rf = new Regexp.Filter() {
0661: * int count = 5;
0662: * public boolean filter(Regsub rs, StringBuffer sb) {
0663: * String match = rs.matched();
0664: * int hi = Character.digit(match.charAt(1), 16);
0665: * int lo = Character.digit(match.charAt(2), 16);
0666: * sb.append((char) ((hi << 4) | lo));
0667: * return (--count > 0);
0668: * }
0669: * }
0670: *
0671: * String result = re.sub(str, rf);
0672: * </pre>
0673: *
0674: * @param rs
0675: * <code>Regsub</code> containing the state of the current
0676: * match.
0677: *
0678: * @param sb
0679: * The string buffer that this filter should append the
0680: * generated string to. This string buffer actually
0681: * contains the results the calling <code>Regexp</code> has
0682: * generated up to this point.
0683: *
0684: * @return <code>false</code> if no further matches should be
0685: * considered in this string, <code>true</code> to allow
0686: * <code>Regexp</code> to continue looking for further
0687: * matches.
0688: */
0689: public boolean filter(Regsub rs, StringBuffer sb);
0690: }
0691:
0692: private static class SubspecFilter implements Filter {
0693: String subspec;
0694: boolean all;
0695:
0696: public SubspecFilter(String subspec, boolean all) {
0697: this .subspec = subspec;
0698: this .all = all;
0699: }
0700:
0701: public boolean filter(Regsub rs, StringBuffer sb) {
0702: applySubspec(rs, subspec, sb);
0703: return all;
0704: }
0705: }
0706:
0707: /**
0708: * Returns a string representation of this compiled regular
0709: * expression. The format of the string representation is a
0710: * symbolic dump of the bytecodes.
0711: *
0712: * @return A string representation of this regular expression.
0713: */
0714: public String toString() {
0715: StringBuffer sb = new StringBuffer();
0716:
0717: sb.append("# subs: " + npar + "\n");
0718: sb.append("anchor: " + anchored + "\n");
0719: sb.append("start: " + (char) startChar + "\n");
0720: sb.append("must: " + must + "\n");
0721:
0722: for (int i = 0; i < program.length;) {
0723: sb.append(i + ":\t");
0724: int op = program[i];
0725: if (op >= CLOSE) {
0726: sb.append("CLOSE" + (op - CLOSE));
0727: } else if (op >= OPEN) {
0728: sb.append("OPEN" + (op - OPEN));
0729: } else {
0730: sb.append(opnames[op]);
0731: }
0732: int line;
0733: int offset = (int) program[i + 1];
0734: if (offset == 0) {
0735: sb.append('\t');
0736: } else if (op == BACK) {
0737: sb.append("\t-" + offset + "," + (i - offset));
0738: } else {
0739: sb.append("\t+" + offset + "," + (i + offset));
0740: }
0741:
0742: if ((op == ANYOF) || (op == ANYBUT) || (op == EXACTLY)) {
0743: sb.append("\t'");
0744: sb.append(program, i + 3, program[i + 2]);
0745: sb.append("'");
0746: i += 3 + program[i + 2];
0747: } else {
0748: i += 2;
0749: }
0750: sb.append('\n');
0751: }
0752: return sb.toString();
0753: }
0754:
0755: private void compile(String exp) throws IllegalArgumentException {
0756: Compiler rcstate = new Compiler();
0757: rcstate.parse = exp.toCharArray();
0758: rcstate.off = 0;
0759: rcstate.npar = 1;
0760: rcstate.code = new StringBuffer();
0761:
0762: rcstate.reg(false);
0763:
0764: program = rcstate.code.toString().toCharArray();
0765: npar = rcstate.npar;
0766: startChar = -1;
0767:
0768: /* optimize */
0769: if (program[rcstate.regnext(0)] == END) {
0770: if (program[2] == BOL) {
0771: anchored = true;
0772: } else if (program[2] == EXACTLY) {
0773: startChar = (int) program[5];
0774: }
0775: }
0776:
0777: /*
0778: * If there's something expensive in the r.e., find the
0779: * longest literal string that must appear and make it the
0780: * regmust. Resolve ties in favor of later strings, since
0781: * the regstart check works with the beginning of the r.e.
0782: * and avoiding duplication strengthens checking. Not a
0783: * strong reason, but sufficient in the absence of others.
0784: */
0785: /*
0786: if ((rcstate.flagp & Compiler.SPSTART) != 0) {
0787: int index = -1;
0788: int longest = 0;
0789:
0790: for (scan = 0; scan < program.length; ) {
0791: switch (program[scan]) {
0792: case EXACTLY:
0793: int length = program[scan + 2];
0794: if (length > longest) {
0795: index = scan;
0796: longest = length;
0797: }
0798: // fall through;
0799:
0800: case ANYOF:
0801: case ANYBUT:
0802: scan += 3 + program[scan + 2];
0803: break;
0804:
0805: default:
0806: scan += 2;
0807: break;
0808: }
0809: }
0810: if (longest > 0) {
0811: must = new String(program, index + 3, longest);
0812: }
0813: }
0814: */
0815:
0816: }
0817:
0818: Match exec(String str, int start, int off) {
0819: if (ignoreCase) {
0820: str = str.toLowerCase();
0821: }
0822:
0823: Match match = new Match();
0824:
0825: match.program = program;
0826:
0827: /* Mark beginning of line for ^ . */
0828: match.str = str;
0829: match.bol = start;
0830: match.length = str.length();
0831:
0832: match.indices = new int[npar * 2];
0833:
0834: if (anchored) {
0835: /* Simplest case: anchored match need be tried only once. */
0836: if (match.regtry(off)) {
0837: return match;
0838: }
0839: } else if (startChar >= 0) {
0840: /* We know what char it must start with. */
0841: while (off < match.length) {
0842: off = str.indexOf(startChar, off);
0843: if (off < 0) {
0844: break;
0845: }
0846: if (match.regtry(off)) {
0847: return match;
0848: }
0849: off++;
0850: }
0851: } else {
0852: /* Messy cases: unanchored match. */
0853: do {
0854: if (match.regtry(off)) {
0855: return match;
0856: }
0857: } while (off++ < match.length);
0858: }
0859: return null;
0860: }
0861:
0862: static class Compiler {
0863: char[] parse;
0864: int off;
0865: int npar;
0866: StringBuffer code;
0867: int flagp;
0868:
0869: static final String META = "^$.[()|?+*\\";
0870: static final String MULT = "*+?";
0871:
0872: static final int WORST = 00; /* Worst case. */
0873: static final int HASWIDTH = 01; /* Known never to match null string. */
0874: static final int SIMPLE = 02; /* Simple enough to be STAR/PLUS operand. */
0875: static final int SPSTART = 04; /* Starts with * or +. */
0876:
0877: /*
0878: - reg - regular expression, i.e. main body or parenthesized thing
0879: *
0880: * Caller must absorb opening parenthesis.
0881: *
0882: * Combining parenthesis handling with the base level of regular expression
0883: * is a trifle forced, but the need to tie the tails of the branches to what
0884: * follows makes it hard to avoid.
0885: */
0886: int reg(boolean paren) throws IllegalArgumentException {
0887: int netFlags = HASWIDTH;
0888: int parno = 0;
0889:
0890: int ret = -1;
0891: if (paren) {
0892: parno = npar++;
0893: if (npar >= NSUBEXP) {
0894: throw new IllegalArgumentException("too many ()");
0895: }
0896: ret = regnode((char) (OPEN + parno));
0897: }
0898:
0899: /* Pick up the branches, linking them together. */
0900: int br = regbranch();
0901: if (ret >= 0) {
0902: regtail(ret, br);
0903: } else {
0904: ret = br;
0905: }
0906:
0907: if ((flagp & HASWIDTH) == 0) {
0908: netFlags &= ~HASWIDTH;
0909: }
0910: netFlags |= (flagp & SPSTART);
0911: while ((off < parse.length) && (parse[off] == '|')) {
0912: off++;
0913: br = regbranch();
0914: regtail(ret, br);
0915: if ((flagp & HASWIDTH) == 0) {
0916: netFlags &= ~HASWIDTH;
0917: }
0918: netFlags |= (flagp & SPSTART);
0919: }
0920:
0921: /* Make a closing node, and hook it on the end. */
0922: int ender = regnode((paren) ? (char) (CLOSE + parno) : END);
0923: regtail(ret, ender);
0924:
0925: /* Hook the tails of the branches to the closing node. */
0926: for (br = ret; br >= 0; br = regnext(br)) {
0927: regoptail(br, ender);
0928: }
0929:
0930: /* Check for proper termination. */
0931: if (paren
0932: && ((off >= parse.length) || (parse[off++] != ')'))) {
0933: throw new IllegalArgumentException("missing )");
0934: } else if ((paren == false) && (off < parse.length)) {
0935: throw new IllegalArgumentException("unexpected )");
0936: }
0937:
0938: flagp = netFlags;
0939: return ret;
0940: }
0941:
0942: /*
0943: - regbranch - one alternative of an | operator
0944: *
0945: * Implements the concatenation operator.
0946: */
0947: int regbranch() throws IllegalArgumentException {
0948: int netFlags = WORST; /* Tentatively. */
0949:
0950: int ret = regnode(BRANCH);
0951: int chain = -1;
0952: while ((off < parse.length) && (parse[off] != '|')
0953: && (parse[off] != ')')) {
0954: int latest = regpiece();
0955: netFlags |= flagp & HASWIDTH;
0956: if (chain < 0) { /* First piece. */
0957: netFlags |= (flagp & SPSTART);
0958: } else {
0959: regtail(chain, latest);
0960: }
0961: chain = latest;
0962: }
0963: if (chain < 0) { /* Loop ran zero times. */
0964: regnode(NOTHING);
0965: }
0966:
0967: flagp = netFlags;
0968: return ret;
0969: }
0970:
0971: /*
0972: - regpiece - something followed by possible [*+?]
0973: *
0974: * Note that the branching code sequences used for ? and the general cases
0975: * of * and + are somewhat optimized: they use the same NOTHING node as
0976: * both the endmarker for their branch list and the body of the last branch.
0977: * It might seem that this node could be dispensed with entirely, but the
0978: * endmarker role is not redundant.
0979: */
0980: int regpiece() throws IllegalArgumentException {
0981: int netFlags;
0982:
0983: int ret = regatom();
0984:
0985: if ((off >= parse.length) || (isMult(parse[off]) == false)) {
0986: return ret;
0987: }
0988: char op = parse[off];
0989:
0990: if (((flagp & HASWIDTH) == 0) && (op != '?')) {
0991: throw new IllegalArgumentException(
0992: "*+ operand could be empty");
0993: }
0994: netFlags = (op != '+') ? (WORST | SPSTART)
0995: : (WORST | HASWIDTH);
0996:
0997: if ((op == '*') && ((flagp & SIMPLE) != 0)) {
0998: reginsert(STAR, ret);
0999: } else if (op == '*') {
1000: /* Emit x* as (x&|), where & means "self". */
1001: reginsert(BRANCH, ret); /* Either x */
1002: regoptail(ret, regnode(BACK)); /* and loop */
1003: regoptail(ret, ret); /* back */
1004: regtail(ret, regnode(BRANCH)); /* or */
1005: regtail(ret, regnode(NOTHING)); /* null. */
1006: } else if ((op == '+') && ((flagp & SIMPLE) != 0)) {
1007: reginsert(PLUS, ret);
1008: } else if (op == '+') {
1009: /* Emit x+ as x(&|), where & means "self". */
1010: int next = regnode(BRANCH); /* Either */
1011: regtail(ret, next);
1012: regtail(regnode(BACK), ret); /* loop back */
1013: regtail(next, regnode(BRANCH)); /* or */
1014: regtail(ret, regnode(NOTHING)); /* null. */
1015: } else if (op == '?') {
1016: /* Emit x? as (x|) */
1017: reginsert(BRANCH, ret); /* Either x */
1018: regtail(ret, regnode(BRANCH)); /* or */
1019: int next = regnode(NOTHING); /* null. */
1020: regtail(ret, next);
1021: regoptail(ret, next);
1022: }
1023: off++;
1024: if ((off < parse.length) && isMult(parse[off])) {
1025: throw new IllegalArgumentException("nested *?+");
1026: }
1027:
1028: flagp = netFlags;
1029: return ret;
1030: }
1031:
1032: /*
1033: - regatom - the lowest level
1034: *
1035: * Optimization: gobbles an entire sequence of ordinary characters so that
1036: * it can turn them into a single node, which is smaller to store and
1037: * faster to run. Backslashed characters are exceptions, each becoming a
1038: * separate node; the code is simpler that way and it's not worth fixing.
1039: */
1040: int regatom() throws IllegalArgumentException {
1041: int netFlags = WORST; /* Tentatively. */
1042: int ret;
1043:
1044: switch (parse[off++]) {
1045: case '^':
1046: ret = regnode(BOL);
1047: break;
1048: case '$':
1049: ret = regnode(EOL);
1050: break;
1051: case '.':
1052: ret = regnode(ANY);
1053: netFlags |= (HASWIDTH | SIMPLE);
1054: break;
1055: case '[': {
1056: try {
1057: if (parse[off] == '^') {
1058: ret = regnode(ANYBUT);
1059: off++;
1060: } else {
1061: ret = regnode(ANYOF);
1062: }
1063:
1064: int pos = reglen();
1065: regc('\0');
1066:
1067: if ((parse[off] == ']') || (parse[off] == '-')) {
1068: regc(parse[off++]);
1069: }
1070: while (parse[off] != ']') {
1071: if (parse[off] == '-') {
1072: off++;
1073: if (parse[off] == ']') {
1074: regc('-');
1075: } else {
1076: int start = parse[off - 2];
1077: int end = parse[off++];
1078: if (start > end) {
1079: throw new IllegalArgumentException(
1080: "invalid [] range");
1081: }
1082: for (int i = start + 1; i <= end; i++) {
1083: regc((char) i);
1084: }
1085: }
1086: } else {
1087: regc(parse[off++]);
1088: }
1089: }
1090: regset(pos, (char) (reglen() - pos - 1));
1091: off++;
1092: netFlags |= HASWIDTH | SIMPLE;
1093: } catch (ArrayIndexOutOfBoundsException e) {
1094: throw new IllegalArgumentException("missing ]");
1095: }
1096: break;
1097: }
1098: case '(':
1099: ret = reg(true);
1100: netFlags |= (flagp & (HASWIDTH | SPSTART));
1101: break;
1102: case '|':
1103: case ')':
1104: throw new IllegalArgumentException("internal urp");
1105: case '?':
1106: case '+':
1107: case '*':
1108: throw new IllegalArgumentException(
1109: "?+* follows nothing");
1110: case '\\':
1111: if (off >= parse.length) {
1112: throw new IllegalArgumentException("trailing \\");
1113: }
1114: ret = regnode(EXACTLY);
1115: regc((char) 1);
1116: regc(parse[off++]);
1117: netFlags |= HASWIDTH | SIMPLE;
1118: break;
1119: default: {
1120: off--;
1121: int end;
1122: for (end = off; end < parse.length; end++) {
1123: if (META.indexOf(parse[end]) >= 0) {
1124: break;
1125: }
1126: }
1127: if ((end > off + 1) && (end < parse.length)
1128: && isMult(parse[end])) {
1129: end--; /* Back off clear of ?+* operand. */
1130: }
1131: netFlags |= HASWIDTH;
1132: if (end == off + 1) {
1133: netFlags |= SIMPLE;
1134: }
1135: ret = regnode(EXACTLY);
1136: regc((char) (end - off));
1137: for (; off < end; off++) {
1138: regc(parse[off]);
1139: }
1140: }
1141: break;
1142: }
1143:
1144: flagp = netFlags;
1145: return ret;
1146: }
1147:
1148: /*
1149: - regnode - emit a node
1150: */
1151: int regnode(char op) {
1152: int ret = code.length();
1153: code.append(op);
1154: code.append('\0');
1155:
1156: return ret;
1157: }
1158:
1159: /*
1160: - regc - emit (if appropriate) a byte of code
1161: */
1162: void regc(char b) {
1163: code.append(b);
1164: }
1165:
1166: int reglen() {
1167: return code.length();
1168: }
1169:
1170: void regset(int pos, char ch) {
1171: code.setCharAt(pos, ch);
1172: }
1173:
1174: /*
1175: - reginsert - insert an operator in front of already-emitted operand
1176: *
1177: * Means relocating the operand.
1178: */
1179: void reginsert(char op, int pos) {
1180: char[] tmp = new char[] { op, '\0' };
1181: code.insert(pos, tmp);
1182: }
1183:
1184: /*
1185: - regtail - set the next-pointer at the end of a node chain
1186: */
1187: void regtail(int pos, int val) {
1188: /* Find last node. */
1189:
1190: int scan = pos;
1191: while (true) {
1192: int tmp = regnext(scan);
1193: if (tmp < 0) {
1194: break;
1195: }
1196: scan = tmp;
1197: }
1198:
1199: int offset = (code.charAt(scan) == BACK) ? scan - val : val
1200: - scan;
1201: code.setCharAt(scan + 1, (char) offset);
1202: }
1203:
1204: /*
1205: - regoptail - regtail on operand of first argument; nop if operandless
1206: */
1207: void regoptail(int pos, int val) {
1208: if ((pos < 0) || (code.charAt(pos) != BRANCH)) {
1209: return;
1210: }
1211: regtail(pos + 2, val);
1212: }
1213:
1214: /*
1215: - regnext - dig the "next" pointer out of a node
1216: */
1217: int regnext(int pos) {
1218: int offset = code.charAt(pos + 1);
1219: if (offset == 0) {
1220: return -1;
1221: }
1222: if (code.charAt(pos) == BACK) {
1223: return pos - offset;
1224: } else {
1225: return pos + offset;
1226: }
1227: }
1228:
1229: static boolean isMult(char ch) {
1230: return (ch == '*') || (ch == '+') || (ch == '?');
1231: }
1232:
1233: }
1234:
1235: static class Match {
1236: char[] program;
1237:
1238: String str;
1239: int bol;
1240: int input;
1241: int length;
1242:
1243: int[] indices;
1244:
1245: boolean regtry(int off) {
1246: this .input = off;
1247:
1248: for (int i = 0; i < indices.length; i++) {
1249: indices[i] = -1;
1250: }
1251:
1252: if (regmatch(0)) {
1253: indices[0] = off;
1254: indices[1] = input;
1255: return true;
1256: } else {
1257: return false;
1258: }
1259: }
1260:
1261: /*
1262: - regmatch - main matching routine
1263: *
1264: * Conceptually the strategy is simple: check to see whether the current
1265: * node matches, call self recursively to see whether the rest matches,
1266: * and then act accordingly. In practice we make some effort to avoid
1267: * recursion, in particular by going through "ordinary" nodes (that don't
1268: * need to know whether the rest of the match failed) by a loop instead of
1269: * by recursion.
1270: */
1271: boolean regmatch(int scan) {
1272: while (true) {
1273: int next = regnext(scan);
1274: int op = program[scan];
1275: switch (op) {
1276: case BOL:
1277: if (input != bol) {
1278: return false;
1279: }
1280: break;
1281:
1282: case EOL:
1283: if (input != length) {
1284: return false;
1285: }
1286: break;
1287:
1288: case ANY:
1289: if (input >= length) {
1290: return false;
1291: }
1292: input++;
1293: break;
1294:
1295: case EXACTLY: {
1296: if (compare(scan) == false) {
1297: return false;
1298: }
1299: break;
1300: }
1301:
1302: case ANYOF:
1303: if (input >= length) {
1304: return false;
1305: }
1306: if (present(scan) == false) {
1307: return false;
1308: }
1309: input++;
1310: break;
1311:
1312: case ANYBUT:
1313: if (input >= length) {
1314: return false;
1315: }
1316: if (present(scan)) {
1317: return false;
1318: }
1319: input++;
1320: break;
1321:
1322: case NOTHING:
1323: case BACK:
1324: break;
1325:
1326: case BRANCH: {
1327: if (program[next] != BRANCH) {
1328: next = scan + 2;
1329: } else {
1330: do {
1331: int save = input;
1332: if (regmatch(scan + 2)) {
1333: return true;
1334: }
1335: input = save;
1336: scan = regnext(scan);
1337: } while ((scan >= 0)
1338: && (program[scan] == BRANCH));
1339: return false;
1340: }
1341: break;
1342: }
1343:
1344: case STAR:
1345: case PLUS: {
1346: /*
1347: * Lookahead to avoid useless match attempts
1348: * when we know what character comes next.
1349: */
1350:
1351: int ch = -1;
1352: if (program[next] == EXACTLY) {
1353: ch = program[next + 3];
1354: }
1355:
1356: int min = (op == STAR) ? 0 : 1;
1357: int save = input;
1358: int no = regrepeat(scan + 2);
1359:
1360: while (no >= min) {
1361: /* If it could work, try it. */
1362: if ((ch < 0)
1363: || ((input < length) && (str
1364: .charAt(input) == ch))) {
1365: if (regmatch(next)) {
1366: return true;
1367: }
1368: }
1369: /* Couldn't or didn't -- back up. */
1370: no--;
1371: input = save + no;
1372: }
1373: return false;
1374: }
1375:
1376: case END:
1377: return true;
1378:
1379: default:
1380: if (op >= CLOSE) {
1381: int no = op - CLOSE;
1382: int save = input;
1383:
1384: if (regmatch(next)) {
1385: /*
1386: * Don't set endp if some later
1387: * invocation of the same parentheses
1388: * already has.
1389: */
1390: if (indices[no * 2 + 1] <= 0) {
1391: indices[no * 2 + 1] = save;
1392: }
1393: return true;
1394: }
1395: } else if (op >= OPEN) {
1396: int no = op - OPEN;
1397: int save = input;
1398:
1399: if (regmatch(next)) {
1400: /*
1401: * Don't set startp if some later invocation of the
1402: * same parentheses already has.
1403: */
1404: if (indices[no * 2] <= 0) {
1405: indices[no * 2] = save;
1406: }
1407: return true;
1408: }
1409: }
1410: return false;
1411: }
1412: scan = next;
1413: }
1414: }
1415:
1416: boolean compare(int scan) {
1417: int count = program[scan + 2];
1418: if (input + count > length) {
1419: return false;
1420: }
1421: int start = scan + 3;
1422: int end = start + count;
1423: for (int i = start; i < end; i++) {
1424: if (str.charAt(input++) != program[i]) {
1425: return false;
1426: }
1427: }
1428: return true;
1429: }
1430:
1431: boolean present(int scan) {
1432: char ch = str.charAt(input);
1433:
1434: int count = program[scan + 2];
1435: int start = scan + 3;
1436: int end = start + count;
1437:
1438: for (int i = start; i < end; i++) {
1439: if (program[i] == ch) {
1440: return true;
1441: }
1442: }
1443: return false;
1444: }
1445:
1446: /*
1447: - regrepeat - repeatedly match something simple, report how many
1448: */
1449: int regrepeat(int scan) {
1450: int op = program[scan];
1451: int count = 0;
1452:
1453: switch (op) {
1454: case ANY:
1455: // '.*' matches all the way to the end.
1456:
1457: count = length - input;
1458: input = length;
1459: break;
1460:
1461: case EXACTLY: {
1462: // 'g*' matches all the following 'g' characters.
1463:
1464: char ch = program[scan + 3];
1465: while ((input < length) && (str.charAt(input) == ch)) {
1466: input++;
1467: count++;
1468: }
1469: break;
1470: }
1471:
1472: case ANYOF:
1473: // [abc]*
1474:
1475: while ((input < length) && present(scan)) {
1476: input++;
1477: count++;
1478: }
1479: break;
1480:
1481: case ANYBUT:
1482: while ((input < length) && !present(scan)) {
1483: input++;
1484: count++;
1485: }
1486: break;
1487:
1488: }
1489: return count;
1490: }
1491:
1492: /*
1493: - regnext - dig the "next" pointer out of a node
1494: */
1495: int regnext(int scan) {
1496: int offset = program[scan + 1];
1497: if (program[scan] == BACK) {
1498: return scan - offset;
1499: } else {
1500: return scan + offset;
1501: }
1502: }
1503: }
1504: }
|