0001: /*
0002: * Regexp.java
0003: *
0004: * Brazil project web application Framework,
0005: * export version: 1.1
0006: * Copyright (c) 1999-2000 Sun Microsystems, Inc.
0007: *
0008: * Sun Public License Notice
0009: *
0010: * The contents of this file are subject to the Sun Public License Version
0011: * 1.0 (the "License"). You may not use this file except in compliance with
0012: * the License. A copy of the License is included as the file "license.terms",
0013: * and also available at http://www.sun.com/
0014: *
0015: * The Original Code is from:
0016: * Brazil project web application Framework release 1.1.
0017: * The Initial Developer of the Original Code is: cstevens.
0018: * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
0019: * All Rights Reserved.
0020: *
0021: * Contributor(s): cstevens, suhler.
0022: *
0023: * Version: 1.10
0024: * Created by cstevens on 99/08/10
0025: * Last modified by suhler on 00/11/06 10:45:53
0026: */
0027:
0028: package sunlabs.brazil.util.regexp;
0029:
0030: /**
0031: * The <code>Regexp</code> class can be used to match a pattern against a
0032: * string and optionally replace the matched parts with new strings.
0033: * <p>
0034: * Regular expressions were implemented by translating Henry Spencer's
0035: * regular expression package for <a href="http://www.scriptics.com">tcl8.0</a>.
0036: * Much of the description below is copied verbatim from the tcl8.0 regsub
0037: * manual entry.
0038: * <hr>
0039: * REGULAR EXPRESSIONS
0040: * <p>
0041: * A regular expression is zero or more <code>branches</code>, separated by
0042: * "|". It matches anything that matches one of the branches.
0043: * <p>
0044: * A branch is zero or more <code>pieces</code>, concatenated.
0045: * It matches a match for the first piece, followed by a match for the
0046: * second piece, etc.
0047: * <p>
0048: * A piece is an <code>atom</code>, possibly followed by "*", "+", or
0049: * "?". <ul>
0050: * <li> An atom followed by "*" matches a sequence of 0 or more matches of
0051: * the atom.
0052: * <li> An atom followed by "+" matches a sequence of 1 or more matches of
0053: * the atom.
0054: * <li> An atom followed by "?" matches either 0 or 1 matches of the atom.
0055: * </ul>
0056: * <p>
0057: * An atom is <ul>
0058: * <li> a regular expression in parentheses (matching a match for the
0059: * regular expression)
0060: * <li> a <code>range</code> (see below)
0061: * <li> "." (matching any single character)
0062: * <li> "^" (matching the null string at the beginning of the input string)
0063: * <li> "$" (matching the null string at the end of the input string)
0064: * <li> a "\" followed by a single character (matching that character)
0065: * <li> a single character with no other significance (matching that
0066: * character).
0067: * </ul>
0068: * <p>
0069: * A <code>range</code> is a sequence of characters enclosed in "[]".
0070: * The range normally matches any single character from the sequence.
0071: * If the sequence begins with "^", the range matches any single character
0072: * <b>not</b> from the rest of the sequence.
0073: * If two characters in the sequence are separated by "-", this is shorthand
0074: * for the full list of characters between them (e.g. "[0-9]" matches any
0075: * decimal digit). To include a literal "]" in the sequence, make it the
0076: * first character (following a possible "^"). To include a literal "-",
0077: * make it the first or last character.
0078: * <p>
0079: * In general there may be more than one way to match a regular expression
0080: * to an input string. For example, consider the command
0081: * <pre>
0082: * String[] match = new String[2];
0083: * Regexp.match("(a*)b*", "aabaaabb", match);
0084: * </pre>
0085: * Considering only the rules given so far, <code>match[0]</code> and
0086: * <code>match[1]</code> could end up with the values <ul>
0087: * <li> "aabb" and "aa"
0088: * <li> "aaab" and "aaa"
0089: * <li> "ab" and "a"
0090: * </ul>
0091: * or any of several other combinations. To resolve this potential ambiguity,
0092: * Regexp chooses among alternatives using the rule "first then longest".
0093: * In other words, it considers the possible matches in order working
0094: * from left to right across the input string and the pattern, and it
0095: * attempts to match longer pieces of the input string before shorter
0096: * ones. More specifically, the following rules apply in decreasing
0097: * order of priority: <ol>
0098: * <li> If a regular expression could match two different parts of an input
0099: * string then it will match the one that begins earliest.
0100: * <li> If a regular expression contains "|" operators then the
0101: * leftmost matching sub-expression is chosen.
0102: * <li> In "*", "+", and "?" constructs, longer matches are chosen in
0103: * preference to shorter ones.
0104: * <li>
0105: * In sequences of expression components the components are considered
0106: * from left to right.
0107: * </ol>
0108: * <p>
0109: * In the example from above, "(a*)b*" therefore matches exactly "aab"; the
0110: * "(a*)" portion of the pattern is matched first and it consumes the leading
0111: * "aa", then the "b*" portion of the pattern consumes the next "b". Or,
0112: * consider the following example:
0113: * <pre>
0114: * String match = new String[3];
0115: * Regexp.match("(ab|a)(b*)c", "abc", match);
0116: * </pre>
0117: * After this command, <code>match[0]</code> will be "abc",
0118: * <code>match[1]</code> will be "ab", and <code>match[2]</code> will be an
0119: * empty string.
0120: * Rule 4 specifies that the "(ab|a)" component gets first shot at the input
0121: * string and Rule 2 specifies that the "ab" sub-expression
0122: * is checked before the "a" sub-expression.
0123: * Thus the "b" has already been claimed before the "(b*)"
0124: * component is checked and therefore "(b*)" must match an empty string.
0125: * <hr>
0126: * <a name=regsub></a>
0127: * REGULAR EXPRESSION SUBSTITUTION
0128: * <p>
0129: * Regular expression substitution matches a string against a regular
0130: * expression, transforming the string by replacing the matched region(s)
0131: * with new substring(s).
0132: * <p>
0133: * What gets substituted into the result is controlled by a
0134: * <code>subspec</code>. The subspec is a formatting string that specifies
0135: * what portions of the matched region should be substituted into the
0136: * result.
0137: * <ul>
0138: * <li> "&" or "\0" is replaced with a copy of the entire matched region.
0139: * <li> "\<code>n</code>", where <code>n</code> is a digit from 1 to 9,
0140: * is replaced with a copy of the <code>n</code><i>th</i> subexpression.
0141: * <li> "\&" or "\\" are replaced with just "&" or "\" to escape their
0142: * special meaning.
0143: * <li> any other character is passed through.
0144: * </ul>
0145: * In the above, strings like "\2" represents the two characters
0146: * <code>backslash</code> and "2", not the Unicode character 0002.
0147: * <hr>
0148: * Here is an example of how to use Regexp
0149: * <pre>
0150: *
0151: * public static void
0152: * main(String[] args)
0153: * throws Exception
0154: * {
0155: * Regexp re;
0156: * String[] matches;
0157: * String s;
0158: *
0159: * /*
0160: * * A regular expression to match the first line of a HTTP request.
0161: * *
0162: * * 1. ^ - starting at the beginning of the line
0163: * * 2. ([A-Z]+) - match and remember some upper case characters
0164: * * 3. [ \t]+ - skip blank space
0165: * * 4. ([^ \t]*) - match and remember up to the next blank space
0166: * * 5. [ \t]+ - skip more blank space
0167: * * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1
0168: * * 7. $ - end of string - no chars left.
0169: * */
0170: *
0171: * s = "GET http://a.b.com:1234/index.html HTTP/1.1";
0172: *
0173: * re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$");
0174: * matches = new String[4];
0175: * if (re.match(s, matches)) {
0176: * System.out.println("METHOD " + matches[1]);
0177: * System.out.println("URL " + matches[2]);
0178: * System.out.println("VERSION " + matches[3]);
0179: * }
0180: *
0181: * /*
0182: * * A regular expression to extract some simple comma-separated data,
0183: * * reorder some of the columns, and discard column 2.
0184: * */
0185: *
0186: * s = "abc,def,ghi,klm,nop,pqr";
0187: *
0188: * re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)");
0189: * System.out.println(re.sub(s, "\\3,\\1,\\4"));
0190: * }
0191: * </pre>
0192: *
0193: * @author Colin Stevens (colin.stevens@sun.com)
0194: * @version 1.10, 00/11/06
0195: * @see Regsub
0196: */
0197:
0198: public class Regexp implements java.io.Serializable {
0199: public static void main(String[] args) throws Exception {
0200: if ((args.length == 2) && (args[0].equals("compile"))) {
0201: System.out.println(new Regexp(args[1]));
0202: } else if ((args.length == 3) && (args[0].equals("match"))) {
0203: Regexp r = new Regexp(args[1]);
0204: String[] substrs = new String[r.subspecs()];
0205: boolean match = r.match(args[2], substrs);
0206: System.out.println("match:\t" + match);
0207: for (int i = 0; i < substrs.length; i++) {
0208: System.out.println((i + 1) + ":\t" + substrs[i]);
0209: }
0210: } else if ((args.length == 4) && (args[0].equals("sub"))) {
0211: Regexp r = new Regexp(args[1]);
0212: System.out.println(r.subAll(args[2], args[3]));
0213: } else {
0214: System.out.println("usage:");
0215: System.out.println("\tRegexp match <pattern> <string>");
0216: System.out
0217: .println("\tRegexp sub <pattern> <string> <subspec>");
0218: System.out.println("\tRegexp compile <pattern>");
0219: }
0220: }
0221:
0222: /*
0223: * Structure for regexp "program". This is essentially a linear encoding
0224: * of a nondeterministic finite-state machine (aka syntax charts or
0225: * "railroad normal form" in parsing technology). Each node is an opcode
0226: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
0227: * all nodes except BRANCH implement concatenation; a "next" pointer with
0228: * a BRANCH on both ends of it is connecting two alternatives. (Here we
0229: * have one of the subtle syntax dependencies: an individual BRANCH (as
0230: * opposed to a collection of them) is never concatenated with anything
0231: * because of operator precedence.) The operand of some types of node is
0232: * a literal string; for others, it is a node leading into a sub-FSM. In
0233: * particular, the operand of a BRANCH node is the first node of the branch.
0234: * (NB this is *not* a tree structure: the tail of the branch connects
0235: * to the thing following the set of BRANCHes.) The opcodes are:
0236: */
0237:
0238: static final int NSUBEXP = 100;
0239:
0240: /* definition number opnd? meaning */
0241:
0242: static final char END = 0; /* no End of program. */
0243: static final char BOL = 1; /* no Match "" at beginning of line. */
0244: static final char EOL = 2; /* no Match "" at end of line. */
0245: static final char ANY = 3; /* no Match any one character. */
0246: static final char ANYOF = 4; /* str Match any character in this string. */
0247: static final char ANYBUT = 5; /* str Match any character not in this string. */
0248: static final char BRANCH = 6; /* node Match this alternative, or the next... */
0249: static final char BACK = 7; /* no Match "", "next" ptr points backward. */
0250: static final char EXACTLY = 8; /* str Match this string. */
0251: static final char NOTHING = 9; /* no Match empty string. */
0252: static final char STAR = 10; /* node Match this (simple) thing 0 or more times. */
0253: static final char PLUS = 11; /* node Match this (simple) thing 1 or more times. */
0254: static final char OPEN = 20; /* no Mark this point in input as start of #n. */
0255: /* OPEN+1 is number 1, etc. */
0256: static final char CLOSE = (char) (OPEN + NSUBEXP);
0257: /* no Analogous to OPEN. */
0258: static final String[] opnames = { "END", "BOL", "EOL", "ANY",
0259: "ANYOF", "ANYBUT", "BRANCH", "BACK", "EXACTLY", "NOTHING",
0260: "STAR", "PLUS" };
0261:
0262: /*
0263: * A node is one char of opcode followed by one char of "next" pointer.
0264: * The value is a positive offset from the opcode of the node containing
0265: * it. An operand, if any, simply follows the node. (Note that much of
0266: * the code generation knows about this implicit relationship.)
0267: *
0268: * Opcode notes:
0269: *
0270: * BRANCH The set of branches constituting a single choice are hooked
0271: * together with their "next" pointers, since precedence prevents
0272: * anything being concatenated to any individual branch. The
0273: * "next" pointer of the last BRANCH in a choice points to the
0274: * thing following the whole choice. This is also where the
0275: * final "next" pointer of each individual branch points; each
0276: * branch starts with the operand node of a BRANCH node.
0277: *
0278: * ANYOF, ANYBUT, EXACTLY
0279: * The format of a string operand is one char of length
0280: * followed by the characters making up the string.
0281: *
0282: * BACK Normal "next" pointers all implicitly point forward; BACK
0283: * exists to make loop structures possible.
0284: *
0285: * STAR, PLUS
0286: * '?', and complex '*' and '+' are implemented as circular
0287: * BRANCH structures using BACK. Simple cases (one character
0288: * per match) are implemented with STAR and PLUS for speed
0289: * and to minimize recursive plunges.
0290: *
0291: * OPENn, CLOSEn
0292: * are numbered at compile time.
0293: */
0294:
0295: /**
0296: * The bytecodes making up the regexp program.
0297: */
0298: char[] program;
0299:
0300: /**
0301: * Whether the regexp matching should be case insensitive.
0302: */
0303: boolean ignoreCase;
0304:
0305: /**
0306: * The number of parenthesized subexpressions in the regexp pattern,
0307: * plus 1 for the match of the whole pattern itself.
0308: */
0309: int npar;
0310:
0311: /**
0312: * <code>true</code> if the pattern must match the beginning of the
0313: * string, so we don't have to waste time matching against all possible
0314: * starting locations in the string.
0315: */
0316: boolean anchored;
0317:
0318: int startChar;
0319: String must;
0320:
0321: /**
0322: * Compiles a new Regexp object from the given regular expression
0323: * pattern.
0324: * <p>
0325: * It takes a certain amount of time to parse and validate a regular
0326: * expression pattern before it can be used to perform matches
0327: * or substitutions. If the caller caches the new Regexp object, that
0328: * parsing time will be saved because the same Regexp can be used with
0329: * respect to many different strings.
0330: *
0331: * @param pat
0332: * The string holding the regular expression pattern.
0333: *
0334: * @throws IllegalArgumentException if the pattern is malformed.
0335: * The detail message for the exception will be set to a
0336: * string indicating how the pattern was malformed.
0337: */
0338: public Regexp(String pat) throws IllegalArgumentException {
0339: compile(pat);
0340: }
0341:
0342: /**
0343: * Compiles a new Regexp object from the given regular expression
0344: * pattern.
0345: *
0346: * @param pat
0347: * The string holding the regular expression pattern.
0348: *
0349: * @param ignoreCase
0350: * If <code>true</code> then this regular expression will
0351: * do case-insensitive matching. If <code>false</code>, then
0352: * the matches are case-sensitive. Regular expressions
0353: * generated by <code>Regexp(String)</code> are case-sensitive.
0354: *
0355: * @throws IllegalArgumentException if the pattern is malformed.
0356: * The detail message for the exception will be set to a
0357: * string indicating how the pattern was malformed.
0358: */
0359: public Regexp(String pat, boolean ignoreCase)
0360: throws IllegalArgumentException {
0361: this .ignoreCase = ignoreCase;
0362: if (ignoreCase) {
0363: pat = pat.toLowerCase();
0364: }
0365: compile(pat);
0366: }
0367:
0368: /**
0369: * Returns the number of parenthesized subexpressions in this regular
0370: * expression, plus one more for this expression itself.
0371: *
0372: * @return The number.
0373: */
0374: public int subspecs() {
0375: return npar;
0376: }
0377:
0378: /**
0379: * Matches the given string against this regular expression.
0380: *
0381: * @param str
0382: * The string to match.
0383: *
0384: * @return The substring of <code>str</code> that matched the entire
0385: * regular expression, or <code>null</code> if the string did not
0386: * match this regular expression.
0387: */
0388: public String match(String str) {
0389: Match m = exec(str, 0, 0);
0390:
0391: if (m == null) {
0392: return null;
0393: }
0394: return str.substring(m.indices[0], m.indices[1]);
0395: }
0396:
0397: /**
0398: * Matches the given string against this regular expression, and computes
0399: * the set of substrings that matched the parenthesized subexpressions.
0400: * <p>
0401: * <code>substrs[0]</code> is set to the range of <code>str</code>
0402: * that matched the entire regular expression.
0403: * <p>
0404: * <code>substrs[1]</code> is set to the range of <code>str</code>
0405: * that matched the first (leftmost) parenthesized subexpression.
0406: * <code>substrs[n]</code> is set to the range that matched the
0407: * <code>n</code><i>th</i> subexpression, and so on.
0408: * <p>
0409: * If subexpression <code>n</code> did not match, then
0410: * <code>substrs[n]</code> is set to <code>null</code>. Not to
0411: * be confused with "", which is a valid value for a
0412: * subexpression that matched 0 characters.
0413: * <p>
0414: * The length that the caller should use when allocating the
0415: * <code>substr</code> array is the return value of
0416: * <code>Regexp.subspecs</code>. The array
0417: * can be shorter (in which case not all the information will
0418: * be returned), or longer (in which case the remainder of the
0419: * elements are initialized to <code>null</code>), or
0420: * <code>null</code> (to ignore the subexpressions).
0421: *
0422: * @param str
0423: * The string to match.
0424: *
0425: * @param substrs
0426: * An array of strings allocated by the caller, and filled in
0427: * with information about the portions of <code>str</code> that
0428: * matched the regular expression. May be <code>null</code>.
0429: *
0430: * @return <code>true</code> if <code>str</code> that matched this
0431: * regular expression, <code>false</code> otherwise.
0432: * If <code>false</code> is returned, then the contents of
0433: * <code>substrs</code> are unchanged.
0434: *
0435: * @see #subspecs
0436: */
0437: public boolean match(String str, String[] substrs) {
0438: Match m = exec(str, 0, 0);
0439:
0440: if (m == null) {
0441: return false;
0442: }
0443: if (substrs != null) {
0444: int max = Math.min(substrs.length, npar);
0445: int i;
0446: int j = 0;
0447: for (i = 0; i < max; i++) {
0448: int start = m.indices[j++];
0449: int end = m.indices[j++];
0450: if (start < 0) {
0451: substrs[i] = null;
0452: } else {
0453: substrs[i] = str.substring(start, end);
0454: }
0455: }
0456: for (; i < substrs.length; i++) {
0457: substrs[i] = null;
0458: }
0459: }
0460: return true;
0461: }
0462:
0463: /**
0464: * Matches the given string against this regular expression, and computes
0465: * the set of substrings that matched the parenthesized subexpressions.
0466: * <p>
0467: * For the indices specified below, the range extends from the character
0468: * at the starting index up to, but not including, the character at the
0469: * ending index.
0470: * <p>
0471: * <code>indices[0]</code> and <code>indices[1]</code> are set to
0472: * starting and ending indices of the range of <code>str</code>
0473: * that matched the entire regular expression.
0474: * <p>
0475: * <code>indices[2]</code> and <code>indices[3]</code> are set to the
0476: * starting and ending indices of the range of <code>str</code> that
0477: * matched the first (leftmost) parenthesized subexpression.
0478: * <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
0479: * are set to the range that matched the <code>n</code><i>th</i>
0480: * subexpression, and so on.
0481: * <p>
0482: * If subexpression <code>n</code> did not match, then
0483: * <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
0484: * are both set to <code>-1</code>.
0485: * <p>
0486: * The length that the caller should use when allocating the
0487: * <code>indices</code> array is twice the return value of
0488: * <code>Regexp.subspecs</code>. The array
0489: * can be shorter (in which case not all the information will
0490: * be returned), or longer (in which case the remainder of the
0491: * elements are initialized to <code>-1</code>), or
0492: * <code>null</code> (to ignore the subexpressions).
0493: *
0494: * @param str
0495: * The string to match.
0496: *
0497: * @param indices
0498: * An array of integers allocated by the caller, and filled in
0499: * with information about the portions of <code>str</code> that
0500: * matched all the parts of the regular expression.
0501: * May be <code>null</code>.
0502: *
0503: * @return <code>true</code> if the string matched the regular expression,
0504: * <code>false</code> otherwise. If <code>false</code> is
0505: * returned, then the contents of <code>indices</code> are
0506: * unchanged.
0507: *
0508: * @see #subspecs
0509: */
0510: public boolean match(String str, int[] indices) {
0511: Match m = exec(str, 0, 0);
0512:
0513: if (m == null) {
0514: return false;
0515: }
0516: if (indices != null) {
0517: int max = Math.min(indices.length, npar * 2);
0518: System.arraycopy(m.indices, 0, indices, 0, max);
0519:
0520: for (int i = max; i < indices.length; i++) {
0521: indices[i] = -1;
0522: }
0523: }
0524: return true;
0525: }
0526:
0527: /**
0528: * Matches a string against a regular expression and replaces the first
0529: * match with the string generated from the substitution parameter.
0530: *
0531: * @param str
0532: * The string to match against this regular expression.
0533: *
0534: * @param subspec
0535: * The substitution parameter, described in <a href=#regsub>
0536: * REGULAR EXPRESSION SUBSTITUTION</a>.
0537: *
0538: * @return The string formed by replacing the first match in
0539: * <code>str</code> with the string generated from
0540: * <code>subspec</code>. If no matches were found, then
0541: * the return value is <code>null</code>.
0542: */
0543: public String sub(String str, String subspec) {
0544: Regsub rs = new Regsub(this , str);
0545: if (rs.nextMatch()) {
0546: StringBuffer sb = new StringBuffer(rs.skipped());
0547: applySubspec(rs, subspec, sb);
0548: sb.append(rs.rest());
0549:
0550: return sb.toString();
0551: } else {
0552: return null;
0553: }
0554: }
0555:
0556: /**
0557: * Matches a string against a regular expression and replaces all
0558: * matches with the string generated from the substitution parameter.
0559: * After each substutition is done, the portions of the string already
0560: * examined, including the newly substituted region, are <b>not</b> checked
0561: * again for new matches -- only the rest of the string is examined.
0562: *
0563: * @param str
0564: * The string to match against this regular expression.
0565: *
0566: * @param subspec
0567: * The substitution parameter, described in <a href=#regsub>
0568: * REGULAR EXPRESSION SUBSTITUTION</a>.
0569: *
0570: * @return The string formed by replacing all the matches in
0571: * <code>str</code> with the strings generated from
0572: * <code>subspec</code>. If no matches were found, then
0573: * the return value is a copy of <code>str</code>.
0574: */
0575: public String subAll(String str, String subspec) {
0576: return sub(str, new SubspecFilter(subspec, true));
0577: }
0578:
0579: /**
0580: * Utility method to give access to the standard substitution algorithm
0581: * used by <code>sub</code> and <code>subAll</code>. Appends to the
0582: * string buffer the string generated by applying the substitution
0583: * parameter to the matched region.
0584: *
0585: * @param rs
0586: * Information about the matched region.
0587: *
0588: * @param subspec
0589: * The substitution parameter.
0590: *
0591: * @param sb
0592: * StringBuffer to which the generated string is appended.
0593: */
0594: public static void applySubspec(Regsub rs, String subspec,
0595: StringBuffer sb) {
0596: try {
0597: int len = subspec.length();
0598: for (int i = 0; i < len; i++) {
0599: char ch = subspec.charAt(i);
0600: switch (ch) {
0601: case '&': {
0602: sb.append(rs.matched());
0603: break;
0604: }
0605: case '\\': {
0606: i++;
0607: ch = subspec.charAt(i);
0608: if ((ch >= '0') && (ch <= '9')) {
0609: String match = rs.submatch(ch - '0');
0610: if (match != null) {
0611: sb.append(match);
0612: }
0613: break;
0614: }
0615: // fall through.
0616: }
0617: default: {
0618: sb.append(ch);
0619: }
0620: }
0621: }
0622: } catch (IndexOutOfBoundsException e) {
0623: /*
0624: * Ignore malformed substitution pattern.
0625: * Return string matched so far.
0626: */
0627: }
0628: }
0629:
0630: public String sub(String str, Filter rf) {
0631: Regsub rs = new Regsub(this , str);
0632: if (rs.nextMatch() == false) {
0633: return str;
0634: }
0635:
0636: StringBuffer sb = new StringBuffer();
0637: do {
0638: sb.append(rs.skipped());
0639: if (rf.filter(rs, sb) == false) {
0640: break;
0641: }
0642: } while (rs.nextMatch());
0643: sb.append(rs.rest());
0644: return sb.toString();
0645: }
0646:
0647: /**
0648: * This interface is used by the <code>Regexp</code> class to generate
0649: * the replacement string for each pattern match found in the source
0650: * string.
0651: *
0652: * @author Colin Stevens (colin.stevens@sun.com)
0653: * @version 1.10, 00/11/06
0654: */
0655: public interface Filter {
0656: /**
0657: * Given the current state of the match, generate the replacement
0658: * string. This method will be called for each match found in
0659: * the source string, unless this filter decides not to handle any
0660: * more matches.
0661: * <p>
0662: * The implementation can use whatever rules it chooses
0663: * to generate the replacement string. For example, here is an
0664: * example of a filter that replaces the first <b>5</b>
0665: * occurrences of "%XX" in a string with the ASCII character
0666: * represented by the hex digits "XX":
0667: * <pre>
0668: * String str = ...;
0669: *
0670: * Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
0671: *
0672: * Regexp.Filter rf = new Regexp.Filter() {
0673: * int count = 5;
0674: * public boolean filter(Regsub rs, StringBuffer sb) {
0675: * String match = rs.matched();
0676: * int hi = Character.digit(match.charAt(1), 16);
0677: * int lo = Character.digit(match.charAt(2), 16);
0678: * sb.append((char) ((hi << 4) | lo));
0679: * return (--count > 0);
0680: * }
0681: * }
0682: *
0683: * String result = re.sub(str, rf);
0684: * </pre>
0685: *
0686: * @param rs
0687: * <code>Regsub</code> containing the state of the current
0688: * match.
0689: *
0690: * @param sb
0691: * The string buffer that this filter should append the
0692: * generated string to. This string buffer actually
0693: * contains the results the calling <code>Regexp</code> has
0694: * generated up to this point.
0695: *
0696: * @return <code>false</code> if no further matches should be
0697: * considered in this string, <code>true</code> to allow
0698: * <code>Regexp</code> to continue looking for further
0699: * matches.
0700: */
0701: public boolean filter(Regsub rs, StringBuffer sb);
0702: }
0703:
0704: private static class SubspecFilter implements Filter {
0705: String subspec;
0706: boolean all;
0707:
0708: public SubspecFilter(String subspec, boolean all) {
0709: this .subspec = subspec;
0710: this .all = all;
0711: }
0712:
0713: public boolean filter(Regsub rs, StringBuffer sb) {
0714: applySubspec(rs, subspec, sb);
0715: return all;
0716: }
0717: }
0718:
0719: /**
0720: * Returns a string representation of this compiled regular
0721: * expression. The format of the string representation is a
0722: * symbolic dump of the bytecodes.
0723: *
0724: * @return A string representation of this regular expression.
0725: */
0726: public String toString() {
0727: StringBuffer sb = new StringBuffer();
0728:
0729: sb.append("# subs: " + npar + "\n");
0730: sb.append("anchor: " + anchored + "\n");
0731: sb.append("start: " + (char) startChar + "\n");
0732: sb.append("must: " + must + "\n");
0733:
0734: for (int i = 0; i < program.length;) {
0735: sb.append(i + ":\t");
0736: int op = program[i];
0737: if (op >= CLOSE) {
0738: sb.append("CLOSE" + (op - CLOSE));
0739: } else if (op >= OPEN) {
0740: sb.append("OPEN" + (op - OPEN));
0741: } else {
0742: sb.append(opnames[op]);
0743: }
0744: int line;
0745: int offset = (int) program[i + 1];
0746: if (offset == 0) {
0747: sb.append('\t');
0748: } else if (op == BACK) {
0749: sb.append("\t-" + offset + "," + (i - offset));
0750: } else {
0751: sb.append("\t+" + offset + "," + (i + offset));
0752: }
0753:
0754: if ((op == ANYOF) || (op == ANYBUT) || (op == EXACTLY)) {
0755: sb.append("\t'");
0756: sb.append(program, i + 3, program[i + 2]);
0757: sb.append("'");
0758: i += 3 + program[i + 2];
0759: } else {
0760: i += 2;
0761: }
0762: sb.append('\n');
0763: }
0764: return sb.toString();
0765: }
0766:
0767: private void compile(String exp) throws IllegalArgumentException {
0768: Compiler rcstate = new Compiler();
0769: rcstate.parse = exp.toCharArray();
0770: rcstate.off = 0;
0771: rcstate.npar = 1;
0772: rcstate.code = new StringBuffer();
0773:
0774: rcstate.reg(false);
0775:
0776: program = rcstate.code.toString().toCharArray();
0777: npar = rcstate.npar;
0778: startChar = -1;
0779:
0780: /* optimize */
0781: if (program[rcstate.regnext(0)] == END) {
0782: if (program[2] == BOL) {
0783: anchored = true;
0784: } else if (program[2] == EXACTLY) {
0785: startChar = (int) program[5];
0786: }
0787: }
0788:
0789: /*
0790: * If there's something expensive in the r.e., find the
0791: * longest literal string that must appear and make it the
0792: * regmust. Resolve ties in favor of later strings, since
0793: * the regstart check works with the beginning of the r.e.
0794: * and avoiding duplication strengthens checking. Not a
0795: * strong reason, but sufficient in the absence of others.
0796: */
0797: /*
0798: if ((rcstate.flagp & Compiler.SPSTART) != 0) {
0799: int index = -1;
0800: int longest = 0;
0801:
0802: for (scan = 0; scan < program.length; ) {
0803: switch (program[scan]) {
0804: case EXACTLY:
0805: int length = program[scan + 2];
0806: if (length > longest) {
0807: index = scan;
0808: longest = length;
0809: }
0810: // fall through;
0811:
0812: case ANYOF:
0813: case ANYBUT:
0814: scan += 3 + program[scan + 2];
0815: break;
0816:
0817: default:
0818: scan += 2;
0819: break;
0820: }
0821: }
0822: if (longest > 0) {
0823: must = new String(program, index + 3, longest);
0824: }
0825: }
0826: */
0827:
0828: }
0829:
0830: Match exec(String str, int start, int off) {
0831: if (ignoreCase) {
0832: str = str.toLowerCase();
0833: }
0834:
0835: Match match = new Match();
0836:
0837: match.program = program;
0838:
0839: /* Mark beginning of line for ^ . */
0840: match.str = str;
0841: match.bol = start;
0842: match.length = str.length();
0843:
0844: match.indices = new int[npar * 2];
0845:
0846: if (anchored) {
0847: /* Simplest case: anchored match need be tried only once. */
0848: if (match.regtry(off)) {
0849: return match;
0850: }
0851: } else if (startChar >= 0) {
0852: /* We know what char it must start with. */
0853: while (off < match.length) {
0854: off = str.indexOf(startChar, off);
0855: if (off < 0) {
0856: break;
0857: }
0858: if (match.regtry(off)) {
0859: return match;
0860: }
0861: off++;
0862: }
0863: } else {
0864: /* Messy cases: unanchored match. */
0865: do {
0866: if (match.regtry(off)) {
0867: return match;
0868: }
0869: } while (off++ < match.length);
0870: }
0871: return null;
0872: }
0873:
0874: static class Compiler {
0875: char[] parse;
0876: int off;
0877: int npar;
0878: StringBuffer code;
0879: int flagp;
0880:
0881: static final String META = "^$.[()|?+*\\";
0882: static final String MULT = "*+?";
0883:
0884: static final int WORST = 00; /* Worst case. */
0885: static final int HASWIDTH = 01; /* Known never to match null string. */
0886: static final int SIMPLE = 02; /* Simple enough to be STAR/PLUS operand. */
0887: static final int SPSTART = 04; /* Starts with * or +. */
0888:
0889: /*
0890: - reg - regular expression, i.e. main body or parenthesized thing
0891: *
0892: * Caller must absorb opening parenthesis.
0893: *
0894: * Combining parenthesis handling with the base level of regular expression
0895: * is a trifle forced, but the need to tie the tails of the branches to what
0896: * follows makes it hard to avoid.
0897: */
0898: int reg(boolean paren) throws IllegalArgumentException {
0899: int netFlags = HASWIDTH;
0900: int parno = 0;
0901:
0902: int ret = -1;
0903: if (paren) {
0904: parno = npar++;
0905: if (npar >= NSUBEXP) {
0906: throw new IllegalArgumentException("too many ()");
0907: }
0908: ret = regnode((char) (OPEN + parno));
0909: }
0910:
0911: /* Pick up the branches, linking them together. */
0912: int br = regbranch();
0913: if (ret >= 0) {
0914: regtail(ret, br);
0915: } else {
0916: ret = br;
0917: }
0918:
0919: if ((flagp & HASWIDTH) == 0) {
0920: netFlags &= ~HASWIDTH;
0921: }
0922: netFlags |= (flagp & SPSTART);
0923: while ((off < parse.length) && (parse[off] == '|')) {
0924: off++;
0925: br = regbranch();
0926: regtail(ret, br);
0927: if ((flagp & HASWIDTH) == 0) {
0928: netFlags &= ~HASWIDTH;
0929: }
0930: netFlags |= (flagp & SPSTART);
0931: }
0932:
0933: /* Make a closing node, and hook it on the end. */
0934: int ender = regnode((paren) ? (char) (CLOSE + parno) : END);
0935: regtail(ret, ender);
0936:
0937: /* Hook the tails of the branches to the closing node. */
0938: for (br = ret; br >= 0; br = regnext(br)) {
0939: regoptail(br, ender);
0940: }
0941:
0942: /* Check for proper termination. */
0943: if (paren
0944: && ((off >= parse.length) || (parse[off++] != ')'))) {
0945: throw new IllegalArgumentException("missing )");
0946: } else if ((paren == false) && (off < parse.length)) {
0947: throw new IllegalArgumentException("unexpected )");
0948: }
0949:
0950: flagp = netFlags;
0951: return ret;
0952: }
0953:
0954: /*
0955: - regbranch - one alternative of an | operator
0956: *
0957: * Implements the concatenation operator.
0958: */
0959: int regbranch() throws IllegalArgumentException {
0960: int netFlags = WORST; /* Tentatively. */
0961:
0962: int ret = regnode(BRANCH);
0963: int chain = -1;
0964: while ((off < parse.length) && (parse[off] != '|')
0965: && (parse[off] != ')')) {
0966: int latest = regpiece();
0967: netFlags |= flagp & HASWIDTH;
0968: if (chain < 0) { /* First piece. */
0969: netFlags |= (flagp & SPSTART);
0970: } else {
0971: regtail(chain, latest);
0972: }
0973: chain = latest;
0974: }
0975: if (chain < 0) { /* Loop ran zero times. */
0976: regnode(NOTHING);
0977: }
0978:
0979: flagp = netFlags;
0980: return ret;
0981: }
0982:
0983: /*
0984: - regpiece - something followed by possible [*+?]
0985: *
0986: * Note that the branching code sequences used for ? and the general cases
0987: * of * and + are somewhat optimized: they use the same NOTHING node as
0988: * both the endmarker for their branch list and the body of the last branch.
0989: * It might seem that this node could be dispensed with entirely, but the
0990: * endmarker role is not redundant.
0991: */
0992: int regpiece() throws IllegalArgumentException {
0993: int netFlags;
0994:
0995: int ret = regatom();
0996:
0997: if ((off >= parse.length) || (isMult(parse[off]) == false)) {
0998: return ret;
0999: }
1000: char op = parse[off];
1001:
1002: if (((flagp & HASWIDTH) == 0) && (op != '?')) {
1003: throw new IllegalArgumentException(
1004: "*+ operand could be empty");
1005: }
1006: netFlags = (op != '+') ? (WORST | SPSTART)
1007: : (WORST | HASWIDTH);
1008:
1009: if ((op == '*') && ((flagp & SIMPLE) != 0)) {
1010: reginsert(STAR, ret);
1011: } else if (op == '*') {
1012: /* Emit x* as (x&|), where & means "self". */
1013: reginsert(BRANCH, ret); /* Either x */
1014: regoptail(ret, regnode(BACK)); /* and loop */
1015: regoptail(ret, ret); /* back */
1016: regtail(ret, regnode(BRANCH)); /* or */
1017: regtail(ret, regnode(NOTHING)); /* null. */
1018: } else if ((op == '+') && ((flagp & SIMPLE) != 0)) {
1019: reginsert(PLUS, ret);
1020: } else if (op == '+') {
1021: /* Emit x+ as x(&|), where & means "self". */
1022: int next = regnode(BRANCH); /* Either */
1023: regtail(ret, next);
1024: regtail(regnode(BACK), ret); /* loop back */
1025: regtail(next, regnode(BRANCH)); /* or */
1026: regtail(ret, regnode(NOTHING)); /* null. */
1027: } else if (op == '?') {
1028: /* Emit x? as (x|) */
1029: reginsert(BRANCH, ret); /* Either x */
1030: regtail(ret, regnode(BRANCH)); /* or */
1031: int next = regnode(NOTHING); /* null. */
1032: regtail(ret, next);
1033: regoptail(ret, next);
1034: }
1035: off++;
1036: if ((off < parse.length) && isMult(parse[off])) {
1037: throw new IllegalArgumentException("nested *?+");
1038: }
1039:
1040: flagp = netFlags;
1041: return ret;
1042: }
1043:
1044: /*
1045: - regatom - the lowest level
1046: *
1047: * Optimization: gobbles an entire sequence of ordinary characters so that
1048: * it can turn them into a single node, which is smaller to store and
1049: * faster to run. Backslashed characters are exceptions, each becoming a
1050: * separate node; the code is simpler that way and it's not worth fixing.
1051: */
1052: int regatom() throws IllegalArgumentException {
1053: int netFlags = WORST; /* Tentatively. */
1054: int ret;
1055:
1056: switch (parse[off++]) {
1057: case '^':
1058: ret = regnode(BOL);
1059: break;
1060: case '$':
1061: ret = regnode(EOL);
1062: break;
1063: case '.':
1064: ret = regnode(ANY);
1065: netFlags |= (HASWIDTH | SIMPLE);
1066: break;
1067: case '[': {
1068: try {
1069: if (parse[off] == '^') {
1070: ret = regnode(ANYBUT);
1071: off++;
1072: } else {
1073: ret = regnode(ANYOF);
1074: }
1075:
1076: int pos = reglen();
1077: regc('\0');
1078:
1079: if ((parse[off] == ']') || (parse[off] == '-')) {
1080: regc(parse[off++]);
1081: }
1082: while (parse[off] != ']') {
1083: if (parse[off] == '-') {
1084: off++;
1085: if (parse[off] == ']') {
1086: regc('-');
1087: } else {
1088: int start = parse[off - 2];
1089: int end = parse[off++];
1090: if (start > end) {
1091: throw new IllegalArgumentException(
1092: "invalid [] range");
1093: }
1094: for (int i = start + 1; i <= end; i++) {
1095: regc((char) i);
1096: }
1097: }
1098: } else {
1099: regc(parse[off++]);
1100: }
1101: }
1102: regset(pos, (char) (reglen() - pos - 1));
1103: off++;
1104: netFlags |= HASWIDTH | SIMPLE;
1105: } catch (ArrayIndexOutOfBoundsException e) {
1106: throw new IllegalArgumentException("missing ]");
1107: }
1108: break;
1109: }
1110: case '(':
1111: ret = reg(true);
1112: netFlags |= (flagp & (HASWIDTH | SPSTART));
1113: break;
1114: case '|':
1115: case ')':
1116: throw new IllegalArgumentException("internal urp");
1117: case '?':
1118: case '+':
1119: case '*':
1120: throw new IllegalArgumentException(
1121: "?+* follows nothing");
1122: case '\\':
1123: if (off >= parse.length) {
1124: throw new IllegalArgumentException("trailing \\");
1125: }
1126: ret = regnode(EXACTLY);
1127: regc((char) 1);
1128: regc(parse[off++]);
1129: netFlags |= HASWIDTH | SIMPLE;
1130: break;
1131: default: {
1132: off--;
1133: int end;
1134: for (end = off; end < parse.length; end++) {
1135: if (META.indexOf(parse[end]) >= 0) {
1136: break;
1137: }
1138: }
1139: if ((end > off + 1) && (end < parse.length)
1140: && isMult(parse[end])) {
1141: end--; /* Back off clear of ?+* operand. */
1142: }
1143: netFlags |= HASWIDTH;
1144: if (end == off + 1) {
1145: netFlags |= SIMPLE;
1146: }
1147: ret = regnode(EXACTLY);
1148: regc((char) (end - off));
1149: for (; off < end; off++) {
1150: regc(parse[off]);
1151: }
1152: }
1153: break;
1154: }
1155:
1156: flagp = netFlags;
1157: return ret;
1158: }
1159:
1160: /*
1161: - regnode - emit a node
1162: */
1163: int regnode(char op) {
1164: int ret = code.length();
1165: code.append(op);
1166: code.append('\0');
1167:
1168: return ret;
1169: }
1170:
1171: /*
1172: - regc - emit (if appropriate) a byte of code
1173: */
1174: void regc(char b) {
1175: code.append(b);
1176: }
1177:
1178: int reglen() {
1179: return code.length();
1180: }
1181:
1182: void regset(int pos, char ch) {
1183: code.setCharAt(pos, ch);
1184: }
1185:
1186: /*
1187: - reginsert - insert an operator in front of already-emitted operand
1188: *
1189: * Means relocating the operand.
1190: */
1191: void reginsert(char op, int pos) {
1192: char[] tmp = new char[] { op, '\0' };
1193: code.insert(pos, tmp);
1194: }
1195:
1196: /*
1197: - regtail - set the next-pointer at the end of a node chain
1198: */
1199: void regtail(int pos, int val) {
1200: /* Find last node. */
1201:
1202: int scan = pos;
1203: while (true) {
1204: int tmp = regnext(scan);
1205: if (tmp < 0) {
1206: break;
1207: }
1208: scan = tmp;
1209: }
1210:
1211: int offset = (code.charAt(scan) == BACK) ? scan - val : val
1212: - scan;
1213: code.setCharAt(scan + 1, (char) offset);
1214: }
1215:
1216: /*
1217: - regoptail - regtail on operand of first argument; nop if operandless
1218: */
1219: void regoptail(int pos, int val) {
1220: if ((pos < 0) || (code.charAt(pos) != BRANCH)) {
1221: return;
1222: }
1223: regtail(pos + 2, val);
1224: }
1225:
1226: /*
1227: - regnext - dig the "next" pointer out of a node
1228: */
1229: int regnext(int pos) {
1230: int offset = code.charAt(pos + 1);
1231: if (offset == 0) {
1232: return -1;
1233: }
1234: if (code.charAt(pos) == BACK) {
1235: return pos - offset;
1236: } else {
1237: return pos + offset;
1238: }
1239: }
1240:
1241: static boolean isMult(char ch) {
1242: return (ch == '*') || (ch == '+') || (ch == '?');
1243: }
1244:
1245: }
1246:
1247: static class Match {
1248: char[] program;
1249:
1250: String str;
1251: int bol;
1252: int input;
1253: int length;
1254:
1255: int[] indices;
1256:
1257: boolean regtry(int off) {
1258: this .input = off;
1259:
1260: for (int i = 0; i < indices.length; i++) {
1261: indices[i] = -1;
1262: }
1263:
1264: if (regmatch(0)) {
1265: indices[0] = off;
1266: indices[1] = input;
1267: return true;
1268: } else {
1269: return false;
1270: }
1271: }
1272:
1273: /*
1274: - regmatch - main matching routine
1275: *
1276: * Conceptually the strategy is simple: check to see whether the current
1277: * node matches, call self recursively to see whether the rest matches,
1278: * and then act accordingly. In practice we make some effort to avoid
1279: * recursion, in particular by going through "ordinary" nodes (that don't
1280: * need to know whether the rest of the match failed) by a loop instead of
1281: * by recursion.
1282: */
1283: boolean regmatch(int scan) {
1284: while (true) {
1285: int next = regnext(scan);
1286: int op = program[scan];
1287: switch (op) {
1288: case BOL:
1289: if (input != bol) {
1290: return false;
1291: }
1292: break;
1293:
1294: case EOL:
1295: if (input != length) {
1296: return false;
1297: }
1298: break;
1299:
1300: case ANY:
1301: if (input >= length) {
1302: return false;
1303: }
1304: input++;
1305: break;
1306:
1307: case EXACTLY: {
1308: if (compare(scan) == false) {
1309: return false;
1310: }
1311: break;
1312: }
1313:
1314: case ANYOF:
1315: if (input >= length) {
1316: return false;
1317: }
1318: if (present(scan) == false) {
1319: return false;
1320: }
1321: input++;
1322: break;
1323:
1324: case ANYBUT:
1325: if (input >= length) {
1326: return false;
1327: }
1328: if (present(scan)) {
1329: return false;
1330: }
1331: input++;
1332: break;
1333:
1334: case NOTHING:
1335: case BACK:
1336: break;
1337:
1338: case BRANCH: {
1339: if (program[next] != BRANCH) {
1340: next = scan + 2;
1341: } else {
1342: do {
1343: int save = input;
1344: if (regmatch(scan + 2)) {
1345: return true;
1346: }
1347: input = save;
1348: scan = regnext(scan);
1349: } while ((scan >= 0)
1350: && (program[scan] == BRANCH));
1351: return false;
1352: }
1353: break;
1354: }
1355:
1356: case STAR:
1357: case PLUS: {
1358: /*
1359: * Lookahead to avoid useless match attempts
1360: * when we know what character comes next.
1361: */
1362:
1363: int ch = -1;
1364: if (program[next] == EXACTLY) {
1365: ch = program[next + 3];
1366: }
1367:
1368: int min = (op == STAR) ? 0 : 1;
1369: int save = input;
1370: int no = regrepeat(scan + 2);
1371:
1372: while (no >= min) {
1373: /* If it could work, try it. */
1374: if ((ch < 0)
1375: || ((input < length) && (str
1376: .charAt(input) == ch))) {
1377: if (regmatch(next)) {
1378: return true;
1379: }
1380: }
1381: /* Couldn't or didn't -- back up. */
1382: no--;
1383: input = save + no;
1384: }
1385: return false;
1386: }
1387:
1388: case END:
1389: return true;
1390:
1391: default:
1392: if (op >= CLOSE) {
1393: int no = op - CLOSE;
1394: int save = input;
1395:
1396: if (regmatch(next)) {
1397: /*
1398: * Don't set endp if some later
1399: * invocation of the same parentheses
1400: * already has.
1401: */
1402: if (indices[no * 2 + 1] <= 0) {
1403: indices[no * 2 + 1] = save;
1404: }
1405: return true;
1406: }
1407: } else if (op >= OPEN) {
1408: int no = op - OPEN;
1409: int save = input;
1410:
1411: if (regmatch(next)) {
1412: /*
1413: * Don't set startp if some later invocation of the
1414: * same parentheses already has.
1415: */
1416: if (indices[no * 2] <= 0) {
1417: indices[no * 2] = save;
1418: }
1419: return true;
1420: }
1421: }
1422: return false;
1423: }
1424: scan = next;
1425: }
1426: }
1427:
1428: boolean compare(int scan) {
1429: int count = program[scan + 2];
1430: if (input + count > length) {
1431: return false;
1432: }
1433: int start = scan + 3;
1434: int end = start + count;
1435: for (int i = start; i < end; i++) {
1436: if (str.charAt(input++) != program[i]) {
1437: return false;
1438: }
1439: }
1440: return true;
1441: }
1442:
1443: boolean present(int scan) {
1444: char ch = str.charAt(input);
1445:
1446: int count = program[scan + 2];
1447: int start = scan + 3;
1448: int end = start + count;
1449:
1450: for (int i = start; i < end; i++) {
1451: if (program[i] == ch) {
1452: return true;
1453: }
1454: }
1455: return false;
1456: }
1457:
1458: /*
1459: - regrepeat - repeatedly match something simple, report how many
1460: */
1461: int regrepeat(int scan) {
1462: int op = program[scan];
1463: int count = 0;
1464:
1465: switch (op) {
1466: case ANY:
1467: // '.*' matches all the way to the end.
1468:
1469: count = length - input;
1470: input = length;
1471: break;
1472:
1473: case EXACTLY: {
1474: // 'g*' matches all the following 'g' characters.
1475:
1476: char ch = program[scan + 3];
1477: while ((input < length) && (str.charAt(input) == ch)) {
1478: input++;
1479: count++;
1480: }
1481: break;
1482: }
1483:
1484: case ANYOF:
1485: // [abc]*
1486:
1487: while ((input < length) && present(scan)) {
1488: input++;
1489: count++;
1490: }
1491: break;
1492:
1493: case ANYBUT:
1494: while ((input < length) && !present(scan)) {
1495: input++;
1496: count++;
1497: }
1498: break;
1499:
1500: }
1501: return count;
1502: }
1503:
1504: /*
1505: - regnext - dig the "next" pointer out of a node
1506: */
1507: int regnext(int scan) {
1508: int offset = program[scan + 1];
1509: if (program[scan] == BACK) {
1510: return scan - offset;
1511: } else {
1512: return scan + offset;
1513: }
1514: }
1515: }
1516: }
|