0001: package org.apache.regexp;
0002:
0003: /*
0004: * ====================================================================
0005: *
0006: * The Apache Software License, Version 1.1
0007: *
0008: * Copyright (c) 1999 The Apache Software Foundation. All rights
0009: * reserved.
0010: *
0011: * Redistribution and use in source and binary forms, with or without
0012: * modification, are permitted provided that the following conditions
0013: * are met:
0014: *
0015: * 1. Redistributions of source code must retain the above copyright
0016: * notice, this list of conditions and the following disclaimer.
0017: *
0018: * 2. Redistributions in binary form must reproduce the above copyright
0019: * notice, this list of conditions and the following disclaimer in
0020: * the documentation and/or other materials provided with the
0021: * distribution.
0022: *
0023: * 3. The end-user documentation included with the redistribution, if
0024: * any, must include the following acknowlegement:
0025: * "This product includes software developed by the
0026: * Apache Software Foundation (http://www.apache.org/)."
0027: * Alternately, this acknowlegement may appear in the software itself,
0028: * if and wherever such third-party acknowlegements normally appear.
0029: *
0030: * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
0031: * Foundation" must not be used to endorse or promote products derived
0032: * from this software without prior written permission. For written
0033: * permission, please contact apache@apache.org.
0034: *
0035: * 5. Products derived from this software may not be called "Apache"
0036: * nor may "Apache" appear in their names without prior written
0037: * permission of the Apache Group.
0038: *
0039: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0040: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0041: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0042: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0043: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0044: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0045: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0046: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0047: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0048: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0049: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0050: * SUCH DAMAGE.
0051: * ====================================================================
0052: *
0053: * This software consists of voluntary contributions made by many
0054: * individuals on behalf of the Apache Software Foundation. For more
0055: * information on the Apache Software Foundation, please see
0056: * <http://www.apache.org/>.
0057: *
0058: */
0059:
0060: import org.apache.regexp.RE;
0061: import java.util.Hashtable;
0062:
0063: /**
0064: * A regular expression compiler class. This class compiles a pattern string into a
0065: * regular expression program interpretable by the RE evaluator class. The 'recompile'
0066: * command line tool uses this compiler to pre-compile regular expressions for use
0067: * with RE. For a description of the syntax accepted by RECompiler and what you can
0068: * do with regular expressions, see the documentation for the RE matcher class.
0069: *
0070: * @see RE
0071: * @see recompile
0072: *
0073: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0074: * @version $Id: RECompiler.java,v 1.1.1.1 2002/01/31 03:14:36 rcm Exp $
0075: */
0076: public class RECompiler {
0077: // The compiled program
0078: char[] instruction; // The compiled RE 'program' instruction buffer
0079: int lenInstruction; // The amount of the program buffer currently in use
0080:
0081: // Input state for compiling regular expression
0082: String pattern; // Input string
0083: int len; // Length of the pattern string
0084: int idx; // Current input index into ac
0085: int parens; // Total number of paren pairs
0086:
0087: // Node flags
0088: static final int NODE_NORMAL = 0; // No flags (nothing special)
0089: static final int NODE_NULLABLE = 1; // True if node is potentially null
0090: static final int NODE_TOPLEVEL = 2; // True if top level expr
0091:
0092: // Special types of 'escapes'
0093: static final char ESC_MASK = 0xfff0; // Escape complexity mask
0094: static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
0095: static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
0096: static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
0097:
0098: // {m,n} stacks
0099: static final int maxBrackets = 10; // Maximum number of bracket pairs
0100: static int brackets = 0; // Number of bracket sets
0101: static int[] bracketStart = null; // Starting point
0102: static int[] bracketEnd = null; // Ending point
0103: static int[] bracketMin = null; // Minimum number of matches
0104: static int[] bracketOpt = null; // Additional optional matches
0105: static final int bracketUnbounded = -1; // Unbounded value
0106: static final int bracketFinished = -2; // Unbounded value
0107:
0108: // Lookup table for POSIX character class names
0109: static Hashtable hashPOSIX = new Hashtable();
0110: static {
0111: hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
0112: hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
0113: hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
0114: hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
0115: hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
0116: hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
0117: hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
0118: hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
0119: hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
0120: hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
0121: hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
0122: hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
0123: hashPOSIX
0124: .put("javastart", new Character(RE.POSIX_CLASS_JSTART));
0125: hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
0126: }
0127:
0128: /**
0129: * Constructor. Creates (initially empty) storage for a regular expression program.
0130: */
0131: public RECompiler() {
0132: // Start off with a generous, yet reasonable, initial size
0133: instruction = new char[128];
0134: lenInstruction = 0;
0135: }
0136:
0137: /**
0138: * Ensures that n more characters can fit in the program buffer.
0139: * If n more can't fit, then the size is doubled until it can.
0140: * @param n Number of additional characters to ensure will fit.
0141: */
0142: void ensure(int n) {
0143: // Get current program length
0144: int curlen = instruction.length;
0145:
0146: // If the current length + n more is too much
0147: if (lenInstruction + n >= curlen) {
0148: // Double the size of the program array until n more will fit
0149: while (lenInstruction + n >= curlen) {
0150: curlen *= 2;
0151: }
0152:
0153: // Allocate new program array and move data into it
0154: char[] newInstruction = new char[curlen];
0155: System.arraycopy(instruction, 0, newInstruction, 0,
0156: lenInstruction);
0157: instruction = newInstruction;
0158: }
0159: }
0160:
0161: /**
0162: * Emit a single character into the program stream.
0163: * @param c Character to add
0164: */
0165: void emit(char c) {
0166: // Make room for character
0167: ensure(1);
0168:
0169: // Add character
0170: instruction[lenInstruction++] = c;
0171: }
0172:
0173: /**
0174: * Inserts a node with a given opcode and opdata at insertAt. The node relative next
0175: * pointer is initialized to 0.
0176: * @param opcode Opcode for new node
0177: * @param opdata Opdata for new node (only the low 16 bits are currently used)
0178: * @param insertAt Index at which to insert the new node in the program
0179: */
0180: void nodeInsert(char opcode, int opdata, int insertAt) {
0181: // Make room for a new node
0182: ensure(RE.nodeSize);
0183:
0184: // Move everything from insertAt to the end down nodeSize elements
0185: System.arraycopy(instruction, insertAt, instruction, insertAt
0186: + RE.nodeSize, lenInstruction - insertAt);
0187: instruction[insertAt + RE.offsetOpcode] = opcode;
0188: instruction[insertAt + RE.offsetOpdata] = (char) opdata;
0189: instruction[insertAt + RE.offsetNext] = 0;
0190: lenInstruction += RE.nodeSize;
0191: }
0192:
0193: /**
0194: * Appends a node to the end of a node chain
0195: * @param node Start of node chain to traverse
0196: * @param pointTo Node to have the tail of the chain point to
0197: */
0198: void setNextOfEnd(int node, int pointTo) {
0199: // Traverse the chain until the next offset is 0
0200: int next;
0201: while ((next = instruction[node + RE.offsetNext]) != 0) {
0202: node += next;
0203: }
0204:
0205: // Point the last node in the chain to pointTo.
0206: instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
0207: }
0208:
0209: /**
0210: * Adds a new node
0211: * @param opcode Opcode for node
0212: * @param opdata Opdata for node (only the low 16 bits are currently used)
0213: * @return Index of new node in program
0214: */
0215: int node(char opcode, int opdata) {
0216: // Make room for a new node
0217: ensure(RE.nodeSize);
0218:
0219: // Add new node at end
0220: instruction[lenInstruction + RE.offsetOpcode] = opcode;
0221: instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
0222: instruction[lenInstruction + RE.offsetNext] = 0;
0223: lenInstruction += RE.nodeSize;
0224:
0225: // Return index of new node
0226: return lenInstruction - RE.nodeSize;
0227: }
0228:
0229: /**
0230: * Throws a new internal error exception
0231: * @exception Error Thrown in the event of an internal error.
0232: */
0233: void internalError() throws Error {
0234: throw new Error("Internal error!");
0235: }
0236:
0237: /**
0238: * Throws a new syntax error exception
0239: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0240: */
0241: void syntaxError(String s) throws RESyntaxException {
0242: throw new RESyntaxException(s);
0243: }
0244:
0245: /**
0246: * Allocate storage for brackets only as needed
0247: */
0248: void allocBrackets() {
0249: // Allocate bracket stacks if not already done
0250: if (bracketStart == null) {
0251: // Allocate storage
0252: bracketStart = new int[maxBrackets];
0253: bracketEnd = new int[maxBrackets];
0254: bracketMin = new int[maxBrackets];
0255: bracketOpt = new int[maxBrackets];
0256:
0257: // Initialize to invalid values
0258: for (int i = 0; i < maxBrackets; i++) {
0259: bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
0260: }
0261: }
0262: }
0263:
0264: /**
0265: * Match bracket {m,n} expression put results in bracket member variables
0266: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0267: */
0268: void bracket() throws RESyntaxException {
0269: // Current character must be a '{'
0270: if (idx >= len || pattern.charAt(idx++) != '{') {
0271: internalError();
0272: }
0273:
0274: // Next char must be a digit
0275: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0276: syntaxError("Expected digit");
0277: }
0278:
0279: // Get min ('m' of {m,n}) number
0280: StringBuffer number = new StringBuffer();
0281: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0282: number.append(pattern.charAt(idx++));
0283: }
0284: try {
0285: bracketMin[brackets] = Integer.parseInt(number.toString());
0286: } catch (NumberFormatException e) {
0287: syntaxError("Expected valid number");
0288: }
0289:
0290: // If out of input, fail
0291: if (idx >= len) {
0292: syntaxError("Expected comma or right bracket");
0293: }
0294:
0295: // If end of expr, optional limit is 0
0296: if (pattern.charAt(idx) == '}') {
0297: idx++;
0298: bracketOpt[brackets] = 0;
0299: return;
0300: }
0301:
0302: // Must have at least {m,} and maybe {m,n}.
0303: if (idx >= len || pattern.charAt(idx++) != ',') {
0304: syntaxError("Expected comma");
0305: }
0306:
0307: // If out of input, fail
0308: if (idx >= len) {
0309: syntaxError("Expected comma or right bracket");
0310: }
0311:
0312: // If {m,} max is unlimited
0313: if (pattern.charAt(idx) == '}') {
0314: idx++;
0315: bracketOpt[brackets] = bracketUnbounded;
0316: return;
0317: }
0318:
0319: // Next char must be a digit
0320: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0321: syntaxError("Expected digit");
0322: }
0323:
0324: // Get max number
0325: number.setLength(0);
0326: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0327: number.append(pattern.charAt(idx++));
0328: }
0329: try {
0330: bracketOpt[brackets] = Integer.parseInt(number.toString())
0331: - bracketMin[brackets];
0332: } catch (NumberFormatException e) {
0333: syntaxError("Expected valid number");
0334: }
0335:
0336: // Optional repetitions must be > 0
0337: if (bracketOpt[brackets] <= 0) {
0338: syntaxError("Bad range");
0339: }
0340:
0341: // Must have close brace
0342: if (idx >= len || pattern.charAt(idx++) != '}') {
0343: syntaxError("Missing close brace");
0344: }
0345: }
0346:
0347: /**
0348: * Match an escape sequence. Handles quoted chars and octal escapes as well
0349: * as normal escape characters. Always advances the input stream by the
0350: * right amount. This code "understands" the subtle difference between an
0351: * octal escape and a backref. You can access the type of ESC_CLASS or
0352: * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
0353: * @return ESC_* code or character if simple escape
0354: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0355: */
0356: char escape() throws RESyntaxException {
0357: // "Shouldn't" happen
0358: if (pattern.charAt(idx) != '\\') {
0359: internalError();
0360: }
0361:
0362: // Escape shouldn't occur as last character in string!
0363: if (idx + 1 == len) {
0364: syntaxError("Escape terminates string");
0365: }
0366:
0367: // Switch on character after backslash
0368: idx += 2;
0369: char escapeChar = pattern.charAt(idx - 1);
0370: switch (escapeChar) {
0371: case RE.E_BOUND:
0372: case RE.E_NBOUND:
0373: return ESC_COMPLEX;
0374:
0375: case RE.E_ALNUM:
0376: case RE.E_NALNUM:
0377: case RE.E_SPACE:
0378: case RE.E_NSPACE:
0379: case RE.E_DIGIT:
0380: case RE.E_NDIGIT:
0381: return ESC_CLASS;
0382:
0383: case 'u':
0384: case 'x': {
0385: // Exact required hex digits for escape type
0386: int hexDigits = (escapeChar == 'u' ? 4 : 2);
0387:
0388: // Parse up to hexDigits characters from input
0389: int val = 0;
0390: for (; idx < len && hexDigits-- > 0; idx++) {
0391: // Get char
0392: char c = pattern.charAt(idx);
0393:
0394: // If it's a hexadecimal digit (0-9)
0395: if (c >= '0' && c <= '9') {
0396: // Compute new value
0397: val = (val << 4) + c - '0';
0398: } else {
0399: // If it's a hexadecimal letter (a-f)
0400: c = Character.toLowerCase(c);
0401: if (c >= 'a' && c <= 'f') {
0402: // Compute new value
0403: val = (val << 4) + (c - 'a') + 10;
0404: } else {
0405: // If it's not a valid digit or hex letter, the escape must be invalid
0406: // because hexDigits of input have not been absorbed yet.
0407: syntaxError("Expected " + hexDigits
0408: + " hexadecimal digits after \\"
0409: + escapeChar);
0410: }
0411: }
0412: }
0413: return (char) val;
0414: }
0415:
0416: case 't':
0417: return '\t';
0418:
0419: case 'n':
0420: return '\n';
0421:
0422: case 'r':
0423: return '\r';
0424:
0425: case 'f':
0426: return '\f';
0427:
0428: case '0':
0429: case '1':
0430: case '2':
0431: case '3':
0432: case '4':
0433: case '5':
0434: case '6':
0435: case '7':
0436: case '8':
0437: case '9':
0438:
0439: // An octal escape starts with a 0 or has two digits in a row
0440: if ((idx < len && Character.isDigit(pattern.charAt(idx)))
0441: || escapeChar == '0') {
0442: // Handle \nnn octal escapes
0443: int val = escapeChar - '0';
0444: if (idx < len && Character.isDigit(pattern.charAt(idx))) {
0445: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
0446: if (idx < len
0447: && Character.isDigit(pattern.charAt(idx))) {
0448: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
0449: }
0450: }
0451: return (char) val;
0452: }
0453:
0454: // It's actually a backreference (\[1-9]), not an escape
0455: return ESC_BACKREF;
0456:
0457: default:
0458:
0459: // Simple quoting of a character
0460: return escapeChar;
0461: }
0462: }
0463:
0464: /**
0465: * Compile a character class
0466: * @return Index of class node
0467: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0468: */
0469: int characterClass() throws RESyntaxException {
0470: // Check for bad calling or empty class
0471: if (pattern.charAt(idx) != '[') {
0472: internalError();
0473: }
0474:
0475: // Check for unterminated or empty class
0476: if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
0477: syntaxError("Empty or unterminated class");
0478: }
0479:
0480: // Check for POSIX character class
0481: if (idx < len && pattern.charAt(idx) == ':') {
0482: // Skip colon
0483: idx++;
0484:
0485: // POSIX character classes are denoted with lowercase ASCII strings
0486: int idxStart = idx;
0487: while (idx < len && pattern.charAt(idx) >= 'a'
0488: && pattern.charAt(idx) <= 'z') {
0489: idx++;
0490: }
0491:
0492: // Should be a ":]" to terminate the POSIX character class
0493: if ((idx + 1) < len && pattern.charAt(idx) == ':'
0494: && pattern.charAt(idx + 1) == ']') {
0495: // Get character class
0496: String charClass = pattern.substring(idxStart, idx);
0497:
0498: // Select the POSIX class id
0499: Character i = (Character) hashPOSIX.get(charClass);
0500: if (i != null) {
0501: // Move past colon and right bracket
0502: idx += 2;
0503:
0504: // Return new POSIX character class node
0505: return node(RE.OP_POSIXCLASS, i.charValue());
0506: }
0507: syntaxError("Invalid POSIX character class '"
0508: + charClass + "'");
0509: }
0510: syntaxError("Invalid POSIX character class syntax");
0511: }
0512:
0513: // Try to build a class. Create OP_ANYOF node
0514: int ret = node(RE.OP_ANYOF, 0);
0515:
0516: // Parse class declaration
0517: char CHAR_INVALID = Character.MAX_VALUE;
0518: char last = CHAR_INVALID;
0519: char simpleChar = 0;
0520: boolean include = true;
0521: boolean definingRange = false;
0522: int idxFirst = idx;
0523: char rangeStart = Character.MIN_VALUE;
0524: char rangeEnd;
0525: RERange range = new RERange();
0526: while (idx < len && pattern.charAt(idx) != ']') {
0527:
0528: switchOnCharacter:
0529:
0530: // Switch on character
0531: switch (pattern.charAt(idx)) {
0532: case '^':
0533: include = !include;
0534: if (idx == idxFirst) {
0535: range.include(Character.MIN_VALUE,
0536: Character.MAX_VALUE, true);
0537: }
0538: idx++;
0539: continue;
0540:
0541: case '\\': {
0542: // Escape always advances the stream
0543: char c;
0544: switch (c = escape()) {
0545: case ESC_COMPLEX:
0546: case ESC_BACKREF:
0547:
0548: // Word boundaries and backrefs not allowed in a character class!
0549: syntaxError("Bad character class");
0550:
0551: case ESC_CLASS:
0552:
0553: // Classes can't be an endpoint of a range
0554: if (definingRange) {
0555: syntaxError("Bad character class");
0556: }
0557:
0558: // Handle specific type of class (some are ok)
0559: switch (pattern.charAt(idx - 1)) {
0560: case RE.E_NSPACE:
0561: case RE.E_NDIGIT:
0562: case RE.E_NALNUM:
0563: syntaxError("Bad character class");
0564:
0565: case RE.E_SPACE:
0566: range.include('\t', include);
0567: range.include('\r', include);
0568: range.include('\f', include);
0569: range.include('\n', include);
0570: range.include('\b', include);
0571: range.include(' ', include);
0572: break;
0573:
0574: case RE.E_ALNUM:
0575: range.include('a', 'z', include);
0576: range.include('A', 'Z', include);
0577: range.include('_', include);
0578:
0579: // Fall through!
0580:
0581: case RE.E_DIGIT:
0582: range.include('0', '9', include);
0583: break;
0584: }
0585:
0586: // Make last char invalid (can't be a range start)
0587: last = CHAR_INVALID;
0588: break;
0589:
0590: default:
0591:
0592: // Escape is simple so treat as a simple char
0593: simpleChar = c;
0594: break switchOnCharacter;
0595: }
0596: }
0597: continue;
0598:
0599: case '-':
0600:
0601: // Start a range if one isn't already started
0602: if (definingRange) {
0603: syntaxError("Bad class range");
0604: }
0605: definingRange = true;
0606:
0607: // If no last character, start of range is 0
0608: rangeStart = (last == CHAR_INVALID ? 0 : last);
0609:
0610: // Premature end of range. define up to Character.MAX_VALUE
0611: if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
0612: simpleChar = Character.MAX_VALUE;
0613: break;
0614: }
0615: continue;
0616:
0617: default:
0618: simpleChar = pattern.charAt(idx++);
0619: break;
0620: }
0621:
0622: // Handle simple character simpleChar
0623: if (definingRange) {
0624: // if we are defining a range make it now
0625: rangeEnd = simpleChar;
0626:
0627: // Actually create a range if the range is ok
0628: if (rangeStart >= rangeEnd) {
0629: syntaxError("Bad character class");
0630: }
0631: range.include(rangeStart, rangeEnd, include);
0632:
0633: // We are done defining the range
0634: last = CHAR_INVALID;
0635: definingRange = false;
0636: } else {
0637: // If simple character and not start of range, include it
0638: if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') {
0639: range.include(simpleChar, include);
0640: }
0641: last = simpleChar;
0642: }
0643: }
0644:
0645: // Shouldn't be out of input
0646: if (idx == len) {
0647: syntaxError("Unterminated character class");
0648: }
0649:
0650: // Absorb the ']' end of class marker
0651: idx++;
0652:
0653: // Emit character class definition
0654: instruction[ret + RE.offsetOpdata] = (char) range.num;
0655: for (int i = 0; i < range.num; i++) {
0656: emit((char) range.minRange[i]);
0657: emit((char) range.maxRange[i]);
0658: }
0659: return ret;
0660: }
0661:
0662: /**
0663: * Absorb an atomic character string. This method is a little tricky because
0664: * it can un-include the last character of string if a closure operator follows.
0665: * This is correct because *+? have higher precedence than concatentation (thus
0666: * ABC* means AB(C*) and NOT (ABC)*).
0667: * @return Index of new atom node
0668: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0669: */
0670: int atom() throws RESyntaxException {
0671: // Create a string node
0672: int ret = node(RE.OP_ATOM, 0);
0673:
0674: // Length of atom
0675: int lenAtom = 0;
0676:
0677: // Loop while we've got input
0678:
0679: atomLoop:
0680:
0681: while (idx < len) {
0682: // Is there a next char?
0683: if ((idx + 1) < len) {
0684: char c = pattern.charAt(idx + 1);
0685:
0686: // If the next 'char' is an escape, look past the whole escape
0687: if (pattern.charAt(idx) == '\\') {
0688: int idxEscape = idx;
0689: escape();
0690: if (idx < len) {
0691: c = pattern.charAt(idx);
0692: }
0693: idx = idxEscape;
0694: }
0695:
0696: // Switch on next char
0697: switch (c) {
0698: case '{':
0699: case '?':
0700: case '*':
0701: case '+':
0702:
0703: // If the next character is a closure operator and our atom is non-empty, the
0704: // current character should bind to the closure operator rather than the atom
0705: if (lenAtom != 0) {
0706: break atomLoop;
0707: }
0708: }
0709: }
0710:
0711: // Switch on current char
0712: switch (pattern.charAt(idx)) {
0713: case ']':
0714: case '^':
0715: case '$':
0716: case '.':
0717: case '[':
0718: case '(':
0719: case ')':
0720: case '|':
0721: break atomLoop;
0722:
0723: case '{':
0724: case '?':
0725: case '*':
0726: case '+':
0727:
0728: // We should have an atom by now
0729: if (lenAtom == 0) {
0730: // No atom before closure
0731: syntaxError("Missing operand to closure");
0732: }
0733: break atomLoop;
0734:
0735: case '\\':
0736:
0737: {
0738: // Get the escaped character (advances input automatically)
0739: int idxBeforeEscape = idx;
0740: char c = escape();
0741:
0742: // Check if it's a simple escape (as opposed to, say, a backreference)
0743: if ((c & ESC_MASK) == ESC_MASK) {
0744: // Not a simple escape, so backup to where we were before the escape.
0745: idx = idxBeforeEscape;
0746: break atomLoop;
0747: }
0748:
0749: // Add escaped char to atom
0750: emit(c);
0751: lenAtom++;
0752: }
0753: break;
0754:
0755: default:
0756:
0757: // Add normal character to atom
0758: emit(pattern.charAt(idx++));
0759: lenAtom++;
0760: break;
0761: }
0762: }
0763:
0764: // This "shouldn't" happen
0765: if (lenAtom == 0) {
0766: internalError();
0767: }
0768:
0769: // Emit the atom length into the program
0770: instruction[ret + RE.offsetOpdata] = (char) lenAtom;
0771: return ret;
0772: }
0773:
0774: /**
0775: * Match a terminal node.
0776: * @param flags Flags
0777: * @return Index of terminal node (closeable)
0778: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0779: */
0780: int terminal(int[] flags) throws RESyntaxException {
0781: switch (pattern.charAt(idx)) {
0782: case RE.OP_EOL:
0783: case RE.OP_BOL:
0784: case RE.OP_ANY:
0785: return node(pattern.charAt(idx++), 0);
0786:
0787: case '[':
0788: return characterClass();
0789:
0790: case '(':
0791: return expr(flags);
0792:
0793: case ')':
0794: syntaxError("Unexpected close paren");
0795:
0796: case '|':
0797: internalError();
0798:
0799: case ']':
0800: syntaxError("Mismatched class");
0801:
0802: case 0:
0803: syntaxError("Unexpected end of input");
0804:
0805: case '?':
0806: case '+':
0807: case '{':
0808: case '*':
0809: syntaxError("Missing operand to closure");
0810:
0811: case '\\': {
0812: // Don't forget, escape() advances the input stream!
0813: int idxBeforeEscape = idx;
0814:
0815: // Switch on escaped character
0816: switch (escape()) {
0817: case ESC_CLASS:
0818: case ESC_COMPLEX:
0819: flags[0] &= ~NODE_NULLABLE;
0820: return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
0821:
0822: case ESC_BACKREF: {
0823: char backreference = (char) (pattern.charAt(idx - 1) - '0');
0824: if (parens <= backreference) {
0825: syntaxError("Bad backreference");
0826: }
0827: flags[0] |= NODE_NULLABLE;
0828: return node(RE.OP_BACKREF, backreference);
0829: }
0830:
0831: default:
0832:
0833: // We had a simple escape and we want to have it end up in
0834: // an atom, so we back up and fall though to the default handling
0835: idx = idxBeforeEscape;
0836: flags[0] &= ~NODE_NULLABLE;
0837: break;
0838: }
0839: }
0840: }
0841:
0842: // Everything above either fails or returns.
0843: // If it wasn't one of the above, it must be the start of an atom.
0844: flags[0] &= ~NODE_NULLABLE;
0845: return atom();
0846: }
0847:
0848: /**
0849: * Compile a possibly closured terminal
0850: * @param flags Flags passed by reference
0851: * @return Index of closured node
0852: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0853: */
0854: int closure(int[] flags) throws RESyntaxException {
0855: // Before terminal
0856: int idxBeforeTerminal = idx;
0857:
0858: // Values to pass by reference to terminal()
0859: int[] terminalFlags = { NODE_NORMAL };
0860:
0861: // Get terminal symbol
0862: int ret = terminal(terminalFlags);
0863:
0864: // Or in flags from terminal symbol
0865: flags[0] |= terminalFlags[0];
0866:
0867: // Advance input, set NODE_NULLABLE flag and do sanity checks
0868: if (idx >= len) {
0869: return ret;
0870: }
0871: boolean greedy = true;
0872: char closureType = pattern.charAt(idx);
0873: switch (closureType) {
0874: case '?':
0875: case '*':
0876:
0877: // The current node can be null
0878: flags[0] |= NODE_NULLABLE;
0879:
0880: case '+':
0881:
0882: // Eat closure character
0883: idx++;
0884:
0885: case '{':
0886:
0887: // Don't allow blantant stupidity
0888: int opcode = instruction[ret + RE.offsetOpcode];
0889: if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
0890: syntaxError("Bad closure operand");
0891: }
0892: if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
0893: syntaxError("Closure operand can't be nullable");
0894: }
0895: break;
0896: }
0897:
0898: // If the next character is a '?', make the closure non-greedy (reluctant)
0899: if (idx < len && pattern.charAt(idx) == '?') {
0900: idx++;
0901: greedy = false;
0902: }
0903:
0904: if (greedy) {
0905: // Actually do the closure now
0906: switch (closureType) {
0907: case '{': {
0908: // We look for our bracket in the list
0909: boolean found = false;
0910: int i;
0911: allocBrackets();
0912: for (i = 0; i < brackets; i++) {
0913: if (bracketStart[i] == idx) {
0914: found = true;
0915: break;
0916: }
0917: }
0918:
0919: // If its not in the list we parse the {m,n}
0920: if (!found) {
0921: if (brackets >= maxBrackets) {
0922: syntaxError("Too many bracketed closures (limit is 10)");
0923: }
0924: bracketStart[brackets] = idx;
0925: bracket();
0926: bracketEnd[brackets] = idx;
0927: i = brackets++;
0928: }
0929:
0930: // If there's a min, rewind stream and reparse
0931: if (--bracketMin[i] > 0) {
0932: // Rewind stream and run it through again
0933: idx = idxBeforeTerminal;
0934: break;
0935: }
0936:
0937: // Do the right thing for maximum ({m,})
0938: if (bracketOpt[i] == bracketFinished) {
0939: // Drop through now and closure expression.
0940: // We are done with the {m,} expr, so skip rest
0941: closureType = '*';
0942: bracketOpt[i] = 0;
0943: idx = bracketEnd[i];
0944: } else if (bracketOpt[i] == bracketUnbounded) {
0945: idx = idxBeforeTerminal;
0946: bracketOpt[i] = bracketFinished;
0947: break;
0948: } else if (bracketOpt[i]-- > 0) {
0949: // Drop through to optionally close and then 'play it again sam!'
0950: idx = idxBeforeTerminal;
0951: closureType = '?';
0952: } else {
0953: // We are done. skip the rest of {m,n} expr
0954: idx = bracketEnd[i];
0955: break;
0956: }
0957: }
0958:
0959: // Fall through!
0960:
0961: case '?':
0962: case '*':
0963:
0964: if (!greedy) {
0965: break;
0966: }
0967:
0968: if (closureType == '?') {
0969: // X? is compiled as (X|)
0970: nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
0971: setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // inserted branch to option
0972: int nothing = node(RE.OP_NOTHING, 0); // which is OP_NOTHING
0973: setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
0974: setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
0975: }
0976:
0977: if (closureType == '*') {
0978: // X* is compiled as (X{gotoX}|)
0979: nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
0980: setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH,
0981: 0)); // end of X points to an option
0982: setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
0983: setNextOfEnd(ret + RE.nodeSize, ret); // the start again
0984: setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
0985: setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
0986: }
0987: break;
0988:
0989: case '+': {
0990: // X+ is compiled as X({gotoX}|)
0991: int branch;
0992: branch = node(RE.OP_BRANCH, 0); // a new branch
0993: setNextOfEnd(ret, branch); // is added to the end of X
0994: setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
0995: setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
0996: setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
0997: }
0998: break;
0999: }
1000: } else {
1001: // Add end after closured subexpr
1002: setNextOfEnd(ret, node(RE.OP_END, 0));
1003:
1004: // Actually do the closure now
1005: switch (closureType) {
1006: case '?':
1007: nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1008: break;
1009:
1010: case '*':
1011: nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1012: break;
1013:
1014: case '+':
1015: nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1016: break;
1017: }
1018:
1019: // Point to the expr after the closure
1020: setNextOfEnd(ret, lenInstruction);
1021: }
1022: return ret;
1023: }
1024:
1025: /**
1026: * Compile one branch of an or operator (implements concatenation)
1027: * @param flags Flags passed by reference
1028: * @return Pointer to branch node
1029: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1030: */
1031: int branch(int[] flags) throws RESyntaxException {
1032: // Get each possibly closured piece and concat
1033: int node;
1034: int ret = node(RE.OP_BRANCH, 0);
1035: int chain = -1;
1036: int[] closureFlags = new int[1];
1037: boolean nullable = true;
1038: while (idx < len && pattern.charAt(idx) != '|'
1039: && pattern.charAt(idx) != ')') {
1040: // Get new node
1041: closureFlags[0] = NODE_NORMAL;
1042: node = closure(closureFlags);
1043: if (closureFlags[0] == NODE_NORMAL) {
1044: nullable = false;
1045: }
1046:
1047: // If there's a chain, append to the end
1048: if (chain != -1) {
1049: setNextOfEnd(chain, node);
1050: }
1051:
1052: // Chain starts at current
1053: chain = node;
1054: }
1055:
1056: // If we don't run loop, make a nothing node
1057: if (chain == -1) {
1058: node(RE.OP_NOTHING, 0);
1059: }
1060:
1061: // Set nullable flag for this branch
1062: if (nullable) {
1063: flags[0] |= NODE_NULLABLE;
1064: }
1065: return ret;
1066: }
1067:
1068: /**
1069: * Compile an expression with possible parens around it. Paren matching
1070: * is done at this level so we can tie the branch tails together.
1071: * @param flags Flag value passed by reference
1072: * @return Node index of expression in instruction array
1073: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1074: */
1075: int expr(int[] flags) throws RESyntaxException {
1076: // Create open paren node unless we were called from the top level (which has no parens)
1077: boolean paren = false;
1078: int ret = -1;
1079: int closeParens = parens;
1080: if ((flags[0] & NODE_TOPLEVEL) == 0
1081: && pattern.charAt(idx) == '(') {
1082: idx++;
1083: paren = true;
1084: ret = node(RE.OP_OPEN, parens++);
1085: }
1086: flags[0] &= ~NODE_TOPLEVEL;
1087:
1088: // Create a branch node
1089: int branch = branch(flags);
1090: if (ret == -1) {
1091: ret = branch;
1092: } else {
1093: setNextOfEnd(ret, branch);
1094: }
1095:
1096: // Loop through branches
1097: while (idx < len && pattern.charAt(idx) == '|') {
1098: idx++;
1099: branch = branch(flags);
1100: setNextOfEnd(ret, branch);
1101: }
1102:
1103: // Create an ending node (either a close paren or an OP_END)
1104: int end;
1105: if (paren) {
1106: if (idx < len && pattern.charAt(idx) == ')') {
1107: idx++;
1108: } else {
1109: syntaxError("Missing close paren");
1110: }
1111: end = node(RE.OP_CLOSE, closeParens);
1112: } else {
1113: end = node(RE.OP_END, 0);
1114: }
1115:
1116: // Append the ending node to the ret nodelist
1117: setNextOfEnd(ret, end);
1118:
1119: // Hook the ends of each branch to the end node
1120: for (int next = -1, i = ret; next != 0; next = instruction[i
1121: + RE.offsetNext], i += next) {
1122: // If branch, make the end of the branch's operand chain point to the end node.
1123: if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) {
1124: setNextOfEnd(i + RE.nodeSize, end);
1125: }
1126: }
1127:
1128: // Return the node list
1129: return ret;
1130: }
1131:
1132: /**
1133: * Compiles a regular expression pattern into a program runnable by the pattern
1134: * matcher class 'RE'.
1135: * @param pattern Regular expression pattern to compile (see RECompiler class
1136: * for details).
1137: * @return A compiled regular expression program.
1138: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1139: * @see RECompiler
1140: * @see RE
1141: */
1142: public REProgram compile(String pattern) throws RESyntaxException {
1143: // Initialize variables for compilation
1144: this .pattern = pattern; // Save pattern in instance variable
1145: len = pattern.length(); // Precompute pattern length for speed
1146: idx = 0; // Set parsing index to the first character
1147: lenInstruction = 0; // Set emitted instruction count to zero
1148: parens = 1; // Set paren level to 1 (the implicit outer parens)
1149: brackets = 0; // No bracketed closures yet
1150:
1151: // Initialize pass by reference flags value
1152: int[] flags = { NODE_TOPLEVEL };
1153:
1154: // Parse expression
1155: expr(flags);
1156:
1157: // Should be at end of input
1158: if (idx != len) {
1159: if (pattern.charAt(idx) == ')') {
1160: syntaxError("Unmatched close paren");
1161: }
1162: syntaxError("Unexpected input remains");
1163: }
1164:
1165: // Return the result
1166: char[] ins = new char[lenInstruction];
1167: System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1168: return new REProgram(ins);
1169: }
1170:
1171: /**
1172: * Local, nested class for maintaining character ranges for character classes.
1173: */
1174: class RERange {
1175: int size = 16; // Capacity of current range arrays
1176: int[] minRange = new int[size]; // Range minima
1177: int[] maxRange = new int[size]; // Range maxima
1178: int num = 0; // Number of range array elements in use
1179:
1180: /**
1181: * Deletes the range at a given index from the range lists
1182: * @param index Index of range to delete from minRange and maxRange arrays.
1183: */
1184: void delete(int index) {
1185: // Return if no elements left or index is out of range
1186: if (num == 0 || index >= num) {
1187: return;
1188: }
1189:
1190: // Move elements down
1191: while (index++ < num) {
1192: if (index - 1 >= 0) {
1193: minRange[index - 1] = minRange[index];
1194: maxRange[index - 1] = maxRange[index];
1195: }
1196: }
1197:
1198: // One less element now
1199: num--;
1200: }
1201:
1202: /**
1203: * Merges a range into the range list, coalescing ranges if possible.
1204: * @param min Minimum end of range
1205: * @param max Maximum end of range
1206: */
1207: void merge(int min, int max) {
1208: // Loop through ranges
1209: for (int i = 0; i < num; i++) {
1210: // Min-max is subsumed by minRange[i]-maxRange[i]
1211: if (min >= minRange[i] && max <= maxRange[i]) {
1212: return;
1213: }
1214:
1215: // Min-max subsumes minRange[i]-maxRange[i]
1216: else if (min <= minRange[i] && max >= maxRange[i]) {
1217: delete(i);
1218: merge(min, max);
1219: return;
1220: }
1221:
1222: // Min is in the range, but max is outside
1223: else if (min >= minRange[i] && min <= maxRange[i]) {
1224: delete(i);
1225: min = minRange[i];
1226: merge(min, max);
1227: return;
1228: }
1229:
1230: // Max is in the range, but min is outside
1231: else if (max >= minRange[i] && max <= maxRange[i]) {
1232: delete(i);
1233: max = maxRange[i];
1234: merge(min, max);
1235: return;
1236: }
1237: }
1238:
1239: // Must not overlap any other ranges
1240: if (num >= size) {
1241: size *= 2;
1242: int[] newMin = new int[size];
1243: int[] newMax = new int[size];
1244: System.arraycopy(minRange, 0, newMin, 0, num);
1245: System.arraycopy(maxRange, 0, newMax, 0, num);
1246: minRange = newMin;
1247: maxRange = newMax;
1248: }
1249: minRange[num] = min;
1250: maxRange[num] = max;
1251: num++;
1252: }
1253:
1254: /**
1255: * Removes a range by deleting or shrinking all other ranges
1256: * @param min Minimum end of range
1257: * @param max Maximum end of range
1258: */
1259: void remove(int min, int max) {
1260: // Loop through ranges
1261: for (int i = 0; i < num; i++) {
1262: // minRange[i]-maxRange[i] is subsumed by min-max
1263: if (minRange[i] >= min && maxRange[i] <= max) {
1264: delete(i);
1265: i--;
1266: return;
1267: }
1268:
1269: // min-max is subsumed by minRange[i]-maxRange[i]
1270: else if (min >= minRange[i] && max <= maxRange[i]) {
1271: int minr = minRange[i];
1272: int maxr = maxRange[i];
1273: delete(i);
1274: if (minr < min - 1) {
1275: merge(minr, min - 1);
1276: }
1277: if (max + 1 < maxr) {
1278: merge(max + 1, maxr);
1279: }
1280: return;
1281: }
1282:
1283: // minRange is in the range, but maxRange is outside
1284: else if (minRange[i] >= min && minRange[i] <= max) {
1285: minRange[i] = max + 1;
1286: return;
1287: }
1288:
1289: // maxRange is in the range, but minRange is outside
1290: else if (maxRange[i] >= min && maxRange[i] <= max) {
1291: maxRange[i] = min - 1;
1292: return;
1293: }
1294: }
1295: }
1296:
1297: /**
1298: * Includes (or excludes) the range from min to max, inclusive.
1299: * @param min Minimum end of range
1300: * @param max Maximum end of range
1301: * @param include True if range should be included. False otherwise.
1302: */
1303: void include(int min, int max, boolean include) {
1304: if (include) {
1305: merge(min, max);
1306: } else {
1307: remove(min, max);
1308: }
1309: }
1310:
1311: /**
1312: * Includes a range with the same min and max
1313: * @param minmax Minimum and maximum end of range (inclusive)
1314: * @param include True if range should be included. False otherwise.
1315: */
1316: void include(char minmax, boolean include) {
1317: include(minmax, minmax, include);
1318: }
1319: }
1320: }
|