0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.regexp;
0019:
0020: import java.util.Hashtable;
0021:
0022: /**
0023: * A regular expression compiler class. This class compiles a pattern string into a
0024: * regular expression program interpretable by the RE evaluator class. The 'recompile'
0025: * command line tool uses this compiler to pre-compile regular expressions for use
0026: * with RE. For a description of the syntax accepted by RECompiler and what you can
0027: * do with regular expressions, see the documentation for the RE matcher class.
0028: *
0029: * @see RE
0030: * @see recompile
0031: *
0032: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0033: * @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
0034: * @version $Id: RECompiler.java 518156 2007-03-14 14:31:26Z vgritsenko $
0035: */
0036: public class RECompiler {
0037: // The compiled program
0038: char[] instruction; // The compiled RE 'program' instruction buffer
0039: int lenInstruction; // The amount of the program buffer currently in use
0040:
0041: // Input state for compiling regular expression
0042: String pattern; // Input string
0043: int len; // Length of the pattern string
0044: int idx; // Current input index into ac
0045: int parens; // Total number of paren pairs
0046:
0047: // Node flags
0048: static final int NODE_NORMAL = 0; // No flags (nothing special)
0049: static final int NODE_NULLABLE = 1; // True if node is potentially null
0050: static final int NODE_TOPLEVEL = 2; // True if top level expr
0051:
0052: // Special types of 'escapes'
0053: static final int ESC_MASK = 0xffff0; // Escape complexity mask
0054: static final int ESC_BACKREF = 0xfffff; // Escape is really a backreference
0055: static final int ESC_COMPLEX = 0xffffe; // Escape isn't really a true character
0056: static final int ESC_CLASS = 0xffffd; // Escape represents a whole class of characters
0057:
0058: // {m,n} stacks
0059: static final int bracketUnbounded = -1; // Unbounded value
0060: int bracketMin; // Minimum number of matches
0061: int bracketOpt; // Additional optional matches
0062:
0063: // Lookup table for POSIX character class names
0064: static final Hashtable hashPOSIX = new Hashtable();
0065: static {
0066: hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
0067: hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
0068: hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
0069: hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
0070: hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
0071: hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
0072: hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
0073: hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
0074: hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
0075: hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
0076: hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
0077: hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
0078: hashPOSIX
0079: .put("javastart", new Character(RE.POSIX_CLASS_JSTART));
0080: hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
0081: }
0082:
0083: /**
0084: * Constructor. Creates (initially empty) storage for a regular expression program.
0085: */
0086: public RECompiler() {
0087: // Start off with a generous, yet reasonable, initial size
0088: instruction = new char[128];
0089: lenInstruction = 0;
0090: }
0091:
0092: /**
0093: * Ensures that n more characters can fit in the program buffer.
0094: * If n more can't fit, then the size is doubled until it can.
0095: * @param n Number of additional characters to ensure will fit.
0096: */
0097: void ensure(int n) {
0098: // Get current program length
0099: int curlen = instruction.length;
0100:
0101: // If the current length + n more is too much
0102: if (lenInstruction + n >= curlen) {
0103: // Double the size of the program array until n more will fit
0104: while (lenInstruction + n >= curlen) {
0105: curlen *= 2;
0106: }
0107:
0108: // Allocate new program array and move data into it
0109: char[] newInstruction = new char[curlen];
0110: System.arraycopy(instruction, 0, newInstruction, 0,
0111: lenInstruction);
0112: instruction = newInstruction;
0113: }
0114: }
0115:
0116: /**
0117: * Emit a single character into the program stream.
0118: * @param c Character to add
0119: */
0120: void emit(char c) {
0121: // Make room for character
0122: ensure(1);
0123:
0124: // Add character
0125: instruction[lenInstruction++] = c;
0126: }
0127:
0128: /**
0129: * Inserts a node with a given opcode and opdata at insertAt. The node relative next
0130: * pointer is initialized to 0.
0131: * @param opcode Opcode for new node
0132: * @param opdata Opdata for new node (only the low 16 bits are currently used)
0133: * @param insertAt Index at which to insert the new node in the program
0134: */
0135: void nodeInsert(char opcode, int opdata, int insertAt) {
0136: // Make room for a new node
0137: ensure(RE.nodeSize);
0138:
0139: // Move everything from insertAt to the end down nodeSize elements
0140: System.arraycopy(instruction, insertAt, instruction, insertAt
0141: + RE.nodeSize, lenInstruction - insertAt);
0142: instruction[insertAt /* + RE.offsetOpcode */] = opcode;
0143: instruction[insertAt + RE.offsetOpdata] = (char) opdata;
0144: instruction[insertAt + RE.offsetNext] = 0;
0145: lenInstruction += RE.nodeSize;
0146: }
0147:
0148: /**
0149: * Appends a node to the end of a node chain
0150: * @param node Start of node chain to traverse
0151: * @param pointTo Node to have the tail of the chain point to
0152: */
0153: void setNextOfEnd(int node, int pointTo) {
0154: // Traverse the chain until the next offset is 0
0155: int next = instruction[node + RE.offsetNext];
0156: // while the 'node' is not the last in the chain
0157: // and the 'node' is not the last in the program.
0158: while (next != 0 && node < lenInstruction) {
0159: // if the node we are supposed to point to is in the chain then
0160: // point to the end of the program instead.
0161: // Michael McCallum <gholam@xtra.co.nz>
0162: // FIXME: This is a _hack_ to stop infinite programs.
0163: // I believe that the implementation of the reluctant matches is wrong but
0164: // have not worked out a better way yet.
0165: if (node == pointTo) {
0166: pointTo = lenInstruction;
0167: }
0168: node += next;
0169: next = instruction[node + RE.offsetNext];
0170: }
0171:
0172: // if we have reached the end of the program then dont set the pointTo.
0173: // im not sure if this will break any thing but passes all the tests.
0174: if (node < lenInstruction) {
0175: // Some patterns result in very large programs which exceed
0176: // capacity of the short used for specifying signed offset of the
0177: // next instruction. Example: a{1638}
0178: int offset = pointTo - node;
0179: if (offset != (short) offset) {
0180: throw new RESyntaxException(
0181: "Exceeded short jump range.");
0182: }
0183:
0184: // Point the last node in the chain to pointTo.
0185: instruction[node + RE.offsetNext] = (char) (short) offset;
0186: }
0187: }
0188:
0189: /**
0190: * Adds a new node
0191: * @param opcode Opcode for node
0192: * @param opdata Opdata for node (only the low 16 bits are currently used)
0193: * @return Index of new node in program
0194: */
0195: int node(char opcode, int opdata) {
0196: // Make room for a new node
0197: ensure(RE.nodeSize);
0198:
0199: // Add new node at end
0200: instruction[lenInstruction /* + RE.offsetOpcode */] = opcode;
0201: instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
0202: instruction[lenInstruction + RE.offsetNext] = 0;
0203: lenInstruction += RE.nodeSize;
0204:
0205: // Return index of new node
0206: return lenInstruction - RE.nodeSize;
0207: }
0208:
0209: /**
0210: * Throws a new internal error exception
0211: * @exception Error Thrown in the event of an internal error.
0212: */
0213: void internalError() throws Error {
0214: throw new Error("Internal error!");
0215: }
0216:
0217: /**
0218: * Throws a new syntax error exception
0219: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0220: */
0221: void syntaxError(String s) throws RESyntaxException {
0222: throw new RESyntaxException(s);
0223: }
0224:
0225: /**
0226: * Match bracket {m,n} expression put results in bracket member variables
0227: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0228: */
0229: void bracket() throws RESyntaxException {
0230: // Current character must be a '{'
0231: if (idx >= len || pattern.charAt(idx++) != '{') {
0232: internalError();
0233: }
0234:
0235: // Next char must be a digit
0236: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0237: syntaxError("Expected digit");
0238: }
0239:
0240: // Get min ('m' of {m,n}) number
0241: StringBuffer number = new StringBuffer();
0242: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0243: number.append(pattern.charAt(idx++));
0244: }
0245: try {
0246: bracketMin = Integer.parseInt(number.toString());
0247: } catch (NumberFormatException e) {
0248: syntaxError("Expected valid number");
0249: }
0250:
0251: // If out of input, fail
0252: if (idx >= len) {
0253: syntaxError("Expected comma or right bracket");
0254: }
0255:
0256: // If end of expr, optional limit is 0
0257: if (pattern.charAt(idx) == '}') {
0258: idx++;
0259: bracketOpt = 0;
0260: return;
0261: }
0262:
0263: // Must have at least {m,} and maybe {m,n}.
0264: if (idx >= len || pattern.charAt(idx++) != ',') {
0265: syntaxError("Expected comma");
0266: }
0267:
0268: // If out of input, fail
0269: if (idx >= len) {
0270: syntaxError("Expected comma or right bracket");
0271: }
0272:
0273: // If {m,} max is unlimited
0274: if (pattern.charAt(idx) == '}') {
0275: idx++;
0276: bracketOpt = bracketUnbounded;
0277: return;
0278: }
0279:
0280: // Next char must be a digit
0281: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0282: syntaxError("Expected digit");
0283: }
0284:
0285: // Get max number
0286: number.setLength(0);
0287: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0288: number.append(pattern.charAt(idx++));
0289: }
0290: try {
0291: bracketOpt = Integer.parseInt(number.toString())
0292: - bracketMin;
0293: } catch (NumberFormatException e) {
0294: syntaxError("Expected valid number");
0295: }
0296:
0297: // Optional repetitions must be >= 0
0298: if (bracketOpt < 0) {
0299: syntaxError("Bad range");
0300: }
0301:
0302: // Must have close brace
0303: if (idx >= len || pattern.charAt(idx++) != '}') {
0304: syntaxError("Missing close brace");
0305: }
0306: }
0307:
0308: /**
0309: * Match an escape sequence. Handles quoted chars and octal escapes as well
0310: * as normal escape characters. Always advances the input stream by the
0311: * right amount. This code "understands" the subtle difference between an
0312: * octal escape and a backref. You can access the type of ESC_CLASS or
0313: * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
0314: * @return ESC_* code or character if simple escape
0315: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0316: */
0317: int escape() throws RESyntaxException {
0318: // "Shouldn't" happen
0319: if (pattern.charAt(idx) != '\\') {
0320: internalError();
0321: }
0322:
0323: // Escape shouldn't occur as last character in string!
0324: if (idx + 1 == len) {
0325: syntaxError("Escape terminates string");
0326: }
0327:
0328: // Switch on character after backslash
0329: idx += 2;
0330: char escapeChar = pattern.charAt(idx - 1);
0331: switch (escapeChar) {
0332: case RE.E_BOUND:
0333: case RE.E_NBOUND:
0334: return ESC_COMPLEX;
0335:
0336: case RE.E_ALNUM:
0337: case RE.E_NALNUM:
0338: case RE.E_SPACE:
0339: case RE.E_NSPACE:
0340: case RE.E_DIGIT:
0341: case RE.E_NDIGIT:
0342: return ESC_CLASS;
0343:
0344: case 'u':
0345: case 'x': {
0346: // Exact required hex digits for escape type
0347: int hexDigits = (escapeChar == 'u' ? 4 : 2);
0348:
0349: // Parse up to hexDigits characters from input
0350: int val = 0;
0351: for (; idx < len && hexDigits-- > 0; idx++) {
0352: // Get char
0353: char c = pattern.charAt(idx);
0354:
0355: // If it's a hexadecimal digit (0-9)
0356: if (c >= '0' && c <= '9') {
0357: // Compute new value
0358: val = (val << 4) + c - '0';
0359: } else {
0360: // If it's a hexadecimal letter (a-f)
0361: c = Character.toLowerCase(c);
0362: if (c >= 'a' && c <= 'f') {
0363: // Compute new value
0364: val = (val << 4) + (c - 'a') + 10;
0365: } else {
0366: // If it's not a valid digit or hex letter, the escape must be invalid
0367: // because hexDigits of input have not been absorbed yet.
0368: syntaxError("Expected " + hexDigits
0369: + " hexadecimal digits after \\"
0370: + escapeChar);
0371: }
0372: }
0373: }
0374: return val;
0375: }
0376:
0377: case 't':
0378: return '\t';
0379:
0380: case 'n':
0381: return '\n';
0382:
0383: case 'r':
0384: return '\r';
0385:
0386: case 'f':
0387: return '\f';
0388:
0389: case '0':
0390: case '1':
0391: case '2':
0392: case '3':
0393: case '4':
0394: case '5':
0395: case '6':
0396: case '7':
0397: case '8':
0398: case '9':
0399:
0400: // An octal escape starts with a 0 or has two digits in a row
0401: if ((idx < len && Character.isDigit(pattern.charAt(idx)))
0402: || escapeChar == '0') {
0403: // Handle \nnn octal escapes
0404: int val = escapeChar - '0';
0405: if (idx < len && Character.isDigit(pattern.charAt(idx))) {
0406: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
0407: if (idx < len
0408: && Character.isDigit(pattern.charAt(idx))) {
0409: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
0410: }
0411: }
0412: return val;
0413: }
0414:
0415: // It's actually a backreference (\[1-9]), not an escape
0416: return ESC_BACKREF;
0417:
0418: default:
0419:
0420: // Simple quoting of a character
0421: return escapeChar;
0422: }
0423: }
0424:
0425: /**
0426: * Compile a character class
0427: * @return Index of class node
0428: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0429: */
0430: int characterClass() throws RESyntaxException {
0431: // Check for bad calling or empty class
0432: if (pattern.charAt(idx) != '[') {
0433: internalError();
0434: }
0435:
0436: // Check for unterminated or empty class
0437: if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
0438: syntaxError("Empty or unterminated class");
0439: }
0440:
0441: // Check for POSIX character class
0442: if (idx < len && pattern.charAt(idx) == ':') {
0443: // Skip colon
0444: idx++;
0445:
0446: // POSIX character classes are denoted with lowercase ASCII strings
0447: int idxStart = idx;
0448: while (idx < len && pattern.charAt(idx) >= 'a'
0449: && pattern.charAt(idx) <= 'z') {
0450: idx++;
0451: }
0452:
0453: // Should be a ":]" to terminate the POSIX character class
0454: if ((idx + 1) < len && pattern.charAt(idx) == ':'
0455: && pattern.charAt(idx + 1) == ']') {
0456: // Get character class
0457: String charClass = pattern.substring(idxStart, idx);
0458:
0459: // Select the POSIX class id
0460: Character i = (Character) hashPOSIX.get(charClass);
0461: if (i != null) {
0462: // Move past colon and right bracket
0463: idx += 2;
0464:
0465: // Return new POSIX character class node
0466: return node(RE.OP_POSIXCLASS, i.charValue());
0467: }
0468: syntaxError("Invalid POSIX character class '"
0469: + charClass + "'");
0470: }
0471: syntaxError("Invalid POSIX character class syntax");
0472: }
0473:
0474: // Try to build a class. Create OP_ANYOF node
0475: int ret = node(RE.OP_ANYOF, 0);
0476:
0477: // Parse class declaration
0478: char CHAR_INVALID = Character.MAX_VALUE;
0479: char last = CHAR_INVALID;
0480: char simpleChar;
0481: boolean include = true;
0482: boolean definingRange = false;
0483: int idxFirst = idx;
0484: char rangeStart = Character.MIN_VALUE;
0485: char rangeEnd;
0486: RERange range = new RERange();
0487: while (idx < len && pattern.charAt(idx) != ']') {
0488:
0489: switchOnCharacter:
0490:
0491: // Switch on character
0492: switch (pattern.charAt(idx)) {
0493: case '^':
0494: include = !include;
0495: if (idx == idxFirst) {
0496: range.include(Character.MIN_VALUE,
0497: Character.MAX_VALUE, true);
0498: }
0499: idx++;
0500: continue;
0501:
0502: case '\\': {
0503: // Escape always advances the stream
0504: int c;
0505: switch (c = escape()) {
0506: case ESC_COMPLEX:
0507: case ESC_BACKREF:
0508:
0509: // Word boundaries and backrefs not allowed in a character class!
0510: syntaxError("Bad character class");
0511:
0512: case ESC_CLASS:
0513:
0514: // Classes can't be an endpoint of a range
0515: if (definingRange) {
0516: syntaxError("Bad character class");
0517: }
0518:
0519: // Handle specific type of class (some are ok)
0520: switch (pattern.charAt(idx - 1)) {
0521: case RE.E_NSPACE:
0522: range.include(Character.MIN_VALUE, 7, include); // [Min - \b )
0523: range.include((char) 11, include); // ( \n - \f )
0524: range.include(14, 31, include); // ( \r - ' ')
0525: range.include(33, Character.MAX_VALUE, include); // (' ' - Max]
0526: break;
0527:
0528: case RE.E_NALNUM:
0529: range
0530: .include(Character.MIN_VALUE, '/',
0531: include); // [Min - '0')
0532: range.include(':', '@', include); // ('9' - 'A')
0533: range.include('[', '^', include); // ('Z' - '_')
0534: range.include('`', include); // ('_' - 'a')
0535: range
0536: .include('{', Character.MAX_VALUE,
0537: include); // ('z' - Max]
0538: break;
0539:
0540: case RE.E_NDIGIT:
0541: range
0542: .include(Character.MIN_VALUE, '/',
0543: include); // [Min - '0')
0544: range
0545: .include(':', Character.MAX_VALUE,
0546: include); // ('9' - Max]
0547: break;
0548:
0549: case RE.E_SPACE:
0550: range.include('\t', include);
0551: range.include('\r', include);
0552: range.include('\f', include);
0553: range.include('\n', include);
0554: range.include('\b', include);
0555: range.include(' ', include);
0556: break;
0557:
0558: case RE.E_ALNUM:
0559: range.include('a', 'z', include);
0560: range.include('A', 'Z', include);
0561: range.include('_', include);
0562:
0563: // Fall through!
0564:
0565: case RE.E_DIGIT:
0566: range.include('0', '9', include);
0567: break;
0568: }
0569:
0570: // Make last char invalid (can't be a range start)
0571: last = CHAR_INVALID;
0572: break;
0573:
0574: default:
0575:
0576: // Escape is simple so treat as a simple char
0577: simpleChar = (char) c;
0578: break switchOnCharacter;
0579: }
0580: }
0581: continue;
0582:
0583: case '-':
0584:
0585: // Start a range if one isn't already started
0586: if (definingRange) {
0587: syntaxError("Bad class range");
0588: }
0589: definingRange = true;
0590:
0591: // If no last character, start of range is 0
0592: rangeStart = (last == CHAR_INVALID ? 0 : last);
0593:
0594: // Premature end of range. define up to Character.MAX_VALUE
0595: if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
0596: simpleChar = Character.MAX_VALUE;
0597: break;
0598: }
0599: continue;
0600:
0601: default:
0602: simpleChar = pattern.charAt(idx++);
0603: break;
0604: }
0605:
0606: // Handle simple character simpleChar
0607: if (definingRange) {
0608: // if we are defining a range make it now
0609: rangeEnd = simpleChar;
0610:
0611: // Actually create a range if the range is ok
0612: if (rangeStart >= rangeEnd) {
0613: syntaxError("Bad character class");
0614: }
0615: range.include(rangeStart, rangeEnd, include);
0616:
0617: // We are done defining the range
0618: last = CHAR_INVALID;
0619: definingRange = false;
0620: } else {
0621: // If simple character and not start of range, include it
0622: if (idx >= len || pattern.charAt(idx) != '-') {
0623: range.include(simpleChar, include);
0624: }
0625: last = simpleChar;
0626: }
0627: }
0628:
0629: // Shouldn't be out of input
0630: if (idx == len) {
0631: syntaxError("Unterminated character class");
0632: }
0633:
0634: // Absorb the ']' end of class marker
0635: idx++;
0636:
0637: // Emit character class definition
0638: instruction[ret + RE.offsetOpdata] = (char) range.num;
0639: for (int i = 0; i < range.num; i++) {
0640: emit((char) range.minRange[i]);
0641: emit((char) range.maxRange[i]);
0642: }
0643: return ret;
0644: }
0645:
0646: /**
0647: * Absorb an atomic character string. This method is a little tricky because
0648: * it can un-include the last character of string if a closure operator follows.
0649: * This is correct because *+? have higher precedence than concatentation (thus
0650: * ABC* means AB(C*) and NOT (ABC)*).
0651: * @return Index of new atom node
0652: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0653: */
0654: int atom() throws RESyntaxException {
0655: // Create a string node
0656: int ret = node(RE.OP_ATOM, 0);
0657:
0658: // Length of atom
0659: int lenAtom = 0;
0660:
0661: // Loop while we've got input
0662:
0663: atomLoop:
0664:
0665: while (idx < len) {
0666: // Is there a next char?
0667: if ((idx + 1) < len) {
0668: char c = pattern.charAt(idx + 1);
0669:
0670: // If the next 'char' is an escape, look past the whole escape
0671: if (pattern.charAt(idx) == '\\') {
0672: int idxEscape = idx;
0673: escape();
0674: if (idx < len) {
0675: c = pattern.charAt(idx);
0676: }
0677: idx = idxEscape;
0678: }
0679:
0680: // Switch on next char
0681: switch (c) {
0682: case '{':
0683: case '?':
0684: case '*':
0685: case '+':
0686:
0687: // If the next character is a closure operator and our atom is non-empty, the
0688: // current character should bind to the closure operator rather than the atom
0689: if (lenAtom != 0) {
0690: break atomLoop;
0691: }
0692: }
0693: }
0694:
0695: // Switch on current char
0696: switch (pattern.charAt(idx)) {
0697: case ']':
0698: case '^':
0699: case '$':
0700: case '.':
0701: case '[':
0702: case '(':
0703: case ')':
0704: case '|':
0705: break atomLoop;
0706:
0707: case '{':
0708: case '?':
0709: case '*':
0710: case '+':
0711:
0712: // We should have an atom by now
0713: if (lenAtom == 0) {
0714: // No atom before closure
0715: syntaxError("Missing operand to closure");
0716: }
0717: break atomLoop;
0718:
0719: case '\\':
0720:
0721: {
0722: // Get the escaped character (advances input automatically)
0723: int idxBeforeEscape = idx;
0724: int c = escape();
0725:
0726: // Check if it's a simple escape (as opposed to, say, a backreference)
0727: if ((c & ESC_MASK) == ESC_MASK) {
0728: // Not a simple escape, so backup to where we were before the escape.
0729: idx = idxBeforeEscape;
0730: break atomLoop;
0731: }
0732:
0733: // Add escaped char to atom
0734: emit((char) c);
0735: lenAtom++;
0736: }
0737: break;
0738:
0739: default:
0740:
0741: // Add normal character to atom
0742: emit(pattern.charAt(idx++));
0743: lenAtom++;
0744: break;
0745: }
0746: }
0747:
0748: // This "shouldn't" happen
0749: if (lenAtom == 0) {
0750: internalError();
0751: }
0752:
0753: // Emit the atom length into the program
0754: instruction[ret + RE.offsetOpdata] = (char) lenAtom;
0755: return ret;
0756: }
0757:
0758: /**
0759: * Match a terminal node.
0760: * @param flags Flags
0761: * @return Index of terminal node (closeable)
0762: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0763: */
0764: int terminal(int[] flags) throws RESyntaxException {
0765: switch (pattern.charAt(idx)) {
0766: case RE.OP_EOL:
0767: case RE.OP_BOL:
0768: case RE.OP_ANY:
0769: return node(pattern.charAt(idx++), 0);
0770:
0771: case '[':
0772: return characterClass();
0773:
0774: case '(':
0775: return expr(flags);
0776:
0777: case ')':
0778: syntaxError("Unexpected close paren");
0779:
0780: case '|':
0781: internalError();
0782:
0783: case ']':
0784: syntaxError("Mismatched class");
0785:
0786: case 0:
0787: syntaxError("Unexpected end of input");
0788:
0789: case '?':
0790: case '+':
0791: case '{':
0792: case '*':
0793: syntaxError("Missing operand to closure");
0794:
0795: case '\\': {
0796: // Don't forget, escape() advances the input stream!
0797: int idxBeforeEscape = idx;
0798:
0799: // Switch on escaped character
0800: switch (escape()) {
0801: case ESC_CLASS:
0802: case ESC_COMPLEX:
0803: flags[0] &= ~NODE_NULLABLE;
0804: return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
0805:
0806: case ESC_BACKREF: {
0807: char backreference = (char) (pattern.charAt(idx - 1) - '0');
0808: if (parens <= backreference) {
0809: syntaxError("Bad backreference");
0810: }
0811: flags[0] |= NODE_NULLABLE;
0812: return node(RE.OP_BACKREF, backreference);
0813: }
0814:
0815: default:
0816:
0817: // We had a simple escape and we want to have it end up in
0818: // an atom, so we back up and fall though to the default handling
0819: idx = idxBeforeEscape;
0820: flags[0] &= ~NODE_NULLABLE;
0821: break;
0822: }
0823: }
0824: }
0825:
0826: // Everything above either fails or returns.
0827: // If it wasn't one of the above, it must be the start of an atom.
0828: flags[0] &= ~NODE_NULLABLE;
0829: return atom();
0830: }
0831:
0832: /**
0833: * Compile a possibly closured terminal
0834: * @param flags Flags passed by reference
0835: * @return Index of closured node
0836: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
0837: */
0838: int closure(int[] flags) throws RESyntaxException {
0839: // Before terminal
0840: int idxBeforeTerminal = idx;
0841:
0842: // Values to pass by reference to terminal()
0843: int[] terminalFlags = { NODE_NORMAL };
0844:
0845: // Get terminal symbol
0846: int ret = terminal(terminalFlags);
0847:
0848: // Or in flags from terminal symbol
0849: flags[0] |= terminalFlags[0];
0850:
0851: // Advance input, set NODE_NULLABLE flag and do sanity checks
0852: if (idx >= len) {
0853: return ret;
0854: }
0855:
0856: boolean greedy = true;
0857: char closureType = pattern.charAt(idx);
0858: switch (closureType) {
0859: case '?':
0860: case '*':
0861:
0862: // The current node can be null
0863: flags[0] |= NODE_NULLABLE;
0864:
0865: // Drop through
0866:
0867: case '+':
0868:
0869: // Eat closure character
0870: idx++;
0871:
0872: // Drop through
0873:
0874: case '{':
0875:
0876: // Don't allow blantant stupidity
0877: int opcode = instruction[ret /* + RE.offsetOpcode */];
0878: if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
0879: syntaxError("Bad closure operand");
0880: }
0881: if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
0882: syntaxError("Closure operand can't be nullable");
0883: }
0884: }
0885:
0886: // If the next character is a '?', make the closure non-greedy (reluctant)
0887: if (idx < len && pattern.charAt(idx) == '?') {
0888: idx++;
0889: greedy = false;
0890: }
0891:
0892: if (greedy) {
0893: // Actually do the closure now
0894: switch (closureType) {
0895: case '{': {
0896: bracket();
0897: int bracketEnd = idx;
0898: int bracketMin = this .bracketMin;
0899: int bracketOpt = this .bracketOpt;
0900:
0901: // Pointer to the last terminal
0902: int pos = ret;
0903:
0904: // Process min first
0905: for (int c = 0; c < bracketMin; c++) {
0906: // Rewind stream and run it through again - more matchers coming
0907: idx = idxBeforeTerminal;
0908: setNextOfEnd(pos, pos = terminal(terminalFlags));
0909: }
0910:
0911: // Do the right thing for maximum ({m,})
0912: if (bracketOpt == bracketUnbounded) {
0913: // Drop through now and closure expression.
0914: // We are done with the {m,} expr, so skip rest
0915: idx = bracketEnd;
0916: nodeInsert(RE.OP_STAR, 0, pos);
0917: setNextOfEnd(pos + RE.nodeSize, pos);
0918: break;
0919: } else if (bracketOpt > 0) {
0920: int opt[] = new int[bracketOpt + 1];
0921: // Surround first optional terminal with MAYBE
0922: nodeInsert(RE.OP_MAYBE, 0, pos);
0923: opt[0] = pos;
0924:
0925: // Add all the rest optional terminals with preceeding MAYBEs
0926: for (int c = 1; c < bracketOpt; c++) {
0927: opt[c] = node(RE.OP_MAYBE, 0);
0928: // Rewind stream and run it through again - more matchers coming
0929: idx = idxBeforeTerminal;
0930: terminal(terminalFlags);
0931: }
0932:
0933: // Tie ends together
0934: int end = opt[bracketOpt] = node(RE.OP_NOTHING, 0);
0935: for (int c = 0; c < bracketOpt; c++) {
0936: setNextOfEnd(opt[c], end);
0937: setNextOfEnd(opt[c] + RE.nodeSize, opt[c + 1]);
0938: }
0939: } else {
0940: // Rollback terminal - no opt matchers present
0941: lenInstruction = pos;
0942: node(RE.OP_NOTHING, 0);
0943: }
0944:
0945: // We are done. skip the reminder of {m,n} expr
0946: idx = bracketEnd;
0947: break;
0948: }
0949:
0950: case '?': {
0951: nodeInsert(RE.OP_MAYBE, 0, ret);
0952: int n = node(RE.OP_NOTHING, 0);
0953: setNextOfEnd(ret, n);
0954: setNextOfEnd(ret + RE.nodeSize, n);
0955: break;
0956: }
0957:
0958: case '*': {
0959: nodeInsert(RE.OP_STAR, 0, ret);
0960: setNextOfEnd(ret + RE.nodeSize, ret);
0961: break;
0962: }
0963:
0964: case '+': {
0965: nodeInsert(RE.OP_CONTINUE, 0, ret);
0966: int n = node(RE.OP_PLUS, 0);
0967: setNextOfEnd(ret + RE.nodeSize, n);
0968: setNextOfEnd(n, ret);
0969: break;
0970: }
0971: }
0972: } else {
0973: // Actually do the closure now
0974: switch (closureType) {
0975: case '?': {
0976: nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
0977: int n = node(RE.OP_NOTHING, 0);
0978: setNextOfEnd(ret, n);
0979: setNextOfEnd(ret + RE.nodeSize, n);
0980: break;
0981: }
0982:
0983: case '*': {
0984: nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
0985: setNextOfEnd(ret + RE.nodeSize, ret);
0986: break;
0987: }
0988:
0989: case '+': {
0990: nodeInsert(RE.OP_CONTINUE, 0, ret);
0991: int n = node(RE.OP_RELUCTANTPLUS, 0);
0992: setNextOfEnd(n, ret);
0993: setNextOfEnd(ret + RE.nodeSize, n);
0994: break;
0995: }
0996: }
0997: }
0998:
0999: return ret;
1000: }
1001:
1002: /**
1003: * Compile body of one branch of an or operator (implements concatenation)
1004: *
1005: * @param flags Flags passed by reference
1006: * @return Pointer to first node in the branch
1007: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1008: */
1009: int branch(int[] flags) throws RESyntaxException {
1010: // Get each possibly closured piece and concat
1011: int node;
1012: int ret = -1;
1013: int chain = -1;
1014: int[] closureFlags = new int[1];
1015: boolean nullable = true;
1016: while (idx < len && pattern.charAt(idx) != '|'
1017: && pattern.charAt(idx) != ')') {
1018: // Get new node
1019: closureFlags[0] = NODE_NORMAL;
1020: node = closure(closureFlags);
1021: if (closureFlags[0] == NODE_NORMAL) {
1022: nullable = false;
1023: }
1024:
1025: // If there's a chain, append to the end
1026: if (chain != -1) {
1027: setNextOfEnd(chain, node);
1028: }
1029:
1030: // Chain starts at current
1031: chain = node;
1032: if (ret == -1) {
1033: ret = node;
1034: }
1035: }
1036:
1037: // If we don't run loop, make a nothing node
1038: if (ret == -1) {
1039: ret = node(RE.OP_NOTHING, 0);
1040: }
1041:
1042: // Set nullable flag for this branch
1043: if (nullable) {
1044: flags[0] |= NODE_NULLABLE;
1045: }
1046:
1047: return ret;
1048: }
1049:
1050: /**
1051: * Compile an expression with possible parens around it. Paren matching
1052: * is done at this level so we can tie the branch tails together.
1053: *
1054: * @param flags Flag value passed by reference
1055: * @return Node index of expression in instruction array
1056: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1057: */
1058: int expr(int[] flags) throws RESyntaxException {
1059: // Create open paren node unless we were called from the top level (which has no parens)
1060: int paren = -1;
1061: int ret = -1;
1062: int closeParens = parens;
1063: if ((flags[0] & NODE_TOPLEVEL) == 0
1064: && pattern.charAt(idx) == '(') {
1065: // if its a cluster ( rather than a proper subexpression ie with backrefs )
1066: if (idx + 2 < len && pattern.charAt(idx + 1) == '?'
1067: && pattern.charAt(idx + 2) == ':') {
1068: paren = 2;
1069: idx += 3;
1070: ret = node(RE.OP_OPEN_CLUSTER, 0);
1071: } else {
1072: paren = 1;
1073: idx++;
1074: ret = node(RE.OP_OPEN, parens++);
1075: }
1076: }
1077: flags[0] &= ~NODE_TOPLEVEL;
1078:
1079: // Process contents of first branch node
1080: boolean open = false;
1081: int branch = branch(flags);
1082: if (ret == -1) {
1083: ret = branch;
1084: } else {
1085: setNextOfEnd(ret, branch);
1086: }
1087:
1088: // Loop through branches
1089: while (idx < len && pattern.charAt(idx) == '|') {
1090: // Now open the first branch since there are more than one
1091: if (!open) {
1092: nodeInsert(RE.OP_BRANCH, 0, branch);
1093: open = true;
1094: }
1095:
1096: idx++;
1097: setNextOfEnd(branch, branch = node(RE.OP_BRANCH, 0));
1098: branch(flags);
1099: }
1100:
1101: // Create an ending node (either a close paren or an OP_END)
1102: int end;
1103: if (paren > 0) {
1104: if (idx < len && pattern.charAt(idx) == ')') {
1105: idx++;
1106: } else {
1107: syntaxError("Missing close paren");
1108: }
1109: if (paren == 1) {
1110: end = node(RE.OP_CLOSE, closeParens);
1111: } else {
1112: end = node(RE.OP_CLOSE_CLUSTER, 0);
1113: }
1114: } else {
1115: end = node(RE.OP_END, 0);
1116: }
1117:
1118: // Append the ending node to the ret nodelist
1119: setNextOfEnd(ret, end);
1120:
1121: // Hook the ends of each branch to the end node
1122: int currentNode = ret;
1123: int nextNodeOffset = instruction[currentNode + RE.offsetNext];
1124: // while the next node o
1125: while (nextNodeOffset != 0 && currentNode < lenInstruction) {
1126: // If branch, make the end of the branch's operand chain point to the end node.
1127: if (instruction[currentNode /* + RE.offsetOpcode */] == RE.OP_BRANCH) {
1128: setNextOfEnd(currentNode + RE.nodeSize, end);
1129: }
1130: nextNodeOffset = instruction[currentNode + RE.offsetNext];
1131: currentNode += nextNodeOffset;
1132: }
1133:
1134: // Return the node list
1135: return ret;
1136: }
1137:
1138: /**
1139: * Compiles a regular expression pattern into a program runnable by the pattern
1140: * matcher class 'RE'.
1141: * @param pattern Regular expression pattern to compile (see RECompiler class
1142: * for details).
1143: * @return A compiled regular expression program.
1144: * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1145: * @see RECompiler
1146: * @see RE
1147: */
1148: public REProgram compile(String pattern) throws RESyntaxException {
1149: // Initialize variables for compilation
1150: this .pattern = pattern; // Save pattern in instance variable
1151: len = pattern.length(); // Precompute pattern length for speed
1152: idx = 0; // Set parsing index to the first character
1153: lenInstruction = 0; // Set emitted instruction count to zero
1154: parens = 1; // Set paren level to 1 (the implicit outer parens)
1155:
1156: // Initialize pass by reference flags value
1157: int[] flags = { NODE_TOPLEVEL };
1158:
1159: // Parse expression
1160: expr(flags);
1161:
1162: // Should be at end of input
1163: if (idx != len) {
1164: if (pattern.charAt(idx) == ')') {
1165: syntaxError("Unmatched close paren");
1166: }
1167: syntaxError("Unexpected input remains");
1168: }
1169:
1170: // Return the result
1171: char[] ins = new char[lenInstruction];
1172: System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1173: return new REProgram(parens, ins);
1174: }
1175:
1176: /**
1177: * Local, nested class for maintaining character ranges for character classes.
1178: */
1179: class RERange {
1180: int size = 16; // Capacity of current range arrays
1181: int[] minRange = new int[size]; // Range minima
1182: int[] maxRange = new int[size]; // Range maxima
1183: int num = 0; // Number of range array elements in use
1184:
1185: /**
1186: * Deletes the range at a given index from the range lists
1187: * @param index Index of range to delete from minRange and maxRange arrays.
1188: */
1189: void delete(int index) {
1190: // Return if no elements left or index is out of range
1191: if (num == 0 || index >= num) {
1192: return;
1193: }
1194:
1195: // Move elements down
1196: while (++index < num) {
1197: if (index - 1 >= 0) {
1198: minRange[index - 1] = minRange[index];
1199: maxRange[index - 1] = maxRange[index];
1200: }
1201: }
1202:
1203: // One less element now
1204: num--;
1205: }
1206:
1207: /**
1208: * Merges a range into the range list, coalescing ranges if possible.
1209: * @param min Minimum end of range
1210: * @param max Maximum end of range
1211: */
1212: void merge(int min, int max) {
1213: // Loop through ranges
1214: for (int i = 0; i < num; i++) {
1215: // Min-max is subsumed by minRange[i]-maxRange[i]
1216: if (min >= minRange[i] && max <= maxRange[i]) {
1217: return;
1218: }
1219:
1220: // Min-max subsumes minRange[i]-maxRange[i]
1221: else if (min <= minRange[i] && max >= maxRange[i]) {
1222: delete(i);
1223: merge(min, max);
1224: return;
1225: }
1226:
1227: // Min is in the range, but max is outside
1228: else if (min >= minRange[i] && min <= maxRange[i]) {
1229: min = minRange[i];
1230: delete(i);
1231: merge(min, max);
1232: return;
1233: }
1234:
1235: // Max is in the range, but min is outside
1236: else if (max >= minRange[i] && max <= maxRange[i]) {
1237: max = maxRange[i];
1238: delete(i);
1239: merge(min, max);
1240: return;
1241: }
1242: }
1243:
1244: // Must not overlap any other ranges
1245: if (num >= size) {
1246: size *= 2;
1247: int[] newMin = new int[size];
1248: int[] newMax = new int[size];
1249: System.arraycopy(minRange, 0, newMin, 0, num);
1250: System.arraycopy(maxRange, 0, newMax, 0, num);
1251: minRange = newMin;
1252: maxRange = newMax;
1253: }
1254: minRange[num] = min;
1255: maxRange[num] = max;
1256: num++;
1257: }
1258:
1259: /**
1260: * Removes a range by deleting or shrinking all other ranges
1261: * @param min Minimum end of range
1262: * @param max Maximum end of range
1263: */
1264: void remove(int min, int max) {
1265: // Loop through ranges
1266: for (int i = 0; i < num; i++) {
1267: // minRange[i]-maxRange[i] is subsumed by min-max
1268: if (minRange[i] >= min && maxRange[i] <= max) {
1269: delete(i);
1270: return;
1271: }
1272:
1273: // min-max is subsumed by minRange[i]-maxRange[i]
1274: else if (min >= minRange[i] && max <= maxRange[i]) {
1275: int minr = minRange[i];
1276: int maxr = maxRange[i];
1277: delete(i);
1278: if (minr < min) {
1279: merge(minr, min - 1);
1280: }
1281: if (max < maxr) {
1282: merge(max + 1, maxr);
1283: }
1284: return;
1285: }
1286:
1287: // minRange is in the range, but maxRange is outside
1288: else if (minRange[i] >= min && minRange[i] <= max) {
1289: minRange[i] = max + 1;
1290: return;
1291: }
1292:
1293: // maxRange is in the range, but minRange is outside
1294: else if (maxRange[i] >= min && maxRange[i] <= max) {
1295: maxRange[i] = min - 1;
1296: return;
1297: }
1298: }
1299: }
1300:
1301: /**
1302: * Includes (or excludes) the range from min to max, inclusive.
1303: * @param min Minimum end of range
1304: * @param max Maximum end of range
1305: * @param include True if range should be included. False otherwise.
1306: */
1307: void include(int min, int max, boolean include) {
1308: if (include) {
1309: merge(min, max);
1310: } else {
1311: remove(min, max);
1312: }
1313: }
1314:
1315: /**
1316: * Includes a range with the same min and max
1317: * @param minmax Minimum and maximum end of range (inclusive)
1318: * @param include True if range should be included. False otherwise.
1319: */
1320: void include(char minmax, boolean include) {
1321: include(minmax, minmax, include);
1322: }
1323: }
1324: }
|