0001: /* ====================================================================
0002: * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
0003: *
0004: * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
0005: *
0006: * Redistribution and use in source and binary forms, with or without
0007: * modification, are permitted provided that the following conditions
0008: * are met:
0009: *
0010: * 1. Redistributions of source code must retain the above copyright
0011: * notice, this list of conditions and the following disclaimer.
0012: *
0013: * 2. Redistributions in binary form must reproduce the above copyright
0014: * notice, this list of conditions and the following disclaimer in
0015: * the documentation and/or other materials provided with the
0016: * distribution.
0017: *
0018: * 3. The end-user documentation included with the redistribution,
0019: * if any, must include the following acknowledgment:
0020: * "This product includes software developed by Jcorporate Ltd.
0021: * (http://www.jcorporate.com/)."
0022: * Alternately, this acknowledgment may appear in the software itself,
0023: * if and wherever such third-party acknowledgments normally appear.
0024: *
0025: * 4. "Jcorporate" and product names such as "Expresso" must
0026: * not be used to endorse or promote products derived from this
0027: * software without prior written permission. For written permission,
0028: * please contact info@jcorporate.com.
0029: *
0030: * 5. Products derived from this software may not be called "Expresso",
0031: * or other Jcorporate product names; nor may "Expresso" or other
0032: * Jcorporate product names appear in their name, without prior
0033: * written permission of Jcorporate Ltd.
0034: *
0035: * 6. No product derived from this software may compete in the same
0036: * market space, i.e. framework, without prior written permission
0037: * of Jcorporate Ltd. For written permission, please contact
0038: * partners@jcorporate.com.
0039: *
0040: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0041: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0042: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0043: * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
0044: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0045: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
0046: * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0047: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0048: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0049: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0050: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0051: * SUCH DAMAGE.
0052: * ====================================================================
0053: *
0054: * This software consists of voluntary contributions made by many
0055: * individuals on behalf of the Jcorporate Ltd. Contributions back
0056: * to the project(s) are encouraged when you make modifications.
0057: * Please send them to support@jcorporate.com. For more information
0058: * on Jcorporate Ltd. and its products, please see
0059: * <http://www.jcorporate.com/>.
0060: *
0061: * Portions of this software are based upon other open source
0062: * products and are subject to their respective licenses.
0063: */
0064:
0065: package com.jcorporate.expresso.ext.regexp;
0066:
0067: /*
0068: * ====================================================================
0069: *
0070: * The Apache Software License, Version 1.1
0071: *
0072: * Copyright (c) 1999 The Apache Software Foundation. All rights
0073: * reserved.
0074: *
0075: * Redistribution and use in source and binary forms, with or without
0076: * modification, are permitted provided that the following conditions
0077: * are met:
0078: *
0079: * 1. Redistributions of source code must retain the above copyright
0080: * notice, this list of conditions and the following disclaimer.
0081: *
0082: * 2. Redistributions in binary form must reproduce the above copyright
0083: * notice, this list of conditions and the following disclaimer in
0084: * the documentation and/or other materials provided with the
0085: * distribution.
0086: *
0087: * 3. The end-user documentation included with the redistribution, if
0088: * any, must include the following acknowlegement:
0089: * "This product includes software developed by the
0090: * Apache Software Foundation (http://www.apache.org/)."
0091: * Alternately, this acknowlegement may appear in the software itself,
0092: * if and wherever such third-party acknowlegements normally appear.
0093: *
0094: * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
0095: * Foundation" must not be used to endorse or promote products derived
0096: * from this software without prior written permission. For written
0097: * permission, please contact apache@apache.org.
0098: *
0099: * 5. Products derived from this software may not be called "Apache"
0100: * nor may "Apache" appear in their names without prior written
0101: * permission of the Apache Group.
0102: *
0103: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0104: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0105: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0106: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0107: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0108: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0109: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0110: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0111: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0112: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0113: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0114: * SUCH DAMAGE.
0115: * ====================================================================
0116: *
0117: * This software consists of voluntary contributions made by many
0118: * individuals on behalf of the Apache Software Foundation. For more
0119: * information on the Apache Software Foundation, please see
0120: * <http://www.apache.org/>.
0121: *
0122: */
0123:
0124: import java.util.Hashtable;
0125:
0126: /**
0127: * A regular expression compiler class. This class compiles a pattern string into a
0128: * regular expression program interpretable by the RE evaluator class. The 'recompile'
0129: * command line tool uses this compiler to pre-compile regular expressions for use
0130: * with RE. For a description of the syntax accepted by RECompiler and what you can
0131: * do with regular expressions, see the documentation for the RE matcher class.
0132: *
0133: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
0134: * @version $Id: RECompiler.java,v 1.7 2004/11/17 20:48:15 lhamel Exp $
0135: * @see RE
0136: * @deprecated since v5.6, use jakarta oro
0137: */
0138: public class RECompiler {
0139:
0140: // The compiled program
0141: char[] instruction; // The compiled RE 'program' instruction buffer
0142: int lenInstruction; // The amount of the program buffer currently in use
0143:
0144: // Input state for compiling regular expression
0145: String pattern; // Input string
0146: int len; // Length of the pattern string
0147: int idx; // Current input index into ac
0148: int parens; // Total number of paren pairs
0149:
0150: // Node flags
0151: static final int NODE_NORMAL = 0; // No flags (nothing special)
0152: static final int NODE_NULLABLE = 1; // True if node is potentially null
0153: static final int NODE_TOPLEVEL = 2; // True if top level expr
0154:
0155: // Special types of 'escapes'
0156: static final char ESC_MASK = 0xfff0; // Escape complexity mask
0157: static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
0158: static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
0159: static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
0160:
0161: // {m,n} stacks
0162: static final int maxBrackets = 10; // Maximum number of bracket pairs
0163: static int brackets = 0; // Number of bracket sets
0164: static int[] bracketStart = null; // Starting point
0165: static int[] bracketEnd = null; // Ending point
0166: static int[] bracketMin = null; // Minimum number of matches
0167: static int[] bracketOpt = null; // Additional optional matches
0168: static final int bracketUnbounded = -1; // Unbounded value
0169: static final int bracketFinished = -2; // Unbounded value
0170:
0171: // Lookup table for POSIX character class names
0172: static Hashtable hashPOSIX = new Hashtable();
0173:
0174: static {
0175: hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
0176: hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
0177: hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
0178: hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
0179: hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
0180: hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
0181: hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
0182: hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
0183: hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
0184: hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
0185: hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
0186: hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
0187: hashPOSIX
0188: .put("javastart", new Character(RE.POSIX_CLASS_JSTART));
0189: hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
0190: }
0191:
0192: /**
0193: * Local, nested class for maintaining character ranges for character classes.
0194: */
0195: class RERange {
0196: int size = 16; // Capacity of current range arrays
0197: int[] minRange = new int[size]; // Range minima
0198: int[] maxRange = new int[size]; // Range maxima
0199: int num = 0; // Number of range array elements in use
0200:
0201: /**
0202: * Deletes the range at a given index from the range lists
0203: *
0204: * @param index Index of range to delete from minRange and maxRange arrays.
0205: */
0206: void delete(int index) {
0207:
0208: // Return if no elements left or index is out of range
0209: if (num == 0 || index >= num) {
0210: return;
0211: }
0212: // Move elements down
0213: while (index++ < num) {
0214: if (index - 1 >= 0) {
0215: minRange[index - 1] = minRange[index];
0216: maxRange[index - 1] = maxRange[index];
0217: }
0218: }
0219:
0220: // One less element now
0221: num--;
0222: }
0223:
0224: /**
0225: * Merges a range into the range list, coalescing ranges if possible.
0226: *
0227: * @param min Minimum end of range
0228: * @param max Maximum end of range
0229: */
0230: void merge(int min, int max) {
0231:
0232: // Loop through ranges
0233: for (int i = 0; i < num; i++) {
0234:
0235: // Min-max is subsumed by minRange[i]-maxRange[i]
0236: if (min >= minRange[i] && max <= maxRange[i]) {
0237: return;
0238: }
0239:
0240: // Min-max subsumes minRange[i]-maxRange[i]
0241: else if (min <= minRange[i] && max >= maxRange[i]) {
0242: delete(i);
0243: merge(min, max);
0244:
0245: return;
0246: }
0247:
0248: // Min is in the range, but max is outside
0249: else if (min >= minRange[i] && min <= maxRange[i]) {
0250: delete(i);
0251: min = minRange[i];
0252: merge(min, max);
0253:
0254: return;
0255: }
0256:
0257: // Max is in the range, but min is outside
0258: else if (max >= minRange[i] && max <= maxRange[i]) {
0259: delete(i);
0260: max = maxRange[i];
0261: merge(min, max);
0262:
0263: return;
0264: }
0265: }
0266: // Must not overlap any other ranges
0267: if (num >= size) {
0268: size *= 2;
0269:
0270: int[] newMin = new int[size];
0271: int[] newMax = new int[size];
0272: System.arraycopy(minRange, 0, newMin, 0, num);
0273: System.arraycopy(maxRange, 0, newMax, 0, num);
0274: minRange = newMin;
0275: maxRange = newMax;
0276: }
0277:
0278: minRange[num] = min;
0279: maxRange[num] = max;
0280: num++;
0281: }
0282:
0283: /**
0284: * Removes a range by deleting or shrinking all other ranges
0285: *
0286: * @param min Minimum end of range
0287: * @param max Maximum end of range
0288: */
0289: void remove(int min, int max) {
0290:
0291: // Loop through ranges
0292: for (int i = 0; i < num; i++) {
0293:
0294: // minRange[i]-maxRange[i] is subsumed by min-max
0295: if (minRange[i] >= min && maxRange[i] <= max) {
0296: delete(i);
0297: i--;
0298:
0299: return;
0300: }
0301:
0302: // min-max is subsumed by minRange[i]-maxRange[i]
0303: else if (min >= minRange[i] && max <= maxRange[i]) {
0304: int minr = minRange[i];
0305: int maxr = maxRange[i];
0306: delete(i);
0307:
0308: if (minr < min - 1) {
0309: merge(minr, min - 1);
0310: }
0311: if (max + 1 < maxr) {
0312: merge(max + 1, maxr);
0313: }
0314:
0315: return;
0316: }
0317:
0318: // minRange is in the range, but maxRange is outside
0319: else if (minRange[i] >= min && minRange[i] <= max) {
0320: minRange[i] = max + 1;
0321:
0322: return;
0323: }
0324:
0325: // maxRange is in the range, but minRange is outside
0326: else if (maxRange[i] >= min && maxRange[i] <= max) {
0327: maxRange[i] = min - 1;
0328:
0329: return;
0330: }
0331: }
0332: }
0333:
0334: /**
0335: * Includes (or excludes) the range from min to max, inclusive.
0336: *
0337: * @param min Minimum end of range
0338: * @param max Maximum end of range
0339: * @param include True if range should be included. False otherwise.
0340: */
0341: void include(int min, int max, boolean include) {
0342: if (include) {
0343: merge(min, max);
0344: } else {
0345: remove(min, max);
0346: }
0347: }
0348:
0349: /**
0350: * Includes a range with the same min and max
0351: *
0352: * @param minmax Minimum and maximum end of range (inclusive)
0353: * @param include True if range should be included. False otherwise.
0354: */
0355: void include(char minmax, boolean include) {
0356: include(minmax, minmax, include);
0357: }
0358: }
0359:
0360: /**
0361: * Constructor. Creates (initially empty) storage for a regular expression program.
0362: */
0363: public RECompiler() {
0364:
0365: // Start off with a generous, yet reasonable, initial size
0366: instruction = new char[128];
0367: lenInstruction = 0;
0368: }
0369:
0370: /**
0371: * Allocate storage for brackets only as needed
0372: */
0373: void allocBrackets() {
0374:
0375: // Allocate bracket stacks if not already done
0376: if (bracketStart == null) {
0377:
0378: // Allocate storage
0379: bracketStart = new int[maxBrackets];
0380: bracketEnd = new int[maxBrackets];
0381: bracketMin = new int[maxBrackets];
0382: bracketOpt = new int[maxBrackets];
0383:
0384: // Initialize to invalid values
0385: for (int i = 0; i < maxBrackets; i++) {
0386: bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
0387: }
0388: }
0389: }
0390:
0391: /**
0392: * Absorb an atomic character string. This method is a little tricky because
0393: * it can un-include the last character of string if a closure operator follows.
0394: * This is correct because *+? have higher precedence than concatentation (thus
0395: * ABC* means AB(C*) and NOT (ABC)*).
0396: *
0397: * @return Index of new atom node
0398: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0399: */
0400: int atom() throws RESyntaxException {
0401:
0402: // Create a string node
0403: int ret = node(RE.OP_ATOM, 0);
0404:
0405: // Length of atom
0406: int lenAtom = 0;
0407:
0408: // Loop while we've got input
0409: atomLoop:
0410:
0411: while (idx < len) {
0412:
0413: // Is there a next char?
0414: if ((idx + 1) < len) {
0415: char c = pattern.charAt(idx + 1);
0416:
0417: // If the next 'char' is an escape, look past the whole escape
0418: if (pattern.charAt(idx) == '\\') {
0419: int idxEscape = idx;
0420: escape();
0421:
0422: if (idx < len) {
0423: c = pattern.charAt(idx);
0424: }
0425:
0426: idx = idxEscape;
0427: }
0428: // Switch on next char
0429: switch (c) {
0430: case '{':
0431: case '?':
0432: case '*':
0433: case '+':
0434:
0435: // If the next character is a closure operator and our atom is non-empty, the
0436: // current character should bind to the closure operator rather than the atom
0437: if (lenAtom != 0) {
0438: break atomLoop;
0439: }
0440: }
0441: }
0442: // Switch on current char
0443: switch (pattern.charAt(idx)) {
0444: case ']':
0445: case '^':
0446: case '$':
0447: case '.':
0448: case '[':
0449: case '(':
0450: case ')':
0451: case '|':
0452: break atomLoop;
0453:
0454: case '{':
0455: case '?':
0456: case '*':
0457: case '+':
0458:
0459: // We should have an atom by now
0460: if (lenAtom == 0) {
0461:
0462: // No atom before closure
0463: syntaxError("Missing operand to closure");
0464: }
0465:
0466: break atomLoop;
0467:
0468: case '\\': {
0469:
0470: // Get the escaped character (advances input automatically)
0471: int idxBeforeEscape = idx;
0472: char c = escape();
0473:
0474: // Check if it's a simple escape (as opposed to, say, a backreference)
0475: if ((c & ESC_MASK) == ESC_MASK) {
0476:
0477: // Not a simple escape, so backup to where we were before the escape.
0478: idx = idxBeforeEscape;
0479: break atomLoop;
0480: }
0481:
0482: // Add escaped char to atom
0483: emit(c);
0484: lenAtom++;
0485: }
0486:
0487: break;
0488:
0489: default:
0490:
0491: // Add normal character to atom
0492: emit(pattern.charAt(idx++));
0493: lenAtom++;
0494: break;
0495: }
0496: }
0497: // This "shouldn't" happen
0498: if (lenAtom == 0) {
0499: internalError();
0500: }
0501:
0502: // Emit the atom length into the program
0503: instruction[ret + RE.offsetOpdata] = (char) lenAtom;
0504:
0505: return ret;
0506: }
0507:
0508: /**
0509: * Match bracket {m,n} expression put results in bracket member variables
0510: *
0511: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0512: */
0513: void bracket() throws RESyntaxException {
0514:
0515: // Current character must be a '{'
0516: if (idx >= len || pattern.charAt(idx++) != '{') {
0517: internalError();
0518: }
0519: // Next char must be a digit
0520: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0521: syntaxError("Expected digit");
0522: }
0523:
0524: // Get min ('m' of {m,n}) number
0525: StringBuffer number = new StringBuffer();
0526:
0527: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0528: number.append(pattern.charAt(idx++));
0529: }
0530: try {
0531: bracketMin[brackets] = Integer.parseInt(number.toString());
0532: } catch (NumberFormatException e) {
0533: syntaxError("Expected valid number");
0534: }
0535: // If out of input, fail
0536: if (idx >= len) {
0537: syntaxError("Expected comma or right bracket");
0538: }
0539: // If end of expr, optional limit is 0
0540: if (pattern.charAt(idx) == '}') {
0541: idx++;
0542: bracketOpt[brackets] = 0;
0543:
0544: return;
0545: }
0546: // Must have at least {m,} and maybe {m,n}.
0547: if (idx >= len || pattern.charAt(idx++) != ',') {
0548: syntaxError("Expected comma");
0549: }
0550: // If out of input, fail
0551: if (idx >= len) {
0552: syntaxError("Expected comma or right bracket");
0553: }
0554: // If {m,} max is unlimited
0555: if (pattern.charAt(idx) == '}') {
0556: idx++;
0557: bracketOpt[brackets] = bracketUnbounded;
0558:
0559: return;
0560: }
0561: // Next char must be a digit
0562: if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
0563: syntaxError("Expected digit");
0564: }
0565:
0566: // Get max number
0567: number.setLength(0);
0568:
0569: while (idx < len && Character.isDigit(pattern.charAt(idx))) {
0570: number.append(pattern.charAt(idx++));
0571: }
0572: try {
0573: bracketOpt[brackets] = Integer.parseInt(number.toString())
0574: - bracketMin[brackets];
0575: } catch (NumberFormatException e) {
0576: syntaxError("Expected valid number");
0577: }
0578: // Optional repetitions must be > 0
0579: if (bracketOpt[brackets] <= 0) {
0580: syntaxError("Bad range");
0581: }
0582: // Must have close brace
0583: if (idx >= len || pattern.charAt(idx++) != '}') {
0584: syntaxError("Missing close brace");
0585: }
0586: }
0587:
0588: /**
0589: * Compile one branch of an or operator (implements concatenation)
0590: *
0591: * @param flags Flags passed by reference
0592: * @return Pointer to branch node
0593: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0594: */
0595: int branch(int[] flags) throws RESyntaxException {
0596:
0597: // Get each possibly closured piece and concat
0598: int node;
0599: int ret = node(RE.OP_BRANCH, 0);
0600: int chain = -1;
0601: int[] closureFlags = new int[1];
0602: boolean nullable = true;
0603:
0604: while (idx < len && pattern.charAt(idx) != '|'
0605: && pattern.charAt(idx) != ')') {
0606:
0607: // Get new node
0608: closureFlags[0] = NODE_NORMAL;
0609: node = closure(closureFlags);
0610:
0611: if (closureFlags[0] == NODE_NORMAL) {
0612: nullable = false;
0613: }
0614: // If there's a chain, append to the end
0615: if (chain != -1) {
0616: setNextOfEnd(chain, node);
0617: }
0618:
0619: // Chain starts at current
0620: chain = node;
0621: }
0622: // If we don't run loop, make a nothing node
0623: if (chain == -1) {
0624: node(RE.OP_NOTHING, 0);
0625: }
0626: // Set nullable flag for this branch
0627: if (nullable) {
0628: flags[0] |= NODE_NULLABLE;
0629: }
0630:
0631: return ret;
0632: }
0633:
0634: /**
0635: * Compile a character class
0636: *
0637: * @return Index of class node
0638: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0639: */
0640: int characterClass() throws RESyntaxException {
0641:
0642: // Check for bad calling or empty class
0643: if (pattern.charAt(idx) != '[') {
0644: internalError();
0645: }
0646: // Check for unterminated or empty class
0647: if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
0648: syntaxError("Empty or unterminated class");
0649: }
0650: // Check for POSIX character class
0651: if (idx < len && pattern.charAt(idx) == ':') {
0652:
0653: // Skip colon
0654: idx++;
0655:
0656: // POSIX character classes are denoted with lowercase ASCII strings
0657: int idxStart = idx;
0658:
0659: while (idx < len && pattern.charAt(idx) >= 'a'
0660: && pattern.charAt(idx) <= 'z') {
0661: idx++;
0662: }
0663: // Should be a ":]" to terminate the POSIX character class
0664: if ((idx + 1) < len && pattern.charAt(idx) == ':'
0665: && pattern.charAt(idx + 1) == ']') {
0666:
0667: // Get character class
0668: String charClass = pattern.substring(idxStart, idx);
0669:
0670: // Select the POSIX class id
0671: Character i = (Character) hashPOSIX.get(charClass);
0672:
0673: if (i != null) {
0674:
0675: // Move past colon and right bracket
0676: idx += 2;
0677:
0678: // Return new POSIX character class node
0679: return node(RE.OP_POSIXCLASS, i.charValue());
0680: }
0681:
0682: syntaxError("Invalid POSIX character class '"
0683: + charClass + "'");
0684: }
0685:
0686: syntaxError("Invalid POSIX character class syntax");
0687: }
0688:
0689: // Try to build a class. Create OP_ANYOF node
0690: int ret = node(RE.OP_ANYOF, 0);
0691:
0692: // Parse class declaration
0693: char CHAR_INVALID = Character.MAX_VALUE;
0694: char last = CHAR_INVALID;
0695: char simpleChar = 0;
0696: boolean include = true;
0697: boolean definingRange = false;
0698: int idxFirst = idx;
0699: char rangeStart = Character.MIN_VALUE;
0700: char rangeEnd;
0701: RERange range = new RERange();
0702:
0703: while (idx < len && pattern.charAt(idx) != ']') {
0704: switchOnCharacter:
0705:
0706: // Switch on character
0707: switch (pattern.charAt(idx)) {
0708: case '^':
0709: include = !include;
0710:
0711: if (idx == idxFirst) {
0712: range.include(Character.MIN_VALUE,
0713: Character.MAX_VALUE, true);
0714: }
0715:
0716: idx++;
0717: continue;
0718:
0719: case '\\': {
0720:
0721: // Escape always advances the stream
0722: char c;
0723:
0724: switch (c = escape()) {
0725: case ESC_COMPLEX:
0726: case ESC_BACKREF:
0727:
0728: // Word boundaries and backrefs not allowed in a character class!
0729: syntaxError("Bad character class");
0730:
0731: case ESC_CLASS:
0732:
0733: // Classes can't be an endpoint of a range
0734: if (definingRange) {
0735: syntaxError("Bad character class");
0736: }
0737: // Handle specific type of class (some are ok)
0738: switch (pattern.charAt(idx - 1)) {
0739: case RE.E_NSPACE:
0740: case RE.E_NDIGIT:
0741: case RE.E_NALNUM:
0742: syntaxError("Bad character class");
0743:
0744: case RE.E_SPACE:
0745: range.include('\t', include);
0746: range.include('\r', include);
0747: range.include('\f', include);
0748: range.include('\n', include);
0749: range.include('\b', include);
0750: range.include(' ', include);
0751: break;
0752:
0753: case RE.E_ALNUM:
0754: range.include('a', 'z', include);
0755: range.include('A', 'Z', include);
0756: range.include('_', include);
0757:
0758: // Fall through!
0759: case RE.E_DIGIT:
0760: range.include('0', '9', include);
0761: break;
0762: }
0763:
0764: // Make last char invalid (can't be a range start)
0765: last = CHAR_INVALID;
0766: break;
0767:
0768: default:
0769:
0770: // Escape is simple so treat as a simple char
0771: simpleChar = c;
0772: break switchOnCharacter;
0773: }
0774: }
0775:
0776: continue;
0777:
0778: case '-':
0779:
0780: // Start a range if one isn't already started
0781: if (definingRange) {
0782: syntaxError("Bad class range");
0783: }
0784:
0785: definingRange = true;
0786:
0787: // If no last character, start of range is 0
0788: rangeStart = (last == CHAR_INVALID ? 0 : last);
0789:
0790: // Premature end of range. define up to Character.MAX_VALUE
0791: if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
0792: simpleChar = Character.MAX_VALUE;
0793: break;
0794: }
0795:
0796: continue;
0797:
0798: default:
0799: simpleChar = pattern.charAt(idx++);
0800: break;
0801: }
0802: // Handle simple character simpleChar
0803: if (definingRange) {
0804:
0805: // if we are defining a range make it now
0806: rangeEnd = simpleChar;
0807:
0808: // Actually create a range if the range is ok
0809: if (rangeStart >= rangeEnd) {
0810: syntaxError("Bad character class");
0811: }
0812:
0813: range.include(rangeStart, rangeEnd, include);
0814:
0815: // We are done defining the range
0816: last = CHAR_INVALID;
0817: definingRange = false;
0818: } else {
0819:
0820: // If simple character and not start of range, include it
0821: if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') {
0822: range.include(simpleChar, include);
0823: }
0824:
0825: last = simpleChar;
0826: }
0827: }
0828: // Shouldn't be out of input
0829: if (idx == len) {
0830: syntaxError("Unterminated character class");
0831: }
0832:
0833: // Absorb the ']' end of class marker
0834: idx++;
0835:
0836: // Emit character class definition
0837: instruction[ret + RE.offsetOpdata] = (char) range.num;
0838:
0839: for (int i = 0; i < range.num; i++) {
0840: emit((char) range.minRange[i]);
0841: emit((char) range.maxRange[i]);
0842: }
0843:
0844: return ret;
0845: }
0846:
0847: /**
0848: * Compile a possibly closured terminal
0849: *
0850: * @param flags Flags passed by reference
0851: * @return Index of closured node
0852: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
0853: */
0854: int closure(int[] flags) throws RESyntaxException {
0855:
0856: // Before terminal
0857: int idxBeforeTerminal = idx;
0858:
0859: // Values to pass by reference to terminal()
0860: int[] terminalFlags = { NODE_NORMAL };
0861:
0862: // Get terminal symbol
0863: int ret = terminal(terminalFlags);
0864:
0865: // Or in flags from terminal symbol
0866: flags[0] |= terminalFlags[0];
0867:
0868: // Advance input, set NODE_NULLABLE flag and do sanity checks
0869: if (idx >= len) {
0870: return ret;
0871: }
0872:
0873: boolean greedy = true;
0874: char closureType = pattern.charAt(idx);
0875:
0876: switch (closureType) {
0877: case '?':
0878: case '*':
0879:
0880: // The current node can be null
0881: flags[0] |= NODE_NULLABLE;
0882:
0883: case '+':
0884:
0885: // Eat closure character
0886: idx++;
0887:
0888: case '{':
0889:
0890: // Don't allow blantant stupidity
0891: int opcode = instruction[ret + RE.offsetOpcode];
0892:
0893: if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
0894: syntaxError("Bad closure operand");
0895: }
0896: if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
0897: syntaxError("Closure operand can't be nullable");
0898: }
0899:
0900: break;
0901: }
0902: // If the next character is a '?', make the closure non-greedy (reluctant)
0903: if (idx < len && pattern.charAt(idx) == '?') {
0904: idx++;
0905: greedy = false;
0906: }
0907: if (greedy) {
0908:
0909: // Actually do the closure now
0910: switch (closureType) {
0911: case '{': {
0912:
0913: // We look for our bracket in the list
0914: boolean found = false;
0915: int i;
0916: allocBrackets();
0917:
0918: for (i = 0; i < brackets; i++) {
0919: if (bracketStart[i] == idx) {
0920: found = true;
0921: break;
0922: }
0923: }
0924: // If its not in the list we parse the {m,n}
0925: if (!found) {
0926: if (brackets >= maxBrackets) {
0927: syntaxError("Too many bracketed closures (limit is 10)");
0928: }
0929:
0930: bracketStart[brackets] = idx;
0931: bracket();
0932: bracketEnd[brackets] = idx;
0933: i = brackets++;
0934: }
0935: // If there's a min, rewind stream and reparse
0936: if (--bracketMin[i] > 0) {
0937:
0938: // Rewind stream and run it through again
0939: idx = idxBeforeTerminal;
0940: break;
0941: }
0942: // Do the right thing for maximum ({m,})
0943: if (bracketOpt[i] == bracketFinished) {
0944:
0945: // Drop through now and closure expression.
0946: // We are done with the {m,} expr, so skip rest
0947: closureType = '*';
0948: bracketOpt[i] = 0;
0949: idx = bracketEnd[i];
0950: } else if (bracketOpt[i] == bracketUnbounded) {
0951: idx = idxBeforeTerminal;
0952: bracketOpt[i] = bracketFinished;
0953: break;
0954: } else if (bracketOpt[i]-- > 0) {
0955:
0956: // Drop through to optionally close and then 'play it again sam!'
0957: idx = idxBeforeTerminal;
0958: closureType = '?';
0959: } else {
0960:
0961: // We are done. skip the rest of {m,n} expr
0962: idx = bracketEnd[i];
0963: break;
0964: }
0965: }
0966: // Fall through!
0967: case '?':
0968: case '*':
0969: if (!greedy) {
0970: break;
0971: }
0972: if (closureType == '?') {
0973:
0974: // X? is compiled as (X|)
0975: nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
0976: setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // inserted branch to option
0977:
0978: int nothing = node(RE.OP_NOTHING, 0); // which is OP_NOTHING
0979: setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
0980: setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
0981: }
0982: if (closureType == '*') {
0983:
0984: // X* is compiled as (X{gotoX}|)
0985: nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
0986: setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH,
0987: 0)); // end of X points to an option
0988: setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
0989: setNextOfEnd(ret + RE.nodeSize, ret); // the start again
0990: setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
0991: setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
0992: }
0993:
0994: break;
0995:
0996: case '+': {
0997:
0998: // X+ is compiled as X({gotoX}|)
0999: int branch;
1000: branch = node(RE.OP_BRANCH, 0); // a new branch
1001: setNextOfEnd(ret, branch); // is added to the end of X
1002: setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1003: setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1004: setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1005: }
1006:
1007: break;
1008: }
1009: } else {
1010:
1011: // Add end after closured subexpr
1012: setNextOfEnd(ret, node(RE.OP_END, 0));
1013:
1014: // Actually do the closure now
1015: switch (closureType) {
1016: case '?':
1017: nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1018: break;
1019:
1020: case '*':
1021: nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1022: break;
1023:
1024: case '+':
1025: nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1026: break;
1027: }
1028:
1029: // Point to the expr after the closure
1030: setNextOfEnd(ret, lenInstruction);
1031: }
1032:
1033: return ret;
1034: }
1035:
1036: /**
1037: * Compiles a regular expression pattern into a program runnable by the pattern
1038: * matcher class 'RE'.
1039: *
1040: * @param pattern Regular expression pattern to compile (see RECompiler class
1041: * for details).
1042: * @return A compiled regular expression program.
1043: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1044: * @see RECompiler
1045: * @see RE
1046: */
1047: public REProgram compile(String pattern) throws RESyntaxException {
1048:
1049: // Initialize variables for compilation
1050: this .pattern = pattern; // Save pattern in instance variable
1051: len = pattern.length(); // Precompute pattern length for speed
1052: idx = 0; // Set parsing index to the first character
1053: lenInstruction = 0; // Set emitted instruction count to zero
1054: parens = 1; // Set paren level to 1 (the implicit outer parens)
1055: brackets = 0; // No bracketed closures yet
1056:
1057: // Initialize pass by reference flags value
1058: int[] flags = { NODE_TOPLEVEL };
1059:
1060: // Parse expression
1061: expr(flags);
1062:
1063: // Should be at end of input
1064: if (idx != len) {
1065: if (pattern.charAt(idx) == ')') {
1066: syntaxError("Unmatched close paren");
1067: }
1068:
1069: syntaxError("Unexpected input remains");
1070: }
1071:
1072: // Return the result
1073: char[] ins = new char[lenInstruction];
1074: System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1075:
1076: return new REProgram(ins);
1077: }
1078:
1079: /**
1080: * Emit a single character into the program stream.
1081: *
1082: * @param c Character to add
1083: */
1084: void emit(char c) {
1085:
1086: // Make room for character
1087: ensure(1);
1088:
1089: // Add character
1090: instruction[lenInstruction++] = c;
1091: }
1092:
1093: /**
1094: * Ensures that n more characters can fit in the program buffer.
1095: * If n more can't fit, then the size is doubled until it can.
1096: *
1097: * @param n Number of additional characters to ensure will fit.
1098: */
1099: void ensure(int n) {
1100:
1101: // Get current program length
1102: int curlen = instruction.length;
1103:
1104: // If the current length + n more is too much
1105: if (lenInstruction + n >= curlen) {
1106:
1107: // Double the size of the program array until n more will fit
1108: while (lenInstruction + n >= curlen) {
1109: curlen *= 2;
1110: }
1111:
1112: // Allocate new program array and move data into it
1113: char[] newInstruction = new char[curlen];
1114: System.arraycopy(instruction, 0, newInstruction, 0,
1115: lenInstruction);
1116: instruction = newInstruction;
1117: }
1118: }
1119:
1120: /**
1121: * Match an escape sequence. Handles quoted chars and octal escapes as well
1122: * as normal escape characters. Always advances the input stream by the
1123: * right amount. This code "understands" the subtle difference between an
1124: * octal escape and a backref. You can access the type of ESC_CLASS or
1125: * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
1126: *
1127: * @return ESC_* code or character if simple escape
1128: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1129: */
1130: char escape() throws RESyntaxException {
1131:
1132: // "Shouldn't" happen
1133: if (pattern.charAt(idx) != '\\') {
1134: internalError();
1135: }
1136: // Escape shouldn't occur as last character in string!
1137: if (idx + 1 == len) {
1138: syntaxError("Escape terminates string");
1139: }
1140:
1141: // Switch on character after backslash
1142: idx += 2;
1143:
1144: char escapeChar = pattern.charAt(idx - 1);
1145:
1146: switch (escapeChar) {
1147: case RE.E_BOUND:
1148: case RE.E_NBOUND:
1149: return ESC_COMPLEX;
1150:
1151: case RE.E_ALNUM:
1152: case RE.E_NALNUM:
1153: case RE.E_SPACE:
1154: case RE.E_NSPACE:
1155: case RE.E_DIGIT:
1156: case RE.E_NDIGIT:
1157: return ESC_CLASS;
1158:
1159: case 'u':
1160: case 'x': {
1161:
1162: // Exact required hex digits for escape type
1163: int hexDigits = (escapeChar == 'u' ? 4 : 2);
1164:
1165: // Parse up to hexDigits characters from input
1166: int val = 0;
1167:
1168: for (; idx < len && hexDigits-- > 0; idx++) {
1169:
1170: // Get char
1171: char c = pattern.charAt(idx);
1172:
1173: // If it's a hexadecimal digit (0-9)
1174: if (c >= '0' && c <= '9') {
1175:
1176: // Compute new value
1177: val = (val << 4) + c - '0';
1178: } else {
1179:
1180: // If it's a hexadecimal letter (a-f)
1181: c = Character.toLowerCase(c);
1182:
1183: if (c >= 'a' && c <= 'f') {
1184:
1185: // Compute new value
1186: val = (val << 4) + (c - 'a') + 10;
1187: } else {
1188:
1189: // If it's not a valid digit or hex letter, the escape must be invalid
1190: // because hexDigits of input have not been absorbed yet.
1191: syntaxError("Expected " + hexDigits
1192: + " hexadecimal digits after \\"
1193: + escapeChar);
1194: }
1195: }
1196: }
1197:
1198: return (char) val;
1199: }
1200: case 't':
1201: return '\t';
1202:
1203: case 'n':
1204: return '\n';
1205:
1206: case 'r':
1207: return '\r';
1208:
1209: case 'f':
1210: return '\f';
1211:
1212: case '0':
1213: case '1':
1214: case '2':
1215: case '3':
1216: case '4':
1217: case '5':
1218: case '6':
1219: case '7':
1220: case '8':
1221: case '9':
1222:
1223: // An octal escape starts with a 0 or has two digits in a row
1224: if ((idx < len && Character.isDigit(pattern.charAt(idx)))
1225: || escapeChar == '0') {
1226:
1227: // Handle \nnn octal escapes
1228: int val = escapeChar - '0';
1229:
1230: if (idx < len && Character.isDigit(pattern.charAt(idx))) {
1231: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1232:
1233: if (idx < len
1234: && Character.isDigit(pattern.charAt(idx))) {
1235: val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1236: }
1237: }
1238:
1239: return (char) val;
1240: }
1241:
1242: // It's actually a backreference (\[1-9]), not an escape
1243: return ESC_BACKREF;
1244:
1245: default:
1246:
1247: // Simple quoting of a character
1248: return escapeChar;
1249: }
1250: }
1251:
1252: /**
1253: * Compile an expression with possible parens around it. Paren matching
1254: * is done at this level so we can tie the branch tails together.
1255: *
1256: * @param flags Flag value passed by reference
1257: * @return Node index of expression in instruction array
1258: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1259: */
1260: int expr(int[] flags) throws RESyntaxException {
1261:
1262: // Create open paren node unless we were called from the top level (which has no parens)
1263: boolean paren = false;
1264: int ret = -1;
1265: int closeParens = parens;
1266:
1267: if ((flags[0] & NODE_TOPLEVEL) == 0
1268: && pattern.charAt(idx) == '(') {
1269: idx++;
1270: paren = true;
1271: ret = node(RE.OP_OPEN, parens++);
1272: }
1273:
1274: flags[0] &= ~NODE_TOPLEVEL;
1275:
1276: // Create a branch node
1277: int branch = branch(flags);
1278:
1279: if (ret == -1) {
1280: ret = branch;
1281: } else {
1282: setNextOfEnd(ret, branch);
1283: }
1284: // Loop through branches
1285: while (idx < len && pattern.charAt(idx) == '|') {
1286: idx++;
1287: branch = branch(flags);
1288: setNextOfEnd(ret, branch);
1289: }
1290:
1291: // Create an ending node (either a close paren or an OP_END)
1292: int end;
1293:
1294: if (paren) {
1295: if (idx < len && pattern.charAt(idx) == ')') {
1296: idx++;
1297: } else {
1298: syntaxError("Missing close paren");
1299: }
1300:
1301: end = node(RE.OP_CLOSE, closeParens);
1302: } else {
1303: end = node(RE.OP_END, 0);
1304: }
1305:
1306: // Append the ending node to the ret nodelist
1307: setNextOfEnd(ret, end);
1308:
1309: // Hook the ends of each branch to the end node
1310: for (int next = -1, i = ret; next != 0; next = instruction[i
1311: + RE.offsetNext], i += next) {
1312:
1313: // If branch, make the end of the branch's operand chain point to the end node.
1314: if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) {
1315: setNextOfEnd(i + RE.nodeSize, end);
1316: }
1317: }
1318:
1319: // Return the node list
1320: return ret;
1321: }
1322:
1323: /**
1324: * Throws a new internal error exception
1325: *
1326: * @throws Error Thrown in the event of an internal error.
1327: */
1328: void internalError() throws Error {
1329: throw new Error("Internal error!");
1330: }
1331:
1332: /**
1333: * Adds a new node
1334: *
1335: * @param opcode Opcode for node
1336: * @param opdata Opdata for node (only the low 16 bits are currently used)
1337: * @return Index of new node in program
1338: */
1339: int node(char opcode, int opdata) {
1340:
1341: // Make room for a new node
1342: ensure(RE.nodeSize);
1343:
1344: // Add new node at end
1345: instruction[lenInstruction + RE.offsetOpcode] = opcode;
1346: instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
1347: instruction[lenInstruction + RE.offsetNext] = 0;
1348: lenInstruction += RE.nodeSize;
1349:
1350: // Return index of new node
1351: return lenInstruction - RE.nodeSize;
1352: }
1353:
1354: /**
1355: * Inserts a node with a given opcode and opdata at insertAt. The node relative next
1356: * pointer is initialized to 0.
1357: *
1358: * @param opcode Opcode for new node
1359: * @param opdata Opdata for new node (only the low 16 bits are currently used)
1360: * @param insertAt Index at which to insert the new node in the program
1361: */
1362: void nodeInsert(char opcode, int opdata, int insertAt) {
1363:
1364: // Make room for a new node
1365: ensure(RE.nodeSize);
1366:
1367: // Move everything from insertAt to the end down nodeSize elements
1368: System.arraycopy(instruction, insertAt, instruction, insertAt
1369: + RE.nodeSize, lenInstruction - insertAt);
1370: instruction[insertAt + RE.offsetOpcode] = opcode;
1371: instruction[insertAt + RE.offsetOpdata] = (char) opdata;
1372: instruction[insertAt + RE.offsetNext] = 0;
1373: lenInstruction += RE.nodeSize;
1374: }
1375:
1376: /**
1377: * Appends a node to the end of a node chain
1378: *
1379: * @param node Start of node chain to traverse
1380: * @param pointTo Node to have the tail of the chain point to
1381: */
1382: void setNextOfEnd(int node, int pointTo) {
1383:
1384: // Traverse the chain until the next offset is 0
1385: int next;
1386:
1387: while ((next = instruction[node + RE.offsetNext]) != 0) {
1388: node += next;
1389: }
1390:
1391: // Point the last node in the chain to pointTo.
1392: instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
1393: }
1394:
1395: /**
1396: * Throws a new syntax error exception
1397: *
1398: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1399: */
1400: void syntaxError(String s) throws RESyntaxException {
1401: throw new RESyntaxException(s);
1402: }
1403:
1404: /**
1405: * Match a terminal node.
1406: *
1407: * @param flags Flags
1408: * @return Index of terminal node (closeable)
1409: * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1410: */
1411: int terminal(int[] flags) throws RESyntaxException {
1412: switch (pattern.charAt(idx)) {
1413: case RE.OP_EOL:
1414: case RE.OP_BOL:
1415: case RE.OP_ANY:
1416: return node(pattern.charAt(idx++), 0);
1417:
1418: case '[':
1419: return characterClass();
1420:
1421: case '(':
1422: return expr(flags);
1423:
1424: case ')':
1425: syntaxError("Unexpected close paren");
1426:
1427: case '|':
1428: internalError();
1429:
1430: case ']':
1431: syntaxError("Mismatched class");
1432:
1433: case 0:
1434: syntaxError("Unexpected end of input");
1435:
1436: case '?':
1437: case '+':
1438: case '{':
1439: case '*':
1440: syntaxError("Missing operand to closure");
1441:
1442: case '\\': {
1443:
1444: // Don't forget, escape() advances the input stream!
1445: int idxBeforeEscape = idx;
1446:
1447: // Switch on escaped character
1448: switch (escape()) {
1449: case ESC_CLASS:
1450: case ESC_COMPLEX:
1451: flags[0] &= ~NODE_NULLABLE;
1452:
1453: return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
1454:
1455: case ESC_BACKREF: {
1456: char backreference = (char) (pattern.charAt(idx - 1) - '0');
1457:
1458: if (parens <= backreference) {
1459: syntaxError("Bad backreference");
1460: }
1461:
1462: flags[0] |= NODE_NULLABLE;
1463:
1464: return node(RE.OP_BACKREF, backreference);
1465: }
1466: default:
1467:
1468: // We had a simple escape and we want to have it end up in
1469: // an atom, so we back up and fall though to the default handling
1470: idx = idxBeforeEscape;
1471: flags[0] &= ~NODE_NULLABLE;
1472: break;
1473: }
1474: }
1475: }
1476:
1477: // Everything above either fails or returns.
1478: // If it wasn't one of the above, it must be the start of an atom.
1479: flags[0] &= ~NODE_NULLABLE;
1480:
1481: return atom();
1482: }
1483: }
|