0001: /* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
0002: *
0003: * ***** BEGIN LICENSE BLOCK *****
0004: * Version: MPL 1.1/GPL 2.0
0005: *
0006: * The contents of this file are subject to the Mozilla Public License Version
0007: * 1.1 (the "License"); you may not use this file except in compliance with
0008: * the License. You may obtain a copy of the License at
0009: * http://www.mozilla.org/MPL/
0010: *
0011: * Software distributed under the License is distributed on an "AS IS" basis,
0012: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0013: * for the specific language governing rights and limitations under the
0014: * License.
0015: *
0016: * The Original Code is Rhino code, released
0017: * May 6, 1998.
0018: *
0019: * The Initial Developer of the Original Code is
0020: * Netscape Communications Corporation.
0021: * Portions created by the Initial Developer are Copyright (C) 1997-1999
0022: * the Initial Developer. All Rights Reserved.
0023: *
0024: * Contributor(s):
0025: * Norris Boyd
0026: * Igor Bukanov
0027: * Brendan Eich
0028: * Matthias Radestock
0029: *
0030: * Alternatively, the contents of this file may be used under the terms of
0031: * the GNU General Public License Version 2 or later (the "GPL"), in which
0032: * case the provisions of the GPL are applicable instead of those above. If
0033: * you wish to allow use of your version of this file only under the terms of
0034: * the GPL and not to allow others to use your version of this file under the
0035: * MPL, indicate your decision by deleting the provisions above and replacing
0036: * them with the notice and other provisions required by the GPL. If you do
0037: * not delete the provisions above, a recipient may use your version of this
0038: * file under either the MPL or the GPL.
0039: *
0040: * ***** END LICENSE BLOCK ***** */
0041:
0042: package org.mozilla.javascript.regexp;
0043:
0044: import java.io.Serializable;
0045:
0046: import org.mozilla.javascript.Context;
0047: import org.mozilla.javascript.Function;
0048: import org.mozilla.javascript.IdFunctionObject;
0049: import org.mozilla.javascript.IdScriptableObject;
0050: import org.mozilla.javascript.Kit;
0051: import org.mozilla.javascript.ScriptRuntime;
0052: import org.mozilla.javascript.Scriptable;
0053: import org.mozilla.javascript.ScriptableObject;
0054: import org.mozilla.javascript.Undefined;
0055:
0056: /**
0057: * This class implements the RegExp native object.
0058: *
0059: * Revision History:
0060: * Implementation in C by Brendan Eich
0061: * Initial port to Java by Norris Boyd from jsregexp.c version 1.36
0062: * Merged up to version 1.38, which included Unicode support.
0063: * Merged bug fixes in version 1.39.
0064: * Merged JSFUN13_BRANCH changes up to 1.32.2.13
0065: *
0066: * @author Brendan Eich
0067: * @author Norris Boyd
0068: */
0069:
0070: public class NativeRegExp extends IdScriptableObject implements
0071: Function {
0072: static final long serialVersionUID = 4965263491464903264L;
0073:
0074: private static final Object REGEXP_TAG = new Object();
0075:
0076: public static final int JSREG_GLOB = 0x1; // 'g' flag: global
0077: public static final int JSREG_FOLD = 0x2; // 'i' flag: fold
0078: public static final int JSREG_MULTILINE = 0x4; // 'm' flag: multiline
0079:
0080: //type of match to perform
0081: public static final int TEST = 0;
0082: public static final int MATCH = 1;
0083: public static final int PREFIX = 2;
0084:
0085: private static final boolean debug = false;
0086:
0087: private static final byte REOP_EMPTY = 0; /* match rest of input against rest of r.e. */
0088: private static final byte REOP_ALT = 1; /* alternative subexpressions in kid and next */
0089: private static final byte REOP_BOL = 2; /* beginning of input (or line if multiline) */
0090: private static final byte REOP_EOL = 3; /* end of input (or line if multiline) */
0091: private static final byte REOP_WBDRY = 4; /* match "" at word boundary */
0092: private static final byte REOP_WNONBDRY = 5; /* match "" at word non-boundary */
0093: private static final byte REOP_QUANT = 6; /* quantified atom: atom{1,2} */
0094: private static final byte REOP_STAR = 7; /* zero or more occurrences of kid */
0095: private static final byte REOP_PLUS = 8; /* one or more occurrences of kid */
0096: private static final byte REOP_OPT = 9; /* optional subexpression in kid */
0097: private static final byte REOP_LPAREN = 10; /* left paren bytecode: kid is u.num'th sub-regexp */
0098: private static final byte REOP_RPAREN = 11; /* right paren bytecode */
0099: private static final byte REOP_DOT = 12; /* stands for any character */
0100: // private static final byte REOP_CCLASS = 13; /* character class: [a-f] */
0101: private static final byte REOP_DIGIT = 14; /* match a digit char: [0-9] */
0102: private static final byte REOP_NONDIGIT = 15; /* match a non-digit char: [^0-9] */
0103: private static final byte REOP_ALNUM = 16; /* match an alphanumeric char: [0-9a-z_A-Z] */
0104: private static final byte REOP_NONALNUM = 17; /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
0105: private static final byte REOP_SPACE = 18; /* match a whitespace char */
0106: private static final byte REOP_NONSPACE = 19; /* match a non-whitespace char */
0107: private static final byte REOP_BACKREF = 20; /* back-reference (e.g., \1) to a parenthetical */
0108: private static final byte REOP_FLAT = 21; /* match a flat string */
0109: private static final byte REOP_FLAT1 = 22; /* match a single char */
0110: private static final byte REOP_JUMP = 23; /* for deoptimized closure loops */
0111: // private static final byte REOP_DOTSTAR = 24; /* optimize .* to use a single opcode */
0112: // private static final byte REOP_ANCHOR = 25; /* like .* but skips left context to unanchored r.e. */
0113: // private static final byte REOP_EOLONLY = 26; /* $ not preceded by any pattern */
0114: // private static final byte REOP_UCFLAT = 27; /* flat Unicode string; len immediate counts chars */
0115: private static final byte REOP_UCFLAT1 = 28; /* single Unicode char */
0116: // private static final byte REOP_UCCLASS = 29; /* Unicode character class, vector of chars to match */
0117: // private static final byte REOP_NUCCLASS = 30; /* negated Unicode character class */
0118: // private static final byte REOP_BACKREFi = 31; /* case-independent REOP_BACKREF */
0119: private static final byte REOP_FLATi = 32; /* case-independent REOP_FLAT */
0120: private static final byte REOP_FLAT1i = 33; /* case-independent REOP_FLAT1 */
0121: // private static final byte REOP_UCFLATi = 34; /* case-independent REOP_UCFLAT */
0122: private static final byte REOP_UCFLAT1i = 35; /* case-independent REOP_UCFLAT1 */
0123: // private static final byte REOP_ANCHOR1 = 36; /* first-char discriminating REOP_ANCHOR */
0124: // private static final byte REOP_NCCLASS = 37; /* negated 8-bit character class */
0125: // private static final byte REOP_DOTSTARMIN = 38; /* ungreedy version of REOP_DOTSTAR */
0126: // private static final byte REOP_LPARENNON = 39; /* non-capturing version of REOP_LPAREN */
0127: // private static final byte REOP_RPARENNON = 40; /* non-capturing version of REOP_RPAREN */
0128: private static final byte REOP_ASSERT = 41; /* zero width positive lookahead assertion */
0129: private static final byte REOP_ASSERT_NOT = 42; /* zero width negative lookahead assertion */
0130: private static final byte REOP_ASSERTTEST = 43; /* sentinel at end of assertion child */
0131: private static final byte REOP_ASSERTNOTTEST = 44; /* sentinel at end of !assertion child */
0132: private static final byte REOP_MINIMALSTAR = 45; /* non-greedy version of * */
0133: private static final byte REOP_MINIMALPLUS = 46; /* non-greedy version of + */
0134: private static final byte REOP_MINIMALOPT = 47; /* non-greedy version of ? */
0135: private static final byte REOP_MINIMALQUANT = 48; /* non-greedy version of {} */
0136: private static final byte REOP_ENDCHILD = 49; /* sentinel at end of quantifier child */
0137: private static final byte REOP_CLASS = 50; /* character class with index */
0138: private static final byte REOP_REPEAT = 51; /* directs execution of greedy quantifier */
0139: private static final byte REOP_MINIMALREPEAT = 52; /* directs execution of non-greedy quantifier */
0140: private static final byte REOP_END = 53;
0141:
0142: public static void init(Context cx, Scriptable scope, boolean sealed) {
0143:
0144: NativeRegExp proto = new NativeRegExp();
0145: proto.re = (RECompiled) compileRE(cx, "", null, false);
0146: proto.activatePrototypeMap(MAX_PROTOTYPE_ID);
0147: proto.setParentScope(scope);
0148: proto.setPrototype(getObjectPrototype(scope));
0149:
0150: NativeRegExpCtor ctor = new NativeRegExpCtor();
0151: // Bug #324006: ECMA-262 15.10.6.1 says "The initial value of
0152: // RegExp.prototype.constructor is the builtin RegExp constructor."
0153: proto.put("constructor", proto, ctor);
0154:
0155: ScriptRuntime.setFunctionProtoAndParent(ctor, scope);
0156:
0157: ctor.setImmunePrototypeProperty(proto);
0158:
0159: if (sealed) {
0160: proto.sealObject();
0161: ctor.sealObject();
0162: }
0163:
0164: defineProperty(scope, "RegExp", ctor, ScriptableObject.DONTENUM);
0165: }
0166:
0167: NativeRegExp(Scriptable scope, Object regexpCompiled) {
0168: this .re = (RECompiled) regexpCompiled;
0169: this .lastIndex = 0;
0170: ScriptRuntime.setObjectProtoAndParent(this , scope);
0171: }
0172:
0173: public String getClassName() {
0174: return "RegExp";
0175: }
0176:
0177: public Object call(Context cx, Scriptable scope,
0178: Scriptable this Obj, Object[] args) {
0179: return execSub(cx, scope, args, MATCH);
0180: }
0181:
0182: public Scriptable construct(Context cx, Scriptable scope,
0183: Object[] args) {
0184: return (Scriptable) execSub(cx, scope, args, MATCH);
0185: }
0186:
0187: Scriptable compile(Context cx, Scriptable scope, Object[] args) {
0188: if (args.length > 0 && args[0] instanceof NativeRegExp) {
0189: if (args.length > 1 && args[1] != Undefined.instance) {
0190: // report error
0191: throw ScriptRuntime
0192: .typeError0("msg.bad.regexp.compile");
0193: }
0194: NativeRegExp thatObj = (NativeRegExp) args[0];
0195: this .re = thatObj.re;
0196: this .lastIndex = thatObj.lastIndex;
0197: return this ;
0198: }
0199: String s = args.length == 0 ? "" : ScriptRuntime
0200: .toString(args[0]);
0201: String global = args.length > 1
0202: && args[1] != Undefined.instance ? ScriptRuntime
0203: .toString(args[1]) : null;
0204: this .re = (RECompiled) compileRE(cx, s, global, false);
0205: this .lastIndex = 0;
0206: return this ;
0207: }
0208:
0209: public String toString() {
0210: StringBuffer buf = new StringBuffer();
0211: buf.append('/');
0212: if (re.source.length != 0) {
0213: buf.append(re.source);
0214: } else {
0215: // See bugzilla 226045
0216: buf.append("(?:)");
0217: }
0218: buf.append('/');
0219: if ((re.flags & JSREG_GLOB) != 0)
0220: buf.append('g');
0221: if ((re.flags & JSREG_FOLD) != 0)
0222: buf.append('i');
0223: if ((re.flags & JSREG_MULTILINE) != 0)
0224: buf.append('m');
0225: return buf.toString();
0226: }
0227:
0228: NativeRegExp() {
0229: }
0230:
0231: private static RegExpImpl getImpl(Context cx) {
0232: return (RegExpImpl) ScriptRuntime.getRegExpProxy(cx);
0233: }
0234:
0235: private Object execSub(Context cx, Scriptable scopeObj,
0236: Object[] args, int matchType) {
0237: RegExpImpl reImpl = getImpl(cx);
0238: String str;
0239: if (args.length == 0) {
0240: str = reImpl.input;
0241: if (str == null) {
0242: reportError("msg.no.re.input.for", toString());
0243: }
0244: } else {
0245: str = ScriptRuntime.toString(args[0]);
0246: }
0247: double d = ((re.flags & JSREG_GLOB) != 0) ? lastIndex : 0;
0248:
0249: Object rval;
0250: if (d < 0 || str.length() < d) {
0251: lastIndex = 0;
0252: rval = null;
0253: } else {
0254: int indexp[] = { (int) d };
0255: rval = executeRegExp(cx, scopeObj, reImpl, str, indexp,
0256: matchType);
0257: if ((re.flags & JSREG_GLOB) != 0) {
0258: lastIndex = (rval == null || rval == Undefined.instance) ? 0
0259: : indexp[0];
0260: }
0261: }
0262: return rval;
0263: }
0264:
0265: static Object compileRE(Context cx, String str, String global,
0266: boolean flat) {
0267: RECompiled regexp = new RECompiled();
0268: regexp.source = str.toCharArray();
0269: int length = str.length();
0270:
0271: int flags = 0;
0272: if (global != null) {
0273: for (int i = 0; i < global.length(); i++) {
0274: char c = global.charAt(i);
0275: if (c == 'g') {
0276: flags |= JSREG_GLOB;
0277: } else if (c == 'i') {
0278: flags |= JSREG_FOLD;
0279: } else if (c == 'm') {
0280: flags |= JSREG_MULTILINE;
0281: } else {
0282: reportError("msg.invalid.re.flag", String
0283: .valueOf(c));
0284: }
0285: }
0286: }
0287: regexp.flags = flags;
0288:
0289: CompilerState state = new CompilerState(cx, regexp.source,
0290: length, flags);
0291: if (flat && length > 0) {
0292: if (debug) {
0293: System.out.println("flat = \"" + str + "\"");
0294: }
0295: state.result = new RENode(REOP_FLAT);
0296: state.result.chr = state.cpbegin[0];
0297: state.result.length = length;
0298: state.result.flatIndex = 0;
0299: state.progLength += 5;
0300: } else if (!parseDisjunction(state))
0301: return null;
0302:
0303: regexp.program = new byte[state.progLength + 1];
0304: if (state.classCount != 0) {
0305: regexp.classList = new RECharSet[state.classCount];
0306: regexp.classCount = state.classCount;
0307: }
0308: int endPC = emitREBytecode(state, regexp, 0, state.result);
0309: regexp.program[endPC++] = REOP_END;
0310:
0311: if (debug) {
0312: System.out.println("Prog. length = " + endPC);
0313: for (int i = 0; i < endPC; i++) {
0314: System.out.print(regexp.program[i]);
0315: if (i < (endPC - 1))
0316: System.out.print(", ");
0317: }
0318: System.out.println();
0319: }
0320: regexp.parenCount = state.parenCount;
0321:
0322: // If re starts with literal, init anchorCh accordingly
0323: switch (regexp.program[0]) {
0324: case REOP_UCFLAT1:
0325: case REOP_UCFLAT1i:
0326: regexp.anchorCh = (char) getIndex(regexp.program, 1);
0327: break;
0328: case REOP_FLAT1:
0329: case REOP_FLAT1i:
0330: regexp.anchorCh = (char) (regexp.program[1] & 0xFF);
0331: break;
0332: case REOP_FLAT:
0333: case REOP_FLATi:
0334: int k = getIndex(regexp.program, 1);
0335: regexp.anchorCh = regexp.source[k];
0336: break;
0337: }
0338:
0339: if (debug) {
0340: if (regexp.anchorCh >= 0) {
0341: System.out.println("Anchor ch = '"
0342: + (char) regexp.anchorCh + "'");
0343: }
0344: }
0345: return regexp;
0346: }
0347:
0348: static boolean isDigit(char c) {
0349: return '0' <= c && c <= '9';
0350: }
0351:
0352: private static boolean isWord(char c) {
0353: return Character.isLetter(c) || isDigit(c) || c == '_';
0354: }
0355:
0356: private static boolean isLineTerm(char c) {
0357: return ScriptRuntime.isJSLineTerminator(c);
0358: }
0359:
0360: private static boolean isREWhiteSpace(int c) {
0361: return (c == '\u0020' || c == '\u0009' || c == '\n'
0362: || c == '\r' || c == 0x2028 || c == 0x2029
0363: || c == '\u000C' || c == '\u000B' || c == '\u00A0' || Character
0364: .getType((char) c) == Character.SPACE_SEPARATOR);
0365: }
0366:
0367: /*
0368: *
0369: * 1. If IgnoreCase is false, return ch.
0370: * 2. Let u be ch converted to upper case as if by calling
0371: * String.prototype.toUpperCase on the one-character string ch.
0372: * 3. If u does not consist of a single character, return ch.
0373: * 4. Let cu be u's character.
0374: * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
0375: * code point value is less than decimal 128, then return ch.
0376: * 6. Return cu.
0377: */
0378: private static char upcase(char ch) {
0379: if (ch < 128) {
0380: if ('a' <= ch && ch <= 'z') {
0381: return (char) (ch + ('A' - 'a'));
0382: }
0383: return ch;
0384: }
0385: char cu = Character.toUpperCase(ch);
0386: if ((ch >= 128) && (cu < 128))
0387: return ch;
0388: return cu;
0389: }
0390:
0391: private static char downcase(char ch) {
0392: if (ch < 128) {
0393: if ('A' <= ch && ch <= 'Z') {
0394: return (char) (ch + ('a' - 'A'));
0395: }
0396: return ch;
0397: }
0398: char cl = Character.toLowerCase(ch);
0399: if ((ch >= 128) && (cl < 128))
0400: return ch;
0401: return cl;
0402: }
0403:
0404: /*
0405: * Validates and converts hex ascii value.
0406: */
0407: private static int toASCIIHexDigit(int c) {
0408: if (c < '0')
0409: return -1;
0410: if (c <= '9') {
0411: return c - '0';
0412: }
0413: c |= 0x20;
0414: if ('a' <= c && c <= 'f') {
0415: return c - 'a' + 10;
0416: }
0417: return -1;
0418: }
0419:
0420: /*
0421: * Top-down regular expression grammar, based closely on Perl4.
0422: *
0423: * regexp: altern A regular expression is one or more
0424: * altern '|' regexp alternatives separated by vertical bar.
0425: */
0426: private static boolean parseDisjunction(CompilerState state) {
0427: if (!parseAlternative(state))
0428: return false;
0429: char[] source = state.cpbegin;
0430: int index = state.cp;
0431: if (index != source.length && source[index] == '|') {
0432: RENode altResult;
0433: ++state.cp;
0434: altResult = new RENode(REOP_ALT);
0435: altResult.kid = state.result;
0436: if (!parseDisjunction(state))
0437: return false;
0438: altResult.kid2 = state.result;
0439: state.result = altResult;
0440: /* ALT, <next>, ..., JUMP, <end> ... JUMP <end> */
0441: state.progLength += 9;
0442: }
0443: return true;
0444: }
0445:
0446: /*
0447: * altern: item An alternative is one or more items,
0448: * item altern concatenated together.
0449: */
0450: private static boolean parseAlternative(CompilerState state) {
0451: RENode headTerm = null;
0452: RENode tailTerm = null;
0453: char[] source = state.cpbegin;
0454: while (true) {
0455: if (state.cp == state.cpend
0456: || source[state.cp] == '|'
0457: || (state.parenNesting != 0 && source[state.cp] == ')')) {
0458: if (headTerm == null) {
0459: state.result = new RENode(REOP_EMPTY);
0460: } else
0461: state.result = headTerm;
0462: return true;
0463: }
0464: if (!parseTerm(state))
0465: return false;
0466: if (headTerm == null)
0467: headTerm = state.result;
0468: else {
0469: if (tailTerm == null) {
0470: headTerm.next = state.result;
0471: tailTerm = state.result;
0472: while (tailTerm.next != null)
0473: tailTerm = tailTerm.next;
0474: } else {
0475: tailTerm.next = state.result;
0476: tailTerm = tailTerm.next;
0477: while (tailTerm.next != null)
0478: tailTerm = tailTerm.next;
0479: }
0480: }
0481: }
0482: }
0483:
0484: /* calculate the total size of the bitmap required for a class expression */
0485: private static boolean calculateBitmapSize(CompilerState state,
0486: RENode target, char[] src, int index, int end) {
0487: char rangeStart = 0;
0488: char c;
0489: int n;
0490: int nDigits;
0491: int i;
0492: int max = 0;
0493: boolean inRange = false;
0494:
0495: target.bmsize = 0;
0496:
0497: if (index == end)
0498: return true;
0499:
0500: if (src[index] == '^')
0501: ++index;
0502:
0503: while (index != end) {
0504: int localMax = 0;
0505: nDigits = 2;
0506: switch (src[index]) {
0507: case '\\':
0508: ++index;
0509: c = src[index++];
0510: switch (c) {
0511: case 'b':
0512: localMax = 0x8;
0513: break;
0514: case 'f':
0515: localMax = 0xC;
0516: break;
0517: case 'n':
0518: localMax = 0xA;
0519: break;
0520: case 'r':
0521: localMax = 0xD;
0522: break;
0523: case 't':
0524: localMax = 0x9;
0525: break;
0526: case 'v':
0527: localMax = 0xB;
0528: break;
0529: case 'c':
0530: if (((index + 1) < end)
0531: && Character.isLetter(src[index + 1]))
0532: localMax = (char) (src[index++] & 0x1F);
0533: else
0534: localMax = '\\';
0535: break;
0536: case 'u':
0537: nDigits += 2;
0538: // fall thru...
0539: case 'x':
0540: n = 0;
0541: for (i = 0; (i < nDigits) && (index < end); i++) {
0542: c = src[index++];
0543: n = Kit.xDigitToInt(c, n);
0544: if (n < 0) {
0545: // Back off to accepting the original
0546: // '\' as a literal
0547: index -= (i + 1);
0548: n = '\\';
0549: break;
0550: }
0551: }
0552: localMax = n;
0553: break;
0554: case 'd':
0555: if (inRange) {
0556: reportError("msg.bad.range", "");
0557: return false;
0558: }
0559: localMax = '9';
0560: break;
0561: case 'D':
0562: case 's':
0563: case 'S':
0564: case 'w':
0565: case 'W':
0566: if (inRange) {
0567: reportError("msg.bad.range", "");
0568: return false;
0569: }
0570: target.bmsize = 65535;
0571: return true;
0572: case '0':
0573: case '1':
0574: case '2':
0575: case '3':
0576: case '4':
0577: case '5':
0578: case '6':
0579: case '7':
0580: /*
0581: * This is a non-ECMA extension - decimal escapes (in this
0582: * case, octal!) are supposed to be an error inside class
0583: * ranges, but supported here for backwards compatibility.
0584: *
0585: */
0586: n = (c - '0');
0587: c = src[index];
0588: if ('0' <= c && c <= '7') {
0589: index++;
0590: n = 8 * n + (c - '0');
0591: c = src[index];
0592: if ('0' <= c && c <= '7') {
0593: index++;
0594: i = 8 * n + (c - '0');
0595: if (i <= 0377)
0596: n = i;
0597: else
0598: index--;
0599: }
0600: }
0601: localMax = n;
0602: break;
0603:
0604: default:
0605: localMax = c;
0606: break;
0607: }
0608: break;
0609: default:
0610: localMax = src[index++];
0611: break;
0612: }
0613: if (inRange) {
0614: if (rangeStart > localMax) {
0615: reportError("msg.bad.range", "");
0616: return false;
0617: }
0618: inRange = false;
0619: } else {
0620: if (index < (end - 1)) {
0621: if (src[index] == '-') {
0622: ++index;
0623: inRange = true;
0624: rangeStart = (char) localMax;
0625: continue;
0626: }
0627: }
0628: }
0629: if ((state.flags & JSREG_FOLD) != 0) {
0630: char cu = upcase((char) localMax);
0631: char cd = downcase((char) localMax);
0632: localMax = (cu >= cd) ? cu : cd;
0633: }
0634: if (localMax > max)
0635: max = localMax;
0636: }
0637: target.bmsize = max;
0638: return true;
0639: }
0640:
0641: /*
0642: * item: assertion An item is either an assertion or
0643: * quantatom a quantified atom.
0644: *
0645: * assertion: '^' Assertions match beginning of string
0646: * (or line if the class static property
0647: * RegExp.multiline is true).
0648: * '$' End of string (or line if the class
0649: * static property RegExp.multiline is
0650: * true).
0651: * '\b' Word boundary (between \w and \W).
0652: * '\B' Word non-boundary.
0653: *
0654: * quantatom: atom An unquantified atom.
0655: * quantatom '{' n ',' m '}'
0656: * Atom must occur between n and m times.
0657: * quantatom '{' n ',' '}' Atom must occur at least n times.
0658: * quantatom '{' n '}' Atom must occur exactly n times.
0659: * quantatom '*' Zero or more times (same as {0,}).
0660: * quantatom '+' One or more times (same as {1,}).
0661: * quantatom '?' Zero or one time (same as {0,1}).
0662: *
0663: * any of which can be optionally followed by '?' for ungreedy
0664: *
0665: * atom: '(' regexp ')' A parenthesized regexp (what matched
0666: * can be addressed using a backreference,
0667: * see '\' n below).
0668: * '.' Matches any char except '\n'.
0669: * '[' classlist ']' A character class.
0670: * '[' '^' classlist ']' A negated character class.
0671: * '\f' Form Feed.
0672: * '\n' Newline (Line Feed).
0673: * '\r' Carriage Return.
0674: * '\t' Horizontal Tab.
0675: * '\v' Vertical Tab.
0676: * '\d' A digit (same as [0-9]).
0677: * '\D' A non-digit.
0678: * '\w' A word character, [0-9a-z_A-Z].
0679: * '\W' A non-word character.
0680: * '\s' A whitespace character, [ \b\f\n\r\t\v].
0681: * '\S' A non-whitespace character.
0682: * '\' n A backreference to the nth (n decimal
0683: * and positive) parenthesized expression.
0684: * '\' octal An octal escape sequence (octal must be
0685: * two or three digits long, unless it is
0686: * 0 for the null character).
0687: * '\x' hex A hex escape (hex must be two digits).
0688: * '\c' ctrl A control character, ctrl is a letter.
0689: * '\' literalatomchar Any character except one of the above
0690: * that follow '\' in an atom.
0691: * otheratomchar Any character not first among the other
0692: * atom right-hand sides.
0693: */
0694:
0695: private static void doFlat(CompilerState state, char c) {
0696: state.result = new RENode(REOP_FLAT);
0697: state.result.chr = c;
0698: state.result.length = 1;
0699: state.result.flatIndex = -1;
0700: state.progLength += 3;
0701: }
0702:
0703: private static int getDecimalValue(char c, CompilerState state,
0704: int maxValue, String overflowMessageId) {
0705: boolean overflow = false;
0706: int start = state.cp;
0707: char[] src = state.cpbegin;
0708: int value = c - '0';
0709: for (; state.cp != state.cpend; ++state.cp) {
0710: c = src[state.cp];
0711: if (!isDigit(c)) {
0712: break;
0713: }
0714: if (!overflow) {
0715: int digit = c - '0';
0716: if (value < (maxValue - digit) / 10) {
0717: value = value * 10 + digit;
0718: } else {
0719: overflow = true;
0720: value = maxValue;
0721: }
0722: }
0723: }
0724: if (overflow) {
0725: reportError(overflowMessageId, String.valueOf(src, start,
0726: state.cp - start));
0727: }
0728: return value;
0729: }
0730:
0731: private static boolean parseTerm(CompilerState state) {
0732: char[] src = state.cpbegin;
0733: char c = src[state.cp++];
0734: int nDigits = 2;
0735: int parenBaseCount = state.parenCount;
0736: int num, tmp;
0737: RENode term;
0738: int termStart;
0739:
0740: switch (c) {
0741: /* assertions and atoms */
0742: case '^':
0743: state.result = new RENode(REOP_BOL);
0744: state.progLength++;
0745: return true;
0746: case '$':
0747: state.result = new RENode(REOP_EOL);
0748: state.progLength++;
0749: return true;
0750: case '\\':
0751: if (state.cp < state.cpend) {
0752: c = src[state.cp++];
0753: switch (c) {
0754: /* assertion escapes */
0755: case 'b':
0756: state.result = new RENode(REOP_WBDRY);
0757: state.progLength++;
0758: return true;
0759: case 'B':
0760: state.result = new RENode(REOP_WNONBDRY);
0761: state.progLength++;
0762: return true;
0763: /* Decimal escape */
0764: case '0':
0765: /*
0766: * Under 'strict' ECMA 3, we interpret \0 as NUL and don't accept octal.
0767: * However, (XXX and since Rhino doesn't have a 'strict' mode) we'll just
0768: * behave the old way for compatibility reasons.
0769: * (see http://bugzilla.mozilla.org/show_bug.cgi?id=141078)
0770: *
0771: */
0772: reportWarning(state.cx, "msg.bad.backref", "");
0773: /* octal escape */
0774: num = 0;
0775: while (state.cp < state.cpend) {
0776: c = src[state.cp];
0777: if ((c >= '0') && (c <= '7')) {
0778: state.cp++;
0779: tmp = 8 * num + (c - '0');
0780: if (tmp > 0377)
0781: break;
0782: num = tmp;
0783: } else
0784: break;
0785: }
0786: c = (char) (num);
0787: doFlat(state, c);
0788: break;
0789: case '1':
0790: case '2':
0791: case '3':
0792: case '4':
0793: case '5':
0794: case '6':
0795: case '7':
0796: case '8':
0797: case '9':
0798: termStart = state.cp - 1;
0799: num = getDecimalValue(c, state, 0xFFFF,
0800: "msg.overlarge.backref");
0801: if (num > state.parenCount)
0802: reportWarning(state.cx, "msg.bad.backref", "");
0803: /*
0804: * n > 9 or > count of parentheses,
0805: * then treat as octal instead.
0806: */
0807: if ((num > 9) && (num > state.parenCount)) {
0808: state.cp = termStart;
0809: num = 0;
0810: while (state.cp < state.cpend) {
0811: c = src[state.cp];
0812: if ((c >= '0') && (c <= '7')) {
0813: state.cp++;
0814: tmp = 8 * num + (c - '0');
0815: if (tmp > 0377)
0816: break;
0817: num = tmp;
0818: } else
0819: break;
0820: }
0821: c = (char) (num);
0822: doFlat(state, c);
0823: break;
0824: }
0825: /* otherwise, it's a back-reference */
0826: state.result = new RENode(REOP_BACKREF);
0827: state.result.parenIndex = num - 1;
0828: state.progLength += 3;
0829: break;
0830: /* Control escape */
0831: case 'f':
0832: c = 0xC;
0833: doFlat(state, c);
0834: break;
0835: case 'n':
0836: c = 0xA;
0837: doFlat(state, c);
0838: break;
0839: case 'r':
0840: c = 0xD;
0841: doFlat(state, c);
0842: break;
0843: case 't':
0844: c = 0x9;
0845: doFlat(state, c);
0846: break;
0847: case 'v':
0848: c = 0xB;
0849: doFlat(state, c);
0850: break;
0851: /* Control letter */
0852: case 'c':
0853: if (((state.cp + 1) < state.cpend)
0854: && Character.isLetter(src[state.cp + 1]))
0855: c = (char) (src[state.cp++] & 0x1F);
0856: else {
0857: /* back off to accepting the original '\' as a literal */
0858: --state.cp;
0859: c = '\\';
0860: }
0861: doFlat(state, c);
0862: break;
0863: /* UnicodeEscapeSequence */
0864: case 'u':
0865: nDigits += 2;
0866: // fall thru...
0867: /* HexEscapeSequence */
0868: case 'x': {
0869: int n = 0;
0870: int i;
0871: for (i = 0; (i < nDigits)
0872: && (state.cp < state.cpend); i++) {
0873: c = src[state.cp++];
0874: n = Kit.xDigitToInt(c, n);
0875: if (n < 0) {
0876: // Back off to accepting the original
0877: // 'u' or 'x' as a literal
0878: state.cp -= (i + 2);
0879: n = src[state.cp++];
0880: break;
0881: }
0882: }
0883: c = (char) (n);
0884: }
0885: doFlat(state, c);
0886: break;
0887: /* Character class escapes */
0888: case 'd':
0889: state.result = new RENode(REOP_DIGIT);
0890: state.progLength++;
0891: break;
0892: case 'D':
0893: state.result = new RENode(REOP_NONDIGIT);
0894: state.progLength++;
0895: break;
0896: case 's':
0897: state.result = new RENode(REOP_SPACE);
0898: state.progLength++;
0899: break;
0900: case 'S':
0901: state.result = new RENode(REOP_NONSPACE);
0902: state.progLength++;
0903: break;
0904: case 'w':
0905: state.result = new RENode(REOP_ALNUM);
0906: state.progLength++;
0907: break;
0908: case 'W':
0909: state.result = new RENode(REOP_NONALNUM);
0910: state.progLength++;
0911: break;
0912: /* IdentityEscape */
0913: default:
0914: state.result = new RENode(REOP_FLAT);
0915: state.result.chr = c;
0916: state.result.length = 1;
0917: state.result.flatIndex = state.cp - 1;
0918: state.progLength += 3;
0919: break;
0920: }
0921: break;
0922: } else {
0923: /* a trailing '\' is an error */
0924: reportError("msg.trail.backslash", "");
0925: return false;
0926: }
0927: case '(': {
0928: RENode result = null;
0929: termStart = state.cp;
0930: if (state.cp + 1 < state.cpend
0931: && src[state.cp] == '?'
0932: && ((c = src[state.cp + 1]) == '=' || c == '!' || c == ':')) {
0933: state.cp += 2;
0934: if (c == '=') {
0935: result = new RENode(REOP_ASSERT);
0936: /* ASSERT, <next>, ... ASSERTTEST */
0937: state.progLength += 4;
0938: } else if (c == '!') {
0939: result = new RENode(REOP_ASSERT_NOT);
0940: /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
0941: state.progLength += 4;
0942: }
0943: } else {
0944: result = new RENode(REOP_LPAREN);
0945: /* LPAREN, <index>, ... RPAREN, <index> */
0946: state.progLength += 6;
0947: result.parenIndex = state.parenCount++;
0948: }
0949: ++state.parenNesting;
0950: if (!parseDisjunction(state))
0951: return false;
0952: if (state.cp == state.cpend || src[state.cp] != ')') {
0953: reportError("msg.unterm.paren", "");
0954: return false;
0955: }
0956: ++state.cp;
0957: --state.parenNesting;
0958: if (result != null) {
0959: result.kid = state.result;
0960: state.result = result;
0961: }
0962: break;
0963: }
0964: case ')':
0965: reportError("msg.re.unmatched.right.paren", "");
0966: return false;
0967: case '[':
0968: state.result = new RENode(REOP_CLASS);
0969: termStart = state.cp;
0970: state.result.startIndex = termStart;
0971: while (true) {
0972: if (state.cp == state.cpend) {
0973: reportError("msg.unterm.class", "");
0974: return false;
0975: }
0976: if (src[state.cp] == '\\')
0977: state.cp++;
0978: else {
0979: if (src[state.cp] == ']') {
0980: state.result.kidlen = state.cp - termStart;
0981: break;
0982: }
0983: }
0984: state.cp++;
0985: }
0986: state.result.index = state.classCount++;
0987: /*
0988: * Call calculateBitmapSize now as we want any errors it finds
0989: * to be reported during the parse phase, not at execution.
0990: */
0991: if (!calculateBitmapSize(state, state.result, src,
0992: termStart, state.cp++))
0993: return false;
0994: state.progLength += 3; /* CLASS, <index> */
0995: break;
0996:
0997: case '.':
0998: state.result = new RENode(REOP_DOT);
0999: state.progLength++;
1000: break;
1001: case '*':
1002: case '+':
1003: case '?':
1004: reportError("msg.bad.quant", String
1005: .valueOf(src[state.cp - 1]));
1006: return false;
1007: default:
1008: state.result = new RENode(REOP_FLAT);
1009: state.result.chr = c;
1010: state.result.length = 1;
1011: state.result.flatIndex = state.cp - 1;
1012: state.progLength += 3;
1013: break;
1014: }
1015:
1016: term = state.result;
1017: if (state.cp == state.cpend) {
1018: return true;
1019: }
1020: boolean hasQ = false;
1021: switch (src[state.cp]) {
1022: case '+':
1023: state.result = new RENode(REOP_QUANT);
1024: state.result.min = 1;
1025: state.result.max = -1;
1026: /* <PLUS>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
1027: state.progLength += 8;
1028: hasQ = true;
1029: break;
1030: case '*':
1031: state.result = new RENode(REOP_QUANT);
1032: state.result.min = 0;
1033: state.result.max = -1;
1034: /* <STAR>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
1035: state.progLength += 8;
1036: hasQ = true;
1037: break;
1038: case '?':
1039: state.result = new RENode(REOP_QUANT);
1040: state.result.min = 0;
1041: state.result.max = 1;
1042: /* <OPT>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
1043: state.progLength += 8;
1044: hasQ = true;
1045: break;
1046: case '{': /* balance '}' */
1047: {
1048: int min = 0;
1049: int max = -1;
1050: int leftCurl = state.cp;
1051:
1052: /* For Perl etc. compatibility, if quntifier does not match
1053: * \{\d+(,\d*)?\} exactly back off from it
1054: * being a quantifier, and chew it up as a literal
1055: * atom next time instead.
1056: */
1057:
1058: c = src[++state.cp];
1059: if (isDigit(c)) {
1060: ++state.cp;
1061: min = getDecimalValue(c, state, 0xFFFF,
1062: "msg.overlarge.min");
1063: c = src[state.cp];
1064: if (c == ',') {
1065: c = src[++state.cp];
1066: if (isDigit(c)) {
1067: ++state.cp;
1068: max = getDecimalValue(c, state, 0xFFFF,
1069: "msg.overlarge.max");
1070: c = src[state.cp];
1071: if (min > max) {
1072: reportError("msg.max.lt.min", String
1073: .valueOf(src[state.cp]));
1074: return false;
1075: }
1076: }
1077: } else {
1078: max = min;
1079: }
1080: /* balance '{' */
1081: if (c == '}') {
1082: state.result = new RENode(REOP_QUANT);
1083: state.result.min = min;
1084: state.result.max = max;
1085: // QUANT, <min>, <max>, <parencount>,
1086: // <parenindex>, <next> ... <ENDCHILD>
1087: state.progLength += 12;
1088: hasQ = true;
1089: }
1090: }
1091: if (!hasQ) {
1092: state.cp = leftCurl;
1093: }
1094: break;
1095: }
1096: }
1097: if (!hasQ)
1098: return true;
1099:
1100: ++state.cp;
1101: state.result.kid = term;
1102: state.result.parenIndex = parenBaseCount;
1103: state.result.parenCount = state.parenCount - parenBaseCount;
1104: if ((state.cp < state.cpend) && (src[state.cp] == '?')) {
1105: ++state.cp;
1106: state.result.greedy = false;
1107: } else
1108: state.result.greedy = true;
1109: return true;
1110: }
1111:
1112: private static void resolveForwardJump(byte[] array, int from,
1113: int pc) {
1114: if (from > pc)
1115: throw Kit.codeBug();
1116: addIndex(array, from, pc - from);
1117: }
1118:
1119: private static int getOffset(byte[] array, int pc) {
1120: return getIndex(array, pc);
1121: }
1122:
1123: private static int addIndex(byte[] array, int pc, int index) {
1124: if (index < 0)
1125: throw Kit.codeBug();
1126: if (index > 0xFFFF)
1127: throw Context.reportRuntimeError("Too complex regexp");
1128: array[pc] = (byte) (index >> 8);
1129: array[pc + 1] = (byte) (index);
1130: return pc + 2;
1131: }
1132:
1133: private static int getIndex(byte[] array, int pc) {
1134: return ((array[pc] & 0xFF) << 8) | (array[pc + 1] & 0xFF);
1135: }
1136:
1137: private static final int OFFSET_LEN = 2;
1138: private static final int INDEX_LEN = 2;
1139:
1140: private static int emitREBytecode(CompilerState state,
1141: RECompiled re, int pc, RENode t) {
1142: RENode nextAlt;
1143: int nextAltFixup, nextTermFixup;
1144: byte[] program = re.program;
1145:
1146: while (t != null) {
1147: program[pc++] = t.op;
1148: switch (t.op) {
1149: case REOP_EMPTY:
1150: --pc;
1151: break;
1152: case REOP_ALT:
1153: nextAlt = t.kid2;
1154: nextAltFixup = pc; /* address of next alternate */
1155: pc += OFFSET_LEN;
1156: pc = emitREBytecode(state, re, pc, t.kid);
1157: program[pc++] = REOP_JUMP;
1158: nextTermFixup = pc; /* address of following term */
1159: pc += OFFSET_LEN;
1160: resolveForwardJump(program, nextAltFixup, pc);
1161: pc = emitREBytecode(state, re, pc, nextAlt);
1162:
1163: program[pc++] = REOP_JUMP;
1164: nextAltFixup = pc;
1165: pc += OFFSET_LEN;
1166:
1167: resolveForwardJump(program, nextTermFixup, pc);
1168: resolveForwardJump(program, nextAltFixup, pc);
1169: break;
1170: case REOP_FLAT:
1171: /*
1172: * Consecutize FLAT's if possible.
1173: */
1174: if (t.flatIndex != -1) {
1175: while ((t.next != null)
1176: && (t.next.op == REOP_FLAT)
1177: && ((t.flatIndex + t.length) == t.next.flatIndex)) {
1178: t.length += t.next.length;
1179: t.next = t.next.next;
1180: }
1181: }
1182: if ((t.flatIndex != -1) && (t.length > 1)) {
1183: if ((state.flags & JSREG_FOLD) != 0)
1184: program[pc - 1] = REOP_FLATi;
1185: else
1186: program[pc - 1] = REOP_FLAT;
1187: pc = addIndex(program, pc, t.flatIndex);
1188: pc = addIndex(program, pc, t.length);
1189: } else {
1190: if (t.chr < 256) {
1191: if ((state.flags & JSREG_FOLD) != 0)
1192: program[pc - 1] = REOP_FLAT1i;
1193: else
1194: program[pc - 1] = REOP_FLAT1;
1195: program[pc++] = (byte) (t.chr);
1196: } else {
1197: if ((state.flags & JSREG_FOLD) != 0)
1198: program[pc - 1] = REOP_UCFLAT1i;
1199: else
1200: program[pc - 1] = REOP_UCFLAT1;
1201: pc = addIndex(program, pc, t.chr);
1202: }
1203: }
1204: break;
1205: case REOP_LPAREN:
1206: pc = addIndex(program, pc, t.parenIndex);
1207: pc = emitREBytecode(state, re, pc, t.kid);
1208: program[pc++] = REOP_RPAREN;
1209: pc = addIndex(program, pc, t.parenIndex);
1210: break;
1211: case REOP_BACKREF:
1212: pc = addIndex(program, pc, t.parenIndex);
1213: break;
1214: case REOP_ASSERT:
1215: nextTermFixup = pc;
1216: pc += OFFSET_LEN;
1217: pc = emitREBytecode(state, re, pc, t.kid);
1218: program[pc++] = REOP_ASSERTTEST;
1219: resolveForwardJump(program, nextTermFixup, pc);
1220: break;
1221: case REOP_ASSERT_NOT:
1222: nextTermFixup = pc;
1223: pc += OFFSET_LEN;
1224: pc = emitREBytecode(state, re, pc, t.kid);
1225: program[pc++] = REOP_ASSERTNOTTEST;
1226: resolveForwardJump(program, nextTermFixup, pc);
1227: break;
1228: case REOP_QUANT:
1229: if ((t.min == 0) && (t.max == -1))
1230: program[pc - 1] = (t.greedy) ? REOP_STAR
1231: : REOP_MINIMALSTAR;
1232: else if ((t.min == 0) && (t.max == 1))
1233: program[pc - 1] = (t.greedy) ? REOP_OPT
1234: : REOP_MINIMALOPT;
1235: else if ((t.min == 1) && (t.max == -1))
1236: program[pc - 1] = (t.greedy) ? REOP_PLUS
1237: : REOP_MINIMALPLUS;
1238: else {
1239: if (!t.greedy)
1240: program[pc - 1] = REOP_MINIMALQUANT;
1241: pc = addIndex(program, pc, t.min);
1242: // max can be -1 which addIndex does not accept
1243: pc = addIndex(program, pc, t.max + 1);
1244: }
1245: pc = addIndex(program, pc, t.parenCount);
1246: pc = addIndex(program, pc, t.parenIndex);
1247: nextTermFixup = pc;
1248: pc += OFFSET_LEN;
1249: pc = emitREBytecode(state, re, pc, t.kid);
1250: program[pc++] = REOP_ENDCHILD;
1251: resolveForwardJump(program, nextTermFixup, pc);
1252: break;
1253: case REOP_CLASS:
1254: pc = addIndex(program, pc, t.index);
1255: re.classList[t.index] = new RECharSet(t.bmsize,
1256: t.startIndex, t.kidlen);
1257: break;
1258: default:
1259: break;
1260: }
1261: t = t.next;
1262: }
1263: return pc;
1264: }
1265:
1266: private static void pushProgState(REGlobalData gData, int min,
1267: int max, REBackTrackData backTrackLastToSave,
1268: int continuation_pc, int continuation_op) {
1269: gData.stateStackTop = new REProgState(gData.stateStackTop, min,
1270: max, gData.cp, backTrackLastToSave, continuation_pc,
1271: continuation_op);
1272: }
1273:
1274: private static REProgState popProgState(REGlobalData gData) {
1275: REProgState state = gData.stateStackTop;
1276: gData.stateStackTop = state.previous;
1277: return state;
1278: }
1279:
1280: private static void pushBackTrackState(REGlobalData gData, byte op,
1281: int target) {
1282: gData.backTrackStackTop = new REBackTrackData(gData, op, target);
1283: }
1284:
1285: /*
1286: * Consecutive literal characters.
1287: */
1288: private static boolean flatNMatcher(REGlobalData gData,
1289: int matchChars, int length, char[] chars, int end) {
1290: if ((gData.cp + length) > end)
1291: return false;
1292: for (int i = 0; i < length; i++) {
1293: if (gData.regexp.source[matchChars + i] != chars[gData.cp
1294: + i]) {
1295: return false;
1296: }
1297: }
1298: gData.cp += length;
1299: return true;
1300: }
1301:
1302: private static boolean flatNIMatcher(REGlobalData gData,
1303: int matchChars, int length, char[] chars, int end) {
1304: if ((gData.cp + length) > end)
1305: return false;
1306: for (int i = 0; i < length; i++) {
1307: if (upcase(gData.regexp.source[matchChars + i]) != upcase(chars[gData.cp
1308: + i])) {
1309: return false;
1310: }
1311: }
1312: gData.cp += length;
1313: return true;
1314: }
1315:
1316: /*
1317: 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1318: 2. If E is not a character then go to step 6.
1319: 3. Let ch be E's character.
1320: 4. Let A be a one-element RECharSet containing the character ch.
1321: 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
1322: 6. E must be an integer. Let n be that integer.
1323: 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
1324: 8. Return an internal Matcher closure that takes two arguments, a State x
1325: and a Continuation c, and performs the following:
1326: 1. Let cap be x's captures internal array.
1327: 2. Let s be cap[n].
1328: 3. If s is undefined, then call c(x) and return its result.
1329: 4. Let e be x's endIndex.
1330: 5. Let len be s's length.
1331: 6. Let f be e+len.
1332: 7. If f>InputLength, return failure.
1333: 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
1334: such that Canonicalize(s[i]) is not the same character as
1335: Canonicalize(Input [e+i]), then return failure.
1336: 9. Let y be the State (f, cap).
1337: 10. Call c(y) and return its result.
1338: */
1339: private static boolean backrefMatcher(REGlobalData gData,
1340: int parenIndex, char[] chars, int end) {
1341: int len;
1342: int i;
1343: int parenContent = gData.parens_index(parenIndex);
1344: if (parenContent == -1)
1345: return true;
1346:
1347: len = gData.parens_length(parenIndex);
1348: if ((gData.cp + len) > end)
1349: return false;
1350:
1351: if ((gData.regexp.flags & JSREG_FOLD) != 0) {
1352: for (i = 0; i < len; i++) {
1353: if (upcase(chars[parenContent + i]) != upcase(chars[gData.cp
1354: + i]))
1355: return false;
1356: }
1357: } else {
1358: for (i = 0; i < len; i++) {
1359: if (chars[parenContent + i] != chars[gData.cp + i])
1360: return false;
1361: }
1362: }
1363: gData.cp += len;
1364: return true;
1365: }
1366:
1367: /* Add a single character to the RECharSet */
1368: private static void addCharacterToCharSet(RECharSet cs, char c) {
1369: int byteIndex = (c / 8);
1370: if (c > cs.length)
1371: throw new RuntimeException();
1372: cs.bits[byteIndex] |= 1 << (c & 0x7);
1373: }
1374:
1375: /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
1376: private static void addCharacterRangeToCharSet(RECharSet cs,
1377: char c1, char c2) {
1378: int i;
1379:
1380: int byteIndex1 = (c1 / 8);
1381: int byteIndex2 = (c2 / 8);
1382:
1383: if ((c2 > cs.length) || (c1 > c2))
1384: throw new RuntimeException();
1385:
1386: c1 &= 0x7;
1387: c2 &= 0x7;
1388:
1389: if (byteIndex1 == byteIndex2) {
1390: cs.bits[byteIndex1] |= ((0xFF) >> (7 - (c2 - c1))) << c1;
1391: } else {
1392: cs.bits[byteIndex1] |= 0xFF << c1;
1393: for (i = byteIndex1 + 1; i < byteIndex2; i++)
1394: cs.bits[i] = (byte) 0xFF;
1395: cs.bits[byteIndex2] |= (0xFF) >> (7 - c2);
1396: }
1397: }
1398:
1399: /* Compile the source of the class into a RECharSet */
1400: private static void processCharSet(REGlobalData gData,
1401: RECharSet charSet) {
1402: synchronized (charSet) {
1403: if (!charSet.converted) {
1404: processCharSetImpl(gData, charSet);
1405: charSet.converted = true;
1406: }
1407: }
1408: }
1409:
1410: private static void processCharSetImpl(REGlobalData gData,
1411: RECharSet charSet) {
1412: int src = charSet.startIndex;
1413: int end = src + charSet.strlength;
1414:
1415: char rangeStart = 0, this Ch;
1416: int byteLength;
1417: char c;
1418: int n;
1419: int nDigits;
1420: int i;
1421: boolean inRange = false;
1422:
1423: charSet.sense = true;
1424: byteLength = (charSet.length / 8) + 1;
1425: charSet.bits = new byte[byteLength];
1426:
1427: if (src == end)
1428: return;
1429:
1430: if (gData.regexp.source[src] == '^') {
1431: charSet.sense = false;
1432: ++src;
1433: }
1434:
1435: while (src != end) {
1436: nDigits = 2;
1437: switch (gData.regexp.source[src]) {
1438: case '\\':
1439: ++src;
1440: c = gData.regexp.source[src++];
1441: switch (c) {
1442: case 'b':
1443: this Ch = 0x8;
1444: break;
1445: case 'f':
1446: this Ch = 0xC;
1447: break;
1448: case 'n':
1449: this Ch = 0xA;
1450: break;
1451: case 'r':
1452: this Ch = 0xD;
1453: break;
1454: case 't':
1455: this Ch = 0x9;
1456: break;
1457: case 'v':
1458: this Ch = 0xB;
1459: break;
1460: case 'c':
1461: if (((src + 1) < end)
1462: && isWord(gData.regexp.source[src + 1]))
1463: this Ch = (char) (gData.regexp.source[src++] & 0x1F);
1464: else {
1465: --src;
1466: this Ch = '\\';
1467: }
1468: break;
1469: case 'u':
1470: nDigits += 2;
1471: // fall thru
1472: case 'x':
1473: n = 0;
1474: for (i = 0; (i < nDigits) && (src < end); i++) {
1475: c = gData.regexp.source[src++];
1476: int digit = toASCIIHexDigit(c);
1477: if (digit < 0) {
1478: /* back off to accepting the original '\'
1479: * as a literal
1480: */
1481: src -= (i + 1);
1482: n = '\\';
1483: break;
1484: }
1485: n = (n << 4) | digit;
1486: }
1487: this Ch = (char) (n);
1488: break;
1489: case '0':
1490: case '1':
1491: case '2':
1492: case '3':
1493: case '4':
1494: case '5':
1495: case '6':
1496: case '7':
1497: /*
1498: * This is a non-ECMA extension - decimal escapes (in this
1499: * case, octal!) are supposed to be an error inside class
1500: * ranges, but supported here for backwards compatibility.
1501: *
1502: */
1503: n = (c - '0');
1504: c = gData.regexp.source[src];
1505: if ('0' <= c && c <= '7') {
1506: src++;
1507: n = 8 * n + (c - '0');
1508: c = gData.regexp.source[src];
1509: if ('0' <= c && c <= '7') {
1510: src++;
1511: i = 8 * n + (c - '0');
1512: if (i <= 0377)
1513: n = i;
1514: else
1515: src--;
1516: }
1517: }
1518: this Ch = (char) (n);
1519: break;
1520:
1521: case 'd':
1522: addCharacterRangeToCharSet(charSet, '0', '9');
1523: continue; /* don't need range processing */
1524: case 'D':
1525: addCharacterRangeToCharSet(charSet, (char) 0,
1526: (char) ('0' - 1));
1527: addCharacterRangeToCharSet(charSet,
1528: (char) ('9' + 1), (char) (charSet.length));
1529: continue;
1530: case 's':
1531: for (i = charSet.length; i >= 0; i--)
1532: if (isREWhiteSpace(i))
1533: addCharacterToCharSet(charSet, (char) (i));
1534: continue;
1535: case 'S':
1536: for (i = charSet.length; i >= 0; i--)
1537: if (!isREWhiteSpace(i))
1538: addCharacterToCharSet(charSet, (char) (i));
1539: continue;
1540: case 'w':
1541: for (i = charSet.length; i >= 0; i--)
1542: if (isWord((char) i))
1543: addCharacterToCharSet(charSet, (char) (i));
1544: continue;
1545: case 'W':
1546: for (i = charSet.length; i >= 0; i--)
1547: if (!isWord((char) i))
1548: addCharacterToCharSet(charSet, (char) (i));
1549: continue;
1550: default:
1551: this Ch = c;
1552: break;
1553:
1554: }
1555: break;
1556:
1557: default:
1558: this Ch = gData.regexp.source[src++];
1559: break;
1560:
1561: }
1562: if (inRange) {
1563: if ((gData.regexp.flags & JSREG_FOLD) != 0) {
1564: addCharacterRangeToCharSet(charSet,
1565: upcase(rangeStart), upcase(this Ch));
1566: addCharacterRangeToCharSet(charSet,
1567: downcase(rangeStart), downcase(this Ch));
1568: } else {
1569: addCharacterRangeToCharSet(charSet, rangeStart,
1570: this Ch);
1571: }
1572: inRange = false;
1573: } else {
1574: if ((gData.regexp.flags & JSREG_FOLD) != 0) {
1575: addCharacterToCharSet(charSet, upcase(this Ch));
1576: addCharacterToCharSet(charSet, downcase(this Ch));
1577: } else {
1578: addCharacterToCharSet(charSet, this Ch);
1579: }
1580: if (src < (end - 1)) {
1581: if (gData.regexp.source[src] == '-') {
1582: ++src;
1583: inRange = true;
1584: rangeStart = this Ch;
1585: }
1586: }
1587: }
1588: }
1589: }
1590:
1591: /*
1592: * Initialize the character set if it this is the first call.
1593: * Test the bit - if the ^ flag was specified, non-inclusion is a success
1594: */
1595: private static boolean classMatcher(REGlobalData gData,
1596: RECharSet charSet, char ch) {
1597: if (!charSet.converted) {
1598: processCharSet(gData, charSet);
1599: }
1600:
1601: int byteIndex = ch / 8;
1602: if (charSet.sense) {
1603: if ((charSet.length == 0)
1604: || ((ch > charSet.length) || ((charSet.bits[byteIndex] & (1 << (ch & 0x7))) == 0)))
1605: return false;
1606: } else {
1607: if (!((charSet.length == 0) || ((ch > charSet.length) || ((charSet.bits[byteIndex] & (1 << (ch & 0x7))) == 0))))
1608: return false;
1609: }
1610: return true;
1611: }
1612:
1613: private static boolean executeREBytecode(REGlobalData gData,
1614: char[] chars, int end) {
1615: int pc = 0;
1616: byte program[] = gData.regexp.program;
1617: int currentContinuation_op;
1618: int currentContinuation_pc;
1619: boolean result = false;
1620:
1621: currentContinuation_pc = 0;
1622: currentContinuation_op = REOP_END;
1623: if (debug) {
1624: System.out.println("Input = \"" + new String(chars)
1625: + "\", start at " + gData.cp);
1626: }
1627: int op = program[pc++];
1628: for (;;) {
1629: if (debug) {
1630: System.out.println("Testing at " + gData.cp + ", op = "
1631: + op);
1632: }
1633: switch (op) {
1634: case REOP_EMPTY:
1635: result = true;
1636: break;
1637: case REOP_BOL:
1638: if (gData.cp != 0) {
1639: if (gData.multiline
1640: || ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
1641: if (!isLineTerm(chars[gData.cp - 1])) {
1642: result = false;
1643: break;
1644: }
1645: } else {
1646: result = false;
1647: break;
1648: }
1649: }
1650: result = true;
1651: break;
1652: case REOP_EOL:
1653: if (gData.cp != end) {
1654: if (gData.multiline
1655: || ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
1656: if (!isLineTerm(chars[gData.cp])) {
1657: result = false;
1658: break;
1659: }
1660: } else {
1661: result = false;
1662: break;
1663: }
1664: }
1665: result = true;
1666: break;
1667: case REOP_WBDRY:
1668: result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1])) ^ !((gData.cp < end) && isWord(chars[gData.cp])));
1669: break;
1670: case REOP_WNONBDRY:
1671: result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1])) ^ ((gData.cp < end) && isWord(chars[gData.cp])));
1672: break;
1673: case REOP_DOT:
1674: result = (gData.cp != end && !isLineTerm(chars[gData.cp]));
1675: if (result) {
1676: gData.cp++;
1677: }
1678: break;
1679: case REOP_DIGIT:
1680: result = (gData.cp != end && isDigit(chars[gData.cp]));
1681: if (result) {
1682: gData.cp++;
1683: }
1684: break;
1685: case REOP_NONDIGIT:
1686: result = (gData.cp != end && !isDigit(chars[gData.cp]));
1687: if (result) {
1688: gData.cp++;
1689: }
1690: break;
1691: case REOP_SPACE:
1692: result = (gData.cp != end && isREWhiteSpace(chars[gData.cp]));
1693: if (result) {
1694: gData.cp++;
1695: }
1696: break;
1697: case REOP_NONSPACE:
1698: result = (gData.cp != end && !isREWhiteSpace(chars[gData.cp]));
1699: if (result) {
1700: gData.cp++;
1701: }
1702: break;
1703: case REOP_ALNUM:
1704: result = (gData.cp != end && isWord(chars[gData.cp]));
1705: if (result) {
1706: gData.cp++;
1707: }
1708: break;
1709: case REOP_NONALNUM:
1710: result = (gData.cp != end && !isWord(chars[gData.cp]));
1711: if (result) {
1712: gData.cp++;
1713: }
1714: break;
1715: case REOP_FLAT: {
1716: int offset = getIndex(program, pc);
1717: pc += INDEX_LEN;
1718: int length = getIndex(program, pc);
1719: pc += INDEX_LEN;
1720: result = flatNMatcher(gData, offset, length, chars, end);
1721: }
1722: break;
1723: case REOP_FLATi: {
1724: int offset = getIndex(program, pc);
1725: pc += INDEX_LEN;
1726: int length = getIndex(program, pc);
1727: pc += INDEX_LEN;
1728: result = flatNIMatcher(gData, offset, length, chars,
1729: end);
1730: }
1731: break;
1732: case REOP_FLAT1: {
1733: char matchCh = (char) (program[pc++] & 0xFF);
1734: result = (gData.cp != end && chars[gData.cp] == matchCh);
1735: if (result) {
1736: gData.cp++;
1737: }
1738: }
1739: break;
1740: case REOP_FLAT1i: {
1741: char matchCh = (char) (program[pc++] & 0xFF);
1742: result = (gData.cp != end && upcase(chars[gData.cp]) == upcase(matchCh));
1743: if (result) {
1744: gData.cp++;
1745: }
1746: }
1747: break;
1748: case REOP_UCFLAT1: {
1749: char matchCh = (char) getIndex(program, pc);
1750: pc += INDEX_LEN;
1751: result = (gData.cp != end && chars[gData.cp] == matchCh);
1752: if (result) {
1753: gData.cp++;
1754: }
1755: }
1756: break;
1757: case REOP_UCFLAT1i: {
1758: char matchCh = (char) getIndex(program, pc);
1759: pc += INDEX_LEN;
1760: result = (gData.cp != end && upcase(chars[gData.cp]) == upcase(matchCh));
1761: if (result) {
1762: gData.cp++;
1763: }
1764: }
1765: break;
1766: case REOP_ALT: {
1767: int nextpc;
1768: byte nextop;
1769: pushProgState(gData, 0, 0, null,
1770: currentContinuation_pc, currentContinuation_op);
1771: nextpc = pc + getOffset(program, pc);
1772: nextop = program[nextpc++];
1773: pushBackTrackState(gData, nextop, nextpc);
1774: pc += INDEX_LEN;
1775: op = program[pc++];
1776: }
1777: continue;
1778:
1779: case REOP_JUMP: {
1780: int offset;
1781: REProgState state = popProgState(gData);
1782: currentContinuation_pc = state.continuation_pc;
1783: currentContinuation_op = state.continuation_op;
1784: offset = getOffset(program, pc);
1785: pc += offset;
1786: op = program[pc++];
1787: }
1788: continue;
1789:
1790: case REOP_LPAREN: {
1791: int parenIndex = getIndex(program, pc);
1792: pc += INDEX_LEN;
1793: gData.set_parens(parenIndex, gData.cp, 0);
1794: op = program[pc++];
1795: }
1796: continue;
1797: case REOP_RPAREN: {
1798: int cap_index;
1799: int parenIndex = getIndex(program, pc);
1800: pc += INDEX_LEN;
1801: cap_index = gData.parens_index(parenIndex);
1802: gData.set_parens(parenIndex, cap_index, gData.cp
1803: - cap_index);
1804: if (parenIndex > gData.lastParen)
1805: gData.lastParen = parenIndex;
1806: op = program[pc++];
1807: }
1808: continue;
1809: case REOP_BACKREF: {
1810: int parenIndex = getIndex(program, pc);
1811: pc += INDEX_LEN;
1812: result = backrefMatcher(gData, parenIndex, chars, end);
1813: }
1814: break;
1815:
1816: case REOP_CLASS: {
1817: int index = getIndex(program, pc);
1818: pc += INDEX_LEN;
1819: if (gData.cp != end) {
1820: if (classMatcher(gData,
1821: gData.regexp.classList[index],
1822: chars[gData.cp])) {
1823: gData.cp++;
1824: result = true;
1825: break;
1826: }
1827: }
1828: result = false;
1829: }
1830: break;
1831:
1832: case REOP_ASSERT:
1833: case REOP_ASSERT_NOT: {
1834: byte testOp;
1835: pushProgState(gData, 0, 0, gData.backTrackStackTop,
1836: currentContinuation_pc, currentContinuation_op);
1837: if (op == REOP_ASSERT) {
1838: testOp = REOP_ASSERTTEST;
1839: } else {
1840: testOp = REOP_ASSERTNOTTEST;
1841: }
1842: pushBackTrackState(gData, testOp, pc
1843: + getOffset(program, pc));
1844: pc += INDEX_LEN;
1845: op = program[pc++];
1846: }
1847: continue;
1848:
1849: case REOP_ASSERTTEST:
1850: case REOP_ASSERTNOTTEST: {
1851: REProgState state = popProgState(gData);
1852: gData.cp = state.index;
1853: gData.backTrackStackTop = state.backTrack;
1854: currentContinuation_pc = state.continuation_pc;
1855: currentContinuation_op = state.continuation_op;
1856: if (result) {
1857: if (op == REOP_ASSERTTEST) {
1858: result = true;
1859: } else {
1860: result = false;
1861: }
1862: } else {
1863: if (op == REOP_ASSERTTEST) {
1864: // Do nothing
1865: } else {
1866: result = true;
1867: }
1868: }
1869: }
1870: break;
1871:
1872: case REOP_STAR:
1873: case REOP_PLUS:
1874: case REOP_OPT:
1875: case REOP_QUANT:
1876: case REOP_MINIMALSTAR:
1877: case REOP_MINIMALPLUS:
1878: case REOP_MINIMALOPT:
1879: case REOP_MINIMALQUANT: {
1880: int min, max;
1881: boolean greedy = false;
1882: switch (op) {
1883: case REOP_STAR:
1884: greedy = true;
1885: // fallthrough
1886: case REOP_MINIMALSTAR:
1887: min = 0;
1888: max = -1;
1889: break;
1890: case REOP_PLUS:
1891: greedy = true;
1892: // fallthrough
1893: case REOP_MINIMALPLUS:
1894: min = 1;
1895: max = -1;
1896: break;
1897: case REOP_OPT:
1898: greedy = true;
1899: // fallthrough
1900: case REOP_MINIMALOPT:
1901: min = 0;
1902: max = 1;
1903: break;
1904: case REOP_QUANT:
1905: greedy = true;
1906: // fallthrough
1907: case REOP_MINIMALQUANT:
1908: min = getOffset(program, pc);
1909: pc += INDEX_LEN;
1910: // See comments in emitREBytecode for " - 1" reason
1911: max = getOffset(program, pc) - 1;
1912: pc += INDEX_LEN;
1913: break;
1914: default:
1915: throw Kit.codeBug();
1916: }
1917: pushProgState(gData, min, max, null,
1918: currentContinuation_pc, currentContinuation_op);
1919: if (greedy) {
1920: currentContinuation_op = REOP_REPEAT;
1921: currentContinuation_pc = pc;
1922: pushBackTrackState(gData, REOP_REPEAT, pc);
1923: /* Step over <parencount>, <parenindex> & <next> */
1924: pc += 3 * INDEX_LEN;
1925: op = program[pc++];
1926: } else {
1927: if (min != 0) {
1928: currentContinuation_op = REOP_MINIMALREPEAT;
1929: currentContinuation_pc = pc;
1930: /* <parencount> <parenindex> & <next> */
1931: pc += 3 * INDEX_LEN;
1932: op = program[pc++];
1933: } else {
1934: pushBackTrackState(gData, REOP_MINIMALREPEAT,
1935: pc);
1936: popProgState(gData);
1937: pc += 2 * INDEX_LEN; // <parencount> & <parenindex>
1938: pc = pc + getOffset(program, pc);
1939: op = program[pc++];
1940: }
1941: }
1942: }
1943: continue;
1944:
1945: case REOP_ENDCHILD:
1946: // Use the current continuation.
1947: pc = currentContinuation_pc;
1948: op = currentContinuation_op;
1949: continue;
1950:
1951: case REOP_REPEAT: {
1952: REProgState state = popProgState(gData);
1953: if (!result) {
1954: //
1955: // There's been a failure, see if we have enough
1956: // children.
1957: //
1958: if (state.min == 0)
1959: result = true;
1960: currentContinuation_pc = state.continuation_pc;
1961: currentContinuation_op = state.continuation_op;
1962: pc += 2 * INDEX_LEN; /* <parencount> & <parenindex> */
1963: pc = pc + getOffset(program, pc);
1964: break;
1965: } else {
1966: if (state.min == 0 && gData.cp == state.index) {
1967: // matched an empty string, that'll get us nowhere
1968: result = false;
1969: currentContinuation_pc = state.continuation_pc;
1970: currentContinuation_op = state.continuation_op;
1971: pc += 2 * INDEX_LEN;
1972: pc = pc + getOffset(program, pc);
1973: break;
1974: }
1975: int new_min = state.min, new_max = state.max;
1976: if (new_min != 0)
1977: new_min--;
1978: if (new_max != -1)
1979: new_max--;
1980: if (new_max == 0) {
1981: result = true;
1982: currentContinuation_pc = state.continuation_pc;
1983: currentContinuation_op = state.continuation_op;
1984: pc += 2 * INDEX_LEN;
1985: pc = pc + getOffset(program, pc);
1986: break;
1987: }
1988: pushProgState(gData, new_min, new_max, null,
1989: state.continuation_pc,
1990: state.continuation_op);
1991: currentContinuation_op = REOP_REPEAT;
1992: currentContinuation_pc = pc;
1993: pushBackTrackState(gData, REOP_REPEAT, pc);
1994: int parenCount = getIndex(program, pc);
1995: pc += INDEX_LEN;
1996: int parenIndex = getIndex(program, pc);
1997: pc += 2 * INDEX_LEN;
1998: op = program[pc++];
1999: for (int k = 0; k < parenCount; k++) {
2000: gData.set_parens(parenIndex + k, -1, 0);
2001: }
2002: }
2003: }
2004: continue;
2005:
2006: case REOP_MINIMALREPEAT: {
2007: REProgState state = popProgState(gData);
2008: if (!result) {
2009: //
2010: // Non-greedy failure - try to consume another child.
2011: //
2012: if (state.max == -1 || state.max > 0) {
2013: pushProgState(gData, state.min, state.max,
2014: null, state.continuation_pc,
2015: state.continuation_op);
2016: currentContinuation_op = REOP_MINIMALREPEAT;
2017: currentContinuation_pc = pc;
2018: int parenCount = getIndex(program, pc);
2019: pc += INDEX_LEN;
2020: int parenIndex = getIndex(program, pc);
2021: pc += 2 * INDEX_LEN;
2022: for (int k = 0; k < parenCount; k++) {
2023: gData.set_parens(parenIndex + k, -1, 0);
2024: }
2025: op = program[pc++];
2026: continue;
2027: } else {
2028: // Don't need to adjust pc since we're going to pop.
2029: currentContinuation_pc = state.continuation_pc;
2030: currentContinuation_op = state.continuation_op;
2031: break;
2032: }
2033: } else {
2034: if (state.min == 0 && gData.cp == state.index) {
2035: // Matched an empty string, that'll get us nowhere.
2036: result = false;
2037: currentContinuation_pc = state.continuation_pc;
2038: currentContinuation_op = state.continuation_op;
2039: break;
2040: }
2041: int new_min = state.min, new_max = state.max;
2042: if (new_min != 0)
2043: new_min--;
2044: if (new_max != -1)
2045: new_max--;
2046: pushProgState(gData, new_min, new_max, null,
2047: state.continuation_pc,
2048: state.continuation_op);
2049: if (new_min != 0) {
2050: currentContinuation_op = REOP_MINIMALREPEAT;
2051: currentContinuation_pc = pc;
2052: int parenCount = getIndex(program, pc);
2053: pc += INDEX_LEN;
2054: int parenIndex = getIndex(program, pc);
2055: pc += 2 * INDEX_LEN;
2056: for (int k = 0; k < parenCount; k++) {
2057: gData.set_parens(parenIndex + k, -1, 0);
2058: }
2059: op = program[pc++];
2060: } else {
2061: currentContinuation_pc = state.continuation_pc;
2062: currentContinuation_op = state.continuation_op;
2063: pushBackTrackState(gData, REOP_MINIMALREPEAT,
2064: pc);
2065: popProgState(gData);
2066: pc += 2 * INDEX_LEN;
2067: pc = pc + getOffset(program, pc);
2068: op = program[pc++];
2069: }
2070: continue;
2071: }
2072: }
2073:
2074: case REOP_END:
2075: return true;
2076:
2077: default:
2078: throw Kit.codeBug();
2079:
2080: }
2081: /*
2082: * If the match failed and there's a backtrack option, take it.
2083: * Otherwise this is a complete and utter failure.
2084: */
2085: if (!result) {
2086: REBackTrackData backTrackData = gData.backTrackStackTop;
2087: if (backTrackData != null) {
2088: gData.backTrackStackTop = backTrackData.previous;
2089:
2090: gData.lastParen = backTrackData.lastParen;
2091:
2092: // XXX: If backTrackData will no longer be used, then
2093: // there is no need to clone backTrackData.parens
2094: if (backTrackData.parens != null) {
2095: gData.parens = backTrackData.parens.clone();
2096: }
2097:
2098: gData.cp = backTrackData.cp;
2099:
2100: gData.stateStackTop = backTrackData.stateStackTop;
2101:
2102: currentContinuation_op = gData.stateStackTop.continuation_op;
2103: currentContinuation_pc = gData.stateStackTop.continuation_pc;
2104: pc = backTrackData.continuation_pc;
2105: op = backTrackData.continuation_op;
2106: continue;
2107: } else
2108: return false;
2109: }
2110:
2111: op = program[pc++];
2112: }
2113:
2114: }
2115:
2116: private static boolean matchRegExp(REGlobalData gData,
2117: RECompiled re, char[] chars, int start, int end,
2118: boolean multiline) {
2119: if (re.parenCount != 0) {
2120: gData.parens = new long[re.parenCount];
2121: } else {
2122: gData.parens = null;
2123: }
2124:
2125: gData.backTrackStackTop = null;
2126:
2127: gData.stateStackTop = null;
2128:
2129: gData.multiline = multiline;
2130: gData.regexp = re;
2131: gData.lastParen = 0;
2132:
2133: int anchorCh = gData.regexp.anchorCh;
2134: //
2135: // have to include the position beyond the last character
2136: // in order to detect end-of-input/line condition
2137: //
2138: for (int i = start; i <= end; ++i) {
2139: //
2140: // If the first node is a literal match, step the index into
2141: // the string until that match is made, or fail if it can't be
2142: // found at all.
2143: //
2144: if (anchorCh >= 0) {
2145: for (;;) {
2146: if (i == end) {
2147: return false;
2148: }
2149: char matchCh = chars[i];
2150: if (matchCh == anchorCh
2151: || ((gData.regexp.flags & JSREG_FOLD) != 0 && upcase(matchCh) == upcase((char) anchorCh))) {
2152: break;
2153: }
2154: ++i;
2155: }
2156: }
2157: gData.cp = i;
2158: for (int j = 0; j < re.parenCount; j++) {
2159: gData.set_parens(j, -1, 0);
2160: }
2161: boolean result = executeREBytecode(gData, chars, end);
2162:
2163: gData.backTrackStackTop = null;
2164: gData.stateStackTop = null;
2165: if (result) {
2166: gData.skipped = i - start;
2167: return true;
2168: }
2169: }
2170: return false;
2171: }
2172:
2173: /*
2174: * indexp is assumed to be an array of length 1
2175: */
2176: Object executeRegExp(Context cx, Scriptable scopeObj,
2177: RegExpImpl res, String str, int indexp[], int matchType) {
2178: REGlobalData gData = new REGlobalData();
2179:
2180: int start = indexp[0];
2181: char[] charArray = str.toCharArray();
2182: int end = charArray.length;
2183: if (start > end)
2184: start = end;
2185: //
2186: // Call the recursive matcher to do the real work.
2187: //
2188: boolean matches = matchRegExp(gData, re, charArray, start, end,
2189: res.multiline);
2190: if (!matches) {
2191: if (matchType != PREFIX)
2192: return null;
2193: return Undefined.instance;
2194: }
2195: int index = gData.cp;
2196: int i = index;
2197: indexp[0] = i;
2198: int matchlen = i - (start + gData.skipped);
2199: int ep = index;
2200: index -= matchlen;
2201: Object result;
2202: Scriptable obj;
2203:
2204: if (matchType == TEST) {
2205: /*
2206: * Testing for a match and updating cx.regExpImpl: don't allocate
2207: * an array object, do return true.
2208: */
2209: result = Boolean.TRUE;
2210: obj = null;
2211: } else {
2212: /*
2213: * The array returned on match has element 0 bound to the matched
2214: * string, elements 1 through re.parenCount bound to the paren
2215: * matches, an index property telling the length of the left context,
2216: * and an input property referring to the input string.
2217: */
2218: Scriptable scope = getTopLevelScope(scopeObj);
2219: result = ScriptRuntime.newObject(cx, scope, "Array", null);
2220: obj = (Scriptable) result;
2221:
2222: String matchstr = new String(charArray, index, matchlen);
2223: obj.put(0, obj, matchstr);
2224: }
2225:
2226: if (re.parenCount == 0) {
2227: res.parens = null;
2228: res.lastParen = SubString.emptySubString;
2229: } else {
2230: SubString parsub = null;
2231: int num;
2232: res.parens = new SubString[re.parenCount];
2233: for (num = 0; num < re.parenCount; num++) {
2234: int cap_index = gData.parens_index(num);
2235: String parstr;
2236: if (cap_index != -1) {
2237: int cap_length = gData.parens_length(num);
2238: parsub = new SubString(charArray, cap_index,
2239: cap_length);
2240: res.parens[num] = parsub;
2241: if (matchType == TEST)
2242: continue;
2243: parstr = parsub.toString();
2244: obj.put(num + 1, obj, parstr);
2245: } else {
2246: if (matchType != TEST)
2247: obj.put(num + 1, obj, Undefined.instance);
2248: }
2249: }
2250: res.lastParen = parsub;
2251: }
2252:
2253: if (!(matchType == TEST)) {
2254: /*
2255: * Define the index and input properties last for better for/in loop
2256: * order (so they come after the elements).
2257: */
2258: obj.put("index", obj, new Integer(start + gData.skipped));
2259: obj.put("input", obj, str);
2260: }
2261:
2262: if (res.lastMatch == null) {
2263: res.lastMatch = new SubString();
2264: res.leftContext = new SubString();
2265: res.rightContext = new SubString();
2266: }
2267: res.lastMatch.charArray = charArray;
2268: res.lastMatch.index = index;
2269: res.lastMatch.length = matchlen;
2270:
2271: res.leftContext.charArray = charArray;
2272: if (cx.getLanguageVersion() == Context.VERSION_1_2) {
2273: /*
2274: * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
2275: * in scalar contexts, and unintentionally for the string.match "list"
2276: * psuedo-context. On "hi there bye", the following would result:
2277: *
2278: * Language while(/ /g){print("$`");} s/ /$`/g
2279: * perl4.036 "hi", "there" "hihitherehi therebye"
2280: * perl5 "hi", "hi there" "hihitherehi therebye"
2281: * js1.2 "hi", "there" "hihitheretherebye"
2282: *
2283: * Insofar as JS1.2 always defined $` as "left context from the last
2284: * match" for global regexps, it was more consistent than perl4.
2285: */
2286: res.leftContext.index = start;
2287: res.leftContext.length = gData.skipped;
2288: } else {
2289: /*
2290: * For JS1.3 and ECMAv2, emulate Perl5 exactly:
2291: *
2292: * js1.3 "hi", "hi there" "hihitherehi therebye"
2293: */
2294: res.leftContext.index = 0;
2295: res.leftContext.length = start + gData.skipped;
2296: }
2297:
2298: res.rightContext.charArray = charArray;
2299: res.rightContext.index = ep;
2300: res.rightContext.length = end - ep;
2301:
2302: return result;
2303: }
2304:
2305: int getFlags() {
2306: return re.flags;
2307: }
2308:
2309: private static void reportWarning(Context cx, String messageId,
2310: String arg) {
2311: if (cx.hasFeature(Context.FEATURE_STRICT_MODE)) {
2312: String msg = ScriptRuntime.getMessage1(messageId, arg);
2313: Context.reportWarning(msg);
2314: }
2315: }
2316:
2317: private static void reportError(String messageId, String arg) {
2318: String msg = ScriptRuntime.getMessage1(messageId, arg);
2319: throw ScriptRuntime.constructError("SyntaxError", msg);
2320: }
2321:
2322: // #string_id_map#
2323:
2324: private static final int Id_lastIndex = 1, Id_source = 2,
2325: Id_global = 3, Id_ignoreCase = 4, Id_multiline = 5,
2326:
2327: MAX_INSTANCE_ID = 5;
2328:
2329: protected int getMaxInstanceId() {
2330: return MAX_INSTANCE_ID;
2331: }
2332:
2333: protected int findInstanceIdInfo(String s) {
2334: int id;
2335: // #generated# Last update: 2007-05-09 08:16:24 EDT
2336: L0: {
2337: id = 0;
2338: String X = null;
2339: int c;
2340: int s_length = s.length();
2341: if (s_length == 6) {
2342: c = s.charAt(0);
2343: if (c == 'g') {
2344: X = "global";
2345: id = Id_global;
2346: } else if (c == 's') {
2347: X = "source";
2348: id = Id_source;
2349: }
2350: } else if (s_length == 9) {
2351: c = s.charAt(0);
2352: if (c == 'l') {
2353: X = "lastIndex";
2354: id = Id_lastIndex;
2355: } else if (c == 'm') {
2356: X = "multiline";
2357: id = Id_multiline;
2358: }
2359: } else if (s_length == 10) {
2360: X = "ignoreCase";
2361: id = Id_ignoreCase;
2362: }
2363: if (X != null && X != s && !X.equals(s))
2364: id = 0;
2365: break L0;
2366: }
2367: // #/generated#
2368: // #/string_id_map#
2369:
2370: if (id == 0)
2371: return super .findInstanceIdInfo(s);
2372:
2373: int attr;
2374: switch (id) {
2375: case Id_lastIndex:
2376: attr = PERMANENT | DONTENUM;
2377: break;
2378: case Id_source:
2379: case Id_global:
2380: case Id_ignoreCase:
2381: case Id_multiline:
2382: attr = PERMANENT | READONLY | DONTENUM;
2383: break;
2384: default:
2385: throw new IllegalStateException();
2386: }
2387: return instanceIdInfo(attr, id);
2388: }
2389:
2390: protected String getInstanceIdName(int id) {
2391: switch (id) {
2392: case Id_lastIndex:
2393: return "lastIndex";
2394: case Id_source:
2395: return "source";
2396: case Id_global:
2397: return "global";
2398: case Id_ignoreCase:
2399: return "ignoreCase";
2400: case Id_multiline:
2401: return "multiline";
2402: }
2403: return super .getInstanceIdName(id);
2404: }
2405:
2406: protected Object getInstanceIdValue(int id) {
2407: switch (id) {
2408: case Id_lastIndex:
2409: return ScriptRuntime.wrapNumber(lastIndex);
2410: case Id_source:
2411: return new String(re.source);
2412: case Id_global:
2413: return ScriptRuntime
2414: .wrapBoolean((re.flags & JSREG_GLOB) != 0);
2415: case Id_ignoreCase:
2416: return ScriptRuntime
2417: .wrapBoolean((re.flags & JSREG_FOLD) != 0);
2418: case Id_multiline:
2419: return ScriptRuntime
2420: .wrapBoolean((re.flags & JSREG_MULTILINE) != 0);
2421: }
2422: return super .getInstanceIdValue(id);
2423: }
2424:
2425: protected void setInstanceIdValue(int id, Object value) {
2426: if (id == Id_lastIndex) {
2427: lastIndex = ScriptRuntime.toNumber(value);
2428: return;
2429: }
2430: super .setInstanceIdValue(id, value);
2431: }
2432:
2433: protected void initPrototypeId(int id) {
2434: String s;
2435: int arity;
2436: switch (id) {
2437: case Id_compile:
2438: arity = 1;
2439: s = "compile";
2440: break;
2441: case Id_toString:
2442: arity = 0;
2443: s = "toString";
2444: break;
2445: case Id_toSource:
2446: arity = 0;
2447: s = "toSource";
2448: break;
2449: case Id_exec:
2450: arity = 1;
2451: s = "exec";
2452: break;
2453: case Id_test:
2454: arity = 1;
2455: s = "test";
2456: break;
2457: case Id_prefix:
2458: arity = 1;
2459: s = "prefix";
2460: break;
2461: default:
2462: throw new IllegalArgumentException(String.valueOf(id));
2463: }
2464: initPrototypeMethod(REGEXP_TAG, id, s, arity);
2465: }
2466:
2467: public Object execIdCall(IdFunctionObject f, Context cx,
2468: Scriptable scope, Scriptable this Obj, Object[] args) {
2469: if (!f.hasTag(REGEXP_TAG)) {
2470: return super .execIdCall(f, cx, scope, this Obj, args);
2471: }
2472: int id = f.methodId();
2473: switch (id) {
2474: case Id_compile:
2475: return realThis(this Obj, f).compile(cx, scope, args);
2476:
2477: case Id_toString:
2478: case Id_toSource:
2479: return realThis(this Obj, f).toString();
2480:
2481: case Id_exec:
2482: return realThis(this Obj, f).execSub(cx, scope, args, MATCH);
2483:
2484: case Id_test: {
2485: Object x = realThis(this Obj, f).execSub(cx, scope, args,
2486: TEST);
2487: return Boolean.TRUE.equals(x) ? Boolean.TRUE
2488: : Boolean.FALSE;
2489: }
2490:
2491: case Id_prefix:
2492: return realThis(this Obj, f)
2493: .execSub(cx, scope, args, PREFIX);
2494: }
2495: throw new IllegalArgumentException(String.valueOf(id));
2496: }
2497:
2498: private static NativeRegExp realThis(Scriptable this Obj,
2499: IdFunctionObject f) {
2500: if (!(this Obj instanceof NativeRegExp))
2501: throw incompatibleCallError(f);
2502: return (NativeRegExp) this Obj;
2503: }
2504:
2505: // #string_id_map#
2506: protected int findPrototypeId(String s) {
2507: int id;
2508: // #generated# Last update: 2007-05-09 08:16:24 EDT
2509: L0: {
2510: id = 0;
2511: String X = null;
2512: int c;
2513: L: switch (s.length()) {
2514: case 4:
2515: c = s.charAt(0);
2516: if (c == 'e') {
2517: X = "exec";
2518: id = Id_exec;
2519: } else if (c == 't') {
2520: X = "test";
2521: id = Id_test;
2522: }
2523: break L;
2524: case 6:
2525: X = "prefix";
2526: id = Id_prefix;
2527: break L;
2528: case 7:
2529: X = "compile";
2530: id = Id_compile;
2531: break L;
2532: case 8:
2533: c = s.charAt(3);
2534: if (c == 'o') {
2535: X = "toSource";
2536: id = Id_toSource;
2537: } else if (c == 't') {
2538: X = "toString";
2539: id = Id_toString;
2540: }
2541: break L;
2542: }
2543: if (X != null && X != s && !X.equals(s))
2544: id = 0;
2545: break L0;
2546: }
2547: // #/generated#
2548: return id;
2549: }
2550:
2551: private static final int Id_compile = 1, Id_toString = 2,
2552: Id_toSource = 3, Id_exec = 4, Id_test = 5, Id_prefix = 6,
2553:
2554: MAX_PROTOTYPE_ID = 6;
2555:
2556: // #/string_id_map#
2557:
2558: private RECompiled re;
2559: double lastIndex; /* index after last match, for //g iterator */
2560:
2561: } // class NativeRegExp
2562:
2563: class RECompiled implements Serializable {
2564: static final long serialVersionUID = -6144956577595844213L;
2565:
2566: char[] source; /* locked source string, sans // */
2567: int parenCount; /* number of parenthesized submatches */
2568: int flags; /* flags */
2569: byte[] program; /* regular expression bytecode */
2570: int classCount; /* count [...] bitmaps */
2571: RECharSet[] classList; /* list of [...] bitmaps */
2572: int anchorCh = -1; /* if >= 0, then re starts with this literal char */
2573: }
2574:
2575: class RENode {
2576:
2577: RENode(byte op) {
2578: this .op = op;
2579: }
2580:
2581: byte op; /* r.e. op bytecode */
2582: RENode next; /* next in concatenation order */
2583: RENode kid; /* first operand */
2584:
2585: RENode kid2; /* second operand */
2586: int num; /* could be a number */
2587: int parenIndex; /* or a parenthesis index */
2588:
2589: /* or a range */
2590: int min;
2591: int max;
2592: int parenCount;
2593: boolean greedy;
2594:
2595: /* or a character class */
2596: int startIndex;
2597: int kidlen; /* length of string at kid, in chars */
2598: int bmsize; /* bitmap size, based on max char code */
2599: int index; /* index into class list */
2600:
2601: /* or a literal sequence */
2602: char chr; /* of one character */
2603: int length; /* or many (via the index) */
2604: int flatIndex; /* which is -1 if not sourced */
2605:
2606: }
2607:
2608: class CompilerState {
2609:
2610: CompilerState(Context cx, char[] source, int length, int flags) {
2611: this .cx = cx;
2612: this .cpbegin = source;
2613: this .cp = 0;
2614: this .cpend = length;
2615: this .flags = flags;
2616: this .parenCount = 0;
2617: this .classCount = 0;
2618: this .progLength = 0;
2619: }
2620:
2621: Context cx;
2622: char cpbegin[];
2623: int cpend;
2624: int cp;
2625: int flags;
2626: int parenCount;
2627: int parenNesting;
2628: int classCount; /* number of [] encountered */
2629: int progLength; /* estimated bytecode length */
2630: RENode result;
2631: }
2632:
2633: class REProgState {
2634: REProgState(REProgState previous, int min, int max, int index,
2635: REBackTrackData backTrack, int continuation_pc,
2636: int continuation_op) {
2637: this .previous = previous;
2638: this .min = min;
2639: this .max = max;
2640: this .index = index;
2641: this .continuation_op = continuation_op;
2642: this .continuation_pc = continuation_pc;
2643: this .backTrack = backTrack;
2644: }
2645:
2646: REProgState previous; // previous state in stack
2647:
2648: int min; /* current quantifier min */
2649: int max; /* current quantifier max */
2650: int index; /* progress in text */
2651: int continuation_op;
2652: int continuation_pc;
2653: REBackTrackData backTrack; // used by ASSERT_ to recover state
2654: }
2655:
2656: class REBackTrackData {
2657:
2658: REBackTrackData(REGlobalData gData, int op, int pc) {
2659: previous = gData.backTrackStackTop;
2660: continuation_op = op;
2661: continuation_pc = pc;
2662: lastParen = gData.lastParen;
2663: if (gData.parens != null) {
2664: parens = gData.parens.clone();
2665: }
2666: cp = gData.cp;
2667: stateStackTop = gData.stateStackTop;
2668: }
2669:
2670: REBackTrackData previous;
2671:
2672: int continuation_op; /* where to backtrack to */
2673: int continuation_pc;
2674: int lastParen;
2675: long[] parens; /* parenthesis captures */
2676: int cp; /* char buffer index */
2677: REProgState stateStackTop; /* state of op that backtracked */
2678: }
2679:
2680: class REGlobalData {
2681: boolean multiline;
2682: RECompiled regexp; /* the RE in execution */
2683: int lastParen; /* highest paren set so far */
2684: int skipped; /* chars skipped anchoring this r.e. */
2685:
2686: int cp; /* char buffer index */
2687: long[] parens; /* parens captures */
2688:
2689: REProgState stateStackTop; /* stack of state of current ancestors */
2690:
2691: REBackTrackData backTrackStackTop; /* last matched-so-far position */
2692:
2693: /**
2694: * Get start of parenthesis capture contents, -1 for empty.
2695: */
2696: int parens_index(int i) {
2697: return (int) (parens[i]);
2698: }
2699:
2700: /**
2701: * Get length of parenthesis capture contents.
2702: */
2703: int parens_length(int i) {
2704: return (int) (parens[i] >>> 32);
2705: }
2706:
2707: void set_parens(int i, int index, int length) {
2708: parens[i] = (index & 0xffffffffL) | ((long) length << 32);
2709: }
2710:
2711: }
2712:
2713: /*
2714: * This struct holds a bitmap representation of a class from a regexp.
2715: * There's a list of these referenced by the classList field in the NativeRegExp
2716: * struct below. The initial state has startIndex set to the offset in the
2717: * original regexp source of the beginning of the class contents. The first
2718: * use of the class converts the source representation into a bitmap.
2719: *
2720: */
2721: final class RECharSet implements Serializable {
2722: static final long serialVersionUID = 7931787979395898394L;
2723:
2724: RECharSet(int length, int startIndex, int strlength) {
2725: this .length = length;
2726: this .startIndex = startIndex;
2727: this .strlength = strlength;
2728: }
2729:
2730: int length;
2731: int startIndex;
2732: int strlength;
2733:
2734: volatile transient boolean converted;
2735: volatile transient boolean sense;
2736: volatile transient byte[] bits;
2737: }
|