0001: /*
0002: * Copyright 2000 Finn Bock
0003: *
0004: * This program contains material copyrighted by:
0005: * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
0006: *
0007: * This version of the SRE library can be redistributed under CNRI's
0008: * Python 1.6 license. For any other use, please contact Secret Labs
0009: * AB (info@pythonware.com).
0010: *
0011: * Portions of this engine have been developed in cooperation with
0012: * CNRI. Hewlett-Packard provided funding for 1.6 integration and
0013: * other compatibility work.
0014: */
0015:
0016: // Last updated to _sre.c: 2.52
0017: package org.python.modules.sre;
0018:
0019: public class SRE_STATE {
0020:
0021: /*
0022: * Generated from Python-2.2.3 like 'python headerToJava.py < Modules/sre_constants.h'
0023: * where headerToJava.py contains the following code
0024: import sys
0025: for line in sys.stdin:
0026: if line.startswith('#define'):
0027: line = line.replace('#define', 'public static final int').strip()
0028: segs = line.split(' ')
0029: print '%s = %s;' % (' '.join(segs[:-1]), segs[-1])
0030: */
0031: //BEGIN generated code
0032: public static final int SRE_MAGIC = 20010701;
0033: public static final int SRE_OP_FAILURE = 0;
0034: public static final int SRE_OP_SUCCESS = 1;
0035: public static final int SRE_OP_ANY = 2;
0036: public static final int SRE_OP_ANY_ALL = 3;
0037: public static final int SRE_OP_ASSERT = 4;
0038: public static final int SRE_OP_ASSERT_NOT = 5;
0039: public static final int SRE_OP_AT = 6;
0040: public static final int SRE_OP_BRANCH = 7;
0041: public static final int SRE_OP_CALL = 8;
0042: public static final int SRE_OP_CATEGORY = 9;
0043: public static final int SRE_OP_CHARSET = 10;
0044: public static final int SRE_OP_BIGCHARSET = 11;
0045: public static final int SRE_OP_GROUPREF = 12;
0046: public static final int SRE_OP_GROUPREF_IGNORE = 13;
0047: public static final int SRE_OP_IN = 14;
0048: public static final int SRE_OP_IN_IGNORE = 15;
0049: public static final int SRE_OP_INFO = 16;
0050: public static final int SRE_OP_JUMP = 17;
0051: public static final int SRE_OP_LITERAL = 18;
0052: public static final int SRE_OP_LITERAL_IGNORE = 19;
0053: public static final int SRE_OP_MARK = 20;
0054: public static final int SRE_OP_MAX_UNTIL = 21;
0055: public static final int SRE_OP_MIN_UNTIL = 22;
0056: public static final int SRE_OP_NOT_LITERAL = 23;
0057: public static final int SRE_OP_NOT_LITERAL_IGNORE = 24;
0058: public static final int SRE_OP_NEGATE = 25;
0059: public static final int SRE_OP_RANGE = 26;
0060: public static final int SRE_OP_REPEAT = 27;
0061: public static final int SRE_OP_REPEAT_ONE = 28;
0062: public static final int SRE_OP_SUBPATTERN = 29;
0063: public static final int SRE_AT_BEGINNING = 0;
0064: public static final int SRE_AT_BEGINNING_LINE = 1;
0065: public static final int SRE_AT_BEGINNING_STRING = 2;
0066: public static final int SRE_AT_BOUNDARY = 3;
0067: public static final int SRE_AT_NON_BOUNDARY = 4;
0068: public static final int SRE_AT_END = 5;
0069: public static final int SRE_AT_END_LINE = 6;
0070: public static final int SRE_AT_END_STRING = 7;
0071: public static final int SRE_AT_LOC_BOUNDARY = 8;
0072: public static final int SRE_AT_LOC_NON_BOUNDARY = 9;
0073: public static final int SRE_AT_UNI_BOUNDARY = 10;
0074: public static final int SRE_AT_UNI_NON_BOUNDARY = 11;
0075: public static final int SRE_CATEGORY_DIGIT = 0;
0076: public static final int SRE_CATEGORY_NOT_DIGIT = 1;
0077: public static final int SRE_CATEGORY_SPACE = 2;
0078: public static final int SRE_CATEGORY_NOT_SPACE = 3;
0079: public static final int SRE_CATEGORY_WORD = 4;
0080: public static final int SRE_CATEGORY_NOT_WORD = 5;
0081: public static final int SRE_CATEGORY_LINEBREAK = 6;
0082: public static final int SRE_CATEGORY_NOT_LINEBREAK = 7;
0083: public static final int SRE_CATEGORY_LOC_WORD = 8;
0084: public static final int SRE_CATEGORY_LOC_NOT_WORD = 9;
0085: public static final int SRE_CATEGORY_UNI_DIGIT = 10;
0086: public static final int SRE_CATEGORY_UNI_NOT_DIGIT = 11;
0087: public static final int SRE_CATEGORY_UNI_SPACE = 12;
0088: public static final int SRE_CATEGORY_UNI_NOT_SPACE = 13;
0089: public static final int SRE_CATEGORY_UNI_WORD = 14;
0090: public static final int SRE_CATEGORY_UNI_NOT_WORD = 15;
0091: public static final int SRE_CATEGORY_UNI_LINEBREAK = 16;
0092: public static final int SRE_CATEGORY_UNI_NOT_LINEBREAK = 17;
0093: public static final int SRE_FLAG_TEMPLATE = 1;
0094: public static final int SRE_FLAG_IGNORECASE = 2;
0095: public static final int SRE_FLAG_LOCALE = 4;
0096: public static final int SRE_FLAG_MULTILINE = 8;
0097: public static final int SRE_FLAG_DOTALL = 16;
0098: public static final int SRE_FLAG_UNICODE = 32;
0099: public static final int SRE_FLAG_VERBOSE = 64;
0100: public static final int SRE_INFO_PREFIX = 1;
0101: public static final int SRE_INFO_LITERAL = 2;
0102: public static final int SRE_INFO_CHARSET = 4;
0103: //END generated code
0104:
0105: //From here we're including things from _sre.c in the order they're defined there
0106: public static final int USE_RECURSION_LIMIT = 5000;
0107:
0108: /* error codes */
0109: public static final int SRE_ERROR_ILLEGAL = -1;
0110: public static final int SRE_ERROR_STATE = -2;
0111: public static final int SRE_ERROR_RECURSION_LIMIT = -3;
0112:
0113: /* default character predicates (run sre_chars.py to regenerate tables) */
0114: static final int SRE_DIGIT_MASK = 1;
0115: static final int SRE_SPACE_MASK = 2;
0116: static final int SRE_LINEBREAK_MASK = 4;
0117: static final int SRE_ALNUM_MASK = 8;
0118: static final int SRE_WORD_MASK = 16;
0119:
0120: static byte[] sre_char_info = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0,
0121: 0, 2, 6, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0122: 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0123: 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
0124: 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
0125: 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0,
0126: 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
0127: 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0128: 0, 0, 0 };
0129:
0130: static byte[] sre_char_lower = new byte[] { 0, 1, 2, 3, 4, 5, 6, 7,
0131: 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
0132: 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
0133: 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
0134: 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,
0135: 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
0136: 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,
0137: 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
0138: 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
0139: 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 };
0140:
0141: final boolean SRE_IS_DIGIT(char ch) {
0142: return ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) != 0
0143: : false);
0144: }
0145:
0146: final boolean SRE_IS_SPACE(char ch) {
0147: return ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) != 0
0148: : false);
0149: }
0150:
0151: final boolean SRE_IS_LINEBREAK(char ch) {
0152: //TODO why is this different than _sre.c
0153: return ch == '\n';
0154: }
0155:
0156: final boolean SRE_IS_WORD(char ch) {
0157: return ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) != 0
0158: : false);
0159: }
0160:
0161: final char lower(char ch) {
0162: if ((flags & SRE_FLAG_LOCALE) != 0)
0163: return ((ch) < 256 ? Character.toLowerCase(ch) : ch);
0164: if ((flags & SRE_FLAG_UNICODE) != 0)
0165: return Character.toLowerCase(ch);
0166: return ((ch) < 128 ? (char) sre_char_lower[ch] : ch);
0167: }
0168:
0169: final boolean SRE_LOC_IS_WORD(char ch) {
0170: return Character.isLetterOrDigit(ch) || ch == '_';
0171: }
0172:
0173: final boolean SRE_UNI_IS_LINEBREAK(char ch) {
0174: switch (ch) {
0175: case 0x000A: /* LINE FEED */
0176: case 0x000D: /* CARRIAGE RETURN */
0177: case 0x001C: /* FILE SEPARATOR */
0178: case 0x001D: /* GROUP SEPARATOR */
0179: case 0x001E: /* RECORD SEPARATOR */
0180: case 0x0085: /* NEXT LINE */
0181: case 0x2028: /* LINE SEPARATOR */
0182: case 0x2029: /* PARAGRAPH SEPARATOR */
0183: return true;
0184: default:
0185: return false;
0186: }
0187: }
0188:
0189: final boolean sre_category(char category, char ch) {
0190: switch (category) {
0191:
0192: case SRE_CATEGORY_DIGIT:
0193: return SRE_IS_DIGIT(ch);
0194: case SRE_CATEGORY_NOT_DIGIT:
0195: return !SRE_IS_DIGIT(ch);
0196:
0197: case SRE_CATEGORY_SPACE:
0198: return SRE_IS_SPACE(ch);
0199: case SRE_CATEGORY_NOT_SPACE:
0200: return !SRE_IS_SPACE(ch);
0201:
0202: case SRE_CATEGORY_WORD:
0203: return SRE_IS_WORD(ch);
0204: case SRE_CATEGORY_NOT_WORD:
0205: return !SRE_IS_WORD(ch);
0206:
0207: case SRE_CATEGORY_LINEBREAK:
0208: return SRE_IS_LINEBREAK(ch);
0209: case SRE_CATEGORY_NOT_LINEBREAK:
0210: return !SRE_IS_LINEBREAK(ch);
0211:
0212: case SRE_CATEGORY_LOC_WORD:
0213: return SRE_LOC_IS_WORD(ch);
0214: case SRE_CATEGORY_LOC_NOT_WORD:
0215: return !SRE_LOC_IS_WORD(ch);
0216:
0217: case SRE_CATEGORY_UNI_DIGIT:
0218: return Character.isDigit(ch);
0219: case SRE_CATEGORY_UNI_NOT_DIGIT:
0220: return !Character.isDigit(ch);
0221:
0222: case SRE_CATEGORY_UNI_SPACE:
0223: return Character.isWhitespace(ch);
0224: case SRE_CATEGORY_UNI_NOT_SPACE:
0225: return !Character.isWhitespace(ch);
0226:
0227: case SRE_CATEGORY_UNI_WORD:
0228: return Character.isLetterOrDigit(ch) || ch == '_';
0229: case SRE_CATEGORY_UNI_NOT_WORD:
0230: return !(Character.isLetterOrDigit(ch) || ch == '_');
0231:
0232: case SRE_CATEGORY_UNI_LINEBREAK:
0233: return SRE_UNI_IS_LINEBREAK(ch);
0234: case SRE_CATEGORY_UNI_NOT_LINEBREAK:
0235: return !SRE_UNI_IS_LINEBREAK(ch);
0236:
0237: }
0238: return false;
0239: }
0240:
0241: private void mark_fini() {
0242: mark_stack = null;
0243: mark_stack_size = mark_stack_base = 0;
0244: }
0245:
0246: private void mark_save(int lo, int hi) {
0247: if (hi <= lo)
0248: return;
0249:
0250: int size = (hi - lo) + 1;
0251:
0252: int newsize = mark_stack_size;
0253: int minsize = mark_stack_base + size;
0254:
0255: int[] stack;
0256:
0257: if (newsize < minsize) {
0258: /* create new stack */
0259: if (newsize == 0) {
0260: newsize = 512;
0261: if (newsize < minsize)
0262: newsize = minsize;
0263: //TRACE(0, ptr, "allocate stack " + newsize);
0264: stack = new int[newsize];
0265: } else {
0266: /* grow the stack */
0267: while (newsize < minsize)
0268: newsize += newsize;
0269: //TRACE(0, ptr, "grow stack to " + newsize);
0270: stack = new int[newsize];
0271: System.arraycopy(mark_stack, 0, stack, 0,
0272: mark_stack.length);
0273: }
0274: mark_stack = stack;
0275: mark_stack_size = newsize;
0276: }
0277:
0278: //TRACE(0, ptr, "copy " + lo + ":" + hi + " to " + mark_stack_base + " (" + size + ")");
0279:
0280: System.arraycopy(mark, lo, mark_stack, mark_stack_base, size);
0281:
0282: mark_stack_base += size;
0283: }
0284:
0285: private void mark_restore(int lo, int hi) {
0286: if (hi <= lo)
0287: return;
0288:
0289: int size = (hi - lo) + 1;
0290:
0291: mark_stack_base -= size;
0292:
0293: //TRACE(0, ptr, "copy " + lo + ":" + hi + " from " + mark_stack_base);
0294:
0295: System.arraycopy(mark_stack, mark_stack_base, mark, lo, size);
0296: }
0297:
0298: final boolean SRE_AT(int ptr, char at) {
0299: /* check if pointer is at given position. */
0300:
0301: boolean thiS, that;
0302:
0303: switch (at) {
0304: case SRE_AT_BEGINNING:
0305: case SRE_AT_BEGINNING_STRING:
0306: return ptr == beginning;
0307:
0308: case SRE_AT_BEGINNING_LINE:
0309: return (ptr == beginning || SRE_IS_LINEBREAK(str[ptr - 1]));
0310:
0311: case SRE_AT_END:
0312: return (ptr + 1 == end && SRE_IS_LINEBREAK(str[ptr]))
0313: || ptr == end;
0314:
0315: case SRE_AT_END_LINE:
0316: return ptr == end || SRE_IS_LINEBREAK(str[ptr]);
0317:
0318: case SRE_AT_END_STRING:
0319: return ptr == end;
0320:
0321: case SRE_AT_BOUNDARY:
0322: /* word boundary */
0323: if (beginning == end)
0324: return false;
0325: that = (ptr > beginning) ? SRE_IS_WORD(str[ptr - 1])
0326: : false;
0327: thiS = (ptr < end) ? SRE_IS_WORD(str[ptr]) : false;
0328: return thiS != that;
0329:
0330: case SRE_AT_NON_BOUNDARY:
0331: /* word non-boundary */
0332: if (beginning == end)
0333: return false;
0334: that = (ptr > beginning) ? SRE_IS_WORD(str[ptr - 1])
0335: : false;
0336: thiS = (ptr < end) ? SRE_IS_WORD(str[ptr]) : false;
0337: return thiS == that;
0338:
0339: case SRE_AT_LOC_BOUNDARY:
0340: case SRE_AT_UNI_BOUNDARY:
0341: if (beginning == end)
0342: return false;
0343: that = (ptr > beginning) ? SRE_LOC_IS_WORD(str[ptr - 1])
0344: : false;
0345: thiS = (ptr < end) ? SRE_LOC_IS_WORD(str[ptr]) : false;
0346: return thiS != that;
0347:
0348: case SRE_AT_LOC_NON_BOUNDARY:
0349: case SRE_AT_UNI_NON_BOUNDARY:
0350: /* word non-boundary */
0351: if (beginning == end)
0352: return false;
0353: that = (ptr > beginning) ? SRE_LOC_IS_WORD(str[ptr - 1])
0354: : false;
0355: thiS = (ptr < end) ? SRE_LOC_IS_WORD(str[ptr]) : false;
0356: return thiS == that;
0357: }
0358:
0359: return false;
0360: }
0361:
0362: final boolean SRE_CHARSET(char[] set, int setidx, char ch) {
0363: /* check if character is a member of the given set. */
0364:
0365: boolean ok = true;
0366:
0367: for (;;) {
0368: switch (set[setidx++]) {
0369:
0370: case SRE_OP_LITERAL:
0371: //TRACE(setidx, ch, "CHARSET LITERAL " + (int) set[setidx]);
0372: /* <LITERAL> <code> */
0373: if (ch == set[setidx])
0374: return ok;
0375: setidx++;
0376: break;
0377:
0378: case SRE_OP_RANGE:
0379: /* <RANGE> <lower> <upper> */
0380: //TRACE(setidx, ch, "CHARSET RANGE " + (int) set[setidx] + " " + (int) set[setidx+1]);
0381: if (set[setidx] <= ch && ch <= set[setidx + 1])
0382: return ok;
0383: setidx += 2;
0384: break;
0385:
0386: case SRE_OP_CHARSET:
0387: //TRACE(setidx, ch, "CHARSET CHARSET ");
0388: /* <CHARSET> <bitmap> (16 bits per code word) */
0389: if (ch < 256
0390: && (set[setidx + (ch >> 4)] & (1 << (ch & 15))) != 0)
0391: return ok;
0392: setidx += 16;
0393: break;
0394:
0395: case SRE_OP_BIGCHARSET:
0396: /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
0397: //TRACE(setidx, ch, "CHARSET BIGCHARSET ");
0398: int count = set[setidx++];
0399: int shift = ((ch >> 8) & 1) == 0 ? 8 : 0;
0400: int block = (set[setidx + (ch >> 8) / 2] >> shift) & 0xFF;
0401: setidx += 128;
0402: int idx = block * 16 + ((ch & 255) >> 4);
0403: if ((set[setidx + idx] & (1 << (ch & 15))) != 0)
0404: return ok;
0405: setidx += count * 16;
0406: break;
0407:
0408: case SRE_OP_CATEGORY:
0409: /* <CATEGORY> <code> */
0410: //TRACE(setidx, ch, "CHARSET CHARSET " + (int) set[setidx]);
0411: if (sre_category(set[setidx], ch))
0412: return ok;
0413: setidx++;
0414: break;
0415:
0416: case SRE_OP_NEGATE:
0417: //TRACE(setidx, ch, "CHARSET NEGATE");
0418: ok = !ok;
0419: break;
0420:
0421: case SRE_OP_FAILURE:
0422: //TRACE(setidx, ch, "CHARSET FAILURE");
0423: return !ok;
0424:
0425: default:
0426: /* internal error -- there's not much we can do about it
0427: here, so let's just pretend it didn't match... */
0428: return false;
0429: }
0430: }
0431: }
0432:
0433: private int SRE_COUNT(char[] pattern, int pidx, int maxcount,
0434: int level) {
0435: char chr;
0436: int ptr = this .ptr;
0437: int end = this .end;
0438: int i;
0439:
0440: /* adjust end */
0441: if (maxcount < end - ptr && maxcount != 65535)
0442: end = ptr + maxcount;
0443:
0444: switch (pattern[pidx]) {
0445:
0446: case SRE_OP_ANY:
0447: /* repeated dot wildcard. */
0448: //TRACE(pidx, ptr, "COUNT ANY");
0449: while (ptr < end && !SRE_IS_LINEBREAK(str[ptr]))
0450: ptr++;
0451: break;
0452:
0453: case SRE_OP_ANY_ALL:
0454: /* repeated dot wildcare. skip to the end of the target
0455: string, and backtrack from there */
0456: //TRACE(pidx, ptr, "COUNT ANY_ALL");
0457: ptr = end;
0458: break;
0459:
0460: case SRE_OP_LITERAL:
0461: /* repeated literal */
0462: chr = pattern[pidx + 1];
0463: //TRACE(pidx, ptr, "COUNT LITERAL " + (int) chr);
0464: while (ptr < end && str[ptr] == chr)
0465: ptr++;
0466: break;
0467:
0468: case SRE_OP_LITERAL_IGNORE:
0469: /* repeated literal */
0470: chr = pattern[pidx + 1];
0471: //TRACE(pidx, ptr, "COUNT LITERAL_IGNORE " + (int) chr);
0472: while (ptr < end && lower(str[ptr]) == chr)
0473: ptr++;
0474: break;
0475:
0476: case SRE_OP_NOT_LITERAL:
0477: /* repeated non-literal */
0478: chr = pattern[pidx + 1];
0479: //TRACE(pidx, ptr, "COUNT NOT_LITERAL " + (int) chr);
0480: while (ptr < end && str[ptr] != chr)
0481: ptr++;
0482: break;
0483:
0484: case SRE_OP_NOT_LITERAL_IGNORE:
0485: /* repeated non-literal */
0486: chr = pattern[pidx + 1];
0487: //TRACE(pidx, ptr, "COUNT NOT_LITERAL_IGNORE " + (int) chr);
0488: while (ptr < end && lower(str[ptr]) != chr)
0489: ptr++;
0490: break;
0491:
0492: case SRE_OP_IN:
0493: /* repeated set */
0494: //TRACE(pidx, ptr, "COUNT IN");
0495: while (ptr < end
0496: && SRE_CHARSET(pattern, pidx + 2, str[ptr]))
0497: ptr++;
0498: break;
0499:
0500: default:
0501: /* repeated single character pattern */
0502: //TRACE(pidx, ptr, "COUNT SUBPATTERN");
0503: while (this .ptr < end) {
0504: i = SRE_MATCH(pattern, pidx, level);
0505: if (i < 0)
0506: return i;
0507: if (i == 0)
0508: break;
0509: }
0510: return this .ptr - ptr;
0511: }
0512:
0513: return ptr - this .ptr;
0514: }
0515:
0516: final int SRE_MATCH(char[] pattern, int pidx, int level) {
0517: /* check if string matches the given pattern. returns <0 for
0518: error, 0 for failure, and 1 for success */
0519:
0520: int end = this .end;
0521: int ptr = this .ptr;
0522: int i, count;
0523: char chr;
0524:
0525: int lastmark;
0526:
0527: //TRACE(pidx, ptr, "ENTER " + level);
0528:
0529: if (level > USE_RECURSION_LIMIT)
0530: return SRE_ERROR_RECURSION_LIMIT;
0531:
0532: if (pattern[pidx] == SRE_OP_INFO) {
0533: /* optimization info block */
0534: /* args: <1=skip> <2=flags> <3=min> ... */
0535: if (pattern[pidx + 3] != 0
0536: && (end - ptr) < pattern[pidx + 3]) {
0537: return 0;
0538: }
0539: pidx += pattern[pidx + 1] + 1;
0540: }
0541:
0542: for (;;) {
0543:
0544: switch (pattern[pidx++]) {
0545:
0546: case SRE_OP_FAILURE:
0547: /* immediate failure */
0548: //TRACE(pidx, ptr, "FAILURE");
0549: return 0;
0550:
0551: case SRE_OP_SUCCESS:
0552: /* end of pattern */
0553: //TRACE(pidx, ptr, "SUCCESS");
0554: this .ptr = ptr;
0555: return 1;
0556:
0557: case SRE_OP_AT:
0558: /* match at given position */
0559: /* <AT> <code> */
0560: //TRACE(pidx, ptr, "AT " + (int) pattern[pidx]);
0561: if (!SRE_AT(ptr, pattern[pidx]))
0562: return 0;
0563: pidx++;
0564: break;
0565:
0566: case SRE_OP_CATEGORY:
0567: /* match at given category */
0568: /* <CATEGORY> <code> */
0569: //TRACE(pidx, ptr, "CATEGORY " + (int)pattern[pidx]);
0570: if (ptr >= end
0571: || !sre_category(pattern[pidx], str[ptr]))
0572: return 0;
0573:
0574: pidx++;
0575: ptr++;
0576: break;
0577:
0578: case SRE_OP_LITERAL:
0579: /* match literal character */
0580: /* <LITERAL> <code> */
0581: //TRACE(pidx, ptr, "LITERAL " + (int) pattern[pidx]);
0582: if (ptr >= end || str[ptr] != pattern[pidx])
0583: return 0;
0584: pidx++;
0585: ptr++;
0586: break;
0587:
0588: case SRE_OP_NOT_LITERAL:
0589: /* match anything that is not literal character */
0590: /* args: <code> */
0591: //TRACE(pidx, ptr, "NOT_LITERAL " + (int) pattern[pidx]);
0592: if (ptr >= end || str[ptr] == pattern[pidx])
0593: return 0;
0594: pidx++;
0595: ptr++;
0596: break;
0597:
0598: case SRE_OP_ANY:
0599: /* match anything */
0600: //TRACE(pidx, ptr, "ANY");
0601: if (ptr >= end || SRE_IS_LINEBREAK(str[ptr]))
0602: return 0;
0603: ptr++;
0604: break;
0605:
0606: case SRE_OP_ANY_ALL:
0607: /* match anything */
0608: /* <ANY_ALL> */
0609: //TRACE(pidx, ptr, "ANY_ALL");
0610: if (ptr >= end)
0611: return 0;
0612: ptr++;
0613: break;
0614:
0615: case SRE_OP_IN:
0616: /* match set member (or non_member) */
0617: /* <IN> <skip> <set> */
0618: //TRACE(pidx, ptr, "IN");
0619: if (ptr >= end
0620: || !SRE_CHARSET(pattern, pidx + 1, str[ptr]))
0621: return 0;
0622: pidx += (int) pattern[pidx];
0623: ptr++;
0624: break;
0625:
0626: case SRE_OP_GROUPREF:
0627: /* match backreference */
0628: i = pattern[pidx];
0629: //TRACE(pidx, ptr, "GROUPREF " + i);
0630: int p = mark[i + i];
0631: int e = mark[i + i + 1];
0632: if (p == -1 || e == -1 || e < p)
0633: return 0;
0634: while (p < e) {
0635: if (ptr >= end || str[ptr] != str[p])
0636: return 0;
0637: p++;
0638: ptr++;
0639: }
0640: pidx++;
0641: break;
0642:
0643: case SRE_OP_GROUPREF_IGNORE:
0644: /* match backreference */
0645: i = pattern[pidx];
0646: //TRACE(pidx, ptr, "GROUPREF_IGNORE " + i);
0647: p = mark[i + i];
0648: e = mark[i + i + 1];
0649: if (p == -1 || e == -1 || e < p)
0650: return 0;
0651: while (p < e) {
0652: if (ptr >= end || lower(str[ptr]) != lower(str[p]))
0653: return 0;
0654: p++;
0655: ptr++;
0656: }
0657: pidx++;
0658: break;
0659:
0660: case SRE_OP_LITERAL_IGNORE:
0661: //TRACE(pidx, ptr, "LITERAL_IGNORE " + (int) pattern[pidx]);
0662: if (ptr >= end
0663: || lower(str[ptr]) != lower(pattern[pidx]))
0664: return 0;
0665: pidx++;
0666: ptr++;
0667: break;
0668:
0669: case SRE_OP_NOT_LITERAL_IGNORE:
0670: //TRACE(pidx, ptr, "NOT_LITERAL_IGNORE " + (int) pattern[pidx]);
0671: if (ptr >= end
0672: || lower(str[ptr]) == lower(pattern[pidx]))
0673: return 0;
0674: pidx++;
0675: ptr++;
0676: break;
0677:
0678: case SRE_OP_IN_IGNORE:
0679: //TRACE(pidx, ptr, "IN_IGNORE");
0680: if (ptr >= end
0681: || !SRE_CHARSET(pattern, pidx + 1,
0682: lower(str[ptr])))
0683: return 0;
0684: pidx += (int) pattern[pidx];
0685: ptr++;
0686: break;
0687:
0688: case SRE_OP_MARK:
0689: /* set mark */
0690: /* <MARK> <gid> */
0691: //TRACE(pidx, ptr, "MARK " + (int) pattern[pidx]);
0692: i = pattern[pidx];
0693: if ((i & 1) != 0)
0694: lastindex = i / 2 + 1;
0695: if (i > this .lastmark)
0696: this .lastmark = i;
0697: mark[i] = ptr;
0698: pidx++;
0699: break;
0700:
0701: case SRE_OP_JUMP:
0702: case SRE_OP_INFO:
0703: /* jump forward */
0704: /* <JUMP> <offset> */
0705: //TRACE(pidx, ptr, "JUMP " + (int) pattern[pidx]);
0706: pidx += (int) pattern[pidx];
0707: break;
0708:
0709: case SRE_OP_ASSERT:
0710: /* assert subpattern */
0711: /* args: <skip> <back> <pattern> */
0712: //TRACE(pidx, ptr, "ASSERT " + (int) pattern[pidx+1]);
0713: this .ptr = ptr - pattern[pidx + 1];
0714: if (this .ptr < this .beginning)
0715: return 0;
0716: i = SRE_MATCH(pattern, pidx + 2, level + 1);
0717: if (i <= 0)
0718: return i;
0719: pidx += pattern[pidx];
0720: break;
0721:
0722: case SRE_OP_ASSERT_NOT:
0723: /* assert not subpattern */
0724: /* args: <skip> <pattern> */
0725: //TRACE(pidx, ptr, "ASSERT_NOT " + (int) pattern[pidx]);
0726: this .ptr = ptr - pattern[pidx + 1];
0727: if (this .ptr >= this .beginning) {
0728: i = SRE_MATCH(pattern, pidx + 2, level + 1);
0729: if (i < 0)
0730: return i;
0731: if (i != 0)
0732: return 0;
0733: }
0734: pidx += pattern[pidx];
0735: break;
0736:
0737: case SRE_OP_BRANCH:
0738: /* try an alternate branch */
0739: /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
0740: //TRACE(pidx, ptr, "BRANCH");
0741: lastmark = this .lastmark;
0742: for (; pattern[pidx] != 0; pidx += pattern[pidx]) {
0743: if (pattern[pidx + 1] == SRE_OP_LITERAL
0744: && (ptr >= end || str[ptr] != pattern[pidx + 2]))
0745: continue;
0746: if (pattern[pidx + 1] == SRE_OP_IN
0747: && (ptr >= end || !SRE_CHARSET(pattern,
0748: pidx + 3, str[ptr])))
0749: continue;
0750: this .ptr = ptr;
0751: i = SRE_MATCH(pattern, pidx + 1, level + 1);
0752: if (i != 0)
0753: return i;
0754: while (this .lastmark > lastmark)
0755: mark[this .lastmark--] = -1;
0756: }
0757:
0758: return 0;
0759:
0760: case SRE_OP_REPEAT_ONE:
0761: /* match repeated sequence (maximizing regexp) */
0762:
0763: /* this operator only works if the repeated item is
0764: exactly one character wide, and we're not already
0765: collecting backtracking points. for other cases,
0766: use the MAX_REPEAT operator */
0767:
0768: /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
0769:
0770: int mincount = pattern[pidx + 1];
0771:
0772: //TRACE(pidx, ptr, "REPEAT_ONE " + mincount + " " + (int)pattern[pidx+2]);
0773: if (ptr + mincount > end)
0774: return 0; /* cannot match */
0775:
0776: this .ptr = ptr;
0777:
0778: count = SRE_COUNT(pattern, pidx + 3, pattern[pidx + 2],
0779: level + 1);
0780: if (count < 0)
0781: return count;
0782:
0783: ptr += count;
0784:
0785: /* when we arrive here, count contains the number of
0786: matches, and ptr points to the tail of the target
0787: string. check if the rest of the pattern matches,
0788: and backtrack if not. */
0789:
0790: if (count < mincount)
0791: return 0;
0792:
0793: if (pattern[pidx + pattern[pidx]] == SRE_OP_SUCCESS) {
0794: /* tail is empty. we're finished */
0795: this .ptr = ptr;
0796: return 1;
0797:
0798: } else if (pattern[pidx + pattern[pidx]] == SRE_OP_LITERAL) {
0799: /* tail starts with a literal. skip positions where
0800: the rest of the pattern cannot possibly match */
0801: chr = pattern[pidx + pattern[pidx] + 1];
0802: for (;;) {
0803: while (count >= mincount
0804: && (ptr >= end || str[ptr] != chr)) {
0805: ptr--;
0806: count--;
0807: }
0808: if (count < mincount)
0809: break;
0810: this .ptr = ptr;
0811: i = SRE_MATCH(pattern, pidx + pattern[pidx],
0812: level + 1);
0813: if (i != 0)
0814: return 1;
0815: ptr--;
0816: count--;
0817: }
0818:
0819: } else {
0820: /* general case */
0821: lastmark = this .lastmark;
0822: while (count >= mincount) {
0823: this .ptr = ptr;
0824: i = SRE_MATCH(pattern, pidx + pattern[pidx],
0825: level + 1);
0826: if (i != 0)
0827: return i;
0828: ptr--;
0829: count--;
0830: while (this .lastmark > lastmark)
0831: mark[this .lastmark--] = -1;
0832: }
0833: }
0834: return 0;
0835:
0836: case SRE_OP_REPEAT:
0837: /* create repeat context. all the hard work is done
0838: by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
0839: /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
0840:
0841: //TRACE(pidx, ptr, "REPEAT " + (int)pattern[pidx+1] + " " + (int)pattern[pidx+2]);
0842: SRE_REPEAT rep = new SRE_REPEAT(repeat);
0843: rep.count = -1;
0844: rep.pidx = pidx;
0845: repeat = rep;
0846:
0847: this .ptr = ptr;
0848: i = SRE_MATCH(pattern, pidx + pattern[pidx], level + 1);
0849:
0850: repeat = rep.prev;
0851: return i;
0852:
0853: case SRE_OP_MAX_UNTIL:
0854: /* maximizing repeat */
0855: /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
0856:
0857: /* FIXME: we probably need to deal with zero-width
0858: matches in here... */
0859:
0860: SRE_REPEAT rp = this .repeat;
0861: if (rp == null)
0862: return SRE_ERROR_STATE;
0863:
0864: this .ptr = ptr;
0865:
0866: count = rp.count + 1;
0867:
0868: //TRACE(pidx, ptr, "MAX_UNTIL " + count);
0869:
0870: if (count < pattern[rp.pidx + 1]) {
0871: /* not enough matches */
0872:
0873: rp.count = count;
0874: i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
0875: if (i != 0)
0876: return i;
0877: rp.count = count - 1;
0878: this .ptr = ptr;
0879: return 0;
0880: }
0881:
0882: if (count < pattern[rp.pidx + 2]
0883: || pattern[rp.pidx + 2] == 65535) {
0884: /* we may have enough matches, but if we can
0885: match another item, do so */
0886: rp.count = count;
0887: lastmark = this .lastmark;
0888: mark_save(0, lastmark);
0889: /* RECURSIVE */
0890: i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
0891: if (i != 0)
0892: return i;
0893: mark_restore(0, lastmark);
0894: this .lastmark = lastmark;
0895: rp.count = count - 1;
0896: this .ptr = ptr;
0897: }
0898:
0899: /* cannot match more repeated items here. make sure the
0900: tail matches */
0901: this .repeat = rp.prev;
0902: /* RECURSIVE */
0903: i = SRE_MATCH(pattern, pidx, level + 1);
0904: if (i != 0)
0905: return i;
0906: this .repeat = rp;
0907: this .ptr = ptr;
0908: return 0;
0909:
0910: case SRE_OP_MIN_UNTIL:
0911: /* minimizing repeat */
0912: /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
0913:
0914: rp = this .repeat;
0915: if (rp == null)
0916: return SRE_ERROR_STATE;
0917:
0918: count = rp.count + 1;
0919:
0920: //TRACE(pidx, ptr, "MIN_UNTIL " + count + " " + rp.pidx);
0921:
0922: this .ptr = ptr;
0923:
0924: if (count < pattern[rp.pidx + 1]) {
0925: /* not enough matches */
0926: rp.count = count;
0927: /* RECURSIVE */
0928: i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
0929: if (i != 0)
0930: return i;
0931: rp.count = count - 1;
0932: this .ptr = ptr;
0933: return 0;
0934: }
0935:
0936: /* see if the tail matches */
0937: this .repeat = rp.prev;
0938: i = SRE_MATCH(pattern, pidx, level + 1);
0939: if (i != 0)
0940: return i;
0941:
0942: this .ptr = ptr;
0943: this .repeat = rp;
0944:
0945: if (count >= pattern[rp.pidx + 2]
0946: && pattern[rp.pidx + 2] != 65535)
0947: return 0;
0948:
0949: rp.count = count;
0950: /* RECURSIVE */
0951: i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
0952: if (i != 0)
0953: return i;
0954: rp.count = count - 1;
0955: this .ptr = ptr;
0956: return 0;
0957:
0958: default:
0959: TRACE(pidx, ptr, "UNKNOWN " + (int) pattern[pidx - 1]);
0960: return SRE_ERROR_ILLEGAL;
0961: }
0962: }
0963:
0964: //return SRE_ERROR_ILLEGAL;
0965: }
0966:
0967: int SRE_SEARCH(char[] pattern, int pidx) {
0968: int ptr = this .start;
0969: int end = this .end;
0970: int status = 0;
0971: int prefix_len = 0;
0972: int prefix_skip = 0;
0973: int prefix = 0;
0974: int charset = 0;
0975: int overlap = 0;
0976: int flags = 0;
0977:
0978: if (pattern[pidx] == SRE_OP_INFO) {
0979: /* optimization info block */
0980: /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
0981:
0982: flags = pattern[pidx + 2];
0983:
0984: if (pattern[pidx + 3] > 0) {
0985: /* adjust end point (but make sure we leave at least one
0986: character in there, so literal search will work) */
0987: end -= pattern[pidx + 3] - 1;
0988: if (end <= ptr)
0989: end = ptr; // FBO
0990: }
0991:
0992: if ((flags & SRE_INFO_PREFIX) != 0) {
0993: /* pattern starts with a known prefix */
0994: /* <length> <skip> <prefix data> <overlap data> */
0995: prefix_len = pattern[pidx + 5];
0996: prefix_skip = pattern[pidx + 6];
0997: prefix = pidx + 7;
0998: overlap = prefix + prefix_len - 1;
0999: } else if ((flags & SRE_INFO_CHARSET) != 0) {
1000: /* pattern starts with a character from a known set */
1001: /* <charset> */
1002: charset = pidx + 5;
1003: }
1004:
1005: pidx += 1 + pattern[pidx + 1];
1006: }
1007:
1008: if (prefix_len > 1) {
1009: /* pattern starts with a known prefix. use the overlap
1010: table to skip forward as fast as we possibly can */
1011: int i = 0;
1012: end = this .end;
1013: while (ptr < end) {
1014: for (;;) {
1015: if (str[ptr] != pattern[prefix + i]) {
1016: if (i == 0)
1017: break;
1018: else
1019: i = pattern[overlap + i];
1020: } else {
1021: if (++i == prefix_len) {
1022: /* found a potential match */
1023: //TRACE(pidx, ptr, "SEARCH SCAN " + prefix_skip + " " + prefix_len);
1024: this .start = ptr + 1 - prefix_len;
1025: this .ptr = ptr + 1 - prefix_len
1026: + prefix_skip;
1027: if ((flags & SRE_INFO_LITERAL) != 0)
1028: return 1; /* we got all of it */
1029: status = SRE_MATCH(pattern, pidx + 2
1030: * prefix_skip, 1);
1031: if (status != 0)
1032: return status;
1033: /* close but no cigar -- try again */
1034: i = pattern[overlap + i];
1035: }
1036: break;
1037: }
1038:
1039: }
1040: ptr++;
1041: }
1042: return 0;
1043: }
1044:
1045: if (pattern[pidx] == SRE_OP_LITERAL) {
1046: /* pattern starts with a literal */
1047: char chr = pattern[pidx + 1];
1048: end = this .end;
1049: for (;;) {
1050: while (ptr < end && str[ptr] != chr)
1051: ptr++;
1052: if (ptr == end)
1053: return 0;
1054: //TRACE(pidx, ptr, "SEARCH LITERAL");
1055: this .start = ptr;
1056: this .ptr = ++ptr;
1057: if ((flags & SRE_INFO_LITERAL) != 0)
1058: return 1;
1059: status = SRE_MATCH(pattern, pidx + 2, 1);
1060: if (status != 0)
1061: break;
1062: }
1063:
1064: } else if (charset != 0) {
1065: /* pattern starts with a character from a known set */
1066: end = this .end;
1067: for (;;) {
1068: while (ptr < end
1069: && !SRE_CHARSET(pattern, charset, str[ptr]))
1070: ptr++;
1071: if (ptr == end)
1072: return 0;
1073: //TRACE(pidx, ptr, "SEARCH CHARSET");
1074: this .start = ptr;
1075: this .ptr = ptr;
1076: status = SRE_MATCH(pattern, pidx, 1);
1077: if (status != 0)
1078: break;
1079: ptr++;
1080: }
1081:
1082: } else {
1083: /* general case */
1084: while (ptr <= end) {
1085: //TRACE(pidx, ptr, "SEARCH");
1086: this .start = this .ptr = ptr++;
1087: status = SRE_MATCH(pattern, pidx, 1);
1088: if (status != 0)
1089: break;
1090: }
1091: }
1092:
1093: return status;
1094: }
1095:
1096: /* string pointers */
1097: int ptr; /* current position (also end of current slice) */
1098: int beginning; /* start of original string */
1099: int start; /* start of current slice */
1100: int end; /* end of original string */
1101:
1102: /* attributes for the match object */
1103: char[] str;
1104: int pos;
1105: int endpos;
1106:
1107: /* character size */
1108: int charsize;
1109:
1110: /* registers */
1111: int lastindex;
1112: int lastmark;
1113:
1114: /* FIXME: <fl> should be dynamically allocated! */
1115: int[] mark = new int[200];
1116:
1117: /* dynamically allocated stuff */
1118: int[] mark_stack;
1119: int mark_stack_size;
1120: int mark_stack_base;
1121:
1122: SRE_REPEAT repeat; /* current repeat context */
1123:
1124: /* debugging */
1125: int maxlevel;
1126:
1127: /* duplicated from the PatternObject */
1128: int flags;
1129:
1130: public SRE_STATE(String str, int start, int end, int flags) {
1131: this .str = str.toCharArray();
1132: int size = str.length();
1133:
1134: this .charsize = 1;
1135:
1136: /* adjust boundaries */
1137: if (start < 0)
1138: start = 0;
1139: else if (start > size)
1140: start = size;
1141:
1142: if (end < 0)
1143: end = 0;
1144: else if (end > size)
1145: end = size;
1146:
1147: this .start = start;
1148: this .end = end;
1149:
1150: this .pos = start;
1151: this .endpos = end;
1152:
1153: state_reset();
1154:
1155: this .flags = flags;
1156: }
1157:
1158: public static int getlower(int ch, int flags) {
1159: if ((flags & SRE_FLAG_LOCALE) != 0)
1160: return ((ch) < 256 ? Character.toLowerCase((char) ch) : ch);
1161: if ((flags & SRE_FLAG_UNICODE) != 0)
1162: return Character.toLowerCase((char) ch);
1163: return ((ch) < 128 ? (char) sre_char_lower[ch] : ch);
1164: }
1165:
1166: String getslice(int index, String string, boolean empty) {
1167: int i, j;
1168:
1169: index = (index - 1) * 2;
1170:
1171: if (string == null || mark[index] == -1
1172: || mark[index + 1] == -1) {
1173: if (empty) {
1174: /* want empty string */
1175: i = j = 0;
1176: } else {
1177: return null;
1178: }
1179: } else {
1180: i = mark[index];
1181: j = mark[index + 1];
1182: }
1183:
1184: return string.substring(i, j);
1185: }
1186:
1187: void state_reset() {
1188: lastmark = 0;
1189:
1190: /* FIXME: dynamic! */
1191: for (int i = 0; i < mark.length; i++)
1192: mark[i] = -1;
1193:
1194: lastindex = -1;
1195: repeat = null;
1196:
1197: mark_fini();
1198: }
1199:
1200: private void TRACE(int pidx, int ptr, String string) {
1201: System.out
1202: .println(" |" + pidx + "|" + ptr + ": " + string);
1203: }
1204: }
|