0001: /*
0002: * $Id: Perl5Matcher.java,v 1.27 2003/11/07 20:16:25 dfs Exp $
0003: *
0004: * ====================================================================
0005: * The Apache Software License, Version 1.1
0006: *
0007: * Copyright (c) 2000 The Apache Software Foundation. All rights
0008: * reserved.
0009: *
0010: * Redistribution and use in source and binary forms, with or without
0011: * modification, are permitted provided that the following conditions
0012: * are met:
0013: *
0014: * 1. Redistributions of source code must retain the above copyright
0015: * notice, this list of conditions and the following disclaimer.
0016: *
0017: * 2. Redistributions in binary form must reproduce the above copyright
0018: * notice, this list of conditions and the following disclaimer in
0019: * the documentation and/or other materials provided with the
0020: * distribution.
0021: *
0022: * 3. The end-user documentation included with the redistribution,
0023: * if any, must include the following acknowledgment:
0024: * "This product includes software developed by the
0025: * Apache Software Foundation (http://www.apache.org/)."
0026: * Alternately, this acknowledgment may appear in the software itself,
0027: * if and wherever such third-party acknowledgments normally appear.
0028: *
0029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
0030: * must not be used to endorse or promote products derived from this
0031: * software without prior written permission. For written
0032: * permission, please contact apache@apache.org.
0033: *
0034: * 5. Products derived from this software may not be called "Apache"
0035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
0036: * name, without prior written permission of the Apache Software Foundation.
0037: *
0038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0049: * SUCH DAMAGE.
0050: * ====================================================================
0051: *
0052: * This software consists of voluntary contributions made by many
0053: * individuals on behalf of the Apache Software Foundation. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.oro.text.regex;
0059:
0060: import java.util.*;
0061:
0062: /**
0063: * The Perl5Matcher class is used to match regular expressions
0064: * (conforming to the Perl5 regular expression syntax) generated by
0065: * Perl5Compiler.
0066: * <p>
0067: * Perl5Compiler and Perl5Matcher are designed with the intent that
0068: * you use a separate instance of each per thread to avoid the overhead
0069: * of both synchronization and concurrent access (e.g., a match that takes
0070: * a long time in one thread will block the progress of another thread with
0071: * a shorter match). If you want to use a single instance of each
0072: * in a concurrent program, you must appropriately protect access to
0073: * the instances with critical sections. If you want to share Perl5Pattern
0074: * instances between concurrently executing instances of Perl5Matcher, you
0075: * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}.
0076: *
0077: * @version @version@
0078: * @since 1.0
0079: * @see PatternMatcher
0080: * @see Perl5Compiler
0081: */
0082: public final class Perl5Matcher implements PatternMatcher {
// Sentinel placed in the "next character" slot by __match when no input
// remains at the current position.
private static final char __EOS = Character.MAX_VALUE;
// Minimum length allocated for the begin/end match offset arrays.
private static final int __INITIAL_NUM_OFFSETS = 20;

// __multiline: consulted when choosing __previousChar and by the BOL/EOL
// opcodes in __match. __lastSuccess: outcome of the most recent __interpret.
private boolean __multiline = false, __lastSuccess = false;
// Copied from the pattern at the start of each match; lets the LOWER/UPPER
// classes in __matchUnicodeClass accept either case.
private boolean __caseInsensitive = false;
// __previousChar: character treated as preceding the start position.
// __input: characters being matched. __originalInput: array from which the
// matched String is built by __setLastMatchResult (nulled afterwards).
private char __previousChar, __input[], __originalInput[];
// Innermost active repetition record; records are linked through
// _lastRepetition.
private Perl5Repetition __currentRep;
// __numParentheses: group count taken from the pattern (counted down to -1
// by __setLastMatchResult). __bol/__eol: begin and end offsets of the
// region being matched. __currentOffset: position __interpret is currently
// trying. __endOffset: end offset used for the "input remains" test.
private int __numParentheses, __bol, __eol, __currentOffset,
        __endOffset;

// Compiled program of the pattern being interpreted.
private char[] __program;
// __expSize: highest group number opened so far. __inputOffset: current
// input position shared by __tryExpression, __match and __repeat.
// __lastParen: highest group number closed so far.
private int __expSize, __inputOffset, __lastParen;
// Per-group begin/end offsets into __input, indexed by group number (index 0
// is the whole match); OpCode._NULL_OFFSET marks an unset group.
private int[] __beginMatchOffsets, __endMatchOffsets;
// Holds the int[] snapshots written by __pushState and consumed by
// __popState.
private Stack __stack = new Stack();
// Built by __setLastMatchResult; reset to null at the end of __interpret.
private Perl5MatchResult __lastMatchResult = null;
0098:
0099: private static boolean __compare(char[] s1, int s1Offs, char[] s2,
0100: int s2Offs, int n) {
0101: int cnt;
0102:
0103: for (cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) {
0104: if (s1Offs >= s1.length)
0105: return false;
0106: if (s2Offs >= s2.length)
0107: return false;
0108: if (s1[s1Offs] != s2[s2Offs])
0109: return false;
0110: }
0111:
0112: return true;
0113: }
0114:
0115: private static int __findFirst(char[] input, int current,
0116: int endOffset, char[] mustString) {
0117: int count, saveCurrent;
0118: char ch;
0119:
0120: if (input.length == 0)
0121: return endOffset;
0122:
0123: ch = mustString[0];
0124: // Find the offset of the first character of the must string
0125: while (current < endOffset) {
0126: if (ch == input[current]) {
0127: saveCurrent = current;
0128: count = 0;
0129:
0130: while (current < endOffset && count < mustString.length) {
0131: if (mustString[count] != input[current])
0132: break;
0133: ++count;
0134: ++current;
0135: }
0136:
0137: current = saveCurrent;
0138:
0139: if (count >= mustString.length)
0140: break;
0141: }
0142: ++current;
0143: }
0144:
0145: return current;
0146: }
0147:
0148: private void __pushState(int parenFloor) {
0149: int[] state;
0150: int stateEntries, paren;
0151:
0152: stateEntries = 3 * (__expSize - parenFloor);
0153: if (stateEntries <= 0)
0154: state = new int[3];
0155: else
0156: state = new int[stateEntries + 3];
0157:
0158: state[0] = __expSize;
0159: state[1] = __lastParen;
0160: state[2] = __inputOffset;
0161:
0162: for (paren = __expSize; paren > parenFloor; --paren, stateEntries -= 3) {
0163: state[stateEntries] = __endMatchOffsets[paren];
0164: state[stateEntries + 1] = __beginMatchOffsets[paren];
0165: state[stateEntries + 2] = paren;
0166: }
0167:
0168: __stack.push(state);
0169: }
0170:
0171: private void __popState() {
0172: int[] state;
0173: int entry, paren;
0174:
0175: state = (int[]) __stack.pop();
0176:
0177: __expSize = state[0];
0178: __lastParen = state[1];
0179: __inputOffset = state[2];
0180:
0181: for (entry = 3; entry < state.length; entry += 3) {
0182: paren = state[entry + 2];
0183: __beginMatchOffsets[paren] = state[entry + 1];
0184:
0185: if (paren <= __lastParen)
0186: __endMatchOffsets[paren] = state[entry];
0187: }
0188:
0189: for (paren = __lastParen + 1; paren <= __numParentheses; paren++) {
0190: if (paren > __expSize)
0191: __beginMatchOffsets[paren] = OpCode._NULL_OFFSET;
0192: __endMatchOffsets[paren] = OpCode._NULL_OFFSET;
0193: }
0194: }
0195:
0196: // Initialize globals needed before calling __tryExpression for first time
0197: private void __initInterpreterGlobals(Perl5Pattern expression,
0198: char[] input, int beginOffset, int endOffset,
0199: int currentOffset) {
0200: // Remove this hack after more efficient case-folding and unicode
0201: // character classes are implemented
0202: __caseInsensitive = expression._isCaseInsensitive;
0203: __input = input;
0204: __endOffset = endOffset;
0205: __currentRep = new Perl5Repetition();
0206: __currentRep._numInstances = 0;
0207: __currentRep._lastRepetition = null;
0208: __program = expression._program;
0209: __stack.setSize(0);
0210:
0211: // currentOffset should always be >= beginOffset and should
0212: // always be equal to zero when beginOffset equals 0, but we
0213: // make a weak attempt to protect against a violation of this
0214: // precondition
0215: if (currentOffset == beginOffset || currentOffset <= 0)
0216: __previousChar = '\n';
0217: else {
0218: __previousChar = input[currentOffset - 1];
0219: if (!__multiline && __previousChar == '\n')
0220: __previousChar = '\0';
0221: }
0222:
0223: __numParentheses = expression._numParentheses;
0224: __currentOffset = currentOffset;
0225:
0226: __bol = beginOffset;
0227: __eol = endOffset;
0228:
0229: // Ok, here we're using endOffset as a temporary variable.
0230: endOffset = __numParentheses + 1;
0231: if (__beginMatchOffsets == null
0232: || endOffset > __beginMatchOffsets.length) {
0233: if (endOffset < __INITIAL_NUM_OFFSETS)
0234: endOffset = __INITIAL_NUM_OFFSETS;
0235: __beginMatchOffsets = new int[endOffset];
0236: __endMatchOffsets = new int[endOffset];
0237: }
0238: }
0239:
0240: // Set the match result information. Only call this if we successfully
0241: // matched.
0242: private void __setLastMatchResult() {
0243: int offs, maxEndOffs = 0;
0244:
0245: //endOffset+=dontTry;
0246:
0247: __lastMatchResult = new Perl5MatchResult(__numParentheses + 1);
0248:
0249: // This can happen when using Perl5StreamInput
0250: if (__endMatchOffsets[0] > __originalInput.length)
0251: throw new ArrayIndexOutOfBoundsException();
0252:
0253: __lastMatchResult._matchBeginOffset = __beginMatchOffsets[0];
0254:
0255: while (__numParentheses >= 0) {
0256: offs = __beginMatchOffsets[__numParentheses];
0257:
0258: if (offs >= 0)
0259: __lastMatchResult._beginGroupOffset[__numParentheses] = offs
0260: - __lastMatchResult._matchBeginOffset;
0261: else
0262: __lastMatchResult._beginGroupOffset[__numParentheses] = OpCode._NULL_OFFSET;
0263:
0264: offs = __endMatchOffsets[__numParentheses];
0265:
0266: if (offs >= 0) {
0267: __lastMatchResult._endGroupOffset[__numParentheses] = offs
0268: - __lastMatchResult._matchBeginOffset;
0269: if (offs > maxEndOffs && offs <= __originalInput.length)
0270: maxEndOffs = offs;
0271: } else
0272: __lastMatchResult._endGroupOffset[__numParentheses] = OpCode._NULL_OFFSET;
0273:
0274: --__numParentheses;
0275: }
0276:
0277: __lastMatchResult._match = new String(__originalInput,
0278: __beginMatchOffsets[0], maxEndOffs
0279: - __beginMatchOffsets[0]);
0280:
0281: // Free up for garbage collection
0282: __originalInput = null;
0283: }
0284:
/**
 * Searches {@code input} for a position at which the compiled program
 * matches, trying candidate offsets in increasing order, and reports whether
 * one was found.
 * <p>
 * Expects to receive a valid regular expression program. No checking is
 * done to ensure validity. {@code __originalInput} must be set before
 * calling this method for {@code __lastMatchResult} to be set correctly.
 * <p>
 * On return {@code __lastSuccess} holds the outcome and
 * {@code __lastMatchResult} has been reset to {@code null}. After a
 * successful attempt the offsets of the overall match are in slot 0 of
 * {@code __beginMatchOffsets} / {@code __endMatchOffsets} (written by
 * {@code __tryExpression}).
 *
 * @param expression    the compiled pattern; its {@code _mustUtility} and
 *                      {@code _mustString} may be updated here when the
 *                      pattern was not compiled with READ_ONLY_MASK
 * @param input         the characters to search
 * @param beginOffset   marks the beginning of the string
 * @param endOffset     exclusive limit for the scan
 * @param currentOffset marks where to start the pattern search
 * @return {@code true} if an attempt succeeded at some offset
 */
private boolean __interpret(Perl5Pattern expression, char[] input,
        int beginOffset, int endOffset, int currentOffset) {
    boolean success;
    int minLength = 0, dontTry = 0, offset;
    char ch, mustString[];

    __initInterpreterGlobals(expression, input, beginOffset,
            endOffset, currentOffset);

    success = false;
    mustString = expression._mustString;

    // The loop body runs at most once: every path through it ends in
    // "break _mainLoop". It serves as a labeled block to jump out of.
    _mainLoop: while (true) {

        // Pre-filter on the pattern's must string. It is applied when the
        // pattern is unanchored, or when it is anchored but may match after
        // a line start and _back is non-negative.
        // NOTE(review): _back presumably is the distance from the match
        // start to the must string -- confirm in Perl5Pattern.
        if (mustString != null
                && ((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 || ((__multiline || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0) && expression._back >= 0))) {

            __currentOffset = __findFirst(__input, __currentOffset,
                    endOffset, mustString);

            if (__currentOffset >= endOffset) {
                // Must string absent: no match is possible. Record that the
                // pre-filter was useful unless the pattern is read-only.
                if ((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0)
                    expression._mustUtility++;
                success = false;
                break _mainLoop;
            } else if (expression._back >= 0) {
                // Step back by _back from where the must string was found,
                // but never before the requested start offset.
                __currentOffset -= expression._back;
                if (__currentOffset < currentOffset)
                    __currentOffset = currentOffset;
                minLength = expression._back + mustString.length;
            } else if (!expression._isExpensive
                    && (expression._options & Perl5Compiler.READ_ONLY_MASK) == 0
                    && (--expression._mustUtility < 0)) {
                // Be careful! The preceding logical expression is constructed
                // so that mustUtility is only decremented if the expression is
                // compiled without READ_ONLY_MASK.
                // The utility counter went negative: drop the must string,
                // both locally and on the pattern itself.
                mustString = expression._mustString = null;
                __currentOffset = currentOffset;
            } else {
                // Must string is present somewhere; restart the scan from
                // the requested offset.
                __currentOffset = currentOffset;
                minLength = mustString.length;
            }
        }

        // Anchored pattern: try at the beginning, then (for multiline-style
        // anchors) immediately after each '\n'.
        if ((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) {
            if (__currentOffset == beginOffset
                    && __tryExpression(beginOffset)) {
                success = true;
                break _mainLoop;
            } else if (__multiline
                    || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0
                    || (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0) {
                // Shorten the scan limit by minLength - 1 so attempts are
                // not started too close to the end.
                if (minLength > 0)
                    dontTry = minLength - 1;
                endOffset -= dontTry;

                // Step back one so the character before the current position
                // is examined for a newline too.
                if (__currentOffset > currentOffset)
                    --__currentOffset;

                while (__currentOffset < endOffset) {
                    if (__input[__currentOffset++] == '\n') {
                        if (__currentOffset < endOffset
                                && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        }
                    }
                }
            }

            break _mainLoop;
        }

        // Pattern has a start string: only try where it can begin.
        if (expression._startString != null) {
            mustString = expression._startString;
            if ((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) {
                // Only the first character is checked here. After a failed
                // attempt the rest of the run of that character is skipped.
                ch = mustString[0];

                while (__currentOffset < endOffset) {
                    if (ch == __input[__currentOffset]) {
                        if (__tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        }
                        ++__currentOffset;
                        while (__currentOffset < endOffset
                                && __input[__currentOffset] == ch)
                            ++__currentOffset;
                    }
                    ++__currentOffset;
                }
            } else {

                // Try at every occurrence of the whole start string.
                while ((__currentOffset = __findFirst(__input,
                        __currentOffset, endOffset, mustString)) < endOffset) {
                    if (__tryExpression(__currentOffset)) {
                        success = true;
                        break _mainLoop;
                    }
                    ++__currentOffset;
                }
            }

            break _mainLoop;
        }

        // Pattern starts with a character class or boundary opcode: use the
        // opcode at _startClassOffset to pick candidate positions.
        if ((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
            boolean doEvery, tmp;
            char op;

            // In the class cases below, tmp gates whether an attempt is
            // made at a qualifying position. After a failed attempt it
            // becomes doEvery, so with _OPT_SKIP set the following
            // qualifying positions are skipped until a non-qualifying
            // character sets tmp back to true.
            doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);

            if (minLength > 0)
                dontTry = minLength - 1;
            endOffset -= dontTry;
            tmp = true;

            switch (op = __program[offset]) {
            case OpCode._ANYOF:
                // A position qualifies when the character is below 256 and
                // its bit in the bitmap at the operand is clear.
                offset = OpCode._getOperand(offset);
                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];

                    if (ch < 256
                            && (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }

                break;

            case OpCode._ANYOFUN:
            case OpCode._NANYOFUN:
                // Same scheme, with membership decided by
                // __matchUnicodeClass.
                offset = OpCode._getOperand(offset);
                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];

                    if (__matchUnicodeClass(ch, __program, offset,
                            op)) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }

                break;

            case OpCode._BOUND:
                if (minLength > 0) {
                    ++dontTry;
                    --endOffset;
                }

                // Here tmp tracks whether the preceding character is a word
                // character; an attempt is made at each transition.
                if (__currentOffset != beginOffset) {
                    ch = __input[__currentOffset - 1];
                    tmp = OpCode._isWordCharacter(ch);
                } else
                    tmp = OpCode._isWordCharacter(__previousChar);

                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];
                    if (tmp != OpCode._isWordCharacter(ch)) {
                        tmp = !tmp;
                        if (__tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        }
                    }
                    ++__currentOffset;
                }

                // One last attempt at the position where the scan stopped.
                if ((minLength > 0 || tmp)
                        && __tryExpression(__currentOffset)) {
                    success = true;
                    break _mainLoop;
                }
                break;

            case OpCode._NBOUND:
                if (minLength > 0) {
                    ++dontTry;
                    --endOffset;
                }

                // Mirror image of _BOUND: attempts are made where there is
                // no word/non-word transition.
                if (__currentOffset != beginOffset) {
                    ch = __input[__currentOffset - 1];
                    tmp = OpCode._isWordCharacter(ch);
                } else
                    tmp = OpCode._isWordCharacter(__previousChar);

                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];
                    if (tmp != OpCode._isWordCharacter(ch))
                        tmp = !tmp;
                    else if (__tryExpression(__currentOffset)) {
                        success = true;
                        break _mainLoop;
                    }

                    ++__currentOffset;
                }

                if ((minLength > 0 || !tmp)
                        && __tryExpression(__currentOffset)) {
                    success = true;
                    break _mainLoop;
                }
                break;

            case OpCode._ALNUM:
                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];
                    if (OpCode._isWordCharacter(ch)) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;

            case OpCode._NALNUM:
                while (__currentOffset < endOffset) {
                    ch = __input[__currentOffset];
                    if (!OpCode._isWordCharacter(ch)) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;

            case OpCode._SPACE:
                while (__currentOffset < endOffset) {
                    if (Character
                            .isWhitespace(__input[__currentOffset])) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;

            case OpCode._NSPACE:
                while (__currentOffset < endOffset) {
                    if (!Character
                            .isWhitespace(__input[__currentOffset])) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;

            case OpCode._DIGIT:
                while (__currentOffset < endOffset) {
                    if (Character.isDigit(__input[__currentOffset])) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;

            case OpCode._NDIGIT:
                while (__currentOffset < endOffset) {
                    if (!Character
                            .isDigit(__input[__currentOffset])) {
                        if (tmp && __tryExpression(__currentOffset)) {
                            success = true;
                            break _mainLoop;
                        } else
                            tmp = doEvery;
                    } else
                        tmp = true;
                    ++__currentOffset;
                }
                break;
            } // end switch

        } else {
            // No start information at all: try every offset from the
            // current one up to and including the (shortened) end offset.
            if (minLength > 0)
                dontTry = minLength - 1;
            endOffset -= dontTry;

            do {
                if (__tryExpression(__currentOffset)) {
                    success = true;
                    break _mainLoop;
                }
            } while (__currentOffset++ < endOffset);

        }

        break _mainLoop;
    } // end while

    __lastSuccess = success;
    __lastMatchResult = null;

    return success;
}
0622:
0623: private boolean __matchUnicodeClass(char code, char __program[],
0624: int offset, char opcode) {
0625: boolean isANYOF = (opcode == OpCode._ANYOFUN);
0626:
0627: while (__program[offset] != OpCode._END) {
0628: if (__program[offset] == OpCode._RANGE) {
0629: offset++;
0630: if ((code >= __program[offset])
0631: && (code <= __program[offset + 1])) {
0632: return isANYOF;
0633: } else {
0634: offset += 2;
0635: }
0636:
0637: } else if (__program[offset] == OpCode._ONECHAR) {
0638: offset++;
0639: if (__program[offset++] == code)
0640: return isANYOF;
0641:
0642: } else {
0643: isANYOF = (__program[offset] == OpCode._OPCODE) ? isANYOF
0644: : !isANYOF;
0645:
0646: offset++;
0647: switch (__program[offset++]) {
0648: case OpCode._ALNUM:
0649: if (OpCode._isWordCharacter(code))
0650: return isANYOF;
0651: break;
0652: case OpCode._NALNUM:
0653: if (!OpCode._isWordCharacter(code))
0654: return isANYOF;
0655: break;
0656: case OpCode._SPACE:
0657: if (Character.isWhitespace(code))
0658: return isANYOF;
0659: break;
0660: case OpCode._NSPACE:
0661: if (!Character.isWhitespace(code))
0662: return isANYOF;
0663: break;
0664: case OpCode._DIGIT:
0665: if (Character.isDigit(code))
0666: return isANYOF;
0667: break;
0668: case OpCode._NDIGIT:
0669: if (!Character.isDigit(code))
0670: return isANYOF;
0671: break;
0672: case OpCode._ALNUMC:
0673: if (Character.isLetterOrDigit(code))
0674: return isANYOF;
0675: break;
0676: case OpCode._ALPHA:
0677: if (Character.isLetter(code))
0678: return isANYOF;
0679: break;
0680: case OpCode._BLANK:
0681: if (Character.isSpaceChar(code))
0682: return isANYOF;
0683: break;
0684: case OpCode._CNTRL:
0685: if (Character.isISOControl(code))
0686: return isANYOF;
0687: break;
0688: case OpCode._LOWER:
0689: if (Character.isLowerCase(code))
0690: return isANYOF;
0691: // Remove this hack after more efficient case-folding and unicode
0692: // character classes are implemented
0693: if (__caseInsensitive
0694: && Character.isUpperCase(code))
0695: return isANYOF;
0696: break;
0697: case OpCode._UPPER:
0698: if (Character.isUpperCase(code))
0699: return isANYOF;
0700: // Remove this hack after more efficient case-folding and unicode
0701: // character classes are implemented
0702: if (__caseInsensitive
0703: && Character.isLowerCase(code))
0704: return isANYOF;
0705: break;
0706: case OpCode._PRINT:
0707: if (Character.isSpaceChar(code))
0708: return isANYOF;
0709: // Fall through to check if the character is alphanumeric,
0710: // or a punctuation mark. Printable characters are either
0711: // alphanumeric, punctuation marks, or spaces.
0712: case OpCode._GRAPH:
0713: if (Character.isLetterOrDigit(code))
0714: return isANYOF;
0715: // Fall through to check if the character is a punctuation mark.
0716: // Graph characters are either alphanumeric or punctuation.
0717: case OpCode._PUNCT:
0718: switch (Character.getType(code)) {
0719: case Character.DASH_PUNCTUATION:
0720: case Character.START_PUNCTUATION:
0721: case Character.END_PUNCTUATION:
0722: case Character.CONNECTOR_PUNCTUATION:
0723: case Character.OTHER_PUNCTUATION:
0724: case Character.MATH_SYMBOL:
0725: case Character.CURRENCY_SYMBOL:
0726: case Character.MODIFIER_SYMBOL:
0727: return isANYOF;
0728: default:
0729: break;
0730: }
0731: break;
0732: case OpCode._XDIGIT:
0733: if ((code >= '0' && code <= '9')
0734: || (code >= 'a' && code <= 'f')
0735: || (code >= 'A' && code <= 'F'))
0736: return isANYOF;
0737: break;
0738: case OpCode._ASCII:
0739: if (code < 0x80)
0740: return isANYOF;
0741: }
0742: }
0743: }
0744: return !isANYOF;
0745: }
0746:
0747: private boolean __tryExpression(int offset) {
0748: int count;
0749:
0750: __inputOffset = offset;
0751: __lastParen = 0;
0752: __expSize = 0;
0753:
0754: if (__numParentheses > 0) {
0755: for (count = 0; count <= __numParentheses; count++) {
0756: __beginMatchOffsets[count] = OpCode._NULL_OFFSET;
0757: __endMatchOffsets[count] = OpCode._NULL_OFFSET;
0758: }
0759: }
0760:
0761: if (__match(1)) {
0762: __beginMatchOffsets[0] = offset;
0763: __endMatchOffsets[0] = __inputOffset;
0764: return true;
0765: }
0766:
0767: return false;
0768: }
0769:
0770: private int __repeat(int offset, int max) {
0771: int scan, eol, operand, ret;
0772: char ch;
0773: char op;
0774:
0775: scan = __inputOffset;
0776: eol = __eol;
0777:
0778: if (max != Character.MAX_VALUE && max < eol - scan)
0779: eol = scan + max;
0780:
0781: operand = OpCode._getOperand(offset);
0782:
0783: switch (op = __program[offset]) {
0784:
0785: case OpCode._ANY:
0786: while (scan < eol && __input[scan] != '\n')
0787: ++scan;
0788: break;
0789:
0790: case OpCode._SANY:
0791: scan = eol;
0792: break;
0793:
0794: case OpCode._EXACTLY:
0795: ++operand;
0796: while (scan < eol && __program[operand] == __input[scan])
0797: ++scan;
0798: break;
0799:
0800: case OpCode._ANYOF:
0801: if (scan < eol && (ch = __input[scan]) < 256) {
0802: while ((ch < 256)
0803: && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
0804: if (++scan < eol)
0805: ch = __input[scan];
0806: else
0807: break;
0808: }
0809: }
0810: break;
0811:
0812: case OpCode._ANYOFUN:
0813: case OpCode._NANYOFUN:
0814: if (scan < eol) {
0815: ch = __input[scan];
0816: while (__matchUnicodeClass(ch, __program, operand, op)) {
0817: if (++scan < eol)
0818: ch = __input[scan];
0819: else
0820: break;
0821: }
0822: }
0823: break;
0824:
0825: case OpCode._ALNUM:
0826: while (scan < eol && OpCode._isWordCharacter(__input[scan]))
0827: ++scan;
0828: break;
0829:
0830: case OpCode._NALNUM:
0831: while (scan < eol
0832: && !OpCode._isWordCharacter(__input[scan]))
0833: ++scan;
0834: break;
0835:
0836: case OpCode._SPACE:
0837: while (scan < eol && Character.isWhitespace(__input[scan]))
0838: ++scan;
0839: break;
0840:
0841: case OpCode._NSPACE:
0842: while (scan < eol && !Character.isWhitespace(__input[scan]))
0843: ++scan;
0844: break;
0845:
0846: case OpCode._DIGIT:
0847: while (scan < eol && Character.isDigit(__input[scan]))
0848: ++scan;
0849: break;
0850:
0851: case OpCode._NDIGIT:
0852: while (scan < eol && !Character.isDigit(__input[scan]))
0853: ++scan;
0854: break;
0855:
0856: default:
0857: break;
0858:
0859: }
0860:
0861: ret = scan - __inputOffset;
0862: __inputOffset = scan;
0863:
0864: return ret;
0865: }
0866:
0867: private boolean __match(int offset) {
0868: char nextChar, op;
0869: int scan, next, input, maxScan, current, line, arg;
0870: boolean inputRemains = true, minMod = false;
0871: Perl5Repetition rep;
0872:
0873: input = __inputOffset;
0874: inputRemains = (input < __endOffset);
0875: nextChar = (inputRemains ? __input[input] : __EOS);
0876:
0877: scan = offset;
0878: maxScan = __program.length;
0879:
0880: while (scan < maxScan /*&& scan > 0*/) {
0881: next = OpCode._getNext(__program, scan);
0882:
0883: switch (op = __program[scan]) {
0884:
0885: case OpCode._BOL:
0886: if (input == __bol ? __previousChar == '\n'
0887: : (__multiline
0888: && (inputRemains || input < __eol) && __input[input - 1] == '\n'))
0889: break;
0890: return false;
0891:
0892: case OpCode._MBOL:
0893: if (input == __bol ? __previousChar == '\n'
0894: : ((inputRemains || input < __eol) && __input[input - 1] == '\n'))
0895: break;
0896: return false;
0897:
0898: case OpCode._SBOL:
0899: if (input == __bol && __previousChar == '\n')
0900: break;
0901: return false;
0902:
0903: case OpCode._GBOL:
0904: if (input == __bol)
0905: break;
0906: return true;
0907:
0908: case OpCode._EOL:
0909: if ((inputRemains || input < __eol) && nextChar != '\n')
0910: return false;
0911: if (!__multiline && __eol - input > 1)
0912: return false;
0913: break;
0914:
0915: case OpCode._MEOL:
0916: if ((inputRemains || input < __eol) && nextChar != '\n')
0917: return false;
0918: break;
0919:
0920: case OpCode._SEOL:
0921: if ((inputRemains || input < __eol) && nextChar != '\n')
0922: return false;
0923: if (__eol - input > 1)
0924: return false;
0925: break;
0926:
0927: case OpCode._SANY:
0928: if (!inputRemains && input >= __eol)
0929: return false;
0930: inputRemains = (++input < __endOffset);
0931: nextChar = (inputRemains ? __input[input] : __EOS);
0932: break;
0933:
0934: case OpCode._ANY:
0935: if ((!inputRemains && input >= __eol)
0936: || nextChar == '\n')
0937: return false;
0938: inputRemains = (++input < __endOffset);
0939: nextChar = (inputRemains ? __input[input] : __EOS);
0940: break;
0941:
0942: case OpCode._EXACTLY:
0943: current = OpCode._getOperand(scan);
0944: line = __program[current++];
0945:
0946: if (__program[current] != nextChar)
0947: return false;
0948: if (__eol - input < line)
0949: return false;
0950:
0951: if (line > 1
0952: && !__compare(__program, current, __input,
0953: input, line))
0954: return false;
0955:
0956: input += line;
0957: inputRemains = (input < __endOffset);
0958: nextChar = (inputRemains ? __input[input] : __EOS);
0959: break;
0960:
0961: case OpCode._ANYOF:
0962: current = OpCode._getOperand(scan);
0963:
0964: if (nextChar == __EOS && inputRemains)
0965: nextChar = __input[input];
0966:
0967: if (nextChar >= 256
0968: || (__program[current + (nextChar >> 4)] & (1 << (nextChar & 0xf))) != 0)
0969: return false;
0970:
0971: if (!inputRemains && input >= __eol)
0972: return false;
0973:
0974: inputRemains = (++input < __endOffset);
0975: nextChar = (inputRemains ? __input[input] : __EOS);
0976: break;
0977:
0978: case OpCode._ANYOFUN:
0979: case OpCode._NANYOFUN:
0980: current = OpCode._getOperand(scan);
0981:
0982: if (nextChar == __EOS && inputRemains)
0983: nextChar = __input[input];
0984:
0985: if (!__matchUnicodeClass(nextChar, __program, current,
0986: op))
0987: return false;
0988:
0989: if (!inputRemains && input >= __eol)
0990: return false;
0991:
0992: inputRemains = (++input < __endOffset);
0993: nextChar = (inputRemains ? __input[input] : __EOS);
0994: break;
0995:
0996: case OpCode._ALNUM:
0997: if (!inputRemains)
0998: return false;
0999: if (!OpCode._isWordCharacter(nextChar))
1000: return false;
1001: inputRemains = (++input < __endOffset);
1002: nextChar = (inputRemains ? __input[input] : __EOS);
1003: break;
1004:
1005: case OpCode._NALNUM:
1006: if (!inputRemains && input >= __eol)
1007: return false;
1008: if (OpCode._isWordCharacter(nextChar))
1009: return false;
1010: inputRemains = (++input < __endOffset);
1011: nextChar = (inputRemains ? __input[input] : __EOS);
1012: break;
1013:
1014: case OpCode._NBOUND:
1015: case OpCode._BOUND:
1016: boolean a,
1017: b;
1018:
1019: if (input == __bol)
1020: a = OpCode._isWordCharacter(__previousChar);
1021: else
1022: a = OpCode._isWordCharacter(__input[input - 1]);
1023:
1024: b = OpCode._isWordCharacter(nextChar);
1025:
1026: if ((a == b) == (__program[scan] == OpCode._BOUND))
1027: return false;
1028: break;
1029:
1030: case OpCode._SPACE:
1031: if (!inputRemains && input >= __eol)
1032: return false;
1033: if (!Character.isWhitespace(nextChar))
1034: return false;
1035: inputRemains = (++input < __endOffset);
1036: nextChar = (inputRemains ? __input[input] : __EOS);
1037: break;
1038:
1039: case OpCode._NSPACE:
1040: if (!inputRemains)
1041: return false;
1042: if (Character.isWhitespace(nextChar))
1043: return false;
1044: inputRemains = (++input < __endOffset);
1045: nextChar = (inputRemains ? __input[input] : __EOS);
1046: break;
1047:
1048: case OpCode._DIGIT:
1049: if (!Character.isDigit(nextChar))
1050: return false;
1051: inputRemains = (++input < __endOffset);
1052: nextChar = (inputRemains ? __input[input] : __EOS);
1053: break;
1054:
1055: case OpCode._NDIGIT:
1056: if (!inputRemains && input >= __eol)
1057: return false;
1058: if (Character.isDigit(nextChar))
1059: return false;
1060: inputRemains = (++input < __endOffset);
1061: nextChar = (inputRemains ? __input[input] : __EOS);
1062: break;
1063:
1064: case OpCode._REF:
1065: arg = OpCode._getArg1(__program, scan);
1066: current = __beginMatchOffsets[arg];
1067:
1068: if (current == OpCode._NULL_OFFSET)
1069: return false;
1070:
1071: if (__endMatchOffsets[arg] == OpCode._NULL_OFFSET)
1072: return false;
1073:
1074: if (current == __endMatchOffsets[arg])
1075: break;
1076:
1077: if (__input[current] != nextChar)
1078: return false;
1079:
1080: line = __endMatchOffsets[arg] - current;
1081:
1082: if (input + line > __eol)
1083: return false;
1084:
1085: if (line > 1
1086: && !__compare(__input, current, __input, input,
1087: line))
1088: return false;
1089:
1090: input += line;
1091: inputRemains = (input < __endOffset);
1092: nextChar = (inputRemains ? __input[input] : __EOS);
1093: break;
1094:
1095: case OpCode._NOTHING:
1096: break;
1097:
1098: case OpCode._BACK:
1099: break;
1100:
1101: case OpCode._OPEN:
1102: arg = OpCode._getArg1(__program, scan);
1103: __beginMatchOffsets[arg] = input;
1104:
1105: if (arg > __expSize)
1106: __expSize = arg;
1107: break;
1108:
1109: case OpCode._CLOSE:
1110: arg = OpCode._getArg1(__program, scan);
1111: __endMatchOffsets[arg] = input;
1112:
1113: if (arg > __lastParen)
1114: __lastParen = arg;
1115: break;
1116:
1117: case OpCode._CURLYX:
1118: rep = new Perl5Repetition();
1119: rep._lastRepetition = __currentRep;
1120: __currentRep = rep;
1121:
1122: rep._parenFloor = __lastParen;
1123: rep._numInstances = -1;
1124: rep._min = OpCode._getArg1(__program, scan);
1125: rep._max = OpCode._getArg2(__program, scan);
1126: rep._scan = OpCode._getNextOperator(scan) + 2;
1127: rep._next = next;
1128: rep._minMod = minMod;
1129: // Must initialize to -1 because if we initialize to 0 and are
1130: // at the beginning of the input the OpCode._WHILEM case will
1131: // not work right.
1132: rep._lastLocation = -1;
1133: __inputOffset = input;
1134:
1135: // use minMod as temporary
1136: minMod = __match(OpCode._getPrevOperator(next));
1137:
1138: // leave scope call not pertinent?
1139: __currentRep = rep._lastRepetition;
1140: return minMod;
1141:
1142: case OpCode._WHILEM:
1143: rep = __currentRep;
1144:
1145: arg = rep._numInstances + 1;
1146: __inputOffset = input;
1147:
1148: if (input == rep._lastLocation) {
1149: __currentRep = rep._lastRepetition;
1150: line = __currentRep._numInstances;
1151: if (__match(rep._next))
1152: return true;
1153: __currentRep._numInstances = line;
1154: __currentRep = rep;
1155: return false;
1156: }
1157:
1158: if (arg < rep._min) {
1159: rep._numInstances = arg;
1160: rep._lastLocation = input;
1161: if (__match(rep._scan))
1162: return true;
1163: rep._numInstances = arg - 1;
1164: return false;
1165: }
1166:
1167: if (rep._minMod) {
1168: __currentRep = rep._lastRepetition;
1169: line = __currentRep._numInstances;
1170: if (__match(rep._next))
1171: return true;
1172: __currentRep._numInstances = line;
1173: __currentRep = rep;
1174:
1175: if (arg >= rep._max)
1176: return false;
1177:
1178: __inputOffset = input;
1179: rep._numInstances = arg;
1180: rep._lastLocation = input;
1181:
1182: if (__match(rep._scan))
1183: return true;
1184:
1185: rep._numInstances = arg - 1;
1186: return false;
1187: }
1188:
1189: if (arg < rep._max) {
1190: __pushState(rep._parenFloor);
1191: rep._numInstances = arg;
1192: rep._lastLocation = input;
1193: if (__match(rep._scan))
1194: return true;
1195: __popState();
1196: __inputOffset = input;
1197: }
1198:
1199: __currentRep = rep._lastRepetition;
1200: line = __currentRep._numInstances;
1201: if (__match(rep._next))
1202: return true;
1203:
1204: rep._numInstances = line;
1205: __currentRep = rep;
1206: rep._numInstances = arg - 1;
1207: return false;
1208:
1209: case OpCode._BRANCH:
1210: if (__program[next] != OpCode._BRANCH)
1211: next = OpCode._getNextOperator(scan);
1212: else {
1213: int lastParen;
1214:
1215: lastParen = __lastParen;
1216:
1217: do {
1218:
1219: __inputOffset = input;
1220:
1221: if (__match(OpCode._getNextOperator(scan)))
1222: return true;
1223:
1224: for (arg = __lastParen; arg > lastParen; --arg)
1225: //__endMatchOffsets[arg] = 0;
1226: __endMatchOffsets[arg] = OpCode._NULL_OFFSET;
1227: __lastParen = arg;
1228:
1229: scan = OpCode._getNext(__program, scan);
1230: } while (scan != OpCode._NULL_OFFSET
1231: && __program[scan] == OpCode._BRANCH);
1232: return false;
1233: }
1234:
1235: break;
1236:
1237: case OpCode._MINMOD:
1238: minMod = true;
1239: break;
1240:
1241: case OpCode._CURLY:
1242: case OpCode._STAR:
1243: case OpCode._PLUS:
1244: if (op == OpCode._CURLY) {
1245: line = OpCode._getArg1(__program, scan);
1246: arg = OpCode._getArg2(__program, scan);
1247: scan = OpCode._getNextOperator(scan) + 2;
1248: } else if (op == OpCode._STAR) {
1249: line = 0;
1250: arg = Character.MAX_VALUE;
1251: scan = OpCode._getNextOperator(scan);
1252: } else {
1253: line = 1;
1254: arg = Character.MAX_VALUE;
1255: scan = OpCode._getNextOperator(scan);
1256: }
1257:
1258: if (__program[next] == OpCode._EXACTLY) {
1259: nextChar = __program[OpCode._getOperand(next) + 1];
1260: current = 0;
1261: } else {
1262: nextChar = __EOS;
1263: current = -1000;
1264: }
1265: __inputOffset = input;
1266:
1267: if (minMod) {
1268: minMod = false;
1269:
1270: if (line > 0 && __repeat(scan, line) < line)
1271: return false;
1272:
1273: while (arg >= line
1274: || (arg == Character.MAX_VALUE && line > 0)) {
1275: // there may be a bug here with respect to
1276: // __inputOffset >= __endOffset, but it seems to be right for
1277: // now. the issue is with __inputOffset being reset later.
1278: // is this test really supposed to happen here?
1279: if (current == -1000
1280: || __inputOffset >= __endOffset
1281: || __input[__inputOffset] == nextChar) {
1282: if (__match(next))
1283: return true;
1284: }
1285:
1286: __inputOffset = input + line;
1287:
1288: if (__repeat(scan, 1) != 0) {
1289: ++line;
1290: __inputOffset = input + line;
1291: } else
1292: return false;
1293: }
1294:
1295: } else {
1296: arg = __repeat(scan, arg);
1297:
1298: if (line < arg
1299: && OpCode._opType[__program[next]] == OpCode._EOL
1300: && ((!__multiline && __program[next] != OpCode._MEOL) || __program[next] == OpCode._SEOL))
1301: line = arg;
1302:
1303: while (arg >= line) {
1304: // there may be a bug here with respect to
1305: // __inputOffset >= __endOffset, but it seems to be right for
1306: // now. the issue is with __inputOffset being reset later.
1307: // is this test really supposed to happen here?
1308: if (current == -1000
1309: || __inputOffset >= __endOffset
1310: || __input[__inputOffset] == nextChar) {
1311: if (__match(next))
1312: return true;
1313: }
1314:
1315: --arg;
1316: __inputOffset = input + arg;
1317: }
1318: }
1319:
1320: return false;
1321:
1322: case OpCode._SUCCEED:
1323: case OpCode._END:
1324: __inputOffset = input;
1325: // This enforces the rule that two consecutive matches cannot have
1326: // the same end offset.
1327: if (__inputOffset == __lastMatchInputEndOffset)
1328: return false;
1329: return true;
1330:
1331: case OpCode._IFMATCH:
1332: __inputOffset = input;
1333: scan = OpCode._getNextOperator(scan);
1334: if (!__match(scan))
1335: return false;
1336: break;
1337:
1338: case OpCode._UNLESSM:
1339: __inputOffset = input;
1340: scan = OpCode._getNextOperator(scan);
1341: if (__match(scan))
1342: return false;
1343: break;
1344:
1345: default:
1346: // todo: Need to throw an exception here.
1347:
1348: } // end switch
1349:
1350: //scan = (next > 0 ? next : 0);
1351: scan = next;
1352: } // end while scan
1353:
1354: return false;
1355: }
1356:
1357: /**
1358: * Set whether or not subsequent calls to {@link #matches matches()}
1359: * or {@link #contains contains()} should treat the input as
1360: * consisting of multiple lines. The default behavior is for
1361: * input to be treated as consisting of multiple lines. This method
1362: * should only be called if the Perl5Pattern used for a match was
1363: * compiled without either of the Perl5Compiler.MULTILINE_MASK or
1364: * Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the
1365: * behavior of how the <b>^</b>, <b>$</b>, and <b>.</b> metacharacters are
1366: * interpreted on the fly. The compilation options used when compiling
1367: * a pattern ALWAYS override the behavior specified by setMultiline(). See
1368: * {@link Perl5Compiler} for more details.
1369: * <p>
1370: * @param multiline If set to true treats the input as consisting of
1371: * multiple lines with respect to the <b>^</b> and <b>$</b>
1372: * metacharacters. If set to false treats the input as consisting
1373: * of a single line with respect to the <b>^</b> and <b>$</b>
1374: * metacharacters.
1375: */
1376: public void setMultiline(boolean multiline) {
1377: __multiline = multiline;
1378: }
1379:
1380: /**
1381: * @return True if the matcher is treating input as consisting of multiple
1382: * lines with respect to the <b>^</b> and <b>$</b> metacharacters,
1383: * false otherwise.
1384: */
1385: public boolean isMultiline() {
1386: return __multiline;
1387: }
1388:
1389: char[] _toLower(char[] input) {
1390: int current;
1391: char[] inp;
1392: // todo:
1393: // Certainly not the best way to do case insensitive matching.
1394: // Must definitely change this in some way, but for now we
1395: // do what Perl does and make a copy of the input, converting
1396: // it all to lowercase. This is truly better handled in the
1397: // compilation phase.
1398: inp = new char[input.length];
1399: System.arraycopy(input, 0, inp, 0, input.length);
1400: input = inp;
1401:
1402: // todo: Need to inline toLowerCase()
1403: for (current = 0; current < input.length; current++)
1404: if (Character.isUpperCase(input[current]))
1405: input[current] = Character.toLowerCase(input[current]);
1406:
1407: return input;
1408: }
1409:
1410: /**
1411: * Determines if a prefix of a string (represented as a char[])
1412: * matches a given pattern, starting from a given offset into the string.
1413: * If a prefix of the string matches the pattern, a MatchResult instance
1414: * representing the match is made accesible via
1415: * {@link #getMatch()}.
1416: * <p>
1417: * This method is useful for certain common token identification tasks
1418: * that are made more difficult without this functionality.
1419: * <p>
1420: * @param input The char[] to test for a prefix match.
1421: * @param pattern The Pattern to be matched.
1422: * @param offset The offset at which to start searching for the prefix.
1423: * @return True if input matches pattern, false otherwise.
1424: */
1425: public boolean matchesPrefix(char[] input, Pattern pattern,
1426: int offset) {
1427: Perl5Pattern expression;
1428:
1429: expression = (Perl5Pattern) pattern;
1430: __originalInput = input;
1431: if (expression._isCaseInsensitive)
1432: input = _toLower(input);
1433:
1434: __initInterpreterGlobals(expression, input, 0, input.length,
1435: offset);
1436:
1437: __lastSuccess = __tryExpression(offset);
1438: __lastMatchResult = null;
1439:
1440: return __lastSuccess;
1441: }
1442:
1443: /**
1444: * Determines if a prefix of a string (represented as a char[])
1445: * matches a given pattern.
1446: * If a prefix of the string matches the pattern, a MatchResult instance
1447: * representing the match is made accesible via
1448: * {@link #getMatch()}.
1449: * <p>
1450: * This method is useful for certain common token identification tasks
1451: * that are made more difficult without this functionality.
1452: * <p>
1453: * @param input The char[] to test for a prefix match.
1454: * @param pattern The Pattern to be matched.
1455: * @return True if input matches pattern, false otherwise.
1456: */
1457: public boolean matchesPrefix(char[] input, Pattern pattern) {
1458: return matchesPrefix(input, pattern, 0);
1459: }
1460:
1461: /**
1462: * Determines if a prefix of a string matches a given pattern.
1463: * If a prefix of the string matches the pattern, a MatchResult instance
1464: * representing the match is made accesible via
1465: * {@link #getMatch()}.
1466: * <p>
1467: * This method is useful for certain common token identification tasks
1468: * that are made more difficult without this functionality.
1469: * <p>
1470: * @param input The String to test for a prefix match.
1471: * @param pattern The Pattern to be matched.
1472: * @return True if input matches pattern, false otherwise.
1473: */
1474: public boolean matchesPrefix(String input, Pattern pattern) {
1475: return matchesPrefix(input.toCharArray(), pattern, 0);
1476: }
1477:
1478: /**
1479: * Determines if a prefix of a PatternMatcherInput instance
1480: * matches a given pattern. If there is a match, a MatchResult instance
1481: * representing the match is made accesible via
1482: * {@link #getMatch()}. Unlike the
1483: * {@link #contains(PatternMatcherInput, Pattern)}
1484: * method, the current offset of the PatternMatcherInput argument
1485: * is not updated. However, unlike the
1486: * {@link #matches matches(PatternMatcherInput, Pattern)} method,
1487: * matchesPrefix() will start its search from the current offset
1488: * rather than the begin offset of the PatternMatcherInput.
1489: * <p>
1490: * This method is useful for certain common token identification tasks
1491: * that are made more difficult without this functionality.
1492: * <p>
1493: * @param input The PatternMatcherInput to test for a prefix match.
1494: * @param pattern The Pattern to be matched.
1495: * @return True if input matches pattern, false otherwise.
1496: */
1497: public boolean matchesPrefix(PatternMatcherInput input,
1498: Pattern pattern) {
1499: char[] inp;
1500: Perl5Pattern expression;
1501:
1502: expression = (Perl5Pattern) pattern;
1503:
1504: __originalInput = input._originalBuffer;
1505: if (expression._isCaseInsensitive) {
1506: if (input._toLowerBuffer == null)
1507: input._toLowerBuffer = _toLower(__originalInput);
1508: inp = input._toLowerBuffer;
1509: } else
1510: inp = __originalInput;
1511:
1512: __initInterpreterGlobals(expression, inp, input._beginOffset,
1513: input._endOffset, input._currentOffset);
1514: __lastSuccess = __tryExpression(input._currentOffset);
1515: __lastMatchResult = null;
1516:
1517: return __lastSuccess;
1518: }
1519:
1520: /**
1521: * Determines if a string (represented as a char[]) exactly
1522: * matches a given pattern. If
1523: * there is an exact match, a MatchResult instance
1524: * representing the match is made accesible via
1525: * {@link #getMatch()}. The pattern must be
1526: * a Perl5Pattern instance, otherwise a ClassCastException will
1527: * be thrown. You are not required to, and indeed should NOT try to
1528: * (for performance reasons), catch a ClassCastException because it
1529: * will never be thrown as long as you use a Perl5Pattern as the pattern
1530: * parameter.
1531: * <p>
1532: * <b>Note:</b> matches() is not the same as sticking a ^ in front of
1533: * your expression and a $ at the end of your expression in Perl5
1534: * and using the =~ operator, even though in many cases it will be
1535: * equivalent. matches() literally looks for an exact match according
1536: * to the rules of Perl5 expression matching. Therefore, if you have
1537: * a pattern <em>foo|foot</em> and are matching the input <em>foot</em>
1538: * it will not produce an exact match. But <em>foot|foo</em> will
1539: * produce an exact match for either <em>foot</em> or <em>foo</em>.
1540: * Remember, Perl5 regular expressions do not match the longest
1541: * possible match. From the perlre manpage:
1542: * <blockquote>
1543: * Alternatives are tried from left to right, so the first
1544: * alternative found for which the entire expression matches,
1545: * is the one that is chosen. This means that alternatives
1546: * are not necessarily greedy. For example: when matching
1547: * foo|foot against "barefoot", only the "foo" part will
1548: * match, as that is the first alternative tried, and it
1549: * successfully matches the target string.
1550: * </blockquote>
1551: * <p>
1552: * @param input The char[] to test for an exact match.
1553: * @param pattern The Perl5Pattern to be matched.
1554: * @return True if input matches pattern, false otherwise.
1555: * @exception ClassCastException If a Pattern instance other than a
1556: * Perl5Pattern is passed as the pattern parameter.
1557: */
1558: public boolean matches(char[] input, Pattern pattern) {
1559: Perl5Pattern expression;
1560:
1561: expression = (Perl5Pattern) pattern;
1562: __originalInput = input;
1563: if (expression._isCaseInsensitive)
1564: input = _toLower(input);
1565:
1566: __initInterpreterGlobals(expression, input, 0, input.length, 0);
1567: __lastSuccess = (__tryExpression(0) && __endMatchOffsets[0] == input.length);
1568: __lastMatchResult = null;
1569:
1570: return __lastSuccess;
1571: }
1572:
1573: /**
1574: * Determines if a string exactly matches a given pattern. If
1575: * there is an exact match, a MatchResult instance
1576: * representing the match is made accesible via
1577: * {@link #getMatch()}. The pattern must be
1578: * a Perl5Pattern instance, otherwise a ClassCastException will
1579: * be thrown. You are not required to, and indeed should NOT try to
1580: * (for performance reasons), catch a ClassCastException because it
1581: * will never be thrown as long as you use a Perl5Pattern as the pattern
1582: * parameter.
1583: * <p>
1584: * <b>Note:</b> matches() is not the same as sticking a ^ in front of
1585: * your expression and a $ at the end of your expression in Perl5
1586: * and using the =~ operator, even though in many cases it will be
1587: * equivalent. matches() literally looks for an exact match according
1588: * to the rules of Perl5 expression matching. Therefore, if you have
1589: * a pattern <em>foo|foot</em> and are matching the input <em>foot</em>
1590: * it will not produce an exact match. But <em>foot|foo</em> will
1591: * produce an exact match for either <em>foot</em> or <em>foo</em>.
1592: * Remember, Perl5 regular expressions do not match the longest
1593: * possible match. From the perlre manpage:
1594: * <blockquote>
1595: * Alternatives are tried from left to right, so the first
1596: * alternative found for which the entire expression matches,
1597: * is the one that is chosen. This means that alternatives
1598: * are not necessarily greedy. For example: when matching
1599: * foo|foot against "barefoot", only the "foo" part will
1600: * match, as that is the first alternative tried, and it
1601: * successfully matches the target string.
1602: * </blockquote>
1603: * <p>
1604: * @param input The String to test for an exact match.
1605: * @param pattern The Perl5Pattern to be matched.
1606: * @return True if input matches pattern, false otherwise.
1607: * @exception ClassCastException If a Pattern instance other than a
1608: * Perl5Pattern is passed as the pattern parameter.
1609: */
1610: public boolean matches(String input, Pattern pattern) {
1611: return matches(input.toCharArray(), pattern);
1612: }
1613:
1614: /**
1615: * Determines if the contents of a PatternMatcherInput instance
1616: * exactly matches a given pattern. If
1617: * there is an exact match, a MatchResult instance
1618: * representing the match is made accesible via
1619: * {@link #getMatch()}. Unlike the
1620: * {@link #contains(PatternMatcherInput, Pattern)}
1621: * method, the current offset of the PatternMatcherInput argument
1622: * is not updated. You should remember that the region between
1623: * the begin (NOT the current) and end offsets of the PatternMatcherInput
1624: * will be tested for an exact match.
1625: * <p>
1626: * The pattern must be a Perl5Pattern instance, otherwise a
1627: * ClassCastException will be thrown. You are not required to, and
1628: * indeed should NOT try to (for performance reasons), catch a
1629: * ClassCastException because it will never be thrown as long as you use
1630: * a Perl5Pattern as the pattern parameter.
1631: * <p>
1632: * <b>Note:</b> matches() is not the same as sticking a ^ in front of
1633: * your expression and a $ at the end of your expression in Perl5
1634: * and using the =~ operator, even though in many cases it will be
1635: * equivalent. matches() literally looks for an exact match according
1636: * to the rules of Perl5 expression matching. Therefore, if you have
1637: * a pattern <em>foo|foot</em> and are matching the input <em>foot</em>
1638: * it will not produce an exact match. But <em>foot|foo</em> will
1639: * produce an exact match for either <em>foot</em> or <em>foo</em>.
1640: * Remember, Perl5 regular expressions do not match the longest
1641: * possible match. From the perlre manpage:
1642: * <blockquote>
1643: * Alternatives are tried from left to right, so the first
1644: * alternative found for which the entire expression matches,
1645: * is the one that is chosen. This means that alternatives
1646: * are not necessarily greedy. For example: when matching
1647: * foo|foot against "barefoot", only the "foo" part will
1648: * match, as that is the first alternative tried, and it
1649: * successfully matches the target string.
1650: * </blockquote>
1651: * <p>
1652: * @param input The PatternMatcherInput to test for a match.
1653: * @param pattern The Perl5Pattern to be matched.
1654: * @return True if input matches pattern, false otherwise.
1655: * @exception ClassCastException If a Pattern instance other than a
1656: * Perl5Pattern is passed as the pattern parameter.
1657: */
1658: public boolean matches(PatternMatcherInput input, Pattern pattern) {
1659: char[] inp;
1660: Perl5Pattern expression;
1661:
1662: expression = (Perl5Pattern) pattern;
1663:
1664: __originalInput = input._originalBuffer;
1665: if (expression._isCaseInsensitive) {
1666: if (input._toLowerBuffer == null)
1667: input._toLowerBuffer = _toLower(__originalInput);
1668: inp = input._toLowerBuffer;
1669: } else
1670: inp = __originalInput;
1671:
1672: __initInterpreterGlobals(expression, inp, input._beginOffset,
1673: input._endOffset, input._beginOffset);
1674:
1675: __lastMatchResult = null;
1676:
1677: if (__tryExpression(input._beginOffset)) {
1678: if (__endMatchOffsets[0] == input._endOffset
1679: || input.length() == 0
1680: || input._beginOffset == input._endOffset) {
1681: __lastSuccess = true;
1682: return true;
1683: }
1684: }
1685:
1686: __lastSuccess = false;
1687:
1688: return false;
1689: }
1690:
1691: /**
1692: * Determines if a string contains a pattern. If the pattern is
1693: * matched by some substring of the input, a MatchResult instance
1694: * representing the <b> first </b> such match is made acessible via
1695: * {@link #getMatch()}. If you want to access
1696: * subsequent matches you should either use a PatternMatcherInput object
1697: * or use the offset information in the MatchResult to create a substring
1698: * representing the remaining input. Using the MatchResult offset
1699: * information is the recommended method of obtaining the parts of the
1700: * string preceeding the match and following the match.
1701: * <p>
1702: * The pattern must be a Perl5Pattern instance, otherwise a
1703: * ClassCastException will be thrown. You are not required to, and
1704: * indeed should NOT try to (for performance reasons), catch a
1705: * ClassCastException because it will never be thrown as long as you use
1706: * a Perl5Pattern as the pattern parameter.
1707: * <p>
1708: * @param input The String to test for a match.
1709: * @param pattern The Perl5Pattern to be matched.
1710: * @return True if the input contains a pattern match, false otherwise.
1711: * @exception ClassCastException If a Pattern instance other than a
1712: * Perl5Pattern is passed as the pattern parameter.
1713: */
1714: public boolean contains(String input, Pattern pattern) {
1715: return contains(input.toCharArray(), pattern);
1716: }
1717:
1718: /**
1719: * Determines if a string (represented as a char[]) contains a pattern.
1720: * If the pattern is
1721: * matched by some substring of the input, a MatchResult instance
1722: * representing the <b> first </b> such match is made acessible via
1723: * {@link #getMatch()}. If you want to access
1724: * subsequent matches you should either use a PatternMatcherInput object
1725: * or use the offset information in the MatchResult to create a substring
1726: * representing the remaining input. Using the MatchResult offset
1727: * information is the recommended method of obtaining the parts of the
1728: * string preceeding the match and following the match.
1729: * <p>
1730: * The pattern must be a Perl5Pattern instance, otherwise a
1731: * ClassCastException will be thrown. You are not required to, and
1732: * indeed should NOT try to (for performance reasons), catch a
1733: * ClassCastException because it will never be thrown as long as you use
1734: * a Perl5Pattern as the pattern parameter.
1735: * <p>
1736: * @param input The char[] to test for a match.
1737: * @param pattern The Perl5Pattern to be matched.
1738: * @return True if the input contains a pattern match, false otherwise.
1739: * @exception ClassCastException If a Pattern instance other than a
1740: * Perl5Pattern is passed as the pattern parameter.
1741: */
1742: public boolean contains(char[] input, Pattern pattern) {
1743: Perl5Pattern expression;
1744:
1745: expression = (Perl5Pattern) pattern;
1746:
1747: __originalInput = input;
1748:
1749: if (expression._isCaseInsensitive)
1750: input = _toLower(input);
1751:
1752: return __interpret(expression, input, 0, input.length, 0);
1753: }
1754:
1755: private static final int __DEFAULT_LAST_MATCH_END_OFFSET = -100;
1756: private int __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET;
1757:
1758: /**
1759: * Determines if the contents of a PatternMatcherInput, starting from the
1760: * current offset of the input contains a pattern.
1761: * If a pattern match is found, a MatchResult
1762: * instance representing the <b>first</b> such match is made acessible via
1763: * {@link #getMatch()}. The current offset of the
1764: * PatternMatcherInput is set to the offset corresponding to the end
1765: * of the match, so that a subsequent call to this method will continue
1766: * searching where the last call left off. You should remember that the
1767: * region between the begin and end offsets of the PatternMatcherInput are
1768: * considered the input to be searched, and that the current offset
1769: * of the PatternMatcherInput reflects where a search will start from.
1770: * Matches extending beyond the end offset of the PatternMatcherInput
1771: * will not be matched. In other words, a match must occur entirely
1772: * between the begin and end offsets of the input. See
1773: * {@link PatternMatcherInput} for more details.
1774: * <p>
1775: * As a side effect, if a match is found, the PatternMatcherInput match
1776: * offset information is updated. See the
1777: * {@link PatternMatcherInput#setMatchOffsets(int, int)}
1778: * method for more details.
1779: * <p>
1780: * The pattern must be a Perl5Pattern instance, otherwise a
1781: * ClassCastException will be thrown. You are not required to, and
1782: * indeed should NOT try to (for performance reasons), catch a
1783: * ClassCastException because it will never be thrown as long as you use
1784: * a Perl5Pattern as the pattern parameter.
1785: * <p>
1786: * This method is usually used in a loop as follows:
1787: * <blockquote><pre>
1788: * PatternMatcher matcher;
1789: * PatternCompiler compiler;
1790: * Pattern pattern;
1791: * PatternMatcherInput input;
1792: * MatchResult result;
1793: *
1794: * compiler = new Perl5Compiler();
1795: * matcher = new Perl5Matcher();
1796: *
1797: * try {
1798: * pattern = compiler.compile(somePatternString);
1799: * } catch(MalformedPatternException e) {
1800: * System.err.println("Bad pattern.");
1801: * System.err.println(e.getMessage());
1802: * return;
1803: * }
1804: *
1805: * input = new PatternMatcherInput(someStringInput);
1806: *
1807: * while(matcher.contains(input, pattern)) {
1808: * result = matcher.getMatch();
1809: * // Perform whatever processing on the result you want.
1810: * }
1811: *
1812: * </pre></blockquote>
1813: * <p>
1814: * @param input The PatternMatcherInput to test for a match.
1815: * @param pattern The Pattern to be matched.
1816: * @return True if the input contains a pattern match, false otherwise.
1817: * @exception ClassCastException If a Pattern instance other than a
1818: * Perl5Pattern is passed as the pattern parameter.
1819: */
1820: public boolean contains(PatternMatcherInput input, Pattern pattern) {
1821: char[] inp;
1822: Perl5Pattern expression;
1823: boolean matchFound;
1824:
1825: //if(input.length() > 0) {
1826: // We want to allow a null string to match at the end of the input
1827: // which is why we don't check endOfInput. Not sure if this is a
1828: // safe thing to do or not.
1829: if (input._currentOffset > input._endOffset)
1830: return false;
1831: //}
1832: /* else
1833: if(input._endOfInput())
1834: return false;
1835: */
1836: expression = (Perl5Pattern) pattern;
1837: __originalInput = input._originalBuffer;
1838:
1839: // Todo:
1840: // Really should only reduce to lowercase that part of the
1841: // input that is necessary, instead of the whole thing.
1842: // Adjust MatchResult offsets accordingly. Actually, pass an adjustment
1843: // value to __interpret.
1844: __originalInput = input._originalBuffer;
1845: if (expression._isCaseInsensitive) {
1846: if (input._toLowerBuffer == null)
1847: input._toLowerBuffer = _toLower(__originalInput);
1848: inp = input._toLowerBuffer;
1849: } else
1850: inp = __originalInput;
1851:
1852: __lastMatchInputEndOffset = input.getMatchEndOffset();
1853:
1854: matchFound = __interpret(expression, inp, input._beginOffset,
1855: input._endOffset, input._currentOffset);
1856:
1857: if (matchFound) {
1858: input.setCurrentOffset(__endMatchOffsets[0]);
1859: input.setMatchOffsets(__beginMatchOffsets[0],
1860: __endMatchOffsets[0]);
1861: } else {
1862: input.setCurrentOffset(input._endOffset + 1);
1863: }
1864:
1865: // Restore so it doesn't interfere with other unrelated matches.
1866: __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET;
1867:
1868: return matchFound;
1869: }
1870:
1871: /**
1872: * Fetches the last match found by a call to a matches() or contains()
1873: * method. If you plan on modifying the original search input, you
1874: * must call this method BEFORE you modify the original search input,
1875: * as a lazy evaluation technique is used to create the MatchResult.
1876: * This reduces the cost of pattern matching when you don't care about
1877: * the actual match and only care if the pattern occurs in the input.
1878: * Otherwise, a MatchResult would be created for every match found,
1879: * whether or not the MatchResult was later used by a call to getMatch().
1880: * <p>
1881: * @return A MatchResult instance containing the pattern match found
1882: * by the last call to any one of the matches() or contains()
1883: * methods. If no match was found by the last call, returns
1884: * null.
1885: */
1886: public MatchResult getMatch() {
1887: if (!__lastSuccess)
1888: return null;
1889:
1890: if (__lastMatchResult == null)
1891: __setLastMatchResult();
1892:
1893: return __lastMatchResult;
1894: }
1895:
1896: }
|