001: /*
002: * $Id: OpCode.java,v 1.11 2003/11/07 20:16:25 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2000 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.oro.text.regex;
059:
/**
 * Non-instantiable holder for the op-code constants, the lookup tables
 * keyed by those constants, and the small static helpers used to walk a
 * compiled regular expression program stored in a {@code char[]}.
 *
 * @version @version@
 * @since 1.0
 */
final class OpCode {

  /** Not meant to be instantiated; everything here is static. */
  private OpCode() {
  }

  // Names, values, and descriptions of operators correspond to those of
  // Perl regex bytecodes and for compatibility purposes are drawn from
  // regcomp.h in the Perl source tree by Larry Wall.
  //
  // Column legend for the trailing comments:
  //   "yes"/"no" -> whether the op-code has an operand
  //   remainder  -> meaning of the op-code
  static final char _END = 0;       // no   End of program.
  static final char _BOL = 1;       // no   Match "" at beginning of line.
  static final char _MBOL = 2;      // no   Same, assuming multiline.
  static final char _SBOL = 3;      // no   Same, assuming singleline.
  static final char _EOL = 4;       // no   Match "" at end of line.
  static final char _MEOL = 5;      // no   Same, assuming multiline.
  static final char _SEOL = 6;      // no   Same, assuming singleline.
  static final char _ANY = 7;       // no   Match any one character (except newline).
  static final char _SANY = 8;      // no   Match any one character.
  static final char _ANYOF = 9;     // yes  Match character in (or not in) this class.
  static final char _CURLY = 10;    // yes  Match this simple thing {n,m} times.
  static final char _CURLYX = 11;   // yes  Match this complex thing {n,m} times.
  static final char _BRANCH = 12;   // yes  Match this alternative, or the next...
  static final char _BACK = 13;     // no   Match "", "next" ptr points backward.
  static final char _EXACTLY = 14;  // yes  Match this string (preceded by length).
  static final char _NOTHING = 15;  // no   Match empty string.
  static final char _STAR = 16;     // yes  Match this (simple) thing 0 or more times.
  static final char _PLUS = 17;     // yes  Match this (simple) thing 1 or more times.
  static final char _ALNUM = 18;    // no   Match any word character
  static final char _NALNUM = 19;   // no   Match any non-word character
  static final char _BOUND = 20;    // no   Match "" at any word boundary
  static final char _NBOUND = 21;   // no   Match "" at any word non-boundary
  static final char _SPACE = 22;    // no   Match any whitespace character
  static final char _NSPACE = 23;   // no   Match any non-whitespace character
  static final char _DIGIT = 24;    // no   Match any numeric character
  static final char _NDIGIT = 25;   // no   Match any non-numeric character
  static final char _REF = 26;      // yes  Match some already matched string
  static final char _OPEN = 27;     // yes  Mark this point in input as start of #n.
  static final char _CLOSE = 28;    // yes  Analogous to OPEN.
  static final char _MINMOD = 29;   // no   Next operator is not greedy.
  static final char _GBOL = 30;     // no   Matches where last m//g left off.
  static final char _IFMATCH = 31;  // no   Succeeds if the following matches.
  static final char _UNLESSM = 32;  // no   Fails if the following matches.
  static final char _SUCCEED = 33;  // no   Return from a subroutine, basically.
  static final char _WHILEM = 34;   // no   Do curly processing and see if rest matches.
  static final char _ANYOFUN = 35;  // yes  Match unicode character in this class.
  static final char _NANYOFUN = 36; // yes  Match unicode character not in this class.
  static final char _RANGE = 37;    // yes  Range flag (the inherited note stops at "Range flag in").

  // Change the names of these constants later to make it clear they
  // are POSIX classes.
  static final char _ALPHA = 38;
  static final char _BLANK = 39;
  static final char _CNTRL = 40;
  static final char _GRAPH = 41;
  static final char _LOWER = 42;
  static final char _PRINT = 43;
  static final char _PUNCT = 44;
  static final char _UPPER = 45;
  static final char _XDIGIT = 46;
  static final char _OPCODE = 47;
  static final char _NOPCODE = 48;
  static final char _ONECHAR = 49;
  static final char _ALNUMC = 50;
  static final char _ASCII = 51;

  /** Lengths of the various operands, one entry per op-code value (0-51). */
  static final int[] _operandLength = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 0-9
    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 10-19
    0, 0, 0, 0, 0, 0, 1, 1, 1, 0, // OpCode 20-29
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 30-39
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 40-49
    0, 0                          // OpCode 50-51
  };

  /**
   * One entry per op-code value, in op-code order.  Most entries are the
   * op-code itself; variants collapse onto a base op-code (for example
   * _MBOL, _SBOL and _GBOL all map to _BOL, and _CURLYX maps to _CURLY).
   */
  static final char[] _opType = {
    // 0-9
    _END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF,
    // 10-19
    _CURLY, _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS,
    _ALNUM, _NALNUM,
    // 20-29
    _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF, _OPEN,
    _CLOSE, _MINMOD,
    // 30-39
    _BOL, _BRANCH, _BRANCH, _END, _WHILEM, _ANYOFUN, _NANYOFUN, _RANGE,
    _ALPHA, _BLANK,
    // 40-49
    _CNTRL, _GRAPH, _LOWER, _PRINT, _PUNCT, _UPPER, _XDIGIT, _OPCODE,
    _NOPCODE, _ONECHAR,
    // 50-51
    _ALNUMC, _ASCII
  };

  // NOTE(review): judging by the name, these are the op-codes whose match
  // length is not fixed -- confirm against the compiler/matcher.
  static final char[] _opLengthVaries = {
    _BRANCH, _BACK, _STAR, _PLUS, _CURLY, _CURLYX, _REF, _WHILEM
  };

  // NOTE(review): judging by the name, these are the op-codes that match
  // exactly one character -- confirm against the compiler/matcher.
  static final char[] _opLengthOne = {
    _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT,
    _NDIGIT, _ANYOFUN, _NANYOFUN, _ALPHA, _BLANK, _CNTRL, _GRAPH,
    _LOWER, _PRINT, _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE,
    _ONECHAR, _ALNUMC, _ASCII
  };

  /** Value returned by {@link #_getNext} when there is no next node. */
  static final int _NULL_OFFSET = -1;

  /** Stored next-offset value that {@link #_getNext} treats as "no next node". */
  static final char _NULL_POINTER = 0;

  /**
   * Reads the raw next-offset slot of the node at {@code offset}.
   *
   * @param program the compiled program
   * @param offset index of the node's op-code
   * @return the value stored at {@code offset + 1}, widened to int
   */
  static final int _getNextOffset(char[] program, int offset) {
    return program[offset + 1];
  }

  /**
   * Reads the first argument slot of the node at {@code offset}.
   *
   * @param program the compiled program
   * @param offset index of the node's op-code
   * @return the value stored at {@code offset + 2}
   */
  static final char _getArg1(char[] program, int offset) {
    return program[offset + 2];
  }

  /**
   * Reads the second argument slot of the node at {@code offset}.
   *
   * @param program the compiled program
   * @param offset index of the node's op-code
   * @return the value stored at {@code offset + 3}
   */
  static final char _getArg2(char[] program, int offset) {
    return program[offset + 3];
  }

  /**
   * @param offset index of a node's op-code
   * @return {@code offset + 2}, the index at which the node's operand starts
   */
  static final int _getOperand(int offset) {
    return offset + 2;
  }

  /**
   * Tests whether {@code ch} occurs in {@code array} at or after index
   * {@code start}.
   *
   * @param ch the value to look for
   * @param array the values to search
   * @param start the first index to examine
   * @return true if a matching element is found, false otherwise
   *         (including when {@code start} is past the end of the array)
   */
  static final boolean _isInArray(char ch, char[] array, int start) {
    for (int index = start; index < array.length; index++) {
      if (array[index] == ch) {
        return true;
      }
    }
    return false;
  }

  /**
   * @param offset index of a node's op-code
   * @return {@code offset + 2}
   */
  static final int _getNextOperator(int offset) {
    return offset + 2;
  }

  /**
   * @param offset index of a node's op-code
   * @return {@code offset - 2}
   */
  static final int _getPrevOperator(int offset) {
    return offset - 2;
  }

  /**
   * Resolves the absolute index of the node that follows the node at
   * {@code offset}.
   *
   * @param program the compiled program; may be null
   * @param offset index of the current node's op-code
   * @return {@link #_NULL_OFFSET} if {@code program} is null or the stored
   *         next-offset equals {@link #_NULL_POINTER}; otherwise
   *         {@code offset} minus the stored value when the current op-code
   *         is {@link #_BACK}, or {@code offset} plus the stored value for
   *         every other op-code
   */
  static final int _getNext(char[] program, int offset) {
    if (program == null) {
      return _NULL_OFFSET;
    }

    final int distance = _getNextOffset(program, offset);
    if (distance == _NULL_POINTER) {
      return _NULL_OFFSET;
    }

    // _BACK nodes store a distance that points backward in the program.
    return (program[offset] == OpCode._BACK)
        ? offset - distance
        : offset + distance;
  }

  // doesn't really belong in this class, but we want Perl5Matcher not to
  // depend on Perl5Compiler
  /**
   * Matches Perl's definition of \w, which is different from [:alnum:].
   *
   * @param token the character to classify
   * @return true if {@code token} is an underscore or
   *         {@link Character#isLetterOrDigit(char)} accepts it
   */
  static final boolean _isWordCharacter(char token) {
    return token == '_' || Character.isLetterOrDigit(token);
  }
}
|