001: package org.apache.regexp;
002:
003: /*
004: * ====================================================================
005: *
006: * The Apache Software License, Version 1.1
007: *
008: * Copyright (c) 1999 The Apache Software Foundation. All rights
009: * reserved.
010: *
011: * Redistribution and use in source and binary forms, with or without
012: * modification, are permitted provided that the following conditions
013: * are met:
014: *
015: * 1. Redistributions of source code must retain the above copyright
016: * notice, this list of conditions and the following disclaimer.
017: *
018: * 2. Redistributions in binary form must reproduce the above copyright
019: * notice, this list of conditions and the following disclaimer in
020: * the documentation and/or other materials provided with the
021: * distribution.
022: *
023: * 3. The end-user documentation included with the redistribution, if
024: * any, must include the following acknowlegement:
025: * "This product includes software developed by the
026: * Apache Software Foundation (http://www.apache.org/)."
027: * Alternately, this acknowlegement may appear in the software itself,
028: * if and wherever such third-party acknowlegements normally appear.
029: *
030: * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
031: * Foundation" must not be used to endorse or promote products derived
032: * from this software without prior written permission. For written
033: * permission, please contact apache@apache.org.
034: *
035: * 5. Products derived from this software may not be called "Apache"
036: * nor may "Apache" appear in their names without prior written
037: * permission of the Apache Group.
038: *
039: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
040: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
041: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
042: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
043: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
044: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
045: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
046: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
047: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
048: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
049: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
050: * SUCH DAMAGE.
051: * ====================================================================
052: *
053: * This software consists of voluntary contributions made by many
054: * individuals on behalf of the Apache Software Foundation. For more
055: * information on the Apache Software Foundation, please see
056: * <http://www.apache.org/>.
057: *
058: */
059:
060: import org.apache.regexp.RE;
061: import java.util.Hashtable;
062:
063: /**
064: * A class that holds compiled regular expressions. This is exposed mainly
065: * for use by the recompile utility (which helps you produce precompiled
066: * REProgram objects). You should not otherwise need to work directly with
067: * this class.
068: *
069: * @see RE
070: * @see RECompiler
071: *
072: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
073: * @version $Id: REProgram.java,v 1.1.1.1 2002/01/31 03:14:36 rcm Exp $
074: */
075: public class REProgram {
076: static final int OPT_HASBACKREFS = 1;
077:
078: char[] instruction; // The compiled regular expression 'program'
079: int lenInstruction; // The amount of the instruction buffer in use
080: char[] prefix; // Prefix string optimization
081: int flags; // Optimization flags (REProgram.OPT_*)
082:
083: /**
084: * Constructs a program object from a character array
085: * @param instruction Character array with RE opcode instructions in it
086: */
087: public REProgram(char[] instruction) {
088: this (instruction, instruction.length);
089: }
090:
091: /**
092: * Constructs a program object from a character array
093: * @param instruction Character array with RE opcode instructions in it
094: * @param lenInstruction Amount of instruction array in use
095: */
096: public REProgram(char[] instruction, int lenInstruction) {
097: setInstructions(instruction, lenInstruction);
098: }
099:
100: /**
101: * Returns a copy of the current regular expression program in a character
102: * array that is exactly the right length to hold the program. If there is
103: * no program compiled yet, getInstructions() will return null.
104: * @return A copy of the current compiled RE program
105: */
106: public char[] getInstructions() {
107: // Ensure program has been compiled!
108: if (lenInstruction != 0) {
109: // Return copy of program
110: char[] ret = new char[lenInstruction];
111: System.arraycopy(instruction, 0, ret, 0, lenInstruction);
112: return ret;
113: }
114: return null;
115: }
116:
117: /**
118: * Sets a new regular expression program to run. It is this method which
119: * performs any special compile-time search optimizations. Currently only
120: * two optimizations are in place - one which checks for backreferences
121: * (so that they can be lazily allocated) and another which attempts to
122: * find an prefix anchor string so that substantial amounts of input can
123: * potentially be skipped without running the actual program.
124: * @param instruction Program instruction buffer
125: * @param lenInstruction Length of instruction buffer in use
126: */
127: public void setInstructions(char[] instruction, int lenInstruction) {
128: // Save reference to instruction array
129: this .instruction = instruction;
130: this .lenInstruction = lenInstruction;
131:
132: // Initialize other program-related variables
133: flags = 0;
134: prefix = null;
135:
136: // Try various compile-time optimizations if there's a program
137: if (instruction != null && lenInstruction != 0) {
138: // If the first node is a branch
139: if (lenInstruction >= RE.nodeSize
140: && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH) {
141: // to the end node
142: int next = instruction[0 + RE.offsetNext];
143: if (instruction[next + RE.offsetOpcode] == RE.OP_END) {
144: // and the branch starts with an atom
145: if (lenInstruction >= (RE.nodeSize * 2)
146: && instruction[RE.nodeSize
147: + RE.offsetOpcode] == RE.OP_ATOM) {
148: // then get that atom as an prefix because there's no other choice
149: int lenAtom = instruction[RE.nodeSize
150: + RE.offsetOpdata];
151: prefix = new char[lenAtom];
152: System.arraycopy(instruction, RE.nodeSize * 2,
153: prefix, 0, lenAtom);
154: }
155: }
156: }
157:
158: BackrefScanLoop:
159:
160: // Check for backreferences
161: for (int i = 0; i < lenInstruction; i += RE.nodeSize) {
162: switch (instruction[i + RE.offsetOpcode]) {
163: case RE.OP_ANYOF:
164: i += (instruction[i + RE.offsetOpdata] * 2);
165: break;
166:
167: case RE.OP_ATOM:
168: i += instruction[i + RE.offsetOpdata];
169: break;
170:
171: case RE.OP_BACKREF:
172: flags |= OPT_HASBACKREFS;
173: break BackrefScanLoop;
174: }
175: }
176: }
177: }
178: }
|