001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.regexp;
019:
020: import java.io.Serializable;
021:
022: /**
023: * A class that holds compiled regular expressions. This is exposed mainly
024: * for use by the recompile utility (which helps you produce precompiled
025: * REProgram objects). You should not otherwise need to work directly with
026: * this class.
027: *
028: * @see RE
029: * @see RECompiler
030: *
031: * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
032: * @version $Id: REProgram.java 518156 2007-03-14 14:31:26Z vgritsenko $
033: */
034: public class REProgram implements Serializable {
035: static final int OPT_HASBACKREFS = 1;
036: static final int OPT_HASBOL = 2;
037:
038: char[] instruction; // The compiled regular expression 'program'
039: int lenInstruction; // The amount of the instruction buffer in use
040: char[] prefix; // Prefix string optimization
041: int flags; // Optimization flags (REProgram.OPT_*)
042: int maxParens = -1;
043:
044: /**
045: * Constructs a program object from a character array
046: * @param instruction Character array with RE opcode instructions in it
047: */
048: public REProgram(char[] instruction) {
049: this (instruction, instruction.length);
050: }
051:
052: /**
053: * Constructs a program object from a character array
054: * @param parens Count of parens in the program
055: * @param instruction Character array with RE opcode instructions in it
056: */
057: public REProgram(int parens, char[] instruction) {
058: this (instruction, instruction.length);
059: this .maxParens = parens;
060: }
061:
062: /**
063: * Constructs a program object from a character array
064: * @param instruction Character array with RE opcode instructions in it
065: * @param lenInstruction Amount of instruction array in use
066: */
067: public REProgram(char[] instruction, int lenInstruction) {
068: setInstructions(instruction, lenInstruction);
069: }
070:
071: /**
072: * Returns a copy of the current regular expression program in a character
073: * array that is exactly the right length to hold the program. If there is
074: * no program compiled yet, getInstructions() will return null.
075: * @return A copy of the current compiled RE program
076: */
077: public char[] getInstructions() {
078: // Ensure program has been compiled!
079: if (lenInstruction != 0) {
080: // Return copy of program
081: char[] ret = new char[lenInstruction];
082: System.arraycopy(instruction, 0, ret, 0, lenInstruction);
083: return ret;
084: }
085: return null;
086: }
087:
088: /**
089: * Sets a new regular expression program to run. It is this method which
090: * performs any special compile-time search optimizations. Currently only
091: * two optimizations are in place - one which checks for backreferences
092: * (so that they can be lazily allocated) and another which attempts to
093: * find an prefix anchor string so that substantial amounts of input can
094: * potentially be skipped without running the actual program.
095: * @param instruction Program instruction buffer
096: * @param lenInstruction Length of instruction buffer in use
097: */
098: public void setInstructions(char[] instruction, int lenInstruction) {
099: // Save reference to instruction array
100: this .instruction = instruction;
101: this .lenInstruction = lenInstruction;
102:
103: // Initialize other program-related variables
104: this .flags = 0;
105: this .prefix = null;
106:
107: // Try various compile-time optimizations if there's a program
108: if (instruction != null && lenInstruction != 0) {
109: // If the first node is a branch
110: if (lenInstruction >= RE.nodeSize
111: && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH) {
112: // to the end node
113: int next = (short) instruction[0 + RE.offsetNext];
114: if (instruction[next + RE.offsetOpcode] == RE.OP_END
115: && lenInstruction >= (RE.nodeSize * 2)) {
116: final char nextOp = instruction[RE.nodeSize
117: + RE.offsetOpcode];
118: // the branch starts with an atom
119: if (nextOp == RE.OP_ATOM) {
120: // then get that atom as an prefix because there's no other choice
121: int lenAtom = instruction[RE.nodeSize
122: + RE.offsetOpdata];
123: this .prefix = new char[lenAtom];
124: System.arraycopy(instruction, RE.nodeSize * 2,
125: prefix, 0, lenAtom);
126: }
127: // the branch starts with a BOL
128: else if (nextOp == RE.OP_BOL) {
129: // then set the flag indicating that BOL is present
130: this .flags |= OPT_HASBOL;
131: }
132: }
133: }
134:
135: BackrefScanLoop:
136:
137: // Check for backreferences
138: for (int i = 0; i < lenInstruction; i += RE.nodeSize) {
139: switch (instruction[i + RE.offsetOpcode]) {
140: case RE.OP_ANYOF:
141: i += (instruction[i + RE.offsetOpdata] * 2);
142: break;
143:
144: case RE.OP_ATOM:
145: i += instruction[i + RE.offsetOpdata];
146: break;
147:
148: case RE.OP_BACKREF:
149: flags |= OPT_HASBACKREFS;
150: break BackrefScanLoop;
151: }
152: }
153: }
154: }
155:
156: /**
157: * Returns a copy of the prefix of current regular expression program
158: * in a character array. If there is no prefix, or there is no program
159: * compiled yet, <code>getPrefix</code> will return null.
160: * @return A copy of the prefix of current compiled RE program
161: */
162: public char[] getPrefix() {
163: if (prefix != null) {
164: // Return copy of prefix
165: char[] ret = new char[prefix.length];
166: System.arraycopy(prefix, 0, ret, 0, prefix.length);
167: return ret;
168: }
169: return null;
170: }
171: }
|