001: /*
002: * Copyright (c) 1998-2007 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: *
023: * Free Software Foundation, Inc.
024: * 59 Temple Place, Suite 330
025: * Boston, MA 02111-1307 USA
026: *
027: * @author Scott Ferguson
028: */
029:
030: package com.caucho.quercus.lib.regexp;
031:
032: import java.io.UnsupportedEncodingException;
033: import java.util.*;
034: import java.util.logging.*;
035:
036: import com.caucho.quercus.QuercusException;
037: import com.caucho.quercus.QuercusRuntimeException;
038: import com.caucho.quercus.env.Env;
039: import com.caucho.quercus.env.StringBuilderValue;
040: import com.caucho.quercus.env.StringValue;
041: import com.caucho.quercus.env.UnicodeBuilderValue;
042: import com.caucho.util.*;
043:
044: public class Regexp {
045: private static final Logger log = Logger.getLogger(Regexp.class
046: .getName());
047:
048: private static final L10N L = new L10N(Regexp.class);
049:
050: public static final int FAIL = -1;
051: public static final int SUCCESS = 0;
052:
053: StringValue _pattern;
054:
055: RegexpNode _prog;
056: boolean _ignoreCase;
057: boolean _isGlobal;
058:
059: int _nLoop;
060: int _nGroup;
061:
062: // optim stuff
063: CharBuffer _prefix; // initial string
064: int _minLength; // minimum length possible for this regexp
065: int _firstChar;
066: boolean[] _firstSet;
067:
068: StringValue[] _groupNames;
069:
070: boolean _isUnicode;
071: boolean _isPHP5String;
072:
073: boolean _isUTF8;
074: boolean _isEval;
075:
076: public Regexp(Env env, StringValue rawRegexp)
077: throws IllegalRegexpException {
078: if (rawRegexp.length() < 2) {
079: throw new IllegalStateException(L
080: .l("Can't find delimiters in regexp '{0}'.",
081: rawRegexp));
082: }
083:
084: int head = 0;
085:
086: char delim = '/';
087:
088: for (; head < rawRegexp.length()
089: && Character.isWhitespace((delim = rawRegexp
090: .charAt(head))); head++) {
091: }
092:
093: if (delim == '{')
094: delim = '}';
095: else if (delim == '[')
096: delim = ']';
097: else if (delim == '(')
098: delim = ')';
099: else if (delim == '<')
100: delim = '>';
101: else if (delim == '\\' || Character.isLetterOrDigit(delim)) {
102: throw new QuercusException(
103: L
104: .l(
105: "Delimiter {0} in regexp '{1}' must not be backslash or alphanumeric.",
106: String.valueOf(delim), rawRegexp));
107: }
108:
109: int tail = rawRegexp.lastIndexOf(delim);
110:
111: if (tail <= 0)
112: throw new QuercusException(L.l(
113: "Can't find second {0} in regexp '{1}'.", String
114: .valueOf(delim), rawRegexp));
115:
116: StringValue sflags = rawRegexp.substring(tail);
117: StringValue pattern = rawRegexp.substring(head + 1, tail);
118:
119: int flags = 0;
120:
121: for (int i = 0; sflags != null && i < sflags.length(); i++) {
122: switch (sflags.charAt(i)) {
123: case 'm':
124: flags |= Regcomp.MULTILINE;
125: break;
126: case 's':
127: flags |= Regcomp.SINGLE_LINE;
128: break;
129: case 'i':
130: flags |= Regcomp.IGNORE_CASE;
131: break;
132: case 'x':
133: flags |= Regcomp.IGNORE_WS;
134: break;
135: case 'g':
136: flags |= Regcomp.GLOBAL;
137: break;
138:
139: case 'A':
140: flags |= Regcomp.ANCHORED;
141: break;
142: case 'D':
143: flags |= Regcomp.END_ONLY;
144: break;
145: case 'U':
146: flags |= Regcomp.UNGREEDY;
147: break;
148: case 'X':
149: flags |= Regcomp.STRICT;
150: break;
151:
152: case 'u':
153: _isUTF8 = true;
154: break;
155: case 'e':
156: _isEval = true;
157: break;
158: }
159: }
160:
161: // XXX: what if unicode.semantics='true'?
162:
163: if (_isUTF8)
164: pattern = fromUtf8(env, pattern);
165:
166: _pattern = pattern;
167:
168: Regcomp comp = new Regcomp(flags);
169: _prog = comp.parse(new PeekString(_pattern));
170:
171: compile(env, _prog, comp);
172: }
173:
174: protected Regexp(Env env, RegexpNode prog, Regcomp comp) {
175: _prog = prog;
176:
177: compile(env, _prog, comp);
178: }
179:
180: private Regexp() {
181: }
182:
183: public StringValue getPattern() {
184: return _pattern;
185: }
186:
187: public boolean isUTF8() {
188: return _isUTF8;
189: }
190:
191: public boolean isEval() {
192: return _isEval;
193: }
194:
195: public StringValue convertSubject(Env env, StringValue subject) {
196: if (_isUTF8)
197: return fromUtf8(env, subject);
198: else
199: return subject;
200: }
201:
202: public StringValue convertResult(Env env, StringValue result) {
203: if (_isUTF8)
204: return toUtf8(env, result);
205: else
206: return result;
207: }
208:
209: private void compile(Env env, RegexpNode prog, Regcomp comp) {
210: _ignoreCase = (comp._flags & Regcomp.IGNORE_CASE) != 0;
211: _isGlobal = (comp._flags & Regcomp.GLOBAL) != 0;
212:
213: /*
214: if (_ignoreCase)
215: RegOptim.ignoreCase(prog);
216:
217: if (! _ignoreCase)
218: RegOptim.eliminateBacktrack(prog, null);
219: */
220:
221: _minLength = prog.minLength();
222: _firstChar = prog.firstChar();
223: _firstSet = prog.firstSet(new boolean[256]);
224: _prefix = new CharBuffer(prog.prefix());
225:
226: //this._prog = RegOptim.linkLoops(prog);
227:
228: _nGroup = comp._maxGroup;
229: _nLoop = comp._nLoop;
230:
231: _groupNames = new StringValue[_nGroup + 1];
232: for (Map.Entry<Integer, StringValue> entry : comp._groupNameMap
233: .entrySet()) {
234: StringValue groupName = entry.getValue();
235:
236: if (_isUnicode) {
237: } else if (_isUTF8)
238: groupName.toBinaryValue(env, "UTF-8");
239: else
240: groupName.toBinaryValue(env);
241:
242: _groupNames[entry.getKey().intValue()] = groupName;
243: }
244: }
245:
246: public StringValue getGroupName(int i) {
247: return _groupNames[i];
248: }
249:
250: public boolean isGlobal() {
251: return _isGlobal;
252: }
253:
254: public boolean ignoreCase() {
255: return _ignoreCase;
256: }
257:
258: static StringValue fromUtf8(Env env, StringValue source) {
259: StringValue target = env.createUnicodeBuilder();
260: int len = source.length();
261:
262: for (int i = 0; i < len; i++) {
263: char ch = source.charAt(i);
264:
265: if (ch < 0x80)
266: target.append(ch);
267: else if ((ch & 0xe0) == 0xc0) {
268: if (i + 1 >= len)
269: throw new QuercusRuntimeException(
270: "bad UTF-8 sequence, saw EOF");
271:
272: char ch2 = source.charAt(++i);
273:
274: target
275: .append((char) (((ch & 0x1f) << 6) + (ch2 & 0x3f)));
276: } else {
277: if (i + 2 >= len)
278: throw new QuercusRuntimeException(
279: "bad UTF-8 sequence, saw EOF");
280:
281: char ch2 = source.charAt(++i);
282: char ch3 = source.charAt(++i);
283:
284: target.append((char) (((ch & 0xf) << 12)
285: + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f)));
286: }
287: }
288:
289: return target;
290: }
291:
292: static StringValue toUtf8(Env env, StringValue source) {
293: StringValue target = env.createBinaryBuilder();
294: int len = source.length();
295:
296: for (int i = 0; i < len; i++) {
297: char ch = source.charAt(i);
298:
299: if (ch < 0x80) {
300: target.append(ch);
301: } else if (ch < 0x800) {
302: target.append((char) (0xc0 | (ch >> 6)));
303: target.append((char) (0x80 | (ch & 0x3f)));
304: } else {
305: target.append((char) (0xe0 | (ch >> 12)));
306: target.append((char) (0x80 | ((ch >> 6) & 0x3f)));
307: target.append((char) (0x80 | (ch & 0x3f)));
308: }
309: }
310:
311: return target;
312: }
313:
314: public String toString() {
315: return "Regexp[" + _pattern + "]";
316: }
317: }
|