001: /**
002: * Copyright (c) 2001, Sergey A. Samokhodkin
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without modification,
006: * are permitted provided that the following conditions are met:
007: *
008: * - Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * - Redistributions in binary form
011: * must reproduce the above copyright notice, this list of conditions and the following
012: * disclaimer in the documentation and/or other materials provided with the distribution.
013: * - Neither the name of jregex nor the names of its contributors may be used
014: * to endorse or promote products derived from this software without specific prior
015: * written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
018: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020: * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021: * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
022: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
023: * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
024: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
025: * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026: *
027: * @version 1.2_01
028: */package jregex;
029:
030: import java.io.*;
031: import java.util.*;
032:
033: /**
034: * A handle for a precompiled regular expression.<br>
035: * To match a regular expression <code>myExpr</code> against a text <code>myString</code> one should first create a Pattern object:<pre>
036: * Pattern p=new Pattern(myExpr);
037: * </pre>
038: * then obtain a Matcher object:<pre>
039: * Matcher matcher=p.matcher(myText);
040: * </pre>
041: * The latter is an automaton that actually performs a search. It provides the following methods:
042: * <li> search for matching substrings : matcher.find() or matcher.findAll();
043: * <li> test whether the text matches the whole pattern : matcher.matches();
044: * <li> test whether the text matches the beginning of the pattern : matcher.matchesPrefix();
045: * <li> search with custom options : matcher.find(int options)
046: * <p>
047: * <b>Flags</b><br>
048: * Flags (see REFlags interface) change the meaning of some regular expression elements at compiletime.
049: * These flags may be passed both as string(see Pattern(String,String)) and as bitwise OR of:
050: * <li><b>REFlags.IGNORE_CASE</b> - enables case insensitivity
051: * <li><b>REFlags.MULTILINE</b> - forces "^" and "$" to match both at the start and the end of line;
052: * <li><b>REFlags.DOTALL</b> - forces "." to match eols('\r' and '\n' in ASCII);
053: * <li><b>REFlags.IGNORE_SPACES</b> - literal spaces in expression are ignored for better readability;
054: * <li><b>REFlags.UNICODE</b> - the predefined classes('\w','\d',etc) are referenced to Unicode;
055: * <li><b>REFlags.XML_SCHEMA</b> - permits XML Schema regular expressions syntax extentions.
056: * <p>
057: * <b>Multithreading</b><br>
058: * Pattern instances are thread-safe, i.e. the same Pattern object may be used
059: * by any number of threads simultaniously. On the other hand, the Matcher objects
060: * are NOT thread safe, so, given a Pattern instance, each thread must obtain
061: * and use its own Matcher.
062: *
063: * @see REFlags
064: * @see Matcher
065: * @see Matcher#setTarget(java.lang.String)
066: * @see Matcher#setTarget(java.lang.String,int,int)
067: * @see Matcher#setTarget(char[],int,int)
068: * @see Matcher#setTarget(java.io.Reader,int)
069: * @see MatchResult
070: * @see MatchResult#group(int)
071: * @see MatchResult#start(int)
072: * @see MatchResult#end(int)
073: * @see MatchResult#length(int)
074: * @see MatchResult#charAt(int,int)
075: * @see MatchResult#prefix()
076: * @see MatchResult#suffix()
077: */
078:
079: public class Pattern implements Serializable, REFlags {
080: String stringRepr;
081:
082: // tree entry
083: Term root, root0;
084:
085: // required number of memory slots
086: int memregs;
087:
088: // required number of iteration counters
089: int counters;
090:
091: // number of lookahead groups
092: int lookaheads;
093:
094: Hashtable namedGroupMap;
095:
096: protected Pattern() throws PatternSyntaxException {
097: }
098:
099: /**
100: * Compiles an expression with default flags.
101: * @param <code>regex</code> the Perl5-compatible regular expression string.
102: * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
103: * @see Pattern#Pattern(java.lang.String,java.lang.String)
104: * @see Pattern#Pattern(java.lang.String,int)
105: */
106: public Pattern(String regex) throws PatternSyntaxException {
107: this (regex, DEFAULT);
108: }
109:
110: /**
111: * Compiles a regular expression using Perl5-style flags.
112: * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen.
113: * The meaning of letters:
114: * <ul>
115: * <li><b>i</b> - case insensitivity, corresponds to REFLlags.IGNORE_CASE;
116: * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFLlags.MULTILINE flag;
117: * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFLlags.DOTALL;
118: * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFLlags.IGNORE_SPACES.
119: * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFLlags.UNICODE; this may yield some performance penalty.
120: * <li><b>X</b> - compatibility with XML Schema, corresponds to REFLlags.XML_SCHEMA.
121: * </ul>
122: * @param <code>regex</code> the Perl5-compatible regular expression string.
123: * @param <code>flags</code> the Perl5-compatible flags.
124: * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
125: * see REFlags
126: */
127: public Pattern(String regex, String flags)
128: throws PatternSyntaxException {
129: stringRepr = regex;
130: compile(regex, parseFlags(flags));
131: }
132:
133: /**
134: * Compiles a regular expression using REFlags.
135: * The <code>flags</code> parameter is a bitwise OR of the folloing values:
136: * <ul>
137: * <li><b>REFLlags.IGNORE_CASE</b> - case insensitivity, corresponds to '<b>i</b>' letter;
138: * <li><b>REFLlags.MULTILINE</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to '<b>m</b>';
139: * <li><b>REFLlags.DOTALL</b> - single line treatment('.' matches \r's and \n's),corresponds to '<b>s</b>';
140: * <li><b>REFLlags.IGNORE_SPACES</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to '<b>x</b>'.
141: * <li><b>REFLlags.UNICODE</b> - predefined classes are regarded as belonging to Unicode, corresponds to '<b>u</b>'; this may yield some performance penalty.
142: * <li><b>REFLlags.XML_SCHEMA</b> - compatibility with XML Schema, corresponds to '<b>X</b>'.
143: * </ul>
144: * @param <code>regex</code> the Perl5-compatible regular expression string.
145: * @param <code>flags</code> the Perl5-compatible flags.
146: * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
147: * see REFlags
148: */
149: public Pattern(String regex, int flags)
150: throws PatternSyntaxException {
151: compile(regex, flags);
152: }
153:
154: /*
155: //java.util.regex.* compatibility
156: public static Pattern compile(String regex,int flags) throws PatternSyntaxException{
157: Pattern p=new Pattern();
158: p.compile(regex,flags);
159: return flags;
160: }
161: */
162:
163: protected void compile(String regex, int flags)
164: throws PatternSyntaxException {
165: stringRepr = regex;
166: Term.makeTree(regex, flags, this );
167: }
168:
169: /**
170: * How many capturing groups this expression includes?
171: */
172: public int groupCount() {
173: return memregs;
174: }
175:
176: /**
177: * Get numeric id for a group name.
178: * @return <code>null</code> if no such name found.
179: * @see MatchResult#group(java.lang.String)
180: * @see MatchResult#isCaptured(java.lang.String)
181: */
182: public Integer groupId(String name) {
183: return ((Integer) namedGroupMap.get(name));
184: }
185:
186: /**
187: * A shorthand for Pattern.matcher(String).matches().<br>
188: * @param s the target
189: * @return true if the entire target matches the pattern
190: * @see Matcher#matches()
191: * @see Matcher#matches(String)
192: */
193: public boolean matches(String s) {
194: return matcher(s).matches();
195: }
196:
197: /**
198: * A shorthand for Pattern.matcher(String).matchesPrefix().<br>
199: * @param s the target
200: * @return true if the entire target matches the beginning of the pattern
201: * @see Matcher#matchesPrefix()
202: */
203: public boolean startsWith(String s) {
204: return matcher(s).matchesPrefix();
205: }
206:
207: /**
208: * Returns a targetless matcher.
209: * Don't forget to supply a target.
210: */
211: public Matcher matcher() {
212: return new Matcher(this );
213: }
214:
215: /**
216: * Returns a matcher for a specified string.
217: */
218: public Matcher matcher(String s) {
219: Matcher m = new Matcher(this );
220: m.setTarget(s);
221: return m;
222: }
223:
224: /**
225: * Returns a matcher for a specified region.
226: */
227: public Matcher matcher(char[] data, int start, int end) {
228: Matcher m = new Matcher(this );
229: m.setTarget(data, start, end);
230: return m;
231: }
232:
233: /**
234: * Returns a matcher for a match result (in a performance-friendly way).
235: * <code>groupId</code> parameter specifies which group is a target.
236: * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET.
237: */
238: public Matcher matcher(MatchResult res, int groupId) {
239: Matcher m = new Matcher(this );
240: if (res instanceof Matcher) {
241: m.setTarget((Matcher) res, groupId);
242: } else {
243: m.setTarget(res.targetChars(), res.start(groupId)
244: + res.targetStart(), res.length(groupId));
245: }
246: return m;
247: }
248:
249: /**
250: * Just as above, yet with symbolic group name.
251: * @exception NullPointerException if there is no group with such name
252: */
253: public Matcher matcher(MatchResult res, String groupName) {
254: Integer id = res.pattern().groupId(groupName);
255: if (id == null)
256: throw new IllegalArgumentException("group not found:"
257: + groupName);
258: int group = id.intValue();
259: return matcher(res, group);
260: }
261:
262: /**
263: * Returns a matcher taking a text stream as target.
264: * <b>Note that this is not a true POSIX-style stream matching</b>, i.e. the whole length of the text is preliminary read and stored in a char array.
265: * @param text a text stream
266: * @param len the length to read from a stream; if <code>len</code> is <code>-1</code>, the whole stream is read in.
267: * @exception IOException indicates an IO problem
268: * @exception OutOfMemoryException if a stream is too lengthy
269: */
270: public Matcher matcher(Reader text, int length) throws IOException {
271: Matcher m = new Matcher(this );
272: m.setTarget(text, length);
273: return m;
274: }
275:
276: /**
277: * Returns a replacer of a pattern by specified perl-like expression.
278: * Such replacer will substitute all occurences of a pattern by an evaluated expression
279: * ("$&" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc).
280: * Example:<pre>
281: * String text="The quick brown fox jumped over the lazy dog";
282: * Pattern word=new Pattern("\\w+");
283: * System.out.println(word.replacer("[$&]").replace(text));
284: * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]"
285: * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)");
286: * System.out.println(swap.replacer("$3$2$1").replace(text));
287: * //prints "The quick brown dog jumped over the lazy fox"
288: * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)");
289: * System.out.println(scramble.replacer("$3$2$1").replace(text));
290: * //prints "quick The fox brown over jumped lazy the dog"
291: * </pre>
292: * @param expr a perl-like expression, the "$&" and "${&}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo.
293: * @see Replacer
294: */
295: public Replacer replacer(String expr) {
296: return new Replacer(this , expr);
297: }
298:
299: /**
300: * Returns a replacer will substitute all occurences of a pattern
301: * through applying a user-defined substitution model.
302: * @param model a Substitution object which is in charge for match substitution
303: * @see Replacer
304: */
305: public Replacer replacer(Substitution model) {
306: return new Replacer(this , model);
307: }
308:
309: /**
310: * Tokenizes a text by an occurences of the pattern.
311: * Note that a series of adjacent matches are regarded as a single separator.
312: * The same as new RETokenizer(Pattern,String);
313: * @see RETokenizer
314: * @see RETokenizer#RETokenizer(jregex.Pattern,java.lang.String)
315: *
316: */
317: public RETokenizer tokenizer(String text) {
318: return new RETokenizer(this , text);
319: }
320:
321: /**
322: * Tokenizes a specified region by an occurences of the pattern.
323: * Note that a series of adjacent matches are regarded as a single separator.
324: * The same as new RETokenizer(Pattern,char[],int,int);
325: * @see RETokenizer
326: * @see RETokenizer#RETokenizer(jregex.Pattern,char[],int,int)
327: */
328: public RETokenizer tokenizer(char[] data, int off, int len) {
329: return new RETokenizer(this , data, off, len);
330: }
331:
332: /**
333: * Tokenizes a specified region by an occurences of the pattern.
334: * Note that a series of adjacent matches are regarded as a single separator.
335: * The same as new RETokenizer(Pattern,Reader,int);
336: * @see RETokenizer
337: * @see RETokenizer#RETokenizer(jregex.Pattern,java.io.Reader,int)
338: */
339: public RETokenizer tokenizer(Reader in, int length)
340: throws IOException {
341: return new RETokenizer(this , in, length);
342: }
343:
344: public String toString() {
345: return stringRepr;
346: }
347:
348: /**
349: * Returns a less or more readable representation of a bytecode for the pattern.
350: */
351: public String toString_d() {
352: return root.toStringAll();
353: }
354:
355: static int parseFlags(String flags) throws PatternSyntaxException {
356: boolean enable = true;
357: int len = flags.length();
358: int result = DEFAULT;
359: for (int i = 0; i < len; i++) {
360: char c = flags.charAt(i);
361: switch (c) {
362: case '+':
363: enable = true;
364: break;
365: case '-':
366: enable = false;
367: break;
368: default:
369: int flag = getFlag(c);
370: if (enable)
371: result |= flag;
372: else
373: result &= (~flag);
374: }
375: }
376: return result;
377: }
378:
379: static int parseFlags(char[] data, int start, int len)
380: throws PatternSyntaxException {
381: boolean enable = true;
382: int result = DEFAULT;
383: for (int i = 0; i < len; i++) {
384: char c = data[start + i];
385: switch (c) {
386: case '+':
387: enable = true;
388: break;
389: case '-':
390: enable = false;
391: break;
392: default:
393: int flag = getFlag(c);
394: if (enable)
395: result |= flag;
396: else
397: result &= (~flag);
398: }
399: }
400: return result;
401: }
402:
403: private static int getFlag(char c) throws PatternSyntaxException {
404: switch (c) {
405: case 'i':
406: return IGNORE_CASE;
407: case 'm':
408: return MULTILINE | DOTALL;
409: // case 's':
410: // return DOTALL;
411: case 'x':
412: return IGNORE_SPACES;
413: // case 'u':
414: // return UNICODE;
415: // case 'X':
416: // return XML_SCHEMA;
417: }
418: throw new PatternSyntaxException("unknown flag: " + c);
419: }
420: }
|