001: /*
002: * gnu/regexp/RESyntax.java
003: * Copyright (C) 1998-2002 Wes Biggs
004: *
005: * This library is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser General Public License as published
007: * by the Free Software Foundation; either version 2.1 of the License, or
008: * (at your option) any later version.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General Public License
016: * along with this program; if not, write to the Free Software
017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
018: */
019:
020: package gnu.regexp;
021:
022: import java.io.Serializable;
023: import java.util.BitSet;
024:
025: /**
026: * An RESyntax specifies the way a regular expression will be compiled.
027: * This class provides a number of predefined useful constants for
028: * emulating popular regular expression syntaxes. Additionally the
029: * user may construct his or her own syntax, using any combination of the
030: * syntax bit constants. The syntax is an optional argument to any of the
031: * matching methods on class RE.
032: *
033: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
034: */
035:
036: public final class RESyntax implements Serializable {
037: static final String DEFAULT_LINE_SEPARATOR = System
038: .getProperty("line.separator");
039:
040: private static final String SYNTAX_IS_FINAL = RE
041: .getLocalizedMessage("syntax.final");
042:
043: private BitSet bits;
044:
045: // true for the constant defined syntaxes
046: private boolean isFinal = false;
047:
048: private String lineSeparator = DEFAULT_LINE_SEPARATOR;
049:
050: // Values for constants are bit indexes
051:
052: /**
053: * Syntax bit. Backslash is an escape character in lists.
054: */
055: public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
056:
057: /**
058: * Syntax bit. Use \? instead of ? and \+ instead of +.
059: */
060: public static final int RE_BK_PLUS_QM = 1;
061:
062: /**
063: * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
064: */
065: public static final int RE_CHAR_CLASSES = 2;
066:
067: /**
068: * Syntax bit. ^ and $ are special everywhere.
069: * <B>Not implemented.</B>
070: */
071: public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
072:
073: /**
074: * Syntax bit. Repetition operators are only special in valid positions.
075: * <B>Not implemented.</B>
076: */
077: public static final int RE_CONTEXT_INDEP_OPS = 4;
078:
079: /**
080: * Syntax bit. Repetition and alternation operators are invalid
081: * at start and end of pattern and other places.
082: * <B>Not implemented</B>.
083: */
084: public static final int RE_CONTEXT_INVALID_OPS = 5;
085:
086: /**
087: * Syntax bit. Match-any-character operator (.) matches a newline.
088: */
089: public static final int RE_DOT_NEWLINE = 6;
090:
091: /**
092: * Syntax bit. Match-any-character operator (.) does not match a null.
093: */
094: public static final int RE_DOT_NOT_NULL = 7;
095:
096: /**
097: * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
098: */
099: public static final int RE_INTERVALS = 8;
100:
101: /**
102: * Syntax bit. No alternation (|), match one-or-more (+), or
103: * match zero-or-one (?) operators.
104: */
105: public static final int RE_LIMITED_OPS = 9;
106:
107: /**
108: * Syntax bit. Newline is an alternation operator.
109: */
110: public static final int RE_NEWLINE_ALT = 10; // impl.
111:
112: /**
113: * Syntax bit. Intervals use { } instead of \{ \}
114: */
115: public static final int RE_NO_BK_BRACES = 11;
116:
117: /**
118: * Syntax bit. Grouping uses ( ) instead of \( \).
119: */
120: public static final int RE_NO_BK_PARENS = 12;
121:
122: /**
123: * Syntax bit. Backreferences not allowed.
124: */
125: public static final int RE_NO_BK_REFS = 13;
126:
127: /**
128: * Syntax bit. Alternation uses | instead of \|
129: */
130: public static final int RE_NO_BK_VBAR = 14;
131:
132: /**
133: * Syntax bit. <B>Not implemented</B>.
134: */
135: public static final int RE_NO_EMPTY_RANGES = 15;
136:
137: /**
138: * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
139: * on RE_NO_BK_PARENS) will throw an exception when compiling.
140: */
141: public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
142:
143: /**
144: * Syntax bit. <B>Not implemented.</B>
145: */
146: public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
147:
148: /**
149: * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
150: */
151: public static final int RE_STINGY_OPS = 18;
152:
153: /**
154: * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
155: */
156: public static final int RE_CHAR_CLASS_ESCAPES = 19;
157:
158: /**
159: * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
160: */
161: public static final int RE_PURE_GROUPING = 20;
162:
163: /**
164: * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
165: * to the text following the current position without consuming that text.
166: */
167: public static final int RE_LOOKAHEAD = 21;
168:
169: /**
170: * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
171: */
172: public static final int RE_STRING_ANCHORS = 22;
173:
174: /**
175: * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
176: */
177: public static final int RE_COMMENTS = 23;
178:
179: /**
180: * Syntax bit. Allow character class escapes within lists, as in Perl5.
181: */
182: public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
183:
184: private static final int BIT_TOTAL = 25;
185:
186: /**
187: * Predefined syntax.
188: * Emulates regular expression support in the awk utility.
189: */
190: public static final RESyntax RE_SYNTAX_AWK;
191:
192: /**
193: * Predefined syntax.
194: * Emulates regular expression support in the ed utility.
195: */
196: public static final RESyntax RE_SYNTAX_ED;
197:
198: /**
199: * Predefined syntax.
200: * Emulates regular expression support in the egrep utility.
201: */
202: public static final RESyntax RE_SYNTAX_EGREP;
203:
204: /**
205: * Predefined syntax.
206: * Emulates regular expression support in the GNU Emacs editor.
207: */
208: public static final RESyntax RE_SYNTAX_EMACS;
209:
210: /**
211: * Predefined syntax.
212: * Emulates regular expression support in the grep utility.
213: */
214: public static final RESyntax RE_SYNTAX_GREP;
215:
216: /**
217: * Predefined syntax.
218: * Emulates regular expression support in the POSIX awk specification.
219: */
220: public static final RESyntax RE_SYNTAX_POSIX_AWK;
221:
222: /**
223: * Predefined syntax.
224: * Emulates POSIX basic regular expression support.
225: */
226: public static final RESyntax RE_SYNTAX_POSIX_BASIC;
227:
228: /**
229: * Predefined syntax.
230: * Emulates regular expression support in the POSIX egrep specification.
231: */
232: public static final RESyntax RE_SYNTAX_POSIX_EGREP;
233:
234: /**
235: * Predefined syntax.
236: * Emulates POSIX extended regular expression support.
237: */
238: public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
239:
240: /**
241: * Predefined syntax.
242: * Emulates POSIX basic minimal regular expressions.
243: */
244: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
245:
246: /**
247: * Predefined syntax.
248: * Emulates POSIX extended minimal regular expressions.
249: */
250: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
251:
252: /**
253: * Predefined syntax.
254: * Emulates regular expression support in the sed utility.
255: */
256: public static final RESyntax RE_SYNTAX_SED;
257:
258: /**
259: * Predefined syntax.
260: * Emulates regular expression support in Larry Wall's perl, version 4,
261: */
262: public static final RESyntax RE_SYNTAX_PERL4;
263:
264: /**
265: * Predefined syntax.
266: * Emulates regular expression support in Larry Wall's perl, version 4,
267: * using single line mode (/s modifier).
268: */
269: public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
270:
271: /**
272: * Predefined syntax.
273: * Emulates regular expression support in Larry Wall's perl, version 5.
274: */
275: public static final RESyntax RE_SYNTAX_PERL5;
276:
277: /**
278: * Predefined syntax.
279: * Emulates regular expression support in Larry Wall's perl, version 5,
280: * using single line mode (/s modifier).
281: */
282: public static final RESyntax RE_SYNTAX_PERL5_S;
283:
284: /**
285: * Predefined syntax.
286: * Emulates regular expression support in Java 1.4's java.util.regex
287: * package.
288: */
289: public static final RESyntax RE_SYNTAX_JAVA_1_4;
290:
291: static {
292: // Define syntaxes
293:
294: RE_SYNTAX_EMACS = new RESyntax().makeFinal();
295:
296: RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax().set(
297: RE_CHAR_CLASSES).set(RE_DOT_NEWLINE).set(
298: RE_DOT_NOT_NULL).set(RE_INTERVALS).set(
299: RE_NO_EMPTY_RANGES).makeFinal();
300:
301: RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
302: .set(RE_BK_PLUS_QM).makeFinal();
303:
304: RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
305: .set(RE_CONTEXT_INDEP_ANCHORS)
306: .set(RE_CONTEXT_INDEP_OPS).set(RE_NO_BK_BRACES).set(
307: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).set(
308: RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
309:
310: RE_SYNTAX_AWK = new RESyntax()
311: .set(RE_BACKSLASH_ESCAPE_IN_LISTS).set(RE_DOT_NOT_NULL)
312: .set(RE_NO_BK_PARENS).set(RE_NO_BK_REFS).set(
313: RE_NO_BK_VBAR).set(RE_NO_EMPTY_RANGES).set(
314: RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
315:
316: RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
317: .set(RE_BACKSLASH_ESCAPE_IN_LISTS).makeFinal();
318:
319: RE_SYNTAX_GREP = new RESyntax().set(RE_BK_PLUS_QM).set(
320: RE_CHAR_CLASSES).set(RE_HAT_LISTS_NOT_NEWLINE).set(
321: RE_INTERVALS).set(RE_NEWLINE_ALT).makeFinal();
322:
323: RE_SYNTAX_EGREP = new RESyntax().set(RE_CHAR_CLASSES).set(
324: RE_CONTEXT_INDEP_ANCHORS).set(RE_CONTEXT_INDEP_OPS)
325: .set(RE_HAT_LISTS_NOT_NEWLINE).set(RE_NEWLINE_ALT).set(
326: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).makeFinal();
327:
328: RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP).set(
329: RE_INTERVALS).set(RE_NO_BK_BRACES).makeFinal();
330:
331: /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
332:
333: RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();
334:
335: RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();
336:
337: RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(
338: RE_SYNTAX_POSIX_COMMON).set(RE_LIMITED_OPS).makeFinal();
339:
340: /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
341: replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
342:
343: RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(
344: RE_SYNTAX_POSIX_COMMON).set(RE_CONTEXT_INDEP_ANCHORS)
345: .set(RE_CONTEXT_INVALID_OPS).set(RE_NO_BK_BRACES).set(
346: RE_NO_BK_PARENS).set(RE_NO_BK_REFS).set(
347: RE_NO_BK_VBAR)
348: .set(RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
349:
350: /* There is no official Perl spec, but here's a "best guess" */
351:
352: RE_SYNTAX_PERL4 = new RESyntax().set(
353: RE_BACKSLASH_ESCAPE_IN_LISTS).set(
354: RE_CONTEXT_INDEP_ANCHORS).set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently
355: .set(RE_INTERVALS).set(RE_NO_BK_BRACES).set(
356: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).set(
357: RE_NO_EMPTY_RANGES).set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S
358: .makeFinal();
359:
360: RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4).set(
361: RE_DOT_NEWLINE).makeFinal();
362:
363: RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4).set(
364: RE_PURE_GROUPING) // (?:)
365: .set(RE_STINGY_OPS) // *?,??,+?,{}?
366: .set(RE_LOOKAHEAD) // (?=)(?!)
367: .set(RE_STRING_ANCHORS) // \A,\Z
368: .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
369: .set(RE_COMMENTS) // (?#)
370: .makeFinal();
371:
372: RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5).set(
373: RE_DOT_NEWLINE).makeFinal();
374:
375: RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
376: // XXX
377: .makeFinal();
378: }
379:
380: /**
381: * Construct a new syntax object with all bits turned off.
382: * This is equivalent to RE_SYNTAX_EMACS.
383: */
384: public RESyntax() {
385: bits = new BitSet(BIT_TOTAL);
386: }
387:
388: /**
389: * Called internally when constructing predefined syntaxes
390: * so their interpretation cannot vary. Conceivably useful
391: * for your syntaxes as well. Causes IllegalAccessError to
392: * be thrown if any attempt to modify the syntax is made.
393: *
394: * @return this object for convenient chaining
395: */
396: public RESyntax makeFinal() {
397: isFinal = true;
398: return this ;
399: }
400:
401: /**
402: * Construct a new syntax object with all bits set the same
403: * as the other syntax.
404: */
405: public RESyntax(RESyntax other) {
406: bits = (BitSet) other.bits.clone();
407: }
408:
409: /**
410: * Check if a given bit is set in this syntax.
411: */
412: public boolean get(int index) {
413: return bits.get(index);
414: }
415:
416: /**
417: * Set a given bit in this syntax.
418: *
419: * @param index the constant (RESyntax.RE_xxx) bit to set.
420: * @return a reference to this object for easy chaining.
421: */
422: public RESyntax set(int index) {
423: if (isFinal)
424: throw new IllegalAccessError(SYNTAX_IS_FINAL);
425: bits.set(index);
426: return this ;
427: }
428:
429: /**
430: * Clear a given bit in this syntax.
431: *
432: * @param index the constant (RESyntax.RE_xxx) bit to clear.
433: * @return a reference to this object for easy chaining.
434: */
435: public RESyntax clear(int index) {
436: if (isFinal)
437: throw new IllegalAccessError(SYNTAX_IS_FINAL);
438: bits.clear(index);
439: return this ;
440: }
441:
442: /**
443: * Changes the line separator string for regular expressions
444: * created using this RESyntax. The default separator is the
445: * value returned by the system property "line.separator", which
446: * should be correct when reading platform-specific files from a
447: * filesystem. However, many programs may collect input from
448: * sources where the line separator is differently specified (for
449: * example, in the applet environment, the text box widget
450: * interprets line breaks as single-character newlines,
451: * regardless of the host platform.
452: *
453: * Note that setting the line separator to a character or
454: * characters that have specific meaning within the current syntax
455: * can cause unexpected chronosynclastic infundibula.
456: *
457: * @return this object for convenient chaining
458: */
459: public RESyntax setLineSeparator(String aSeparator) {
460: if (isFinal)
461: throw new IllegalAccessError(SYNTAX_IS_FINAL);
462: lineSeparator = aSeparator;
463: return this ;
464: }
465:
466: /**
467: * Returns the currently active line separator string. The default
468: * is the platform-dependent system property "line.separator".
469: */
470: public String getLineSeparator() {
471: return lineSeparator;
472: }
473: }
|