001: /*
002: * gnu/regexp/RESyntax.java
003: * Copyright (C) 1998-2001 Wes Biggs
004: *
005: * This library is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser General Public License as published
007: * by the Free Software Foundation; either version 2.1 of the License, or
008: * (at your option) any later version.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General Public License
016: * along with this program; if not, write to the Free Software
017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
018: */
019:
020: package gnu.regexp;
021:
022: import java.io.Serializable;
023: import java.util.BitSet;
024:
025: /**
026: * An RESyntax specifies the way a regular expression will be compiled.
027: * This class provides a number of predefined useful constants for
028: * emulating popular regular expression syntaxes. Additionally the
029: * user may construct his or her own syntax, using any combination of the
030: * syntax bit constants. The syntax is an optional argument to any of the
031: * matching methods on class RE.
032: *
033: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
034: */
035:
036: public final class RESyntax implements Serializable {
037: static final String DEFAULT_LINE_SEPARATOR = System
038: .getProperty("line.separator");
039:
040: private static final String SYNTAX_IS_FINAL = RE
041: .getLocalizedMessage("syntax.final");
042:
043: private BitSet bits;
044:
045: // true for the constant defined syntaxes
046: private boolean isFinal = false;
047:
048: private String lineSeparator = DEFAULT_LINE_SEPARATOR;
049:
050: // Values for constants are bit indexes
051:
052: /**
053: * Syntax bit. Backslash is an escape character in lists.
054: */
055: public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
056:
057: /**
058: * Syntax bit. Use \? instead of ? and \+ instead of +.
059: */
060: public static final int RE_BK_PLUS_QM = 1;
061:
062: /**
063: * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
064: */
065: public static final int RE_CHAR_CLASSES = 2;
066:
067: /**
068: * Syntax bit. ^ and $ are special everywhere.
069: * <B>Not implemented.</B>
070: */
071: public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
072:
073: /**
074: * Syntax bit. Repetition operators are only special in valid positions.
075: * <B>Not implemented.</B>
076: */
077: public static final int RE_CONTEXT_INDEP_OPS = 4;
078:
079: /**
080: * Syntax bit. Repetition and alternation operators are invalid
081: * at start and end of pattern and other places.
082: * <B>Not implemented</B>.
083: */
084: public static final int RE_CONTEXT_INVALID_OPS = 5;
085:
086: /**
087: * Syntax bit. Match-any-character operator (.) matches a newline.
088: */
089: public static final int RE_DOT_NEWLINE = 6;
090:
091: /**
092: * Syntax bit. Match-any-character operator (.) does not match a null.
093: */
094: public static final int RE_DOT_NOT_NULL = 7;
095:
096: /**
097: * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
098: */
099: public static final int RE_INTERVALS = 8;
100:
101: /**
102: * Syntax bit. No alternation (|), match one-or-more (+), or
103: * match zero-or-one (?) operators.
104: */
105: public static final int RE_LIMITED_OPS = 9;
106:
107: /**
108: * Syntax bit. Newline is an alternation operator.
109: */
110: public static final int RE_NEWLINE_ALT = 10; // impl.
111:
112: /**
113: * Syntax bit. Intervals use { } instead of \{ \}
114: */
115: public static final int RE_NO_BK_BRACES = 11;
116:
117: /**
118: * Syntax bit. Grouping uses ( ) instead of \( \).
119: */
120: public static final int RE_NO_BK_PARENS = 12;
121:
122: /**
123: * Syntax bit. Backreferences not allowed.
124: */
125: public static final int RE_NO_BK_REFS = 13;
126:
127: /**
128: * Syntax bit. Alternation uses | instead of \|
129: */
130: public static final int RE_NO_BK_VBAR = 14;
131:
132: /**
133: * Syntax bit. <B>Not implemented</B>.
134: */
135: public static final int RE_NO_EMPTY_RANGES = 15;
136:
137: /**
138: * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
139: * on RE_NO_BK_PARENS) will throw an exception when compiling.
140: */
141: public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
142:
143: /**
144: * Syntax bit. <B>Not implemented.</B>
145: */
146: public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
147:
148: /**
149: * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
150: */
151: public static final int RE_STINGY_OPS = 18;
152:
153: /**
154: * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
155: */
156: public static final int RE_CHAR_CLASS_ESCAPES = 19;
157:
158: /**
159: * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
160: */
161: public static final int RE_PURE_GROUPING = 20;
162:
163: /**
164: * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
165: * to the text following the current position without consuming that text.
166: */
167: public static final int RE_LOOKAHEAD = 21;
168:
169: /**
170: * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
171: */
172: public static final int RE_STRING_ANCHORS = 22;
173:
174: /**
175: * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
176: */
177: public static final int RE_COMMENTS = 23;
178:
179: /**
180: * Syntax bit. Allow character class escapes within lists, as in Perl5.
181: */
182: public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
183:
184: private static final int BIT_TOTAL = 25;
185:
186: /**
187: * Predefined syntax.
188: * Emulates regular expression support in the awk utility.
189: */
190: public static final RESyntax RE_SYNTAX_AWK;
191:
192: /**
193: * Predefined syntax.
194: * Emulates regular expression support in the ed utility.
195: */
196: public static final RESyntax RE_SYNTAX_ED;
197:
198: /**
199: * Predefined syntax.
200: * Emulates regular expression support in the egrep utility.
201: */
202: public static final RESyntax RE_SYNTAX_EGREP;
203:
204: /**
205: * Predefined syntax.
206: * Emulates regular expression support in the GNU Emacs editor.
207: */
208: public static final RESyntax RE_SYNTAX_EMACS;
209:
210: /**
211: * Predefined syntax.
212: * Emulates regular expression support in the grep utility.
213: */
214: public static final RESyntax RE_SYNTAX_GREP;
215:
216: /**
217: * Predefined syntax.
218: * Emulates regular expression support in the POSIX awk specification.
219: */
220: public static final RESyntax RE_SYNTAX_POSIX_AWK;
221:
222: /**
223: * Predefined syntax.
224: * Emulates POSIX basic regular expression support.
225: */
226: public static final RESyntax RE_SYNTAX_POSIX_BASIC;
227:
228: /**
229: * Predefined syntax.
230: * Emulates regular expression support in the POSIX egrep specification.
231: */
232: public static final RESyntax RE_SYNTAX_POSIX_EGREP;
233:
234: /**
235: * Predefined syntax.
236: * Emulates POSIX extended regular expression support.
237: */
238: public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
239:
240: /**
241: * Predefined syntax.
242: * Emulates POSIX basic minimal regular expressions.
243: */
244: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
245:
246: /**
247: * Predefined syntax.
248: * Emulates POSIX extended minimal regular expressions.
249: */
250: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
251:
252: /**
253: * Predefined syntax.
254: * Emulates regular expression support in the sed utility.
255: */
256: public static final RESyntax RE_SYNTAX_SED;
257:
258: /**
259: * Predefined syntax.
260: * Emulates regular expression support in Larry Wall's perl, version 4,
261: */
262: public static final RESyntax RE_SYNTAX_PERL4;
263:
264: /**
265: * Predefined syntax.
266: * Emulates regular expression support in Larry Wall's perl, version 4,
267: * using single line mode (/s modifier).
268: */
269: public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
270:
271: /**
272: * Predefined syntax.
273: * Emulates regular expression support in Larry Wall's perl, version 5.
274: */
275: public static final RESyntax RE_SYNTAX_PERL5;
276:
277: /**
278: * Predefined syntax.
279: * Emulates regular expression support in Larry Wall's perl, version 5,
280: * using single line mode (/s modifier).
281: */
282: public static final RESyntax RE_SYNTAX_PERL5_S;
283:
284: static {
285: // Define syntaxes
286:
287: RE_SYNTAX_EMACS = new RESyntax().makeFinal();
288:
289: RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax().set(
290: RE_CHAR_CLASSES).set(RE_DOT_NEWLINE).set(
291: RE_DOT_NOT_NULL).set(RE_INTERVALS).set(
292: RE_NO_EMPTY_RANGES).makeFinal();
293:
294: RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
295: .set(RE_BK_PLUS_QM).makeFinal();
296:
297: RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
298: .set(RE_CONTEXT_INDEP_ANCHORS)
299: .set(RE_CONTEXT_INDEP_OPS).set(RE_NO_BK_BRACES).set(
300: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).set(
301: RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
302:
303: RE_SYNTAX_AWK = new RESyntax()
304: .set(RE_BACKSLASH_ESCAPE_IN_LISTS).set(RE_DOT_NOT_NULL)
305: .set(RE_NO_BK_PARENS).set(RE_NO_BK_REFS).set(
306: RE_NO_BK_VBAR).set(RE_NO_EMPTY_RANGES).set(
307: RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
308:
309: RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
310: .set(RE_BACKSLASH_ESCAPE_IN_LISTS).makeFinal();
311:
312: RE_SYNTAX_GREP = new RESyntax().set(RE_BK_PLUS_QM).set(
313: RE_CHAR_CLASSES).set(RE_HAT_LISTS_NOT_NEWLINE).set(
314: RE_INTERVALS).set(RE_NEWLINE_ALT).makeFinal();
315:
316: RE_SYNTAX_EGREP = new RESyntax().set(RE_CHAR_CLASSES).set(
317: RE_CONTEXT_INDEP_ANCHORS).set(RE_CONTEXT_INDEP_OPS)
318: .set(RE_HAT_LISTS_NOT_NEWLINE).set(RE_NEWLINE_ALT).set(
319: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).makeFinal();
320:
321: RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP).set(
322: RE_INTERVALS).set(RE_NO_BK_BRACES).makeFinal();
323:
324: /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
325:
326: RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();
327:
328: RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();
329:
330: RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(
331: RE_SYNTAX_POSIX_COMMON).set(RE_LIMITED_OPS).makeFinal();
332:
333: /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
334: replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
335:
336: RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(
337: RE_SYNTAX_POSIX_COMMON).set(RE_CONTEXT_INDEP_ANCHORS)
338: .set(RE_CONTEXT_INVALID_OPS).set(RE_NO_BK_BRACES).set(
339: RE_NO_BK_PARENS).set(RE_NO_BK_REFS).set(
340: RE_NO_BK_VBAR)
341: .set(RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal();
342:
343: /* There is no official Perl spec, but here's a "best guess" */
344:
345: RE_SYNTAX_PERL4 = new RESyntax().set(
346: RE_BACKSLASH_ESCAPE_IN_LISTS).set(
347: RE_CONTEXT_INDEP_ANCHORS).set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently
348: .set(RE_INTERVALS).set(RE_NO_BK_BRACES).set(
349: RE_NO_BK_PARENS).set(RE_NO_BK_VBAR).set(
350: RE_NO_EMPTY_RANGES).set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S
351: .makeFinal();
352:
353: RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4).set(
354: RE_DOT_NEWLINE).makeFinal();
355:
356: RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4).set(
357: RE_PURE_GROUPING) // (?:)
358: .set(RE_STINGY_OPS) // *?,??,+?,{}?
359: .set(RE_LOOKAHEAD) // (?=)(?!)
360: .set(RE_STRING_ANCHORS) // \A,\Z
361: .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
362: .set(RE_COMMENTS) // (?#)
363: .makeFinal();
364:
365: RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5).set(
366: RE_DOT_NEWLINE).makeFinal();
367: }
368:
369: /**
370: * Construct a new syntax object with all bits turned off.
371: * This is equivalent to RE_SYNTAX_EMACS.
372: */
373: public RESyntax() {
374: bits = new BitSet(BIT_TOTAL);
375: }
376:
377: /**
378: * Called internally when constructing predefined syntaxes
379: * so their interpretation cannot vary. Conceivably useful
380: * for your syntaxes as well. Causes IllegalAccessError to
381: * be thrown if any attempt to modify the syntax is made.
382: *
383: * @return this object for convenient chaining
384: */
385: public RESyntax makeFinal() {
386: isFinal = true;
387: return this ;
388: }
389:
390: /**
391: * Construct a new syntax object with all bits set the same
392: * as the other syntax.
393: */
394: public RESyntax(RESyntax other) {
395: bits = (BitSet) other.bits.clone();
396: }
397:
398: /**
399: * Check if a given bit is set in this syntax.
400: */
401: public boolean get(int index) {
402: return bits.get(index);
403: }
404:
405: /**
406: * Set a given bit in this syntax.
407: *
408: * @param index the constant (RESyntax.RE_xxx) bit to set.
409: * @return a reference to this object for easy chaining.
410: */
411: public RESyntax set(int index) {
412: if (isFinal)
413: throw new IllegalAccessError(SYNTAX_IS_FINAL);
414: bits.set(index);
415: return this ;
416: }
417:
418: /**
419: * Clear a given bit in this syntax.
420: *
421: * @param index the constant (RESyntax.RE_xxx) bit to clear.
422: * @return a reference to this object for easy chaining.
423: */
424: public RESyntax clear(int index) {
425: if (isFinal)
426: throw new IllegalAccessError(SYNTAX_IS_FINAL);
427: bits.clear(index);
428: return this ;
429: }
430:
431: /**
432: * Changes the line separator string for regular expressions
433: * created using this RESyntax. The default separator is the
434: * value returned by the system property "line.separator", which
435: * should be correct when reading platform-specific files from a
436: * filesystem. However, many programs may collect input from
437: * sources where the line separator is differently specified (for
438: * example, in the applet environment, the text box widget
439: * interprets line breaks as single-character newlines,
440: * regardless of the host platform.
441: *
442: * Note that setting the line separator to a character or
443: * characters that have specific meaning within the current syntax
444: * can cause unexpected chronosynclastic infundibula.
445: *
446: * @return this object for convenient chaining
447: */
448: public RESyntax setLineSeparator(String aSeparator) {
449: if (isFinal)
450: throw new IllegalAccessError(SYNTAX_IS_FINAL);
451: lineSeparator = aSeparator;
452: return this ;
453: }
454:
455: /**
456: * Returns the currently active line separator string. The default
457: * is the platform-dependent system property "line.separator".
458: */
459: public String getLineSeparator() {
460: return lineSeparator;
461: }
462: }
|