001: /*
002: * gnu/regexp/RESyntax.java
003: * Copyright (C) 1998 Wes Biggs
004: *
005: * This library is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Library General Public License as published
007: * by the Free Software Foundation; either version 2 of the License, or
008: * (at your option) any later version.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Library General Public License for more details.
014: *
015: * You should have received a copy of the GNU Library General Public License
016: * along with this program; if not, write to the Free Software
017: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
018: */
019:
020: package gnu.regexp;
021:
022: import java.util.BitSet;
023:
/**
 * An RESyntax specifies the way a regular expression will be compiled.
 * This class provides a number of predefined constants for emulating
 * popular regular expression syntaxes. Additionally the user may
 * construct a custom syntax using any combination of the syntax bit
 * constants. The syntax is an optional argument to any of the
 * matching methods on class RE.
 * <P>
 * The predefined <CODE>RE_SYNTAX_*</CODE> constants are frozen with
 * {@link #makeFinal()} and cannot be modified. To customize one of them,
 * copy it first: <CODE>new RESyntax(RE_SYNTAX_PERL5).set(...)</CODE>.
 *
 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 */

public class RESyntax {
    /** One flag per syntax bit; the bit index is the constant's value. */
    private final BitSet bits;

    /** Once true, {@link #set(int)} refuses to modify this syntax. */
    private boolean frozen = false;

    // Values for constants are bit indexes

    /**
     * Syntax bit. Backslash is an escape character in lists.
     */
    public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;

    /**
     * Syntax bit. Use \? instead of ? and \+ instead of +.
     */
    public static final int RE_BK_PLUS_QM = 1;

    /**
     * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
     */
    public static final int RE_CHAR_CLASSES = 2;

    /**
     * Syntax bit. ^ and $ are special everywhere.
     * <B>Not implemented.</B>
     */
    public static final int RE_CONTEXT_INDEP_ANCHORS = 3;

    /**
     * Syntax bit. Repetition operators are only special in valid positions.
     * <B>Not implemented.</B>
     */
    public static final int RE_CONTEXT_INDEP_OPS = 4;

    /**
     * Syntax bit. Repetition and alternation operators are invalid
     * at start and end of pattern and other places.
     * <B>Not implemented.</B>
     */
    public static final int RE_CONTEXT_INVALID_OPS = 5;

    /**
     * Syntax bit. Match-any-character operator (.) matches a newline.
     */
    public static final int RE_DOT_NEWLINE = 6;

    /**
     * Syntax bit. Match-any-character operator (.) does not match a null.
     */
    public static final int RE_DOT_NOT_NULL = 7;

    /**
     * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
     */
    public static final int RE_INTERVALS = 8;

    /**
     * Syntax bit. No alternation (|), match one-or-more (+), or
     * match zero-or-one (?) operators.
     */
    public static final int RE_LIMITED_OPS = 9;

    /**
     * Syntax bit. Newline is an alternation operator.
     */
    public static final int RE_NEWLINE_ALT = 10; // impl.

    /**
     * Syntax bit. Intervals use { } instead of \{ \}.
     */
    public static final int RE_NO_BK_BRACES = 11;

    /**
     * Syntax bit. Grouping uses ( ) instead of \( \).
     */
    public static final int RE_NO_BK_PARENS = 12;

    /**
     * Syntax bit. Backreferences not allowed.
     */
    public static final int RE_NO_BK_REFS = 13;

    /**
     * Syntax bit. Alternation uses | instead of \|.
     */
    public static final int RE_NO_BK_VBAR = 14;

    /**
     * Syntax bit. <B>Not implemented.</B>
     */
    public static final int RE_NO_EMPTY_RANGES = 15;

    /**
     * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
     * on RE_NO_BK_PARENS) will throw an exception when compiling.
     */
    public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;

    /**
     * Syntax bit. <B>Not implemented.</B>
     */
    public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;

    /**
     * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
     */
    public static final int RE_STINGY_OPS = 18;

    /**
     * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
     */
    public static final int RE_CHAR_CLASS_ESCAPES = 19;

    /**
     * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
     */
    public static final int RE_PURE_GROUPING = 20;

    /**
     * Syntax bit. Lookahead, (?=xxx) and (?!xxx).
     * <B>Not implemented.</B>
     */
    public static final int RE_LOOKAHEAD = 21;

    /**
     * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
     */
    public static final int RE_STRING_ANCHORS = 22;

    /**
     * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
     */
    public static final int RE_COMMENTS = 23;

    /**
     * Syntax bit. Allow character class escapes within lists, as in Perl5.
     */
    public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;

    /** Number of syntax bits defined above; used to size the bit set. */
    private static final int BIT_TOTAL = 25;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the awk utility.
     */
    public static final RESyntax RE_SYNTAX_AWK;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the ed utility.
     */
    public static final RESyntax RE_SYNTAX_ED;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the egrep utility.
     */
    public static final RESyntax RE_SYNTAX_EGREP;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the GNU Emacs editor.
     */
    public static final RESyntax RE_SYNTAX_EMACS;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the grep utility.
     */
    public static final RESyntax RE_SYNTAX_GREP;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the POSIX awk specification.
     */
    public static final RESyntax RE_SYNTAX_POSIX_AWK;

    /**
     * Predefined syntax.
     * Emulates POSIX basic regular expression support.
     */
    public static final RESyntax RE_SYNTAX_POSIX_BASIC;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the POSIX egrep specification.
     */
    public static final RESyntax RE_SYNTAX_POSIX_EGREP;

    /**
     * Predefined syntax.
     * Emulates POSIX extended regular expression support.
     */
    public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;

    /**
     * Predefined syntax.
     * Emulates POSIX basic minimal regular expressions.
     */
    public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;

    /**
     * Predefined syntax.
     * Emulates POSIX extended minimal regular expressions.
     */
    public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;

    /**
     * Predefined syntax.
     * Emulates regular expression support in the sed utility.
     */
    public static final RESyntax RE_SYNTAX_SED;

    /**
     * Predefined syntax.
     * Emulates regular expression support in Larry Wall's perl, version 4.
     */
    public static final RESyntax RE_SYNTAX_PERL4;

    /**
     * Predefined syntax.
     * Emulates regular expression support in Larry Wall's perl, version 4,
     * using single line mode (/s modifier).
     */
    public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)

    /**
     * Predefined syntax.
     * Emulates regular expression support in Larry Wall's perl, version 5.
     */
    public static final RESyntax RE_SYNTAX_PERL5;

    /**
     * Predefined syntax.
     * Emulates regular expression support in Larry Wall's perl, version 5,
     * using single line mode (/s modifier).
     */
    public static final RESyntax RE_SYNTAX_PERL5_S;

    static {
        // Define syntaxes. Every published constant is frozen with
        // makeFinal() so that no caller can alter a shared syntax; the copy
        // constructor yields an unfrozen copy, so deriving one predefined
        // syntax from another still works.

        RE_SYNTAX_EMACS = new RESyntax().makeFinal();

        // Shared base for the POSIX family; local only, never published.
        RESyntax posixCommon = new RESyntax()
            .set(RE_CHAR_CLASSES)
            .set(RE_DOT_NEWLINE)
            .set(RE_DOT_NOT_NULL)
            .set(RE_INTERVALS)
            .set(RE_NO_EMPTY_RANGES);

        RE_SYNTAX_POSIX_BASIC = new RESyntax(posixCommon)
            .set(RE_BK_PLUS_QM)
            .makeFinal();

        RE_SYNTAX_POSIX_EXTENDED = new RESyntax(posixCommon)
            .set(RE_CONTEXT_INDEP_ANCHORS)
            .set(RE_CONTEXT_INDEP_OPS)
            .set(RE_NO_BK_BRACES)
            .set(RE_NO_BK_PARENS)
            .set(RE_NO_BK_VBAR)
            .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
            .makeFinal();

        RE_SYNTAX_AWK = new RESyntax()
            .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
            .set(RE_DOT_NOT_NULL)
            .set(RE_NO_BK_PARENS)
            .set(RE_NO_BK_REFS)
            .set(RE_NO_BK_VBAR)
            .set(RE_NO_EMPTY_RANGES)
            .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
            .makeFinal();

        RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
            .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
            .makeFinal();

        RE_SYNTAX_GREP = new RESyntax()
            .set(RE_BK_PLUS_QM)
            .set(RE_CHAR_CLASSES)
            .set(RE_HAT_LISTS_NOT_NEWLINE)
            .set(RE_INTERVALS)
            .set(RE_NEWLINE_ALT)
            .makeFinal();

        RE_SYNTAX_EGREP = new RESyntax()
            .set(RE_CHAR_CLASSES)
            .set(RE_CONTEXT_INDEP_ANCHORS)
            .set(RE_CONTEXT_INDEP_OPS)
            .set(RE_HAT_LISTS_NOT_NEWLINE)
            .set(RE_NEWLINE_ALT)
            .set(RE_NO_BK_PARENS)
            .set(RE_NO_BK_VBAR)
            .makeFinal();

        RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
            .set(RE_INTERVALS)
            .set(RE_NO_BK_BRACES)
            .makeFinal();

        /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */

        RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();

        RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC).makeFinal();

        RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(posixCommon)
            .set(RE_LIMITED_OPS)
            .makeFinal();

        /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
           replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */

        RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(posixCommon)
            .set(RE_CONTEXT_INDEP_ANCHORS)
            .set(RE_CONTEXT_INVALID_OPS)
            .set(RE_NO_BK_BRACES)
            .set(RE_NO_BK_PARENS)
            .set(RE_NO_BK_REFS)
            .set(RE_NO_BK_VBAR)
            .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
            .makeFinal();

        /* There is no official Perl spec, but here's a "best guess" */

        RE_SYNTAX_PERL4 = new RESyntax()
            .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
            .set(RE_CONTEXT_INDEP_ANCHORS)
            .set(RE_CONTEXT_INDEP_OPS)       // except for '{', apparently
            .set(RE_INTERVALS)
            .set(RE_NO_BK_BRACES)
            .set(RE_NO_BK_PARENS)
            .set(RE_NO_BK_VBAR)
            .set(RE_NO_EMPTY_RANGES)
            .set(RE_CHAR_CLASS_ESCAPES)      // \d,\D,\w,\W,\s,\S
            .makeFinal();

        RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
            .set(RE_DOT_NEWLINE)
            .makeFinal();

        RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
            .set(RE_PURE_GROUPING)           // (?:)
            .set(RE_STINGY_OPS)              // *?,??,+?,{}?
            .set(RE_LOOKAHEAD)               // (?=)(?!) not implemented
            .set(RE_STRING_ANCHORS)          // \A,\Z
            .set(RE_CHAR_CLASS_ESC_IN_LISTS) // \d,\D,\w,\W,\s,\S within []
            .set(RE_COMMENTS)                // (?#)
            .makeFinal();

        RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
            .set(RE_DOT_NEWLINE)
            .makeFinal();
    }

    /**
     * Construct a new, modifiable syntax object with all bits turned off.
     * The resulting bits are the same as those of RE_SYNTAX_EMACS.
     */
    public RESyntax() {
        bits = new BitSet(BIT_TOTAL);
    }

    /**
     * Construct a new syntax object with all bits set the same
     * as the other syntax. The new object is always modifiable, even
     * when the other syntax has been frozen with {@link #makeFinal()}.
     *
     * @param other the syntax whose bits are copied; must not be null
     */
    public RESyntax(RESyntax other) {
        // Clone so the two syntaxes never share a BitSet.
        bits = (BitSet) other.bits.clone();
    }

    /**
     * Check if a given bit is set in this syntax.
     *
     * @param index one of the syntax bit constants
     * @return true if the bit is set
     */
    public boolean get(int index) {
        return bits.get(index);
    }

    /**
     * Set a given bit in this syntax. Returns a reference to this syntax
     * for easy chaining.
     *
     * @param index one of the syntax bit constants
     * @return this syntax
     * @throws IllegalStateException if this syntax has been frozen with
     *         {@link #makeFinal()}, as all predefined syntaxes are
     */
    public RESyntax set(int index) {
        if (frozen) {
            throw new IllegalStateException(
                "RESyntax is final and cannot be modified; copy it with new RESyntax(other) first");
        }
        bits.set(index);
        return this;
    }

    /**
     * Freeze this syntax so that later calls to {@link #set(int)} throw
     * instead of modifying it. Used for the predefined syntaxes so that
     * their meaning cannot be changed by any caller; may also be applied
     * to user-defined syntaxes. Freezing cannot be undone, but a frozen
     * syntax can still be copied with {@link #RESyntax(RESyntax)}.
     *
     * @return this syntax
     */
    public RESyntax makeFinal() {
        frozen = true;
        return this;
    }
}
|