001: /*
002: * $Id: AwkCompiler.java,v 1.10 2003/11/07 20:16:24 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2000 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.oro.text.awk;
059:
060: import org.apache.oro.text.regex.*;
061:
062: /**
063: * The AwkCompiler class is used to create compiled regular expressions
064: * conforming to the Awk regular expression syntax. It generates
065: * AwkPattern instances upon compilation to be used in conjunction
066: * with an AwkMatcher instance. AwkMatcher finds true leftmost-longest
067: * matches, so you must take care with how you formulate your regular
068: * expression to avoid matching more than you really want.
069: * <p>
070: * The supported regular expression syntax is a superset of traditional AWK,
071: * but NOT to be confused with GNU AWK or other AWK variants. Additionally,
072: * this AWK implementation is DFA-based and only supports 8-bit ASCII.
073: * Consequently, these classes can perform very fast pattern matches in
074: * most cases.
075: * <p>
076: * This is the traditional Awk syntax that is supported:
077: * <ul>
078: * <li> Alternatives separated by |
079: * <li> Quantified atoms
080: * <dl compact>
081: * <dt> * <dd> Match 0 or more times.
082: * <dt> + <dd> Match 1 or more times.
083: * <dt> ? <dd> Match 0 or 1 times.
084: * </dl>
085: * <li> Atoms
086: * <ul>
087: * <li> regular expression within parentheses
088: * <li> a . matches everything including newline
089: * <li> a ^ is a null token matching the beginning of a string
090: * but has no relation to newlines (and is only valid at the
091: * beginning of a regex; this differs from traditional awk
092: * for the sake of efficiency in Java).
093: * <li> a $ is a null token matching the end of a string but has
094: * no relation to newlines (and is only valid at the
095: * end of a regex; this differs from traditional awk for the
096: * sake of efficiency in Java).
097: * <li> Character classes (e.g., [abcd]) and ranges (e.g. [a-z])
098: * <ul>
099: * <li> Special backslashed characters work within a character class
100: * </ul>
101: * <li> Special backslashed characters
102: * <dl compact>
103: * <dt> \b <dd> backspace
104: * <dt> \n <dd> newline
105: * <dt> \r <dd> carriage return
106: * <dt> \t <dd> tab
107: * <dt> \f <dd> formfeed
108: * <dt> \xnn <dd> hexadecimal representation of character
109: * <dt> \nn or \nnn <dd> octal representation of character
110: * <dt> Any other backslashed character matches itself
111: * </dl>
112: * </ul></ul>
113: * <p>
114: * This is the extended syntax that is supported:
115: * <ul>
116: * <li> Quantified atoms
117: * <dl compact>
118: * <dt> {n,m} <dd> Match at least n but not more than m times.
119: * <dt> {n,} <dd> Match at least n times.
120: * <dt> {n} <dd> Match exactly n times.
121: * </dl>
122: * <li> Atoms
123: * <ul>
124: * <li> Special backslashed characters
125: * <dl compact>
126: * <dt> \d <dd> digit [0-9]
127: * <dt> \D <dd> non-digit [^0-9]
128: * <dt> \w <dd> word character [0-9a-z_A-Z]
129: * <dt> \W <dd> a non-word character [^0-9a-z_A-Z]
130: * <dt> \s <dd> a whitespace character [ \t\n\r\f]
131: * <dt> \S <dd> a non-whitespace character [^ \t\n\r\f]
132: * <dt> \cD <dd> matches the corresponding control character
133: * <dt> \0 <dd> matches null character
134: * </dl>
135: * </ul></ul>
136: *
137: * @version @version@
138: * @since 1.0
139: * @see org.apache.oro.text.regex.PatternCompiler
140: * @see org.apache.oro.text.regex.MalformedPatternException
141: * @see AwkPattern
142: * @see AwkMatcher
143: */
144: public final class AwkCompiler implements PatternCompiler {
145:
146: /**
147: * The default mask for the {@link #compile compile} methods.
148: * It is equal to 0 and indicates no special options are active.
149: */
150: public static final int DEFAULT_MASK = 0;
151:
152: /**
153: * A mask passed as an option to the {@link #compile compile} methods
154: * to indicate a compiled regular expression should be case insensitive.
155: */
156: public static final int CASE_INSENSITIVE_MASK = 0x0001;
157:
158: /**
159: * A mask passed as an option to the {@link #compile compile} methods
160: * to indicate a compiled regular expression should treat input as having
161: * multiple lines. This option affects the interpretation of
162: * the <b> . </b> metacharacters. When this mask is used,
163: * the <b> . </b> metacharacter will not match newlines. The default
164: * behavior is for <b> . </b> to match newlines.
165: */
166: public static final int MULTILINE_MASK = 0x0002;
167:
168: static final char _END_OF_INPUT = '\uFFFF';
169:
170: // All of these are initialized by the compile() and _parse() methods
171: // so there is no need or use in initializing them in the constructor
172: // although this may change in the future.
173: private boolean __inCharacterClass, __caseSensitive, __multiline;
174: private boolean __beginAnchor, __endAnchor;
175: private char __lookahead;
176: private int __position, __bytesRead, __expressionLength;
177: private char[] __regularExpression;
178: private int __openParen, __closeParen;
179:
180: // We do not currently need to initialize any state, but keep this
181: // commented out as a reminder that we may have to at some point.
182: //public AwkCompiler() { }
183:
184: private static boolean __isMetachar(char token) {
185: return (token == '*' || token == '?' || token == '+'
186: || token == '[' || token == ']' || token == '('
187: || token == ')' || token == '|' || /* token == '^' ||
188: token == '$' || */token == '.');
189: }
190:
191: static boolean _isWordCharacter(char token) {
192: return ((token >= 'a' && token <= 'z')
193: || (token >= 'A' && token <= 'Z')
194: || (token >= '0' && token <= '9') || (token == '_'));
195: }
196:
197: static boolean _isLowerCase(char token) {
198: return (token >= 'a' && token <= 'z');
199: }
200:
201: static boolean _isUpperCase(char token) {
202: return (token >= 'A' && token <= 'Z');
203: }
204:
205: static char _toggleCase(char token) {
206: if (_isUpperCase(token))
207: return (char) (token + 32);
208: else if (_isLowerCase(token))
209: return (char) (token - 32);
210:
211: return token;
212: }
213:
214: private void __match(char token) throws MalformedPatternException {
215: if (token == __lookahead) {
216: if (__bytesRead < __expressionLength)
217: __lookahead = __regularExpression[__bytesRead++];
218: else
219: __lookahead = _END_OF_INPUT;
220: } else
221: throw new MalformedPatternException("token: " + token
222: + " does not match lookahead: " + __lookahead
223: + " at position: " + __bytesRead);
224: }
225:
226: private void __putback() {
227: if (__lookahead != _END_OF_INPUT)
228: --__bytesRead;
229: __lookahead = __regularExpression[__bytesRead - 1];
230: }
231:
232: private SyntaxNode __regex() throws MalformedPatternException {
233: SyntaxNode left;
234:
235: left = __branch();
236:
237: if (__lookahead == '|') {
238: __match('|');
239: return (new OrNode(left, __regex()));
240: }
241:
242: return left;
243: }
244:
245: private SyntaxNode __branch() throws MalformedPatternException {
246: CatNode current;
247: SyntaxNode left, root;
248:
249: left = __piece();
250:
251: if (__lookahead == ')') {
252: if (__openParen > __closeParen)
253: return left;
254: else
255: throw new MalformedPatternException(
256: "Parse error: close parenthesis"
257: + " without matching open parenthesis at position "
258: + __bytesRead);
259: } else if (__lookahead == '|' || __lookahead == _END_OF_INPUT)
260: return left;
261:
262: root = current = new CatNode();
263: current._left = left;
264:
265: while (true) {
266: left = __piece();
267:
268: if (__lookahead == ')') {
269: if (__openParen > __closeParen) {
270: current._right = left;
271: break;
272: } else
273: throw new MalformedPatternException(
274: "Parse error: close parenthesis"
275: + " without matching open parenthesis at position "
276: + __bytesRead);
277: } else if (__lookahead == '|'
278: || __lookahead == _END_OF_INPUT) {
279: current._right = left;
280: break;
281: }
282:
283: current._right = new CatNode();
284: current = (CatNode) current._right;
285: current._left = left;
286: }
287:
288: return root;
289: }
290:
291: private SyntaxNode __piece() throws MalformedPatternException {
292: SyntaxNode left;
293:
294: left = __atom();
295:
296: switch (__lookahead) {
297: case '+':
298: __match('+');
299: return (new PlusNode(left));
300: case '?':
301: __match('?');
302: return (new QuestionNode(left));
303: case '*':
304: __match('*');
305: return (new StarNode(left));
306: case '{':
307: return __repetition(left);
308: }
309:
310: return left;
311: }
312:
313: // if numChars is 0, this means match as many as you want
314: private int __parseUnsignedInteger(int radix, int minDigits,
315: int maxDigits) throws MalformedPatternException {
316: int num, digits = 0;
317: StringBuffer buf;
318:
319: // We don't expect huge numbers, so an initial buffer of 4 is fine.
320: buf = new StringBuffer(4);
321:
322: while (Character.digit(__lookahead, radix) != -1
323: && digits < maxDigits) {
324: buf.append((char) __lookahead);
325: __match(__lookahead);
326: ++digits;
327: }
328:
329: if (digits < minDigits || digits > maxDigits)
330: throw new MalformedPatternException(
331: "Parse error: unexpected number of digits at position "
332: + __bytesRead);
333:
334: try {
335: num = Integer.parseInt(buf.toString(), radix);
336: } catch (NumberFormatException e) {
337: throw new MalformedPatternException(
338: "Parse error: numeric value at " + "position "
339: + __bytesRead + " is invalid");
340: }
341:
342: return num;
343: }
344:
345: private SyntaxNode __repetition(SyntaxNode atom)
346: throws MalformedPatternException {
347: int min, max, startPosition[];
348: SyntaxNode root = null;
349: CatNode catNode;
350:
351: __match('{');
352:
353: min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
354: startPosition = new int[1];
355: startPosition[0] = __position;
356:
357: if (__lookahead == '}') {
358: // Match exactly min times. Concatenate the atom min times.
359: __match('}');
360:
361: if (min == 0)
362: throw new MalformedPatternException(
363: "Parse error: Superfluous interval specified at position "
364: + __bytesRead
365: + ". Number of occurences was set to zero.");
366:
367: if (min == 1)
368: return atom;
369:
370: root = catNode = new CatNode();
371: catNode._left = atom;
372:
373: while (--min > 1) {
374: atom = atom._clone(startPosition);
375:
376: catNode._right = new CatNode();
377: catNode = (CatNode) catNode._right;
378: catNode._left = atom;
379: }
380:
381: catNode._right = atom._clone(startPosition);
382: } else if (__lookahead == ',') {
383: __match(',');
384:
385: if (__lookahead == '}') {
386: // match at least min times
387: __match('}');
388:
389: if (min == 0)
390: return new StarNode(atom);
391:
392: if (min == 1)
393: return new PlusNode(atom);
394:
395: root = catNode = new CatNode();
396: catNode._left = atom;
397:
398: while (--min > 0) {
399: atom = atom._clone(startPosition);
400:
401: catNode._right = new CatNode();
402: catNode = (CatNode) catNode._right;
403: catNode._left = atom;
404: }
405:
406: catNode._right = new StarNode(atom
407: ._clone(startPosition));
408: } else {
409: // match at least min times and at most max times
410: max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
411: __match('}');
412:
413: if (max < min)
414: throw new MalformedPatternException(
415: "Parse error: invalid interval; " + max
416: + " is less than " + min
417: + " at position " + __bytesRead);
418: if (max == 0)
419: throw new MalformedPatternException(
420: "Parse error: Superfluous interval specified at position "
421: + __bytesRead
422: + ". Number of occurences was set to zero.");
423:
424: if (min == 0) {
425: if (max == 1)
426: return new QuestionNode(atom);
427:
428: root = catNode = new CatNode();
429: atom = new QuestionNode(atom);
430: catNode._left = atom;
431:
432: while (--max > 1) {
433: atom = atom._clone(startPosition);
434:
435: catNode._right = new CatNode();
436: catNode = (CatNode) catNode._right;
437: catNode._left = atom;
438: }
439:
440: catNode._right = atom._clone(startPosition);
441: } else if (min == max) {
442: if (min == 1)
443: return atom;
444:
445: root = catNode = new CatNode();
446: catNode._left = atom;
447:
448: while (--min > 1) {
449: atom = atom._clone(startPosition);
450:
451: catNode._right = new CatNode();
452: catNode = (CatNode) catNode._right;
453: catNode._left = atom;
454: }
455:
456: catNode._right = atom._clone(startPosition);
457: } else {
458: int count;
459:
460: root = catNode = new CatNode();
461: catNode._left = atom;
462:
463: for (count = 1; count < min; count++) {
464: atom = atom._clone(startPosition);
465:
466: catNode._right = new CatNode();
467: catNode = (CatNode) catNode._right;
468: catNode._left = atom;
469: }
470:
471: atom = new QuestionNode(atom._clone(startPosition));
472:
473: count = max - min;
474:
475: if (count == 1)
476: catNode._right = atom;
477: else {
478: catNode._right = new CatNode();
479: catNode = (CatNode) catNode._right;
480: catNode._left = atom;
481:
482: while (--count > 1) {
483: atom = atom._clone(startPosition);
484:
485: catNode._right = new CatNode();
486: catNode = (CatNode) catNode._right;
487: catNode._left = atom;
488: }
489:
490: catNode._right = atom._clone(startPosition);
491: }
492: }
493: }
494: } else
495: throw new MalformedPatternException(
496: "Parse error: unexpected character " + __lookahead
497: + " in interval at position " + __bytesRead);
498: __position = startPosition[0];
499: return root;
500: }
501:
502: private SyntaxNode __backslashToken()
503: throws MalformedPatternException {
504: SyntaxNode current;
505: char token;
506: int number;
507:
508: __match('\\');
509:
510: if (__lookahead == 'x') {
511: __match('x');
512: // Parse a hexadecimal number
513: current = _newTokenNode((char) __parseUnsignedInteger(16,
514: 2, 2), __position++);
515: } else if (__lookahead == 'c') {
516: __match('c');
517: // Create a control character
518: token = Character.toUpperCase(__lookahead);
519: token = (char) (token > 63 ? token - 64 : token + 64);
520: current = new TokenNode(token, __position++);
521: __match(__lookahead);
522: } else if (__lookahead >= '0' && __lookahead <= '9') {
523: __match(__lookahead);
524:
525: if (__lookahead >= '0' && __lookahead <= '9') {
526: // We have an octal character or a multi-digit backreference.
527: // Assume octal character for now.
528: __putback();
529: number = __parseUnsignedInteger(10, 2, 3);
530: number = Integer.parseInt(Integer.toString(number), 8);
531: current = _newTokenNode((char) number, __position++);
532: } else {
533: // We have either \0, an escaped digit, or a backreference.
534: __putback();
535: if (__lookahead == '0') {
536: // \0 matches the null character
537: __match('0');
538: current = new TokenNode('\0', __position++);
539: } else {
540: // Either an escaped digit or backreference.
541: number = Character.digit(__lookahead, 10);
542: current = _newTokenNode(__lookahead, __position++);
543: __match(__lookahead);
544: }
545: }
546: } else if (__lookahead == 'b') {
547: // Inside of a character class the \b means backspace, otherwise
548: // it means a word boundary
549: //if(__inCharacterClass)
550: // \b always means backspace
551: current = new TokenNode('\b', __position++);
552: /*
553: else
554: current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
555: position++);
556: */
557: __match('b');
558: } /*else if(__lookahead == 'B' && !__inCharacterClass){
559: current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
560: position++);
561: __match('B');
562: } */else {
563: CharacterClassNode characterSet;
564: token = __lookahead;
565:
566: switch (__lookahead) {
567: case 'n':
568: token = '\n';
569: break;
570: case 'r':
571: token = '\r';
572: break;
573: case 't':
574: token = '\t';
575: break;
576: case 'f':
577: token = '\f';
578: break;
579: }
580:
581: switch (token) {
582: case 'd':
583: characterSet = new CharacterClassNode(__position++);
584: characterSet._addTokenRange('0', '9');
585: current = characterSet;
586: break;
587: case 'D':
588: characterSet = new NegativeCharacterClassNode(
589: __position++);
590: characterSet._addTokenRange('0', '9');
591: current = characterSet;
592: break;
593: case 'w':
594: characterSet = new CharacterClassNode(__position++);
595: characterSet._addTokenRange('0', '9');
596: characterSet._addTokenRange('a', 'z');
597: characterSet._addTokenRange('A', 'Z');
598: characterSet._addToken('_');
599: current = characterSet;
600: break;
601: case 'W':
602: characterSet = new NegativeCharacterClassNode(
603: __position++);
604: characterSet._addTokenRange('0', '9');
605: characterSet._addTokenRange('a', 'z');
606: characterSet._addTokenRange('A', 'Z');
607: characterSet._addToken('_');
608: current = characterSet;
609: break;
610: case 's':
611: characterSet = new CharacterClassNode(__position++);
612: characterSet._addToken(' ');
613: characterSet._addToken('\f');
614: characterSet._addToken('\n');
615: characterSet._addToken('\r');
616: characterSet._addToken('\t');
617: current = characterSet;
618: break;
619: case 'S':
620: characterSet = new NegativeCharacterClassNode(
621: __position++);
622: characterSet._addToken(' ');
623: characterSet._addToken('\f');
624: characterSet._addToken('\n');
625: characterSet._addToken('\r');
626: characterSet._addToken('\t');
627: current = characterSet;
628: break;
629: default:
630: current = _newTokenNode(token, __position++);
631: break;
632: }
633:
634: __match(__lookahead);
635: }
636:
637: return current;
638: }
639:
640: private SyntaxNode __atom() throws MalformedPatternException {
641: SyntaxNode current;
642:
643: if (__lookahead == '(') {
644: __match('(');
645: ++__openParen;
646: current = __regex();
647: __match(')');
648: ++__closeParen;
649: } else if (__lookahead == '[')
650: current = __characterClass();
651: else if (__lookahead == '.') {
652: CharacterClassNode characterSet;
653:
654: __match('.');
655: characterSet = new NegativeCharacterClassNode(__position++);
656: if (__multiline)
657: characterSet._addToken('\n');
658: current = characterSet;
659: } else if (__lookahead == '\\') {
660: current = __backslashToken();
661: } /*else if(__lookahead == '^') {
662: current =
663: new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);
664: __match('^');
665: } else if(__lookahead == '$') {
666: current =
667: new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);
668: __match('$');
669: } */else if (!__isMetachar(__lookahead)) {
670: current = _newTokenNode(__lookahead, __position++);
671: __match(__lookahead);
672: } else
673: throw new MalformedPatternException(
674: "Parse error: unexpected character " + __lookahead
675: + " at position " + __bytesRead);
676:
677: return current;
678: }
679:
680: private SyntaxNode __characterClass()
681: throws MalformedPatternException {
682: char lastToken, token;
683: SyntaxNode node;
684: CharacterClassNode current;
685:
686: __match('[');
687: __inCharacterClass = true;
688:
689: if (__lookahead == '^') {
690: __match('^');
691: current = new NegativeCharacterClassNode(__position++);
692: } else
693: current = new CharacterClassNode(__position++);
694:
695: while (__lookahead != ']' && __lookahead != _END_OF_INPUT) {
696:
697: if (__lookahead == '\\') {
698: node = __backslashToken();
699: --__position;
700:
701: // __backslashToken() (actually newTokenNode()) does not take care of
702: // case insensitivity when __inCharacterClass is true.
703: if (node instanceof TokenNode) {
704: lastToken = ((TokenNode) node)._token;
705: current._addToken(lastToken);
706: if (!__caseSensitive)
707: current._addToken(_toggleCase(lastToken));
708: } else {
709: CharacterClassNode slash;
710: slash = (CharacterClassNode) node;
711: // This could be made more efficient by manipulating the
712: // characterSet elements of the CharacterClassNodes but
713: // for the moment, this is more clear.
714: for (token = 0; token < LeafNode._NUM_TOKENS; token++) {
715: if (slash._matches(token))
716: current._addToken(token);
717: }
718:
719: // A byproduct of this act is that when a '-' occurs after
720: // a \d, \w, etc. it is not interpreted as a range and no
721: // parse exception is thrown.
722: // This is considered a feature and not a bug for now.
723: continue;
724: }
725: } else {
726: lastToken = __lookahead;
727: current._addToken(__lookahead);
728: if (!__caseSensitive)
729: current._addToken(_toggleCase(__lookahead));
730: __match(__lookahead);
731: }
732:
733: // In Perl, a - is a token if it occurs at the beginning
734: // or end of the character class. Anywhere else, it indicates
735: // a range.
736: // A byproduct of this implementation is that if a '-' occurs
737: // after the end of a range, it is interpreted as a '-' and no
738: // exception is thrown. e.g., the second dash in [a-z-x]
739: // This is considered a feature and not a bug for now.
740: if (__lookahead == '-') {
741: __match('-');
742: if (__lookahead == ']') {
743: current._addToken('-');
744: break;
745: } else if (__lookahead == '\\') {
746: node = __backslashToken();
747: --__position;
748: if (node instanceof TokenNode)
749: token = ((TokenNode) node)._token;
750: else
751: throw new MalformedPatternException(
752: "Parse error: invalid range specified at position "
753: + __bytesRead);
754: } else {
755: token = __lookahead;
756: __match(__lookahead);
757: }
758:
759: if (token < lastToken)
760: throw new MalformedPatternException(
761: "Parse error: invalid range specified at position "
762: + __bytesRead);
763: current._addTokenRange(lastToken + 1, token);
764: if (!__caseSensitive)
765: current._addTokenRange(
766: _toggleCase((char) (lastToken + 1)),
767: _toggleCase(token));
768: }
769: }
770:
771: __match(']');
772: __inCharacterClass = false;
773: return current;
774: }
775:
776: SyntaxNode _newTokenNode(char token, int position) {
777: if (!__inCharacterClass && !__caseSensitive
778: && (_isUpperCase(token) || _isLowerCase(token))) {
779: CharacterClassNode node = new CharacterClassNode(position);
780: node._addToken(token);
781: node._addToken(_toggleCase(token));
782: return node;
783: }
784:
785: return new TokenNode(token, position);
786: }
787:
788: SyntaxTree _parse(char[] expression)
789: throws MalformedPatternException {
790: SyntaxTree tree;
791:
792: __openParen = __closeParen = 0;
793: __regularExpression = expression;
794: __bytesRead = 0;
795: __expressionLength = expression.length;
796: __inCharacterClass = false;
797:
798: __position = 0;
799: __match(__lookahead); // Call match to read first input.
800:
801: if (__lookahead == '^') {
802: __beginAnchor = true;
803: __match(__lookahead);
804: }
805:
806: if (__expressionLength > 0
807: && expression[__expressionLength - 1] == '$') {
808: --__expressionLength;
809: __endAnchor = true;
810: }
811:
812: if (__expressionLength > 1
813: || (__expressionLength == 1 && !__beginAnchor)) {
814: CatNode root;
815: root = new CatNode();
816: root._left = __regex();
817: // end marker
818: root._right = new TokenNode(
819: (char) LeafNode._END_MARKER_TOKEN, __position++);
820: tree = new SyntaxTree(root, __position);
821: } else
822: tree = new SyntaxTree(new TokenNode(
823: (char) LeafNode._END_MARKER_TOKEN, 0), 1);
824:
825: tree._computeFollowPositions();
826:
827: return tree;
828: }
829:
830: /**
831: * Compiles an Awk regular expression into an AwkPattern instance that
832: * can be used by an AwkMatcher object to perform pattern matching.
833: * <p>
834: * @param pattern An Awk regular expression to compile.
835: * @param options A set of flags giving the compiler instructions on
836: * how to treat the regular expression. Currently the
837: * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
838: * @return A Pattern instance constituting the compiled regular expression.
839: * This instance will always be an AwkPattern and can be reliably
840: * be casted to an AwkPattern.
841: * @exception MalformedPatternException If the compiled expression
842: * is not a valid Awk regular expression.
843: */
844: public Pattern compile(char[] pattern, int options)
845: throws MalformedPatternException {
846: SyntaxTree tree;
847: AwkPattern regexp;
848:
849: __beginAnchor = __endAnchor = false;
850: __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
851: __multiline = ((options & MULTILINE_MASK) != 0);
852: tree = _parse(pattern);
853: regexp = new AwkPattern(new String(pattern), tree);
854: regexp._options = options;
855: regexp._hasBeginAnchor = __beginAnchor;
856: regexp._hasEndAnchor = __endAnchor;
857:
858: return regexp;
859: }
860:
861: /**
862: * Compiles an Awk regular expression into an AwkPattern instance that
863: * can be used by an AwkMatcher object to perform pattern matching.
864: * <p>
865: * @param pattern An Awk regular expression to compile.
866: * @param options A set of flags giving the compiler instructions on
867: * how to treat the regular expression. Currently the
868: * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
869: * @return A Pattern instance constituting the compiled regular expression.
870: * This instance will always be an AwkPattern and can be reliably
871: * be casted to an AwkPattern.
872: * @exception MalformedPatternException If the compiled expression
873: * is not a valid Awk regular expression.
874: */
875: public Pattern compile(String pattern, int options)
876: throws MalformedPatternException {
877: SyntaxTree tree;
878: AwkPattern regexp;
879:
880: __beginAnchor = __endAnchor = false;
881: __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
882: __multiline = ((options & MULTILINE_MASK) != 0);
883: tree = _parse(pattern.toCharArray());
884: regexp = new AwkPattern(pattern, tree);
885: regexp._options = options;
886: regexp._hasBeginAnchor = __beginAnchor;
887: regexp._hasEndAnchor = __endAnchor;
888:
889: return regexp;
890: }
891:
892: /**
893: * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
894: * <p>
895: * @param pattern A regular expression to compile.
896: * @return A Pattern instance constituting the compiled regular expression.
897: * This instance will always be an AwkPattern and can be reliably
898: * be casted to an AwkPattern.
899: * @exception MalformedPatternException If the compiled expression
900: * is not a valid Awk regular expression.
901: */
902: public Pattern compile(char[] pattern)
903: throws MalformedPatternException {
904: return compile(pattern, DEFAULT_MASK);
905: }
906:
907: /**
908: * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
909: * <p>
910: * @param pattern A regular expression to compile.
911: * @return A Pattern instance constituting the compiled regular expression.
912: * This instance will always be an AwkPattern and can be reliably
913: * be casted to an AwkPattern.
914: * @exception MalformedPatternException If the compiled expression
915: * is not a valid Awk regular expression.
916: */
917: public Pattern compile(String pattern)
918: throws MalformedPatternException {
919: return compile(pattern, DEFAULT_MASK);
920: }
921:
922: }
|