001: /*
002: (c) Copyright 2004, 2005, 2006, 2007, 2008 Hewlett-Packard Development Company, LP, all rights reserved.
003: [See end of file]
004: $Id: PerlPatternParser.java,v 1.16 2008/01/02 12:06:04 andy_seaborne Exp $
005: */
006: package com.hp.hpl.jena.graph.query.regexptrees;
007:
008: import java.util.*;
009:
010: /**
011: Parse Perl5 patterns into RegexpTree structures, or throw an exception for
012: cases that haven't been implemented.
013:
014: @author hedgehog
015: */
016: public class PerlPatternParser {
017: /**
018: The string being parsed, as supplied to the constructor(s).
019: */
020: protected final String toParse;
021:
022: /**
023: The index into the string of the next undealt-with character, ie, it starts at 0.
024: */
025: protected int pointer;
026:
027: /**
028: The length of the string to parse, used as a limit.
029: */
030: protected final int limit;
031:
032: /**
033: The generator for the RegexpTree nodes to be used in the parse.
034: */
035: protected RegexpTreeGenerator generator;
036:
037: /**
038: Count of how many back-references match-points seen so far.
039: */
040: protected int matchPointsSeen;
041:
042: /**
043: The digits, in order.
044: */
045: public static final String digits = "0123456789";
046:
047: /**
048: The characters that are (non-)matchable by \w[W].
049: */
050: public static final String wordChars = digits
051: + "abcdefghijklmnopqrstuvwxyz" + "_"
052: + "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
053:
054: /**
055: Initialise this parser with the string to parse and with the default
056: generator (SimpleGenerator).
057: */
058: public PerlPatternParser(String toParse) {
059: this (toParse, new SimpleGenerator());
060: }
061:
062: /**
063: Initialise this parser with the string to parse and with the generator to
064: use for node construction.
065: */
066: public PerlPatternParser(String toParse, RegexpTreeGenerator gen) {
067: this .toParse = toParse;
068: this .limit = toParse.length();
069: this .generator = gen;
070: }
071:
072: /**
073: Answer the result of parsing the given string as a sequence of alternatives.
074: */
075: public static RegexpTree parse(String string) {
076: return new PerlPatternParser(string).parseAlts();
077: }
078:
079: /**
080: Answer the result of parsing the given string as a sequence of alternatives,
081: using the supplied generator for the pattern nodes.
082: */
083: public static RegexpTree parse(String string,
084: RegexpTreeGenerator gen) {
085: return new PerlPatternParser(string, gen).parseAlts();
086: }
087:
088: /**
089: Exception thrown if a syntax error is detected. Further details are in the
090: error message - it doesn't seem worth worrying about having different
091: classes for different errors. Possibly this should be a non-static class so
092: that it can get at the current context?
093: */
094: public static class SyntaxException extends RuntimeException {
095: public SyntaxException(String message) {
096: super (message);
097: }
098: }
099:
100: /**
101: Answer the string that this parser is parsing.
102: */
103: public String getString() {
104: return toParse;
105: }
106:
107: /**
108: Answer the current index into the parse string.
109: */
110: public int getPointer() {
111: return pointer;
112: }
113:
114: /**
115: Answer the character under the pointer, and advance the pointer.
116: */
117: protected char nextChar() {
118: return toParse.charAt(pointer++);
119: }
120:
121: /**
122: Parse a single atom and return the tree for it, advancing the pointer. This
123: does not deal with quantifiers, for which see parseQuantifier. Unmatched
124: right parentheses, unexpected (hence unbound) quantifiers, and those things
125: that aren't implemented, throw exceptions. An empty atom is permitted
126: (at the end of a string or before a |).
127: */
128: public RegexpTree parseAtom() {
129: if (pointer < limit) {
130: char ch = nextChar();
131: switch (ch) {
132: case '.':
133: return generator.getAnySingle();
134: case '^':
135: return generator.getStartOfLine();
136: case '$':
137: return generator.getEndOfLine();
138: case '|':
139: pointer -= 1;
140: return generator.getNothing();
141: case '[':
142: return parseClass();
143: case ')':
144: pointer -= 1;
145: return generator.getNothing();
146: case '(':
147: return parseParens();
148: case '\\':
149: return parseBackslash();
150: case '*':
151: case '+':
152: case '?':
153: case '{':
154: throw new PerlPatternParser.SyntaxException(
155: "unbound quantifier " + ch);
156: case ']':
157: case '}':
158: default:
159: return generator.getText(ch);
160: }
161: }
162: return generator.getNothing();
163: }
164:
165: /**
166: Parse a class expression and answer an appropriate tree.
167: */
168: protected RegexpTree parseClass() {
169: StringBuffer b = new StringBuffer();
170: boolean negated = parseClassNegation();
171: while (true) {
172: int ch = nextClassChar();
173: if (ch == ']')
174: break;
175: if (ch == '-' && b.length() > 0) {
176: char begin = (char) (b.charAt(b.length() - 1) + 1);
177: char end = (char) Math.abs(nextClassChar());
178: for (char i = begin; i <= end; i += 1)
179: b.append(i);
180: } else
181: b.append((char) Math.abs(ch));
182: }
183: pointer += 1;
184: return generator.getClass(b.toString(), negated);
185: }
186:
187: /**
188: Answer the next character, if it's suitable for part of a class expression,
189: negated if it's been escaped. Iffy.
190: */
191: private int nextClassChar() {
192: char ch = nextChar();
193: if (ch == '\\') {
194: RegexpTree t = parseAtom();
195: if (t instanceof Text)
196: return -((Text) t).getString().charAt(0);
197: throw new SyntaxException("not allowed in class");
198: } else
199: return ch;
200: }
201:
202: protected boolean parseClassNegation() {
203: if (toParse.charAt(pointer) == '^') {
204: pointer += 1;
205: return true;
206: } else
207: return false;
208: }
209:
210: /**
211: Parse a parenthesised expression. Throw a SyntaxException if the closing
212: bracket is missing. Answer the wrapped sub-expression. Does not cater
213: for the (? ...) stuff.
214: */
215: protected RegexpTree parseParens() {
216: RegexpTree operand = parseAlts();
217: if (pointer < limit && toParse.charAt(pointer) == ')')
218: pointer += 1;
219: else
220: throw new SyntaxException("missing closing bracket");
221: matchPointsSeen += 1;
222: return generator.getParen(operand, matchPointsSeen);
223: }
224:
225: /**
226: Parse a backslash-escape and answer the appropriate regexp tree.
227: Unhandled escapes throw an exception.
228: */
229: private RegexpTree parseBackslash() {
230: char ch = nextChar();
231: if ("bBAZnrtfdDwWSsxc0123456789".indexOf(ch) < 0)
232: return generator.getText(ch);
233: else if (ch == 'n')
234: return generator.getText('\n');
235: else if (ch == 'r')
236: return generator.getText('\r');
237: else if (ch == 'f')
238: return generator.getText('\f');
239: else if (ch == 't')
240: return generator.getText('\t');
241: else if (ch == 's')
242: return generator.getClass(" \r\n\t\f", false);
243: else if (ch == 'S')
244: return generator.getClass(" \r\n\t\f", true);
245: else if (ch == 'd')
246: return generator.getClass(digits, false);
247: else if (ch == 'D')
248: return generator.getClass(digits, true);
249: else if (ch == 'w')
250: return generator.getClass(wordChars, false);
251: else if (ch == 'W')
252: return generator.getClass(wordChars, true);
253: else if ('0' <= ch && ch <= '9')
254: return backReferenceOrOctalChar(ch);
255: else if (ch == 'x')
256: return hexEscape();
257: else if (ch == 'c')
258: return control(nextChar());
259: else
260: throw new PerlPatternParser.SyntaxException("can't do \\"
261: + ch + " yet");
262: }
263:
264: /**
265: Answer a RegexpTree representing the single character which is CTRL-ch.
266: */
267: protected RegexpTree control(char ch) {
268: return Text.create((char) (ch - 'A' + 1));
269: }
270:
271: /**
272: Answer a RegexpTree representing the single character whose value is
273: given by the next two hexadecimal digits.
274: */
275: protected RegexpTree hexEscape() {
276: char hi = nextChar(), lo = nextChar();
277: return Text.create((char) (deHex(hi) * 16 + deHex(lo)));
278: }
279:
280: /**
281: Answer the integer value corresponding to the hex digit <code>ch</code>.
282: */
283: private int deHex(char ch) {
284: if (Character.isDigit(ch))
285: return ch - '0';
286: if ('a' <= ch && ch <= 'f')
287: return 10 + ch - 'a';
288: if ('A' <= ch && ch <= 'F')
289: return 10 + ch - 'A';
290: throw new SyntaxException("'" + ch + "' is not a hex digit");
291: }
292:
293: /**
294: Answer the backreference or octal character described by \nnnn sequences.
295: */
296: protected RegexpTree backReferenceOrOctalChar(char ch) {
297: char[] chars = new char[20];
298: int index = 0;
299: chars[index++] = ch;
300: while (pointer < limit) {
301: ch = nextChar();
302: if (!Character.isDigit(ch))
303: break;
304: chars[index++] = ch;
305: }
306: int n = numeric(chars, 10, index);
307: return 0 < n && n <= matchPointsSeen ? generator
308: .getBackReference(n) : generator.getText(numeric(chars,
309: 8, index));
310: }
311:
312: /**
313: Answer the numeric value represented by chars[0..limit-1] in the given base.
314: */
315: protected char numeric(char[] chars, int base, int limit) {
316: int result = 0;
317: for (int i = 0; i < limit; i += 1)
318: result = result * base + (chars[i] - '0');
319: return (char) result;
320: }
321:
322: /**
323: Parse any quantifier and answer the quantified version of the argument
324: tree <code>d</code>. TODO: handle non-greedy quantifiers. (These will
325: currently generate syntax errors when their flagging ? is encountered by
326: parseAtom.)
327: */
328: public RegexpTree parseQuantifier(RegexpTree d) {
329: if (pointer < limit) {
330: char ch = toParse.charAt(pointer);
331: switch (ch) {
332: case '*':
333: pointer += 1;
334: return generator.getZeroOrMore(d);
335:
336: case '+':
337: pointer += 1;
338: return generator.getOneOrMore(d);
339:
340: case '?':
341: pointer += 1;
342: return generator.getOptional(d);
343:
344: case '{':
345: throw new SyntaxException(
346: "numeric quantifiers not done yet");
347: }
348: }
349: return d;
350: }
351:
352: /**
353: Parse an element (an atom and any following quantifier) and answer the
354: possibly-quantified tree.
355: */
356: public RegexpTree parseElement() {
357: return parseQuantifier(parseAtom());
358: }
359:
360: /**
361: Parse a sequence of elements [possibly-quantified atoms] and answer the
362: sequence (singular sequences may be reduced to its single element).
363: */
364: public RegexpTree parseSeq() {
365: List operands = new ArrayList();
366: while (true) {
367: RegexpTree next = parseElement();
368: if (next.equals(generator.getNothing()))
369: break;
370: operands.add(next);
371: }
372: return generator.getSequence(operands);
373: }
374:
375: /**
376: Parse an alternation of sequences and answer an alternative tree (or the
377: single component if there is just one alternative).
378: */
379: public RegexpTree parseAlts() {
380: List operands = new ArrayList();
381: while (true) {
382: operands.add(parseSeq());
383: if (pointer < limit && toParse.charAt(pointer) == '|')
384: pointer += 1;
385: else
386: break;
387: }
388: return generator.getAlternatives(operands);
389: }
390: }
391:
392: /*
393: (c) Copyright 2004, 2005, 2006, 2007, 2008 Hewlett-Packard Development Company, LP
394: All rights reserved.
395:
396: Redistribution and use in source and binary forms, with or without
397: modification, are permitted provided that the following conditions
398: are met:
399:
400: 1. Redistributions of source code must retain the above copyright
401: notice, this list of conditions and the following disclaimer.
402:
403: 2. Redistributions in binary form must reproduce the above copyright
404: notice, this list of conditions and the following disclaimer in the
405: documentation and/or other materials provided with the distribution.
406:
407: 3. The name of the author may not be used to endorse or promote products
408: derived from this software without specific prior written permission.
409:
410: THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
411: IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
412: OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
413: IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
414: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
415: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
416: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
417: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
418: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
419: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
420: */
|