001: /*
002: * xtc - The eXTensible Compiler
003: * Copyright (C) 2004 Robert Grimm
004: *
005: * This program is free software; you can redistribute it and/or
006: * modify it under the terms of the GNU General Public License
007: * as published by the Free Software Foundation; either version 2
008: * of the License, or (at your option) any later version.
009: *
010: * This program is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU General Public License for more details.
014: *
015: * You should have received a copy of the GNU General Public License
016: * along with this program; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
018: */
019: package xtc.parser;
020:
021: import java.util.ArrayList;
022: import java.util.Collections;
023: import java.util.Iterator;
024: import java.util.List;
025:
026: import xtc.util.Utilities;
027:
028: /**
029: * A character class terminal.
030: *
031: * <p />Note that {@link #equals(Object)} only determines whether the
032: * two character class terminals have the same structure (that is, are
033: * both exclusive or non-exclusive and have the same list of character
034: * ranges), but does not determine whether the two character class
035: * terminals recognize the same characters.
036: *
037: * @author Robert Grimm
038: * @version $Revision: 1.1 $
039: */
040: public class CharClass extends CharTerminal {
041:
042: /** Parser for a character class specification. */
043: public static class Parser {
044:
045: /** The string. */
046: protected String s;
047:
048: /** The index into the string. */
049: protected int idx;
050:
051: /**
052: * Create a new character class parser for the specified string.
053: * Note that the string must not include the leading
054: * '<code>[</code>' and trailing '<code>]</code>' characters.
055: *
056: * @param s The string to parse.
057: */
058: public Parser(String s) {
059: this .s = s;
060: idx = 0;
061: }
062:
063: /**
064: * Determine whether there are more characters.
065: *
066: * @return <code>true</code> if there are more characters.
067: */
068: public boolean hasNext() {
069: return idx < s.length();
070: }
071:
072: /**
073: * Determine whether the next character is a range delimiter
074: * '<code>-</code>'. Note that this test is <i>destructive</i>:
075: * if the next character is a range delimiter, it is consumed.
076: *
077: * @return <code>true</code> if the next character is a range
078: * delimiter.
079: */
080: public boolean hasRange() {
081: if (idx >= s.length()) {
082: return false;
083: }
084:
085: char c = s.charAt(idx);
086:
087: if ('-' == c) {
088: idx++;
089: return true;
090: } else {
091: return false;
092: }
093: }
094:
095: /**
096: * Return the next character. If the character is represented by
097: * an escape sequence (including Java Unicode and regex-like
098: * escapes), it is unescaped.
099: *
100: * @return The next character.
101: */
102: public char next() {
103: char c = s.charAt(idx);
104: idx++;
105:
106: if ('\\' != c) {
107: return c;
108:
109: } else {
110: c = s.charAt(idx);
111: idx++;
112:
113: switch (c) {
114: case 'b':
115: return '\b';
116: case 't':
117: return '\t';
118: case 'n':
119: return '\n';
120: case 'f':
121: return '\f';
122: case 'r':
123: return '\r';
124: case '"':
125: return '"';
126: case '\'':
127: return '\'';
128: case '-':
129: return '-';
130: case '[':
131: return '[';
132: case '\\':
133: return '\\';
134: case ']':
135: return ']';
136: case 'u':
137: idx += 4;
138: int n;
139: try {
140: n = Integer.parseInt(s.substring(idx - 4, idx),
141: 16);
142: } catch (NumberFormatException x) {
143: throw new IllegalArgumentException(
144: "Illegal Unicode escape (\'\\u"
145: + s.substring(idx - 4, idx)
146: + "\')");
147: }
148: return (char) n;
149: default:
150: throw new IllegalArgumentException(
151: "Illegal character escape (\'\\" + c
152: + "\')");
153: }
154: }
155: }
156:
157: }
158:
159: /** The flag for whether the character class is exclusive. */
160: public boolean exclusive;
161:
162: /**
163: * The list of character ranges. Note that, strictly speaking, this
164: * should be a set of disjoint character ranges. However, it is
165: * implemented as a list so that a character class can be printed as
166: * it was specified.
167: */
168: public List ranges;
169:
170: /**
171: * Create a new, non-exclusive character class.
172: *
173: * @param ranges The list of character ranges.
174: */
175: public CharClass(List ranges) {
176: this (false, ranges);
177: }
178:
179: /**
180: * Create a new character class.
181: *
182: * @param exclusive The exclusive flag.
183: * @param ranges The list of character ranges.
184: */
185: public CharClass(boolean exclusive, List ranges) {
186: this .exclusive = exclusive;
187: this .ranges = ranges;
188: }
189:
190: /**
191: * Create a new, non-exclusive character class for the specified
192: * character.
193: *
194: * @param c The character.
195: */
196: public CharClass(char c) {
197: exclusive = false;
198: ranges = new ArrayList(1);
199: ranges.add(new CharRange(c));
200: }
201:
202: /**
203: * Create a new, non-exclusive character class based on the supplied
204: * character class specification. Note that the character class
205: * specification must not include the leading '<code>[</code>' and
206: * trailing '<code>]</code>' characters.
207: *
208: * @param s The character class specification.
209: */
210: public CharClass(String s) {
211: exclusive = false;
212: ranges = new ArrayList();
213: Parser p = new Parser(s);
214:
215: while (p.hasNext()) {
216: char c1 = p.next();
217: char c2 = (p.hasRange()) ? p.next() : c1;
218: ranges.add(new CharRange(c1, c2));
219: }
220: }
221:
222: /**
223: * Normalize this character class. This method sorts the list of
224: * character ranges by each range's first character and combines
225: * adjacent or overlapping ranges. However, it does <i>not</i> turn
226: * exclusive character classes into non-exclusive ones (as that
227: * conversion might negatively impact recognition performance).
228: *
229: * @return This character class.
230: */
231: public CharClass normalize() {
232: Collections.sort(ranges);
233:
234: for (int i = 0; i < ranges.size() - 1; i++) {
235: CharRange r1 = (CharRange) ranges.get(i);
236: CharRange r2 = (CharRange) ranges.get(i + 1);
237: if (r1.last >= r2.last) {
238: ranges.remove(i + 1);
239: i--;
240: } else if (r1.last >= r2.first - 1) {
241: ranges.set(i, new CharRange(r1.first, r2.last));
242: ranges.remove(i + 1);
243: i--;
244: }
245: }
246:
247: return this ;
248: }
249:
250: /**
251: * Determine whether this character class overlaps the specified
252: * character class. Two character classes overlap if they have
253: * common characters, though they need not necessarily be the same.
254: * Note that the result of this method is only well-defined if both
255: * character classes are non-exclusive.
256: *
257: * @param klass The other character class.
258: * @return <code>true</code> if the two character classes overlap.
259: */
260: public boolean overlaps(CharClass klass) {
261: Iterator iter = klass.ranges.iterator();
262: while (iter.hasNext()) {
263: CharRange r1 = (CharRange) iter.next();
264: Iterator iter2 = ranges.iterator();
265: while (iter2.hasNext()) {
266: CharRange r2 = (CharRange) iter2.next();
267: if (r1.contains(r2.first) || r1.contains(r2.last)
268: || r2.contains(r1.first)
269: || r2.contains(r1.last)) {
270: return true;
271: }
272: }
273: }
274: return false;
275: }
276:
277: /**
278: * Determine the number of characters covered by this character
279: * class. Note that for exclusive character classes this method
280: * returns the number of <i>excluded</i> characters.
281: *
282: * @return The number of characters for this character class.
283: */
284: public int count() {
285: int count = 0;
286: Iterator iter = ranges.iterator();
287: while (iter.hasNext()) {
288: count += ((CharRange) iter.next()).count();
289: }
290: return count;
291: }
292:
293: public int hashCode() {
294: int hash = 0;
295:
296: Iterator iter = ranges.iterator();
297: while (iter.hasNext()) {
298: hash += iter.next().hashCode();
299: }
300:
301: return hash;
302: }
303:
304: public boolean equals(Object o) {
305: if (this == o)
306: return true;
307: if (!(o instanceof CharClass))
308: return false;
309: CharClass other = (CharClass) o;
310: if (exclusive != other.exclusive)
311: return false;
312: if (ranges.size() != other.ranges.size())
313: return false;
314: return ranges.containsAll(other.ranges);
315: }
316:
317: public String toString() {
318: StringBuffer buf = new StringBuffer();
319:
320: if (exclusive) {
321: buf.append('!');
322: }
323:
324: buf.append('[');
325: Iterator iter = ranges.iterator();
326: while (iter.hasNext()) {
327: CharRange r = (CharRange) iter.next();
328:
329: if (r.first == r.last) {
330: Utilities.escape(r.first, buf, Utilities.FULL_ESCAPES);
331: } else {
332: Utilities.escape(r.first, buf, Utilities.FULL_ESCAPES);
333: buf.append('-');
334: Utilities.escape(r.last, buf, Utilities.FULL_ESCAPES);
335: }
336: }
337: buf.append(']');
338:
339: if (exclusive) {
340: buf.append(" .");
341: }
342:
343: return buf.toString();
344: }
345:
346: }
|