001: /*
002: **********************************************************************
003: * Copyright (c) 2003, International Business Machines
004: * Corporation and others. All Rights Reserved.
005: **********************************************************************
006: * Author: Alan Liu
007: * Created: September 23 2003
008: * Since: ICU 2.8
009: **********************************************************************
010: */
011: package com.ibm.icu.impl;
012:
013: import java.text.ParsePosition;
014: import com.ibm.icu.text.SymbolTable;
015: import com.ibm.icu.text.UTF16;
016:
017: /**
018: * An iterator that returns 32-bit code points. This class is deliberately
019: * <em>not</em> related to any of the JDK or ICU4J character iterator classes
020: * in order to minimize complexity.
021: * @author Alan Liu
022: * @since ICU 2.8
023: */
024: public class RuleCharacterIterator {
025:
026: // TODO: Ideas for later. (Do not implement if not needed, lest the
027: // code coverage numbers go down due to unused methods.)
028: // 1. Add a copy constructor, equals() method, clone() method.
029: // 2. Rather than return DONE, throw an exception if the end
030: // is reached -- this is an alternate usage model, probably not useful.
031: // 3. Return isEscaped from next(). If this happens,
032: // don't keep an isEscaped member variable.
033:
034: /**
035: * Text being iterated.
036: */
037: private String text;
038:
039: /**
040: * Position of iterator.
041: */
042: private ParsePosition pos;
043:
044: /**
045: * Symbol table used to parse and dereference variables. May be null.
046: */
047: private SymbolTable sym;
048:
049: /**
050: * Current variable expansion, or null if none.
051: */
052: private char[] buf;
053:
054: /**
055: * Position within buf[]. Meaningless if buf == null.
056: */
057: private int bufPos;
058:
059: /**
060: * Flag indicating whether the last character was parsed from an escape.
061: */
062: private boolean isEscaped;
063:
064: /**
065: * Value returned when there are no more characters to iterate.
066: */
067: public static final int DONE = -1;
068:
069: /**
070: * Bitmask option to enable parsing of variable names. If (options &
071: * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
072: * its value. Variables are parsed using the SymbolTable API.
073: */
074: public static final int PARSE_VARIABLES = 1;
075:
076: /**
077: * Bitmask option to enable parsing of escape sequences. If (options &
078: * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
079: * to its value. Escapes are parsed using Utility.unescapeAt().
080: */
081: public static final int PARSE_ESCAPES = 2;
082:
083: /**
084: * Bitmask option to enable skipping of whitespace. If (options &
085: * SKIP_WHITESPACE) != 0, then whitespace characters will be silently
086: * skipped, as if they were not present in the input. Whitespace
087: * characters are defined by UCharacterProperty.isRuleWhiteSpace().
088: */
089: public static final int SKIP_WHITESPACE = 4;
090:
091: /**
092: * Constructs an iterator over the given text, starting at the given
093: * position.
094: * @param text the text to be iterated
095: * @param sym the symbol table, or null if there is none. If sym is null,
096: * then variables will not be deferenced, even if the PARSE_VARIABLES
097: * option is set.
098: * @param pos upon input, the index of the next character to return. If a
099: * variable has been dereferenced, then pos will <em>not</em> increment as
100: * characters of the variable value are iterated.
101: */
102: public RuleCharacterIterator(String text, SymbolTable sym,
103: ParsePosition pos) {
104: if (text == null || pos.getIndex() > text.length()) {
105: throw new IllegalArgumentException();
106: }
107: this .text = text;
108: this .sym = sym;
109: this .pos = pos;
110: buf = null;
111: }
112:
113: /**
114: * Returns true if this iterator has no more characters to return.
115: */
116: public boolean atEnd() {
117: return buf == null && pos.getIndex() == text.length();
118: }
119:
120: /**
121: * Returns the next character using the given options, or DONE if there
122: * are no more characters, and advance the position to the next
123: * character.
124: * @param options one or more of the following options, bitwise-OR-ed
125: * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
126: * @return the current 32-bit code point, or DONE
127: */
128: public int next(int options) {
129: int c = DONE;
130: isEscaped = false;
131:
132: for (;;) {
133: c = _current();
134: _advance(UTF16.getCharCount(c));
135:
136: if (c == SymbolTable.SYMBOL_REF && buf == null
137: && (options & PARSE_VARIABLES) != 0 && sym != null) {
138: String name = sym.parseReference(text, pos, text
139: .length());
140: // If name == null there was an isolated SYMBOL_REF;
141: // return it. Caller must be prepared for this.
142: if (name == null) {
143: break;
144: }
145: bufPos = 0;
146: buf = sym.lookup(name);
147: if (buf == null) {
148: throw new IllegalArgumentException(
149: "Undefined variable: " + name);
150: }
151: // Handle empty variable value
152: if (buf.length == 0) {
153: buf = null;
154: }
155: continue;
156: }
157:
158: if ((options & SKIP_WHITESPACE) != 0
159: && UCharacterProperty.isRuleWhiteSpace(c)) {
160: continue;
161: }
162:
163: if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
164: int offset[] = new int[] { 0 };
165: c = Utility.unescapeAt(lookahead(), offset);
166: jumpahead(offset[0]);
167: isEscaped = true;
168: if (c < 0) {
169: throw new IllegalArgumentException("Invalid escape");
170: }
171: }
172:
173: break;
174: }
175:
176: return c;
177: }
178:
179: /**
180: * Returns true if the last character returned by next() was
181: * escaped. This will only be the case if the option passed in to
182: * next() included PARSE_ESCAPED and the next character was an
183: * escape sequence.
184: */
185: public boolean isEscaped() {
186: return isEscaped;
187: }
188:
189: /**
190: * Returns true if this iterator is currently within a variable expansion.
191: */
192: public boolean inVariable() {
193: return buf != null;
194: }
195:
196: /**
197: * Returns an object which, when later passed to setPos(), will
198: * restore this iterator's position. Usage idiom:
199: *
200: * RuleCharacterIterator iterator = ...;
201: * Object pos = iterator.getPos(null); // allocate position object
202: * for (;;) {
203: * pos = iterator.getPos(pos); // reuse position object
204: * int c = iterator.next(...);
205: * ...
206: * }
207: * iterator.setPos(pos);
208: *
209: * @param p a position object previously returned by getPos(),
210: * or null. If not null, it will be updated and returned. If
211: * null, a new position object will be allocated and returned.
212: * @return a position object which may be passed to setPos(),
213: * either `p,' or if `p' == null, a newly-allocated object
214: */
215: public Object getPos(Object p) {
216: if (p == null) {
217: return new Object[] { buf,
218: new int[] { pos.getIndex(), bufPos } };
219: }
220: Object[] a = (Object[]) p;
221: a[0] = buf;
222: int[] v = (int[]) a[1];
223: v[0] = pos.getIndex();
224: v[1] = bufPos;
225: return p;
226: }
227:
228: /**
229: * Restores this iterator to the position it had when getPos()
230: * returned the given object.
231: * @param p a position object previously returned by getPos()
232: */
233: public void setPos(Object p) {
234: Object[] a = (Object[]) p;
235: buf = (char[]) a[0];
236: int[] v = (int[]) a[1];
237: pos.setIndex(v[0]);
238: bufPos = v[1];
239: }
240:
241: /**
242: * Skips ahead past any ignored characters, as indicated by the given
243: * options. This is useful in conjunction with the lookahead() method.
244: *
245: * Currently, this only has an effect for SKIP_WHITESPACE.
246: * @param options one or more of the following options, bitwise-OR-ed
247: * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
248: */
249: public void skipIgnored(int options) {
250: if ((options & SKIP_WHITESPACE) != 0) {
251: for (;;) {
252: int a = _current();
253: if (!UCharacterProperty.isRuleWhiteSpace(a))
254: break;
255: _advance(UTF16.getCharCount(a));
256: }
257: }
258: }
259:
260: /**
261: * Returns a string containing the remainder of the characters to be
262: * returned by this iterator, without any option processing. If the
263: * iterator is currently within a variable expansion, this will only
264: * extend to the end of the variable expansion. This method is provided
265: * so that iterators may interoperate with string-based APIs. The typical
266: * sequence of calls is to call skipIgnored(), then call lookahead(), then
267: * parse the string returned by lookahead(), then call jumpahead() to
268: * resynchronize the iterator.
269: * @return a string containing the characters to be returned by future
270: * calls to next()
271: */
272: public String lookahead() {
273: if (buf != null) {
274: return new String(buf, bufPos, buf.length - bufPos);
275: } else {
276: return text.substring(pos.getIndex());
277: }
278: }
279:
280: /**
281: * Advances the position by the given number of 16-bit code units.
282: * This is useful in conjunction with the lookahead() method.
283: * @param count the number of 16-bit code units to jump over
284: */
285: public void jumpahead(int count) {
286: if (count < 0) {
287: throw new IllegalArgumentException();
288: }
289: if (buf != null) {
290: bufPos += count;
291: if (bufPos > buf.length) {
292: throw new IllegalArgumentException();
293: }
294: if (bufPos == buf.length) {
295: buf = null;
296: }
297: } else {
298: int i = pos.getIndex() + count;
299: pos.setIndex(i);
300: if (i > text.length()) {
301: throw new IllegalArgumentException();
302: }
303: }
304: }
305:
306: /**
307: * Returns a string representation of this object, consisting of the
308: * characters being iterated, with a '|' marking the current position.
309: * Position within an expanded variable is <em>not</em> indicated.
310: * @return a string representation of this object
311: */
312: public String toString() {
313: int b = pos.getIndex();
314: return text.substring(0, b) + '|' + text.substring(b);
315: }
316:
317: /**
318: * Returns the current 32-bit code point without parsing escapes, parsing
319: * variables, or skipping whitespace.
320: * @return the current 32-bit code point
321: */
322: private int _current() {
323: if (buf != null) {
324: return UTF16.charAt(buf, 0, buf.length, bufPos);
325: } else {
326: int i = pos.getIndex();
327: return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
328: }
329: }
330:
331: /**
332: * Advances the position by the given amount.
333: * @param count the number of 16-bit code units to advance past
334: */
335: private void _advance(int count) {
336: if (buf != null) {
337: bufPos += count;
338: if (bufPos == buf.length) {
339: buf = null;
340: }
341: } else {
342: pos.setIndex(pos.getIndex() + count);
343: if (pos.getIndex() > text.length()) {
344: pos.setIndex(text.length());
345: }
346: }
347: }
348: }
|