001: //##header
002: /*
003: *******************************************************************************
004: * Copyright (C) 2002-2006, International Business Machines Corporation and *
005: * others. All Rights Reserved. *
006: *******************************************************************************
007: */
008: //#ifndef FOUNDATION
009: package com.ibm.icu.dev.test.util;
010:
011: import java.text.ParsePosition;
012:
013: import com.ibm.icu.text.*;
014: import com.ibm.icu.lang.*;
015:
016: import java.util.HashMap;
017: import java.util.HashSet;
018: import java.util.Set;
019: import java.util.Map;
020:
021: public class Tokenizer {
022: protected String source;
023:
024: protected StringBuffer buffer = new StringBuffer();
025: protected long number;
026: protected UnicodeSet unicodeSet = null;
027: protected int index;
028: boolean backedup = false;
029: protected int lastIndex = -1;
030: protected int nextIndex;
031: int lastValue = BACKEDUP_TOO_FAR;
032: TokenSymbolTable symbolTable = new TokenSymbolTable();
033:
034: private static final char QUOTE = '\'', BSLASH = '\\';
035: private static final UnicodeSet QUOTERS = new UnicodeSet().add(
036: QUOTE).add(BSLASH);
037: private static final UnicodeSet WHITESPACE = new UnicodeSet(
038: "["
039: + "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029"
040: + "]");
041: private static final UnicodeSet SYNTAX = new UnicodeSet(
042: "["
043: + "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E"
044: + "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE"
045: + "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7"
046: + "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF"
047: + "\\u3001\\u3003\\u3008-\\u3020\\u3030"
048: + "\\uFD3E\\uFD3F\\uFE45\\uFE46" + "]").removeAll(
049: QUOTERS).remove('$');
050: private static final UnicodeSet NEWLINE = new UnicodeSet(
051: "[\\u000A\\u000D\\u0085\\u2028\\u2029]");
052: private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
053: private static final UnicodeSet NON_STRING = new UnicodeSet()
054: .addAll(WHITESPACE).addAll(SYNTAX);
055:
056: protected UnicodeSet whiteSpace = WHITESPACE;
057: protected UnicodeSet syntax = SYNTAX;
058: private UnicodeSet non_string = NON_STRING;
059:
060: private void fixSets() {
061: if (syntax.containsSome(QUOTERS)
062: || syntax.containsSome(whiteSpace)) {
063: syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS)
064: .removeAll(whiteSpace);
065: }
066: if (whiteSpace.containsSome(QUOTERS)) {
067: whiteSpace = ((UnicodeSet) whiteSpace.clone())
068: .removeAll(QUOTERS);
069: }
070: non_string = new UnicodeSet(syntax).addAll(whiteSpace);
071: }
072:
073: public Tokenizer setSource(String source) {
074: this .source = source;
075: this .index = 0;
076: return this ; // for chaining
077: }
078:
079: public Tokenizer setIndex(int index) {
080: this .index = index;
081: return this ; // for chaining
082: }
083:
084: public static final int DONE = -1, NUMBER = -2, STRING = -3,
085: UNICODESET = -4, UNTERMINATED_QUOTE = -5,
086: BACKEDUP_TOO_FAR = -6;
087:
088: private static final int FIRST = 0, IN_NUMBER = 1, IN_SPACE = 2,
089: AFTER_QUOTE = 3, // warning: order is important for switch statement
090: IN_STRING = 4, AFTER_BSLASH = 5, IN_QUOTE = 6;
091:
092: public String toString(int type, boolean backedupBefore) {
093: String s = backedup ? "@" : "*";
094: switch (type) {
095: case DONE:
096: return s + "Done" + s;
097: case BACKEDUP_TOO_FAR:
098: return s + "Illegal Backup" + s;
099: case UNTERMINATED_QUOTE:
100: return s + "Unterminated Quote=" + getString() + s;
101: case STRING:
102: return s + "s=" + getString() + s;
103: case NUMBER:
104: return s + "n=" + getNumber() + s;
105: case UNICODESET:
106: return s + "n=" + getUnicodeSet() + s;
107: default:
108: return s + "c=" + usf.getName(type, true) + s;
109: }
110: }
111:
112: private static final BagFormatter usf = new BagFormatter();
113:
114: public void backup() {
115: if (backedup)
116: throw new IllegalArgumentException("backup too far");
117: backedup = true;
118: nextIndex = index;
119: index = lastIndex;
120: }
121:
122: /*
123: public int next2() {
124: boolean backedupBefore = backedup;
125: int result = next();
126: System.out.println(toString(result, backedupBefore));
127: return result;
128: }
129: */
130:
131: public int next() {
132: if (backedup) {
133: backedup = false;
134: index = nextIndex;
135: return lastValue;
136: }
137: int cp = 0;
138: boolean inComment = false;
139: // clean off any leading whitespace or comments
140: while (true) {
141: if (index >= source.length())
142: return lastValue = DONE;
143: cp = nextChar();
144: if (inComment) {
145: if (NEWLINE.contains(cp))
146: inComment = false;
147: } else {
148: if (cp == '#')
149: inComment = true;
150: else if (!whiteSpace.contains(cp))
151: break;
152: }
153: }
154: // record the last index in case we have to backup
155: lastIndex = index;
156:
157: if (cp == '[') {
158: ParsePosition pos = new ParsePosition(index - 1);
159: unicodeSet = new UnicodeSet(source, pos, symbolTable);
160: index = pos.getIndex();
161: return lastValue = UNICODESET;
162: }
163: // get syntax character
164: if (syntax.contains(cp))
165: return lastValue = cp;
166:
167: // get number, if there is one
168: if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
169: number = UCharacter.getNumericValue(cp);
170: while (index < source.length()) {
171: cp = nextChar();
172: if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
173: index -= UTF16.getCharCount(cp); // BACKUP!
174: break;
175: }
176: number *= 10;
177: number += UCharacter.getNumericValue(cp);
178: }
179: return lastValue = NUMBER;
180: }
181: buffer.setLength(0);
182: int status = IN_STRING;
183: main: while (true) {
184: switch (status) {
185: case AFTER_QUOTE: // check for double ''?
186: if (cp == QUOTE) {
187: UTF16.append(buffer, QUOTE);
188: status = IN_QUOTE;
189: break;
190: }
191: // OTHERWISE FALL THROUGH!!!
192: case IN_STRING:
193: if (cp == QUOTE)
194: status = IN_QUOTE;
195: else if (cp == BSLASH)
196: status = AFTER_BSLASH;
197: else if (non_string.contains(cp)) {
198: index -= UTF16.getCharCount(cp); // BACKUP!
199: break main;
200: } else
201: UTF16.append(buffer, cp);
202: break;
203: case IN_QUOTE:
204: if (cp == QUOTE)
205: status = AFTER_QUOTE;
206: else
207: UTF16.append(buffer, cp);
208: break;
209: case AFTER_BSLASH:
210: switch (cp) {
211: case 'n':
212: cp = '\n';
213: break;
214: case 'r':
215: cp = '\r';
216: break;
217: case 't':
218: cp = '\t';
219: break;
220: }
221: UTF16.append(buffer, cp);
222: status = IN_STRING;
223: break;
224: default:
225: throw new IllegalArgumentException("Internal Error");
226: }
227: if (index >= source.length())
228: break;
229: cp = nextChar();
230: }
231: if (status > IN_STRING)
232: return lastValue = UNTERMINATED_QUOTE;
233: return lastValue = STRING;
234: }
235:
236: public String getString() {
237: return buffer.toString();
238: }
239:
240: public String toString() {
241: return source.substring(0, index) + "$$$"
242: + source.substring(index);
243: }
244:
245: public long getNumber() {
246: return number;
247: }
248:
249: public UnicodeSet getUnicodeSet() {
250: return unicodeSet;
251: }
252:
253: private int nextChar() {
254: int cp = UTF16.charAt(source, index);
255: index += UTF16.getCharCount(cp);
256: return cp;
257: }
258:
259: public int getIndex() {
260: return index;
261: }
262:
263: public String getSource() {
264: return source;
265: }
266:
267: public UnicodeSet getSyntax() {
268: return syntax;
269: }
270:
271: public UnicodeSet getWhiteSpace() {
272: return whiteSpace;
273: }
274:
275: public void setSyntax(UnicodeSet set) {
276: syntax = set;
277: fixSets();
278: }
279:
280: public void setWhiteSpace(UnicodeSet set) {
281: whiteSpace = set;
282: fixSets();
283: }
284:
285: public Set getLookedUpItems() {
286: return symbolTable.itemsLookedUp;
287: }
288:
289: public void addSymbol(String var, String value, int start, int limit) {
290: // the limit is after the ';', so remove it
291: --limit;
292: char[] body = new char[limit - start];
293: value.getChars(start, limit, body, 0);
294: symbolTable.add(var, body);
295: }
296:
297: public class TokenSymbolTable implements SymbolTable {
298: Map contents = new HashMap();
299: Set itemsLookedUp = new HashSet();
300:
301: public void add(String var, char[] body) {
302: // start from 1 to avoid the $
303: contents.put(var.substring(1), body);
304: }
305:
306: /* (non-Javadoc)
307: * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
308: */
309: public char[] lookup(String s) {
310: itemsLookedUp.add('$' + s);
311: return (char[]) contents.get(s);
312: }
313:
314: /* (non-Javadoc)
315: * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
316: */
317: public UnicodeMatcher lookupMatcher(int ch) {
318: // TODO Auto-generated method stub
319: return null;
320: }
321:
322: /* (non-Javadoc)
323: * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
324: */
325: public String parseReference(String text, ParsePosition pos,
326: int limit) {
327: int cp;
328: int start = pos.getIndex();
329: int i;
330: for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
331: cp = UTF16.charAt(text, i);
332: if (!com.ibm.icu.lang.UCharacter
333: .isUnicodeIdentifierPart(cp)) {
334: break;
335: }
336: }
337: pos.setIndex(i);
338: return text.substring(start, i);
339: }
340:
341: }
342: }
343:
344: //#endif
|