001: //##header
002: //#ifndef FOUNDATION
003: /*
004: *******************************************************************************
005: * Copyright (C) 2006, Google, International Business Machines Corporation and *
006: * others. All Rights Reserved. *
007: *******************************************************************************
008: */
009: package com.ibm.icu.impl;
010:
011: import com.ibm.icu.text.UTF16;
012: import com.ibm.icu.text.UnicodeSet;
013: import com.ibm.icu.text.DateTimePatternGenerator.FormatParser;
014: import com.ibm.icu.text.DateTimePatternGenerator.VariableField;
015:
016: import java.util.BitSet;
017: import java.util.Iterator;
018: import java.util.List;
019:
020: /**
021: * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
022: * The '' (two quotes) is treated as a single quote, inside or outside a quote
023: * <ul>
024: * <li>Any ignorable characters are ignored in parsing.</li>
025: * <li>Any syntax characters are broken into separate tokens</li>
026: * <li>Quote characters can be specified: '...', "...", and \x </li>
027: * <li>Other characters are treated as literals</li>
028: * </ul>
029: */
030: public class PatternTokenizer {
031: // settings used in the interpretation of the pattern
032: private UnicodeSet ignorableCharacters = new UnicodeSet();
033: private UnicodeSet syntaxCharacters = new UnicodeSet();
034: private UnicodeSet escapeCharacters = new UnicodeSet();
035: private boolean usingSlash = false;
036: private boolean usingQuote = false;
037:
038: // transient data, set when needed. Null it out for any changes in the above fields.
039: private transient UnicodeSet needingQuoteCharacters = null;
040:
041: // data about the current pattern being parsed. start gets moved as we go along.
042: private int start;
043: private int limit;
044: private CharSequence pattern;
045:
046: public UnicodeSet getIgnorableCharacters() {
047: return (UnicodeSet) ignorableCharacters.clone();
048: }
049:
050: /**
051: * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
052: * @param ignorableCharacters
053: * @return
054: */
055: public PatternTokenizer setIgnorableCharacters(
056: UnicodeSet ignorableCharacters) {
057: this .ignorableCharacters = (UnicodeSet) ignorableCharacters
058: .clone();
059: needingQuoteCharacters = null;
060: return this ;
061: }
062:
063: public UnicodeSet getSyntaxCharacters() {
064: return (UnicodeSet) syntaxCharacters.clone();
065: }
066:
067: /**
068: * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
069: * @param syntaxCharacters
070: * @return
071: */
072: public PatternTokenizer setSyntaxCharacters(
073: UnicodeSet syntaxCharacters) {
074: this .syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
075: needingQuoteCharacters = null;
076: return this ;
077: }
078:
079: public UnicodeSet getEscapeCharacters() {
080: return (UnicodeSet) escapeCharacters.clone();
081: }
082:
083: /**
084: * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
085: * @param escapeCharacters
086: * @return
087: */
088: public PatternTokenizer setEscapeCharacters(
089: UnicodeSet escapeCharacters) {
090: this .escapeCharacters = (UnicodeSet) escapeCharacters.clone();
091: return this ;
092: }
093:
094: public boolean isUsingQuote() {
095: return usingQuote;
096: }
097:
098: public PatternTokenizer setUsingQuote(boolean usingQuote) {
099: this .usingQuote = usingQuote;
100: needingQuoteCharacters = null;
101: return this ;
102: }
103:
104: public boolean isUsingSlash() {
105: return usingSlash;
106: }
107:
108: public PatternTokenizer setUsingSlash(boolean usingSlash) {
109: this .usingSlash = usingSlash;
110: needingQuoteCharacters = null;
111: return this ;
112: }
113:
114: // public UnicodeSet getQuoteCharacters() {
115: // return (UnicodeSet) quoteCharacters.clone();
116: // }
117: // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
118: // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
119: // needingQuoteCharacters = null;
120: // return this;
121: // }
122: public int getLimit() {
123: return limit;
124: }
125:
126: public PatternTokenizer setLimit(int limit) {
127: this .limit = limit;
128: return this ;
129: }
130:
131: public int getStart() {
132: return start;
133: }
134:
135: public PatternTokenizer setStart(int start) {
136: this .start = start;
137: return this ;
138: }
139:
140: public PatternTokenizer setPattern(CharSequence pattern) {
141: if (pattern == null) {
142: throw new IllegalArgumentException("Inconsistent arguments");
143: }
144: this .start = 0;
145: this .limit = pattern.length();
146: this .pattern = pattern;
147: return this ;
148: }
149:
150: public static final char SINGLE_QUOTE = '\'';
151: public static final char BACK_SLASH = '\\';
152: private static int NO_QUOTE = -1, IN_QUOTE = -2;
153:
154: /**
155: * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
156: * @param string
157: * @return
158: */
159: public String quoteLiteral(CharSequence string) {
160: if (needingQuoteCharacters == null) {
161: needingQuoteCharacters = new UnicodeSet().addAll(
162: syntaxCharacters).addAll(ignorableCharacters); // .addAll(quoteCharacters)
163: if (usingSlash)
164: needingQuoteCharacters.add(BACK_SLASH);
165: if (usingQuote)
166: needingQuoteCharacters.add(SINGLE_QUOTE);
167: }
168: StringBuffer result = new StringBuffer();
169: int quotedChar = NO_QUOTE;
170: int cp;
171: for (int i = 0; i < string.length(); i += UTF16
172: .getCharCount(cp)) {
173: cp = UTF16.charAt(string, i);
174: if (escapeCharacters.contains(cp)) {
175: // we may have to fix up previous characters
176: if (quotedChar == IN_QUOTE) {
177: result.append(SINGLE_QUOTE);
178: quotedChar = NO_QUOTE;
179: }
180: appendEscaped(result, cp);
181: continue;
182: }
183:
184: if (needingQuoteCharacters.contains(cp)) {
185: // if we have already started a quote
186: if (quotedChar == IN_QUOTE) {
187: UTF16.append(result, cp);
188: if (usingQuote && cp == SINGLE_QUOTE) { // double it
189: result.append(SINGLE_QUOTE);
190: }
191: continue;
192: }
193: // otherwise not already in quote
194: if (usingSlash) {
195: result.append(BACK_SLASH);
196: UTF16.append(result, cp);
197: continue;
198: }
199: if (usingQuote) {
200: if (cp == SINGLE_QUOTE) { // double it and continue
201: result.append(SINGLE_QUOTE);
202: result.append(SINGLE_QUOTE);
203: continue;
204: }
205: result.append(SINGLE_QUOTE);
206: UTF16.append(result, cp);
207: quotedChar = IN_QUOTE;
208: continue;
209: }
210: // we have no choice but to use \\u or \\U
211:appendEscaped(result, cp);
212: continue;
213: }
214: // otherwise cp doesn't need quoting
215: // we may have to fix up previous characters
216: if (quotedChar == IN_QUOTE) {
217: result.append(SINGLE_QUOTE);
218: quotedChar = NO_QUOTE;
219: }
220: UTF16.append(result, cp);
221: }
222: // all done.
223: // we may have to fix up previous characters
224: if (quotedChar == IN_QUOTE) {
225: result.append(SINGLE_QUOTE);
226: }
227: return result.toString();
228: }
229:
230: private void appendEscaped(StringBuffer result, int cp) {
231: if (cp <= 0xFFFF) {
232: result.append("\\u").append(Utility.hex(cp, 4));
233: } else {
234: result.append("\\U").append(Utility.hex(cp, 8));
235: }
236: }
237:
238: public String normalize() {
239: int oldStart = start;
240: StringBuffer result = new StringBuffer();
241: StringBuffer buffer = new StringBuffer();
242: while (true) {
243: buffer.setLength(0);
244: int status = next(buffer);
245: if (status == DONE) {
246: start = oldStart;
247: return result.toString();
248: }
249: if (status != SYNTAX) {
250: result.append(quoteLiteral(buffer));
251: } else {
252: result.append(buffer);
253: }
254: }
255: }
256:
257: public static final int DONE = 0, SYNTAX = 1, LITERAL = 2,
258: BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
259:
260: private static final int AFTER_QUOTE = -1, NONE = 0,
261: START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3,
262: HEX = 4;
263:
264: public int next(StringBuffer buffer) {
265: if (start >= limit)
266: return DONE;
267: int status = UNKNOWN;
268: int lastQuote = UNKNOWN;
269: int quoteStatus = NONE;
270: int hexCount = 0;
271: int hexValue = 0;
272: int cp;
273: main: for (int i = start; i < limit; i += UTF16
274: .getCharCount(cp)) {
275: cp = UTF16.charAt(pattern, i);
276: // if we are in a quote, then handle it.
277: switch (quoteStatus) {
278: case SLASH_START:
279: switch (cp) {
280: case 'u':
281: quoteStatus = HEX;
282: hexCount = 4;
283: hexValue = 0;
284: continue main;
285: case 'U':
286: quoteStatus = HEX;
287: hexCount = 8;
288: hexValue = 0;
289: continue main;
290: default:
291: if (usingSlash) {
292: UTF16.append(buffer, cp);
293: quoteStatus = NONE;
294: continue main;
295: } else {
296: buffer.append(BACK_SLASH);
297: quoteStatus = NONE;
298: }
299: }
300: break; // fall through to NONE
301: case HEX:
302: hexValue <<= 4;
303: hexValue += cp;
304: switch (cp) {
305: case '0':
306: case '1':
307: case '2':
308: case '3':
309: case '4':
310: case '5':
311: case '6':
312: case '7':
313: case '8':
314: case '9':
315: hexValue -= '0';
316: break;
317: case 'a':
318: case 'b':
319: case 'c':
320: case 'd':
321: case 'e':
322: case 'f':
323: hexValue -= 'a' - 10;
324: break;
325: case 'A':
326: case 'B':
327: case 'C':
328: case 'D':
329: case 'E':
330: case 'F':
331: hexValue -= 'A' - 10;
332: break;
333: default:
334: start = i;
335: return BROKEN_ESCAPE;
336: }
337: --hexCount;
338: if (hexCount == 0) {
339: quoteStatus = NONE;
340: UTF16.append(buffer, hexValue);
341: }
342: continue main;
343: case AFTER_QUOTE:
344: // see if we get another quote character
345: // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
346: if (cp == lastQuote) {
347: UTF16.append(buffer, cp);
348: quoteStatus = NORMAL_QUOTE;
349: continue main;
350: }
351: quoteStatus = NONE;
352: break; // fall through to NONE
353: case START_QUOTE:
354: // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
355: if (cp == lastQuote) {
356: UTF16.append(buffer, cp);
357: quoteStatus = NONE; // get out of quote, with no trace remaining
358: continue;
359: }
360: // otherwise get into quote
361: UTF16.append(buffer, cp);
362: quoteStatus = NORMAL_QUOTE;
363: continue main;
364: case NORMAL_QUOTE:
365: if (cp == lastQuote) {
366: quoteStatus = AFTER_QUOTE; // get out of quote
367: continue main;
368: }
369: UTF16.append(buffer, cp);
370: continue main;
371: }
372:
373: if (ignorableCharacters.contains(cp)) {
374: continue;
375: }
376: // do syntax characters
377: if (syntaxCharacters.contains(cp)) {
378: if (status == UNKNOWN) {
379: UTF16.append(buffer, cp);
380: start = i + UTF16.getCharCount(cp);
381: return SYNTAX;
382: } else { // LITERAL, so back up and break
383: start = i;
384: return status;
385: }
386: }
387: // otherwise it is a literal; keep on going
388: status = LITERAL;
389: if (cp == BACK_SLASH) {
390: quoteStatus = SLASH_START;
391: continue;
392: } else if (usingQuote && cp == SINGLE_QUOTE) {
393: lastQuote = cp;
394: quoteStatus = START_QUOTE;
395: continue;
396: }
397: // normal literals
398: UTF16.append(buffer, cp);
399: }
400: // handle final cleanup
401: start = limit;
402: switch (quoteStatus) {
403: case HEX:
404: status = BROKEN_ESCAPE;
405: break;
406: case SLASH_START:
407: if (usingSlash) {
408: status = BROKEN_ESCAPE;
409: } else {
410: buffer.append(BACK_SLASH);
411: }
412: break;
413: case START_QUOTE:
414: case NORMAL_QUOTE:
415: status = BROKEN_QUOTE;
416: break;
417: }
418: return status;
419: }
420:
421: }
422: //#endif
423: //eof
|