Source Code Cross Referenced for PatternTokenizer.java in » Internationalization-Localization » icu4j » com » ibm » icu » impl » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.impl

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        //##header
002:        //#ifndef FOUNDATION
003:        /*
004:         *******************************************************************************
005:         * Copyright (C) 2006, Google, International Business Machines Corporation and    *
006:         * others. All Rights Reserved.                                                *
007:         *******************************************************************************
008:         */
009:        package com.ibm.icu.impl;
010:
011:        import com.ibm.icu.text.UTF16;
012:        import com.ibm.icu.text.UnicodeSet;
013:        import com.ibm.icu.text.DateTimePatternGenerator.FormatParser;
014:        import com.ibm.icu.text.DateTimePatternGenerator.VariableField;
015:
016:        import java.util.BitSet;
017:        import java.util.Iterator;
018:        import java.util.List;
019:
020:        /**
021:         * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
022: * The '' (two quotes) is treated as a single quote, inside or outside a quote
023: * <ul>
024: * <li>Any ignorable characters are ignored in parsing.</li>
025: * <li>Any syntax characters are broken into separate tokens</li>
026: * <li>Quote characters can be specified: '...', "...", and \x </li>
027: * <li>Other characters are treated as literals</li>
028: * </ul>
029: */
030:        public class PatternTokenizer {
031:            // settings used in the interpretation of the pattern
032:            private UnicodeSet ignorableCharacters = new UnicodeSet();
033:            private UnicodeSet syntaxCharacters = new UnicodeSet();
034:            private UnicodeSet escapeCharacters = new UnicodeSet();
035:            private boolean usingSlash = false;
036:            private boolean usingQuote = false;
037:
038:            // transient data, set when needed. Null it out for any changes in the above fields.
039:            private transient UnicodeSet needingQuoteCharacters = null;
040:
041:            // data about the current pattern being parsed. start gets moved as we go along.
042:            private int start;
043:            private int limit;
044:            private CharSequence pattern;
045:
046:            public UnicodeSet getIgnorableCharacters() {
047:                return (UnicodeSet) ignorableCharacters.clone();
048:            }
049:
050:            /**
051:             * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
052:             * @param ignorableCharacters
053:             * @return
054:             */
055:            public PatternTokenizer setIgnorableCharacters(
056:                    UnicodeSet ignorableCharacters) {
057:                this .ignorableCharacters = (UnicodeSet) ignorableCharacters
058:                        .clone();
059:                needingQuoteCharacters = null;
060:                return this ;
061:            }
062:
063:            public UnicodeSet getSyntaxCharacters() {
064:                return (UnicodeSet) syntaxCharacters.clone();
065:            }
066:
067:            /**
068:             *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
069:             * @param syntaxCharacters
070:             * @return
071:             */
072:            public PatternTokenizer setSyntaxCharacters(
073:                    UnicodeSet syntaxCharacters) {
074:                this .syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
075:                needingQuoteCharacters = null;
076:                return this ;
077:            }
078:
079:            public UnicodeSet getEscapeCharacters() {
080:                return (UnicodeSet) escapeCharacters.clone();
081:            }
082:
083:            /**
084:             * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
085:             * @param escapeCharacters
086:             * @return
087:             */
088:            public PatternTokenizer setEscapeCharacters(
089:                    UnicodeSet escapeCharacters) {
090:                this .escapeCharacters = (UnicodeSet) escapeCharacters.clone();
091:                return this ;
092:            }
093:
094:            public boolean isUsingQuote() {
095:                return usingQuote;
096:            }
097:
098:            public PatternTokenizer setUsingQuote(boolean usingQuote) {
099:                this .usingQuote = usingQuote;
100:                needingQuoteCharacters = null;
101:                return this ;
102:            }
103:
104:            public boolean isUsingSlash() {
105:                return usingSlash;
106:            }
107:
108:            public PatternTokenizer setUsingSlash(boolean usingSlash) {
109:                this .usingSlash = usingSlash;
110:                needingQuoteCharacters = null;
111:                return this ;
112:            }
113:
114:            //    public UnicodeSet getQuoteCharacters() {
115:            //  return (UnicodeSet) quoteCharacters.clone();
116:            //  }
117:            //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
118:            //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
119:            //  needingQuoteCharacters = null;
120:            //  return this;
121:            //  }
122:            public int getLimit() {
123:                return limit;
124:            }
125:
126:            public PatternTokenizer setLimit(int limit) {
127:                this .limit = limit;
128:                return this ;
129:            }
130:
131:            public int getStart() {
132:                return start;
133:            }
134:
135:            public PatternTokenizer setStart(int start) {
136:                this .start = start;
137:                return this ;
138:            }
139:
140:            public PatternTokenizer setPattern(CharSequence pattern) {
141:                if (pattern == null) {
142:                    throw new IllegalArgumentException("Inconsistent arguments");
143:                }
144:                this .start = 0;
145:                this .limit = pattern.length();
146:                this .pattern = pattern;
147:                return this ;
148:            }
149:
150:            public static final char SINGLE_QUOTE = '\'';
151:            public static final char BACK_SLASH = '\\';
152:            private static int NO_QUOTE = -1, IN_QUOTE = -2;
153:
154:            /**
155:             * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
156:             * @param string
157:             * @return
158:             */
159:            public String quoteLiteral(CharSequence string) {
160:                if (needingQuoteCharacters == null) {
161:                    needingQuoteCharacters = new UnicodeSet().addAll(
162:                            syntaxCharacters).addAll(ignorableCharacters); // .addAll(quoteCharacters)
163:                    if (usingSlash)
164:                        needingQuoteCharacters.add(BACK_SLASH);
165:                    if (usingQuote)
166:                        needingQuoteCharacters.add(SINGLE_QUOTE);
167:                }
168:                StringBuffer result = new StringBuffer();
169:                int quotedChar = NO_QUOTE;
170:                int cp;
171:                for (int i = 0; i < string.length(); i += UTF16
172:                        .getCharCount(cp)) {
173:                    cp = UTF16.charAt(string, i);
174:                    if (escapeCharacters.contains(cp)) {
175:                        // we may have to fix up previous characters
176:                        if (quotedChar == IN_QUOTE) {
177:                            result.append(SINGLE_QUOTE);
178:                            quotedChar = NO_QUOTE;
179:                        }
180:                        appendEscaped(result, cp);
181:                        continue;
182:                    }
183:
184:                    if (needingQuoteCharacters.contains(cp)) {
185:                        // if we have already started a quote
186:                        if (quotedChar == IN_QUOTE) {
187:                            UTF16.append(result, cp);
188:                            if (usingQuote && cp == SINGLE_QUOTE) { // double it
189:                                result.append(SINGLE_QUOTE);
190:                            }
191:                            continue;
192:                        }
193:                        // otherwise not already in quote
194:                        if (usingSlash) {
195:                            result.append(BACK_SLASH);
196:                            UTF16.append(result, cp);
197:                            continue;
198:                        }
199:                        if (usingQuote) {
200:                            if (cp == SINGLE_QUOTE) { // double it and continue
201:                                result.append(SINGLE_QUOTE);
202:                                result.append(SINGLE_QUOTE);
203:                                continue;
204:                            }
205:                            result.append(SINGLE_QUOTE);
206:                            UTF16.append(result, cp);
207:                            quotedChar = IN_QUOTE;
208:                            continue;
209:                        }
210:                        // we have no choice but to use \\u or \\U
211:appendEscaped(result, cp);
212:                        continue;
213:                    }
214:                    // otherwise cp doesn't need quoting
215:                    // we may have to fix up previous characters
216:                    if (quotedChar == IN_QUOTE) {
217:                        result.append(SINGLE_QUOTE);
218:                        quotedChar = NO_QUOTE;
219:                    }
220:                    UTF16.append(result, cp);
221:                }
222:                // all done. 
223:                // we may have to fix up previous characters
224:                if (quotedChar == IN_QUOTE) {
225:                    result.append(SINGLE_QUOTE);
226:                }
227:                return result.toString();
228:            }
229:
230:            private void appendEscaped(StringBuffer result, int cp) {
231:                if (cp <= 0xFFFF) {
232:                    result.append("\\u").append(Utility.hex(cp, 4));
233:                } else {
234:                    result.append("\\U").append(Utility.hex(cp, 8));
235:                }
236:            }
237:
238:            public String normalize() {
239:                int oldStart = start;
240:                StringBuffer result = new StringBuffer();
241:                StringBuffer buffer = new StringBuffer();
242:                while (true) {
243:                    buffer.setLength(0);
244:                    int status = next(buffer);
245:                    if (status == DONE) {
246:                        start = oldStart;
247:                        return result.toString();
248:                    }
249:                    if (status != SYNTAX) {
250:                        result.append(quoteLiteral(buffer));
251:                    } else {
252:                        result.append(buffer);
253:                    }
254:                }
255:            }
256:
257:            public static final int DONE = 0, SYNTAX = 1, LITERAL = 2,
258:                    BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
259:
260:            private static final int AFTER_QUOTE = -1, NONE = 0,
261:                    START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3,
262:                    HEX = 4;
263:
264:            public int next(StringBuffer buffer) {
265:                if (start >= limit)
266:                    return DONE;
267:                int status = UNKNOWN;
268:                int lastQuote = UNKNOWN;
269:                int quoteStatus = NONE;
270:                int hexCount = 0;
271:                int hexValue = 0;
272:                int cp;
273:                main: for (int i = start; i < limit; i += UTF16
274:                        .getCharCount(cp)) {
275:                    cp = UTF16.charAt(pattern, i);
276:                    // if we are in a quote, then handle it.
277:                    switch (quoteStatus) {
278:                    case SLASH_START:
279:                        switch (cp) {
280:                        case 'u':
281:                            quoteStatus = HEX;
282:                            hexCount = 4;
283:                            hexValue = 0;
284:                            continue main;
285:                        case 'U':
286:                            quoteStatus = HEX;
287:                            hexCount = 8;
288:                            hexValue = 0;
289:                            continue main;
290:                        default:
291:                            if (usingSlash) {
292:                                UTF16.append(buffer, cp);
293:                                quoteStatus = NONE;
294:                                continue main;
295:                            } else {
296:                                buffer.append(BACK_SLASH);
297:                                quoteStatus = NONE;
298:                            }
299:                        }
300:                        break; // fall through to NONE
301:                    case HEX:
302:                        hexValue <<= 4;
303:                        hexValue += cp;
304:                        switch (cp) {
305:                        case '0':
306:                        case '1':
307:                        case '2':
308:                        case '3':
309:                        case '4':
310:                        case '5':
311:                        case '6':
312:                        case '7':
313:                        case '8':
314:                        case '9':
315:                            hexValue -= '0';
316:                            break;
317:                        case 'a':
318:                        case 'b':
319:                        case 'c':
320:                        case 'd':
321:                        case 'e':
322:                        case 'f':
323:                            hexValue -= 'a' - 10;
324:                            break;
325:                        case 'A':
326:                        case 'B':
327:                        case 'C':
328:                        case 'D':
329:                        case 'E':
330:                        case 'F':
331:                            hexValue -= 'A' - 10;
332:                            break;
333:                        default:
334:                            start = i;
335:                            return BROKEN_ESCAPE;
336:                        }
337:                        --hexCount;
338:                        if (hexCount == 0) {
339:                            quoteStatus = NONE;
340:                            UTF16.append(buffer, hexValue);
341:                        }
342:                        continue main;
343:                    case AFTER_QUOTE:
344:                        // see if we get another quote character
345:                        // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
346:                        if (cp == lastQuote) {
347:                            UTF16.append(buffer, cp);
348:                            quoteStatus = NORMAL_QUOTE;
349:                            continue main;
350:                        }
351:                        quoteStatus = NONE;
352:                        break; // fall through to NONE
353:                    case START_QUOTE:
354:                        // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
355:                        if (cp == lastQuote) {
356:                            UTF16.append(buffer, cp);
357:                            quoteStatus = NONE; // get out of quote, with no trace remaining
358:                            continue;
359:                        }
360:                        // otherwise get into quote
361:                        UTF16.append(buffer, cp);
362:                        quoteStatus = NORMAL_QUOTE;
363:                        continue main;
364:                    case NORMAL_QUOTE:
365:                        if (cp == lastQuote) {
366:                            quoteStatus = AFTER_QUOTE; // get out of quote
367:                            continue main;
368:                        }
369:                        UTF16.append(buffer, cp);
370:                        continue main;
371:                    }
372:
373:                    if (ignorableCharacters.contains(cp)) {
374:                        continue;
375:                    }
376:                    // do syntax characters
377:                    if (syntaxCharacters.contains(cp)) {
378:                        if (status == UNKNOWN) {
379:                            UTF16.append(buffer, cp);
380:                            start = i + UTF16.getCharCount(cp);
381:                            return SYNTAX;
382:                        } else { // LITERAL, so back up and break
383:                            start = i;
384:                            return status;
385:                        }
386:                    }
387:                    // otherwise it is a literal; keep on going
388:                    status = LITERAL;
389:                    if (cp == BACK_SLASH) {
390:                        quoteStatus = SLASH_START;
391:                        continue;
392:                    } else if (usingQuote && cp == SINGLE_QUOTE) {
393:                        lastQuote = cp;
394:                        quoteStatus = START_QUOTE;
395:                        continue;
396:                    }
397:                    // normal literals
398:                    UTF16.append(buffer, cp);
399:                }
400:                // handle final cleanup
401:                start = limit;
402:                switch (quoteStatus) {
403:                case HEX:
404:                    status = BROKEN_ESCAPE;
405:                    break;
406:                case SLASH_START:
407:                    if (usingSlash) {
408:                        status = BROKEN_ESCAPE;
409:                    } else {
410:                        buffer.append(BACK_SLASH);
411:                    }
412:                    break;
413:                case START_QUOTE:
414:                case NORMAL_QUOTE:
415:                    status = BROKEN_QUOTE;
416:                    break;
417:                }
418:                return status;
419:            }
420:
421:        }
422:        //#endif
423:        //eof

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.