001: /*
002: **********************************************************************
003: * Copyright (c) 2001-2004, International Business Machines
004: * Corporation and others. All Rights Reserved.
005: **********************************************************************
006: * Date Name Description
007: * 11/19/2001 aliu Creation.
008: **********************************************************************
009: */
010: package com.ibm.icu.text;
011:
012: import com.ibm.icu.lang.*;
013:
014: /**
015: * A transliterator that converts Unicode escape forms to the
016: * characters they represent. Escape forms have a prefix, a suffix, a
017: * radix, and minimum and maximum digit counts.
018: *
019: * <p>This class is package private. It registers several standard
020: * variants with the system which are then accessed via their IDs.
021: *
022: * @author Alan Liu
023: */
024: class UnescapeTransliterator extends Transliterator {
025:
026: /**
027: * The encoded pattern specification. The pattern consists of
028: * zero or more forms. Each form consists of a prefix, suffix,
029: * radix, minimum digit count, and maximum digit count. These
030: * values are stored as a five character header. That is, their
031: * numeric values are cast to 16-bit characters and stored in the
032: * string. Following these five characters, the prefix
033: * characters, then suffix characters are stored. Each form thus
034: * takes n+5 characters, where n is the total length of the prefix
035: * and suffix. The end is marked by a header of length one
036: * consisting of the character END.
037: */
038: private char spec[];
039:
040: /**
041: * Special character marking the end of the spec[] array.
042: */
043: private static final char END = 0xFFFF;
044:
045: /**
046: * Registers standard variants with the system. Called by
047: * Transliterator during initialization.
048: */
049: static void register() {
050: // Unicode: "U+10FFFF" hex, min=4, max=6
051: Transliterator.registerFactory("Hex-Any/Unicode",
052: new Transliterator.Factory() {
053: public Transliterator getInstance(String ID) {
054: return new UnescapeTransliterator(
055: "Hex-Any/Unicode", new char[] { 2, 0,
056: 16, 4, 6, 'U', '+', END });
057: }
058: });
059:
060: // Java: "\\uFFFF" hex, min=4, max=4
061: Transliterator.registerFactory("Hex-Any/Java",
062: new Transliterator.Factory() {
063: public Transliterator getInstance(String ID) {
064: return new UnescapeTransliterator(
065: "Hex-Any/Java", new char[] { 2, 0, 16,
066: 4, 4, '\\', 'u', END });
067: }
068: });
069:
070: // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
071: Transliterator.registerFactory("Hex-Any/C",
072: new Transliterator.Factory() {
073: public Transliterator getInstance(String ID) {
074: return new UnescapeTransliterator(
075: "Hex-Any/C",
076: new char[] { 2, 0, 16, 4, 4, '\\', 'u',
077: 2, 0, 16, 8, 8, '\\', 'U', END });
078: }
079: });
080:
081: // XML: "" hex, min=1, max=6
082: Transliterator.registerFactory("Hex-Any/XML",
083: new Transliterator.Factory() {
084: public Transliterator getInstance(String ID) {
085: return new UnescapeTransliterator(
086: "Hex-Any/XML", new char[] { 3, 1, 16,
087: 1, 6, '&', '#', 'x', ';', END });
088: }
089: });
090:
091: // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
092: Transliterator.registerFactory("Hex-Any/XML10",
093: new Transliterator.Factory() {
094: public Transliterator getInstance(String ID) {
095: return new UnescapeTransliterator(
096: "Hex-Any/XML10", new char[] { 2, 1, 10,
097: 1, 7, '&', '#', ';', END });
098: }
099: });
100:
101: // Perl: "\\x{263A}" hex, min=1, max=6
102: Transliterator.registerFactory("Hex-Any/Perl",
103: new Transliterator.Factory() {
104: public Transliterator getInstance(String ID) {
105: return new UnescapeTransliterator(
106: "Hex-Any/Perl",
107: new char[] { 3, 1, 16, 1, 6, '\\', 'x',
108: '{', '}', END });
109: }
110: });
111:
112: // All: Java, C, Perl, XML, XML10, Unicode
113: Transliterator.registerFactory("Hex-Any",
114: new Transliterator.Factory() {
115: public Transliterator getInstance(String ID) {
116: return new UnescapeTransliterator("Hex-Any",
117: new char[] { 2, 0, 16, 4, 6,
118: 'U',
119: '+', // Unicode
120: 2, 0, 16, 4, 4,
121: '\\',
122: 'u', // Java
123: 2, 0, 16, 8, 8,
124: '\\',
125: 'U', // C (surrogates)
126: 3, 1, 16, 1, 6, '&', '#',
127: 'x',
128: ';', // XML
129: 2, 1, 10, 1, 7, '&', '#',
130: ';', // XML10
131: 3, 1, 16, 1, 6, '\\', 'x', '{',
132: '}', // Perl
133: END });
134: }
135: });
136: }
137:
138: /**
139: * Package private constructor. Takes the encoded spec array.
140: */
141: UnescapeTransliterator(String ID, char spec[]) {
142: super (ID, null);
143: this .spec = spec;
144: }
145:
146: /**
147: * Implements {@link Transliterator#handleTransliterate}.
148: */
149: protected void handleTransliterate(Replaceable text, Position pos,
150: boolean isIncremental) {
151: int start = pos.start;
152: int limit = pos.limit;
153: int i, j, ipat;
154:
155: loop: while (start < limit) {
156: // Loop over the forms in spec[]. Exit this loop when we
157: // match one of the specs. Exit the outer loop if a
158: // partial match is detected and isIncremental is true.
159: for (j = 0, ipat = 0; spec[ipat] != END; ++j) {
160:
161: // Read the header
162: int prefixLen = spec[ipat++];
163: int suffixLen = spec[ipat++];
164: int radix = spec[ipat++];
165: int minDigits = spec[ipat++];
166: int maxDigits = spec[ipat++];
167:
168: // s is a copy of start that is advanced over the
169: // characters as we parse them.
170: int s = start;
171: boolean match = true;
172:
173: for (i = 0; i < prefixLen; ++i) {
174: if (s >= limit) {
175: if (i > 0) {
176: // We've already matched a character. This is
177: // a partial match, so we return if in
178: // incremental mode. In non-incremental mode,
179: // go to the next spec.
180: if (isIncremental) {
181: break loop;
182: }
183: match = false;
184: break;
185: }
186: }
187: char c = text.charAt(s++);
188: if (c != spec[ipat + i]) {
189: match = false;
190: break;
191: }
192: }
193:
194: if (match) {
195: int u = 0;
196: int digitCount = 0;
197: for (;;) {
198: if (s >= limit) {
199: // Check for partial match in incremental mode.
200: if (s > start && isIncremental) {
201: break loop;
202: }
203: break;
204: }
205: int ch = text.char32At(s);
206: int digit = UCharacter.digit(ch, radix);
207: if (digit < 0) {
208: break;
209: }
210: s += UTF16.getCharCount(ch);
211: u = (u * radix) + digit;
212: if (++digitCount == maxDigits) {
213: break;
214: }
215: }
216:
217: match = (digitCount >= minDigits);
218:
219: if (match) {
220: for (i = 0; i < suffixLen; ++i) {
221: if (s >= limit) {
222: // Check for partial match in incremental mode.
223: if (s > start && isIncremental) {
224: break loop;
225: }
226: match = false;
227: break;
228: }
229: char c = text.charAt(s++);
230: if (c != spec[ipat + prefixLen + i]) {
231: match = false;
232: break;
233: }
234: }
235:
236: if (match) {
237: // At this point, we have a match
238: String str = UTF16.valueOf(u);
239: text.replace(start, s, str);
240: limit -= s - start - str.length();
241: // The following break statement leaves the
242: // loop that is traversing the forms in
243: // spec[]. We then parse the next input
244: // character.
245: break;
246: }
247: }
248: }
249:
250: ipat += prefixLen + suffixLen;
251: }
252:
253: if (start < limit) {
254: start += UTF16.getCharCount(text.char32At(start));
255: }
256: }
257:
258: pos.contextLimit += limit - pos.limit;
259: pos.limit = limit;
260: pos.start = start;
261: }
262: }
|