001: /*
002: * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025: /*
026: *******************************************************************************
027: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
028: * *
029: * The original version of this source code and documentation is copyrighted *
030: * and owned by IBM, These materials are provided under terms of a License *
031: * Agreement between IBM and Sun. This technology is protected by multiple *
032: * US and International patents. This notice and attribution to IBM may not *
033: * to removed. *
034: *******************************************************************************
035: */
036:
037: package sun.text.normalizer;
038:
// This class contains only utility functions, so testing is not needed.
040: ///CLOVER:OFF
041: public final class Utility {
042:
043: /**
044: * Convert characters outside the range U+0020 to U+007F to
045: * Unicode escapes, and convert backslash to a double backslash.
046: */
047: public static final String escape(String s) {
048: StringBuffer buf = new StringBuffer();
049: for (int i = 0; i < s.length();) {
050: int c = UTF16.charAt(s, i);
051: i += UTF16.getCharCount(c);
052: if (c >= ' ' && c <= 0x007F) {
053: if (c == '\\') {
054: buf.append("\\\\"); // That is, "\\"
055: } else {
056: buf.append((char) c);
057: }
058: } else {
059: boolean four = c <= 0xFFFF;
060: buf.append(four ? "\\u" : "\\U");
061: hex(c, four ? 4 : 8, buf);
062: }
063: }
064: return buf.toString();
065: }
066:
067: /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
068: static private final char[] UNESCAPE_MAP = {
069: /*" 0x22, 0x22 */
070: /*' 0x27, 0x27 */
071: /*? 0x3F, 0x3F */
072: /*\ 0x5C, 0x5C */
073: /*a*/0x61, 0x07,
074: /*b*/0x62, 0x08,
075: /*e*/0x65, 0x1b,
076: /*f*/0x66, 0x0c,
077: /*n*/0x6E, 0x0a,
078: /*r*/0x72, 0x0d,
079: /*t*/0x74, 0x09,
080: /*v*/0x76, 0x0b };
081:
082: /**
083: * Convert an escape to a 32-bit code point value. We attempt
084: * to parallel the icu4c unescapeAt() function.
085: * @param offset16 an array containing offset to the character
086: * <em>after</em> the backslash. Upon return offset16[0] will
087: * be updated to point after the escape sequence.
088: * @return character value from 0 to 10FFFF, or -1 on error.
089: */
090: public static int unescapeAt(String s, int[] offset16) {
091: int c;
092: int result = 0;
093: int n = 0;
094: int minDig = 0;
095: int maxDig = 0;
096: int bitsPerDigit = 4;
097: int dig;
098: int i;
099: boolean braces = false;
100:
101: /* Check that offset is in range */
102: int offset = offset16[0];
103: int length = s.length();
104: if (offset < 0 || offset >= length) {
105: return -1;
106: }
107:
108: /* Fetch first UChar after '\\' */
109: c = UTF16.charAt(s, offset);
110: offset += UTF16.getCharCount(c);
111:
112: /* Convert hexadecimal and octal escapes */
113: switch (c) {
114: case 'u':
115: minDig = maxDig = 4;
116: break;
117: case 'U':
118: minDig = maxDig = 8;
119: break;
120: case 'x':
121: minDig = 1;
122: if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
123: ++offset;
124: braces = true;
125: maxDig = 8;
126: } else {
127: maxDig = 2;
128: }
129: break;
130: default:
131: dig = UCharacter.digit(c, 8);
132: if (dig >= 0) {
133: minDig = 1;
134: maxDig = 3;
135: n = 1; /* Already have first octal digit */
136: bitsPerDigit = 3;
137: result = dig;
138: }
139: break;
140: }
141: if (minDig != 0) {
142: while (offset < length && n < maxDig) {
143: c = UTF16.charAt(s, offset);
144: dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
145: if (dig < 0) {
146: break;
147: }
148: result = (result << bitsPerDigit) | dig;
149: offset += UTF16.getCharCount(c);
150: ++n;
151: }
152: if (n < minDig) {
153: return -1;
154: }
155: if (braces) {
156: if (c != 0x7D /*}*/) {
157: return -1;
158: }
159: ++offset;
160: }
161: if (result < 0 || result >= 0x110000) {
162: return -1;
163: }
164: // If an escape sequence specifies a lead surrogate, see
165: // if there is a trail surrogate after it, either as an
166: // escape or as a literal. If so, join them up into a
167: // supplementary.
168: if (offset < length && UTF16.isLeadSurrogate((char) result)) {
169: int ahead = offset + 1;
170: c = s.charAt(offset); // [sic] get 16-bit code unit
171: if (c == '\\' && ahead < length) {
172: int o[] = new int[] { ahead };
173: c = unescapeAt(s, o);
174: ahead = o[0];
175: }
176: if (UTF16.isTrailSurrogate((char) c)) {
177: offset = ahead;
178: result = UCharacterProperty.getRawSupplementary(
179: (char) result, (char) c);
180: }
181: }
182: offset16[0] = offset;
183: return result;
184: }
185:
186: /* Convert C-style escapes in table */
187: for (i = 0; i < UNESCAPE_MAP.length; i += 2) {
188: if (c == UNESCAPE_MAP[i]) {
189: offset16[0] = offset;
190: return UNESCAPE_MAP[i + 1];
191: } else if (c < UNESCAPE_MAP[i]) {
192: break;
193: }
194: }
195:
196: /* Map \cX to control-X: X & 0x1F */
197: if (c == 'c' && offset < length) {
198: c = UTF16.charAt(s, offset);
199: offset16[0] = offset + UTF16.getCharCount(c);
200: return 0x1F & c;
201: }
202:
203: /* If no special forms are recognized, then consider
204: * the backslash to generically escape the next character. */
205: offset16[0] = offset;
206: return c;
207: }
208:
209: /**
210: * Convert a integer to size width hex uppercase digits.
211: * E.g., hex('a', 4, str) => "0041".
212: * Append the output to the given StringBuffer.
213: * If width is too small to fit, nothing will be appended to output.
214: */
215: public static StringBuffer hex(int ch, int width,
216: StringBuffer output) {
217: return appendNumber(output, ch, 16, width);
218: }
219:
220: /**
221: * Convert a integer to size width (minimum) hex uppercase digits.
222: * E.g., hex('a', 4, str) => "0041". If the integer requires more
223: * than width digits, more will be used.
224: */
225: public static String hex(int ch, int width) {
226: StringBuffer buf = new StringBuffer();
227: return appendNumber(buf, ch, 16, width).toString();
228: }
229:
230: /**
231: * Skip over a sequence of zero or more white space characters
232: * at pos. Return the index of the first non-white-space character
233: * at or after pos, or str.length(), if there is none.
234: */
235: public static int skipWhitespace(String str, int pos) {
236: while (pos < str.length()) {
237: int c = UTF16.charAt(str, pos);
238: if (!UCharacterProperty.isRuleWhiteSpace(c)) {
239: break;
240: }
241: pos += UTF16.getCharCount(c);
242: }
243: return pos;
244: }
245:
246: static final char DIGITS[] = { '0', '1', '2', '3', '4', '5', '6',
247: '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
248: 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
249: 'V', 'W', 'X', 'Y', 'Z' };
250:
251: /**
252: * Append the digits of a positive integer to the given
253: * <code>StringBuffer</code> in the given radix. This is
254: * done recursively since it is easiest to generate the low-
255: * order digit first, but it must be appended last.
256: *
257: * @param result is the <code>StringBuffer</code> to append to
258: * @param n is the positive integer
259: * @param radix is the radix, from 2 to 36 inclusive
260: * @param minDigits is the minimum number of digits to append.
261: */
262: private static void recursiveAppendNumber(StringBuffer result,
263: int n, int radix, int minDigits) {
264: int digit = n % radix;
265:
266: if (n >= radix || minDigits > 1) {
267: recursiveAppendNumber(result, n / radix, radix,
268: minDigits - 1);
269: }
270:
271: result.append(DIGITS[digit]);
272: }
273:
274: /**
275: * Append a number to the given StringBuffer in the given radix.
276: * Standard digits '0'-'9' are used and letters 'A'-'Z' for
277: * radices 11 through 36.
278: * @param result the digits of the number are appended here
279: * @param n the number to be converted to digits; may be negative.
280: * If negative, a '-' is prepended to the digits.
281: * @param radix a radix from 2 to 36 inclusive.
282: * @param minDigits the minimum number of digits, not including
283: * any '-', to produce. Values less than 2 have no effect. One
284: * digit is always emitted regardless of this parameter.
285: * @return a reference to result
286: */
287: public static StringBuffer appendNumber(StringBuffer result, int n,
288: int radix, int minDigits) throws IllegalArgumentException {
289: if (radix < 2 || radix > 36) {
290: throw new IllegalArgumentException("Illegal radix " + radix);
291: }
292:
293: int abs = n;
294:
295: if (n < 0) {
296: abs = -n;
297: result.append("-");
298: }
299:
300: recursiveAppendNumber(result, abs, radix, minDigits);
301:
302: return result;
303: }
304:
305: /**
306: * Return true if the character is NOT printable ASCII. The tab,
307: * newline and linefeed characters are considered unprintable.
308: */
309: public static boolean isUnprintable(int c) {
310: return !(c >= 0x20 && c <= 0x7E);
311: }
312:
313: /**
314: * Escape unprintable characters using <backslash>uxxxx notation
315: * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
316: * above. If the character is printable ASCII, then do nothing
317: * and return FALSE. Otherwise, append the escaped notation and
318: * return TRUE.
319: */
320: public static boolean escapeUnprintable(StringBuffer result, int c) {
321: if (isUnprintable(c)) {
322: result.append('\\');
323: if ((c & ~0xFFFF) != 0) {
324: result.append('U');
325: result.append(DIGITS[0xF & (c >> 28)]);
326: result.append(DIGITS[0xF & (c >> 24)]);
327: result.append(DIGITS[0xF & (c >> 20)]);
328: result.append(DIGITS[0xF & (c >> 16)]);
329: } else {
330: result.append('u');
331: }
332: result.append(DIGITS[0xF & (c >> 12)]);
333: result.append(DIGITS[0xF & (c >> 8)]);
334: result.append(DIGITS[0xF & (c >> 4)]);
335: result.append(DIGITS[0xF & c]);
336: return true;
337: }
338: return false;
339: }
340:
341: //// for StringPrep
342: /**
343: * Similar to StringBuffer.getChars, version 1.3.
344: * Since JDK 1.2 implements StringBuffer.getChars differently, this method
345: * is here to provide consistent results.
346: * To be removed after JDK 1.2 ceased to be the reference platform.
347: * @param src source string buffer
348: * @param srcBegin offset to the start of the src to retrieve from
349: * @param srcEnd offset to the end of the src to retrieve from
350: * @param dst char array to store the retrieved chars
351: * @param dstBegin offset to the start of the destination char array to
352: * store the retrieved chars
353: * @draft since ICU4J 2.0
354: */
355: public static void getChars(StringBuffer src, int srcBegin,
356: int srcEnd, char dst[], int dstBegin) {
357: if (srcBegin == srcEnd) {
358: return;
359: }
360: src.getChars(srcBegin, srcEnd, dst, dstBegin);
361: }
362:
363: /**
364: * Convenience utility to compare two char[]s.
365: * @param len the length to compare.
366: * The start indices and start+len must be valid.
367: */
368: public final static boolean arrayRegionMatches(char[] source,
369: int sourceStart, char[] target, int targetStart, int len) {
370: int sourceEnd = sourceStart + len;
371: int delta = targetStart - sourceStart;
372: for (int i = sourceStart; i < sourceEnd; i++) {
373: if (source[i] != target[i + delta])
374: return false;
375: }
376: return true;
377: }
378:
379: }
380: ///CLOVER:ON
|