001: /**
002: *******************************************************************************
003: * Copyright (C) 2002-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.test;
007:
008: /**
* Utility class for supplementary code point
* support. It was written purely for updating the
* Normalization sample from the unicode.org site.
* If you want the real thing, use the UTF16 class
* from ICU4J.
014: * @author Vladimir Weinstein, Markus Scherer
015: */
public class UTF16Util {

    // ------------------------------------------------------------------
    // Constants
    // ------------------------------------------------------------------

    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low 10 bits of a code point, i.e. the bits that
     * are stored in the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value added to (code point &gt;&gt; 10) to obtain the lead surrogate
     * directly, without first subtracting 0x10000. Evaluates to 0xD7C0.
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ = SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Offset to subtract from <code>(lead &lt;&lt; 10) + trail</code> to get
     * the supplementary code point. This is the negation of the offset used
     * by {@link #getRawSupplementary(char, char)}; it is kept because it is
     * visible to other classes in this package.
     */
    static final int suppOffset = (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            + TRAIL_SURROGATE_MIN_VALUE - SUPPLEMENTARY_MIN_VALUE;

    // ------------------------------------------------------------------
    // Reading code points from String / StringBuffer
    // ------------------------------------------------------------------

    /**
     * Method nextCodePoint. Returns the code point that starts at
     * index i of a string. If the code unit at i is a lead surrogate
     * that is immediately followed by a trail surrogate, the combined
     * supplementary code point is returned; otherwise the single code
     * unit at i is returned as is.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point that ends just before
     * index i. If the code unit at i-1 is a trail surrogate that is
     * immediately preceded by a lead surrogate, the combined
     * supplementary code point is returned; otherwise the single code
     * unit at i-1 is returned as is.
     * @param s String in question
     * @param i index in string; the code unit at i-1 is examined first
     * @return int codepoint ending before index i
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Method nextCodePoint. Returns the code point that starts at
     * index i of a StringBuffer. If the code unit at i is a lead
     * surrogate that is immediately followed by a trail surrogate, the
     * combined supplementary code point is returned; otherwise the
     * single code unit at i is returned as is.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point that ends just before
     * index i. If the code unit at i-1 is a trail surrogate that is
     * immediately preceded by a lead surrogate, the combined
     * supplementary code point is returned; otherwise the single code
     * unit at i-1 is returned as is.
     * @param s StringBuffer in question
     * @param i index in string; the code unit at i-1 is examined first
     * @return int codepoint ending before index i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Shared implementation of both nextCodePoint overloads.
     */
    private static int codePointStartingAt(CharSequence s, int i) {
        char lead = s.charAt(i);
        // Only look ahead when there is a following code unit to look at.
        if (isLeadSurrogate(lead) && ++i < s.length()) {
            char trail = s.charAt(i);
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(lead, trail);
            }
        }
        return lead; // BMP code unit or unpaired surrogate
    }

    /**
     * Shared implementation of both prevCodePoint overloads.
     */
    private static int codePointEndingBefore(CharSequence s, int i) {
        char trail = s.charAt(--i);
        // Only look back when there is a preceding code unit to look at.
        if (isTrailSurrogate(trail) && --i >= 0) {
            char lead = s.charAt(i);
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, trail);
            }
        }
        return trail; // BMP code unit or unpaired surrogate
    }

    // ------------------------------------------------------------------
    // Code point length
    // ------------------------------------------------------------------

    /**
     * Method codePointLength. Returns the length
     * in UTF-16 code units of a given code point.
     * Equivalent to {@link #getCharCount(int)}.
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>
     * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on
     * char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    // ------------------------------------------------------------------
    // Writing code points into a StringBuffer
    // ------------------------------------------------------------------

    /**
     * Method appendCodePoint. Appends a code point
     * to a StringBuffer. Code points below 0x10000 are appended as a
     * single char; all others are appended as a lead/trail pair. No
     * validity check is done on the code point.
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Method insertCodePoint. Inserts a code point in
     * a StringBuffer. Code points below 0x10000 are inserted as a
     * single char; all others are inserted as a lead/trail pair. No
     * validity check is done on the code point.
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer,
            int i, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Method setCodePointAt. Changes a code point at a
     * given index. Can change the length of the string.
     * The code point currently starting at index i is determined with
     * {@link #nextCodePoint(StringBuffer, int)}.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length:
     *         0 if old and new code point need the same number of chars,
     *         -1 if a supplementary code point was replaced by a BMP one,
     *         1 if a BMP code point was replaced by a supplementary one
     */
    public static final int setCodePointAt(StringBuffer buffer, int i,
            int ch) {
        boolean oldIsSupplementary =
                nextCodePoint(buffer, i) >= SUPPLEMENTARY_MIN_VALUE;

        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.setCharAt(i, (char) ch);
            if (oldIsSupplementary) {
                // BMP replaces supplementary: drop the old trail, buffer shrinks
                buffer.deleteCharAt(i + 1);
                return -1;
            }
            return 0; // both BMP
        }

        buffer.setCharAt(i, leadSurrogateOf(ch));
        if (oldIsSupplementary) {
            // both supplementary: overwrite the old trail in place
            buffer.setCharAt(i + 1, trailSurrogateOf(ch));
            return 0;
        }
        // supplementary replaces BMP: make room for the trail, buffer grows
        buffer.insert(i + 1, trailSurrogateOf(ch));
        return 1;
    }

    /**
     * Computes the lead surrogate for a code point of 0x10000 or above.
     */
    private static char leadSurrogateOf(int c) {
        return (char) (LEAD_SURROGATE_OFFSET_ + (c >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the trail surrogate for a code point of 0x10000 or above.
     */
    private static char trailSurrogateOf(int c) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (c & TRAIL_SURROGATE_MASK_));
    }

    // ------------------------------------------------------------------
    // Counting code points
    // ------------------------------------------------------------------

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string. A lead surrogate immediately followed
     * by a trail surrogate counts as one code point; every other code
     * unit, including unpaired surrogates, counts as one.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string. A lead surrogate immediately followed
     * by a trail surrogate counts as one code point; every other code
     * unit, including unpaired surrogates, counts as one.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Shared implementation of both countCodePoint overloads.
     */
    private static int countCodePoints(CharSequence source) {
        int count = 0;
        boolean pendingLead = false;

        for (int i = 0; i < source.length(); ++i) {
            char unit = source.charAt(i);
            if (pendingLead && isTrailSurrogate(unit)) {
                pendingLead = false; // count valid trail as zero
            } else {
                pendingLead = isLeadSurrogate(unit);
                ++count; // count others as 1
            }
        }

        return count;
    }

    // ------------------------------------------------------------------
    // Surrogate classification
    // ------------------------------------------------------------------

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    // ------------------------------------------------------------------
    // Random access into a char array
    // ------------------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use
     * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal()
     * </a></code> on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16. The boundaries of that codepoint are the same as in
     *         <code>bounds32()</code>.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit,
            int offset16) {
        offset16 += start; // from here on offset16 is an absolute array index
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // single is a lead surrogate: look at the following unit
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else {
            // single is a trail surrogate: look at the preceding unit
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail
                + SURROGATE_OFFSET_;
    }

}
|