001: /**
002: *******************************************************************************
003: * Copyright (C) 2000-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.test.translit;
007:
008: import com.ibm.icu.text.*;
009:
010: public final class TestUtility {
011:
012: public static String hex(char ch) {
013: String foo = Integer.toString(ch, 16).toUpperCase();
014: return "0000".substring(0, 4 - foo.length()) + foo;
015: }
016:
017: public static String hex(int ch) {
018: String foo = Integer.toString(ch, 16).toUpperCase();
019: return "00000000".substring(0, 4 - foo.length()) + foo;
020: }
021:
022: public static String hex(String s) {
023: return hex(s, ",");
024: }
025:
026: public static String hex(String s, String sep) {
027: if (s.length() == 0)
028: return "";
029: String result = hex(s.charAt(0));
030: for (int i = 1; i < s.length(); ++i) {
031: result += sep;
032: result += hex(s.charAt(i));
033: }
034: return result;
035: }
036:
037: public static String replace(String source, String toBeReplaced,
038: String replacement) {
039: StringBuffer results = new StringBuffer();
040: int len = toBeReplaced.length();
041: for (int i = 0; i < source.length(); ++i) {
042: if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
043: results.append(replacement);
044: i += len - 1; // minus one, since we will increment
045: } else {
046: results.append(source.charAt(i));
047: }
048: }
049: return results.toString();
050: }
051:
052: public static String replaceAll(String source, UnicodeSet set,
053: String replacement) {
054: StringBuffer results = new StringBuffer();
055: int cp;
056: for (int i = 0; i < source.length(); i += UTF16
057: .getCharCount(cp)) {
058: cp = UTF16.charAt(source, i);
059: if (set.contains(cp)) {
060: results.append(replacement);
061: } else {
062: UTF16.append(results, cp);
063: }
064: }
065: return results.toString();
066: }
067:
068: // COMMENTED OUT ALL THE OLD SCRIPT STUFF
069: /*
070: public static byte getScript(char c) {
071: return getScript(getBlock(c));
072: }
073:
074: public static byte getScript(byte block) {
075: return blockToScript[block];
076: }
077:
078: public static byte getBlock(char c) {
079: int index = c >> 7;
080: byte block = charToBlock[index];
081: while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
082: int[] tuple = split[-block-1];
083: if (c < tuple[0]) block = (byte)tuple[1];
084: else block = (byte)tuple[2];
085: }
086: return block;
087: }
088:
089: // returns next letter of script, or 0xFFFF if done
090:
091: public static char getNextLetter(char c, byte script) {
092: while (c < 0xFFFF) {
093: ++c;
094: if (getScript(c) == script && Character.isLetter(c)) {
095: return c;
096: }
097: }
098: return c;
099: }
100:
101: // Supplements to Character methods; these methods go through
102: // UCharacter if possible. If not, they fall back to Character.
103:
104: public static boolean isUnassigned(char c) {
105: try {
106: return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
107: } catch (NullPointerException e) {
108: System.out.print("");
109: }
110: return Character.getType(c) == Character.UNASSIGNED;
111: }
112:
113: public static boolean isLetter(char c) {
114: try {
115: return UCharacter.isLetter(c);
116: } catch (NullPointerException e) {
117: System.out.print("");
118: }
119: return Character.isLetter(c);
120: }
121:
122: public static void main(String[] args) {
123: System.out.println("Blocks: ");
124: byte lastblock = -128;
125: for (char cc = 0; cc < 0xFFFF; ++cc) {
126: byte block = TestUtility.getBlock(cc);
127: if (block != lastblock) {
128: System.out.println(TestUtility.hex(cc) + "\t" + block);
129: lastblock = block;
130: }
131: }
132: System.out.println();
133: System.out.println("Scripts: ");
134: byte lastScript = -128;
135: for (char cc = 0; cc < 0xFFFF; ++cc) {
136: byte script = TestUtility.getScript(cc);
137: if (script != lastScript) {
138: System.out.println(TestUtility.hex(cc) + "\t" + script);
139: lastScript = script;
140: }
141: }
142: }
143:
144:
145:
146: public static final byte // SCRIPT CODE
147: COMMON_SCRIPT = 0,
148: LATIN_SCRIPT = 1,
149: GREEK_SCRIPT = 2,
150: CYRILLIC_SCRIPT = 3,
151: ARMENIAN_SCRIPT = 4,
152: HEBREW_SCRIPT = 5,
153: ARABIC_SCRIPT = 6,
154: SYRIAC_SCRIPT = 7,
155: THAANA_SCRIPT = 8,
156: DEVANAGARI_SCRIPT = 9,
157: BENGALI_SCRIPT = 10,
158: GURMUKHI_SCRIPT = 11,
159: GUJARATI_SCRIPT = 12,
160: ORIYA_SCRIPT = 13,
161: TAMIL_SCRIPT = 14,
162: TELUGU_SCRIPT = 15,
163: KANNADA_SCRIPT = 16,
164: MALAYALAM_SCRIPT = 17,
165: SINHALA_SCRIPT = 18,
166: THAI_SCRIPT = 19,
167: LAO_SCRIPT = 20,
168: TIBETAN_SCRIPT = 21,
169: MYANMAR_SCRIPT = 22,
170: GEORGIAN_SCRIPT = 23,
171: JAMO_SCRIPT = 24,
172: HANGUL_SCRIPT = 25,
173: ETHIOPIC_SCRIPT = 26,
174: CHEROKEE_SCRIPT = 27,
175: ABORIGINAL_SCRIPT = 28,
176: OGHAM_SCRIPT = 29,
177: RUNIC_SCRIPT = 30,
178: KHMER_SCRIPT = 31,
179: MONGOLIAN_SCRIPT = 32,
180: HIRAGANA_SCRIPT = 33,
181: KATAKANA_SCRIPT = 34,
182: BOPOMOFO_SCRIPT = 35,
183: HAN_SCRIPT = 36,
184: YI_SCRIPT = 37;
185:
186: public static final byte // block code
187: RESERVED_BLOCK = 0,
188: BASIC_LATIN = 1,
189: LATIN_1_SUPPLEMENT = 2,
190: LATIN_EXTENDED_A = 3,
191: LATIN_EXTENDED_B = 4,
192: IPA_EXTENSIONS = 5,
193: SPACING_MODIFIER_LETTERS = 6,
194: COMBINING_DIACRITICAL_MARKS = 7,
195: GREEK = 8,
196: CYRILLIC = 9,
197: ARMENIAN = 10,
198: HEBREW = 11,
199: ARABIC = 12,
200: SYRIAC = 13,
201: THAANA = 14,
202: DEVANAGARI = 15,
203: BENGALI = 16,
204: GURMUKHI = 17,
205: GUJARATI = 18,
206: ORIYA = 19,
207: TAMIL = 20,
208: TELUGU = 21,
209: KANNADA = 22,
210: MALAYALAM = 23,
211: SINHALA = 24,
212: THAI = 25,
213: LAO = 26,
214: TIBETAN = 27,
215: MYANMAR = 28,
216: GEORGIAN = 29,
217: HANGUL_JAMO = 30,
218: ETHIOPIC = 31,
219: CHEROKEE = 32,
220: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
221: OGHAM = 34,
222: RUNIC = 35,
223: KHMER = 36,
224: MONGOLIAN = 37,
225: LATIN_EXTENDED_ADDITIONAL = 38,
226: GREEK_EXTENDED = 39,
227: GENERAL_PUNCTUATION = 40,
228: SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
229: CURRENCY_SYMBOLS = 42,
230: COMBINING_MARKS_FOR_SYMBOLS = 43,
231: LETTERLIKE_SYMBOLS = 44,
232: NUMBER_FORMS = 45,
233: ARROWS = 46,
234: MATHEMATICAL_OPERATORS = 47,
235: MISCELLANEOUS_TECHNICAL = 48,
236: CONTROL_PICTURES = 49,
237: OPTICAL_CHARACTER_RECOGNITION = 50,
238: ENCLOSED_ALPHANUMERICS = 51,
239: BOX_DRAWING = 52,
240: BLOCK_ELEMENTS = 53,
241: GEOMETRIC_SHAPES = 54,
242: MISCELLANEOUS_SYMBOLS = 55,
243: DINGBATS = 56,
244: BRAILLE_PATTERNS = 57,
245: CJK_RADICALS_SUPPLEMENT = 58,
246: KANGXI_RADICALS = 59,
247: IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
248: CJK_SYMBOLS_AND_PUNCTUATION = 61,
249: HIRAGANA = 62,
250: KATAKANA = 63,
251: BOPOMOFO = 64,
252: HANGUL_COMPATIBILITY_JAMO = 65,
253: KANBUN = 66,
254: BOPOMOFO_EXTENDED = 67,
255: ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
256: CJK_COMPATIBILITY = 69,
257: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
258: CJK_UNIFIED_IDEOGRAPHS = 71,
259: YI_SYLLABLES = 72,
260: YI_RADICALS = 73,
261: HANGUL_SYLLABLES = 74,
262: HIGH_SURROGATES = 75,
263: HIGH_PRIVATE_USE_SURROGATES = 76,
264: LOW_SURROGATES = 77,
265: PRIVATE_USE = 78,
266: CJK_COMPATIBILITY_IDEOGRAPHS = 79,
267: ALPHABETIC_PRESENTATION_FORMS = 80,
268: ARABIC_PRESENTATION_FORMS_A = 81,
269: COMBINING_HALF_MARKS = 82,
270: CJK_COMPATIBILITY_FORMS = 83,
271: SMALL_FORM_VARIANTS = 84,
272: ARABIC_PRESENTATION_FORMS_B = 85,
273: SPECIALS = 86,
274: HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
275:
276: static final byte[] blockToScript = {
277: COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
278: LATIN_SCRIPT, // 1, BASIC_LATIN
279: LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
280: LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
281: LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
282: LATIN_SCRIPT, // 5, IPA_EXTENSIONS
283: COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
284: COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
285: GREEK_SCRIPT, // 8, GREEK
286: CYRILLIC_SCRIPT, // 9, CYRILLIC
287: ARMENIAN_SCRIPT, // 10, ARMENIAN
288: HEBREW_SCRIPT, // 11, HEBREW
289: ARABIC_SCRIPT, // 12, ARABIC
290: SYRIAC_SCRIPT, // 13, SYRIAC
291: THAANA_SCRIPT, // 14, THAANA
292: DEVANAGARI_SCRIPT, // 15, DEVANAGARI
293: BENGALI_SCRIPT, // 16, BENGALI
294: GURMUKHI_SCRIPT, // 17, GURMUKHI
295: GUJARATI_SCRIPT, // 18, GUJARATI
296: ORIYA_SCRIPT, // 19, ORIYA
297: TAMIL_SCRIPT, // 20, TAMIL
298: TELUGU_SCRIPT, // 21, TELUGU
299: KANNADA_SCRIPT, // 22, KANNADA
300: MALAYALAM_SCRIPT, // 23, MALAYALAM
301: SINHALA_SCRIPT, // 24, SINHALA
302: THAI_SCRIPT, // 25, THAI
303: LAO_SCRIPT, // 26, LAO
304: TIBETAN_SCRIPT, // 27, TIBETAN
305: MYANMAR_SCRIPT, // 28, MYANMAR
306: GEORGIAN_SCRIPT, // 29, GEORGIAN
307: JAMO_SCRIPT, // 30, HANGUL_JAMO
308: ETHIOPIC_SCRIPT, // 31, ETHIOPIC
309: CHEROKEE_SCRIPT, // 32, CHEROKEE
310: ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
311: OGHAM_SCRIPT, // 34, OGHAM
312: RUNIC_SCRIPT, // 35, RUNIC
313: KHMER_SCRIPT, // 36, KHMER
314: MONGOLIAN_SCRIPT, // 37, MONGOLIAN
315: LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
316: GREEK_SCRIPT, // 39, GREEK_EXTENDED
317: COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
318: COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
319: COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
320: COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
321: COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
322: COMMON_SCRIPT, // 45, NUMBER_FORMS
323: COMMON_SCRIPT, // 46, ARROWS
324: COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
325: COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
326: COMMON_SCRIPT, // 49, CONTROL_PICTURES
327: COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
328: COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
329: COMMON_SCRIPT, // 52, BOX_DRAWING
330: COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
331: COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
332: COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
333: COMMON_SCRIPT, // 56, DINGBATS
334: COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
335: HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
336: HAN_SCRIPT, // 59, KANGXI_RADICALS
337: HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
338: COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
339: HIRAGANA_SCRIPT, // 62, HIRAGANA
340: KATAKANA_SCRIPT, // 63, KATAKANA
341: BOPOMOFO_SCRIPT, // 64, BOPOMOFO
342: JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
343: HAN_SCRIPT, // 66, KANBUN
344: BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
345: COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
346: COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
347: HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
348: HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
349: YI_SCRIPT, // 72, YI_SYLLABLES
350: YI_SCRIPT, // 73, YI_RADICALS
351: HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
352: COMMON_SCRIPT, // 75, HIGH_SURROGATES
353: COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
354: COMMON_SCRIPT, // 77, LOW_SURROGATES
355: COMMON_SCRIPT, // 78, PRIVATE_USE
356: HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
357: COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
358: ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
359: COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
360: COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
361: COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
362: ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
363: COMMON_SCRIPT, // 86, SPECIALS
364: COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
365: COMMON_SCRIPT, // 88, SPECIALS
366: };
367:
368: // could be further reduced to a byte array, but I didn't bother.
369: static final int[][] split = {
370: {0x0250, 4, 5}, // -1
371: {0x02B0, 5, 6}, // -2
372: {0x0370, 7, 8}, // -3
373: {0x0530, 0, 10}, // -4
374: {0x0590, 10, 11}, // -5
375: {0x0750, 13, 0}, // -6
376: {0x07C0, 14, 0}, // -7
377: {0x10A0, 28, 29}, // -8
378: {0x13A0, 0, 32}, // -9
379: {0x16A0, 34, 35}, // -10
380: {0x18B0, 37, 0}, // -11
381: {0x2070, 40, 41}, // -12
382: {0x20A0, 41, -31}, // -13
383: {0x2150, 44, 45}, // -14
384: {0x2190, 45, 46}, // -15
385: {0x2440, 49, -32}, // -16
386: {0x25A0, 53, 54}, // -17
387: {0x27C0, 56, 0}, // -18
388: {0x2FE0, 59, -33}, // -19
389: {0x3040, 61, 62}, // -20
390: {0x30A0, 62, 63}, // -21
391: {0x3130, 64, 65}, // -22
392: {0x3190, 65, -34}, // -23
393: {0x4DB6, 70, 0}, // -24
394: {0xA490, 72, -35}, // -25
395: {0xD7A4, 74, 0}, // -26
396: {0xFB50, 80, 81}, // -27
397: {0xFE20, 0, -36}, // -28
398: {0xFEFF, 85, 86}, // -29
399: {0xFFF0, 87, -37}, // -30
400: {0x20D0, 42, 43}, // -31
401: {0x2460, 50, 51}, // -32
402: {0x2FF0, 0, 60}, // -33
403: {0x31A0, 66, -38}, // -34
404: {0xA4D0, 73, 0}, //-35
405: {0xFE30, 82, -39}, //-36
406: {0xFFFE, 88, 0}, //-37
407: {0x31C0, 67, 0}, // -38
408: {0xFE50, 83, -40}, //-39
409: {0xFE70, 84, 85} // -40
410: };
411:
412: static final byte[] charToBlock = {
413: 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
414: 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
415: 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
416: 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
417: -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
418: 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
419: -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
420: 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
421: 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
422: 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
423: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
424: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
425: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
426: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
427: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
428: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
429: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
430: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
431: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
432: 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
433: 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
434: 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
435: 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
436: 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
437: 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
438: 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
439: 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
440: 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
441: 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
442: 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
443: 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
444: 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
445: };
446: */
447: }
|