001: /**
002: * Builds the normalization tables. This is a separate class so that it
003: * can be unloaded once not needed.<br>
004: * Copyright (C) 1998-2004 International Business Machines Corporation and
005: * Unicode, Inc. All Rights Reserved.<br>
006: * The Unicode Consortium makes no expressed or implied warranty of any
007: * kind, and assumes no liability for errors or omissions.
008: * No liability is assumed for incidental and consequential damages
009: * in connection with or arising out of the use of the information here.
010: * @author Mark Davis
011: * Updates for supplementary code points:
012: * Vladimir Weinstein & Markus Scherer
013: */package com.ibm.icu.dev.test.normalizer;
014:
015: import java.io.BufferedReader;
016: import java.util.BitSet;
017:
018: import com.ibm.icu.dev.test.TestUtil;
019: import com.ibm.icu.dev.test.UTF16Util;
020:
021: class NormalizerBuilder {
022: static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
023:
024: /**
025: * Testing flags
026: */
027:
028: private static final boolean DEBUG = false;
029: private static final boolean GENERATING = false;
030:
031: /**
032: * Constants for the data file version to use.
033: */
034: /*static final boolean NEW_VERSION = true;
035: private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
036:
037: static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
038: static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
039:
040: public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
041: public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
042: */
043:
044: /**
045: * Called exactly once by NormalizerData to build the static data
046: */
047:
048: static NormalizerData build(boolean fullData) {
049: try {
050: IntHashtable canonicalClass = new IntHashtable(0);
051: IntStringHashtable decompose = new IntStringHashtable(null);
052: LongHashtable compose = new LongHashtable(
053: NormalizerData.NOT_COMPOSITE);
054: BitSet isCompatibility = new BitSet();
055: BitSet isExcluded = new BitSet();
056: if (fullData) {
057: //System.out.println("Building Normalizer Data from file.");
058: readExclusionList(isExcluded);
059: //System.out.println(isExcluded.get(0x00C0));
060: buildDecompositionTables(canonicalClass, decompose,
061: compose, isCompatibility, isExcluded);
062: } else { // for use in Applets
063: //System.out.println("Building abridged data.");
064: setMinimalDecomp(canonicalClass, decompose, compose,
065: isCompatibility, isExcluded);
066: }
067: return new NormalizerData(canonicalClass, decompose,
068: compose, isCompatibility, isExcluded);
069: } catch (java.io.IOException e) {
070: System.err.println("Can't load data file." + e + ", "
071: + e.getMessage());
072: return null;
073: }
074: }
075:
076: // =============================================================
077: // Building Decomposition Tables
078: // =============================================================
079:
080: /**
081: * Reads exclusion list and stores the data
082: */
083: private static void readExclusionList(BitSet isExcluded)
084: throws java.io.IOException {
085: if (DEBUG)
086: System.out.println("Reading Exclusions");
087:
088: BufferedReader in = TestUtil
089: .getDataReader("unicode/CompositionExclusions.txt");
090:
091: while (true) {
092: // read a line, discarding comments and blank lines
093:
094: String line = in.readLine();
095: if (line == null)
096: break;
097: int comment = line.indexOf('#'); // strip comments
098: if (comment != -1)
099: line = line.substring(0, comment);
100: if (line.length() == 0)
101: continue; // ignore blanks
102: if (line.indexOf(' ') != -1) {
103: line = line.substring(0, line.indexOf(' '));
104: }
105: // store -1 in the excluded table for each character hit
106:
107: int value = Integer.parseInt(line, 16);
108: isExcluded.set(value);
109: //System.out.println("Excluding " + hex(value));
110: }
111: in.close();
112: if (DEBUG)
113: System.out.println("Done reading Exclusions");
114: }
115:
116: /**
117: * Builds a decomposition table from a UnicodeData file
118: */
119: private static void buildDecompositionTables(
120: IntHashtable canonicalClass, IntStringHashtable decompose,
121: LongHashtable compose, BitSet isCompatibility,
122: BitSet isExcluded) throws java.io.IOException {
123: if (DEBUG)
124: System.out.println("Reading Unicode Character Database");
125: //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
126: BufferedReader in = null;
127: try {
128: in = TestUtil.getDataReader("unicode/UnicodeData.txt");
129: } catch (Exception e) {
130: System.err.println("Failed to read UnicodeData.txt");
131: System.exit(1);
132: }
133:
134: int value;
135: long pair;
136: int counter = 0;
137: while (true) {
138:
139: // read a line, discarding comments and blank lines
140:
141: String line = in.readLine();
142: if (line == null)
143: break;
144: int comment = line.indexOf('#'); // strip comments
145: if (comment != -1)
146: line = line.substring(0, comment);
147: if (line.length() == 0)
148: continue;
149: if (DEBUG) {
150: counter++;
151: if ((counter & 0xFF) == 0)
152: System.out.println("At: " + line);
153: }
154:
155: // find the values of the particular fields that we need
156: // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
157:
158: int start = 0;
159: int end = line.indexOf(';'); // code
160: value = Integer.parseInt(line.substring(start, end), 16);
161: if (true && value == '\u00c0') {
162: //System.out.println("debug: " + line);
163: }
164: end = line.indexOf(';', start = end + 1); // name
165: /*String name = line.substring(start,end);*/
166: end = line.indexOf(';', start = end + 1); // general category
167: end = line.indexOf(';', start = end + 1); // canonical class
168:
169: // check consistency: canonical classes must be from 0 to 255
170:
171: int cc = Integer.parseInt(line.substring(start, end));
172: if (cc != (cc & 0xFF))
173: System.err.println("Bad canonical class at: " + line);
174: canonicalClass.put(value, cc);
175: end = line.indexOf(';', start = end + 1); // BIDI
176: end = line.indexOf(';', start = end + 1); // decomp
177:
178: // decomp requires more processing.
179: // store whether it is canonical or compatibility.
180: // store the decomp in one table, and the reverse mapping (from pairs) in another
181:
182: if (start != end) {
183: String segment = line.substring(start, end);
184: boolean compat = segment.charAt(0) == '<';
185: if (compat)
186: isCompatibility.set(value);
187: String decomp = fromHex(segment);
188:
189: // a small snippet of code to generate the Applet data
190:
191: /*if (GENERATING) {
192: if (value < 0xFF) {
193: System.out.println(
194: "\"\\u" + hex((char)value) + "\", "
195: + "\"\\u" + hex(decomp, "\\u") + "\", "
196: + (compat ? "\"K\"," : "\"\",")
197: + "// " + name);
198: }
199: }*/
200:
201: // check consistency: all canon decomps must be singles or pairs!
202: int decompLen = UTF16Util.countCodePoint(decomp);
203: if (decompLen < 1 || decompLen > 2 && !compat) {
204: System.err.println("Bad decomp at: " + line);
205: }
206: decompose.put(value, decomp);
207:
208: // only compositions are canonical pairs
209: // skip if script exclusion
210:
211: if (!compat && !isExcluded.get(value)) {
212: int first = '\u0000';
213: int second = UTF16Util.nextCodePoint(decomp, 0);
214: if (decompLen > 1) {
215: first = second;
216: second = UTF16Util.nextCodePoint(decomp,
217: UTF16Util.codePointLength(first));
218: }
219:
220: // store composition pair in single integer
221:
222: pair = ((long) first << 32) | second;
223: if (DEBUG && value == '\u00C0') {
224: System.out.println("debug2: " + line);
225: }
226: compose.put(pair, value);
227: } else if (DEBUG) {
228: System.out.println("Excluding: " + decomp);
229: }
230: }
231: }
232: in.close();
233: if (DEBUG)
234: System.out
235: .println("Done reading Unicode Character Database");
236:
237: // add algorithmic Hangul decompositions
238: // this is more compact if done at runtime, but for simplicity we
239: // do it this way.
240:
241: if (DEBUG)
242: System.out.println("Adding Hangul");
243:
244: for (int SIndex = 0; SIndex < SCount; ++SIndex) {
245: int TIndex = SIndex % TCount;
246: char first, second;
247: if (TIndex != 0) { // triple
248: first = (char) (SBase + SIndex - TIndex);
249: second = (char) (TBase + TIndex);
250: } else {
251: first = (char) (LBase + SIndex / NCount);
252: second = (char) (VBase + (SIndex % NCount) / TCount);
253: }
254: pair = ((long) first << 32) | second;
255: value = SIndex + SBase;
256: decompose.put(value, String.valueOf(first) + second);
257: compose.put(pair, value);
258: }
259: if (DEBUG)
260: System.out.println("Done adding Hangul");
261: }
262:
263: /**
264: * Hangul composition constants
265: */
266: static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161,
267: TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28,
268: NCount = VCount * TCount, // 588
269: SCount = LCount * NCount; // 11172
270:
271: /**
272: * For use in an applet: just load a minimal set of data.
273: */
274: private static void setMinimalDecomp(IntHashtable canonicalClass,
275: IntStringHashtable decompose, LongHashtable compose,
276: BitSet isCompatibility, BitSet isExcluded) {
277: String[] decomposeData = { "\u005E", "\u0020\u0302", "K",
278: "\u005F", "\u0020\u0332", "K", "\u0060",
279: "\u0020\u0300", "K", "\u00A0", "\u0020", "K", "\u00A8",
280: "\u0020\u0308", "K", "\u00AA", "\u0061", "K", "\u00AF",
281: "\u0020\u0304", "K", "\u00B2", "\u0032", "K", "\u00B3",
282: "\u0033", "K", "\u00B4", "\u0020\u0301", "K", "\u00B5",
283: "\u03BC", "K", "\u00B8", "\u0020\u0327", "K", "\u00B9",
284: "\u0031", "K", "\u00BA", "\u006F", "K", "\u00BC",
285: "\u0031\u2044\u0034", "K", "\u00BD",
286: "\u0031\u2044\u0032", "K", "\u00BE",
287: "\u0033\u2044\u0034", "K", "\u00C0", "\u0041\u0300",
288: "", "\u00C1", "\u0041\u0301", "", "\u00C2",
289: "\u0041\u0302", "", "\u00C3", "\u0041\u0303", "",
290: "\u00C4", "\u0041\u0308", "", "\u00C5", "\u0041\u030A",
291: "", "\u00C7", "\u0043\u0327", "", "\u00C8",
292: "\u0045\u0300", "", "\u00C9", "\u0045\u0301", "",
293: "\u00CA", "\u0045\u0302", "", "\u00CB", "\u0045\u0308",
294: "", "\u00CC", "\u0049\u0300", "", "\u00CD",
295: "\u0049\u0301", "", "\u00CE", "\u0049\u0302", "",
296: "\u00CF", "\u0049\u0308", "", "\u00D1", "\u004E\u0303",
297: "", "\u00D2", "\u004F\u0300", "", "\u00D3",
298: "\u004F\u0301", "", "\u00D4", "\u004F\u0302", "",
299: "\u00D5", "\u004F\u0303", "", "\u00D6", "\u004F\u0308",
300: "", "\u00D9", "\u0055\u0300", "", "\u00DA",
301: "\u0055\u0301", "", "\u00DB", "\u0055\u0302", "",
302: "\u00DC", "\u0055\u0308", "", "\u00DD", "\u0059\u0301",
303: "", "\u00E0", "\u0061\u0300", "", "\u00E1",
304: "\u0061\u0301", "", "\u00E2", "\u0061\u0302", "",
305: "\u00E3", "\u0061\u0303", "", "\u00E4", "\u0061\u0308",
306: "", "\u00E5", "\u0061\u030A", "", "\u00E7",
307: "\u0063\u0327", "", "\u00E8", "\u0065\u0300", "",
308: "\u00E9", "\u0065\u0301", "", "\u00EA", "\u0065\u0302",
309: "", "\u00EB", "\u0065\u0308", "", "\u00EC",
310: "\u0069\u0300", "", "\u00ED", "\u0069\u0301", "",
311: "\u00EE", "\u0069\u0302", "", "\u00EF", "\u0069\u0308",
312: "", "\u00F1", "\u006E\u0303", "", "\u00F2",
313: "\u006F\u0300", "", "\u00F3", "\u006F\u0301", "",
314: "\u00F4", "\u006F\u0302", "", "\u00F5", "\u006F\u0303",
315: "", "\u00F6", "\u006F\u0308", "", "\u00F9",
316: "\u0075\u0300", "", "\u00FA", "\u0075\u0301", "",
317: "\u00FB", "\u0075\u0302", "", "\u00FC", "\u0075\u0308",
318: "", "\u00FD", "\u0079\u0301",
319: "",
320: // EXTRAS, outside of Latin 1
321: "\u1EA4", "\u00C2\u0301", "", "\u1EA5", "\u00E2\u0301",
322: "", "\u1EA6", "\u00C2\u0300", "", "\u1EA7",
323: "\u00E2\u0300", "", };
324:
325: int[] classData = { 0x0300, 230, 0x0301, 230, 0x0302, 230,
326: 0x0303, 230, 0x0304, 230, 0x0305, 230, 0x0306, 230,
327: 0x0307, 230, 0x0308, 230, 0x0309, 230, 0x030A, 230,
328: 0x030B, 230, 0x030C, 230, 0x030D, 230, 0x030E, 230,
329: 0x030F, 230, 0x0310, 230, 0x0311, 230, 0x0312, 230,
330: 0x0313, 230, 0x0314, 230, 0x0315, 232, 0x0316, 220,
331: 0x0317, 220, 0x0318, 220, 0x0319, 220, 0x031A, 232,
332: 0x031B, 216, 0x031C, 220, 0x031D, 220, 0x031E, 220,
333: 0x031F, 220, 0x0320, 220, 0x0321, 202, 0x0322, 202,
334: 0x0323, 220, 0x0324, 220, 0x0325, 220, 0x0326, 220,
335: 0x0327, 202, 0x0328, 202, 0x0329, 220, 0x032A, 220,
336: 0x032B, 220, 0x032C, 220, 0x032D, 220, 0x032E, 220,
337: 0x032F, 220, 0x0330, 220, 0x0331, 220, 0x0332, 220,
338: 0x0333, 220, 0x0334, 1, 0x0335, 1, 0x0336, 1, 0x0337,
339: 1, 0x0338, 1, 0x0339, 220, 0x033A, 220, 0x033B, 220,
340: 0x033C, 220, 0x033D, 230, 0x033E, 230, 0x033F, 230,
341: 0x0340, 230, 0x0341, 230, 0x0342, 230, 0x0343, 230,
342: 0x0344, 230, 0x0345, 240, 0x0360, 234, 0x0361, 234 };
343:
344: // build the same tables we would otherwise get from the
345: // Unicode Character Database, just with limited data
346:
347: for (int i = 0; i < decomposeData.length; i += 3) {
348: char value = decomposeData[i].charAt(0);
349: String decomp = decomposeData[i + 1];
350: boolean compat = decomposeData[i + 2].equals("K");
351: if (compat)
352: isCompatibility.set(value);
353: decompose.put(value, decomp);
354: if (!compat) {
355: int first = '\u0000';
356: int second = UTF16Util.nextCodePoint(decomp, 0);
357: if (decomp.length() > 1) {
358: first = second;
359: second = UTF16Util.nextCodePoint(decomp, UTF16Util
360: .codePointLength(first));
361: }
362: long pair = (first << 16) | second;
363: compose.put(pair, value);
364: }
365: }
366:
367: for (int i = 0; i < classData.length;) {
368: canonicalClass.put(classData[i++], classData[i++]);
369: }
370: }
371:
372: /**
373: * Utility: Parses a sequence of hex Unicode characters separated by spaces
374: */
375: static public String fromHex(String source) {
376: StringBuffer result = new StringBuffer();
377: for (int i = 0; i < source.length(); ++i) {
378: char c = source.charAt(i);
379: switch (c) {
380: case ' ':
381: break; // ignore
382: case '0':
383: case '1':
384: case '2':
385: case '3':
386: case '4':
387: case '5':
388: case '6':
389: case '7':
390: case '8':
391: case '9':
392: case 'A':
393: case 'B':
394: case 'C':
395: case 'D':
396: case 'E':
397: case 'F':
398: case 'a':
399: case 'b':
400: case 'c':
401: case 'd':
402: case 'e':
403: case 'f':
404: int end = 0;
405: int value = 0;
406: try {
407: //System.out.println(source.substring(i, i + 4) + "************" + source);
408: end = source.indexOf(' ', i);
409: if (end < 0) {
410: end = source.length();
411: }
412: value = Integer.parseInt(source.substring(i, end),
413: 16);
414: UTF16Util.appendCodePoint(result, value);
415: } catch (Exception e) {
416: System.out.println("i: " + i + ";end:" + end
417: + "source:" + source);
418: //System.out.println(source.substring(i, i + 4) + "************" + source);
419: System.exit(1);
420: }
421: //i+= 3; // skip rest of number
422: i = end;
423: break;
424: case '<':
425: int j = source.indexOf('>', i); // skip <...>
426: if (j > 0) {
427: i = j;
428: break;
429: } // else fall through--error
430: default:
431: throw new IllegalArgumentException("Bad hex value in "
432: + source);
433: }
434: }
435: return result.toString();
436: }
437:
438: /**
439: * Utility: Supplies a zero-padded hex representation of an integer (without 0x)
440: */
441: static public String hex(int i) {
442: String result = Long.toString(i & 0xFFFFFFFFL, 16)
443: .toUpperCase();
444: return "00000000".substring(result.length(), 8) + result;
445: }
446:
447: /**
448: * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
449: */
450: static public String hex(char i) {
451: String result = Integer.toString(i, 16).toUpperCase();
452: return "0000".substring(result.length(), 4) + result;
453: }
454:
455: /**
456: * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
457: */
458: public static String hex(String s, String sep) {
459: StringBuffer result = new StringBuffer();
460: for (int i = 0; i < s.length(); ++i) {
461: if (i != 0)
462: result.append(sep);
463: result.append(hex(s.charAt(i)));
464: }
465: return result.toString();
466: }
467: }
|