001: /*
002: **********************************************************************
003: * Copyright (c) 2003, International Business Machines
004: * Corporation and others. All Rights Reserved.
005: **********************************************************************
006: * Author: Alan Liu
007: * Created: February 11 2003
008: * Since: ICU 2.6
009: **********************************************************************
010: */
011: package com.ibm.icu.dev.tool.translit;
012:
013: import com.ibm.icu.lang.UCharacter;
014: import com.ibm.icu.text.*;
015: import com.ibm.icu.impl.Utility;
016: import java.util.Collections;
017: import java.util.Comparator;
018: import java.util.HashMap;
019: import java.util.Iterator;
020: import java.util.Map;
021: import java.util.Set;
022: import java.util.TreeSet;
023: import java.util.Vector;
024: import java.util.Locale;
025: import java.io.*;
026:
027: /**
028: * This class produces the data tables used by the closeOver() method
029: * of UnicodeSet.
030: *
031: * Whenever the Unicode database changes, this tool must be re-run
032: * (AFTER the data file(s) underlying ICU4J are udpated).
033: *
034: * The output of this tool should then be pasted into the appropriate
035: * files:
036: *
037: * ICU4J: com.ibm.icu.text.UnicodeSet.java
038: * ICU4C: /icu/source/common/uniset.cpp
039: */
040: class UnicodeSetCloseOver {
041:
042: // Our output files
043: static final String JAVA_OUT = "to_UnicodeSet.java";
044: static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
045: static final String C_SET_OUT = "to_uniset.cpp";
046: static final String C_UCHAR_OUT = "to_uchar.c";
047:
048: // Source code "do not edit" warning
049: static final String WARNING = "MACHINE-GENERATED; Unicode version "
050: + UCharacter.getUnicodeVersion() + "; DO NOT EDIT; See "
051: + UnicodeSetCloseOver.class.getName();
052:
053: // Case folding options flag. This must correspond to the options
054: // used in UnicodeSet.closeOver() in Java and C++.
055: static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
056:
057: public static void main(String[] args) throws IOException {
058: System.out
059: .println("This tool will generate several output files. Each is named according");
060: System.out
061: .println("the target file. For example, the contents of to_UnicodeSet.java should");
062: System.out.println("be pasted into UnicodeSet.java.");
063: System.out.println();
064:
065: generateCaseData();
066: }
067:
068: /**
069: * Create a map of String => Set. The String in this case is a
070: * folded string for which
071: * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
072: * The Set contains all single-character strings x for which
073: * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
074: * well as folded itself.
075: */
076: static Map createCaseFoldEquivalencyClasses() {
077: Map equivClasses = new HashMap();
078: for (int i = 0; i <= 0x10FFFF; ++i) {
079: int cat = UCharacter.getType(i);
080: if (cat == Character.UNASSIGNED
081: || cat == Character.PRIVATE_USE)
082: continue;
083:
084: String cp = UTF16.valueOf(i);
085: String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
086: if (folded.equals(cp))
087: continue;
088:
089: // At this point, have different case folding. Add
090: // the code point and its folded equivalent into the
091: // equivalency class.
092: TreeSet s = (TreeSet) equivClasses.get(folded);
093: if (s == null) {
094: s = new TreeSet();
095: s.add(folded); // add the case fold result itself
096: equivClasses.put(folded, s);
097: }
098: s.add(cp);
099: }
100: return equivClasses;
101: }
102:
103: /**
104: * Analyze the case fold equivalency classes. Break them into two
105: * groups: 'pairs', and 'nonpairs'. Create a tally of the length
106: * configurations of the nonpairs.
107: *
108: * Length configurations of equivalency classes, as of Unicode
109: * 3.2. Most of the classes (83%) have two single codepoints.
110: * Here "112:28" means there are 28 equivalency classes with 2
111: * single codepoints and one string of length 2.
112: *
113: * 11:656
114: * 111:16
115: * 1111:3
116: * 112:28
117: * 113:2
118: * 12:31
119: * 13:12
120: * 22:38
121: *
122: * Note: This method does not count the frequencies of the
123: * different length configurations (as shown above after ':'); it
124: * merely records which configurations occur.
125: *
126: * @param pairs Accumulate equivalency classes that consist of
127: * exactly two codepoints here. This is 83+% of the classes.
128: * E.g., {"a", "A"}.
129: * @param nonpairs Accumulate other equivalency classes here, as
130: * lists of strings. E,g, {"st", "\uFB05", "\uFB06"}.
131: * @param lengths Accumulate a list of unique length structures,
132: * not including pairs. Each length structure is represented by a
133: * string of digits. The digit string "12" means the equivalency
134: * class contains a single code point and a string of length 2.
135: * Typical contents of 'lengths': { "111", "1111", "112",
136: * "113", "12", "13", "22" }. Note the absence of "11".
137: */
138: static void analyzeCaseData(Map equivClasses, StringBuffer pairs,
139: Vector nonpairs, Vector lengths) {
140: Iterator i = new TreeSet(equivClasses.keySet()).iterator();
141: StringBuffer buf = new StringBuffer();
142: while (i.hasNext()) {
143: Object key = i.next();
144: Vector v = new Vector((Set) equivClasses.get(key));
145: if (v.size() == 2) {
146: String a = (String) v.elementAt(0);
147: String b = (String) v.elementAt(1);
148: if (a.length() == 1 && b.length() == 1) {
149: pairs.append(a).append(b);
150: continue;
151: // Note that pairs are included in 'lengths'
152: }
153: }
154: String[] a = new String[v.size()];
155: v.toArray(a);
156: nonpairs.add(a);
157: //int singleCount = 0;
158: //int stringCount = 0;
159: // Make a string of the lengths, e.g., "111" means 3
160: // single code points; "13" means a single code point
161: // and a string of length 3.
162: v.clear();
163: for (int j = 0; j < a.length; ++j) {
164: v.add(new Integer(a[j].length()));
165: }
166: Collections.sort(v);
167: buf.setLength(0);
168: for (int j = 0; j < v.size(); ++j) {
169: buf.append(String.valueOf(v.elementAt(j)));
170: }
171: if (!lengths.contains(buf.toString())) {
172: lengths.add(buf.toString());
173: }
174: }
175: }
176:
177: static void generateCaseData() throws IOException {
178:
179: Map equivClasses = createCaseFoldEquivalencyClasses();
180:
181: // Accumulate equivalency classes that consist of exactly
182: // two codepoints here. This is 83+% of the classes.
183: // E.g., {"a", "A"}.
184: StringBuffer pairs = new StringBuffer();
185:
186: // Accumulate other equivalency classes here, as lists
187: // of strings. E,g, {"st", "\uFB05", "\uFB06"}.
188: Vector nonpairs = new Vector(); // contains String[]
189: Vector lengths = new Vector(); // "111", "12", "22", etc.
190:
191: analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
192:
193: //-------------------------------------------------------------
194: // Emit Java source
195: PrintStream out = new PrintStream(
196: new FileOutputStream(JAVA_OUT));
197: System.out.println("Writing " + JAVA_OUT);
198:
199: out.println(" // " + WARNING);
200: out.println(" private static final String CASE_PAIRS =");
201: out.println(Utility.formatForSource(pairs.toString()) + ";");
202: out.println();
203: out.println(" // " + WARNING);
204: out
205: .println(" private static final String[][] CASE_NONPAIRS = {");
206: for (int j = 0; j < nonpairs.size(); ++j) {
207: String[] a = (String[]) nonpairs.elementAt(j);
208: out.print(" {");
209: for (int k = 0; k < a.length; ++k) {
210: if (k != 0)
211: out.print(", ");
212: out.print(Utility.format1ForSource(a[k]));
213: }
214: out.println("},");
215: }
216: out.println(" };");
217:
218: //-------------------------------------------------------------
219: // Emit C++ source
220:
221: // In C++, the pairs are again emitted in an array, but this
222: // array is the final representation form -- it will not be
223: // reprocessed into a hash. It will be binary searched by
224: // looking at the even elements [0], [2], [4], etc., and
225: // ignoring the odd elements. The even elements must contain
226: // the folded members of the pairs. That is, in the pair
227: // {'A', 'a'}, the even element must be 'a', not 'A'. Then a
228: // code point to be located is first folded ('Y' => 'y') then
229: // it binary searched against [0]='A', [2]='B', etc. When a
230: // match is found at k, the pair is [k], [k+1].
231:
232: out = new PrintStream(new FileOutputStream(C_SET_OUT));
233: System.out.println("Writing " + C_SET_OUT);
234:
235: // Sort the pairs. They must be ordered by the folded element.
236: // Store these as two-character strings, with charAt(0) being
237: // the folded member of the pair.
238: TreeSet sortPairs = new TreeSet(new Comparator() {
239: public int compare(Object a, Object b) {
240: return ((int) ((String) a).charAt(0))
241: - ((int) ((String) b).charAt(0));
242: }
243:
244: public boolean equals(Object obj) {
245: return false;
246: }
247: });
248: for (int i = 0; i < pairs.length(); i += 2) {
249: String a = String.valueOf(pairs.charAt(i));
250: String b = String.valueOf(pairs.charAt(i + 1));
251: String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
252: if (a.equals(folded)) {
253: sortPairs.add(a + b);
254: } else {
255: sortPairs.add(b + a);
256: }
257: }
258:
259: // Emit the pairs
260: out.println("// " + WARNING);
261: out.println("static const UChar CASE_PAIRS[] = {");
262: Iterator it = sortPairs.iterator();
263: while (it.hasNext()) {
264: out.print(" ");
265: int n = 0;
266: while (n++ < 5 && it.hasNext()) {
267: String s = (String) it.next();
268: //out.print((int) s.charAt(0) + "," +
269: // (int) s.charAt(1) + ",");
270: out.print("0x" + Utility.hex(s.charAt(0)) + ",0x"
271: + Utility.hex(s.charAt(1)) + ",");
272: }
273: out.println();
274: }
275: out.println("};");
276: out.println();
277:
278: // The non-pairs are encoded in the following way. All the
279: // single codepoints in each class are grouped together
280: // followed by a zero. Then each multi-character string is
281: // added, followed by a zero. Finally, another zero is added.
282: // Some examples:
283: // {"iQ", "R"} => [ 'R', 0, 'i', 'Q', 0, 0 ]
284: // {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ]
285: // {"jW", "jY"} => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
286: // The end-result is a short, flat array of UChar values that
287: // can be used to initialize a UChar[] array in C.
288:
289: int maxLen = 0; // Maximum encoded length of any class, including zeros
290: out.println("// " + WARNING);
291: out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
292: for (int j = 0; j < nonpairs.size(); ++j) {
293: int len = 0;
294: String[] a = (String[]) nonpairs.elementAt(j);
295: out.print(" {");
296: // Emit single code points
297: for (int k = 0; k < a.length; ++k) {
298: if (a[k].length() != 1)
299: continue;
300: //out.print((int) a[k].charAt(0) + ",");
301: out.print("0x" + Utility.hex(a[k].charAt(0)) + ",");
302: ++len;
303: }
304: out.print("0, "); // End of single code points
305: ++len;
306: // Emit multi-character strings
307: for (int k = 0; k < a.length; ++k) {
308: if (a[k].length() == 1)
309: continue;
310: for (int m = 0; m < a[k].length(); ++m) {
311: //out.print((int) a[k].charAt(m) + ",");
312: out.print("0x" + Utility.hex(a[k].charAt(m)) + ",");
313: ++len;
314: }
315: out.print("0, "); // End of string
316: ++len;
317: }
318: out.println("0},"); // End of equivalency class
319: ++len;
320: if (len > maxLen)
321: maxLen = len;
322: }
323: out.println("};");
324:
325: // Make sure the CaseEquivClass data can fit.
326: if (maxLen > 8) {
327: throw new RuntimeException(
328: "Must adjust CaseEquivClass to accomodate "
329: + maxLen + " UChars");
330: }
331:
332: // Also make sure that we can map into this array using a
333: // CompactByteArray. We could do this check above, but we
334: // keep it here, adjacent to the maxLen check. We use one
335: // value (-1 == 255) to indicate "no value."
336: if (nonpairs.size() > 255) {
337: throw new RuntimeException(
338: "Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
339: }
340:
341: //-------------------------------------------------------------
342: // Case-unique set: All characters c for which closeOver(c)==c.
343:
344: // UPDATE: Instead of using this, we're using the related
345: // notion of Case_Sensitive. See below. Note that
346: // Case_Sensitive != ^Case_Unique.
347:
348: if (false) {
349: UnicodeSet caseUnique = new UnicodeSet();
350: for (int i = 0; i <= 0x10FFFF; ++i) {
351: String cp = UTF16.valueOf(i);
352: if (equivClasses.get(UCharacter.foldCase(cp,
353: DEFAULT_CASE_MAP)) == null) {
354: caseUnique.add(i);
355: }
356: }
357: // out.println("caseUnique = " + caseUnique.toPattern(true));
358: }
359:
360: UnicodeSet caseSensitive = getCaseSensitive();
361: //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
362:
363: // Now for C, emit an array of ranges
364: out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
365: System.out.println("Writing " + C_UCHAR_OUT);
366:
367: out.println("/* " + WARNING + " */");
368: emitUCharRangesArray(out, caseSensitive,
369: "CASE_SENSITIVE_RANGES");
370:
371: // For Java, emit a string with the ranges (each pair of chars
372: // in the string is a range).
373: out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
374: System.out.println("Writing " + JAVA_CHARPROP_OUT);
375: out.println(" // " + WARNING);
376: emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
377: }
378:
379: /**
380: * Create the set of case-sensitive characters. These are characters
381: * that participate in any case mapping operation as a source or
382: * as a member of a target string.
383: */
384: static UnicodeSet getCaseSensitive() {
385: UnicodeSet caseSensitive = new UnicodeSet();
386: Locale loc = Locale.US;
387: BreakIterator bi = BreakIterator.getTitleInstance(loc);
388: for (int c = 0; c <= 0x10FFFF; ++c) {
389: String cp = UTF16.valueOf(c);
390: for (int j = 0; j < 4; ++j) {
391: String s = null;
392: switch (j) {
393: case 0:
394: s = UCharacter.toUpperCase(loc, cp);
395: break;
396: case 1:
397: s = UCharacter.toLowerCase(loc, cp);
398: break;
399: case 2:
400: s = UCharacter.toTitleCase(loc, cp, bi);
401: break;
402: case 3:
403: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
404: break;
405: }
406: if (!s.equals(cp)) {
407: int cc;
408: for (int k = 0; k < s.length(); k += UTF16
409: .getCharCount(cc)) {
410: cc = UTF16.charAt(s, k);
411: caseSensitive.add(cc);
412: }
413: for (int k = 0; k < cp.length(); k += UTF16
414: .getCharCount(cc)) {
415: cc = UTF16.charAt(cp, k);
416: caseSensitive.add(cc);
417: }
418: }
419: }
420: // Also do the single-codepoint API. This shouldn't add any
421: // code points, but we do it for completeness.
422: for (int j = 0; j < 4; ++j) {
423: int d = 0;
424: switch (j) {
425: case 0:
426: d = UCharacter.toUpperCase(c);
427: break;
428: case 1:
429: d = UCharacter.toLowerCase(c);
430: break;
431: case 2:
432: d = UCharacter.toTitleCase(c);
433: break;
434: case 3:
435: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP);
436: break;
437: }
438: if (d != c) {
439: if (!caseSensitive.contains(c)
440: || !caseSensitive.contains(d)) {
441: System.out.println("Warning: code point " + c
442: + " => " + d + " created NEW MAPPING"
443: + " for Case_Sensitive");
444: }
445: caseSensitive.add(c);
446: caseSensitive.add(d);
447: }
448: }
449: }
450: return caseSensitive;
451: }
452:
453: /**
454: * Given a UnicodeSet, emit it as an array of UChar pairs. Each
455: * pair will be the start/end of a range. Code points >= U+10000
456: * will be represented as surrogate pairs.
457: */
458: static void emitUCharRangesArray(PrintStream out, UnicodeSet set,
459: String id) {
460: // Store the pairs in a StringBuffer. This handles surrogate
461: // representation.
462: StringBuffer buf = new StringBuffer();
463: for (int i = 0; i < set.getRangeCount(); ++i) {
464: UTF16.append(buf, set.getRangeStart(i));
465: UTF16.append(buf, set.getRangeEnd(i));
466: }
467: // Emit the pairs
468: out.println("static const UChar " + id + "[] = {");
469: for (int i = 0; i < buf.length();) {
470: out.print(" ");
471: for (int n = 0; n++ < 10 && i < buf.length(); ++i) {
472: out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
473: }
474: out.println();
475: }
476: out.println("};");
477: out.println("#define " + id + "_LENGTH (sizeof(" + id
478: + ")/sizeof(" + id + "[0]))");
479: }
480:
481: /**
482: * Given a UnicodeSet, emit it as a Java string. The most economical
483: * format is not the pattern, but instead a pairs list, with each
484: * range pair represented as two adjacent characters.
485: */
486: static void emitRangesString(PrintStream out, UnicodeSet set,
487: String id) {
488: // Store the pairs in a StringBuffer. This handles surrogate
489: // representation.
490: StringBuffer buf = new StringBuffer();
491: for (int i = 0; i < set.getRangeCount(); ++i) {
492: UTF16.append(buf, set.getRangeStart(i));
493: UTF16.append(buf, set.getRangeEnd(i));
494: }
495: // Emit the pairs
496: out.println(" private static final String " + id + " =");
497: out.println(Utility.formatForSource(buf.toString()) + ";");
498: }
499: }
|