001: /**
002: *******************************************************************************
003: * Copyright (C) 2002-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.tool.layout;
007:
008: import com.ibm.icu.impl.Utility;
009: import com.ibm.icu.lang.UCharacter;
010: import com.ibm.icu.lang.UScript;
011: import com.ibm.icu.text.UnicodeSet;
012: import com.ibm.icu.text.CanonicalIterator;
013: import com.ibm.icu.text.UTF16;
014: import java.util.Vector;
015:
016: public class CanonicalCharacterData {
017: private static int THRESHOLD = 4;
018:
019: public class Record {
020: // TODO: might want to save arrays of Char32's rather than UTF16 strings...
021: Record(int character, int script) {
022: String char32 = UCharacter.toString(character);
023: CanonicalIterator iterator = new CanonicalIterator(char32);
024: Vector equivs = new Vector();
025:
026: composed = character;
027:
028: for (String equiv = iterator.next(); equiv != null; equiv = iterator
029: .next()) {
030: // Skip all equivalents of length 1; it's either the original
031: // characeter or something like Angstrom for A-Ring, which we don't care about
032: if (UTF16.countCodePoint(equiv) > 1) {
033: equivs.add(equiv);
034: }
035: }
036:
037: int nEquivalents = equivs.size();
038:
039: if (nEquivalents > maxEquivalents[script]) {
040: maxEquivalents[script] = nEquivalents;
041: }
042:
043: if (nEquivalents > 0) {
044: equivalents = new String[nEquivalents];
045:
046: if (nEquivalents > THRESHOLD) {
047: dumpEquivalents(character, equivs);
048: }
049:
050: sortEquivalents(equivalents, equivs);
051: }
052: }
053:
054: public int getComposedCharacter() {
055: return composed;
056: }
057:
058: public int countEquivalents() {
059: if (equivalents == null) {
060: return 0;
061: }
062:
063: return equivalents.length;
064: }
065:
066: public String[] getEquivalents() {
067: return equivalents;
068: }
069:
070: public String getEquivalent(int index) {
071: if (equivalents == null || index < 0
072: || index >= equivalents.length) {
073: return null;
074: }
075:
076: return equivalents[index];
077: }
078:
079: private void dumpEquivalents(int character, Vector equivs) {
080: int count = equivs.size();
081:
082: System.out.println(Utility.hex(character, 6) + " - "
083: + count + ":");
084:
085: for (int i = 0; i < count; i += 1) {
086: String equiv = (String) equivs.elementAt(i);
087: int codePoints = UTF16.countCodePoint(equiv);
088:
089: for (int c = 0; c < codePoints; c += 1) {
090: if (c > 0) {
091: System.out.print(" ");
092: }
093:
094: System.out.print(Utility.hex(
095: UTF16.charAt(equiv, c), 6));
096: }
097:
098: System.out.println();
099: }
100:
101: System.out.println();
102: }
103:
104: private int composed;
105: private String[] equivalents = null;
106: }
107:
108: public CanonicalCharacterData() {
109: // nothing to do...
110: }
111:
112: public void add(int character) {
113: int script = UScript.getScript(character);
114: Vector recordVector = recordVectors[script];
115:
116: if (recordVector == null) {
117: recordVector = recordVectors[script] = new Vector();
118: }
119:
120: recordVector.add(new Record(character, script));
121: }
122:
123: public int getMaxEquivalents(int script) {
124: if (script < 0 || script >= UScript.CODE_LIMIT) {
125: return 0;
126: }
127:
128: return maxEquivalents[script];
129: }
130:
131: public Record getRecord(int script, int index) {
132: if (script < 0 || script >= UScript.CODE_LIMIT) {
133: return null;
134: }
135:
136: Vector recordVector = recordVectors[script];
137:
138: if (recordVector == null || index < 0
139: || index >= recordVector.size()) {
140: return null;
141: }
142:
143: return (Record) recordVector.elementAt(index);
144: }
145:
146: public int countRecords(int script) {
147: if (script < 0 || script >= UScript.CODE_LIMIT
148: || recordVectors[script] == null) {
149: return 0;
150: }
151:
152: return recordVectors[script].size();
153: }
154:
155: public static CanonicalCharacterData factory(UnicodeSet characterSet) {
156: int charCount = characterSet.size();
157: CanonicalCharacterData data = new CanonicalCharacterData();
158:
159: System.out.println("There are " + charCount
160: + " characters with a canonical decomposition.");
161:
162: for (int i = 0; i < charCount; i += 1) {
163: data.add(characterSet.charAt(i));
164: }
165:
166: return data;
167: }
168:
169: private static int compareEquivalents(String a, String b) {
170: int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);
171:
172: if (result == 0) {
173: return a.compareTo(b);
174: }
175:
176: return result;
177: }
178:
179: //
180: // Straight insertion sort from Knuth vol. III, pg. 81
181: //
182: private static void sortEquivalents(String[] equivalents,
183: Vector unsorted) {
184: int nEquivalents = equivalents.length;
185:
186: for (int e = 0; e < nEquivalents; e += 1) {
187: String v = (String) unsorted.elementAt(e);
188: int i;
189:
190: for (i = e - 1; i >= 0; i -= 1) {
191: if (compareEquivalents(v, equivalents[i]) >= 0) {
192: break;
193: }
194:
195: equivalents[i + 1] = equivalents[i];
196: }
197:
198: equivalents[i + 1] = v;
199: }
200: }
201:
202: private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];
203: private int maxEquivalents[] = new int[UScript.CODE_LIMIT];
204:
205: }
|