001: /**
002: *******************************************************************************
003: * Copyright (C) 2001-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.demo.translit;
007:
008: import com.ibm.icu.lang.UScript;
009: import com.ibm.icu.lang.UCharacter;
010: import com.ibm.icu.text.UTF16;
011: import com.ibm.icu.text.Transliterator;
012: import com.ibm.icu.text.UnicodeSet;
013: import com.ibm.icu.text.UnicodeSetIterator;
014: import com.ibm.icu.text.Normalizer;
015: import com.ibm.icu.impl.Utility;
016:
017: import java.util.*;
018: import java.io.*;
019:
020: public class TransliterationChart {
021: public static void main(String[] args) throws IOException {
022: System.out.println("Start");
023: UnicodeSet lengthMarks = new UnicodeSet(
024: "[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]");
025: int[] indicScripts = { UScript.LATIN, UScript.DEVANAGARI,
026: UScript.BENGALI, UScript.GURMUKHI, UScript.GUJARATI,
027: UScript.ORIYA, UScript.TAMIL, UScript.TELUGU,
028: UScript.KANNADA, UScript.MALAYALAM, };
029: String[] names = new String[indicScripts.length];
030: UnicodeSet[] sets = new UnicodeSet[indicScripts.length];
031: Transliterator[] fallbacks = new Transliterator[indicScripts.length];
032: for (int i = 0; i < indicScripts.length; ++i) {
033: names[i] = UScript.getName(indicScripts[i]);
034: sets[i] = new UnicodeSet("[[:" + names[i]
035: + ":]&[[:L:][:M:]]&[:age=3.1:]]");
036: fallbacks[i] = Transliterator
037: .getInstance("any-" + names[i]);
038: }
039: EquivClass eq = new EquivClass(new ReverseComparator());
040: PrintWriter pw = openPrintWriter("transChart.html");
041: pw
042: .println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
043: pw.println("<title>Indic Transliteration Chart</title><style>");
044: pw.println("td { text-align: Center; font-size: 200% }");
045: pw.println("tt { font-size: 50% }");
046: pw.println("td.miss { background-color: #CCCCFF }");
047: pw.println("</style></head><body bgcolor='#FFFFFF'>");
048:
049: Transliterator anyToLatin = Transliterator
050: .getInstance("any-latin");
051:
052: String testString = "\u0946\u093E";
053:
054: UnicodeSet failNorm = new UnicodeSet();
055: Set latinFail = new TreeSet();
056:
057: for (int i = 0; i < indicScripts.length; ++i) {
058: if (indicScripts[i] == UScript.LATIN)
059: continue;
060: String source = names[i];
061: System.out.println(source);
062: UnicodeSet sourceChars = sets[i];
063:
064: for (int j = 0; j < indicScripts.length; ++j) {
065: if (i == j)
066: continue;
067: String target = names[j];
068: Transliterator forward = Transliterator
069: .getInstance(source + '-' + target);
070: Transliterator backward = forward.getInverse();
071: UnicodeSetIterator it = new UnicodeSetIterator(
072: sourceChars);
073: while (it.next()) {
074: if (lengthMarks.contains(it.codepoint))
075: continue;
076: String s = Normalizer.normalize(it.codepoint,
077: Normalizer.NFC, 0);
078: //if (!Normalizer.isNormalized(s,Normalizer.NFC,0)) continue;
079: if (!s.equals(Normalizer.normalize(s,
080: Normalizer.NFD, 0))) {
081: failNorm.add(it.codepoint);
082: }
083: String t = fix(forward.transliterate(s));
084: if (t.equals(testString)) {
085: System.out.println("debug");
086: }
087:
088: String r = fix(backward.transliterate(t));
089: if (Normalizer.compare(s, r, 0) == 0) {
090: if (indicScripts[j] != UScript.LATIN)
091: eq.add(s, t);
092: } else {
093: if (indicScripts[j] == UScript.LATIN) {
094: latinFail.add(s + " - " + t + " - " + r);
095: }
096: }
097: }
098: }
099: }
100: // collect equivalents
101: pw.println("<table border='1' cellspacing='0'><tr>");
102: for (int i = 0; i < indicScripts.length; ++i) {
103: pw.print("<th width='10%'>" + names[i].substring(0, 3)
104: + "</th>");
105: }
106: pw.println("</tr>");
107:
108: Iterator rit = eq.getSetIterator(new MyComparator());
109: while (rit.hasNext()) {
110: Set equivs = (Set) rit.next();
111: pw.print("<tr>");
112: Iterator sit = equivs.iterator();
113: String source = (String) sit.next();
114: String item = anyToLatin.transliterate(source);
115: if (item.equals("") || source.equals(item))
116: item = " ";
117: pw.print("<td>" + item + "</td>");
118: for (int i = 1; i < indicScripts.length; ++i) {
119: sit = equivs.iterator();
120: item = "";
121: while (sit.hasNext()) {
122: String trial = (String) sit.next();
123: if (!sets[i].containsAll(trial))
124: continue;
125: item = trial;
126: break;
127: }
128: String classString = "";
129: if (item.equals("")) {
130: classString = " class='miss'";
131: String temp = fallbacks[i].transliterate(source);
132: if (!temp.equals("") && !temp.equals(source))
133: item = temp;
134: }
135: String backup = item.equals("") ? " " : item;
136: pw
137: .print("<td" + classString + " title='"
138: + getName(item, "; ") + "'>" + backup
139: + "<br><tt>" + Utility.hex(item)
140: + "</tt></td>");
141: }
142: /*
143: Iterator sit = equivs.iterator();
144: while (sit.hasNext()) {
145: String item = (String)sit.next();
146: pw.print("<td>" + item + "</td>");
147: }
148: */
149: pw.println("</tr>");
150: }
151: pw.println("</table>");
152: if (true) {
153: pw.println("<h2>Failed Normalization</h2>");
154:
155: UnicodeSetIterator it = new UnicodeSetIterator(failNorm);
156: UnicodeSet pieces = new UnicodeSet();
157: while (it.next()) {
158: String s = UTF16.valueOf(it.codepoint);
159: String d = Normalizer.normalize(s, Normalizer.NFD, 0);
160: pw.println("Norm:" + s + ", " + Utility.hex(s) + " "
161: + UCharacter.getName(it.codepoint) + "; " + d
162: + ", " + Utility.hex(d) + ", ");
163: pw.println(UCharacter.getName(d.charAt(1)) + "<br>");
164: if (UCharacter.getName(d.charAt(1)).indexOf("LENGTH") >= 0)
165: pieces.add(d.charAt(1));
166: }
167: pw.println(pieces);
168:
169: pw.println("<h2>Failed Round-Trip</h2>");
170: Iterator cit = latinFail.iterator();
171: while (cit.hasNext()) {
172: pw.println(cit.next() + "<br>");
173: }
174: }
175:
176: pw.println("</table></body></html>");
177: pw.close();
178: System.out.println("Done");
179: }
180:
181: public static String fix(String s) {
182: if (s.equals("\u0946\u093E"))
183: return "\u094A";
184: if (s.equals("\u0C46\u0C3E"))
185: return "\u0C4A";
186: if (s.equals("\u0CC6\u0CBE"))
187: return "\u0CCA";
188:
189: if (s.equals("\u0947\u093E"))
190: return "\u094B";
191: if (s.equals("\u0A47\u0A3E"))
192: return "\u0A4B";
193: if (s.equals("\u0AC7\u0ABE"))
194: return "\u0ACB";
195: if (s.equals("\u0C47\u0C3E"))
196: return "\u0C4B";
197: if (s.equals("\u0CC7\u0CBE"))
198: return "\u0CCB";
199:
200: //return Normalizer.normalize(s,Normalizer.NFD,0);
201: return s;
202: }
203:
204: public static PrintWriter openPrintWriter(String fileName)
205: throws IOException {
206: File lf = new File(fileName);
207: System.out.println("Creating file: " + lf.getAbsoluteFile());
208:
209: return new PrintWriter(new BufferedWriter(
210: new OutputStreamWriter(new FileOutputStream(fileName),
211: "UTF8"), 4 * 1024));
212: }
213:
214: public static String getName(String s, String separator) {
215: int cp;
216: StringBuffer sb = new StringBuffer();
217: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
218: cp = UTF16.charAt(s, i);
219: if (i != 0)
220: sb.append(separator);
221: sb.append(UCharacter.getName(cp));
222: }
223: return sb.toString();
224: }
225:
226: static class MyComparator implements Comparator {
227: public int compare(Object o1, Object o2) {
228: Iterator i1 = ((TreeSet) o1).iterator();
229: Iterator i2 = ((TreeSet) o2).iterator();
230: while (i1.hasNext() && i2.hasNext()) {
231: String a = (String) i1.next();
232: String b = (String) i2.next();
233: int result = a.compareTo(b);
234: if (result != 0)
235: return result;
236: }
237: if (i1.hasNext())
238: return 1;
239: if (i2.hasNext())
240: return -1;
241: return 0;
242: }
243:
244: }
245:
246: static class ReverseComparator implements Comparator {
247: public int compare(Object o1, Object o2) {
248: String a = o1.toString();
249: char a1 = a.charAt(0);
250: String b = o2.toString();
251: char b1 = b.charAt(0);
252: if (a1 < 0x900 && b1 > 0x900)
253: return -1;
254: if (a1 > 0x900 && b1 < 0x900)
255: return +1;
256: return a.compareTo(b);
257: }
258: }
259:
260: static class EquivClass {
261: EquivClass(Comparator c) {
262: comparator = c;
263: }
264:
265: private HashMap itemToSet = new HashMap();
266: private Comparator comparator;
267:
268: void add(Object a, Object b) {
269: Set sa = (Set) itemToSet.get(a);
270: Set sb = (Set) itemToSet.get(b);
271: if (sa == null && sb == null) { // new set!
272: Set s = new TreeSet(comparator);
273: s.add(a);
274: s.add(b);
275: itemToSet.put(a, s);
276: itemToSet.put(b, s);
277: } else if (sa == null) {
278: sb.add(a);
279: } else if (sb == null) {
280: sa.add(b);
281: } else { // merge sets, dumping sb
282: sa.addAll(sb);
283: Iterator it = sb.iterator();
284: while (it.hasNext()) {
285: itemToSet.put(it.next(), sa);
286: }
287: }
288: }
289:
290: private class MyIterator implements Iterator {
291: private Iterator it;
292:
293: MyIterator(Comparator comp) {
294: TreeSet values = new TreeSet(comp);
295: values.addAll(itemToSet.values());
296: it = values.iterator();
297: }
298:
299: public boolean hasNext() {
300: return it.hasNext();
301: }
302:
303: public Object next() {
304: return it.next();
305: }
306:
307: public void remove() {
308: throw new IllegalArgumentException("can't remove");
309: }
310: }
311:
312: public Iterator getSetIterator(Comparator comp) {
313: return new MyIterator(comp);
314: }
315:
316: }
317: }
|