001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.dev.test.translit;
009:
010: import com.ibm.icu.lang.*;
011: import com.ibm.icu.text.*;
012: import java.util.*;
013: import java.io.*;
014:
015: public class WriteCharts {
016: public static void main(String[] args) throws IOException {
017: if (false) {
018: printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
019: }
020: String testSet = "";
021: if (args.length == 0)
022: args = getAllScripts();
023: for (int i = 0; i < args.length; ++i) {
024: // Enumeration enum = Transliterator.getAvailableIDs();
025: if (args[i].startsWith("[")) {
026: testSet = args[i];
027: } else {
028: print(testSet, args[i]);
029: testSet = "";
030: }
031: }
032: }
033:
034: public static void printSet(String source) {
035: UnicodeSet s = new UnicodeSet(source);
036: System.out.println("Printout for '" + source + "'");
037: int count = s.getRangeCount();
038: for (int i = 0; i < count; ++i) {
039: int start = s.getRangeStart(i);
040: int end = s.getRangeEnd(i);
041: System.out.println(Integer.toString(start, 16) + ".."
042: + Integer.toString(end, 16));
043: }
044: }
045:
046: public static String[] getAllScripts() {
047: Set set = new TreeSet();
048: int scripts[];
049: Enumeration sources = Transliterator.getAvailableSources();
050: while (sources.hasMoreElements()) {
051: String source = (String) sources.nextElement();
052: scripts = UScript.getCode(source);
053: if (scripts == null) {
054: System.out.println("[Skipping " + source + "]");
055: continue;
056: }
057: int sourceScript = scripts[0];
058: System.out.println("Source: " + source + ";\tScripts: "
059: + showScripts(scripts));
060: Enumeration targets = Transliterator
061: .getAvailableTargets(source);
062: while (targets.hasMoreElements()) {
063: String target = (String) targets.nextElement();
064: scripts = UScript.getCode(target);
065: if (scripts == null
066: || priority(scripts[0]) < priority(sourceScript)) {
067: // skip doing both directions
068: System.out.println("[Skipping '" + source + "-"
069: + target + "']");
070: continue;
071: }
072: System.out.println("\tTarget: " + target
073: + ";\tScripts: " + showScripts(scripts));
074: Enumeration variants = Transliterator
075: .getAvailableVariants(source, target);
076: while (variants.hasMoreElements()) {
077: String variant = (String) variants.nextElement();
078: String id = source + "-" + target;
079: if (variant.length() != 0) {
080: id += "/" + variant;
081: if (false) {
082: System.out
083: .println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t"
084: + id);
085: continue;
086: }
087: }
088: System.out.println("\t\t\t\tAdding: '" + id + "'");
089: set.add(id);
090: }
091: }
092: }
093: String[] results = new String[set.size()];
094: set.toArray(results);
095: return results;
096: }
097:
098: static public int priority(int script) {
099: if (script == UScript.LATIN)
100: return -2;
101: return script;
102: }
103:
104: public static String showScripts(int[] scripts) {
105: StringBuffer results = new StringBuffer();
106: for (int i = 0; i < scripts.length; ++i) {
107: if (i != 0)
108: results.append(", ");
109: results.append(UScript.getName(scripts[i]));
110: }
111: return results.toString();
112: }
113:
114: public static void print(String testSet, String rawId)
115: throws IOException {
116: System.out.println("Processing " + rawId);
117: Transliterator t = Transliterator.getInstance(rawId);
118: String id = t.getID();
119:
120: // clean up IDs. Ought to be API for getting source, target, variant
121: int minusPos = id.indexOf('-');
122: String source = id.substring(0, minusPos);
123: String target = id.substring(minusPos + 1);
124: int slashPos = target.indexOf('/');
125: if (slashPos >= 0)
126: target = target.substring(0, slashPos);
127:
128: // check that the source is a script
129: if (testSet.equals("")) {
130: int[] scripts = UScript.getCode(source);
131: if (scripts == null) {
132: System.out.println("FAILED: "
133: + Transliterator.getDisplayName(id)
134: + " does not have a script as the source");
135: return;
136: } else {
137: testSet = "[:" + source + ":]";
138: if (source.equalsIgnoreCase("katakana")) {
139: testSet = "[" + testSet + "\u30FC]";
140: printSet(testSet);
141: }
142: }
143: }
144: UnicodeSet sourceSet = new UnicodeSet(testSet);
145:
146: // check that the target is a script
147: int[] scripts = UScript.getCode(target);
148: if (scripts == null) {
149: target = "[:Latin:]";
150: } else {
151: target = "[:" + target + ":]";
152: }
153: UnicodeSet targetSet = new UnicodeSet(target);
154:
155: Transliterator inverse = t.getInverse();
156:
157: //Transliterator hex = Transliterator.getInstance("Any-Hex");
158:
159: // iterate through script
160: System.out.println("Transliterating "
161: + sourceSet.toPattern(true) + " with "
162: + Transliterator.getDisplayName(id));
163:
164: UnicodeSet leftOverSet = new UnicodeSet(targetSet);
165: UnicodeSet privateUse = new UnicodeSet("[:private use:]");
166:
167: Map map = new TreeMap();
168:
169: UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
170: targetSetPlusAnyways.addAll(okAnyway);
171:
172: UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
173: sourceSetPlusAnyways.addAll(okAnyway);
174:
175: UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
176:
177: while (usi.next()) {
178: int j = usi.codepoint;
179: /*
180: int count = sourceSet.getRangeCount();
181: for (int i = 0; i < count; ++i) {
182: int end = sourceSet.getRangeEnd(i);
183: for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {
184: */
185: // String flag = "";
186: String ss = UTF16.valueOf(j);
187: String ts = t.transliterate(ss);
188: char group = 0;
189: if (!targetSetPlusAnyways.containsAll(ts)) {
190: group |= 1;
191: }
192: if (UTF16.countCodePoint(ts) == 1) {
193: leftOverSet.remove(UTF16.charAt(ts, 0));
194: }
195: String rt = inverse.transliterate(ts);
196: if (!sourceSetPlusAnyways.containsAll(rt)) {
197: group |= 2;
198: } else if (!ss.equals(rt)) {
199: group |= 4;
200: }
201:
202: if (!privateUse.containsNone(ts)
203: || !privateUse.containsNone(rt)) {
204: group |= 16;
205: }
206:
207: map.put(group
208: + UCharacter.toLowerCase(Normalizer.normalize(ss,
209: Normalizer.NFKD)) + "\u0000" + ss,
210: "<td class='s'>" + ss + "<br><tt>" + hex(ss)
211: + "</tt></td><td class='t'>" + ts
212: + "<br><tt>" + hex(ts)
213: + "</tt></td><td class='r'>" + rt
214: + "<br><tt>" + hex(rt) + "</tt></td>");
215:
216: // Check Duals
217: /*
218: int maxDual = 200;
219: dual:
220: for (int i2 = 0; i2 < count; ++i2) {
221: int end2 = sourceSet.getRangeEnd(i2);
222: for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
223: String ss2 = UTF16.valueOf(j2);
224: String ts2 = t.transliterate(ss2);
225: String rt2 = inverse.transliterate(ts2);
226:
227: String ss12 = ss + ss2;
228: String ts12 = t.transliterate(ss + ss12);
229: String rt12 = inverse.transliterate(ts12);
230: if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
231: if (--maxDual < 0) break dual;
232:
233: // transliteration of whole differs from that of parts
234: group = 0x100;
235: map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
236: + "\u0000" + ss12,
237: "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
238: + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
239: + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
240: }
241: }
242: */
243: //}
244: }
245:
246: leftOverSet.remove(0x0100, 0x02FF); // remove extended & IPA
247:
248: /*int count = leftOverSet.getRangeCount();
249: for (int i = 0; i < count; ++i) {
250: int end = leftOverSet.getRangeEnd(i);
251: for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {
252: */
253:
254: usi.reset(leftOverSet);
255: while (usi.next()) {
256: int j = usi.codepoint;
257:
258: String ts = UTF16.valueOf(j);
259: // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);
260: // if (!decomp.equals(ts)) continue;
261:
262: String rt = inverse.transliterate(ts);
263: // String flag = "";
264: char group = 0x80;
265:
266: if (!sourceSetPlusAnyways.containsAll(rt)) {
267: group |= 8;
268: }
269: if (!privateUse.containsNone(rt)) {
270: group |= 16;
271: }
272:
273: map.put(group
274: + UCharacter.toLowerCase(Normalizer.normalize(ts,
275: Normalizer.NFKD)) + ts,
276: "<td class='s'>-</td><td class='t'>" + ts
277: + "<br><tt>" + hex(ts)
278: + "</tt></td><td class='r'>" + rt
279: + "<br><tt>" + hex(rt) + "</tt></td>");
280: //}
281: }
282:
283: // make file name and open
284: File f = new File("transliteration/chart_"
285: + id.replace('/', '_') + ".html");
286: String filename = f.getCanonicalFile().toString();
287: PrintWriter out = new PrintWriter(new OutputStreamWriter(
288: new FileOutputStream(filename), "UTF-8"));
289: //out.print('\uFEFF'); // BOM
290:
291: System.out.println("Writing " + filename);
292:
293: try {
294: out
295: .println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
296: out.println("<HTML><HEAD>");
297: out
298: .println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
299: out
300: .println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
301:
302: out.println("<BODY>");
303: out.println("<h1>Transliteration Samples for '"
304: + Transliterator.getDisplayName(id) + "'</h1>");
305: out
306: .println("<p>This file illustrates the transliterations of "
307: + Transliterator.getDisplayName(id) + ".");
308: out
309: .println("The samples are mechanically generated, and only include single characters");
310: out
311: .println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
312: out
313: .println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
314: out
315: .println("<a href='http://www.ibm.com/software/globalization/icu/demo/transform'>Transliteration Demo</a></p><hr>");
316:
317: // set up the headers
318: int columnCount = 3;
319: String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
320: String headers = headerBase;
321: for (int i = columnCount - 1; i > 0; --i) {
322: if (i != columnCount - 1)
323: headers += "<th> </th>";
324: headers += headerBase;
325: }
326:
327: String tableHeader = "<p><table border='1'><tr>" + headers
328: + "</tr>";
329: String tableFooter = "</table></p>";
330: out.println("<h2>Round Trip</h2>");
331: out.println(tableHeader);
332:
333: Iterator it = map.keySet().iterator();
334: char lastGroup = 0;
335: int count = 0;
336: int column = 0;
337: while (it.hasNext()) {
338: String key = (String) it.next();
339: char group = key.charAt(0);
340: if (group != lastGroup || count++ > 50) {
341: lastGroup = group;
342: count = 0;
343: if (column != 0) {
344: out.println("</tr>");
345: column = 0;
346: }
347: out.println(tableFooter);
348:
349: // String title = "";
350: if ((group & 0x100) != 0)
351: out.println("<hr><h2>Duals</h2>");
352: else if ((group & 0x80) != 0)
353: out.println("<hr><h2>Completeness</h2>");
354: else
355: out.println("<hr><h2>Round Trip</h2>");
356: if ((group & 16) != 0)
357: out
358: .println("<h3>Errors: Contains Private Use Characters</h3>");
359: if ((group & 8) != 0)
360: out
361: .println("<h3>Possible Errors: Return not in Source Set</h3>");
362: if ((group & 4) != 0)
363: out
364: .println("<h3>One-Way Mapping: Return not equal to Source</h3>");
365: if ((group & 2) != 0)
366: out
367: .println("<h3>Errors: Return not in Source Set</h3>");
368: if ((group & 1) != 0)
369: out
370: .println("<h3>Errors: Target not in Target Set</h3>");
371:
372: out.println(tableHeader);
373: column = 0;
374: }
375: String value = (String) map.get(key);
376: if (column++ == 0)
377: out.print("<tr>");
378: else
379: out.print("<th> </th>");
380: out.println(value);
381: if (column == 3) {
382: out.println("</tr>");
383: column = 0;
384: }
385: }
386: if (column != 0) {
387: out.println("</tr>");
388: column = 0;
389: }
390: out.println(tableFooter + "</BODY></HTML>");
391:
392: } finally {
393: out.close();
394: }
395: }
396:
397: public static String hex(String s) {
398: int cp;
399: StringBuffer results = new StringBuffer();
400: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
401: cp = UTF16.charAt(s, i);
402: if (i != 0)
403: results.append(' ');
404: results.append(Integer.toHexString(cp));
405: }
406: return results.toString().toUpperCase();
407: }
408:
409: static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
410:
411: /*
412: // tests whether a string is in a set. Also checks for Common and Inherited
413: public static boolean isIn(String s, UnicodeSet set) {
414: int cp;
415: for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
416: cp = UTF16.charAt(s, i);
417: if (set.contains(cp)) continue;
418: if (okAnyway.contains(cp)) continue;
419: return false;
420: }
421: return true;
422: }
423: */
424:
425: }
|