Source Code Cross Referenced for UnicodeSetCloseOver.java in » Internationalization-Localization » icu4j » com » ibm » icu » dev » tool » translit » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.dev.tool.translit
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         **********************************************************************
003:         * Copyright (c) 2003, International Business Machines
004:         * Corporation and others.  All Rights Reserved.
005:         **********************************************************************
006:         * Author: Alan Liu
007:         * Created: February 11 2003
008:         * Since: ICU 2.6
009:         **********************************************************************
010:         */
011:        package com.ibm.icu.dev.tool.translit;
012:
013:        import com.ibm.icu.lang.UCharacter;
014:        import com.ibm.icu.text.*;
015:        import com.ibm.icu.impl.Utility;
016:        import java.util.Collections;
017:        import java.util.Comparator;
018:        import java.util.HashMap;
019:        import java.util.Iterator;
020:        import java.util.Map;
021:        import java.util.Set;
022:        import java.util.TreeSet;
023:        import java.util.Vector;
024:        import java.util.Locale;
025:        import java.io.*;
026:
027:        /**
028:         * This class produces the data tables used by the closeOver() method
029:         * of UnicodeSet.
030:         *
031:         * Whenever the Unicode database changes, this tool must be re-run
032:         * (AFTER the data file(s) underlying ICU4J are udpated).
033:         *
034:         * The output of this tool should then be pasted into the appropriate
035:         * files:
036:         *
037:         * ICU4J: com.ibm.icu.text.UnicodeSet.java
038:         * ICU4C: /icu/source/common/uniset.cpp
039:         */
040:        class UnicodeSetCloseOver {
041:
042:            // Our output files
043:            static final String JAVA_OUT = "to_UnicodeSet.java";
044:            static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
045:            static final String C_SET_OUT = "to_uniset.cpp";
046:            static final String C_UCHAR_OUT = "to_uchar.c";
047:
048:            // Source code "do not edit" warning
049:            static final String WARNING = "MACHINE-GENERATED; Unicode version "
050:                    + UCharacter.getUnicodeVersion() + "; DO NOT EDIT; See "
051:                    + UnicodeSetCloseOver.class.getName();
052:
053:            // Case folding options flag.  This must correspond to the options
054:            // used in UnicodeSet.closeOver() in Java and C++.
055:            static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
056:
057:            public static void main(String[] args) throws IOException {
058:                System.out
059:                        .println("This tool will generate several output files.  Each is named according");
060:                System.out
061:                        .println("the target file.  For example, the contents of to_UnicodeSet.java should");
062:                System.out.println("be pasted into UnicodeSet.java.");
063:                System.out.println();
064:
065:                generateCaseData();
066:            }
067:
068:            /**
069:             * Create a map of String => Set.  The String in this case is a
070:             * folded string for which
071:             * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
072:             * The Set contains all single-character strings x for which
073:             * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
074:             * well as folded itself.
075:             */
076:            static Map createCaseFoldEquivalencyClasses() {
077:                Map equivClasses = new HashMap();
078:                for (int i = 0; i <= 0x10FFFF; ++i) {
079:                    int cat = UCharacter.getType(i);
080:                    if (cat == Character.UNASSIGNED
081:                            || cat == Character.PRIVATE_USE)
082:                        continue;
083:
084:                    String cp = UTF16.valueOf(i);
085:                    String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
086:                    if (folded.equals(cp))
087:                        continue;
088:
089:                    // At this point, have different case folding.  Add
090:                    // the code point and its folded equivalent into the
091:                    // equivalency class.
092:                    TreeSet s = (TreeSet) equivClasses.get(folded);
093:                    if (s == null) {
094:                        s = new TreeSet();
095:                        s.add(folded); // add the case fold result itself
096:                        equivClasses.put(folded, s);
097:                    }
098:                    s.add(cp);
099:                }
100:                return equivClasses;
101:            }
102:
103:            /**
104:             * Analyze the case fold equivalency classes.  Break them into two
105:             * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
106:             * configurations of the nonpairs.
107:             *
108:             * Length configurations of equivalency classes, as of Unicode
109:             * 3.2.  Most of the classes (83%) have two single codepoints.
110:             * Here "112:28" means there are 28 equivalency classes with 2
111:             * single codepoints and one string of length 2.
112:             *
113:             * 11:656
114:             * 111:16
115:             * 1111:3
116:             * 112:28
117:             * 113:2
118:             * 12:31
119:             * 13:12
120:             * 22:38
121:             *
122:             * Note: This method does not count the frequencies of the
123:             * different length configurations (as shown above after ':'); it
124:             * merely records which configurations occur.
125:             *
126:             * @param pairs Accumulate equivalency classes that consist of
127:             * exactly two codepoints here.  This is 83+% of the classes.
128:             * E.g., {"a", "A"}.
129:             * @param nonpairs Accumulate other equivalency classes here, as
130:             * lists of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
131:             * @param lengths Accumulate a list of unique length structures,
132:             * not including pairs.  Each length structure is represented by a
133:             * string of digits.  The digit string "12" means the equivalency
134:             * class contains a single code point and a string of length 2.
135:             * Typical contents of 'lengths': { "111", "1111", "112",
136:             * "113", "12", "13", "22" }.  Note the absence of "11".
137:             */
138:            static void analyzeCaseData(Map equivClasses, StringBuffer pairs,
139:                    Vector nonpairs, Vector lengths) {
140:                Iterator i = new TreeSet(equivClasses.keySet()).iterator();
141:                StringBuffer buf = new StringBuffer();
142:                while (i.hasNext()) {
143:                    Object key = i.next();
144:                    Vector v = new Vector((Set) equivClasses.get(key));
145:                    if (v.size() == 2) {
146:                        String a = (String) v.elementAt(0);
147:                        String b = (String) v.elementAt(1);
148:                        if (a.length() == 1 && b.length() == 1) {
149:                            pairs.append(a).append(b);
150:                            continue;
151:                            // Note that pairs are included in 'lengths'
152:                        }
153:                    }
154:                    String[] a = new String[v.size()];
155:                    v.toArray(a);
156:                    nonpairs.add(a);
157:                    //int singleCount = 0;
158:                    //int stringCount = 0;
159:                    // Make a string of the lengths, e.g., "111" means 3
160:                    // single code points; "13" means a single code point
161:                    // and a string of length 3.
162:                    v.clear();
163:                    for (int j = 0; j < a.length; ++j) {
164:                        v.add(new Integer(a[j].length()));
165:                    }
166:                    Collections.sort(v);
167:                    buf.setLength(0);
168:                    for (int j = 0; j < v.size(); ++j) {
169:                        buf.append(String.valueOf(v.elementAt(j)));
170:                    }
171:                    if (!lengths.contains(buf.toString())) {
172:                        lengths.add(buf.toString());
173:                    }
174:                }
175:            }
176:
177:            static void generateCaseData() throws IOException {
178:
179:                Map equivClasses = createCaseFoldEquivalencyClasses();
180:
181:                // Accumulate equivalency classes that consist of exactly
182:                // two codepoints here.  This is 83+% of the classes.
183:                // E.g., {"a", "A"}.
184:                StringBuffer pairs = new StringBuffer();
185:
186:                // Accumulate other equivalency classes here, as lists
187:                // of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
188:                Vector nonpairs = new Vector(); // contains String[]
189:                Vector lengths = new Vector(); // "111", "12", "22", etc.
190:
191:                analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
192:
193:                //-------------------------------------------------------------
194:                // Emit Java source
195:                PrintStream out = new PrintStream(
196:                        new FileOutputStream(JAVA_OUT));
197:                System.out.println("Writing " + JAVA_OUT);
198:
199:                out.println("    // " + WARNING);
200:                out.println("    private static final String CASE_PAIRS =");
201:                out.println(Utility.formatForSource(pairs.toString()) + ";");
202:                out.println();
203:                out.println("    // " + WARNING);
204:                out
205:                        .println("    private static final String[][] CASE_NONPAIRS = {");
206:                for (int j = 0; j < nonpairs.size(); ++j) {
207:                    String[] a = (String[]) nonpairs.elementAt(j);
208:                    out.print("        {");
209:                    for (int k = 0; k < a.length; ++k) {
210:                        if (k != 0)
211:                            out.print(", ");
212:                        out.print(Utility.format1ForSource(a[k]));
213:                    }
214:                    out.println("},");
215:                }
216:                out.println("    };");
217:
218:                //-------------------------------------------------------------
219:                // Emit C++ source
220:
221:                // In C++, the pairs are again emitted in an array, but this
222:                // array is the final representation form -- it will not be
223:                // reprocessed into a hash.  It will be binary searched by
224:                // looking at the even elements [0], [2], [4], etc., and
225:                // ignoring the odd elements.  The even elements must contain
226:                // the folded members of the pairs.  That is, in the pair
227:                // {'A', 'a'}, the even element must be 'a', not 'A'.  Then a
228:                // code point to be located is first folded ('Y' => 'y') then
229:                // it binary searched against [0]='A', [2]='B', etc.  When a
230:                // match is found at k, the pair is [k], [k+1].
231:
232:                out = new PrintStream(new FileOutputStream(C_SET_OUT));
233:                System.out.println("Writing " + C_SET_OUT);
234:
235:                // Sort the pairs.  They must be ordered by the folded element.
236:                // Store these as two-character strings, with charAt(0) being
237:                // the folded member of the pair.
238:                TreeSet sortPairs = new TreeSet(new Comparator() {
239:                    public int compare(Object a, Object b) {
240:                        return ((int) ((String) a).charAt(0))
241:                                - ((int) ((String) b).charAt(0));
242:                    }
243:
244:                    public boolean equals(Object obj) {
245:                        return false;
246:                    }
247:                });
248:                for (int i = 0; i < pairs.length(); i += 2) {
249:                    String a = String.valueOf(pairs.charAt(i));
250:                    String b = String.valueOf(pairs.charAt(i + 1));
251:                    String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
252:                    if (a.equals(folded)) {
253:                        sortPairs.add(a + b);
254:                    } else {
255:                        sortPairs.add(b + a);
256:                    }
257:                }
258:
259:                // Emit the pairs
260:                out.println("// " + WARNING);
261:                out.println("static const UChar CASE_PAIRS[] = {");
262:                Iterator it = sortPairs.iterator();
263:                while (it.hasNext()) {
264:                    out.print("    ");
265:                    int n = 0;
266:                    while (n++ < 5 && it.hasNext()) {
267:                        String s = (String) it.next();
268:                        //out.print((int) s.charAt(0) + "," +
269:                        //                 (int) s.charAt(1) + ",");
270:                        out.print("0x" + Utility.hex(s.charAt(0)) + ",0x"
271:                                + Utility.hex(s.charAt(1)) + ",");
272:                    }
273:                    out.println();
274:                }
275:                out.println("};");
276:                out.println();
277:
278:                // The non-pairs are encoded in the following way.  All the
279:                // single codepoints in each class are grouped together
280:                // followed by a zero.  Then each multi-character string is
281:                // added, followed by a zero.  Finally, another zero is added.
282:                // Some examples:
283:                //  {"iQ", "R"}           =>  [ 'R', 0, 'i', 'Q', 0, 0 ]
284:                //  {"S", "D", "F", "G"}  =>  [ 'S', 'D', 'F', 'G', 0, 0 ]
285:                //  {"jW", "jY"}          =>  [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
286:                // The end-result is a short, flat array of UChar values that
287:                // can be used to initialize a UChar[] array in C.
288:
289:                int maxLen = 0; // Maximum encoded length of any class, including zeros
290:                out.println("// " + WARNING);
291:                out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
292:                for (int j = 0; j < nonpairs.size(); ++j) {
293:                    int len = 0;
294:                    String[] a = (String[]) nonpairs.elementAt(j);
295:                    out.print("    {");
296:                    // Emit single code points
297:                    for (int k = 0; k < a.length; ++k) {
298:                        if (a[k].length() != 1)
299:                            continue;
300:                        //out.print((int) a[k].charAt(0) + ",");
301:                        out.print("0x" + Utility.hex(a[k].charAt(0)) + ",");
302:                        ++len;
303:                    }
304:                    out.print("0,  "); // End of single code points
305:                    ++len;
306:                    // Emit multi-character strings
307:                    for (int k = 0; k < a.length; ++k) {
308:                        if (a[k].length() == 1)
309:                            continue;
310:                        for (int m = 0; m < a[k].length(); ++m) {
311:                            //out.print((int) a[k].charAt(m) + ",");
312:                            out.print("0x" + Utility.hex(a[k].charAt(m)) + ",");
313:                            ++len;
314:                        }
315:                        out.print("0, "); // End of string
316:                        ++len;
317:                    }
318:                    out.println("0},"); // End of equivalency class
319:                    ++len;
320:                    if (len > maxLen)
321:                        maxLen = len;
322:                }
323:                out.println("};");
324:
325:                // Make sure the CaseEquivClass data can fit.
326:                if (maxLen > 8) {
327:                    throw new RuntimeException(
328:                            "Must adjust CaseEquivClass to accomodate "
329:                                    + maxLen + " UChars");
330:                }
331:
332:                // Also make sure that we can map into this array using a
333:                // CompactByteArray.  We could do this check above, but we
334:                // keep it here, adjacent to the maxLen check.  We use one
335:                // value (-1 == 255) to indicate "no value."
336:                if (nonpairs.size() > 255) {
337:                    throw new RuntimeException(
338:                            "Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
339:                }
340:
341:                //-------------------------------------------------------------
342:                // Case-unique set:  All characters c for which closeOver(c)==c.
343:
344:                // UPDATE: Instead of using this, we're using the related
345:                // notion of Case_Sensitive.  See below.  Note that
346:                // Case_Sensitive != ^Case_Unique.
347:
348:                if (false) {
349:                    UnicodeSet caseUnique = new UnicodeSet();
350:                    for (int i = 0; i <= 0x10FFFF; ++i) {
351:                        String cp = UTF16.valueOf(i);
352:                        if (equivClasses.get(UCharacter.foldCase(cp,
353:                                DEFAULT_CASE_MAP)) == null) {
354:                            caseUnique.add(i);
355:                        }
356:                    }
357:                    // out.println("caseUnique = " + caseUnique.toPattern(true));
358:                }
359:
360:                UnicodeSet caseSensitive = getCaseSensitive();
361:                //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
362:
363:                // Now for C, emit an array of ranges
364:                out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
365:                System.out.println("Writing " + C_UCHAR_OUT);
366:
367:                out.println("/* " + WARNING + " */");
368:                emitUCharRangesArray(out, caseSensitive,
369:                        "CASE_SENSITIVE_RANGES");
370:
371:                // For Java, emit a string with the ranges (each pair of chars
372:                // in the string is a range).
373:                out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
374:                System.out.println("Writing " + JAVA_CHARPROP_OUT);
375:                out.println("    // " + WARNING);
376:                emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
377:            }
378:
379:            /**
380:             * Create the set of case-sensitive characters.  These are characters
381:             * that participate in any case mapping operation as a source or
382:             * as a member of a target string.
383:             */
384:            static UnicodeSet getCaseSensitive() {
385:                UnicodeSet caseSensitive = new UnicodeSet();
386:                Locale loc = Locale.US;
387:                BreakIterator bi = BreakIterator.getTitleInstance(loc);
388:                for (int c = 0; c <= 0x10FFFF; ++c) {
389:                    String cp = UTF16.valueOf(c);
390:                    for (int j = 0; j < 4; ++j) {
391:                        String s = null;
392:                        switch (j) {
393:                        case 0:
394:                            s = UCharacter.toUpperCase(loc, cp);
395:                            break;
396:                        case 1:
397:                            s = UCharacter.toLowerCase(loc, cp);
398:                            break;
399:                        case 2:
400:                            s = UCharacter.toTitleCase(loc, cp, bi);
401:                            break;
402:                        case 3:
403:                            s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
404:                            break;
405:                        }
406:                        if (!s.equals(cp)) {
407:                            int cc;
408:                            for (int k = 0; k < s.length(); k += UTF16
409:                                    .getCharCount(cc)) {
410:                                cc = UTF16.charAt(s, k);
411:                                caseSensitive.add(cc);
412:                            }
413:                            for (int k = 0; k < cp.length(); k += UTF16
414:                                    .getCharCount(cc)) {
415:                                cc = UTF16.charAt(cp, k);
416:                                caseSensitive.add(cc);
417:                            }
418:                        }
419:                    }
420:                    // Also do the single-codepoint API.  This shouldn't add any
421:                    // code points, but we do it for completeness.
422:                    for (int j = 0; j < 4; ++j) {
423:                        int d = 0;
424:                        switch (j) {
425:                        case 0:
426:                            d = UCharacter.toUpperCase(c);
427:                            break;
428:                        case 1:
429:                            d = UCharacter.toLowerCase(c);
430:                            break;
431:                        case 2:
432:                            d = UCharacter.toTitleCase(c);
433:                            break;
434:                        case 3:
435:                            d = UCharacter.foldCase(c, DEFAULT_CASE_MAP);
436:                            break;
437:                        }
438:                        if (d != c) {
439:                            if (!caseSensitive.contains(c)
440:                                    || !caseSensitive.contains(d)) {
441:                                System.out.println("Warning: code point " + c
442:                                        + " => " + d + " created NEW MAPPING"
443:                                        + " for Case_Sensitive");
444:                            }
445:                            caseSensitive.add(c);
446:                            caseSensitive.add(d);
447:                        }
448:                    }
449:                }
450:                return caseSensitive;
451:            }
452:
453:            /**
454:             * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
455:             * pair will be the start/end of a range.  Code points >= U+10000
456:             * will be represented as surrogate pairs.
457:             */
458:            static void emitUCharRangesArray(PrintStream out, UnicodeSet set,
459:                    String id) {
460:                // Store the pairs in a StringBuffer.  This handles surrogate
461:                // representation.
462:                StringBuffer buf = new StringBuffer();
463:                for (int i = 0; i < set.getRangeCount(); ++i) {
464:                    UTF16.append(buf, set.getRangeStart(i));
465:                    UTF16.append(buf, set.getRangeEnd(i));
466:                }
467:                // Emit the pairs
468:                out.println("static const UChar " + id + "[] = {");
469:                for (int i = 0; i < buf.length();) {
470:                    out.print("    ");
471:                    for (int n = 0; n++ < 10 && i < buf.length(); ++i) {
472:                        out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
473:                    }
474:                    out.println();
475:                }
476:                out.println("};");
477:                out.println("#define " + id + "_LENGTH (sizeof(" + id
478:                        + ")/sizeof(" + id + "[0]))");
479:            }
480:
481:            /**
482:             * Given a UnicodeSet, emit it as a Java string.  The most economical
483:             * format is not the pattern, but instead a pairs list, with each
484:             * range pair represented as two adjacent characters.
485:             */
486:            static void emitRangesString(PrintStream out, UnicodeSet set,
487:                    String id) {
488:                // Store the pairs in a StringBuffer.  This handles surrogate
489:                // representation.
490:                StringBuffer buf = new StringBuffer();
491:                for (int i = 0; i < set.getRangeCount(); ++i) {
492:                    UTF16.append(buf, set.getRangeStart(i));
493:                    UTF16.append(buf, set.getRangeEnd(i));
494:                }
495:                // Emit the pairs
496:                out.println("    private static final String " + id + " =");
497:                out.println(Utility.formatForSource(buf.toString()) + ";");
498:            }
499:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.