Source Code Cross Referenced for NormalizerBuilder.java in » Internationalization-Localization » icu4j » com » ibm » icu » dev » test » normalizer » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.dev.test.normalizer
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /**
002:         * Builds the normalization tables. This is a separate class so that it
003:         * can be unloaded once not needed.<br>
004:         * Copyright (C) 1998-2004 International Business Machines Corporation and
005:         * Unicode, Inc. All Rights Reserved.<br>
006:         * The Unicode Consortium makes no expressed or implied warranty of any
007:         * kind, and assumes no liability for errors or omissions.
008:         * No liability is assumed for incidental and consequential damages
009:         * in connection with or arising out of the use of the information here.
010:         * @author Mark Davis
011:         * Updates for supplementary code points:
012:         * Vladimir Weinstein & Markus Scherer
013:         */package com.ibm.icu.dev.test.normalizer;
014:
015:        import java.io.BufferedReader;
016:        import java.util.BitSet;
017:
018:        import com.ibm.icu.dev.test.TestUtil;
019:        import com.ibm.icu.dev.test.UTF16Util;
020:
021:        class NormalizerBuilder {
022:            static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
023:
024:            /**
025:             * Testing flags
026:             */
027:
028:            private static final boolean DEBUG = false;
029:            private static final boolean GENERATING = false;
030:
031:            /**
032:             * Constants for the data file version to use.
033:             */
034:            /*static final boolean NEW_VERSION = true;
035:            private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
036:
037:            static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
038:            static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
039:
040:            public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
041:            public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
042:             */
043:
044:            /**
045:             * Called exactly once by NormalizerData to build the static data
046:             */
047:
048:            static NormalizerData build(boolean fullData) {
049:                try {
050:                    IntHashtable canonicalClass = new IntHashtable(0);
051:                    IntStringHashtable decompose = new IntStringHashtable(null);
052:                    LongHashtable compose = new LongHashtable(
053:                            NormalizerData.NOT_COMPOSITE);
054:                    BitSet isCompatibility = new BitSet();
055:                    BitSet isExcluded = new BitSet();
056:                    if (fullData) {
057:                        //System.out.println("Building Normalizer Data from file.");
058:                        readExclusionList(isExcluded);
059:                        //System.out.println(isExcluded.get(0x00C0));
060:                        buildDecompositionTables(canonicalClass, decompose,
061:                                compose, isCompatibility, isExcluded);
062:                    } else { // for use in Applets
063:                        //System.out.println("Building abridged data.");
064:                        setMinimalDecomp(canonicalClass, decompose, compose,
065:                                isCompatibility, isExcluded);
066:                    }
067:                    return new NormalizerData(canonicalClass, decompose,
068:                            compose, isCompatibility, isExcluded);
069:                } catch (java.io.IOException e) {
070:                    System.err.println("Can't load data file." + e + ", "
071:                            + e.getMessage());
072:                    return null;
073:                }
074:            }
075:
076:            // =============================================================
077:            // Building Decomposition Tables
078:            // =============================================================
079:
080:            /**
081:             * Reads exclusion list and stores the data
082:             */
083:            private static void readExclusionList(BitSet isExcluded)
084:                    throws java.io.IOException {
085:                if (DEBUG)
086:                    System.out.println("Reading Exclusions");
087:
088:                BufferedReader in = TestUtil
089:                        .getDataReader("unicode/CompositionExclusions.txt");
090:
091:                while (true) {
092:                    // read a line, discarding comments and blank lines
093:
094:                    String line = in.readLine();
095:                    if (line == null)
096:                        break;
097:                    int comment = line.indexOf('#'); // strip comments
098:                    if (comment != -1)
099:                        line = line.substring(0, comment);
100:                    if (line.length() == 0)
101:                        continue; // ignore blanks
102:                    if (line.indexOf(' ') != -1) {
103:                        line = line.substring(0, line.indexOf(' '));
104:                    }
105:                    // store -1 in the excluded table for each character hit
106:
107:                    int value = Integer.parseInt(line, 16);
108:                    isExcluded.set(value);
109:                    //System.out.println("Excluding " + hex(value));
110:                }
111:                in.close();
112:                if (DEBUG)
113:                    System.out.println("Done reading Exclusions");
114:            }
115:
116:            /**
117:             * Builds a decomposition table from a UnicodeData file
118:             */
119:            private static void buildDecompositionTables(
120:                    IntHashtable canonicalClass, IntStringHashtable decompose,
121:                    LongHashtable compose, BitSet isCompatibility,
122:                    BitSet isExcluded) throws java.io.IOException {
123:                if (DEBUG)
124:                    System.out.println("Reading Unicode Character Database");
125:                //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
126:                BufferedReader in = null;
127:                try {
128:                    in = TestUtil.getDataReader("unicode/UnicodeData.txt");
129:                } catch (Exception e) {
130:                    System.err.println("Failed to read UnicodeData.txt");
131:                    System.exit(1);
132:                }
133:
134:                int value;
135:                long pair;
136:                int counter = 0;
137:                while (true) {
138:
139:                    // read a line, discarding comments and blank lines
140:
141:                    String line = in.readLine();
142:                    if (line == null)
143:                        break;
144:                    int comment = line.indexOf('#'); // strip comments
145:                    if (comment != -1)
146:                        line = line.substring(0, comment);
147:                    if (line.length() == 0)
148:                        continue;
149:                    if (DEBUG) {
150:                        counter++;
151:                        if ((counter & 0xFF) == 0)
152:                            System.out.println("At: " + line);
153:                    }
154:
155:                    // find the values of the particular fields that we need
156:                    // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
157:
158:                    int start = 0;
159:                    int end = line.indexOf(';'); // code
160:                    value = Integer.parseInt(line.substring(start, end), 16);
161:                    if (true && value == '\u00c0') {
162:                        //System.out.println("debug: " + line);
163:                    }
164:                    end = line.indexOf(';', start = end + 1); // name
165:                    /*String name = line.substring(start,end);*/
166:                    end = line.indexOf(';', start = end + 1); // general category
167:                    end = line.indexOf(';', start = end + 1); // canonical class
168:
169:                    // check consistency: canonical classes must be from 0 to 255
170:
171:                    int cc = Integer.parseInt(line.substring(start, end));
172:                    if (cc != (cc & 0xFF))
173:                        System.err.println("Bad canonical class at: " + line);
174:                    canonicalClass.put(value, cc);
175:                    end = line.indexOf(';', start = end + 1); // BIDI
176:                    end = line.indexOf(';', start = end + 1); // decomp
177:
178:                    // decomp requires more processing.
179:                    // store whether it is canonical or compatibility.
180:                    // store the decomp in one table, and the reverse mapping (from pairs) in another
181:
182:                    if (start != end) {
183:                        String segment = line.substring(start, end);
184:                        boolean compat = segment.charAt(0) == '<';
185:                        if (compat)
186:                            isCompatibility.set(value);
187:                        String decomp = fromHex(segment);
188:
189:                        // a small snippet of code to generate the Applet data
190:
191:                        /*if (GENERATING) {
192:                            if (value < 0xFF) {
193:                                System.out.println(
194:                                    "\"\\u" + hex((char)value) + "\", "
195:                            + "\"\\u" + hex(decomp, "\\u") + "\", "
196:                            + (compat ? "\"K\"," : "\"\",")
197:                            + "// " + name);
198:                    }
199:                }*/
200:
201:                        // check consistency: all canon decomps must be singles or pairs!
202:                        int decompLen = UTF16Util.countCodePoint(decomp);
203:                        if (decompLen < 1 || decompLen > 2 && !compat) {
204:                            System.err.println("Bad decomp at: " + line);
205:                        }
206:                        decompose.put(value, decomp);
207:
208:                        // only compositions are canonical pairs
209:                        // skip if script exclusion
210:
211:                        if (!compat && !isExcluded.get(value)) {
212:                            int first = '\u0000';
213:                            int second = UTF16Util.nextCodePoint(decomp, 0);
214:                            if (decompLen > 1) {
215:                                first = second;
216:                                second = UTF16Util.nextCodePoint(decomp,
217:                                        UTF16Util.codePointLength(first));
218:                            }
219:
220:                            // store composition pair in single integer
221:
222:                            pair = ((long) first << 32) | second;
223:                            if (DEBUG && value == '\u00C0') {
224:                                System.out.println("debug2: " + line);
225:                            }
226:                            compose.put(pair, value);
227:                        } else if (DEBUG) {
228:                            System.out.println("Excluding: " + decomp);
229:                        }
230:                    }
231:                }
232:                in.close();
233:                if (DEBUG)
234:                    System.out
235:                            .println("Done reading Unicode Character Database");
236:
237:                // add algorithmic Hangul decompositions
238:                // this is more compact if done at runtime, but for simplicity we
239:                // do it this way.
240:
241:                if (DEBUG)
242:                    System.out.println("Adding Hangul");
243:
244:                for (int SIndex = 0; SIndex < SCount; ++SIndex) {
245:                    int TIndex = SIndex % TCount;
246:                    char first, second;
247:                    if (TIndex != 0) { // triple
248:                        first = (char) (SBase + SIndex - TIndex);
249:                        second = (char) (TBase + TIndex);
250:                    } else {
251:                        first = (char) (LBase + SIndex / NCount);
252:                        second = (char) (VBase + (SIndex % NCount) / TCount);
253:                    }
254:                    pair = ((long) first << 32) | second;
255:                    value = SIndex + SBase;
256:                    decompose.put(value, String.valueOf(first) + second);
257:                    compose.put(pair, value);
258:                }
259:                if (DEBUG)
260:                    System.out.println("Done adding Hangul");
261:            }
262:
263:            /**
264:             * Hangul composition constants
265:             */
266:            static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161,
267:                    TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28,
268:                    NCount = VCount * TCount, // 588
269:                    SCount = LCount * NCount; // 11172
270:
271:            /**
272:             * For use in an applet: just load a minimal set of data.
273:             */
274:            private static void setMinimalDecomp(IntHashtable canonicalClass,
275:                    IntStringHashtable decompose, LongHashtable compose,
276:                    BitSet isCompatibility, BitSet isExcluded) {
277:                String[] decomposeData = { "\u005E", "\u0020\u0302", "K",
278:                        "\u005F", "\u0020\u0332", "K", "\u0060",
279:                        "\u0020\u0300", "K", "\u00A0", "\u0020", "K", "\u00A8",
280:                        "\u0020\u0308", "K", "\u00AA", "\u0061", "K", "\u00AF",
281:                        "\u0020\u0304", "K", "\u00B2", "\u0032", "K", "\u00B3",
282:                        "\u0033", "K", "\u00B4", "\u0020\u0301", "K", "\u00B5",
283:                        "\u03BC", "K", "\u00B8", "\u0020\u0327", "K", "\u00B9",
284:                        "\u0031", "K", "\u00BA", "\u006F", "K", "\u00BC",
285:                        "\u0031\u2044\u0034", "K", "\u00BD",
286:                        "\u0031\u2044\u0032", "K", "\u00BE",
287:                        "\u0033\u2044\u0034", "K", "\u00C0", "\u0041\u0300",
288:                        "", "\u00C1", "\u0041\u0301", "", "\u00C2",
289:                        "\u0041\u0302", "", "\u00C3", "\u0041\u0303", "",
290:                        "\u00C4", "\u0041\u0308", "", "\u00C5", "\u0041\u030A",
291:                        "", "\u00C7", "\u0043\u0327", "", "\u00C8",
292:                        "\u0045\u0300", "", "\u00C9", "\u0045\u0301", "",
293:                        "\u00CA", "\u0045\u0302", "", "\u00CB", "\u0045\u0308",
294:                        "", "\u00CC", "\u0049\u0300", "", "\u00CD",
295:                        "\u0049\u0301", "", "\u00CE", "\u0049\u0302", "",
296:                        "\u00CF", "\u0049\u0308", "", "\u00D1", "\u004E\u0303",
297:                        "", "\u00D2", "\u004F\u0300", "", "\u00D3",
298:                        "\u004F\u0301", "", "\u00D4", "\u004F\u0302", "",
299:                        "\u00D5", "\u004F\u0303", "", "\u00D6", "\u004F\u0308",
300:                        "", "\u00D9", "\u0055\u0300", "", "\u00DA",
301:                        "\u0055\u0301", "", "\u00DB", "\u0055\u0302", "",
302:                        "\u00DC", "\u0055\u0308", "", "\u00DD", "\u0059\u0301",
303:                        "", "\u00E0", "\u0061\u0300", "", "\u00E1",
304:                        "\u0061\u0301", "", "\u00E2", "\u0061\u0302", "",
305:                        "\u00E3", "\u0061\u0303", "", "\u00E4", "\u0061\u0308",
306:                        "", "\u00E5", "\u0061\u030A", "", "\u00E7",
307:                        "\u0063\u0327", "", "\u00E8", "\u0065\u0300", "",
308:                        "\u00E9", "\u0065\u0301", "", "\u00EA", "\u0065\u0302",
309:                        "", "\u00EB", "\u0065\u0308", "", "\u00EC",
310:                        "\u0069\u0300", "", "\u00ED", "\u0069\u0301", "",
311:                        "\u00EE", "\u0069\u0302", "", "\u00EF", "\u0069\u0308",
312:                        "", "\u00F1", "\u006E\u0303", "", "\u00F2",
313:                        "\u006F\u0300", "", "\u00F3", "\u006F\u0301", "",
314:                        "\u00F4", "\u006F\u0302", "", "\u00F5", "\u006F\u0303",
315:                        "", "\u00F6", "\u006F\u0308", "", "\u00F9",
316:                        "\u0075\u0300", "", "\u00FA", "\u0075\u0301", "",
317:                        "\u00FB", "\u0075\u0302", "", "\u00FC", "\u0075\u0308",
318:                        "", "\u00FD", "\u0079\u0301",
319:                        "",
320:                        // EXTRAS, outside of Latin 1
321:                        "\u1EA4", "\u00C2\u0301", "", "\u1EA5", "\u00E2\u0301",
322:                        "", "\u1EA6", "\u00C2\u0300", "", "\u1EA7",
323:                        "\u00E2\u0300", "", };
324:
325:                int[] classData = { 0x0300, 230, 0x0301, 230, 0x0302, 230,
326:                        0x0303, 230, 0x0304, 230, 0x0305, 230, 0x0306, 230,
327:                        0x0307, 230, 0x0308, 230, 0x0309, 230, 0x030A, 230,
328:                        0x030B, 230, 0x030C, 230, 0x030D, 230, 0x030E, 230,
329:                        0x030F, 230, 0x0310, 230, 0x0311, 230, 0x0312, 230,
330:                        0x0313, 230, 0x0314, 230, 0x0315, 232, 0x0316, 220,
331:                        0x0317, 220, 0x0318, 220, 0x0319, 220, 0x031A, 232,
332:                        0x031B, 216, 0x031C, 220, 0x031D, 220, 0x031E, 220,
333:                        0x031F, 220, 0x0320, 220, 0x0321, 202, 0x0322, 202,
334:                        0x0323, 220, 0x0324, 220, 0x0325, 220, 0x0326, 220,
335:                        0x0327, 202, 0x0328, 202, 0x0329, 220, 0x032A, 220,
336:                        0x032B, 220, 0x032C, 220, 0x032D, 220, 0x032E, 220,
337:                        0x032F, 220, 0x0330, 220, 0x0331, 220, 0x0332, 220,
338:                        0x0333, 220, 0x0334, 1, 0x0335, 1, 0x0336, 1, 0x0337,
339:                        1, 0x0338, 1, 0x0339, 220, 0x033A, 220, 0x033B, 220,
340:                        0x033C, 220, 0x033D, 230, 0x033E, 230, 0x033F, 230,
341:                        0x0340, 230, 0x0341, 230, 0x0342, 230, 0x0343, 230,
342:                        0x0344, 230, 0x0345, 240, 0x0360, 234, 0x0361, 234 };
343:
344:                // build the same tables we would otherwise get from the
345:                // Unicode Character Database, just with limited data
346:
347:                for (int i = 0; i < decomposeData.length; i += 3) {
348:                    char value = decomposeData[i].charAt(0);
349:                    String decomp = decomposeData[i + 1];
350:                    boolean compat = decomposeData[i + 2].equals("K");
351:                    if (compat)
352:                        isCompatibility.set(value);
353:                    decompose.put(value, decomp);
354:                    if (!compat) {
355:                        int first = '\u0000';
356:                        int second = UTF16Util.nextCodePoint(decomp, 0);
357:                        if (decomp.length() > 1) {
358:                            first = second;
359:                            second = UTF16Util.nextCodePoint(decomp, UTF16Util
360:                                    .codePointLength(first));
361:                        }
362:                        long pair = (first << 16) | second;
363:                        compose.put(pair, value);
364:                    }
365:                }
366:
367:                for (int i = 0; i < classData.length;) {
368:                    canonicalClass.put(classData[i++], classData[i++]);
369:                }
370:            }
371:
372:            /**
373:             * Utility: Parses a sequence of hex Unicode characters separated by spaces
374:             */
375:            static public String fromHex(String source) {
376:                StringBuffer result = new StringBuffer();
377:                for (int i = 0; i < source.length(); ++i) {
378:                    char c = source.charAt(i);
379:                    switch (c) {
380:                    case ' ':
381:                        break; // ignore
382:                    case '0':
383:                    case '1':
384:                    case '2':
385:                    case '3':
386:                    case '4':
387:                    case '5':
388:                    case '6':
389:                    case '7':
390:                    case '8':
391:                    case '9':
392:                    case 'A':
393:                    case 'B':
394:                    case 'C':
395:                    case 'D':
396:                    case 'E':
397:                    case 'F':
398:                    case 'a':
399:                    case 'b':
400:                    case 'c':
401:                    case 'd':
402:                    case 'e':
403:                    case 'f':
404:                        int end = 0;
405:                        int value = 0;
406:                        try {
407:                            //System.out.println(source.substring(i, i + 4) + "************" + source);
408:                            end = source.indexOf(' ', i);
409:                            if (end < 0) {
410:                                end = source.length();
411:                            }
412:                            value = Integer.parseInt(source.substring(i, end),
413:                                    16);
414:                            UTF16Util.appendCodePoint(result, value);
415:                        } catch (Exception e) {
416:                            System.out.println("i: " + i + ";end:" + end
417:                                    + "source:" + source);
418:                            //System.out.println(source.substring(i, i + 4) + "************" + source);
419:                            System.exit(1);
420:                        }
421:                        //i+= 3; // skip rest of number
422:                        i = end;
423:                        break;
424:                    case '<':
425:                        int j = source.indexOf('>', i); // skip <...>
426:                        if (j > 0) {
427:                            i = j;
428:                            break;
429:                        } // else fall through--error
430:                    default:
431:                        throw new IllegalArgumentException("Bad hex value in "
432:                                + source);
433:                    }
434:                }
435:                return result.toString();
436:            }
437:
438:            /**
439:             * Utility: Supplies a zero-padded hex representation of an integer (without 0x)
440:             */
441:            static public String hex(int i) {
442:                String result = Long.toString(i & 0xFFFFFFFFL, 16)
443:                        .toUpperCase();
444:                return "00000000".substring(result.length(), 8) + result;
445:            }
446:
447:            /**
448:             * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
449:     */
450:            static public String hex(char i) {
451:                String result = Integer.toString(i, 16).toUpperCase();
452:                return "0000".substring(result.length(), 4) + result;
453:            }
454:
455:            /**
456:             * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
457:     */
458:            public static String hex(String s, String sep) {
459:                StringBuffer result = new StringBuffer();
460:                for (int i = 0; i < s.length(); ++i) {
461:                    if (i != 0)
462:                        result.append(sep);
463:                    result.append(hex(s.charAt(i)));
464:                }
465:                return result.toString();
466:            }
467:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.