Source Code Cross Referenced for UnicodeDataGenerator.java in » XML » XPath-Saxon » net » sf » saxon » codenorm » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » XML » XPath Saxon » net.sf.saxon.codenorm
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        package net.sf.saxon.codenorm;
002:
003:        import net.sf.saxon.om.FastStringBuffer;
004:
005:        import java.io.*;
006:        import java.util.ArrayList;
007:        import java.util.Iterator;
008:        import java.util.List;
009:
/**
 * This class reads the Unicode character database, extracts information needed
 * to perform unicode normalization, and writes this information out in the form of the
 * Java "source" module UnicodeData.java. This class is therefore executed (via its main()
 * method) at the time Saxon is built - it only needs to be rerun when the Unicode data tables
 * have changed.
 * <p>
 * Because it runs at build time, the generator deliberately uses only JDK classes.
 * <p>
 * The class is derived from the sample program NormalizerData.java published by the
 * Unicode consortium. That code has been modified so that instead of building the run-time
 * data structures directly, they are written to a Java "source" module, which is then
 * compiled. Also, the ability to construct a condensed version of the data tables has been
 * removed.
 * <p>
 * Copyright (c) 1991-2005 Unicode, Inc.
 * For terms of use, see http://www.unicode.org/terms_of_use.html
 * For documentation, see UAX#15.<br>
 * @author Mark Davis
 * @author Michael Kay: Saxon modifications.
 */
class UnicodeDataGenerator {
    /** Copyright notice inherited from the Unicode sample program. */
    static final String copyright = "Copyright \u00A9 1998-1999 Unicode, Inc.";

    /** Testing flag: when true, progress messages are written to System.out. */
    private static final boolean DEBUG = false;

    /** Names of the two input files, resolved relative to {@link #dir}. */
    private static final String UNICODE_DATA = "UnicodeData.txt";
    private static final String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt";

    /** Zero-based positions of the UnicodeData.txt fields that are read. */
    private static final int CODE_FIELD = 0;
    private static final int CANONICAL_CLASS_FIELD = 3;
    private static final int DECOMPOSITION_FIELD = 5;

    private static final String HEX_DIGITS = "0123456789abcdef";

    /** Directory containing the input files; set by main() from its first argument. */
    private static String dir;

    // Parallel lists: entry i of the "Keys" list is the code point (Integer) whose
    // property is entry i of the matching "Values" list.
    private static final List canonicalClassKeys = new ArrayList(30000);
    private static final List canonicalClassValues = new ArrayList(30000);

    private static final List decompositionKeys = new ArrayList(6000);
    private static final List decompositionValues = new ArrayList(6000);

    /** Code points (Integer) listed in CompositionExclusions.txt. */
    private static final List exclusionList = new ArrayList(200);
    /** Code points (Integer) whose decomposition field starts with a &lt;tag&gt;. */
    private static final List compatibilityList = new ArrayList(8000);

    private UnicodeDataGenerator() {
    }

    /**
     * Reads both input files and populates the static tables. Called by main()
     * before the Java module is generated.
     *
     * @throws IllegalStateException if either data file cannot be read or is malformed;
     *         an I/O failure is attached as the cause. Failing here prevents main()
     *         from writing a module with empty or partial tables.
     */
    static void build() {
        try {
            readExclusionList();
            buildDecompositionTables();
        } catch (IOException e) {
            IllegalStateException failure = new IllegalStateException(
                    "Can't load data file." + e + ", " + e.getMessage());
            failure.initCause(e);
            throw failure;
        }
    }

    // =============================================================
    // Building Decomposition Tables
    // =============================================================

    /**
     * Removes a trailing '#' comment from a data-file line and trims surrounding
     * whitespace, so that blank and comment-only lines come back empty.
     */
    private static String stripComment(String line) {
        int comment = line.indexOf('#');
        if (comment != -1) {
            line = line.substring(0, comment);
        }
        return line.trim();
    }

    /**
     * Reads the composition exclusion list and stores each excluded code point.
     * The hex code may have any number of digits (the original sample code
     * expected exactly four).
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a line does not start with a hex number
     */
    private static void readExclusionList() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Exclusions");
        }
        BufferedReader in = new BufferedReader(
                new FileReader(dir + '/' + COMPOSITION_EXCLUSIONS), 5 * 1024);
        try {
            String line;
            while ((line = in.readLine()) != null) {
                String entry = stripComment(line);
                if (entry.length() == 0) {
                    continue; // blank or comment-only line
                }
                // The code point is the first space-delimited token.
                int z = entry.indexOf(' ');
                if (z < 0) {
                    z = entry.length();
                }
                try {
                    exclusionList.add(Integer.valueOf(entry.substring(0, z), 16));
                } catch (NumberFormatException e) {
                    throw new IllegalStateException("Bad hex value in line:\n" + line);
                }
            }
        } finally {
            in.close();
        }
    }

    /**
     * Builds the canonical class, decomposition and compatibility tables from
     * the UnicodeData file.
     *
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalStateException if a record is malformed
     */
    private static void buildDecompositionTables() throws IOException {
        if (DEBUG) {
            System.out.println("Reading Unicode Character Database");
        }
        BufferedReader in = new BufferedReader(
                new FileReader(dir + '/' + UNICODE_DATA), 64 * 1024);
        try {
            int counter = 0;
            String rawLine;
            while ((rawLine = in.readLine()) != null) {
                String line = stripComment(rawLine);
                if (line.length() == 0) {
                    continue;
                }
                if (DEBUG) {
                    counter++;
                    if ((counter & 0xFF) == 0) {
                        System.out.println("At: " + line);
                    }
                }
                processCharacterRecord(line);
            }
        } finally {
            in.close();
        }
        if (DEBUG) {
            System.out.println("Done reading Unicode Character Database");
        }
        // Composition pairs and the algorithmic Hangul decompositions are not
        // generated here; the generated tables cover only what is listed in the file.
    }

    /**
     * Extracts the fields needed from one UnicodeData record and appends them
     * to the static tables.
     * Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
     */
    private static void processCharacterRecord(String line) {
        String[] fields = line.split(";", -1);
        if (fields.length <= DECOMPOSITION_FIELD) {
            throw new IllegalStateException("Too few fields in line:\n" + line);
        }

        Integer codePoint;
        try {
            codePoint = Integer.valueOf(fields[CODE_FIELD], 16);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad hex value in line:\n" + line);
        }
        if (DEBUG && codePoint.intValue() == 0x00C0) {
            System.out.println("debug: " + line);
        }

        Integer canonicalClass;
        try {
            canonicalClass = Integer.valueOf(fields[CANONICAL_CLASS_FIELD]);
        } catch (NumberFormatException e) {
            throw new IllegalStateException("Bad canonical class at: " + line);
        }
        // check consistency: canonical classes must be from 0 to 255
        int cc = canonicalClass.intValue();
        if (cc != (cc & 0xFF)) {
            System.err.println("Bad canonical class at: " + line);
        }
        canonicalClassKeys.add(codePoint);
        canonicalClassValues.add(canonicalClass);

        String segment = fields[DECOMPOSITION_FIELD];
        if (segment.length() == 0) {
            return; // character has no decomposition
        }
        // A leading <tag> marks a compatibility (as opposed to canonical) mapping.
        boolean compat = segment.charAt(0) == '<';
        if (compat) {
            compatibilityList.add(codePoint);
        }
        String decomp = fromHex(segment);

        // check consistency: all canon decomps must be singles or pairs!
        // Count code points, not UTF-16 units, so supplementary characters count once.
        int length = countCodePoints(decomp);
        if (length < 1 || (length > 2 && !compat)) {
            System.err.println("Bad decomp at: " + line);
        }
        decompositionKeys.add(codePoint);
        decompositionValues.add(decomp);
    }

    /**
     * Counts the code points in a UTF-16 string, treating a high surrogate
     * followed by a low surrogate as a single code point.
     */
    private static int countCodePoints(String s) {
        int count = 0;
        for (int i = 0; i < s.length(); i++) {
            count++;
            char c = s.charAt(i);
            if (c >= 0xD800 && c <= 0xDBFF && i + 1 < s.length()) {
                char next = s.charAt(i + 1);
                if (next >= 0xDC00 && next <= 0xDFFF) {
                    i++; // skip the low half of the pair
                }
            }
        }
        return count;
    }

    private static boolean isAsciiHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Appends a code point to the buffer as UTF-16: one char for values below
     * 0x10000, otherwise a surrogate pair.
     */
    private static void appendCodePoint(StringBuffer buff, int codePoint, String source) {
        if (codePoint > 0x10FFFF) {
            throw new IllegalArgumentException("Bad hex value in " + source);
        }
        if (codePoint < 0x10000) {
            buff.append((char) codePoint);
        } else {
            int offset = codePoint - 0x10000;
            buff.append((char) (0xD800 + (offset >> 10)));
            buff.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /**
     * Utility: Parses a sequence of hex Unicode characters separated by spaces.
     * Each number may have any count of hex digits; a &lt;...&gt; tag is skipped.
     * Code points above 0xFFFF are returned as surrogate pairs.
     *
     * @param source the text to parse, e.g. "0041 0300" or "&lt;compat&gt; 0020"
     * @return the characters denoted by the hex numbers, in order
     * @throws IllegalArgumentException if source contains anything other than
     *         spaces, tags and hex numbers, or a value above 10FFFF
     */
    public static String fromHex(String source) {
        StringBuffer result = new StringBuffer(8);
        for (int i = 0; i < source.length(); ++i) {
            char c = source.charAt(i);
            if (c == ' ') {
                continue; // separator
            }
            if (c == '<') {
                int close = source.indexOf('>', i); // skip <...>
                if (close < 0) {
                    throw new IllegalArgumentException("Bad hex value in " + source);
                }
                i = close;
                continue;
            }
            if (!isAsciiHexDigit(c)) {
                throw new IllegalArgumentException("Bad hex value in " + source);
            }
            int z = source.indexOf(' ', i);
            if (z < 0) {
                z = source.length();
            }
            int codePoint;
            try {
                codePoint = Integer.parseInt(source.substring(i, z), 16);
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("Bad hex value in " + source);
            }
            appendCodePoint(result, codePoint, source);
            i = z; // skip rest of number
        }
        return result.toString();
    }

    /**
     * Utility: Supplies a zero-padded, four-digit upper-case hex representation
     * of a UTF-16 char (without any 0x or backslash-u prefix).
     */
    public static String hex(char i) {
        String result = Integer.toString(i, 16).toUpperCase();
        return "0000".substring(result.length(), 4) + result;
    }

    /**
     * Utility: Supplies the four-digit hex representation of every char of a
     * string, joined by the given separator.
     *
     * @param s the string whose chars are to be rendered
     * @param sep the text inserted between consecutive hex values
     */
    public static String hex(String s, String sep) {
        StringBuffer result = new StringBuffer(20);
        for (int i = 0; i < s.length(); ++i) {
            if (i != 0) {
                result.append(sep);
            }
            result.append(hex(s.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Generate the Java output from the data structure.
     *
     * @param o the stream to which the source of UnicodeData.java is written
     */
    private static void generateJava(PrintStream o) {
        o.println("package net.sf.saxon.codenorm;");
        o.println("");
        o.println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator");
        o.println("//*** DO NOT EDIT! ***");
        o.println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits");
        o.println("");
        o.println("public class UnicodeData {");

        // Output the canonical class table
        o.println("public static final String[] canonicalClassKeys = {");
        printArray(o, canonicalClassKeys.iterator());
        o.println("};");
        o.println("public static final String[] canonicalClassValues = {");
        printArray(o, canonicalClassValues.iterator());
        o.println("};");

        // Output the decomposition values (not including Hangul algorithmic decompositions)
        o.println("public static final String[] decompositionKeys = {");
        printArray(o, decompositionKeys.iterator());
        o.println("};");
        o.println("public static final String[] decompositionValues = {");
        printStringArray(o, decompositionValues.iterator());
        o.println("};");

        // Output the composition exclusions
        o.println("public static final String[] exclusionList = {");
        printArray(o, exclusionList.iterator());
        o.println("};");

        // Output the compatibility list
        o.println("public static final String[] compatibilityList = {");
        printArray(o, compatibilityList.iterator());
        o.println("};");

        o.println("}");
    }

    /**
     * Output an array of integer values. The values are written in base 32,
     * comma-separated, packed into string literals; a new literal is started
     * periodically so that no single literal grows too long.
     *
     * @param o the output stream
     * @param iter iterator over Integer values
     */
    private static void printArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        buff.append('"');
        while (true) {
            if (++count == 20) {
                // close the current literal and start the next one
                count = 0;
                buff.append("\",");
                o.println(buff.toString());
                buff.setLength(0);
                buff.append('"');
            }
            int next = ((Integer) iter.next()).intValue();
            buff.append(Integer.toString(next, 32)); // values are written in base-32 notation
            if (iter.hasNext()) {
                buff.append(",");
            } else {
                buff.append("\"");
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Output an array of string values (using backslash-uuuu notation where appropriate),
     * one string literal per value, with a line break inserted periodically.
     *
     * @param o the output stream
     * @param iter iterator over String values
     */
    private static void printStringArray(PrintStream o, Iterator iter) {
        if (!iter.hasNext()) {
            return;
        }
        int count = 0;
        StringBuffer buff = new StringBuffer(120);
        while (true) {
            if (++count == 20) {
                count = 0;
                o.println(buff.toString());
                buff.setLength(0);
            }
            String next = (String) iter.next();
            appendJavaString(next, buff);
            if (iter.hasNext()) {
                buff.append(", ");
            } else {
                o.println(buff.toString());
                return;
            }
        }
    }

    /**
     * Appends a value to the buffer as a Java string literal. Printable ASCII
     * other than space is copied as is; everything else becomes a Unicode escape.
     */
    private static void appendJavaString(String value, StringBuffer buff) {
        buff.append('"');
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '\\') {
                buff.append("\\\\");
            } else if (c == '"') {
                buff.append("\\\"");
            } else if (c == '\n') {
                // A Unicode escape for LF or CR is translated before the literal is
                // lexed and would break it, so these two need character escapes.
                buff.append("\\n");
            } else if (c == '\r') {
                buff.append("\\r");
            } else if (c > 32 && c < 127) {
                buff.append(c);
            } else {
                buff.append("\\u");
                buff.append(HEX_DIGITS.charAt((c >> 12) & 0xf));
                buff.append(HEX_DIGITS.charAt((c >> 8) & 0xf));
                buff.append(HEX_DIGITS.charAt((c >> 4) & 0xf));
                buff.append(HEX_DIGITS.charAt(c & 0xf));
            }
        }
        buff.append('"');
    }

    /**
     * Main program. Run this program to regenerate the Java module UnicodeData.java against revised data
     * from the Unicode character database.
     * <p>
     * Usage: java UnicodeDataGenerator dir UnicodeData.java
     * <p>
     * where dir is the directory containing the files UnicodeData.txt and CompositionExclusions.txt from the
     * Unicode character database, and the second argument is the file to be written.
     * With any other number of arguments the usage text is printed and the JVM
     * exits with status 2.
     *
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: java UnicodeDataGenerator dir UnicodeData.java");
            System.err.println("where dir is the directory containing the files UnicodeData.txt and"
                    + " CompositionExclusions.txt from the Unicode character database");
            System.exit(2);
        }
        dir = args[0];
        build();
        PrintStream o = new PrintStream(new BufferedOutputStream(
                new FileOutputStream(new File(args[1]))));
        try {
            generateJava(o);
            // PrintStream swallows I/O errors; surface them explicitly.
            if (o.checkError()) {
                throw new IOException("Error writing " + args[1]);
            }
        } finally {
            o.close();
        }
    }
}
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.