001: package net.sf.saxon.codenorm;
002:
003: import net.sf.saxon.om.FastStringBuffer;
004:
005: import java.io.*;
006: import java.util.ArrayList;
007: import java.util.Iterator;
008: import java.util.List;
009:
010: /**
011: * This class reads the Unicode character database, extracts information needed
012: * to perform unicode normalization, and writes this information out in the form of the
013: * Java "source" module UnicodeData.java. This class is therefore executed (via its main()
014: * method) at the time Saxon is built - it only needs to be rerun when the Unicode data tables
015: * have changed.
016: * <p>
017: * The class is derived from the sample program NormalizerData.java published by the
018: * Unicode consortium. That code has been modified so that instead of building the run-time
019: * data structures directly, they are written to a Java "source" module, which is then
020: * compiled. Also, the ability to construct a condensed version of the data tables has been
021: * removed.
022: * <p>
023: * Copyright (c) 1991-2005 Unicode, Inc.
024: * For terms of use, see http://www.unicode.org/terms_of_use.html
025: * For documentation, see UAX#15.<br>
026: * @author Mark Davis
027: * @author Michael Kay: Saxon modifications.
028: */
029: class UnicodeDataGenerator {
/**
 * Copyright notice of the Unicode, Inc. sample program this class is derived from.
 * The copyright sign is written as a Unicode escape so that the value does not
 * depend on the encoding used to read this source file.
 */
static final String copyright = "Copyright \u00a9 1998-1999 Unicode, Inc.";

/**
 * Testing flag: when true, progress and trace messages are written to System.out.
 */
private static final boolean DEBUG = false;

/**
 * Directory holding the Unicode data files; assigned from the first
 * command-line argument in main().
 */
private static String dir;

/** Name of the main character database file, resolved relative to dir. */
private static final String UNICODE_DATA = "UnicodeData.txt";

/** Name of the composition exclusions file, resolved relative to dir. */
private static final String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt";

// Parallel lists: entry i of canonicalClassValues is the canonical combining
// class (an Integer) of the code point (an Integer) at entry i of canonicalClassKeys.
private static List canonicalClassKeys = new ArrayList(30000);
private static List canonicalClassValues = new ArrayList(30000);

// Parallel lists: entry i of decompositionValues is the decomposition (a String)
// of the code point (an Integer) at entry i of decompositionKeys.
private static List decompositionKeys = new ArrayList(6000);
private static List decompositionValues = new ArrayList(6000);

// Code points (Integer) read from the composition exclusions file.
private static List exclusionList = new ArrayList(200);

// Code points (Integer) whose decomposition field starts with a '<' tag,
// i.e. compatibility rather than canonical decompositions.
private static List compatibilityList = new ArrayList(8000);

/**
 * Not instantiable: all members of this class are static.
 */
private UnicodeDataGenerator() {
}
058:
059: /**
060: * Called exactly once by NormalizerData to build the static data
061: */
062:
063: static void build() {
064: try {
065: readExclusionList();
066: buildDecompositionTables();
067: } catch (java.io.IOException e) {
068: System.err.println("Can't load data file." + e + ", "
069: + e.getMessage());
070: }
071: }
072:
073: // =============================================================
074: // Building Decomposition Tables
075: // =============================================================
076:
077: /**
078: * Reads exclusion list and stores the data
079: */
080:
081: // Modified by MHK: the original code expects the hex character code to be always four hex digits
082: private static void readExclusionList() throws java.io.IOException {
083: if (DEBUG)
084: System.out.println("Reading Exclusions");
085: BufferedReader in = new BufferedReader(new FileReader(dir + '/'
086: + COMPOSITION_EXCLUSIONS), 5 * 1024);
087: while (true) {
088:
089: // read a line, discarding comments and blank lines
090:
091: String line = in.readLine();
092: if (line == null)
093: break;
094: int comment = line.indexOf('#'); // strip comments
095: if (comment != -1)
096: line = line.substring(0, comment);
097: if (line.length() == 0)
098: continue; // ignore blanks
099:
100: // store -1 in the excluded table for each character hit
101:
102: int z = line.indexOf(' ');
103: if (z < 0) {
104: z = line.length();
105: }
106: int value = Integer.parseInt(line.substring(0, z), 16);
107: exclusionList.add(new Integer(value));
108:
109: }
110: in.close();
111: }
112:
113: /**
114: * Builds a decomposition table from a UnicodeData file
115: */
116: private static void buildDecompositionTables()
117: throws java.io.IOException {
118: if (DEBUG)
119: System.out.println("Reading Unicode Character Database");
120: BufferedReader in = new BufferedReader(new FileReader(dir + '/'
121: + UNICODE_DATA), 64 * 1024);
122: int value;
123: int counter = 0;
124: while (true) {
125:
126: // read a line, discarding comments and blank lines
127:
128: String line = in.readLine();
129: if (line == null)
130: break;
131: int comment = line.indexOf('#'); // strip comments
132: if (comment != -1)
133: line = line.substring(0, comment);
134: if (line.length() == 0)
135: continue;
136: if (DEBUG) {
137: counter++;
138: if ((counter & 0xFF) == 0)
139: System.out.println("At: " + line);
140: }
141:
142: // find the values of the particular fields that we need
143: // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
144:
145: int start = 0;
146: int end = line.indexOf(';'); // code
147: try {
148: value = Integer
149: .parseInt(line.substring(start, end), 16);
150: } catch (NumberFormatException e) {
151: throw new IllegalStateException(
152: "Bad hex value in line:\n" + line);
153: }
154: if (true && value == '\u00c0') {
155: System.out.println("debug: " + line);
156: }
157: end = line.indexOf(';', end + 1); // name
158: //String name = line.substring(start,end);
159: end = line.indexOf(';', end + 1); // general category
160: end = line.indexOf(';', start = end + 1); // canonical class
161:
162: // check consistency: canonical classes must be from 0 to 255
163:
164: int cc = Integer.parseInt(line.substring(start, end));
165: if (cc != (cc & 0xFF))
166: System.err.println("Bad canonical class at: " + line);
167: canonicalClassKeys.add(new Integer(value));
168: canonicalClassValues.add(new Integer(cc));
169: //canonicalClass.put(value,cc);
170: end = line.indexOf(';', end + 1); // BIDI
171: end = line.indexOf(';', start = end + 1); // decomp
172:
173: // decomp requires more processing.
174: // store whether it is canonical or compatibility.
175: // store the decomp in one table, and the reverse mapping (from pairs) in another
176:
177: if (start != end) {
178: String segment = line.substring(start, end);
179: boolean compat = segment.charAt(0) == '<';
180: if (compat) {
181: compatibilityList.add(new Integer(value));
182: //isCompatibility.set(value);
183: }
184: String decomp = fromHex(segment);
185:
186: // check consistency: all canon decomps must be singles or pairs!
187:
188: if (decomp.length() < 1 || decomp.length() > 2
189: && !compat) {
190: System.err.println("Bad decomp at: " + line);
191: }
192:
193: decompositionKeys.add(new Integer(value));
194: decompositionValues.add(decomp);
195: //decompose.put(value, decomp);
196:
197: // only compositions are canonical pairs
198: // skip if script exclusion
199:
200: // if (!compat && !isExcluded.get(value)) {
201: // char first = '\u0000';
202: // char second = decomp.charAt(0);
203: // if (decomp.length() > 1) {
204: // first = second;
205: // second = decomp.charAt(1);
206: // }
207: //
208: // // store composition pair in single integer
209: //
210: // pair = (first << 16) | second;
211: // if (DEBUG && value == '\u00C0') {
212: // System.out.println("debug2: " + line);
213: // }
214: // compose.put(pair, value);
215: // } else if (DEBUG) {
216: // System.out.println("Excluding: " + decomp);
217: // }
218: }
219: }
220: in.close();
221: if (DEBUG)
222: System.out
223: .println("Done reading Unicode Character Database");
224:
225: // add algorithmic Hangul decompositions
226: // this is more compact if done at runtime, but for simplicity we
227: // do it this way.
228:
229: // if (DEBUG) System.out.println("Adding Hangul");
230: //
231: // for (int SIndex = 0; SIndex < SCount; ++SIndex) {
232: // int TIndex = SIndex % TCount;
233: // char first, second;
234: // if (TIndex != 0) { // triple
235: // first = (char)(SBase + SIndex - TIndex);
236: // second = (char)(TBase + TIndex);
237: // } else {
238: // first = (char)(LBase + SIndex / NCount);
239: // second = (char)(VBase + (SIndex % NCount) / TCount);
240: // }
241: // pair = (first << 16) | second;
242: // value = SIndex + SBase;
243: // decompose.put(value, String.valueOf(first) + second);
244: // compose.put(pair, value);
245: // }
246: // if (DEBUG) System.out.println("Done adding Hangul");
247: }
248:
249: /**
250: * Hangul composition constants
251: */
252: // static final int
253: // SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
254: // LCount = 19, VCount = 21, TCount = 28,
255: // NCount = VCount * TCount, // 588
256: // SCount = LCount * NCount; // 11172
257: /**
258: * Utility: Parses a sequence of hex Unicode characters separated by spaces
259: */
260:
261: // Modified by MHK. Original code assumed the characters were each 4 hex digits!
262: public static String fromHex(String source) {
263: FastStringBuffer result = new FastStringBuffer(5);
264: for (int i = 0; i < source.length(); ++i) {
265: char c = source.charAt(i);
266: switch (c) {
267: case ' ':
268: break; // ignore
269: case '0':
270: case '1':
271: case '2':
272: case '3':
273: case '4':
274: case '5':
275: case '6':
276: case '7':
277: case '8':
278: case '9':
279: case 'A':
280: case 'B':
281: case 'C':
282: case 'D':
283: case 'E':
284: case 'F':
285: case 'a':
286: case 'b':
287: case 'c':
288: case 'd':
289: case 'e':
290: case 'f':
291: int z = source.indexOf(' ', i);
292: if (z < 0) {
293: z = source.length();
294: }
295: try {
296: result.append((char) Integer.parseInt(source
297: .substring(i, z), 16));
298: } catch (NumberFormatException e) {
299: throw new IllegalArgumentException(
300: "Bad hex value in " + source);
301: }
302: i = z; // skip rest of number
303: break;
304: case '<':
305: int j = source.indexOf('>', i); // skip <...>
306: if (j > 0) {
307: i = j;
308: break;
309: } // else fall through--error
310: default:
311: throw new IllegalArgumentException("Bad hex value in "
312: + source);
313: }
314: }
315: return result.toString();
316: }
317:
318: /**
319: * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
320: */
321: public static String hex(char i) {
322: String result = Integer.toString(i, 16).toUpperCase();
323: return "0000".substring(result.length(), 4) + result;
324: }
325:
326: /**
327: * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
328: */
329: public static String hex(String s, String sep) {
330: FastStringBuffer result = new FastStringBuffer(20);
331: for (int i = 0; i < s.length(); ++i) {
332: if (i != 0)
333: result.append(sep);
334: result.append(hex(s.charAt(i)));
335: }
336: return result.toString();
337: }
338:
339: /**
340: * Generate the Java output from the data structure
341: */
342:
343: private static void generateJava(PrintStream o) {
344: o.println("package net.sf.saxon.codenorm;");
345: o.println("");
346: o
347: .println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator");
348: o.println("//*** DO NOT EDIT! ***");
349: o
350: .println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits");
351: o.println("");
352: o.println("public class UnicodeData {");
353:
354: // Output the canonical class table
355: o
356: .println("public static final String[] canonicalClassKeys = {");
357: printArray(o, canonicalClassKeys.iterator());
358: o.println("};");
359: o
360: .println("public static final String[] canonicalClassValues = {");
361: printArray(o, canonicalClassValues.iterator());
362: o.println("};");
363:
364: // Output the decomposition values (not including Hangul algorithmic decompositions)
365: o.println("public static final String[] decompositionKeys = {");
366: printArray(o, decompositionKeys.iterator());
367: o.println("};");
368: o
369: .println("public static final String[] decompositionValues = {");
370: printStringArray(o, decompositionValues.iterator());
371: o.println("};");
372:
373: // Output the composition exclusions
374: o.println("public static final String[] exclusionList = {");
375: printArray(o, exclusionList.iterator());
376: o.println("};");
377:
378: // Output the compatibility list
379: o.println("public static final String[] compatibilityList = {");
380: printArray(o, compatibilityList.iterator());
381: o.println("};");
382:
383: o.println("}");
384:
385: }
386:
387: /**
388: * Output an array of integer values
389: */
390:
391: private static void printArray(PrintStream o, Iterator iter) {
392: int count = 0;
393: FastStringBuffer buff = new FastStringBuffer(120);
394: if (!iter.hasNext())
395: return;
396: buff.append('"');
397: while (true) {
398: if (++count == 20) {
399: count = 0;
400: buff.append("\",");
401: o.println(buff.toString());
402: buff.setLength(0);
403: buff.append('"');
404: }
405: int next = ((Integer) iter.next()).intValue();
406: buff.append(Integer.toString(next, 32)); // values are written in base-32 notation
407: if (iter.hasNext()) {
408: buff.append(",");
409: } else {
410: buff.append("\"");
411: o.println(buff.toString());
412: return;
413: }
414: }
415: }
416:
417: /**
418: * Output an array of string values (using backslash-uuuu notation where appropriate)
419: */
420:
421: private static void printStringArray(PrintStream o, Iterator iter) {
422: int count = 0;
423: FastStringBuffer buff = new FastStringBuffer(120);
424: if (!iter.hasNext())
425: return;
426: while (true) {
427: if (++count == 20) {
428: count = 0;
429: o.println(buff.toString());
430: buff.setLength(0);
431: }
432: String next = (String) iter.next();
433: appendJavaString(next, buff);
434: if (iter.hasNext()) {
435: buff.append(", ");
436: } else {
437: o.println(buff.toString());
438: return;
439: }
440: }
441: }
442:
443: private static void appendJavaString(String value,
444: FastStringBuffer buff) {
445: buff.append('"');
446: for (int i = 0; i < value.length(); i++) {
447: char c = value.charAt(i);
448: if (c == '\\') {
449: buff.append("\\\\");
450: } else if (c == '"') {
451: buff.append("\\\"");
452: } else if (c > 32 && c < 127) {
453: buff.append(c);
454: } else {
455: buff.append("\\u");
456: char b0 = "0123456789abcdef".charAt(c & 0xf);
457: char b1 = "0123456789abcdef".charAt((c >> 4) & 0xf);
458: char b2 = "0123456789abcdef".charAt((c >> 8) & 0xf);
459: char b3 = "0123456789abcdef".charAt((c >> 12) & 0xf);
460: buff.append(b3);
461: buff.append(b2);
462: buff.append(b1);
463: buff.append(b0);
464: }
465: }
466: buff.append('"');
467: }
468:
469: /**
470: * Main program. Run this program to regenerate the Java module UnicodeData.java against revised data
471: * from the Unicode character database.
472: * <p>
473: * Usage: java UnicodeDataGenerator dir >UnicodeData.java
474: * <p>
475: * where dir is the directory containing the files UnicodeData.text and CompositionExclusions.txt from the
476: * Unicode character database.
477: */
478:
479: public static void main(String[] args) throws Exception {
480: if (args.length != 2) {
481: System.err
482: .println("Usage: java UnicodeDataGenerator dir UnicodeData.java");
483: System.err
484: .println("where dir is the directory containing the files UnicodeData.text and"
485: + " CompositionExclusions.txt from the Unicode character database");
486: }
487: dir = args[0];
488: build();
489: PrintStream o = new PrintStream(new FileOutputStream(new File(
490: args[1])));
491: generateJava(o);
492: }
493: }
|