001: package net.sf.saxon.codenorm;
002:
003: import net.sf.saxon.sort.IntHashMap;
004: import net.sf.saxon.sort.IntToIntHashMap;
005:
006: import java.util.ArrayList;
007: import java.util.BitSet;
008: import java.util.StringTokenizer;
009:
010: /**
011: * This class reads the data compiled into class UnicodeData, and builds hash tables
012: * that can be used by the Unicode normalization routines. This operation is performed
013: * once only, the first time normalization is attempted after Saxon is loaded.
014: */
015:
016: class UnicodeDataParser {
017:
018: // This class is never instantiated
019: private UnicodeDataParser() {
020: }
021:
022: /**
023: * Called exactly once by NormalizerData to build the static data
024: */
025:
026: static NormalizerData build() {
027: IntToIntHashMap canonicalClass = new IntToIntHashMap(400);
028: canonicalClass.setDefaultValue(0);
029: IntHashMap decompose = new IntHashMap(18000);
030: IntToIntHashMap compose = new IntToIntHashMap(15000);
031: compose.setDefaultValue(NormalizerData.NOT_COMPOSITE);
032: BitSet isCompatibility = new BitSet(128000);
033: BitSet isExcluded = new BitSet(128000);
034:
035: readExclusionList(isExcluded);
036: readCompatibilityList(isCompatibility);
037: readCanonicalClassTable(canonicalClass);
038: readDecompositionTable(decompose, compose, isExcluded,
039: isCompatibility);
040:
041: return new NormalizerData(canonicalClass, decompose, compose,
042: isCompatibility, isExcluded);
043: }
044:
045: /**
046: * Reads exclusion list and stores the data
047: */
048:
049: private static void readExclusionList(BitSet isExcluded) {
050: for (int i = 0; i < UnicodeData.exclusionList.length; i++) {
051: String s = UnicodeData.exclusionList[i];
052: StringTokenizer st = new StringTokenizer(s, ",");
053: while (st.hasMoreTokens()) {
054: String tok = st.nextToken();
055: int value = Integer.parseInt(tok, 32);
056: isExcluded.set(value);
057: }
058: }
059: }
060:
061: /**
062: * Reads exclusion list and stores the data
063: */
064:
065: private static void readCompatibilityList(BitSet isCompatible) {
066: for (int i = 0; i < UnicodeData.compatibilityList.length; i++) {
067: String s = UnicodeData.compatibilityList[i];
068: StringTokenizer st = new StringTokenizer(s, ",");
069: while (st.hasMoreTokens()) {
070: String tok = st.nextToken();
071: int value = Integer.parseInt(tok, 32);
072: isCompatible.set(value);
073: }
074: }
075: }
076:
077: /**
078: * Read canonical class table (mapping from character codes to their canonical class)
079: */
080:
081: private static void readCanonicalClassTable(
082: IntToIntHashMap canonicalClasses) {
083: ArrayList keys = new ArrayList(5000);
084: for (int i = 0; i < UnicodeData.canonicalClassKeys.length; i++) {
085: String s = UnicodeData.canonicalClassKeys[i];
086: StringTokenizer st = new StringTokenizer(s, ",");
087: while (st.hasMoreTokens()) {
088: String tok = st.nextToken();
089: int value = Integer.parseInt(tok, 32);
090: keys.add(new Integer(value));
091: }
092: }
093: int k = 0;
094: for (int i = 0; i < UnicodeData.canonicalClassValues.length; i++) {
095: String s = UnicodeData.canonicalClassValues[i];
096: StringTokenizer st = new StringTokenizer(s, ",");
097: while (st.hasMoreTokens()) {
098: String tok = st.nextToken();
099: int clss = Integer.parseInt(tok, 32);
100: canonicalClasses.put(((Integer) keys.get(k++))
101: .intValue(), clss);
102: }
103: }
104: }
105:
106: /**
107: * Read canonical class table (mapping from character codes to their canonical class)
108: */
109:
110: private static void readDecompositionTable(IntHashMap decompose,
111: IntToIntHashMap compose, BitSet isExcluded,
112: BitSet isCompatibility) {
113: int k = 0;
114: for (int i = 0; i < UnicodeData.decompositionKeys.length; i++) {
115: String s = UnicodeData.decompositionKeys[i];
116: StringTokenizer st = new StringTokenizer(s, ",");
117: while (st.hasMoreTokens()) {
118: String tok = st.nextToken();
119: int key = Integer.parseInt(tok, 32);
120: String value = UnicodeData.decompositionValues[k++];
121: decompose.put(key, value);
122: // only compositions are canonical pairs
123: // skip if script exclusion
124:
125: if (!isCompatibility.get(key) && !isExcluded.get(key)) {
126: char first = '\u0000';
127: char second = value.charAt(0);
128: if (value.length() > 1) {
129: first = second;
130: second = value.charAt(1);
131: }
132:
133: // store composition pair in single integer
134:
135: int pair = (first << 16) | second;
136: compose.put(pair, key);
137: }
138: }
139: }
140:
141: // Add algorithmic Hangul decompositions
142: // This fragment code is copied from the normalization code published by Unicode consortium.
143: // See module net.sf.saxon.codenorm.Normalizer for applicable copyright information.
144:
145: for (int SIndex = 0; SIndex < SCount; ++SIndex) {
146: int TIndex = SIndex % TCount;
147: char first, second;
148: if (TIndex != 0) { // triple
149: first = (char) (SBase + SIndex - TIndex);
150: second = (char) (TBase + TIndex);
151: } else {
152: first = (char) (LBase + SIndex / NCount);
153: second = (char) (VBase + (SIndex % NCount) / TCount);
154: }
155: int pair = (first << 16) | second;
156: int key = SIndex + SBase;
157: decompose.put(key, String.valueOf(first) + second);
158: compose.put(pair, key);
159: }
160: }
161:
162: /**
163: * Hangul composition constants
164: */
165: private static final int SBase = 0xAC00, LBase = 0x1100,
166: VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21,
167: TCount = 28, NCount = VCount * TCount, // 588
168: SCount = LCount * NCount; // 11172
169:
170: // end of Unicode consortium code
171:
172: }
173:
174: //
175: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
176: // you may not use this file except in compliance with the License. You may obtain a copy of the
177: // License at http://www.mozilla.org/MPL/
178: //
179: // Software distributed under the License is distributed on an "AS IS" basis,
180: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
181: // See the License for the specific language governing rights and limitations under the License.
182: //
183: // The Original Code is: all this file.
184: //
185: // The Initial Developer of the Original Code is Michael H. Kay.
186: //
187: // The code for generating Hangul decompositions is Copyright (C) Unicode, Inc. All Rights Reserved.
188: // See statement below.
189: //
190: // Contributor(s): none.
191: //
192:
193: // * Copyright (c) 1991-2005 Unicode, Inc.
194: // * For terms of use, see http://www.unicode.org/terms_of_use.html
195: // * For documentation, see UAX#15.<br>
196: // * The Unicode Consortium makes no expressed or implied warranty of any
197: // * kind, and assumes no liability for errors or omissions.
198: // * No liability is assumed for incidental and consequential damages
199: // * in connection with or arising out of the use of the information here.
|