001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.impl;
009:
010: import java.io.*;
011: import com.ibm.icu.impl.ICUDebug;
012:
013: /**
014: * @version 1.0
015: * @author Ram Viswanadha
016: */
017:
018: /*
019: * Description of the format of unorm.icu version 2.1.
020: *
021: * Main change from version 1 to version 2:
022: * Use of new, common Trie instead of normalization-specific tries.
023: * Change to version 2.1: add third/auxiliary trie with associated data.
024: *
025: * For more details of how to use the data structures see the code
026: * in unorm.cpp (runtime normalization code) and
027: * in gennorm.c and gennorm/store.c (build-time data generation).
028: *
029: * For the serialized format of Trie see Trie.c/TrieHeader.
030: *
031: * - Overall partition
032: *
033: * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
034: * After that there are the following structures:
035: *
036: * char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
037: *
038: * Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
039: *
040: * char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
041: * extraData[0] contains the number of units for
042: * FC_NFKC_Closure (formatVersion>=2.1)
043: *
044: * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
045: * combiningTableTop may include one 16-bit padding unit
046: * to make sure that fcdTrie is 32-bit-aligned
047: *
048: * Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
049: *
050: * Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
051: *
052: * char canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
053: * serialized USets, see uset.c
054: *
055: *
056: * The indexes array contains lengths and sizes of the following arrays and structures
057: * as well as the following values:
058: * indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
059: * -- one more than the highest combining index computed for forward-only-combining characters
060: * indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
061: * -- number of combining indexes computed for both-ways-combining characters
062: * indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
063: * -- number of combining indexes computed for backward-only-combining characters
064: *
065: * indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
066: * -- first code point with a quick check NF* value of NO/MAYBE
067: *
068: *
069: * - Tries
070: *
071: * The main structures are two Trie tables ("compact arrays"),
072: * each with one index array and one data array.
073: * See Trie.h and Trie.c.
074: *
075: *
076: * - Tries in unorm.icu
077: *
078: * The first trie (normTrie above)
079: * provides data for the NF* quick checks and normalization.
080: * The second trie (fcdTrie above) provides data just for FCD checks.
081: *
082: *
083: * - norm32 data words from the first trie
084: *
085: * The norm32Table contains one 32-bit word "norm32" per code point.
086: * It contains the following bit fields:
087: * 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
088: * if this index is <EXTRA_INDEX_TOP then it is an index into
089: * extraData[] where variable-length normalization data for this
090: * code point is found
091: * if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
092: * then this is a norm32 for a leading surrogate, and the index
093: * value is used together with the following trailing surrogate
094: * code unit in the second trie access
095: * if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
096: * then this is a norm32 for a "special" character,
097: * i.e., the character is a Hangul syllable or a Jamo
098: * see EXTRA_HANGUL etc.
099: * generally, instead of extracting this index from the norm32 and
100: * comparing it with the above constants,
101: * the normalization code compares the entire norm32 value
102: * with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
103: *
104: * 15..8 combining class (cc) according to UnicodeData.txt
105: *
106: * 7..6 COMBINES_ANY flags, used in composition to see if a character
107: * combines with any following or preceding character(s)
108: * at all
109: * 7 COMBINES_BACK
110: * 6 COMBINES_FWD
111: *
112: * 5..0 quick check flags, set for "no" or "maybe", with separate flags for
113: * each normalization form
114: * the higher bits are "maybe" flags; for NF*D there are no such flags
115: * the lower bits are "no" flags for all forms, in the same order
116: * as the "maybe" flags,
117: * which is (MSB to LSB): NFKD NFD NFKC NFC
118: * 5..4 QC_ANY_MAYBE
119: * 3..0 QC_ANY_NO
120: * see further related constants
121: *
122: *
123: * - Extra data per code point
124: *
125: * "Extra data" is referenced by the index in norm32.
126: * It is variable-length data. It is only present, and only those parts
127: * of it are, as needed for a given character.
128: * The norm32 extra data index is added to the beginning of extraData[]
129: * to get to a vector of 16-bit words with data at the following offsets:
130: *
131: * [-1] Combining index for composition.
132: * Stored only if norm32&COMBINES_ANY .
133: * [0] Lengths of the canonical and compatibility decomposition strings.
134: * Stored only if there are decompositions, i.e.,
135: * if norm32&(QC_NFD|QC_NFKD)
136: * High byte: length of NFKD, or 0 if none
137: * Low byte: length of NFD, or 0 if none
138: * Each length byte also has another flag:
139: * Bit 7 of a length byte is set if there are non-zero
140: * combining classes (cc's) associated with the respective
141: * decomposition. If this flag is set, then the decomposition
142: * is preceded by a 16-bit word that contains the
143: * leading and trailing cc's.
144: * Bits 6..0 of a length byte are the length of the
145: * decomposition string, not counting the cc word.
146: * [1..n] NFD
147: * [n+1..] NFKD
148: *
149: * Each of the two decompositions consists of up to two parts:
150: * - The 16-bit words with the leading and trailing cc's.
151: * This is only stored if bit 7 of the corresponding length byte
152: * is set. In this case, at least one of the cc's is not zero.
153: * High byte: leading cc==cc of the first code point in the decomposition string
154: * Low byte: trailing cc==cc of the last code point in the decomposition string
155: * - The decomposition string in UTF-16, with length code units.
156: *
157: *
158: * - Combining indexes and combiningTable[]
159: *
160: * Combining indexes are stored at the [-1] offset of the extra data
161: * if the character combines forward or backward with any other characters.
162: * They are used for (re)composition in NF*C.
163: * Values of combining indexes are arranged according to whether a character
164: * combines forward, backward, or both ways:
165: * forward-only < both ways < backward-only
166: *
167: * The index values for forward-only and both-ways combining characters
168: * are indexes into the combiningTable[].
169: * The index values for backward-only combining characters are simply
170: * incremented from the preceding index values to be unique.
171: *
172: * In the combiningTable[], a variable-length list
173: * of variable-length (back-index, code point) pair entries is stored
174: * for each forward-combining character.
175: *
176: * These back-indexes are the combining indexes of both-ways or backward-only
177: * combining characters that the forward-combining character combines with.
178: *
179: * Each list is sorted in ascending order of back-indexes.
180: * Each list is terminated with the last back-index having bit 15 set.
181: *
182: * Each pair (back-index, code point) takes up either 2 or 3
183: * 16-bit words.
184: * The first word of a list entry is the back-index, with its bit 15 set if
185: * this is the last pair in the list.
186: *
187: * The second word contains flags in bits 15..13 that determine
188: * if there is a third word and how the combined character is encoded:
189: * 15 set if there is a third word in this list entry
190: * 14 set if the result is a supplementary character
191: * 13 set if the result itself combines forward
192: *
193: * According to these bits 15..14 of the second word,
194: * the result character is encoded as follows:
195: * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
196: * the second word.
197: * 10 The result is 0x2000..0xffff and stored in the third word.
198: * Bits 12..0 of the second word are not used.
199: * 11 The result is a supplementary character.
200: * Bits 9..0 of the leading surrogate are in bits 9..0 of
201: * the second word.
202: * Add 0xd800 to these bits to get the complete surrogate.
203: * Bits 12..10 of the second word are not used.
204: * The trailing surrogate is stored in the third word.
205: *
206: *
207: * - FCD trie
208: *
209: * The FCD trie is very simple.
210: * It is a folded trie with 16-bit data words.
211: * In each word, the high byte contains the leading cc of the character,
212: * and the low byte contains the trailing cc of the character.
213: * These cc's are the cc's of the first and last code points in the
214: * canonical decomposition of the character.
215: *
216: * Since all 16 bits are used for cc's, lead surrogates must be tested
217: * by checking the code unit instead of the trie data.
218: * This is done only if the 16-bit data word is not zero.
219: * If the code unit is a leading surrogate and the data word is not zero,
220: * then instead of cc's it contains the offset for the second trie lookup.
221: *
222: *
223: * - Auxiliary trie and data
224: *
225: *
226: * The auxiliary 16-bit trie contains data for additional properties.
227: * Bits
228: * 15..13 reserved
229: * 12 not NFC_Skippable (f) (formatVersion>=2.2)
230: * 11 flag: not a safe starter for canonical closure
231: * 10 composition exclusion
232: * 9.. 0 index into extraData[] to FC_NFKC_Closure string
233: * (not for lead surrogate),
234: * or lead surrogate offset (for lead surrogate, if 9..0 not zero)
235: *
236: * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
237: * (used in NormalizerTransliterator)
238: *
239: * A skippable character is
240: * a) unassigned, or ALL of the following:
241: * b) of combining class 0.
242: * c) not decomposed by this normalization form.
243: * AND if NFC or NFKC,
244: * d) can never compose with a previous character.
245: * e) can never compose with a following character.
246: * f) can never change if another character is added.
247: * Example: a-breve might satisfy all but f, but if you
248: * add an ogonek it changes to a-ogonek + breve
249: *
250: * a)..e) must be tested from norm32.
251: * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
252: * into the auxiliary trie.
253: * The same bit is used for NFC and NFKC; (c) differs for them.
254: * As usual, we build the "not skippable" flags so that unassigned
255: * code points get a 0 bit.
256: * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
257: * Test Hangul LV syllables entirely in code.
258: *
259: *
260: * - FC_NFKC_Closure strings in extraData[]
261: *
262: * Strings are either stored as a single code unit or as the length
263: * followed by that many units.
264: *
265: * - structure inside canonStartSets[]
266: *
267: * This array maps from code points c to sets of code points (USerializedSet).
268: * The result sets are the code points whose canonical decompositions start
269: * with c.
270: *
271: * canonStartSets[] contains the following sub-arrays:
272: *
273: * indexes[_NORM_SET_INDEX_TOP]
274: * - contains lengths of sub-arrays etc.
275: *
276: * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
277: * - contains serialized sets (USerializedSet) of canonical starters for
278: * enumerating canonically equivalent strings
279: * indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
280: * for details about the structure see uset.c
281: *
282: * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
283: * - a sorted search table for BMP code points whose results are
284: * either indexes to USerializedSets or single code points for
285: * single-code point sets;
286: * each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
287: * if yy==01 then there is a USerializedSet at canonStartSets+x
288: * else build a USerializedSet with result as the single code point
289: *
290: * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
291: * - a sorted search table for supplementary code points whose results are
292: * either indexes to USerializedSets or single code points for
293: * single-code point sets;
294: * each entry is a triplet of { high16(cp), low16(cp), result }
295: * each code point's high-word may contain extra data in bits 15..5:
296: * if the high word has bit 15 set, then build a set with a single code point
297: * which is (((high16(cp)&0x1f00)<<8)|result);
298: * else there is a USerializedSet at canonStartSets+result
299: */
300: final class NormalizerDataReader implements ICUBinary.Authenticate {
301: private final static boolean debug = ICUDebug
302: .enabled("NormalizerDataReader");
303:
304: /**
305: * <p>Protected constructor.</p>
306: * @param inputStream ICU uprop.dat file input stream
307: * @exception IOException throw if data file fails authentication
308: * @draft 2.1
309: */
310: protected NormalizerDataReader(InputStream inputStream)
311: throws IOException {
312: if (debug)
313: System.out.println("Bytes in inputStream "
314: + inputStream.available());
315:
316: unicodeVersion = ICUBinary.readHeader(inputStream,
317: DATA_FORMAT_ID, this );
318:
319: if (debug)
320: System.out.println("Bytes left in inputStream "
321: + inputStream.available());
322:
323: dataInputStream = new DataInputStream(inputStream);
324:
325: if (debug)
326: System.out.println("Bytes left in dataInputStream "
327: + dataInputStream.available());
328: }
329:
330: // protected methods -------------------------------------------------
331:
332: protected int[] readIndexes(int length) throws IOException {
333: int[] indexes = new int[length];
334: //Read the indexes
335: for (int i = 0; i < length; i++) {
336: indexes[i] = dataInputStream.readInt();
337: }
338: return indexes;
339: }
340:
341: /**
342: * <p>Reads unorm.icu, parse it into blocks of data to be stored in
343: * NormalizerImpl.</P
344: * @param normBytes
345: * @param fcdBytes
346: * @param auxBytes
347: * @param extraData
348: * @param combiningTable
349: * @param canonStartSets
350: * @exception IOException thrown when data reading fails
351: * @draft 2.1
352: */
353: protected void read(byte[] normBytes, byte[] fcdBytes,
354: byte[] auxBytes, char[] extraData, char[] combiningTable,
355: Object[] canonStartSets) throws IOException {
356:
357: //Read the bytes that make up the normTrie
358: dataInputStream.read(normBytes);
359:
360: //normTrieStream= new ByteArrayInputStream(normBytes);
361:
362: //Read the extra data
363: for (int i = 0; i < extraData.length; i++) {
364: extraData[i] = dataInputStream.readChar();
365: }
366:
367: //Read the combining class table
368: for (int i = 0; i < combiningTable.length; i++) {
369: combiningTable[i] = dataInputStream.readChar();
370: }
371:
372: //Read the fcdTrie
373: dataInputStream.read(fcdBytes);
374:
375: //Read the AuxTrie
376: dataInputStream.read(auxBytes);
377:
378: //Read the canonical start sets
379: int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
380:
381: for (int i = 0; i < canonStartSetsIndexes.length; i++) {
382: canonStartSetsIndexes[i] = dataInputStream.readChar();
383: }
384:
385: char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]
386: - NormalizerImpl.SET_INDEX_TOP];
387:
388: for (int i = 0; i < startSets.length; i++) {
389: startSets[i] = dataInputStream.readChar();
390: }
391: char[] bmpTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
392: for (int i = 0; i < bmpTable.length; i++) {
393: bmpTable[i] = dataInputStream.readChar();
394: }
395: char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
396: for (int i = 0; i < suppTable.length; i++) {
397: suppTable[i] = dataInputStream.readChar();
398: }
399: canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX] = canonStartSetsIndexes;
400: canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
401: canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX] = bmpTable;
402: canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
403: }
404:
405: public byte[] getDataFormatVersion() {
406: return DATA_FORMAT_VERSION;
407: }
408:
409: public boolean isDataVersionAcceptable(byte version[]) {
410: return version[0] == DATA_FORMAT_VERSION[0]
411: && version[2] == DATA_FORMAT_VERSION[2]
412: && version[3] == DATA_FORMAT_VERSION[3];
413: }
414:
415: public byte[] getUnicodeVersion() {
416: return unicodeVersion;
417: }
418:
419: // private data members -------------------------------------------------
420:
421: /**
422: * ICU data file input stream
423: */
424: private DataInputStream dataInputStream;
425:
426: private byte[] unicodeVersion;
427:
428: /**
429: * File format version that this class understands.
430: * No guarantees are made if a older version is used
431: * see store.c of gennorm for more information and values
432: */
433: private static final byte DATA_FORMAT_ID[] = { (byte) 0x4E,
434: (byte) 0x6F, (byte) 0x72, (byte) 0x6D };
435: private static final byte DATA_FORMAT_VERSION[] = { (byte) 0x2,
436: (byte) 0x2, (byte) 0x5, (byte) 0x2 };
437:
438: }
|