001: /*
002: * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: /*
027: *******************************************************************************
028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
029: * *
030: * The original version of this source code and documentation is copyrighted *
031: * and owned by IBM, These materials are provided under terms of a License *
032: * Agreement between IBM and Sun. This technology is protected by multiple *
033: * US and International patents. This notice and attribution to IBM may not *
034: * to removed. *
035: *******************************************************************************
036: */
037:
038: package sun.text.normalizer;
039:
040: import java.io.InputStream;
041: import java.io.DataInputStream;
042: import java.io.IOException;
043:
044: /**
045: * <p>Internal reader class for ICU data file uprops.icu containing
046: * Unicode codepoint data.</p>
047: * <p>This class simply reads uprops.icu, authenticates that it is a valid
048: * ICU data file and splits its contents up into blocks of data for use in
049: * <a href="UCharacterProperty.html">com.ibm.icu.impl.UCharacterProperty</a>.
050: * </p>
051: * <p>uprops.icu, which is in big-endian format, is packaged in the same jar
052: * as this package.</p>
053: * @author Syn Wee Quek
054: * @since release 2.1, February 1st 2002
055: * @draft 2.1
056: */
057: /* Unicode character properties file format ------------------------------------
058:
059: The file format prepared and written here contains several data
060: structures that store indexes or data.
061:
062:
063:
064: The following is a description of format version 3.
065:
066: Data contents:
067:
068: The contents is a parsed, binary form of several Unicode character
069: database files, most prominently UnicodeData.txt.
070:
071: Any Unicode code point from 0 to 0x10ffff can be looked up to get
072: the properties, if any, for that code point. This means that the input
073: to the lookup are 21-bit unsigned integers, with not all of the
074: 21-bit range used.
075:
076: It is assumed that client code keeps a uint32_t pointer
077: to the beginning of the data:
078:
079: const uint32_t *p32;
080:
081: Formally, the file contains the following structures:
082:
083: const int32_t indexes[16] with values i0..i15:
084:
085: i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
086: i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
087: i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
088:
089: i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
090: i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
091: i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
092:
093: i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
094: i7..i9 reservedIndexes; -- reserved values; 0 for now
095:
096: i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
097: i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
098: i12..i15 reservedIndexes; -- reserved values; 0 for now
099:
100: PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
101:
102: P const uint32_t props32[i1-i0];
103: E const uint32_t exceptions[i2-i1];
104: U const UChar uchars[2*(i3-i2)];
105:
106: AT serialized trie for additional properties (byte size: 4*(i4-i3))
107: PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
108:
109: Trie lookup and properties:
110:
111: In order to condense the data for the 21-bit code space, several properties of
112: the Unicode code assignment are exploited:
113: - The code space is sparse.
114: - There are several 10k of consecutive codes with the same properties.
115: - Characters and scripts are allocated in groups of 16 code points.
116: - Inside blocks for scripts the properties are often repetitive.
117: - The 21-bit space is not fully used for Unicode.
118:
119: The lookup of properties for a given code point is done with a trie lookup,
120: using the UTrie implementation.
121: The trie lookup result is a 16-bit index in the props32[] table where the
122: actual 32-bit properties word is stored. This is done to save space.
123:
124: (There are thousands of 16-bit entries in the trie data table, but
125: only a few hundred unique 32-bit properties words.
126: If the trie data table contained 32-bit words directly, then that would be
127: larger because the length of the table would be the same as now but the
128: width would be 32 bits instead of 16. This saves more than 10kB.)
129:
130: With a given Unicode code point
131:
132: UChar32 c;
133:
134: and 0<=c<0x110000, the lookup is done like this:
135:
136: uint16_t i;
137: UTRIE_GET16(c, i);
138: uint32_t props=p32[i];
139:
140: For some characters, not all of the properties can be efficiently encoded
141: using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
142: array:
143:
144: if(props&EXCEPTION_BIT) {
145: uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
146: ...
147: }
148:
149: The exception values are a variable number of uint32_t starting at
150:
151: const uint32_t *pe=p32+exceptionsIndex+e;
152:
153: The first uint32_t there contains flags about what values actually follow it.
154: Some of the exception values are UChar32 code points for the case mappings,
155: others are numeric values etc.
156:
157: 32-bit properties sets:
158:
159: Each 32-bit properties word contains:
160:
161: 0.. 4 general category
162: 5 has exception values
163: 6..10 BiDi category
164: 11 is mirrored
165: 12..14 numericType:
166: 0 no numeric value
167: 1 decimal digit value
168: 2 digit value
169: 3 numeric value
170: ### TODO: type 4 for Han digits & numbers?!
171: 15..19 reserved
172: 20..31 value according to bits 0..5:
173: if(has exception) {
174: exception index;
175: } else switch(general category) {
176: case Ll: delta to uppercase; -- same as titlecase
177: case Lu: -delta to lowercase; -- titlecase is same as c
178: case Lt: -delta to lowercase; -- uppercase is same as c
179: default:
180: if(is mirrored) {
181: delta to mirror;
182: } else if(numericType!=0) {
183: numericValue;
184: } else {
185: 0;
186: };
187: }
188:
189: Exception values:
190:
191: In the first uint32_t exception word for a code point,
192: bits
193: 31..16 reserved
194: 15..0 flags that indicate which values follow:
195:
196: bit
197: 0 has uppercase mapping
198: 1 has lowercase mapping
199: 2 has titlecase mapping
200: 3 unused
201: 4 has numeric value (numerator)
202: if numericValue=0x7fffff00+x then numericValue=10^x
203: 5 has denominator value
204: 6 has a mirror-image Unicode code point
205: 7 has SpecialCasing.txt entries
206: 8 has CaseFolding.txt entries
207:
208: According to the flags in this word, one or more uint32_t words follow it
209: in the sequence of the bit flags in the flags word; if a flag is not set,
210: then the value is missing or 0:
211:
212: For the case mappings and the mirror-image Unicode code point,
213: one uint32_t or UChar32 each is the code point.
214: If the titlecase mapping is missing, then it is the same as the uppercase mapping.
215:
216: For the digit values, bits 31..16 contain the decimal digit value, and
217: bits 15..0 contain the digit value. A value of -1 indicates that
218: this value is missing.
219:
220: For the numeric/numerator value, an int32_t word contains the value directly,
221: except for when there is no numerator but a denominator, then the numerator
222: is implicitly 1. This means:
223: numerator denominator result
224: none none none
225: x none x
226: none y 1/y
227: x y x/y
228:
229: If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
230:
231: For the denominator value, a uint32_t word contains the value directly.
232:
233: For special casing mappings, the 32-bit exception word contains:
234: 31 if set, this character has complex, conditional mappings
235: that are not stored;
236: otherwise, the mappings are stored according to the following bits
237: 30..24 number of UChars used for mappings
238: 23..16 reserved
239: 15.. 0 UChar offset from the beginning of the UChars array where the
240: UChars for the special case mappings are stored in the following format:
241:
242: Format of special casing UChars:
243: One UChar value with lengths as follows:
244: 14..10 number of UChars for titlecase mapping
245: 9.. 5 number of UChars for uppercase mapping
246: 4.. 0 number of UChars for lowercase mapping
247:
248: Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
249:
250: For case folding mappings, the 32-bit exception word contains:
251: 31..24 number of UChars used for the full mapping
252: 23..16 reserved
253: 15.. 0 UChar offset from the beginning of the UChars array where the
254: UChars for the special case mappings are stored in the following format:
255:
256: Format of case folding UChars:
257: Two UChars contain the simple mapping as follows:
258: 0, 0 no simple mapping
259: BMP,0 a simple mapping to a BMP code point
260: s1, s2 a simple mapping to a supplementary code point stored as two surrogates
261: This is followed by the UChars for the full case folding mappings.
262:
263: Example:
264: U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
265: mapping and a numeric value.
266: Its exception values would be stored as 3 uint32_t words:
267:
268: - flags=0x12 (bits 1 and 4, see above) with combining class 0
269: - lowercase mapping 0x2170
270: - numeric value=1
271:
272: --- Additional properties (new in format version 2.1) ---
273:
274: The second trie for additional properties (AT) is also a UTrie with 16-bit data.
275: The data words consist of 32-bit unit indexes (not row indexes!) into the
276: table of unique properties vectors (PV).
277: Each vector contains a set of properties.
278: The width of a vector (number of uint32_t per row) may change
279: with the formatVersion, it is stored in i5.
280:
281: Current properties: see icu/source/common/uprops.h
282:
283: --- Changes in format version 3.1 ---
284:
285: See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
286:
287: --- Changes in format version 3.2 ---
288:
289: - The tries use linear Latin-1 ranges.
290: - The additional properties bits store full properties XYZ instead
291: of partial Other_XYZ, so that changes in the derivation formulas
292: need not be tracked in runtime library code.
293: - Joining Type and Line Break are also stored completely, so that uprops.c
294: needs no runtime formulas for enumerated properties either.
295: - Store the case-sensitive flag in the main properties word.
296: - i10 also contains U_LB_COUNT and U_EA_COUNT.
297: - i11 contains maxValues2 for vector word 2.
298:
299: ----------------------------------------------------------------------------- */
300:
301: final class UCharacterPropertyReader implements ICUBinary.Authenticate {
302: // public methods ----------------------------------------------------
303:
304: public boolean isDataVersionAcceptable(byte version[]) {
305: return version[0] == DATA_FORMAT_VERSION_[0]
306: && version[2] == DATA_FORMAT_VERSION_[2]
307: && version[3] == DATA_FORMAT_VERSION_[3];
308: }
309:
310: // protected constructor ---------------------------------------------
311:
312: /**
313: * <p>Protected constructor.</p>
314: * @param inputStream ICU uprop.dat file input stream
315: * @exception IOException throw if data file fails authentication
316: * @draft 2.1
317: */
318: protected UCharacterPropertyReader(InputStream inputStream)
319: throws IOException {
320: m_unicodeVersion_ = ICUBinary.readHeader(inputStream,
321: DATA_FORMAT_ID_, this );
322: m_dataInputStream_ = new DataInputStream(inputStream);
323: }
324:
325: // protected methods -------------------------------------------------
326:
327: /**
328: * <p>Reads uprops.icu, parse it into blocks of data to be stored in
329: * UCharacterProperty.</P
330: * @param ucharppty UCharacterProperty instance
331: * @exception thrown when data reading fails
332: * @draft 2.1
333: */
334: protected void read(UCharacterProperty ucharppty)
335: throws IOException {
336: // read the indexes
337: int count = INDEX_SIZE_;
338: m_propertyOffset_ = m_dataInputStream_.readInt();
339: count--;
340: m_exceptionOffset_ = m_dataInputStream_.readInt();
341: count--;
342: m_caseOffset_ = m_dataInputStream_.readInt();
343: count--;
344: m_additionalOffset_ = m_dataInputStream_.readInt();
345: count--;
346: m_additionalVectorsOffset_ = m_dataInputStream_.readInt();
347: count--;
348: m_additionalColumnsCount_ = m_dataInputStream_.readInt();
349: count--;
350: m_reservedOffset_ = m_dataInputStream_.readInt();
351: count--;
352: m_dataInputStream_.skipBytes(3 << 2);
353: count -= 3;
354: ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt();
355: count--; // 10
356: ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt();
357: count--; // 11
358: m_dataInputStream_.skipBytes(count << 2);
359:
360: // read the trie index block
361: // m_props_index_ in terms of ints
362: ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, ucharppty);
363:
364: // reads the 32 bit properties block
365: int size = m_exceptionOffset_ - m_propertyOffset_;
366: ucharppty.m_property_ = new int[size];
367: for (int i = 0; i < size; i++) {
368: ucharppty.m_property_[i] = m_dataInputStream_.readInt();
369: }
370:
371: // reads the 32 bit exceptions block
372: size = m_caseOffset_ - m_exceptionOffset_;
373: ucharppty.m_exception_ = new int[size];
374: for (int i = 0; i < size; i++) {
375: ucharppty.m_exception_[i] = m_dataInputStream_.readInt();
376: }
377:
378: // reads the 32 bit case block
379: size = (m_additionalOffset_ - m_caseOffset_) << 1;
380: ucharppty.m_case_ = new char[size];
381: for (int i = 0; i < size; i++) {
382: ucharppty.m_case_[i] = m_dataInputStream_.readChar();
383: }
384:
385: // reads the additional property block
386: ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_,
387: ucharppty);
388:
389: // additional properties
390: size = m_reservedOffset_ - m_additionalVectorsOffset_;
391: ucharppty.m_additionalVectors_ = new int[size];
392: for (int i = 0; i < size; i++) {
393: ucharppty.m_additionalVectors_[i] = m_dataInputStream_
394: .readInt();
395: }
396:
397: m_dataInputStream_.close();
398: ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_;
399: ucharppty.m_unicodeVersion_ = VersionInfo.getInstance(
400: (int) m_unicodeVersion_[0], (int) m_unicodeVersion_[1],
401: (int) m_unicodeVersion_[2], (int) m_unicodeVersion_[3]);
402: }
403:
404: // private variables -------------------------------------------------
405:
406: /**
407: * Index size
408: */
409: private static final int INDEX_SIZE_ = 16;
410:
411: /**
412: * ICU data file input stream
413: */
414: private DataInputStream m_dataInputStream_;
415:
416: /**
417: * Offset information in the indexes.
418: */
419: private int m_propertyOffset_;
420: private int m_exceptionOffset_;
421: private int m_caseOffset_;
422: private int m_additionalOffset_;
423: private int m_additionalVectorsOffset_;
424: private int m_additionalColumnsCount_;
425: private int m_reservedOffset_;
426: private byte m_unicodeVersion_[];
427:
428: /**
429: * File format version that this class understands.
430: * No guarantees are made if a older version is used
431: */
432: private static final byte DATA_FORMAT_ID_[] = { (byte) 0x55,
433: (byte) 0x50, (byte) 0x72, (byte) 0x6F };
434: private static final byte DATA_FORMAT_VERSION_[] = { (byte) 0x3,
435: (byte) 0x1, (byte) Trie.INDEX_STAGE_1_SHIFT_,
436: (byte) Trie.INDEX_STAGE_2_SHIFT_ };
437: }
|