001: /*
002: ******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: ******************************************************************************
006: */
007:
008: package com.ibm.icu.impl;
009:
010: import java.io.InputStream;
011: import java.io.DataInputStream;
012: import java.io.IOException;
013: import java.util.Arrays;
014: import com.ibm.icu.text.UTF16;
015: import com.ibm.icu.lang.UCharacter;
016:
017: /**
018: * <p>A trie is a kind of compressed, serializable table of values
019: * associated with Unicode code points (0..0x10ffff).</p>
020: * <p>This class defines the basic structure of a trie and provides methods
021: * to <b>retrieve the offsets to the actual data</b>.</p>
022: * <p>Data will be the form of an array of basic types, char or int.</p>
023: * <p>The actual data format will have to be specified by the user in the
024: * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
025: * <p>This trie implementation is optimized for getting offset while walking
026: * forward through a UTF-16 string.
027: * Therefore, the simplest and fastest access macros are the
028: * fromLead() and fromOffsetTrail() methods.
029: * The fromBMP() method are a little more complicated; they get offsets even
030: * for lead surrogate codepoints, while the fromLead() method get special
031: * "folded" offsets for lead surrogate code units if there is relevant data
032: * associated with them.
033: * From such a folded offsets, an offset needs to be extracted to supply
034: * to the fromOffsetTrail() methods.
035: * To handle such supplementary codepoints, some offset information are kept
036: * in the data.</p>
037: * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
038: * that offset from the folded value for the lead surrogate unit.</p>
039: * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
040: * com.ibm.icu.impl.IntTrie.</p>
041: * @author synwee
042: * @see com.ibm.icu.impl.CharTrie
043: * @see com.ibm.icu.impl.IntTrie
044: * @since release 2.1, Jan 01 2002
045: */
046: public abstract class Trie {
047: // public class declaration ----------------------------------------
048:
049: /**
050: * Character data in com.ibm.impl.Trie have different user-specified format
051: * for different purposes.
052: * This interface specifies methods to be implemented in order for
053: * com.ibm.impl.Trie, to surrogate offset information encapsulated within
054: * the data.
055: * @draft 2.1
056: */
057: public static interface DataManipulate {
058: /**
059: * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
060: * data
061: * the index array offset of the indexes for that lead surrogate.
062: * @param value data value for a surrogate from the trie, including the
063: * folding offset
064: * @return data offset or 0 if there is no data for the lead surrogate
065: * @draft 2.1
066: */
067: public int getFoldingOffset(int value);
068: }
069:
070: // default implementation
071: private static class DefaultGetFoldingOffset implements
072: DataManipulate {
073: public int getFoldingOffset(int value) {
074: return value;
075: }
076: }
077:
078: // public methods --------------------------------------------------
079:
080: /**
081: * Determines if this trie has a linear latin 1 array
082: * @return true if this trie has a linear latin 1 array, false otherwise
083: */
084: public final boolean isLatin1Linear() {
085: return m_isLatin1Linear_;
086: }
087:
088: /**
089: * Checks if the argument Trie has the same data as this Trie.
090: * Attributes are checked but not the index data.
091: * @param other Trie to check
092: * @return true if the argument Trie has the same data as this Trie, false
093: * otherwise
094: */
095: ///CLOVER:OFF
096: public boolean equals(Object other) {
097: if (other == this ) {
098: return true;
099: }
100: if (!(other instanceof Trie)) {
101: return false;
102: }
103: Trie othertrie = (Trie) other;
104: return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
105: && m_options_ == othertrie.m_options_
106: && m_dataLength_ == othertrie.m_dataLength_
107: && Arrays.equals(m_index_, othertrie.m_index_);
108: }
109:
110: ///CLOVER:ON
111:
112: /**
113: * Gets the serialized data file size of the Trie. This is used during
114: * trie data reading for size checking purposes.
115: * @return size size of serialized trie data file in terms of the number
116: * of bytes
117: */
118: public int getSerializedDataSize() {
119: // includes signature, option, dataoffset and datalength output
120: int result = (4 << 2);
121: result += (m_dataOffset_ << 1);
122: if (isCharTrie()) {
123: result += (m_dataLength_ << 1);
124: } else if (isIntTrie()) {
125: result += (m_dataLength_ << 2);
126: }
127: return result;
128: }
129:
130: // protected constructor -------------------------------------------
131:
132: /**
133: * Trie constructor for CharTrie use.
134: * @param inputStream ICU data file input stream which contains the
135: * trie
136: * @param dataManipulate object containing the information to parse the
137: * trie data
138: * @throws IOException thrown when input stream does not have the
139: * right header.
140: * @draft 2.1
141: */
142: protected Trie(InputStream inputStream,
143: DataManipulate dataManipulate) throws IOException {
144: DataInputStream input = new DataInputStream(inputStream);
145: // Magic number to authenticate the data.
146: int signature = input.readInt();
147: m_options_ = input.readInt();
148:
149: if (!checkHeader(signature)) {
150: throw new IllegalArgumentException(
151: "ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
152: }
153:
154: if (dataManipulate != null) {
155: m_dataManipulate_ = dataManipulate;
156: } else {
157: m_dataManipulate_ = new DefaultGetFoldingOffset();
158: }
159: m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
160: m_dataOffset_ = input.readInt();
161: m_dataLength_ = input.readInt();
162: unserialize(inputStream);
163: }
164:
165: /**
166: * Trie constructor
167: * @param index array to be used for index
168: * @param options used by the trie
169: * @param dataManipulate object containing the information to parse the
170: * trie data
171: * @draft 2.2
172: */
173: protected Trie(char index[], int options,
174: DataManipulate dataManipulate) {
175: m_options_ = options;
176: if (dataManipulate != null) {
177: m_dataManipulate_ = dataManipulate;
178: } else {
179: m_dataManipulate_ = new DefaultGetFoldingOffset();
180: }
181: m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
182: m_index_ = index;
183: m_dataOffset_ = m_index_.length;
184: }
185:
186: // protected data members ------------------------------------------
187:
188: /**
189: * Lead surrogate code points' index displacement in the index array.
190: * 0x10000-0xd800=0x2800
191: * 0x2800 >> INDEX_STAGE_1_SHIFT_
192: */
193: protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
194: /**
195: * Shift size for shifting right the input index. 1..9
196: * @draft 2.1
197: */
198: protected static final int INDEX_STAGE_1_SHIFT_ = 5;
199: /**
200: * Shift size for shifting left the index array values.
201: * Increases possible data size with 16-bit index values at the cost
202: * of compactability.
203: * This requires blocks of stage 2 data to be aligned by
204: * DATA_GRANULARITY.
205: * 0..INDEX_STAGE_1_SHIFT
206: * @draft 2.1
207: */
208: protected static final int INDEX_STAGE_2_SHIFT_ = 2;
209: /**
210: * Number of data values in a stage 2 (data array) block.
211: */
212: protected static final int DATA_BLOCK_LENGTH = 1 << INDEX_STAGE_1_SHIFT_;
213: /**
214: * Mask for getting the lower bits from the input index.
215: * DATA_BLOCK_LENGTH - 1.
216: * @draft 2.1
217: */
218: protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
219: /** Number of bits of a trail surrogate that are used in index table lookups. */
220: protected static final int SURROGATE_BLOCK_BITS = 10 - INDEX_STAGE_1_SHIFT_;
221: /**
222: * Number of index (stage 1) entries per lead surrogate.
223: * Same as number of index entries for 1024 trail surrogates,
224: * ==0x400>>INDEX_STAGE_1_SHIFT_
225: */
226: protected static final int SURROGATE_BLOCK_COUNT = (1 << SURROGATE_BLOCK_BITS);
227: /** Length of the BMP portion of the index (stage 1) array. */
228: protected static final int BMP_INDEX_LENGTH = 0x10000 >> INDEX_STAGE_1_SHIFT_;
229: /**
230: * Surrogate mask to use when shifting offset to retrieve supplementary
231: * values
232: * @draft 2.1
233: */
234: protected static final int SURROGATE_MASK_ = 0x3FF;
235: /**
236: * Index or UTF16 characters
237: * @draft 2.1
238: */
239: protected char m_index_[];
240: /**
241: * Internal TrieValue which handles the parsing of the data value.
242: * This class is to be implemented by the user
243: * @draft 2.1
244: */
245: protected DataManipulate m_dataManipulate_;
246: /**
247: * Start index of the data portion of the trie. CharTrie combines
248: * index and data into a char array, so this is used to indicate the
249: * initial offset to the data portion.
250: * Note this index always points to the initial value.
251: * @draft 2.1
252: */
253: protected int m_dataOffset_;
254: /**
255: * Length of the data array
256: */
257: protected int m_dataLength_;
258:
259: // protected methods -----------------------------------------------
260:
261: /**
262: * Gets the offset to the data which the surrogate pair points to.
263: * @param lead lead surrogate
264: * @param trail trailing surrogate
265: * @return offset to data
266: * @draft 2.1
267: */
268: protected abstract int getSurrogateOffset(char lead, char trail);
269:
270: /**
271: * Gets the value at the argument index
272: * @param index value at index will be retrieved
273: * @return 32 bit value
274: * @draft 2.1
275: */
276: protected abstract int getValue(int index);
277:
278: /**
279: * Gets the default initial value
280: * @return 32 bit value
281: * @draft 2.1
282: */
283: protected abstract int getInitialValue();
284:
285: /**
286: * Gets the offset to the data which the index ch after variable offset
287: * points to.
288: * Note for locating a non-supplementary character data offset, calling
289: * <p>
290: * getRawOffset(0, ch);
291: * </p>
292: * will do. Otherwise if it is a supplementary character formed by
293: * surrogates lead and trail. Then we would have to call getRawOffset()
294: * with getFoldingIndexOffset(). See getSurrogateOffset().
295: * @param offset index offset which ch is to start from
296: * @param ch index to be used after offset
297: * @return offset to the data
298: * @draft 2.1
299: */
300: protected final int getRawOffset(int offset, char ch) {
301: return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] << INDEX_STAGE_2_SHIFT_)
302: + (ch & INDEX_STAGE_3_MASK_);
303: }
304:
305: /**
306: * Gets the offset to data which the BMP character points to
307: * Treats a lead surrogate as a normal code point.
308: * @param ch BMP character
309: * @return offset to data
310: * @draft 2.1
311: */
312: protected final int getBMPOffset(char ch) {
313: return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? getRawOffset(
314: LEAD_INDEX_OFFSET_, ch)
315: : getRawOffset(0, ch);
316: // using a getRawOffset(ch) makes no diff
317: }
318:
319: /**
320: * Gets the offset to the data which this lead surrogate character points
321: * to.
322: * Data at the returned offset may contain folding offset information for
323: * the next trailing surrogate character.
324: * @param ch lead surrogate character
325: * @return offset to data
326: * @draft 2.1
327: */
328: protected final int getLeadOffset(char ch) {
329: return getRawOffset(0, ch);
330: }
331:
332: /**
333: * Internal trie getter from a code point.
334: * Could be faster(?) but longer with
335: * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
336: * Gets the offset to data which the codepoint points to
337: * @param ch codepoint
338: * @return offset to data
339: * @draft 2.1
340: */
341: protected final int getCodePointOffset(int ch) {
342: // if ((ch >> 16) == 0) slower
343: if (ch < 0) {
344: return -1;
345: } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
346: // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
347: return getRawOffset(0, (char) ch);
348: } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
349: // BMP codepoint
350: return getBMPOffset((char) ch);
351: } else if (ch <= UCharacter.MAX_VALUE) {
352: // look at the construction of supplementary characters
353: // trail forms the ends of it.
354: return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
355: (char) (ch & SURROGATE_MASK_));
356: } else {
357: // return -1 if there is an error, in this case we return
358: return -1;
359: }
360: }
361:
362: /**
363: * <p>Parses the inputstream and creates the trie index with it.</p>
364: * <p>This is overwritten by the child classes.
365: * @param inputStream input stream containing the trie information
366: * @exception IOException thrown when data reading fails.
367: * @draft 2.1
368: */
369: protected void unserialize(InputStream inputStream)
370: throws IOException {
371: //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
372: m_index_ = new char[m_dataOffset_];
373: DataInputStream input = new DataInputStream(inputStream);
374: for (int i = 0; i < m_dataOffset_; i++) {
375: m_index_[i] = input.readChar();
376: }
377: }
378:
379: /**
380: * Determines if this is a 32 bit trie
381: * @return true if options specifies this is a 32 bit trie
382: * @draft 2.1
383: */
384: protected final boolean isIntTrie() {
385: return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
386: }
387:
388: /**
389: * Determines if this is a 16 bit trie
390: * @return true if this is a 16 bit trie
391: * @draft 2.1
392: */
393: protected final boolean isCharTrie() {
394: return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
395: }
396:
397: // private data members --------------------------------------------
398:
399: // struct UTrieHeader {
400: // int32_t signature;
401: // int32_t options (a bit field)
402: // int32_t indexLength
403: // int32_t dataLength
404:
405: /**
406: * Size of Trie header in bytes
407: */
408: protected static final int HEADER_LENGTH_ = 4 * 4;
409: /**
410: * Latin 1 option mask
411: */
412: protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
413: /**
414: * Constant number to authenticate the byte block
415: */
416: protected static final int HEADER_SIGNATURE_ = 0x54726965;
417: /**
418: * Header option formatting
419: */
420: private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
421: protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
422: protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
423:
424: /**
425: * Flag indicator for Latin quick access data block
426: */
427: private boolean m_isLatin1Linear_;
428:
429: /**
430: * <p>Trie options field.</p>
431: * <p>options bit field:<br>
432: * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
433: * 8 0 = 16-bit data, 1=32-bit data<br>
434: * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
435: * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
436: */
437: private int m_options_;
438:
439: // private methods ---------------------------------------------------
440:
441: /**
442: * Authenticates raw data header.
443: * Checking the header information, signature and options.
444: * @param signature This contains the options and type of a Trie
445: * @return true if the header is authenticated valid
446: * @draft 2.1
447: */
448: private final boolean checkHeader(int signature) {
449: // check the signature
450: // Trie in big-endian US-ASCII (0x54726965).
451: // Magic number to authenticate the data.
452: if (signature != HEADER_SIGNATURE_) {
453: return false;
454: }
455:
456: if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_1_SHIFT_
457: || ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_2_SHIFT_) {
458: return false;
459: }
460: return true;
461: }
462: }
|