001: /*
002: ******************************************************************************
003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: ******************************************************************************
006: */
007:
008: package com.ibm.icu.impl;
009:
010: import com.ibm.icu.lang.UCharacter;
011: import java.util.Arrays;
012:
013: /**
014: * Builder class to manipulate and generate a trie.
015: * This is useful for ICU data in primitive types.
016: * Provides a compact way to store information that is indexed by Unicode
017: * values, such as character properties, types, keyboard values, etc. This is
018: * very useful when you have a block of Unicode data that contains significant
019: * values while the rest of the Unicode data is unused in the application or
020: * when you have a lot of redundance, such as where all 21,000 Han ideographs
021: * have the same value. However, lookup is much faster than a hash table.
022: * A trie of any primitive data type serves two purposes:
023: * <UL type = round>
024: * <LI>Fast access of the indexed values.
025: * <LI>Smaller memory footprint.
026: * </UL>
027: * This is a direct port from the ICU4C version
028: * @author Syn Wee Quek
029: */
030: public class TrieBuilder {
031: // public data member ----------------------------------------------
032:
033: /**
034: * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
035: * 0x200
036: */
037: public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
038:
039: // public class declaration ----------------------------------------
040:
041: /**
042: * Character data in com.ibm.impl.Trie have different user-specified format
043: * for different purposes.
044: * This interface specifies methods to be implemented in order for
045: * com.ibm.impl.Trie, to surrogate offset information encapsulated within
046: * the data.
047: * @draft 2.2
048: */
049: public static interface DataManipulate {
050: /**
051: * Build-time trie callback function, used with serialize().
052: * This function calculates a lead surrogate's value including a
053: * folding offset from the 1024 supplementary code points
054: * [start..start+1024[ .
055: * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
056: * The folding offset is provided by the caller.
057: * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
058: * with n=0..1023.
059: * Instead of the offset itself, n can be stored in 10 bits - or fewer
060: * if it can be assumed that few lead surrogates have associated data.
061: * The returned value must be
062: * - not zero if and only if there is relevant data for the
063: * corresponding 1024 supplementary code points
064: * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
065: * offset))==offset
066: * @return a folded value, or 0 if there is no relevant data for the
067: * lead surrogate.
068: */
069: public int getFoldedValue(int start, int offset);
070: }
071:
072: // public methods ----------------------------------------------------
073:
074: /**
075: * Checks if the character belongs to a zero block in the trie
076: * @param ch codepoint which data is to be retrieved
077: * @return true if ch is in the zero block
078: */
079: public boolean isInZeroBlock(int ch) {
080: // valid, uncompacted trie and valid c?
081: if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
082: || ch < UCharacter.MIN_VALUE) {
083: return true;
084: }
085:
086: return m_index_[ch >> SHIFT_] == 0;
087: }
088:
089: // package private method -----------------------------------------------
090:
091: // protected data member -----------------------------------------------
092:
093: /**
094: * Index values at build-time are 32 bits wide for easier processing.
095: * Bit 31 is set if the data block is used by multiple index values
096: * (from setRange()).
097: */
098: protected int m_index_[];
099: protected int m_indexLength_;
100: protected int m_dataCapacity_;
101: protected int m_dataLength_;
102: protected boolean m_isLatin1Linear_;
103: protected boolean m_isCompacted_;
104: /**
105: * Map of adjusted indexes, used in utrie_compact().
106: * Maps from original indexes to new ones.
107: */
108: protected int m_map_[];
109:
110: /**
111: * Shift size for shifting right the input index. 1..9
112: */
113: protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
114: /**
115: * Length of the index (stage 1) array before folding.
116: * Maximum number of Unicode code points (0x110000) shifted right by
117: * SHIFT.
118: */
119: protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
120: /**
121: * Length of the BMP portion of the index (stage 1) array.
122: */
123: protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
124: /**
125: * Number of index (stage 1) entries per lead surrogate.
126: * Same as number of indexe entries for 1024 trail surrogates,
127: * ==0x400>>UTRIE_SHIFT
128: * 10 - SHIFT == Number of bits of a trail surrogate that are used in
129: * index table lookups.
130: */
131: protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
132: /**
133: * Mask for getting the lower bits from the input index.
134: * DATA_BLOCK_LENGTH - 1.
135: */
136: protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
137: /**
138: * Shift size for shifting left the index array values.
139: * Increases possible data size with 16-bit index values at the cost
140: * of compactability.
141: * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
142: * 0..UTRIE_SHIFT
143: */
144: protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
145: /**
146: * Maximum length of the runtime data (stage 2) array.
147: * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
148: */
149: protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
150: /**
151: * Shifting to position the index value in options
152: */
153: protected static final int OPTIONS_INDEX_SHIFT_ = 4;
154: /**
155: * If set, then the data (stage 2) array is 32 bits wide.
156: */
157: protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
158: /**
159: * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
160: * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
161: */
162: protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
163: /**
164: * The alignment size of a stage 2 data block. Also the granularity for
165: * compaction.
166: */
167: protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
168:
169: // protected constructor ----------------------------------------------
170:
171: protected TrieBuilder() {
172: m_index_ = new int[MAX_INDEX_LENGTH_];
173: m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
174: m_isLatin1Linear_ = false;
175: m_isCompacted_ = false;
176: m_indexLength_ = MAX_INDEX_LENGTH_;
177: }
178:
179: protected TrieBuilder(TrieBuilder table) {
180: m_index_ = new int[MAX_INDEX_LENGTH_];
181: m_indexLength_ = table.m_indexLength_;
182: System
183: .arraycopy(table.m_index_, 0, m_index_, 0,
184: m_indexLength_);
185: m_dataCapacity_ = table.m_dataCapacity_;
186: m_dataLength_ = table.m_dataLength_;
187: m_map_ = new int[table.m_map_.length];
188: System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
189: m_isLatin1Linear_ = table.m_isLatin1Linear_;
190: m_isCompacted_ = table.m_isCompacted_;
191: }
192:
193: // protected functions ------------------------------------------------
194:
195: /**
196: * Compare two sections of an array for equality.
197: */
198: protected static final boolean equal_int(int[] array, int start1,
199: int start2, int length) {
200: while (length > 0 && array[start1] == array[start2]) {
201: ++start1;
202: ++start2;
203: --length;
204: }
205: return length == 0;
206: }
207:
208: /**
209: * Set a value in the trie index map to indicate which data block
210: * is referenced and which one is not.
211: * utrie_compact() will remove data blocks that are not used at all.
212: * Set
213: * - 0 if it is used
214: * - -1 if it is not used
215: */
216: protected void findUnusedBlocks() {
217: // fill the entire map with "not used"
218: Arrays.fill(m_map_, 0xff);
219:
220: // mark each block that _is_ used with 0
221: for (int i = 0; i < m_indexLength_; ++i) {
222: m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
223: }
224:
225: // never move the all-initial-value block 0
226: m_map_[0] = 0;
227: }
228:
229: /**
230: * Finds the same index block as the otherBlock
231: * @param index array
232: * @param indexLength size of index
233: * @param otherBlock
234: * @return same index block
235: */
236: protected static final int findSameIndexBlock(int index[],
237: int indexLength, int otherBlock) {
238: for (int block = BMP_INDEX_LENGTH_; block < indexLength; block += SURROGATE_BLOCK_COUNT_) {
239: if (equal_int(index, block, otherBlock,
240: SURROGATE_BLOCK_COUNT_)) {
241: return block;
242: }
243: }
244: return indexLength;
245: }
246:
247: // private data member ------------------------------------------------
248:
249: /**
250: * Maximum length of the build-time data (stage 2) array.
251: * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
252: * (Number of Unicode code points + one all-initial-value block +
253: * possible duplicate entries for 1024 lead surrogates.)
254: */
255: private static final int MAX_BUILD_TIME_DATA_LENGTH_ = 0x110000 + DATA_BLOCK_LENGTH + 0x400;
256: }
|