001: /*
002: ******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: ******************************************************************************
006: */
007:
008: package com.ibm.icu.impl;
009:
010: import com.ibm.icu.lang.UCharacter;
011: import com.ibm.icu.text.UTF16;
012: import com.ibm.icu.util.RangeValueIterator;
013:
014: /**
015: * <p>Class enabling iteration of the values in a Trie.</p>
016: * <p>Result of each iteration contains the interval of codepoints that have
017: * the same value type and the value type itself.</p>
018: * <p>The comparison of each codepoint value is done via extract(), which the
019: * default implementation is to return the value as it is.</p>
020: * <p>Method extract() can be overwritten to perform manipulations on
021: * codepoint values in order to perform specialized comparison.</p>
022: * <p>TrieIterator is designed to be a generic iterator for the CharTrie
023: * and the IntTrie, hence to accommodate both types of data, the return
024: * result will be in terms of int (32 bit) values.</p>
025: * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
026: * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
027: * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
028: * sense, the caller will have to pass a object with a callback function
029: * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
030: * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
031: * codepoints with the same value as determined by
032: * UTrieEnumValue(const void *context, uint32_t value). for each range,
033: * utrie_enum calls the callback function to perform a task. In this way,
034: * icu4c performs the iteration within utrie_enum.
035: * To follow the JDK model, icu4j is slightly different from icu4c.
036: * Instead of requesting the caller to implement an object for a callback.
037: * The caller will have to implement a subclass of TrieIterator, fleshing out
038: * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
039: * the caller will have to code his own iteration and flesh out the task
040: * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
041: * </p>
042: * <p>There are basically 3 usage scenarios for porting:</p>
043: * <p>1) UTrieEnumValue is the only implemented callback then just implement a
044: * subclass of TrieIterator and override the extract(int) method. The
045: * extract(int) method is analogus to UTrieEnumValue callback.
046: * </p>
047: * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
048: * a subclass of TrieIterator, override the extract method and iterate, e.g
049: * </p>
050: * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
051: * set);<br>
052: * In Java :<br>
053: * <pre>
054: * class TrieIteratorImpl extends TrieIterator{
055: * public TrieIteratorImpl(Trie data){
056: * super(data);
057: * }
058: * public int extract(int value){
059: * // port the implementation of _enumPropertyStartsValue here
060: * }
061: * }
062: * ....
063: * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
064: * while(fcdIter.next(result)) {
065: * // port the implementation of _enumPropertyStartsRange
066: * }
067: * </pre>
068: * </p>
069: * <p>3) UTrieEnumRange is the only implemented callback then just implement
070: * the while loop, when utrie_enum is called
071: * <pre>
072: * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
073: * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
074: * while(fcdIter.next(result)){
075: * set.add(result.start);
076: * }
077: * </p>
078: * @author synwee
079: * @see com.ibm.icu.impl.Trie
080: * @see com.ibm.icu.lang.UCharacterTypeIterator
081: * @since release 2.1, Jan 17 2002
082: */
083: public class TrieIterator implements RangeValueIterator
084:
085: {
086: // public constructor ---------------------------------------------
087:
088: /**
089: * TrieEnumeration constructor
090: * @param trie to be used
091: * @exception IllegalArgumentException throw when argument is null.
092: * @draft 2.1
093: */
094: public TrieIterator(Trie trie) {
095: if (trie == null) {
096: throw new IllegalArgumentException(
097: "Argument trie cannot be null");
098: }
099: m_trie_ = trie;
100: // synwee: check that extract belongs to the child class
101: m_initialValue_ = extract(m_trie_.getInitialValue());
102: reset();
103: }
104:
105: // public methods -------------------------------------------------
106:
107: /**
108: * <p>Returns true if we are not at the end of the iteration, false
109: * otherwise.</p>
110: * <p>The next set of codepoints with the same value type will be
111: * calculated during this call and returned in the arguement element.</p>
112: * @param element return result
113: * @return true if we are not at the end of the iteration, false otherwise.
114: * @exception NoSuchElementException - if no more elements exist.
115: * @see com.ibm.icu.util.RangeValueIterator.Element
116: * @draft 2.1
117: */
118: public final boolean next(Element element) {
119: if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
120: return false;
121: }
122: if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE
123: && calculateNextBMPElement(element)) {
124: return true;
125: }
126: calculateNextSupplementaryElement(element);
127: return true;
128: }
129:
130: /**
131: * Resets the iterator to the beginning of the iteration
132: * @draft 2.1
133: */
134: public final void reset() {
135: m_currentCodepoint_ = 0;
136: m_nextCodepoint_ = 0;
137: m_nextIndex_ = 0;
138: m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
139: if (m_nextBlock_ == 0) {
140: m_nextValue_ = m_initialValue_;
141: } else {
142: m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
143: }
144: m_nextBlockIndex_ = 0;
145: m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
146: }
147:
148: // protected methods ----------------------------------------------
149:
150: /**
151: * Called by next() to extracts a 32 bit value from a trie value
152: * used for comparison.
153: * This method is to be overwritten if special manipulation is to be done
154: * to retrieve a relevant comparison.
155: * The default function is to return the value as it is.
156: * @param value a value from the trie
157: * @return extracted value
158: * @draft 2.1
159: */
160: protected int extract(int value) {
161: return value;
162: }
163:
164: // private methods ------------------------------------------------
165:
166: /**
167: * Set the result values
168: * @param element return result object
169: * @param start codepoint of range
170: * @param limit (end + 1) codepoint of range
171: * @param value common value of range
172: */
173: private final void setResult(Element element, int start, int limit,
174: int value) {
175: element.start = start;
176: element.limit = limit;
177: element.value = value;
178: }
179:
180: /**
181: * Finding the next element.
182: * This method is called just before returning the result of
183: * next().
184: * We always store the next element before it is requested.
185: * In the case that we have to continue calculations into the
186: * supplementary planes, a false will be returned.
187: * @param element return result object
188: * @return true if the next range is found, false if we have to proceed to
189: * the supplementary range.
190: */
191: private final boolean calculateNextBMPElement(Element element) {
192: int currentBlock = m_nextBlock_;
193: int currentValue = m_nextValue_;
194: m_currentCodepoint_ = m_nextCodepoint_;
195: m_nextCodepoint_++;
196: m_nextBlockIndex_++;
197: if (!checkBlockDetail(currentValue)) {
198: setResult(element, m_currentCodepoint_, m_nextCodepoint_,
199: currentValue);
200: return true;
201: }
202: // synwee check that next block index == 0 here
203: // enumerate BMP - the main loop enumerates data blocks
204: while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
205: m_nextIndex_++;
206: // because of the way the character is split to form the index
207: // the lead surrogate and trail surrogate can not be in the
208: // mid of a block
209: if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
210: // skip lead surrogate code units,
211: // go to lead surrogate codepoints
212: m_nextIndex_ = BMP_INDEX_LENGTH_;
213: } else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
214: // go back to regular BMP code points
215: m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
216: }
217:
218: m_nextBlockIndex_ = 0;
219: if (!checkBlock(currentBlock, currentValue)) {
220: setResult(element, m_currentCodepoint_,
221: m_nextCodepoint_, currentValue);
222: return true;
223: }
224: }
225: m_nextCodepoint_--; // step one back since this value has not been
226: m_nextBlockIndex_--; // retrieved yet.
227: return false;
228: }
229:
230: /**
231: * Finds the next supplementary element.
232: * For each entry in the trie, the value to be delivered is passed through
233: * extract().
234: * We always store the next element before it is requested.
235: * Called after calculateNextBMP() completes its round of BMP characters.
236: * There is a slight difference in the usage of m_currentCodepoint_
237: * here as compared to calculateNextBMP(). Though both represents the
238: * lower bound of the next element, in calculateNextBMP() it gets set
239: * at the start of any loop, where-else, in calculateNextSupplementary()
240: * since m_currentCodepoint_ already contains the lower bound of the
241: * next element (passed down from calculateNextBMP()), we keep it till
242: * the end before resetting it to the new value.
243: * Note, if there are no more iterations, it will never get to here.
244: * Blocked out by next().
245: * @param element return result object
246: * @draft 2.1
247: */
248: private final void calculateNextSupplementaryElement(Element element) {
249: int currentValue = m_nextValue_;
250: int currentBlock = m_nextBlock_;
251: m_nextCodepoint_++;
252: m_nextBlockIndex_++;
253:
254: if (UTF16.getTrailSurrogate(m_nextCodepoint_) != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
255: // this piece is only called when we are in the middle of a lead
256: // surrogate block
257: if (!checkNullNextTrailIndex()
258: && !checkBlockDetail(currentValue)) {
259: setResult(element, m_currentCodepoint_,
260: m_nextCodepoint_, currentValue);
261: m_currentCodepoint_ = m_nextCodepoint_;
262: return;
263: }
264: // we have cleared one block
265: m_nextIndex_++;
266: m_nextTrailIndexOffset_++;
267: if (!checkTrailBlock(currentBlock, currentValue)) {
268: setResult(element, m_currentCodepoint_,
269: m_nextCodepoint_, currentValue);
270: m_currentCodepoint_ = m_nextCodepoint_;
271: return;
272: }
273: }
274: int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
275: // enumerate supplementary code points
276: while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
277: // lead surrogate access
278: int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_;
279: if (leadBlock == m_trie_.m_dataOffset_) {
280: // no entries for a whole block of lead surrogates
281: if (currentValue != m_initialValue_) {
282: m_nextValue_ = m_initialValue_;
283: m_nextBlock_ = 0;
284: m_nextBlockIndex_ = 0;
285: setResult(element, m_currentCodepoint_,
286: m_nextCodepoint_, currentValue);
287: m_currentCodepoint_ = m_nextCodepoint_;
288: return;
289: }
290:
291: nextLead += DATA_BLOCK_LENGTH_;
292: // number of total affected supplementary codepoints in one
293: // block
294: // this is not a simple addition of
295: // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
296: // that we might have moved some of the codepoints
297: m_nextCodepoint_ = UCharacterProperty
298: .getRawSupplementary((char) nextLead,
299: (char) UTF16.TRAIL_SURROGATE_MIN_VALUE);
300: continue;
301: }
302: if (m_trie_.m_dataManipulate_ == null) {
303: throw new NullPointerException(
304: "The field DataManipulate in this Trie is null");
305: }
306: // enumerate trail surrogates for this lead surrogate
307: m_nextIndex_ = m_trie_.m_dataManipulate_
308: .getFoldingOffset(m_trie_.getValue(leadBlock
309: + (nextLead & Trie.INDEX_STAGE_3_MASK_)));
310: if (m_nextIndex_ <= 0) {
311: // no data for this lead surrogate
312: if (currentValue != m_initialValue_) {
313: m_nextValue_ = m_initialValue_;
314: m_nextBlock_ = 0;
315: m_nextBlockIndex_ = 0;
316: setResult(element, m_currentCodepoint_,
317: m_nextCodepoint_, currentValue);
318: m_currentCodepoint_ = m_nextCodepoint_;
319: return;
320: }
321: m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
322: } else {
323: m_nextTrailIndexOffset_ = 0;
324: if (!checkTrailBlock(currentBlock, currentValue)) {
325: setResult(element, m_currentCodepoint_,
326: m_nextCodepoint_, currentValue);
327: m_currentCodepoint_ = m_nextCodepoint_;
328: return;
329: }
330: }
331: nextLead++;
332: }
333:
334: // deliver last range
335: setResult(element, m_currentCodepoint_,
336: UCharacter.MAX_VALUE + 1, currentValue);
337: }
338:
339: /**
340: * Internal block value calculations
341: * Performs calculations on a data block to find codepoints in m_nextBlock_
342: * after the index m_nextBlockIndex_ that has the same value.
343: * Note m_*_ variables at this point is the next codepoint whose value
344: * has not been calculated.
345: * But when returned with false, it will be the last codepoint whose
346: * value has been calculated.
347: * @param currentValue the value which other codepoints are tested against
348: * @return true if the whole block has the same value as currentValue or if
349: * the whole block has been calculated, false otherwise.
350: */
351: private final boolean checkBlockDetail(int currentValue) {
352: while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
353: m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_
354: + m_nextBlockIndex_));
355: if (m_nextValue_ != currentValue) {
356: return false;
357: }
358: ++m_nextBlockIndex_;
359: ++m_nextCodepoint_;
360: }
361: return true;
362: }
363:
364: /**
365: * Internal block value calculations
366: * Performs calculations on a data block to find codepoints in m_nextBlock_
367: * that has the same value.
368: * Will call checkBlockDetail() if highlevel check fails.
369: * Note m_*_ variables at this point is the next codepoint whose value
370: * has not been calculated.
371: * @param currentBlock the initial block containing all currentValue
372: * @param currentValue the value which other codepoints are tested against
373: * @return true if the whole block has the same value as currentValue or if
374: * the whole block has been calculated, false otherwise.
375: */
376: private final boolean checkBlock(int currentBlock, int currentValue) {
377: m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << Trie.INDEX_STAGE_2_SHIFT_;
378: if (m_nextBlock_ == currentBlock
379: && (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
380: // the block is the same as the previous one, filled with
381: // currentValue
382: m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
383: } else if (m_nextBlock_ == 0) {
384: // this is the all-initial-value block
385: if (currentValue != m_initialValue_) {
386: m_nextValue_ = m_initialValue_;
387: m_nextBlockIndex_ = 0;
388: return false;
389: }
390: m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
391: } else {
392: if (!checkBlockDetail(currentValue)) {
393: return false;
394: }
395: }
396: return true;
397: }
398:
399: /**
400: * Internal block value calculations
401: * Performs calculations on multiple data blocks for a set of trail
402: * surrogates to find codepoints in m_nextBlock_ that has the same value.
403: * Will call checkBlock() for internal block checks.
404: * Note m_*_ variables at this point is the next codepoint whose value
405: * has not been calculated.
406: * @param currentBlock the initial block containing all currentValue
407: * @param currentValue the value which other codepoints are tested against
408: * @return true if the whole block has the same value as currentValue or if
409: * the whole block has been calculated, false otherwise.
410: */
411: private final boolean checkTrailBlock(int currentBlock,
412: int currentValue) {
413: // enumerate code points for this lead surrogate
414: while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) {
415: // if we ever reach here, we are at the start of a new block
416: m_nextBlockIndex_ = 0;
417: // copy of most of the body of the BMP loop
418: if (!checkBlock(currentBlock, currentValue)) {
419: return false;
420: }
421: m_nextTrailIndexOffset_++;
422: m_nextIndex_++;
423: }
424: return true;
425: }
426:
427: /**
428: * Checks if we are beginning at the start of a initial block.
429: * If we are then the rest of the codepoints in this initial block
430: * has the same values.
431: * We increment m_nextCodepoint_ and relevant data members if so.
432: * This is used only in for the supplementary codepoints because
433: * the offset to the trail indexes could be 0.
434: * @return true if we are at the start of a initial block.
435: */
436: private final boolean checkNullNextTrailIndex() {
437: if (m_nextIndex_ <= 0) {
438: m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
439: int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
440: int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_;
441: if (m_trie_.m_dataManipulate_ == null) {
442: throw new NullPointerException(
443: "The field DataManipulate in this Trie is null");
444: }
445: m_nextIndex_ = m_trie_.m_dataManipulate_
446: .getFoldingOffset(m_trie_.getValue(leadBlock
447: + (nextLead & Trie.INDEX_STAGE_3_MASK_)));
448: m_nextIndex_--;
449: m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
450: return true;
451: }
452: return false;
453: }
454:
455: // private data members --------------------------------------------
456:
457: /**
458: * Size of the stage 1 BMP indexes
459: */
460: private static final int BMP_INDEX_LENGTH_ = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
461: /**
462: * Lead surrogate minimum value
463: */
464: private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
465: /**
466: * Trail surrogate minimum value
467: */
468: private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
469: /**
470: * Trail surrogate maximum value
471: */
472: private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
473: /**
474: * Number of trail surrogate
475: */
476: private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
477: /**
478: * Number of stage 1 indexes for supplementary calculations that maps to
479: * each lead surrogate character.
480: * See second pass into getRawOffset for the trail surrogate character.
481: * 10 for significant number of bits for trail surrogates, 5 for what we
482: * discard during shifting.
483: */
484: private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
485: /**
486: * Number of data values in a stage 2 (data array) block.
487: */
488: private static final int DATA_BLOCK_LENGTH_ = 1 << Trie.INDEX_STAGE_1_SHIFT_;
489: /**
490: * Number of codepoints in a stage 2 block
491: */
492: private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ = DATA_BLOCK_LENGTH_ << 10;
493: /**
494: * Trie instance
495: */
496: private Trie m_trie_;
497: /**
498: * Initial value for trie values
499: */
500: private int m_initialValue_;
501: /**
502: * Next element results and data.
503: */
504: private int m_currentCodepoint_;
505: private int m_nextCodepoint_;
506: private int m_nextValue_;
507: private int m_nextIndex_;
508: private int m_nextBlock_;
509: private int m_nextBlockIndex_;
510: private int m_nextTrailIndexOffset_;
511: /**
512: * This is the return result element
513: */
514: private int m_start_;
515: private int m_limit_;
516: private int m_value_;
517: }
|