001: /*
002: * Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004: *
005: * This code is free software; you can redistribute it and/or modify it
006: * under the terms of the GNU General Public License version 2 only, as
007: * published by the Free Software Foundation. Sun designates this
008: * particular file as subject to the "Classpath" exception as provided
009: * by Sun in the LICENSE file that accompanied this code.
010: *
011: * This code is distributed in the hope that it will be useful, but WITHOUT
012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014: * version 2 for more details (a copy is included in the LICENSE file that
015: * accompanied this code).
016: *
017: * You should have received a copy of the GNU General Public License version
018: * 2 along with this work; if not, write to the Free Software Foundation,
019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020: *
021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022: * CA 95054 USA or visit www.sun.com if you need additional information or
023: * have any questions.
024: */
025:
026: /*
027: *******************************************************************************
028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
029: * *
030: * The original version of this source code and documentation is copyrighted *
031: * and owned by IBM, These materials are provided under terms of a License *
032: * Agreement between IBM and Sun. This technology is protected by multiple *
033: * US and International patents. This notice and attribution to IBM may not *
034: * to removed. *
035: *******************************************************************************
036: */
037:
038: package sun.text.normalizer;
039:
040: /**
041: * <p>Class enabling iteration of the values in a Trie.</p>
042: * <p>Result of each iteration contains the interval of codepoints that have
043: * the same value type and the value type itself.</p>
044: * <p>The comparison of each codepoint value is done via extract(), which the
045: * default implementation is to return the value as it is.</p>
046: * <p>Method extract() can be overwritten to perform manipulations on
047: * codepoint values in order to perform specialized comparison.</p>
048: * <p>TrieIterator is designed to be a generic iterator for the CharTrie
049: * and the IntTrie, hence to accommodate both types of data, the return
050: * result will be in terms of int (32 bit) values.</p>
051: * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
052: * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
053: * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
054: * sense, the caller will have to pass a object with a callback function
055: * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
056: * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
057: * codepoints with the same value as determined by
058: * UTrieEnumValue(const void *context, uint32_t value). for each range,
059: * utrie_enum calls the callback function to perform a task. In this way,
060: * icu4c performs the iteration within utrie_enum.
061: * To follow the JDK model, icu4j is slightly different from icu4c.
062: * Instead of requesting the caller to implement an object for a callback.
063: * The caller will have to implement a subclass of TrieIterator, fleshing out
064: * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
065: * the caller will have to code his own iteration and flesh out the task
066: * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
067: * </p>
068: * <p>There are basically 3 usage scenarios for porting:</p>
069: * <p>1) UTrieEnumValue is the only implemented callback then just implement a
070: * subclass of TrieIterator and override the extract(int) method. The
071: * extract(int) method is analogus to UTrieEnumValue callback.
072: * </p>
073: * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
074: * a subclass of TrieIterator, override the extract method and iterate, e.g
075: * </p>
076: * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
077: * set);<br>
078: * In Java :<br>
079: * <pre>
080: * class TrieIteratorImpl extends TrieIterator{
081: * public TrieIteratorImpl(Trie data){
082: * super(data);
083: * }
084: * public int extract(int value){
085: * // port the implementation of _enumPropertyStartsValue here
086: * }
087: * }
088: * ....
089: * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
090: * while(fcdIter.next(result)) {
091: * // port the implementation of _enumPropertyStartsRange
092: * }
093: * </pre>
094: * </p>
095: * <p>3) UTrieEnumRange is the only implemented callback then just implement
096: * the while loop, when utrie_enum is called
097: * <pre>
098: * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
099: * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
100: * while(fcdIter.next(result)){
101: * set.add(result.start);
102: * }
103: * </pre>
104: * </p>
105: * @author synwee
106: * @see com.ibm.icu.impl.Trie
107: * @see com.ibm.icu.lang.UCharacterTypeIterator
108: * @since release 2.1, Jan 17 2002
109: */
110: public class TrieIterator implements RangeValueIterator
111:
112: {
113: // public constructor ---------------------------------------------
114:
115: /**
116: * TrieEnumeration constructor
117: * @param trie to be used
118: * @exception IllegalArgumentException throw when argument is null.
119: * @draft 2.1
120: */
121: public TrieIterator(Trie trie) {
122: if (trie == null) {
123: throw new IllegalArgumentException(
124: "Argument trie cannot be null");
125: }
126: m_trie_ = trie;
127: // synwee: check that extract belongs to the child class
128: m_initialValue_ = extract(m_trie_.getInitialValue());
129: reset();
130: }
131:
132: // public methods -------------------------------------------------
133:
134: /**
135: * <p>Returns true if we are not at the end of the iteration, false
136: * otherwise.</p>
137: * <p>The next set of codepoints with the same value type will be
138: * calculated during this call and returned in the arguement element.</p>
139: * @param element return result
140: * @return true if we are not at the end of the iteration, false otherwise.
141: * @exception NoSuchElementException - if no more elements exist.
142: * @see com.ibm.icu.util.RangeValueIterator.Element
143: * @draft 2.1
144: */
145: public final boolean next(Element element) {
146: if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
147: return false;
148: }
149: if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE
150: && calculateNextBMPElement(element)) {
151: return true;
152: }
153: calculateNextSupplementaryElement(element);
154: return true;
155: }
156:
157: /**
158: * Resets the iterator to the beginning of the iteration
159: * @draft 2.1
160: */
161: public final void reset() {
162: m_currentCodepoint_ = 0;
163: m_nextCodepoint_ = 0;
164: m_nextIndex_ = 0;
165: m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
166: if (m_nextBlock_ == 0) {
167: m_nextValue_ = m_initialValue_;
168: } else {
169: m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
170: }
171: m_nextBlockIndex_ = 0;
172: m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
173: }
174:
175: // protected methods ----------------------------------------------
176:
177: /**
178: * Called by next() to extracts a 32 bit value from a trie value
179: * used for comparison.
180: * This method is to be overwritten if special manipulation is to be done
181: * to retrieve a relevant comparison.
182: * The default function is to return the value as it is.
183: * @param value a value from the trie
184: * @return extracted value
185: * @draft 2.1
186: */
187: protected int extract(int value) {
188: return value;
189: }
190:
191: // private methods ------------------------------------------------
192:
193: /**
194: * Set the result values
195: * @param element return result object
196: * @param start codepoint of range
197: * @param limit (end + 1) codepoint of range
198: * @param value common value of range
199: */
200: private final void setResult(Element element, int start, int limit,
201: int value) {
202: element.start = start;
203: element.limit = limit;
204: element.value = value;
205: }
206:
207: /**
208: * Finding the next element.
209: * This method is called just before returning the result of
210: * next().
211: * We always store the next element before it is requested.
212: * In the case that we have to continue calculations into the
213: * supplementary planes, a false will be returned.
214: * @param element return result object
215: * @return true if the next range is found, false if we have to proceed to
216: * the supplementary range.
217: */
218: private final boolean calculateNextBMPElement(Element element) {
219: int currentBlock = m_nextBlock_;
220: int currentValue = m_nextValue_;
221: m_currentCodepoint_ = m_nextCodepoint_;
222: m_nextCodepoint_++;
223: m_nextBlockIndex_++;
224: if (!checkBlockDetail(currentValue)) {
225: setResult(element, m_currentCodepoint_, m_nextCodepoint_,
226: currentValue);
227: return true;
228: }
229: // synwee check that next block index == 0 here
230: // enumerate BMP - the main loop enumerates data blocks
231: while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
232: m_nextIndex_++;
233: // because of the way the character is split to form the index
234: // the lead surrogate and trail surrogate can not be in the
235: // mid of a block
236: if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
237: // skip lead surrogate code units,
238: // go to lead surrogate codepoints
239: m_nextIndex_ = BMP_INDEX_LENGTH_;
240: } else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
241: // go back to regular BMP code points
242: m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
243: }
244:
245: m_nextBlockIndex_ = 0;
246: if (!checkBlock(currentBlock, currentValue)) {
247: setResult(element, m_currentCodepoint_,
248: m_nextCodepoint_, currentValue);
249: return true;
250: }
251: }
252: m_nextCodepoint_--; // step one back since this value has not been
253: m_nextBlockIndex_--; // retrieved yet.
254: return false;
255: }
256:
257: /**
258: * Finds the next supplementary element.
259: * For each entry in the trie, the value to be delivered is passed through
260: * extract().
261: * We always store the next element before it is requested.
262: * Called after calculateNextBMP() completes its round of BMP characters.
263: * There is a slight difference in the usage of m_currentCodepoint_
264: * here as compared to calculateNextBMP(). Though both represents the
265: * lower bound of the next element, in calculateNextBMP() it gets set
266: * at the start of any loop, where-else, in calculateNextSupplementary()
267: * since m_currentCodepoint_ already contains the lower bound of the
268: * next element (passed down from calculateNextBMP()), we keep it till
269: * the end before resetting it to the new value.
270: * Note, if there are no more iterations, it will never get to here.
271: * Blocked out by next().
272: * @param element return result object
273: * @draft 2.1
274: */
275: private final void calculateNextSupplementaryElement(Element element) {
276: int currentValue = m_nextValue_;
277: int currentBlock = m_nextBlock_;
278: m_nextCodepoint_++;
279: m_nextBlockIndex_++;
280:
281: if (UTF16.getTrailSurrogate(m_nextCodepoint_) != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
282: // this piece is only called when we are in the middle of a lead
283: // surrogate block
284: if (!checkNullNextTrailIndex()
285: && !checkBlockDetail(currentValue)) {
286: setResult(element, m_currentCodepoint_,
287: m_nextCodepoint_, currentValue);
288: m_currentCodepoint_ = m_nextCodepoint_;
289: return;
290: }
291: // we have cleared one block
292: m_nextIndex_++;
293: m_nextTrailIndexOffset_++;
294: if (!checkTrailBlock(currentBlock, currentValue)) {
295: setResult(element, m_currentCodepoint_,
296: m_nextCodepoint_, currentValue);
297: m_currentCodepoint_ = m_nextCodepoint_;
298: return;
299: }
300: }
301: int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
302: // enumerate supplementary code points
303: while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
304: // lead surrogate access
305: int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_;
306: if (leadBlock == m_trie_.m_dataOffset_) {
307: // no entries for a whole block of lead surrogates
308: if (currentValue != m_initialValue_) {
309: m_nextValue_ = m_initialValue_;
310: m_nextBlock_ = 0;
311: m_nextBlockIndex_ = 0;
312: setResult(element, m_currentCodepoint_,
313: m_nextCodepoint_, currentValue);
314: m_currentCodepoint_ = m_nextCodepoint_;
315: return;
316: }
317:
318: nextLead += DATA_BLOCK_LENGTH_;
319: // number of total affected supplementary codepoints in one
320: // block
321: // this is not a simple addition of
322: // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
323: // that we might have moved some of the codepoints
324: m_nextCodepoint_ = UCharacterProperty
325: .getRawSupplementary((char) nextLead,
326: (char) UTF16.TRAIL_SURROGATE_MIN_VALUE);
327: continue;
328: }
329: if (m_trie_.m_dataManipulate_ == null) {
330: throw new NullPointerException(
331: "The field DataManipulate in this Trie is null");
332: }
333: // enumerate trail surrogates for this lead surrogate
334: m_nextIndex_ = m_trie_.m_dataManipulate_
335: .getFoldingOffset(m_trie_.getValue(leadBlock
336: + (nextLead & Trie.INDEX_STAGE_3_MASK_)));
337: if (m_nextIndex_ <= 0) {
338: // no data for this lead surrogate
339: if (currentValue != m_initialValue_) {
340: m_nextValue_ = m_initialValue_;
341: m_nextBlock_ = 0;
342: m_nextBlockIndex_ = 0;
343: setResult(element, m_currentCodepoint_,
344: m_nextCodepoint_, currentValue);
345: m_currentCodepoint_ = m_nextCodepoint_;
346: return;
347: }
348: m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
349: } else {
350: m_nextTrailIndexOffset_ = 0;
351: if (!checkTrailBlock(currentBlock, currentValue)) {
352: setResult(element, m_currentCodepoint_,
353: m_nextCodepoint_, currentValue);
354: m_currentCodepoint_ = m_nextCodepoint_;
355: return;
356: }
357: }
358: nextLead++;
359: }
360:
361: // deliver last range
362: setResult(element, m_currentCodepoint_,
363: UCharacter.MAX_VALUE + 1, currentValue);
364: }
365:
366: /**
367: * Internal block value calculations
368: * Performs calculations on a data block to find codepoints in m_nextBlock_
369: * after the index m_nextBlockIndex_ that has the same value.
370: * Note m_*_ variables at this point is the next codepoint whose value
371: * has not been calculated.
372: * But when returned with false, it will be the last codepoint whose
373: * value has been calculated.
374: * @param currentValue the value which other codepoints are tested against
375: * @return true if the whole block has the same value as currentValue or if
376: * the whole block has been calculated, false otherwise.
377: */
378: private final boolean checkBlockDetail(int currentValue) {
379: while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
380: m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_
381: + m_nextBlockIndex_));
382: if (m_nextValue_ != currentValue) {
383: return false;
384: }
385: ++m_nextBlockIndex_;
386: ++m_nextCodepoint_;
387: }
388: return true;
389: }
390:
391: /**
392: * Internal block value calculations
393: * Performs calculations on a data block to find codepoints in m_nextBlock_
394: * that has the same value.
395: * Will call checkBlockDetail() if highlevel check fails.
396: * Note m_*_ variables at this point is the next codepoint whose value
397: * has not been calculated.
398: * @param currentBlock the initial block containing all currentValue
399: * @param currentValue the value which other codepoints are tested against
400: * @return true if the whole block has the same value as currentValue or if
401: * the whole block has been calculated, false otherwise.
402: */
403: private final boolean checkBlock(int currentBlock, int currentValue) {
404: m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << Trie.INDEX_STAGE_2_SHIFT_;
405: if (m_nextBlock_ == currentBlock
406: && (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
407: // the block is the same as the previous one, filled with
408: // currentValue
409: m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
410: } else if (m_nextBlock_ == 0) {
411: // this is the all-initial-value block
412: if (currentValue != m_initialValue_) {
413: m_nextValue_ = m_initialValue_;
414: m_nextBlockIndex_ = 0;
415: return false;
416: }
417: m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
418: } else {
419: if (!checkBlockDetail(currentValue)) {
420: return false;
421: }
422: }
423: return true;
424: }
425:
426: /**
427: * Internal block value calculations
428: * Performs calculations on multiple data blocks for a set of trail
429: * surrogates to find codepoints in m_nextBlock_ that has the same value.
430: * Will call checkBlock() for internal block checks.
431: * Note m_*_ variables at this point is the next codepoint whose value
432: * has not been calculated.
433: * @param currentBlock the initial block containing all currentValue
434: * @param currentValue the value which other codepoints are tested against
435: * @return true if the whole block has the same value as currentValue or if
436: * the whole block has been calculated, false otherwise.
437: */
438: private final boolean checkTrailBlock(int currentBlock,
439: int currentValue) {
440: // enumerate code points for this lead surrogate
441: while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) {
442: // if we ever reach here, we are at the start of a new block
443: m_nextBlockIndex_ = 0;
444: // copy of most of the body of the BMP loop
445: if (!checkBlock(currentBlock, currentValue)) {
446: return false;
447: }
448: m_nextTrailIndexOffset_++;
449: m_nextIndex_++;
450: }
451: return true;
452: }
453:
454: /**
455: * Checks if we are beginning at the start of a initial block.
456: * If we are then the rest of the codepoints in this initial block
457: * has the same values.
458: * We increment m_nextCodepoint_ and relevant data members if so.
459: * This is used only in for the supplementary codepoints because
460: * the offset to the trail indexes could be 0.
461: * @return true if we are at the start of a initial block.
462: */
463: private final boolean checkNullNextTrailIndex() {
464: if (m_nextIndex_ <= 0) {
465: m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
466: int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
467: int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_;
468: if (m_trie_.m_dataManipulate_ == null) {
469: throw new NullPointerException(
470: "The field DataManipulate in this Trie is null");
471: }
472: m_nextIndex_ = m_trie_.m_dataManipulate_
473: .getFoldingOffset(m_trie_.getValue(leadBlock
474: + (nextLead & Trie.INDEX_STAGE_3_MASK_)));
475: m_nextIndex_--;
476: m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
477: return true;
478: }
479: return false;
480: }
481:
482: // private data members --------------------------------------------
483:
484: /**
485: * Size of the stage 1 BMP indexes
486: */
487: private static final int BMP_INDEX_LENGTH_ = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
488: /**
489: * Lead surrogate minimum value
490: */
491: private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
492: /**
493: * Trail surrogate minimum value
494: */
495: private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
496: /**
497: * Trail surrogate maximum value
498: */
499: private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
500: /**
501: * Number of trail surrogate
502: */
503: private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
504: /**
505: * Number of stage 1 indexes for supplementary calculations that maps to
506: * each lead surrogate character.
507: * See second pass into getRawOffset for the trail surrogate character.
508: * 10 for significant number of bits for trail surrogates, 5 for what we
509: * discard during shifting.
510: */
511: private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
512: /**
513: * Number of data values in a stage 2 (data array) block.
514: */
515: private static final int DATA_BLOCK_LENGTH_ = 1 << Trie.INDEX_STAGE_1_SHIFT_;
516: /**
517: * Number of codepoints in a stage 2 block
518: */
519: private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ = DATA_BLOCK_LENGTH_ << 10;
520: /**
521: * Trie instance
522: */
523: private Trie m_trie_;
524: /**
525: * Initial value for trie values
526: */
527: private int m_initialValue_;
528: /**
529: * Next element results and data.
530: */
531: private int m_currentCodepoint_;
532: private int m_nextCodepoint_;
533: private int m_nextValue_;
534: private int m_nextIndex_;
535: private int m_nextBlock_;
536: private int m_nextBlockIndex_;
537: private int m_nextTrailIndexOffset_;
538: /**
539: * This is the return result element
540: */
541: private int m_start_;
542: private int m_limit_;
543: private int m_value_;
544: }
|