0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: *
0007: *
0008: *******************************************************************************
0009: */package com.ibm.icu.text;
0010:
0011: /***
0012: * import java.text.StringCharacterIterator;
0013: * import java.text.CharacterIterator;
0014: */
0015: import com.ibm.icu.impl.NormalizerImpl;
0016: import com.ibm.icu.impl.UCharacterProperty;
0017: import com.ibm.icu.impl.StringUCharacterIterator;
0018: import com.ibm.icu.impl.CharacterIteratorWrapper;
0019: import com.ibm.icu.impl.ICUDebug;
0020: import com.ibm.icu.lang.UCharacter;
0021: import java.text.CharacterIterator;
0022: import java.util.MissingResourceException;
0023:
0024: /**
0025: * <p><code>CollationElementIterator</code> is an iterator created by
0026: * a RuleBasedCollator to walk through a string. The return result of
0027: * each iteration is a 32-bit collation element that defines the
0028: * ordering priority of the next character or sequence of characters
0029: * in the source string.</p>
0030: *
0031: * <p>For illustration, consider the following in Spanish:
0032: * <blockquote>
0033: * <pre>
0034: * "ca" -> the first collation element is collation_element('c') and second
0035: * collation element is collation_element('a').
0036: *
0037: * Since "ch" in Spanish sorts as one entity, the below example returns one
0038: * collation element for the two characters 'c' and 'h'
0039: *
0040: * "cha" -> the first collation element is collation_element('ch') and second
0041: * collation element is collation_element('a').
0042: * </pre>
0043: * </blockquote>
0044: * And in German,
0045: * <blockquote>
0046: * <pre>
0047: * Since the character 'æ' is a composed character of 'a' and 'e', the
0048: * iterator returns two collation elements for the single character 'æ'
0049: *
0050: * "æb" -> the first collation element is collation_element('a'), the
0051: * second collation element is collation_element('e'), and the
0052: * third collation element is collation_element('b').
0053: * </pre>
0054: * </blockquote>
0055: * </p>
0056: *
 * <p>For collation ordering comparison, the collation element results
 * cannot be compared simply by using basic arithmetic operators,
 * e.g. &lt;, == or &gt;; further processing has to be done. Details
 * can be found in the ICU
 * <a href="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
 * user guide</a>. An example of using the CollationElementIterator
 * for collation ordering comparison is the class
 * <a href="StringSearch.html">com.ibm.icu.text.StringSearch</a>.</p>
0065: *
0066: * <p>To construct a CollationElementIterator object, users
0067: * call the method getCollationElementIterator() on a
0068: * RuleBasedCollator that defines the desired sorting order.</p>
0069: *
0070: * <p> Example:
0071: * <blockquote>
0072: * <pre>
0073: * String testString = "This is a test";
0074: * RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
0075: * CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
0076: * int primaryOrder = iterator.IGNORABLE;
0077: * while (primaryOrder != iterator.NULLORDER) {
0078: * int order = iterator.next();
0079: * if (order != iterator.IGNORABLE &&
0080: * order != iterator.NULLORDER) {
0081: * // order is valid, not ignorable and we have not passed the end
0082: * // of the iteration, we do something
0083: * primaryOrder = CollationElementIterator.primaryOrder(order);
0084: * System.out.println("Next primary order 0x" +
0085: * Integer.toHexString(primaryOrder));
0086: * }
0087: * }
0088: * </pre>
0089: * </blockquote>
0090: * </p>
0091: * <p>
0092: * This class is not subclassable
0093: * </p>
0094: * @see Collator
0095: * @see RuleBasedCollator
0096: * @see StringSearch
0097: * @author Syn Wee Quek
0098: * @stable ICU 2.8
0099: */
0100: public final class CollationElementIterator {
0101:
0102: // public data members --------------------------------------------------
0103:
/**
 * <p>Sentinel returned by next() and previous() when the end or the
 * beginning of the source string has been reached and there are no more
 * valid collation elements to return. All 32 bits are set, i.e. the
 * value is -1 when read as a signed int.</p>
 *
 * <p>See class documentation for an example of use.</p>
 * @stable ICU 2.8
 * @see #next
 * @see #previous
 */
public final static int NULLORDER = 0xffffffff;

/**
 * <p>Value returned by next() and previous() when the collation element
 * produced is to be ignored by the caller.</p>
 *
 * <p>See class documentation for an example of use.</p>
 * @stable ICU 2.8
 * @see #next
 * @see #previous
 */
public static final int IGNORABLE = 0;
0126:
0127: // public methods -------------------------------------------------------
0128:
0129: // public getters -------------------------------------------------------
0130:
0131: /**
0132: * <p>Returns the character offset in the source string
0133: * corresponding to the next collation element. I.e., getOffset()
0134: * returns the position in the source string corresponding to the
0135: * collation element that will be returned by the next call to
0136: * next(). This value could be any of:
0137: * <ul>
0138: * <li> The index of the <b>first</b> character corresponding to
0139: * the next collation element. (This means that if
0140: * <code>setOffset(offset)</code> sets the index in the middle of
0141: * a contraction, <code>getOffset()</code> returns the index of
0142: * the first character in the contraction, which may not be equal
0143: * to the original offset that was set. Hence calling getOffset()
0144: * immediately after setOffset(offset) does not guarantee that the
0145: * original offset set will be returned.)
0146: * <li> If normalization is on, the index of the <b>immediate</b>
0147: * subsequent character, or composite character with the first
0148: * character, having a combining class of 0.
0149: * <li> The length of the source string, if iteration has reached
0150: * the end.
0151: *</ul>
0152: * </p>
0153: * @return The character offset in the source string corresponding to the
0154: * collation element that will be returned by the next call to
0155: * next().
0156: * @stable ICU 2.8
0157: */
0158: public int getOffset() {
0159: if (m_bufferOffset_ != -1) {
0160: if (m_isForwards_) {
0161: return m_FCDLimit_;
0162: }
0163: return m_FCDStart_;
0164: }
0165: return m_source_.getIndex();
0166: }
0167:
0168: /**
0169: * <p> Returns the maximum length of any expansion sequence that ends with
0170: * the specified collation element. If there is no expansion with this
0171: * collation element as the last element, returns 1.
0172: * </p>
0173: * @param ce a collation element returned by previous() or next().
0174: * @return the maximum length of any expansion sequence ending
0175: * with the specified collation element.
0176: * @stable ICU 2.8
0177: */
0178: public int getMaxExpansion(int ce) {
0179: int start = 0;
0180: int limit = m_collator_.m_expansionEndCE_.length;
0181: long unsignedce = ce & 0xFFFFFFFFl;
0182: while (start < limit - 1) {
0183: int mid = start + ((limit - start) >> 1);
0184: long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
0185: if (unsignedce <= midce) {
0186: limit = mid;
0187: } else {
0188: start = mid;
0189: }
0190: }
0191: int result = 1;
0192: if (m_collator_.m_expansionEndCE_[start] == ce) {
0193: result = m_collator_.m_expansionEndCEMaxSize_[start];
0194: } else if (limit < m_collator_.m_expansionEndCE_.length
0195: && m_collator_.m_expansionEndCE_[limit] == ce) {
0196: result = m_collator_.m_expansionEndCEMaxSize_[limit];
0197: } else if ((ce & 0xFFFF) == 0x00C0) {
0198: result = 2;
0199: }
0200: return result;
0201: }
0202:
0203: // public other methods -------------------------------------------------
0204:
0205: /**
0206: * <p> Resets the cursor to the beginning of the string. The next
0207: * call to next() or previous() will return the first and last
0208: * collation element in the string, respectively.</p>
0209: *
0210: * <p>If the RuleBasedCollator used by this iterator has had its
0211: * attributes changed, calling reset() will reinitialize the
0212: * iterator to use the new attributes.</p>
0213: *
0214: * @stable ICU 2.8
0215: */
0216: public void reset() {
0217: m_source_.setToStart();
0218: updateInternalState();
0219: }
0220:
0221: /**
0222: * <p>Get the next collation element in the source string.</p>
0223: *
0224: * <p>This iterator iterates over a sequence of collation elements
0225: * that were built from the string. Because there isn't
0226: * necessarily a one-to-one mapping from characters to collation
0227: * elements, this doesn't mean the same thing as "return the
0228: * collation element [or ordering priority] of the next character
0229: * in the string".</p>
0230: *
0231: * <p>This function returns the collation element that the
0232: * iterator is currently pointing to, and then updates the
0233: * internal pointer to point to the next element. Previous()
0234: * updates the pointer first, and then returns the element. This
0235: * means that when you change direction while iterating (i.e.,
0236: * call next() and then call previous(), or call previous() and
0237: * then call next()), you'll get back the same element twice.</p>
0238: *
0239: * @return the next collation element or NULLORDER if the end of the
0240: * iteration has been reached.
0241: * @stable ICU 2.8
0242: */
0243: public int next() {
0244: m_isForwards_ = true;
0245: if (m_CEBufferSize_ > 0) {
0246: if (m_CEBufferOffset_ < m_CEBufferSize_) {
0247: // if there are expansions left in the buffer, we return it
0248: return m_CEBuffer_[m_CEBufferOffset_++];
0249: }
0250: m_CEBufferSize_ = 0;
0251: m_CEBufferOffset_ = 0;
0252: }
0253:
0254: int ch_int = nextChar();
0255:
0256: if (ch_int == UCharacterIterator.DONE) {
0257: return NULLORDER;
0258: }
0259: char ch = (char) ch_int;
0260: if (m_collator_.m_isHiragana4_) {
0261: m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309e)
0262: && !(ch > 0x3094 && ch < 0x309d);
0263: }
0264:
0265: int result = NULLORDER;
0266: if (ch <= 0xFF) {
0267: // For latin-1 characters we never need to fall back to the UCA
0268: // table because all of the UCA data is replicated in the
0269: // latinOneMapping array
0270: result = m_collator_.m_trie_.getLatin1LinearValue(ch);
0271: if (RuleBasedCollator.isSpecial(result)) {
0272: result = nextSpecial(m_collator_, result, ch);
0273: }
0274: } else {
0275: result = m_collator_.m_trie_.getLeadValue(ch);
0276: //System.out.println(Integer.toHexString(result));
0277: if (RuleBasedCollator.isSpecial(result)) {
0278: // surrogate leads are handled as special ces
0279: result = nextSpecial(m_collator_, result, ch);
0280: }
0281: if (result == CE_NOT_FOUND_
0282: && RuleBasedCollator.UCA_ != null) {
0283: // couldn't find a good CE in the tailoring
0284: // if we got here, the codepoint MUST be over 0xFF - so we look
0285: // directly in the UCA
0286: result = RuleBasedCollator.UCA_.m_trie_
0287: .getLeadValue(ch);
0288: if (RuleBasedCollator.isSpecial(result)) {
0289: // UCA also gives us a special CE
0290: result = nextSpecial(RuleBasedCollator.UCA_,
0291: result, ch);
0292: }
0293: }
0294: }
0295: if (result == CE_NOT_FOUND_) {
0296: // maybe there is no UCA, unlikely in Java, but ported for consistency
0297: result = nextImplicit(ch);
0298: }
0299: return result;
0300: }
0301:
0302: /**
0303: * <p>Get the previous collation element in the source string.</p>
0304: *
0305: * <p>This iterator iterates over a sequence of collation elements
0306: * that were built from the string. Because there isn't
0307: * necessarily a one-to-one mapping from characters to collation
0308: * elements, this doesn't mean the same thing as "return the
0309: * collation element [or ordering priority] of the previous
0310: * character in the string".</p>
0311: *
0312: * <p>This function updates the iterator's internal pointer to
0313: * point to the collation element preceding the one it's currently
0314: * pointing to and then returns that element, while next() returns
0315: * the current element and then updates the pointer. This means
0316: * that when you change direction while iterating (i.e., call
0317: * next() and then call previous(), or call previous() and then
0318: * call next()), you'll get back the same element twice.</p>
0319: *
0320: * @return the previous collation element, or NULLORDER when the start of
0321: * the iteration has been reached.
0322: * @stable ICU 2.8
0323: */
0324: public int previous() {
0325: if (m_source_.getIndex() <= 0 && m_isForwards_) {
0326: // if iterator is new or reset, we can immediate perform backwards
0327: // iteration even when the offset is not right.
0328: m_source_.setToLimit();
0329: updateInternalState();
0330: }
0331: m_isForwards_ = false;
0332: int result = NULLORDER;
0333: if (m_CEBufferSize_ > 0) {
0334: if (m_CEBufferOffset_ > 0) {
0335: return m_CEBuffer_[--m_CEBufferOffset_];
0336: }
0337: m_CEBufferSize_ = 0;
0338: m_CEBufferOffset_ = 0;
0339: }
0340: int ch_int = previousChar();
0341: if (ch_int == UCharacterIterator.DONE) {
0342: return NULLORDER;
0343: }
0344: char ch = (char) ch_int;
0345: if (m_collator_.m_isHiragana4_) {
0346: m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
0347: }
0348: if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
0349: result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
0350: } else {
0351: if (ch <= 0xFF) {
0352: result = m_collator_.m_trie_.getLatin1LinearValue(ch);
0353: } else {
0354: result = m_collator_.m_trie_.getLeadValue(ch);
0355: }
0356: if (RuleBasedCollator.isSpecial(result)) {
0357: result = previousSpecial(m_collator_, result, ch);
0358: }
0359: if (result == CE_NOT_FOUND_) {
0360: if (!isBackwardsStart()
0361: && m_collator_.isContractionEnd(ch)) {
0362: result = CE_CONTRACTION_;
0363: } else {
0364: if (RuleBasedCollator.UCA_ != null) {
0365: result = RuleBasedCollator.UCA_.m_trie_
0366: .getLeadValue(ch);
0367: }
0368: }
0369:
0370: if (RuleBasedCollator.isSpecial(result)) {
0371: if (RuleBasedCollator.UCA_ != null) {
0372: result = previousSpecial(
0373: RuleBasedCollator.UCA_, result, ch);
0374: }
0375: }
0376: }
0377: }
0378: if (result == CE_NOT_FOUND_) {
0379: result = previousImplicit(ch);
0380: }
0381: return result;
0382: }
0383:
0384: /**
0385: * Return the primary order of the specified collation element,
0386: * i.e. the first 16 bits. This value is unsigned.
0387: * @param ce the collation element
0388: * @return the element's 16 bits primary order.
0389: * @stable ICU 2.8
0390: */
0391: public final static int primaryOrder(int ce) {
0392: return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
0393: }
0394:
0395: /**
0396: * Return the secondary order of the specified collation element,
0397: * i.e. the 16th to 23th bits, inclusive. This value is unsigned.
0398: * @param ce the collation element
0399: * @return the element's 8 bits secondary order
0400: * @stable ICU 2.8
0401: */
0402: public final static int secondaryOrder(int ce) {
0403: return (ce & RuleBasedCollator.CE_SECONDARY_MASK_) >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
0404: }
0405:
0406: /**
0407: * Return the tertiary order of the specified collation element, i.e. the last
0408: * 8 bits. This value is unsigned.
0409: * @param ce the collation element
0410: * @return the element's 8 bits tertiary order
0411: * @stable ICU 2.8
0412: */
0413: public final static int tertiaryOrder(int ce) {
0414: return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
0415: }
0416:
0417: /**
0418: * <p> Sets the iterator to point to the collation element
0419: * corresponding to the character at the specified offset. The
0420: * value returned by the next call to next() will be the collation
0421: * element corresponding to the characters at offset.</p>
0422: *
0423: * <p>If offset is in the middle of a contracting character
0424: * sequence, the iterator is adjusted to the start of the
0425: * contracting sequence. This means that getOffset() is not
0426: * guaranteed to return the same value set by this method.</p>
0427: *
0428: * <p>If the decomposition mode is on, and offset is in the middle
0429: * of a decomposible range of source text, the iterator may not
0430: * return a correct result for the next forwards or backwards
0431: * iteration. The user must ensure that the offset is not in the
0432: * middle of a decomposible range.</p>
0433: *
0434: * @param offset the character offset into the original source string to
0435: * set. Note that this is not an offset into the corresponding
0436: * sequence of collation elements.
0437: * @stable ICU 2.8
0438: */
0439: public void setOffset(int offset) {
0440: m_source_.setIndex(offset);
0441: int ch_int = m_source_.current();
0442: char ch = (char) ch_int;
0443: if (ch_int != UCharacterIterator.DONE
0444: && m_collator_.isUnsafe(ch)) {
0445: // if it is unsafe we need to check if it is part of a contraction
0446: // or a surrogate character
0447: if (UTF16.isTrailSurrogate(ch)) {
0448: // if it is a surrogate pair we move up one character
0449: char prevch = (char) m_source_.previous();
0450: if (!UTF16.isLeadSurrogate(prevch)) {
0451: m_source_.setIndex(offset); // go back to the same index
0452: }
0453: } else {
0454: // could be part of a contraction
0455: // backup to a safe point and iterate till we pass offset
0456: while (m_source_.getIndex() > 0) {
0457: if (!m_collator_.isUnsafe(ch)) {
0458: break;
0459: }
0460: ch = (char) m_source_.previous();
0461: }
0462: updateInternalState();
0463: int prevoffset = 0;
0464: while (m_source_.getIndex() <= offset) {
0465: prevoffset = m_source_.getIndex();
0466: next();
0467: }
0468: m_source_.setIndex(prevoffset);
0469: }
0470: }
0471: updateInternalState();
0472: // direction code to prevent next and previous from returning a
0473: // character if we are already at the ends
0474: offset = m_source_.getIndex();
0475: if (offset == 0/* m_source_.getBeginIndex() */) {
0476: // preventing previous() from returning characters from the end of
0477: // the string again if we are at the beginning
0478: m_isForwards_ = false;
0479: } else if (offset == m_source_.getLength()) {
0480: // preventing next() from returning characters from the start of
0481: // the string again if we are at the end
0482: m_isForwards_ = true;
0483: }
0484: }
0485:
0486: /**
0487: * <p>Set a new source string for iteration, and reset the offset
0488: * to the beginning of the text.</p>
0489: *
0490: * @param source the new source string for iteration.
0491: * @stable ICU 2.8
0492: */
0493: public void setText(String source) {
0494: m_srcUtilIter_.setText(source);
0495: m_source_ = m_srcUtilIter_;
0496: updateInternalState();
0497: }
0498:
0499: /**
0500: * <p>Set a new source string iterator for iteration, and reset the
0501: * offset to the beginning of the text.
0502: * </p>
0503: * <p>The source iterator's integrity will be preserved since a new copy
0504: * will be created for use.</p>
0505: * @param source the new source string iterator for iteration.
0506: * @stable ICU 2.8
0507: */
0508: public void setText(UCharacterIterator source) {
0509: m_srcUtilIter_.setText(source.getText());
0510: m_source_ = m_srcUtilIter_;
0511: updateInternalState();
0512: }
0513:
0514: /**
0515: * <p>Set a new source string iterator for iteration, and reset the
0516: * offset to the beginning of the text.
0517: * </p>
0518: * @param source the new source string iterator for iteration.
0519: * @stable ICU 2.8
0520: */
0521: public void setText(CharacterIterator source) {
0522: m_source_ = new CharacterIteratorWrapper(source);
0523: m_source_.setToStart();
0524: updateInternalState();
0525: }
0526:
0527: // public miscellaneous methods -----------------------------------------
0528:
0529: /**
0530: * Tests that argument object is equals to this CollationElementIterator.
0531: * Iterators are equal if the objects uses the same RuleBasedCollator,
0532: * the same source text and have the same current position in iteration.
0533: * @param that object to test if it is equals to this
0534: * CollationElementIterator
0535: * @stable ICU 2.8
0536: */
0537: public boolean equals(Object that) {
0538: if (that == this ) {
0539: return true;
0540: }
0541: if (that instanceof CollationElementIterator) {
0542: CollationElementIterator thatceiter = (CollationElementIterator) that;
0543: if (!m_collator_.equals(thatceiter.m_collator_)) {
0544: return false;
0545: }
0546: // checks the text
0547: return m_source_.getIndex() == thatceiter.m_source_
0548: .getIndex()
0549: && m_source_.getText().equals(
0550: thatceiter.m_source_.getText());
0551: }
0552: return false;
0553: }
0554:
0555: // package private constructors ------------------------------------------
0556:
/**
 * <p>Creates an iterator over a source string using the given
 * RuleBasedCollator. The iterator walks through the source string based
 * on the rules defined by the collator. If the source string is empty,
 * NULLORDER is returned on the first call to next().</p>
 *
 * @param source the source string.
 * @param collator the RuleBasedCollator
 * @stable ICU 2.8
 */
CollationElementIterator(String source, RuleBasedCollator collator) {
    m_collator_ = collator;

    // source text: held by the iterator-owned string iterator
    m_srcUtilIter_ = new StringUCharacterIterator(source);
    m_source_ = m_srcUtilIter_;

    // working buffers
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    updateInternalState();
}
0578:
/**
 * <p>Creates an iterator over a source character iterator using the
 * given RuleBasedCollator. The iterator walks through the source text
 * based on the rules defined by the collator. If the source text is
 * empty, NULLORDER is returned on the first call to next().</p>
 *
 * @param source the source string iterator.
 * @param collator the RuleBasedCollator
 * @stable ICU 2.8
 */
CollationElementIterator(CharacterIterator source,
                         RuleBasedCollator collator) {
    m_collator_ = collator;

    // source text: the argument is wrapped; the utility string iterator
    // is created empty for later setText calls
    m_source_ = new CharacterIteratorWrapper(source);
    m_srcUtilIter_ = new StringUCharacterIterator();

    // working buffers
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    updateInternalState();
}
0601:
/**
 * <p>Creates an iterator over the text of a UCharacterIterator using
 * the given RuleBasedCollator. The iterator walks through the source
 * text based on the rules defined by the collator. If the source text
 * is empty, NULLORDER is returned on the first call to next().</p>
 *
 * <p>The text of the argument is copied into an iterator owned by this
 * object; the argument itself is not retained.</p>
 *
 * @param source the source string iterator.
 * @param collator the RuleBasedCollator
 * @stable ICU 2.8
 */
CollationElementIterator(UCharacterIterator source,
                         RuleBasedCollator collator) {
    m_collator_ = collator;

    // source text: copied out of the argument
    m_srcUtilIter_ = new StringUCharacterIterator();
    m_srcUtilIter_.setText(source.getText());
    m_source_ = m_srcUtilIter_;

    // working buffers
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    updateInternalState();
}
0625:
0626: // package private data members -----------------------------------------
0627:
/**
 * true if the code unit most recently read was classified as Hiragana.
 * Updated by next() and previous(), and only while the collator's
 * m_isHiragana4_ flag is set.
 */
boolean m_isCodePointHiragana_;
/**
 * Position in the original string that starts with a non-FCD sequence.
 * getOffset() reports this value while the iterator is inside the buffer
 * and travelling backwards.
 */
int m_FCDStart_;
/**
 * Index of the CE in the CE buffer that should be returned next.
 * Initial value is 0.
 * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
 * backwards will end with m_CEBufferOffset_ == 0.
 * The next/previous after we reach the end/beginning of the m_CEBuffer_
 * will cause this value to be reset to 0.
 */
int m_CEBufferOffset_;

/**
 * This is the position up to which processed CEs have been stored.
 * Initial value is 0.
 * The next/previous after we reach the end/beginning of the m_CEBuffer_
 * will cause this value to be reset to 0.
 */
int m_CEBufferSize_;
/**
 * Lookup result meaning that no collation element was found. next() and
 * previous() react to it by consulting the UCA collator and, failing
 * that, by computing an implicit collation element.
 */
static final int CE_NOT_FOUND_ = 0xF0000000;
/**
 * Tag value for expansion collation elements.
 */
static final int CE_EXPANSION_TAG_ = 1;
/**
 * Tag value for contraction collation elements.
 */
static final int CE_CONTRACTION_TAG_ = 2;
/**
 * Tag value for the Collate Digits As Numbers (CODAN) implementation
 */
static final int CE_DIGIT_TAG_ = 13;
0660:
0661: // package private methods ----------------------------------------------
0662:
0663: /**
0664: * Sets the collator used.
0665: * Internal use, all data members will be reset to the default values
0666: * @param collator to set
0667: */
0668: void setCollator(RuleBasedCollator collator) {
0669: m_collator_ = collator;
0670: updateInternalState();
0671: }
0672:
0673: /**
0674: * <p>Sets the iterator to point to the collation element corresponding to
0675: * the specified character (the parameter is a CHARACTER offset in the
0676: * original string, not an offset into its corresponding sequence of
0677: * collation elements). The value returned by the next call to next()
0678: * will be the collation element corresponding to the specified position
0679: * in the text. Unlike the public method setOffset(int), this method does
0680: * not try to readjust the offset to the start of a contracting sequence.
0681: * getOffset() is guaranteed to return the same value as was passed to a
0682: * preceding call to setOffset().</p>
0683: * @param offset new character offset into the original text to set.
0684: */
0685: void setExactOffset(int offset) {
0686: m_source_.setIndex(offset);
0687: updateInternalState();
0688: }
0689:
0690: /**
0691: * Checks if iterator is in the buffer zone
0692: * @return true if iterator is in buffer zone, false otherwise
0693: */
0694: boolean isInBuffer() {
0695: return m_bufferOffset_ > 0;
0696: }
0697:
0698: /**
0699: * <p>Sets the iterator to point to the collation element corresponding to
0700: * the specified character (the parameter is a CHARACTER offset in the
0701: * original string, not an offset into its corresponding sequence of
0702: * collation elements). The value returned by the next call to next()
0703: * will be the collation element corresponding to the specified position
0704: * in the text. Unlike the public method setOffset(int), this method does
0705: * not try to readjust the offset to the start of a contracting sequence.
0706: * getOffset() is guaranteed to return the same value as was passed to a
0707: * preceding call to setOffset().</p>
0708: * </p>
0709: * @param source the new source string iterator for iteration.
0710: * @param offset to the source
0711: */
0712: void setText(UCharacterIterator source, int offset) {
0713: m_srcUtilIter_.setText(source.getText());
0714: m_source_ = m_srcUtilIter_;
0715: m_source_.setIndex(offset);
0716: updateInternalState();
0717: }
0718:
0719: // private inner class --------------------------------------------------
0720:
0721: /**
0722: * Backup data class
0723: */
0724: private static final class Backup {
0725: // protected data members -------------------------------------------
0726:
0727: /**
0728: * Backup non FCD sequence limit
0729: */
0730: protected int m_FCDLimit_;
0731: /**
0732: * Backup non FCD sequence start
0733: */
0734: protected int m_FCDStart_;
0735: /**
0736: * Backup if previous Codepoint is Hiragana quatenary
0737: */
0738: protected boolean m_isCodePointHiragana_;
0739: /**
0740: * Backup buffer position
0741: */
0742: protected int m_bufferOffset_;
0743: /**
0744: * Backup source iterator offset
0745: */
0746: protected int m_offset_;
0747: /**
0748: * Backup buffer contents
0749: */
0750: protected StringBuffer m_buffer_;
0751:
0752: // protected constructor --------------------------------------------
0753:
0754: /**
0755: * Empty constructor
0756: */
0757: protected Backup() {
0758: m_buffer_ = new StringBuffer();
0759: }
0760: }
0761:
0762: // end inner class ------------------------------------------------------
0763:
/**
 * Direction of travel: set to true by next() and to false by previous()
 */
private boolean m_isForwards_;
/**
 * Source string iterator
 */
private UCharacterIterator m_source_;
/**
 * This is the position in m_buffer_, -1 if the iterator is not in
 * m_buffer_
 */
private int m_bufferOffset_;
/**
 * Buffer for temporary storage of normalized characters, discontiguous
 * characters and Thai characters
 */
private StringBuffer m_buffer_;
/**
 * Position in the original string to continue forward FCD check from.
 * getOffset() reports this value while the iterator is inside the buffer
 * and travelling forwards.
 */
private int m_FCDLimit_;
/**
 * The collator this iterator is based on
 */
private RuleBasedCollator m_collator_;
/**
 * true if Hiragana quaternary is on
 */
private boolean m_isHiragana4_;
/**
 * CE buffer; allocated with CE_BUFFER_INIT_SIZE_ entries by the
 * constructors
 */
private int m_CEBuffer_[];
/**
 * In reality we should not have to deal with expansion sequences longer
 * than 16. However this value can be changed if a bigger buffer is
 * needed. Note, if the size is changed to too small a number, BIG
 * trouble. A reasonably small value is around 10, if there's no Arabic
 * or other funky collations that have long expansion sequences. This is
 * the longest expansion sequence this can handle without bombing out.
 */
private static final int CE_BUFFER_INIT_SIZE_ = 512;
/**
 * Backup storage for special processing inner cases; created by the
 * constructors
 */
private Backup m_utilSpecialBackUp_;
/**
 * Backup storage in special processing entry state
 */
private Backup m_utilSpecialEntryBackUp_;
/**
 * Backup storage in special processing discontiguous state
 */
private Backup m_utilSpecialDiscontiguousBackUp_;
/**
 * Utility string iterator owned by this object; the String and
 * UCharacterIterator variants of setText load the new text into it and
 * use it as the source
 */
private StringUCharacterIterator m_srcUtilIter_;
/**
 * Utility string buffer; created by the constructors
 */
private StringBuffer m_utilStringBuffer_;
/**
 * Utility buffer for skipped characters
 */
private StringBuffer m_utilSkippedBuffer_;
/**
 * Utility collation element iterator
 */
private CollationElementIterator m_utilColEIter_;
/**
 * One character before the first non-zero combining class character
 */
private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
/**
 * One character before the first character with leading non-zero combining
 * class
 */
private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
/**
 * Mask for the last byte
 */
private static final int LAST_BYTE_MASK_ = 0xFF;
/**
 * Shift value for the second last byte
 */
private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
0842:
0843: // special ce values and tags -------------------------------------------
0844:
/**
 * Special CE value whose top byte is 0xF1: the 0xF marker nibble
 * followed by the expansion tag value 1.
 */
private static final int CE_EXPANSION_ = 0xF1000000;
/**
 * Special CE value whose top byte is 0xF2: the 0xF marker nibble
 * followed by the contraction tag value 2. previous() passes it on when
 * the character may end a contraction.
 */
private static final int CE_CONTRACTION_ = 0xF2000000;
/**
 * Indicates the last ce has been consumed. Compare with NULLORDER.
 * NULLORDER is returned if error occurs.
 * The value is the bitwise OR of the three per-level constants below.
 */
private static final int CE_NO_MORE_CES_ = 0x00010101;
private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;

/**
 * Tag value for "not found"
 */
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
 * Charset processing, not yet implemented
 */
private static final int CE_CHARSET_TAG_ = 4;
/**
 * Hangul syllables, AC00-D7AF
 */
private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/**
 * Lead surrogates, D800-DBFF
 */
private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/**
 * Trail surrogates, DC00-DFFF
 */
private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/**
 * CJK implicit, 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
 */
private static final int CE_CJK_IMPLICIT_TAG_ = 9;
/**
 * Tag value for implicit collation elements
 */
private static final int CE_IMPLICIT_TAG_ = 10;
/**
 * Tag value for special processing
 */
static final int CE_SPEC_PROC_TAG_ = 11;
/**
 * This is a 3 byte primary with starting secondaries and tertiaries.
 * It fits in a single 32 bit CE and is used instead of expansion to save
 * space without affecting the performance (hopefully).
 */
private static final int CE_LONG_PRIMARY_TAG_ = 12;

/**
 * Number of tag values (the tags visible in this file run from 0 to 13)
 */
private static final int CE_CE_TAGS_COUNT = 14;
// NOTE(review): presumably the "common" weight byte - confirm against
// RuleBasedCollator.
private static final int CE_BYTE_COMMON_ = 0x05;
0888:
0889: // end special ce values and tags ---------------------------------------
0890:
0891: private static final int HANGUL_SBASE_ = 0xAC00;
0892: private static final int HANGUL_LBASE_ = 0x1100;
0893: private static final int HANGUL_VBASE_ = 0x1161;
0894: private static final int HANGUL_TBASE_ = 0x11A7;
0895: private static final int HANGUL_VCOUNT_ = 21;
0896: private static final int HANGUL_TCOUNT_ = 28;
0897:
0898: // CJK stuff ------------------------------------------------------------
0899:
0900: private static final int CJK_BASE_ = 0x4E00;
0901: private static final int CJK_LIMIT_ = 0x9FFF + 1;
0902: private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
0903: private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
0904: private static final int CJK_A_BASE_ = 0x3400;
0905: private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
0906: private static final int CJK_B_BASE_ = 0x20000;
0907: private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
0908: private static final int NON_CJK_OFFSET_ = 0x110000;
0909:
0910: private static final boolean DEBUG = ICUDebug.enabled("collator");
0911:
0912: // private methods ------------------------------------------------------
0913:
0914: /**
0915: * Reset the iterator internally
0916: */
0917: private void updateInternalState() {
0918: m_isCodePointHiragana_ = false;
0919: m_buffer_.setLength(0);
0920: m_bufferOffset_ = -1;
0921: m_CEBufferOffset_ = 0;
0922: m_CEBufferSize_ = 0;
0923: m_FCDLimit_ = -1;
0924: m_FCDStart_ = m_source_.getLength();
0925: m_isHiragana4_ = m_collator_.m_isHiragana4_;
0926: m_isForwards_ = true;
0927: }
0928:
0929: /**
0930: * Backup the current internal state
0931: * @param backup object to store the data
0932: */
0933: private void backupInternalState(Backup backup) {
0934: backup.m_offset_ = m_source_.getIndex();
0935: backup.m_FCDLimit_ = m_FCDLimit_;
0936: backup.m_FCDStart_ = m_FCDStart_;
0937: backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
0938: backup.m_bufferOffset_ = m_bufferOffset_;
0939: backup.m_buffer_.setLength(0);
0940: if (m_bufferOffset_ >= 0) {
0941: // jdk 1.3.1 does not have append(StringBuffer) yet
0942: if (ICUDebug.isJDK14OrHigher) {
0943: backup.m_buffer_.append(m_buffer_);
0944: } else {
0945: backup.m_buffer_.append(m_buffer_.toString());
0946: }
0947: }
0948: }
0949:
0950: /**
0951: * Update the iterator internally with backed-up state
0952: * @param backup object that stored the data
0953: */
0954: private void updateInternalState(Backup backup) {
0955: m_source_.setIndex(backup.m_offset_);
0956: m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
0957: m_bufferOffset_ = backup.m_bufferOffset_;
0958: m_FCDLimit_ = backup.m_FCDLimit_;
0959: m_FCDStart_ = backup.m_FCDStart_;
0960: m_buffer_.setLength(0);
0961: if (m_bufferOffset_ >= 0) {
0962: // jdk 1.3.1 does not have append(StringBuffer) yet
0963: m_buffer_.append(backup.m_buffer_.toString());
0964: }
0965: }
0966:
0967: /**
0968: * A fast combining class retrieval system.
0969: * @param ch UTF16 character
0970: * @return combining class of ch
0971: */
0972: private int getCombiningClass(int ch) {
0973: if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
0974: && m_collator_.isUnsafe((char) ch) || ch > 0xFFFF) {
0975: return NormalizerImpl.getCombiningClass(ch);
0976: }
0977: return 0;
0978: }
0979:
0980: /**
0981: * <p>Incremental normalization, this is an essential optimization.
0982: * Assuming FCD checks has been done, normalize the non-FCD characters into
0983: * the buffer.
0984: * Source offsets points to the current processing character.
0985: * </p>
0986: */
0987: private void normalize() {
0988: int size = m_FCDLimit_ - m_FCDStart_;
0989: m_buffer_.setLength(0);
0990: m_source_.setIndex(m_FCDStart_);
0991: for (int i = 0; i < size; i++) {
0992: m_buffer_.append((char) m_source_.next());
0993: }
0994: String decomp = Normalizer.decompose(m_buffer_.toString(),
0995: false);
0996: m_buffer_.setLength(0);
0997: m_buffer_.append(decomp);
0998: m_bufferOffset_ = 0;
0999: }
1000:
1001: /**
1002: * <p>Incremental FCD check and normalization. Gets the next base character
1003: * position and determines if the in-between characters needs normalization.
1004: * </p>
1005: * <p>When entering, the state is known to be this:
1006: * <ul>
1007: * <li>We are working on source string, not the buffer.
1008: * <li>The leading combining class from the current character is 0 or the
1009: * trailing combining class of the previous char was zero.
1010: * </ul>
1011: * Incoming source offsets points to the current processing character.
1012: * Return source offsets points to the current processing character.
1013: * </p>
1014: * @param ch current character
1015: * @param offset current character offset
1016: * @return true if FCDCheck passes, false otherwise
1017: */
1018: private boolean FCDCheck(char ch, int offset) {
1019: boolean result = true;
1020:
1021: // Get the trailing combining class of the current character.
1022: // If it's zero, we are OK.
1023: m_FCDStart_ = offset;
1024: m_source_.setIndex(offset);
1025: // trie access
1026: char fcd = NormalizerImpl.getFCD16(ch);
1027: if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
1028: m_source_.next();
1029: ch = (char) m_source_.current();
1030: // UCharacterIterator.DONE has 0 fcd
1031: if (UTF16.isTrailSurrogate(ch)) {
1032: fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
1033: } else {
1034: fcd = 0;
1035: }
1036: }
1037:
1038: int prevTrailCC = fcd & LAST_BYTE_MASK_;
1039:
1040: if (prevTrailCC != 0) {
1041: // The current char has a non-zero trailing CC. Scan forward until
1042: // we find a char with a leading cc of zero.
1043: while (true) {
1044: m_source_.next();
1045: int ch_int = m_source_.current();
1046: if (ch_int == UCharacterIterator.DONE) {
1047: break;
1048: }
1049: ch = (char) ch_int;
1050: // trie access
1051: fcd = NormalizerImpl.getFCD16(ch);
1052: if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
1053: m_source_.next();
1054: ch = (char) m_source_.current();
1055: if (UTF16.isTrailSurrogate(ch)) {
1056: fcd = NormalizerImpl.getFCD16FromSurrogatePair(
1057: fcd, ch);
1058: } else {
1059: fcd = 0;
1060: }
1061: }
1062: int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1063: if (leadCC == 0) {
1064: // this is a base character, we stop the FCD checks
1065: break;
1066: }
1067:
1068: if (leadCC < prevTrailCC) {
1069: result = false;
1070: }
1071:
1072: prevTrailCC = fcd & LAST_BYTE_MASK_;
1073: }
1074: }
1075: m_FCDLimit_ = m_source_.getIndex();
1076: m_source_.setIndex(m_FCDStart_);
1077: m_source_.next();
1078: return result;
1079: }
1080:
1081: /**
1082: * <p>Method tries to fetch the next character that is in fcd form.</p>
1083: * <p>Normalization is done if required.</p>
1084: * <p>Offsets are returned at the next character.</p>
1085: * @return next fcd character
1086: */
1087: private int nextChar() {
1088: int result;
1089:
1090: // loop handles the next character whether it is in the buffer or not.
1091: if (m_bufferOffset_ < 0) {
1092: // we're working on the source and not normalizing. fast path.
1093: // note Thai pre-vowel reordering uses buffer too
1094: result = m_source_.current();
1095: } else {
1096: // we are in the buffer, buffer offset will never be 0 here
1097: if (m_bufferOffset_ >= m_buffer_.length()) {
1098: // Null marked end of buffer, revert to the source string and
1099: // loop back to top to try again to get a character.
1100: m_source_.setIndex(m_FCDLimit_);
1101: m_bufferOffset_ = -1;
1102: m_buffer_.setLength(0);
1103: return nextChar();
1104: }
1105: return m_buffer_.charAt(m_bufferOffset_++);
1106: }
1107: int startoffset = m_source_.getIndex();
1108: if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1109: // Fast fcd safe path. trail combining class == 0.
1110: || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1111: || m_bufferOffset_ >= 0 || m_FCDLimit_ > startoffset) {
1112: // skip the fcd checks
1113: m_source_.next();
1114: return result;
1115: }
1116:
1117: if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1118: // We need to peek at the next character in order to tell if we are
1119: // FCD
1120: m_source_.next();
1121: int next = m_source_.current();
1122: if (next == UCharacterIterator.DONE
1123: || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1124: return result; // end of source string and if next character
1125: // starts with a base character is always fcd.
1126: }
1127: }
1128:
1129: // Need a more complete FCD check and possible normalization.
1130: if (!FCDCheck((char) result, startoffset)) {
1131: normalize();
1132: result = m_buffer_.charAt(0);
1133: m_bufferOffset_ = 1;
1134: }
1135: return result;
1136: }
1137:
1138: /**
1139: * <p>Incremental normalization, this is an essential optimization.
1140: * Assuming FCD checks has been done, normalize the non-FCD characters into
1141: * the buffer.
1142: * Source offsets points to the current processing character.</p>
1143: */
1144: private void normalizeBackwards() {
1145: normalize();
1146: m_bufferOffset_ = m_buffer_.length();
1147: }
1148:
1149: /**
1150: * <p>Incremental backwards FCD check and normalization. Gets the previous
1151: * base character position and determines if the in-between characters
1152: * needs normalization.
1153: * </p>
1154: * <p>When entering, the state is known to be this:
1155: * <ul>
1156: * <li>We are working on source string, not the buffer.
1157: * <li>The trailing combining class from the current character is 0 or the
1158: * leading combining class of the next char was zero.
1159: * </ul>
1160: * Input source offsets points to the previous character.
1161: * Return source offsets points to the current processing character.
1162: * </p>
1163: * @param ch current character
1164: * @param offset current character offset
1165: * @return true if FCDCheck passes, false otherwise
1166: */
1167: private boolean FCDCheckBackwards(char ch, int offset) {
1168: boolean result = true;
1169: char fcd = 0;
1170: m_FCDLimit_ = offset + 1;
1171: m_source_.setIndex(offset);
1172: if (!UTF16.isSurrogate(ch)) {
1173: fcd = NormalizerImpl.getFCD16(ch);
1174: } else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
1175: // note trail surrogate characters gets 0 fcd
1176: char trailch = ch;
1177: ch = (char) m_source_.previous();
1178: if (UTF16.isLeadSurrogate(ch)) {
1179: fcd = NormalizerImpl.getFCD16(ch);
1180: if (fcd != 0) {
1181: fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
1182: trailch);
1183: }
1184: } else {
1185: fcd = 0; // unpaired surrogate
1186: }
1187: }
1188:
1189: int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1190: // The current char has a non-zero leading combining class.
1191: // Scan backward until we find a char with a trailing cc of zero.
1192:
1193: while (leadCC != 0) {
1194: offset = m_source_.getIndex();
1195: if (offset == 0) {
1196: break;
1197: }
1198: ch = (char) m_source_.previous();
1199: if (!UTF16.isSurrogate(ch)) {
1200: fcd = NormalizerImpl.getFCD16(ch);
1201: } else if (UTF16.isTrailSurrogate(ch)
1202: && m_source_.getIndex() > 0) {
1203: char trail = ch;
1204: ch = (char) m_source_.previous();
1205: if (UTF16.isLeadSurrogate(ch)) {
1206: fcd = NormalizerImpl.getFCD16(ch);
1207: }
1208: if (fcd != 0) {
1209: fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
1210: trail);
1211: }
1212: } else {
1213: fcd = 0; // unpaired surrogate
1214: }
1215: int prevTrailCC = fcd & LAST_BYTE_MASK_;
1216: if (leadCC < prevTrailCC) {
1217: result = false;
1218: }
1219: leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
1220: }
1221:
1222: // storing character with 0 lead fcd or the 1st accent with a base
1223: // character before it
1224: if (fcd == 0) {
1225: m_FCDStart_ = offset;
1226: } else {
1227: m_FCDStart_ = m_source_.getIndex();
1228: }
1229: m_source_.setIndex(m_FCDLimit_);
1230: return result;
1231: }
1232:
1233: /**
1234: * <p>Method tries to fetch the previous character that is in fcd form.</p>
1235: * <p>Normalization is done if required.</p>
1236: * <p>Offsets are returned at the current character.</p>
1237: * @return previous fcd character
1238: */
1239: private int previousChar() {
1240: if (m_bufferOffset_ >= 0) {
1241: m_bufferOffset_--;
1242: if (m_bufferOffset_ >= 0) {
1243: return m_buffer_.charAt(m_bufferOffset_);
1244: } else {
1245: // At the start of buffer, route back to string.
1246: m_buffer_.setLength(0);
1247: if (m_FCDStart_ == 0) {
1248: m_FCDStart_ = -1;
1249: m_source_.setIndex(0);
1250: return UCharacterIterator.DONE;
1251: } else {
1252: m_FCDLimit_ = m_FCDStart_;
1253: m_source_.setIndex(m_FCDStart_);
1254: return previousChar();
1255: }
1256: }
1257: }
1258: int result = m_source_.previous();
1259: int startoffset = m_source_.getIndex();
1260: if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1261: || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1262: || m_FCDStart_ <= startoffset
1263: || m_source_.getIndex() == 0) {
1264: return result;
1265: }
1266: int ch = m_source_.previous();
1267: if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1268: // if previous character is FCD
1269: m_source_.next();
1270: return result;
1271: }
1272: // Need a more complete FCD check and possible normalization.
1273: if (!FCDCheckBackwards((char) result, startoffset)) {
1274: normalizeBackwards();
1275: m_bufferOffset_--;
1276: result = m_buffer_.charAt(m_bufferOffset_);
1277: } else {
1278: // fcd checks alway reset m_source_ to the limit of the FCD
1279: m_source_.setIndex(startoffset);
1280: }
1281: return result;
1282: }
1283:
1284: /**
1285: * Determines if it is at the start of source iteration
1286: * @return true if iterator at the start, false otherwise
1287: */
1288: private final boolean isBackwardsStart() {
1289: return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1290: || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1291: }
1292:
1293: /**
1294: * Checks if iterator is at the end of its source string.
1295: * @return true if it is at the end, false otherwise
1296: */
1297: private final boolean isEnd() {
1298: if (m_bufferOffset_ >= 0) {
1299: if (m_bufferOffset_ != m_buffer_.length()) {
1300: return false;
1301: } else {
1302: // at end of buffer. check if fcd is at the end
1303: return m_FCDLimit_ == m_source_.getLength();
1304: }
1305: }
1306: return m_source_.getLength() == m_source_.getIndex();
1307: }
1308:
1309: /**
1310: * <p>Special CE management for surrogates</p>
1311: * <p>Lead surrogate is encountered. CE to be retrieved by using the
1312: * following code unit. If next character is a trail surrogate, both
1313: * characters will be combined to retrieve the CE, otherwise completely
1314: * ignorable (UCA specification) is returned.</p>
1315: * @param collator collator to use
1316: * @param ce current CE
1317: * @param trail character
1318: * @return next CE for the surrogate characters
1319: */
1320: private final int nextSurrogate(RuleBasedCollator collator, int ce,
1321: char trail) {
1322: if (!UTF16.isTrailSurrogate(trail)) {
1323: updateInternalState(m_utilSpecialBackUp_);
1324: return IGNORABLE;
1325: }
1326: // TODO: CE contain the data from the previous CE + the mask.
1327: // It should at least be unmasked
1328: int result = collator.m_trie_.getTrailValue(ce, trail);
1329: if (result == CE_NOT_FOUND_) {
1330: updateInternalState(m_utilSpecialBackUp_);
1331: }
1332: return result;
1333: }
1334:
1335: /**
1336: * Gets the CE expansion offset
1337: * @param collator current collator
1338: * @param ce ce to test
1339: * @return expansion offset
1340: */
1341: private int getExpansionOffset(RuleBasedCollator collator, int ce) {
1342: return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1343: }
1344:
1345: /**
1346: * Gets the contraction ce offset
1347: * @param collator current collator
1348: * @param ce current ce
1349: * @return contraction offset
1350: */
1351: private int getContractionOffset(RuleBasedCollator collator, int ce) {
1352: return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1353: }
1354:
1355: /**
1356: * Checks if CE is a special tag CE
1357: * @param ce to check
1358: * @return true if CE is a special tag CE, false otherwise
1359: */
1360: private boolean isSpecialPrefixTag(int ce) {
1361: return RuleBasedCollator.isSpecial(ce)
1362: && RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1363: }
1364:
1365: /**
1366: * <p>Special processing getting a CE that is preceded by a certain
1367: * prefix.</p>
1368: * <p>Used for optimizing Japanese length and iteration marks. When a
1369: * special processing tag is encountered, iterate backwards to see if
1370: * there's a match.</p>
1371: * <p>Contraction tables are used, prefix data is stored backwards in the
1372: * table.</p>
1373: * @param collator collator to use
1374: * @param ce current ce
1375: * @param entrybackup entry backup iterator status
1376: * @return next collation element
1377: */
1378: private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
1379: Backup entrybackup) {
1380: backupInternalState(m_utilSpecialBackUp_);
1381: updateInternalState(entrybackup);
1382: previousChar();
1383: // We want to look at the character where we entered
1384:
1385: while (true) {
1386: // This loop will run once per source string character, for as
1387: // long as we are matching a potential contraction sequence
1388: // First we position ourselves at the begining of contraction
1389: // sequence
1390: int entryoffset = getContractionOffset(collator, ce);
1391: int offset = entryoffset;
1392: if (isBackwardsStart()) {
1393: ce = collator.m_contractionCE_[offset];
1394: break;
1395: }
1396: char previous = (char) previousChar();
1397: while (previous > collator.m_contractionIndex_[offset]) {
1398: // contraction characters are ordered, skip smaller characters
1399: offset++;
1400: }
1401:
1402: if (previous == collator.m_contractionIndex_[offset]) {
1403: // Found the source string char in the table.
1404: // Pick up the corresponding CE from the table.
1405: ce = collator.m_contractionCE_[offset];
1406: } else {
1407: // Source string char was not in the table, prefix not found
1408: ce = collator.m_contractionCE_[entryoffset];
1409: }
1410:
1411: if (!isSpecialPrefixTag(ce)) {
1412: // The source string char was in the contraction table, and
1413: // the corresponding CE is not a prefix CE. We found the
1414: // prefix, break out of loop, this CE will end up being
1415: // returned. This is the normal way out of prefix handling
1416: // when the source actually contained the prefix.
1417: break;
1418: }
1419: }
1420: if (ce != CE_NOT_FOUND_) {
1421: // we found something and we can merilly continue
1422: updateInternalState(m_utilSpecialBackUp_);
1423: } else { // prefix search was a failure, we have to backup all the way to
1424: // the start
1425: updateInternalState(entrybackup);
1426: }
1427: return ce;
1428: }
1429:
1430: /**
1431: * Checks if the ce is a contraction tag
1432: * @param ce ce to check
1433: * @return true if ce is a contraction tag, false otherwise
1434: */
1435: private boolean isContractionTag(int ce) {
1436: return RuleBasedCollator.isSpecial(ce)
1437: && RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1438: }
1439:
1440: /**
1441: * Method to copy skipped characters into the buffer and sets the fcd
1442: * position. To ensure that the skipped characters are considered later,
1443: * we need to place it in the appropriate position in the buffer and
1444: * reassign the source index. simple case if index reside in string,
1445: * simply copy to buffer and fcdposition = pos, pos = start of buffer.
1446: * if pos in normalization buffer, we'll insert the copy infront of pos
1447: * and point pos to the start of the buffer. why am i doing these copies?
1448: * well, so that the whole chunk of codes in the getNextCE,
1449: * ucol_prv_getSpecialCE does not require any changes, which will be
1450: * really painful.
1451: * @param skipped character buffer
1452: */
1453: private void setDiscontiguous(StringBuffer skipped) {
1454: if (m_bufferOffset_ >= 0) {
1455: m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
1456: } else {
1457: m_FCDLimit_ = m_source_.getIndex();
1458: m_buffer_.setLength(0);
1459: m_buffer_.append(skipped.toString());
1460: }
1461:
1462: m_bufferOffset_ = 0;
1463: }
1464:
1465: /**
1466: * Returns the current character for forward iteration
1467: * @return current character
1468: */
1469: private int currentChar() {
1470: if (m_bufferOffset_ < 0) {
1471: m_source_.previous();
1472: return m_source_.next();
1473: }
1474:
1475: // m_bufferOffset_ is never 0 in normal circumstances except after a
1476: // discontiguous contraction since it is always returned and moved
1477: // by 1 when we do nextChar()
1478: return m_buffer_.charAt(m_bufferOffset_ - 1);
1479: }
1480:
1481: /**
1482: * Method to get the discontiguous collation element within the source.
1483: * Note this function will set the position to the appropriate places.
1484: * Passed in character offset points to the second combining character
1485: * after the start character.
1486: * @param collator current collator used
1487: * @param entryoffset index to the start character in the contraction table
1488: * @return discontiguous collation element offset
1489: */
1490: private int nextDiscontiguous(RuleBasedCollator collator,
1491: int entryoffset) {
1492: int offset = entryoffset;
1493: boolean multicontraction = false;
1494: // since it will be stuffed into this iterator and ran over again
1495: if (m_utilSkippedBuffer_ == null) {
1496: m_utilSkippedBuffer_ = new StringBuffer();
1497: } else {
1498: m_utilSkippedBuffer_.setLength(0);
1499: }
1500: char ch = (char) currentChar();
1501: m_utilSkippedBuffer_.append((char) currentChar());
1502: // accent after the first character
1503: if (m_utilSpecialDiscontiguousBackUp_ == null) {
1504: m_utilSpecialDiscontiguousBackUp_ = new Backup();
1505: }
1506: backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1507: char nextch = ch;
1508: while (true) {
1509: ch = nextch;
1510: int ch_int = nextChar();
1511: nextch = (char) ch_int;
1512: if (ch_int == UCharacterIterator.DONE
1513: || getCombiningClass(nextch) == 0) {
1514: // if there are no more accents to move around
1515: // we don't have to shift previousChar, since we are resetting
1516: // the offset later
1517: if (multicontraction) {
1518: if (ch_int != UCharacterIterator.DONE) {
1519: previousChar(); // backtrack
1520: }
1521: setDiscontiguous(m_utilSkippedBuffer_);
1522: return collator.m_contractionCE_[offset];
1523: }
1524: break;
1525: }
1526:
1527: offset++; // skip the combining class offset
1528: while (nextch > collator.m_contractionIndex_[offset]) {
1529: offset++;
1530: }
1531:
1532: int ce = CE_NOT_FOUND_;
1533: if (nextch != collator.m_contractionIndex_[offset]
1534: || getCombiningClass(nextch) == getCombiningClass(ch)) {
1535: // unmatched or blocked character
1536: m_utilSkippedBuffer_.append(nextch);
1537: continue;
1538: } else {
1539: ce = collator.m_contractionCE_[offset];
1540: }
1541:
1542: if (ce == CE_NOT_FOUND_) {
1543: break;
1544: } else if (isContractionTag(ce)) {
1545: // this is a multi-contraction
1546: offset = getContractionOffset(collator, ce);
1547: if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1548: multicontraction = true;
1549: backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1550: }
1551: } else {
1552: setDiscontiguous(m_utilSkippedBuffer_);
1553: return ce;
1554: }
1555: }
1556:
1557: updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1558: // backup is one forward of the base character, we need to move back
1559: // one more
1560: previousChar();
1561: return collator.m_contractionCE_[entryoffset];
1562: }
1563:
1564: /**
1565: * Gets the next contraction ce
1566: * @param collator collator to use
1567: * @param ce current ce
1568: * @param entrybackup entry backup iterator status
1569: * @return ce of the next contraction
1570: */
1571: private int nextContraction(RuleBasedCollator collator, int ce) {
1572: backupInternalState(m_utilSpecialBackUp_);
1573: int entryce = collator.m_contractionCE_[getContractionOffset(
1574: collator, ce)]; //CE_NOT_FOUND_;
1575: while (true) {
1576: int entryoffset = getContractionOffset(collator, ce);
1577: int offset = entryoffset;
1578:
1579: if (isEnd()) {
1580: ce = collator.m_contractionCE_[offset];
1581: if (ce == CE_NOT_FOUND_) {
1582: // back up the source over all the chars we scanned going
1583: // into this contraction.
1584: ce = entryce;
1585: updateInternalState(m_utilSpecialBackUp_);
1586: }
1587: break;
1588: }
1589:
1590: // get the discontiguos maximum combining class
1591: int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1592: // checks if all characters have the same combining class
1593: byte allSame = (byte) (collator.m_contractionIndex_[offset] >> 8);
1594: char ch = (char) nextChar();
1595: offset++;
1596: while (ch > collator.m_contractionIndex_[offset]) {
1597: // contraction characters are ordered, skip all smaller
1598: offset++;
1599: }
1600:
1601: if (ch == collator.m_contractionIndex_[offset]) {
1602: // Found the source string char in the contraction table.
1603: // Pick up the corresponding CE from the table.
1604: ce = collator.m_contractionCE_[offset];
1605: } else {
1606: // Source string char was not in contraction table.
1607: // Unless it is a discontiguous contraction, we are done
1608: int miss = ch;
1609: if (UTF16.isLeadSurrogate(ch)) { // in order to do the proper detection, we
1610: // need to see if we're dealing with a supplementary
1611: miss = UCharacterProperty.getRawSupplementary(ch,
1612: (char) nextChar());
1613: }
1614: int sCC;
1615: if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1616: || sCC > maxCC
1617: || (allSame != 0 && sCC == maxCC) || isEnd()) {
1618: // Contraction can not be discontiguous, back up by one
1619: previousChar();
1620: if (miss > 0xFFFF) {
1621: previousChar();
1622: }
1623: ce = collator.m_contractionCE_[entryoffset];
1624: } else {
1625: // Contraction is possibly discontiguous.
1626: // find the next character if ch is not a base character
1627: int ch_int = nextChar();
1628: if (ch_int != UCharacterIterator.DONE) {
1629: previousChar();
1630: }
1631: char nextch = (char) ch_int;
1632: if (getCombiningClass(nextch) == 0) {
1633: previousChar();
1634: if (miss > 0xFFFF) {
1635: previousChar();
1636: }
1637: // base character not part of discontiguous contraction
1638: ce = collator.m_contractionCE_[entryoffset];
1639: } else {
1640: ce = nextDiscontiguous(collator, entryoffset);
1641: }
1642: }
1643: }
1644:
1645: if (ce == CE_NOT_FOUND_) {
1646: // source did not match the contraction, revert back original
1647: updateInternalState(m_utilSpecialBackUp_);
1648: ce = entryce;
1649: break;
1650: }
1651:
1652: // source was a contraction
1653: if (!isContractionTag(ce)) {
1654: break;
1655: }
1656:
1657: // ccontinue looping to check for the remaining contraction.
1658: if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1659: // there are further contractions to be performed, so we store
1660: // the so-far completed ce, so that if we fail in the next
1661: // round we just return this one.
1662: entryce = collator.m_contractionCE_[entryoffset];
1663: backupInternalState(m_utilSpecialBackUp_);
1664: if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1665: m_utilSpecialBackUp_.m_bufferOffset_--;
1666: } else {
1667: m_utilSpecialBackUp_.m_offset_--;
1668: }
1669: }
1670: }
1671: return ce;
1672: }
1673:
1674: /**
1675: * Gets the next ce for long primaries, stuffs the rest of the collation
1676: * elements into the ce buffer
1677: * @param ce current ce
1678: * @return next ce
1679: */
1680: private int nextLongPrimary(int ce) {
1681: m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1682: | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1683: m_CEBufferOffset_ = 1;
1684: m_CEBufferSize_ = 2;
1685: m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8)
1686: | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
1687: return m_CEBuffer_[0];
1688: }
1689:
1690: /**
1691: * Gets the number of expansion
1692: * @param ce current ce
1693: * @return number of expansion
1694: */
1695: private int getExpansionCount(int ce) {
1696: return ce & 0xF;
1697: }
1698:
1699: /**
1700: * Gets the next expansion ce and stuffs the rest of the collation elements
1701: * into the ce buffer
1702: * @param collator current collator
1703: * @param ce current ce
1704: * @return next expansion ce
1705: */
1706: private int nextExpansion(RuleBasedCollator collator, int ce) {
1707: // NOTE: we can encounter both continuations and expansions in an
1708: // expansion!
1709: // I have to decide where continuations are going to be dealt with
1710: int offset = getExpansionOffset(collator, ce);
1711: m_CEBufferSize_ = getExpansionCount(ce);
1712: m_CEBufferOffset_ = 1;
1713: m_CEBuffer_[0] = collator.m_expansion_[offset];
1714: if (m_CEBufferSize_ != 0) {
1715: // if there are less than 16 elements in expansion
1716: for (int i = 1; i < m_CEBufferSize_; i++) {
1717: m_CEBuffer_[i] = collator.m_expansion_[offset + i];
1718: }
1719: } else {
1720: // ce are terminated
1721: m_CEBufferSize_ = 1;
1722: while (collator.m_expansion_[offset] != 0) {
1723: m_CEBuffer_[m_CEBufferSize_++] = collator.m_expansion_[++offset];
1724: }
1725: }
1726: // in case of one element expansion, we
1727: // want to immediately return CEpos
1728: if (m_CEBufferSize_ == 1) {
1729: m_CEBufferSize_ = 0;
1730: m_CEBufferOffset_ = 0;
1731: }
1732: return m_CEBuffer_[0];
1733: }
1734:
1735: /**
1736: * Gets the next digit ce
1737: * @param collator current collator
1738: * @param ce current collation element
1739: * @param cp current codepoint
1740: * @return next digit ce
1741: */
1742: private int nextDigit(RuleBasedCollator collator, int ce, int cp) {
1743: // We do a check to see if we want to collate digits as numbers;
1744: // if so we generate a custom collation key. Otherwise we pull out
1745: // the value stored in the expansion table.
1746:
1747: if (m_collator_.m_isNumericCollation_) {
1748: int collateVal = 0;
1749: int trailingZeroIndex = 0;
1750: boolean nonZeroValReached = false;
1751:
1752: // I just need a temporary place to store my generated CEs.
1753: // icu4c uses a unsigned byte array, i'll use a stringbuffer here
1754: // to avoid dealing with the sign problems and array allocation
1755: // clear and set initial string buffer length
1756: m_utilStringBuffer_.setLength(3);
1757:
1758: // We parse the source string until we hit a char that's NOT a
1759: // digit.
1760: // Use this u_charDigitValue. This might be slow because we have
1761: // to handle surrogates...
1762: int digVal = UCharacter.digit(cp);
1763: // if we have arrived here, we have already processed possible
1764: // supplementaries that trigered the digit tag -
1765: // all supplementaries are marked in the UCA.
1766: // We pad a zero in front of the first element anyways.
1767: // This takes care of the (probably) most common case where
1768: // people are sorting things followed by a single digit
1769: int digIndx = 1;
1770: for (;;) {
1771: // Make sure we have enough space.
1772: if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1773: m_utilStringBuffer_.setLength(m_utilStringBuffer_
1774: .length() << 1);
1775: }
1776: // Skipping over leading zeroes.
1777: if (digVal != 0 || nonZeroValReached) {
1778: if (digVal != 0 && !nonZeroValReached) {
1779: nonZeroValReached = true;
1780: }
1781: // We parse the digit string into base 100 numbers
1782: // (this fits into a byte).
1783: // We only add to the buffer in twos, thus if we are
1784: // parsing an odd character, that serves as the
1785: // 'tens' digit while the if we are parsing an even
1786: // one, that is the 'ones' digit. We dumped the
1787: // parsed base 100 value (collateVal) into a buffer.
1788: // We multiply each collateVal by 2 (to give us room)
1789: // and add 5 (to avoid overlapping magic CE byte
1790: // values). The last byte we subtract 1 to ensure it is
1791: // less than all the other bytes.
1792: if (digIndx % 2 == 1) {
1793: collateVal += digVal;
1794: // This removes trailing zeroes.
1795: if (collateVal == 0 && trailingZeroIndex == 0) {
1796: trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1797: } else if (trailingZeroIndex != 0) {
1798: trailingZeroIndex = 0;
1799: }
1800: m_utilStringBuffer_.setCharAt(
1801: ((digIndx - 1) >>> 1) + 2,
1802: (char) ((collateVal << 1) + 6));
1803: collateVal = 0;
1804: } else {
1805: // We drop the collation value into the buffer so if
1806: // we need to do a "front patch" we don't have to
1807: // check to see if we're hitting the last element.
1808: collateVal = digVal * 10;
1809: m_utilStringBuffer_.setCharAt(
1810: (digIndx >>> 1) + 2,
1811: (char) ((collateVal << 1) + 6));
1812: }
1813: digIndx++;
1814: }
1815:
1816: // Get next character.
1817: if (!isEnd()) {
1818: backupInternalState(m_utilSpecialBackUp_);
1819: int char32 = nextChar();
1820: char ch = (char) char32;
1821: if (UTF16.isLeadSurrogate(ch)) {
1822: if (!isEnd()) {
1823: char trail = (char) nextChar();
1824: if (UTF16.isTrailSurrogate(trail)) {
1825: char32 = UCharacterProperty
1826: .getRawSupplementary(ch, trail);
1827: } else {
1828: goBackOne();
1829: }
1830: }
1831: }
1832:
1833: digVal = UCharacter.digit(char32);
1834: if (digVal == -1) {
1835: // Resetting position to point to the next unprocessed
1836: // char. We overshot it when doing our test/set for
1837: // numbers.
1838: updateInternalState(m_utilSpecialBackUp_);
1839: break;
1840: }
1841: } else {
1842: break;
1843: }
1844: }
1845:
1846: if (nonZeroValReached == false) {
1847: digIndx = 2;
1848: m_utilStringBuffer_.setCharAt(2, (char) 6);
1849: }
1850:
1851: int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
1852: : (digIndx >>> 1) + 2;
1853: if (digIndx % 2 != 0) {
1854: // We missed a value. Since digIndx isn't even, stuck too many
1855: // values into the buffer (this is what we get for padding the
1856: // first byte with a zero). "Front-patch" now by pushing all
1857: // nybbles forward.
1858: // Doing it this way ensures that at least 50% of the time
1859: // (statistically speaking) we'll only be doing a single pass
1860: // and optimizes for strings with single digits. I'm just
1861: // assuming that's the more common case.
1862: for (int i = 2; i < endIndex; i++) {
1863: m_utilStringBuffer_
1864: .setCharAt(
1865: i,
1866: (char) ((((((m_utilStringBuffer_
1867: .charAt(i) - 6) >>> 1) % 10) * 10)
1868: + (((m_utilStringBuffer_
1869: .charAt(i + 1) - 6) >>> 1) / 10) << 1) + 6));
1870: }
1871: --digIndx;
1872: }
1873:
1874: // Subtract one off of the last byte.
1875: m_utilStringBuffer_
1876: .setCharAt(endIndex - 1,
1877: (char) (m_utilStringBuffer_
1878: .charAt(endIndex - 1) - 1));
1879:
1880: // We want to skip over the first two slots in the buffer.
1881: // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
1882: // The second slot is for the sign/exponent byte:
1883: // 0x80 + (decimalPos/2) & 7f.
1884: m_utilStringBuffer_.setCharAt(0,
1885: (char) RuleBasedCollator.CODAN_PLACEHOLDER);
1886: m_utilStringBuffer_.setCharAt(1,
1887: (char) (0x80 + ((digIndx >>> 1) & 0x7F)));
1888:
1889: // Now transfer the collation key to our collIterate struct.
1890: // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
1891: ce = (((m_utilStringBuffer_.charAt(0) << 8)
1892: // Primary weight
1893: | m_utilStringBuffer_.charAt(1)) << RuleBasedCollator.CE_PRIMARY_SHIFT_)
1894: // Secondary weight
1895: | (RuleBasedCollator.BYTE_COMMON_ << RuleBasedCollator.CE_SECONDARY_SHIFT_)
1896: | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
1897: int i = 2; // Reset the index into the buffer.
1898:
1899: m_CEBuffer_[0] = ce;
1900: m_CEBufferSize_ = 1;
1901: m_CEBufferOffset_ = 1;
1902: while (i < endIndex) {
1903: int primWeight = m_utilStringBuffer_.charAt(i++) << 8;
1904: if (i < endIndex) {
1905: primWeight |= m_utilStringBuffer_.charAt(i++);
1906: }
1907: m_CEBuffer_[m_CEBufferSize_++] = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
1908: | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1909: }
1910: return ce;
1911: }
1912:
1913: // no numeric mode, we'll just switch to whatever we stashed and
1914: // continue
1915: // find the offset to expansion table
1916: return collator.m_expansion_[getExpansionOffset(collator, ce)];
1917: }
1918:
1919: /**
1920: * Gets the next implicit ce for codepoints
1921: * @param codepoint current codepoint
1922: * @return implicit ce
1923: */
1924: private int nextImplicit(int codepoint) {
1925: if (!UCharacter.isLegal(codepoint)) {
1926: // synwee to check with vladimir on the range of isNonChar()
1927: // illegal code value, use completely ignoreable!
1928: return IGNORABLE;
1929: }
1930: int result = RuleBasedCollator.impCEGen_
1931: .getImplicitFromCodePoint(codepoint);
1932: m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_) | 0x00000505;
1933: m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
1934: m_CEBufferOffset_ = 1;
1935: m_CEBufferSize_ = 2;
1936: return m_CEBuffer_[0];
1937: }
1938:
1939: /**
1940: * Returns the next ce associated with the following surrogate characters
1941: * @param ch current character
1942: * @return ce
1943: */
1944: private int nextSurrogate(char ch) {
1945: int ch_int = nextChar();
1946: char nextch = (char) ch_int;
1947: if (ch_int != CharacterIterator.DONE
1948: && UTF16.isTrailSurrogate(nextch)) {
1949: int codepoint = UCharacterProperty.getRawSupplementary(ch,
1950: nextch);
1951: return nextImplicit(codepoint);
1952: }
1953: if (nextch != CharacterIterator.DONE) {
1954: previousChar(); // reverts back to the original position
1955: }
1956: return IGNORABLE; // completely ignorable
1957: }
1958:
1959: /**
1960: * Returns the next ce for a hangul character, this is an implicit
1961: * calculation
1962: * @param collator current collator
1963: * @param ch current character
1964: * @return hangul ce
1965: */
1966: private int nextHangul(RuleBasedCollator collator, char ch) {
1967: char L = (char) (ch - HANGUL_SBASE_);
1968:
1969: // divide into pieces
1970: // do it in this order since some compilers can do % and / in one
1971: // operation
1972: char T = (char) (L % HANGUL_TCOUNT_);
1973: L /= HANGUL_TCOUNT_;
1974: char V = (char) (L % HANGUL_VCOUNT_);
1975: L /= HANGUL_VCOUNT_;
1976:
1977: // offset them
1978: L += HANGUL_LBASE_;
1979: V += HANGUL_VBASE_;
1980: T += HANGUL_TBASE_;
1981:
1982: // return the first CE, but first put the rest into the expansion
1983: // buffer
1984: m_CEBufferSize_ = 0;
1985: if (!collator.m_isJamoSpecial_) { // FAST PATH
1986: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
1987: .getLeadValue(L);
1988: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
1989: .getLeadValue(V);
1990:
1991: if (T != HANGUL_TBASE_) {
1992: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
1993: .getLeadValue(T);
1994: }
1995: m_CEBufferOffset_ = 1;
1996: return m_CEBuffer_[0];
1997: } else {
1998: // Jamo is Special
1999: // Since Hanguls pass the FCD check, it is guaranteed that we
2000: // won't be in the normalization buffer if something like this
2001: // happens
2002: // Move Jamos into normalization buffer
2003: m_buffer_.append((char) L);
2004: m_buffer_.append((char) V);
2005: if (T != HANGUL_TBASE_) {
2006: m_buffer_.append((char) T);
2007: }
2008: m_FCDLimit_ = m_source_.getIndex();
2009: m_FCDStart_ = m_FCDLimit_ - 1;
2010: // Indicate where to continue in main input string after
2011: // exhausting the buffer
2012: return IGNORABLE;
2013: }
2014: }
2015:
2016: /**
2017: * <p>Special CE management. Expansions, contractions etc...</p>
2018: * @param collator can be plain UCA
2019: * @param ce current ce
2020: * @param ch current character
2021: * @return next special ce
2022: */
2023: private int nextSpecial(RuleBasedCollator collator, int ce, char ch) {
2024: int codepoint = ch;
2025: Backup entrybackup = m_utilSpecialEntryBackUp_;
2026: // this is to handle recursive looping
2027: if (entrybackup != null) {
2028: m_utilSpecialEntryBackUp_ = null;
2029: } else {
2030: entrybackup = new Backup();
2031: }
2032: backupInternalState(entrybackup);
2033: try { // forces it to assign m_utilSpecialEntryBackup_
2034: while (true) {
2035: // This loop will repeat only in the case of contractions,
2036: // surrogate
2037: switch (RuleBasedCollator.getTag(ce)) {
2038: case CE_NOT_FOUND_TAG_:
2039: // impossible case for icu4j
2040: return ce;
2041: case RuleBasedCollator.CE_SURROGATE_TAG_:
2042: if (isEnd()) {
2043: return IGNORABLE;
2044: }
2045: backupInternalState(m_utilSpecialBackUp_);
2046: char trail = (char) nextChar();
2047: ce = nextSurrogate(collator, ce, trail);
2048: // calculate the supplementary code point value,
2049: // if surrogate was not tailored we go one more round
2050: codepoint = UCharacterProperty.getRawSupplementary(
2051: ch, trail);
2052: break;
2053: case CE_SPEC_PROC_TAG_:
2054: ce = nextSpecialPrefix(collator, ce, entrybackup);
2055: break;
2056: case CE_CONTRACTION_TAG_:
2057: ce = nextContraction(collator, ce);
2058: break;
2059: case CE_LONG_PRIMARY_TAG_:
2060: return nextLongPrimary(ce);
2061: case CE_EXPANSION_TAG_:
2062: return nextExpansion(collator, ce);
2063: case CE_DIGIT_TAG_:
2064: ce = nextDigit(collator, ce, codepoint);
2065: break;
2066: // various implicits optimization
2067: case CE_CJK_IMPLICIT_TAG_:
2068: // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2069: return nextImplicit(codepoint);
2070: case CE_IMPLICIT_TAG_: // everything that is not defined
2071: return nextImplicit(codepoint);
2072: case CE_TRAIL_SURROGATE_TAG_:
2073: return IGNORABLE; // DC00-DFFF broken surrogate
2074: case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2075: return nextSurrogate(ch);
2076: case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2077: return nextHangul(collator, ch);
2078: case CE_CHARSET_TAG_:
2079: // not yet implemented probably after 1.8
2080: return CE_NOT_FOUND_;
2081: default:
2082: ce = IGNORABLE;
2083: // synwee todo, throw exception or something here.
2084: }
2085: if (!RuleBasedCollator.isSpecial(ce)) {
2086: break;
2087: }
2088: }
2089: } finally {
2090: m_utilSpecialEntryBackUp_ = entrybackup;
2091: }
2092: return ce;
2093: }
2094:
/**
 * Special processing is getting a CE that is preceded by a certain prefix.
 * Currently this is only needed for optimizing Japanese length and
 * iteration marks. When we encounter a special processing tag, we go
 * backwards and try to see if we have a match. Contraction tables are used
 * - so the whole process is not unlike contraction. Prefix data is stored
 * backwards in the table.
 * <p>The iterator state is saved in m_utilSpecialBackUp_ on entry and
 * restored from it before returning.</p>
 * @param collator current collator
 * @param ce current ce
 * @return previous ce
 */
private int previousSpecialPrefix(RuleBasedCollator collator, int ce) {
    backupInternalState(m_utilSpecialBackUp_);
    while (true) {
        // position ourselves at the beginning of contraction sequence
        int offset = getContractionOffset(collator, ce);
        // remember the first entry of this table; its ce is the result
        // used when no prefix matches
        int entryoffset = offset;
        if (isBackwardsStart()) {
            // nothing in front of us, so no prefix can match
            ce = collator.m_contractionCE_[offset];
            break;
        }
        char prevch = (char) previousChar();
        while (prevch > collator.m_contractionIndex_[offset]) {
            // since contraction codepoints are ordered, we skip all that
            // are smaller
            offset++;
        }
        if (prevch == collator.m_contractionIndex_[offset]) {
            ce = collator.m_contractionCE_[offset];
        } else {
            // if there is a completely ignorable code point in the middle
            // of a prefix, we need to act as if it's not there. Assumption:
            // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
            // zero)
            // lone surrogates cannot be set to zero as it would break
            // other processing
            // Each "continue" below restarts the loop with ce unchanged,
            // so the same table is searched again with the next preceding
            // character.
            int isZeroCE = collator.m_trie_.getLeadValue(prevch);
            // it's easy for BMP code points
            if (isZeroCE == 0) {
                continue;
            } else if (UTF16.isTrailSurrogate(prevch)
                    || UTF16.isLeadSurrogate(prevch)) {
                // for supplementary code points, we have to check the next one
                // situations where we are going to ignore
                // 1. beginning of the string: schar is a lone surrogate
                // 2. schar is a lone surrogate
                // 3. schar is a trail surrogate in a valid surrogate
                // sequence that is explicitly set to zero.
                if (!isBackwardsStart()) {
                    char lead = (char) previousChar();
                    if (UTF16.isLeadSurrogate(lead)) {
                        isZeroCE = collator.m_trie_
                                .getLeadValue(lead);
                        if (RuleBasedCollator.getTag(isZeroCE) == RuleBasedCollator.CE_SURROGATE_TAG_) {
                            int finalCE = collator.m_trie_
                                    .getTrailValue(isZeroCE, prevch);
                            if (finalCE == 0) {
                                // this is a real, assigned completely
                                // ignorable code point
                                continue;
                            }
                        }
                    } else {
                        nextChar(); // revert to original offset
                        // lone surrogate, completely ignorable
                        continue;
                    }
                    nextChar(); // revert to original offset
                } else {
                    // lone surrogate at the beginning, completely ignorable
                    continue;
                }
            }

            // char was not in the table. prefix not found
            ce = collator.m_contractionCE_[entryoffset];
        }

        if (!isSpecialPrefixTag(ce)) {
            // char was in the contraction table, and the corresponding ce
            // is not a prefix ce. We found the prefix, break out of loop,
            // this ce will end up being returned.
            break;
        }
    }
    updateInternalState(m_utilSpecialBackUp_);
    return ce;
}
2183:
2184: /**
2185: * Retrieves the previous contraction ce. To ensure that the backwards and
2186: * forwards iteration matches, we take the current region of most possible
2187: * match and pass it through the forward iteration. This will ensure that
2188: * the obstinate problem of overlapping contractions will not occur.
2189: * @param collator current collator
2190: * @param ce current ce
2191: * @param ch current character
2192: * @return previous contraction ce
2193: */
2194: private int previousContraction(RuleBasedCollator collator, int ce,
2195: char ch) {
2196: m_utilStringBuffer_.setLength(0);
2197: // since we might encounter normalized characters (from the thai
2198: // processing) we can't use peekCharacter() here.
2199: char prevch = (char) previousChar();
2200: boolean atStart = false;
2201: // TODO: address the comment above - maybe now we *can* use peekCharacter
2202: //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
2203: while (collator.isUnsafe(ch)) {
2204: m_utilStringBuffer_.insert(0, ch);
2205: ch = prevch;
2206: if (isBackwardsStart()) {
2207: atStart = true;
2208: break;
2209: }
2210: prevch = (char) previousChar();
2211: }
2212: if (!atStart) {
2213: // undo the previousChar() if we didn't reach the beginning
2214: nextChar();
2215: }
2216: // adds the initial base character to the string
2217: m_utilStringBuffer_.insert(0, ch);
2218:
2219: // a new collation element iterator is used to simply things, since
2220: // using the current collation element iterator will mean that the
2221: // forward and backwards iteration will share and change the same
2222: // buffers. it is going to be painful.
2223: int originaldecomp = collator.getDecomposition();
2224: // for faster access, since string would have been normalized above
2225: collator.setDecomposition(Collator.NO_DECOMPOSITION);
2226: if (m_utilColEIter_ == null) {
2227: m_utilColEIter_ = new CollationElementIterator(
2228: m_utilStringBuffer_.toString(), collator);
2229: } else {
2230: m_utilColEIter_.m_collator_ = collator;
2231: m_utilColEIter_.setText(m_utilStringBuffer_.toString());
2232: }
2233: ce = m_utilColEIter_.next();
2234: m_CEBufferSize_ = 0;
2235: while (ce != NULLORDER) {
2236: if (m_CEBufferSize_ == m_CEBuffer_.length) {
2237: try {
2238: // increasing cebuffer size
2239: int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2240: System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2241: m_CEBuffer_.length);
2242: m_CEBuffer_ = tempbuffer;
2243: } catch (MissingResourceException e) {
2244: throw e;
2245: } catch (Exception e) {
2246: if (DEBUG) {
2247: e.printStackTrace();
2248: }
2249: return NULLORDER;
2250: }
2251: }
2252: m_CEBuffer_[m_CEBufferSize_++] = ce;
2253: ce = m_utilColEIter_.next();
2254: }
2255: collator.setDecomposition(originaldecomp);
2256: m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2257: return m_CEBuffer_[m_CEBufferOffset_];
2258: }
2259:
2260: /**
2261: * Returns the previous long primary ces
2262: * @param ce long primary ce
2263: * @return previous long primary ces
2264: */
2265: private int previousLongPrimary(int ce) {
2266: m_CEBufferSize_ = 0;
2267: m_CEBuffer_[m_CEBufferSize_++] = ((ce & 0xFFFF00) << 8)
2268: | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
2269: m_CEBuffer_[m_CEBufferSize_++] = ((ce & 0xFF) << 24)
2270: | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2271: m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2272: return m_CEBuffer_[m_CEBufferOffset_];
2273: }
2274:
2275: /**
2276: * Returns the previous expansion ces
2277: * @param collator current collator
2278: * @param ce current ce
2279: * @return previous expansion ce
2280: */
2281: private int previousExpansion(RuleBasedCollator collator, int ce) {
2282: // find the offset to expansion table
2283: int offset = getExpansionOffset(collator, ce);
2284: m_CEBufferSize_ = getExpansionCount(ce);
2285: if (m_CEBufferSize_ != 0) {
2286: // less than 16 elements in expansion
2287: for (int i = 0; i < m_CEBufferSize_; i++) {
2288: m_CEBuffer_[i] = collator.m_expansion_[offset + i];
2289: }
2290:
2291: } else {
2292: // null terminated ces
2293: while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2294: m_CEBuffer_[m_CEBufferSize_] = collator.m_expansion_[offset
2295: + m_CEBufferSize_];
2296: m_CEBufferSize_++;
2297: }
2298: }
2299: m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2300: return m_CEBuffer_[m_CEBufferOffset_];
2301: }
2302:
2303: /**
2304: * Getting the digit collation elements
2305: * @param collator
2306: * @param ce current collation element
2307: * @param ch current code point
2308: * @return digit collation element
2309: */
2310: private int previousDigit(RuleBasedCollator collator, int ce,
2311: char ch) {
2312: // We do a check to see if we want to collate digits as numbers; if so we generate
2313: // a custom collation key. Otherwise we pull out the value stored in the expansion table.
2314: if (m_collator_.m_isNumericCollation_) {
2315: int leadingZeroIndex = 0;
2316: int collateVal = 0;
2317: boolean nonZeroValReached = false;
2318:
2319: // clear and set initial string buffer length
2320: m_utilStringBuffer_.setLength(3);
2321:
2322: // We parse the source string until we hit a char that's NOT a digit
2323: // Use this u_charDigitValue. This might be slow because we have to
2324: // handle surrogates...
2325: int char32 = ch;
2326: if (UTF16.isTrailSurrogate(ch)) {
2327: if (!isBackwardsStart()) {
2328: char lead = (char) previousChar();
2329: if (UTF16.isLeadSurrogate(lead)) {
2330: char32 = UCharacterProperty
2331: .getRawSupplementary(lead, ch);
2332: } else {
2333: goForwardOne();
2334: }
2335: }
2336: }
2337: int digVal = UCharacter.digit(char32);
2338: int digIndx = 0;
2339: for (;;) {
2340: // Make sure we have enough space.
2341: if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2342: m_utilStringBuffer_.setLength(m_utilStringBuffer_
2343: .length() << 1);
2344: }
2345: // Skipping over "trailing" zeroes but we still add to digIndx.
2346: if (digVal != 0 || nonZeroValReached) {
2347: if (digVal != 0 && !nonZeroValReached) {
2348: nonZeroValReached = true;
2349: }
2350:
2351: // We parse the digit string into base 100 numbers (this
2352: // fits into a byte).
2353: // We only add to the buffer in twos, thus if we are
2354: // parsing an odd character, that serves as the 'tens'
2355: // digit while the if we are parsing an even one, that is
2356: // the 'ones' digit. We dumped the parsed base 100 value
2357: // (collateVal) into a buffer. We multiply each collateVal
2358: // by 2 (to give us room) and add 5 (to avoid overlapping
2359: // magic CE byte values). The last byte we subtract 1 to
2360: // ensure it is less than all the other bytes.
2361: // Since we're doing in this reverse we want to put the
2362: // first digit encountered into the ones place and the
2363: // second digit encountered into the tens place.
2364:
2365: if (digIndx % 2 == 1) {
2366: collateVal += digVal * 10;
2367:
2368: // This removes leading zeroes.
2369: if (collateVal == 0 && leadingZeroIndex == 0) {
2370: leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2371: } else if (leadingZeroIndex != 0) {
2372: leadingZeroIndex = 0;
2373: }
2374:
2375: m_utilStringBuffer_.setCharAt(
2376: ((digIndx - 1) >>> 1) + 2,
2377: (char) ((collateVal << 1) + 6));
2378: collateVal = 0;
2379: } else {
2380: collateVal = digVal;
2381: }
2382: }
2383: digIndx++;
2384:
2385: if (!isBackwardsStart()) {
2386: backupInternalState(m_utilSpecialBackUp_);
2387: char32 = previousChar();
2388: ch = (char) ch;
2389: if (UTF16.isTrailSurrogate(ch)) {
2390: if (!isBackwardsStart()) {
2391: char lead = (char) previousChar();
2392: if (UTF16.isLeadSurrogate(lead)) {
2393: char32 = UCharacterProperty
2394: .getRawSupplementary(lead, ch);
2395: } else {
2396: updateInternalState(m_utilSpecialBackUp_);
2397: }
2398: }
2399: }
2400:
2401: digVal = UCharacter.digit(char32);
2402: if (digVal == -1) {
2403: updateInternalState(m_utilSpecialBackUp_);
2404: break;
2405: }
2406: } else {
2407: break;
2408: }
2409: }
2410:
2411: if (nonZeroValReached == false) {
2412: digIndx = 2;
2413: m_utilStringBuffer_.setCharAt(2, (char) 6);
2414: }
2415:
2416: if (digIndx % 2 != 0) {
2417: if (collateVal == 0 && leadingZeroIndex == 0) {
2418: // This removes the leading 0 in a odd number sequence of
2419: // numbers e.g. avery001
2420: leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2421: } else {
2422: // this is not a leading 0, we add it in
2423: m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2424: (char) ((collateVal << 1) + 6));
2425: digIndx++;
2426: }
2427: }
2428:
2429: int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2430: : ((digIndx >>> 1) + 2);
2431: digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2432: // Subtract one off of the last byte.
2433: // Really the first byte here, but it's reversed...
2434: m_utilStringBuffer_.setCharAt(2,
2435: (char) (m_utilStringBuffer_.charAt(2) - 1));
2436: // We want to skip over the first two slots in the buffer.
2437: // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2438: // The second slot is for the sign/exponent byte:
2439: // 0x80 + (decimalPos/2) & 7f.
2440: m_utilStringBuffer_.setCharAt(0,
2441: (char) RuleBasedCollator.CODAN_PLACEHOLDER);
2442: m_utilStringBuffer_.setCharAt(1,
2443: (char) (0x80 + ((digIndx >>> 1) & 0x7F)));
2444:
2445: // Now transfer the collation key to our collIterate struct.
2446: // The total size for our collation key is endIndx bumped up to the
2447: // next largest even value divided by two.
2448: m_CEBufferSize_ = 0;
2449: m_CEBuffer_[m_CEBufferSize_++] = (((m_utilStringBuffer_
2450: .charAt(0) << 8)
2451: // Primary weight
2452: | m_utilStringBuffer_.charAt(1)) << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2453: // Secondary weight
2454: | (RuleBasedCollator.BYTE_COMMON_ << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2455: // Tertiary weight.
2456: | RuleBasedCollator.BYTE_COMMON_;
2457: int i = endIndex - 1; // Reset the index into the buffer.
2458: while (i >= 2) {
2459: int primWeight = m_utilStringBuffer_.charAt(i--) << 8;
2460: if (i >= 2) {
2461: primWeight |= m_utilStringBuffer_.charAt(i--);
2462: }
2463: m_CEBuffer_[m_CEBufferSize_++] = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2464: | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2465: }
2466: m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2467: return m_CEBuffer_[m_CEBufferOffset_];
2468: } else {
2469: return collator.m_expansion_[getExpansionOffset(collator,
2470: ce)];
2471: }
2472: }
2473:
2474: /**
2475: * Returns previous hangul ces
2476: * @param collator current collator
2477: * @param ch current character
2478: * @return previous hangul ce
2479: */
2480: private int previousHangul(RuleBasedCollator collator, char ch) {
2481: char L = (char) (ch - HANGUL_SBASE_);
2482: // we do it in this order since some compilers can do % and / in one
2483: // operation
2484: char T = (char) (L % HANGUL_TCOUNT_);
2485: L /= HANGUL_TCOUNT_;
2486: char V = (char) (L % HANGUL_VCOUNT_);
2487: L /= HANGUL_VCOUNT_;
2488:
2489: // offset them
2490: L += HANGUL_LBASE_;
2491: V += HANGUL_VBASE_;
2492: T += HANGUL_TBASE_;
2493:
2494: m_CEBufferSize_ = 0;
2495: if (!collator.m_isJamoSpecial_) {
2496: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
2497: .getLeadValue(L);
2498: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
2499: .getLeadValue(V);
2500: if (T != HANGUL_TBASE_) {
2501: m_CEBuffer_[m_CEBufferSize_++] = collator.m_trie_
2502: .getLeadValue(T);
2503: }
2504: m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2505: return m_CEBuffer_[m_CEBufferOffset_];
2506: } else {
2507: // Since Hanguls pass the FCD check, it is guaranteed that we won't
2508: // be in the normalization buffer if something like this happens
2509: // Move Jamos into normalization buffer
2510: m_buffer_.append(L);
2511: m_buffer_.append(V);
2512: if (T != HANGUL_TBASE_) {
2513: m_buffer_.append(T);
2514: }
2515:
2516: m_FCDStart_ = m_source_.getIndex();
2517: m_FCDLimit_ = m_FCDStart_ + 1;
2518: return IGNORABLE;
2519: }
2520: }
2521:
2522: /**
2523: * Gets implicit codepoint ces
2524: * @param codepoint current codepoint
2525: * @return implicit codepoint ces
2526: */
2527: private int previousImplicit(int codepoint) {
2528: if (!UCharacter.isLegal(codepoint)) {
2529: return IGNORABLE; // illegal code value, completely ignoreable!
2530: }
2531: int result = RuleBasedCollator.impCEGen_
2532: .getImplicitFromCodePoint(codepoint);
2533: m_CEBufferSize_ = 2;
2534: m_CEBufferOffset_ = 1;
2535: m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_) | 0x00000505;
2536: m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2537: return m_CEBuffer_[1];
2538: }
2539:
2540: /**
2541: * Gets the previous surrogate ce
2542: * @param ch current character
2543: * @return previous surrogate ce
2544: */
2545: private int previousSurrogate(char ch) {
2546: if (isBackwardsStart()) {
2547: // we are at the start of the string, wrong place to be at
2548: return IGNORABLE;
2549: }
2550: char prevch = (char) previousChar();
2551: // Handles Han and Supplementary characters here.
2552: if (UTF16.isLeadSurrogate(prevch)) {
2553: return previousImplicit(UCharacterProperty
2554: .getRawSupplementary(prevch, ch));
2555: }
2556: if (prevch != CharacterIterator.DONE) {
2557: nextChar();
2558: }
2559: return IGNORABLE; // completely ignorable
2560: }
2561:
2562: /**
2563: * <p>Special CE management. Expansions, contractions etc...</p>
2564: * @param collator can be plain UCA
2565: * @param ce current ce
2566: * @param ch current character
2567: * @return previous special ce
2568: */
2569: private int previousSpecial(RuleBasedCollator collator, int ce,
2570: char ch) {
2571: while (true) {
2572: // the only ces that loops are thai, special prefix and
2573: // contractions
2574: switch (RuleBasedCollator.getTag(ce)) {
2575: case CE_NOT_FOUND_TAG_: // this tag always returns
2576: return ce;
2577: case RuleBasedCollator.CE_SURROGATE_TAG_:
2578: // essentialy a disengaged lead surrogate. a broken
2579: // sequence was encountered and this is an error
2580: return IGNORABLE;
2581: case CE_SPEC_PROC_TAG_:
2582: ce = previousSpecialPrefix(collator, ce);
2583: break;
2584: case CE_CONTRACTION_TAG_:
2585: // may loop for first character e.g. "0x0f71" for english
2586: if (isBackwardsStart()) {
2587: // start of string or this is not the end of any contraction
2588: ce = collator.m_contractionCE_[getContractionOffset(
2589: collator, ce)];
2590: break;
2591: }
2592: return previousContraction(collator, ce, ch); // else
2593: case CE_LONG_PRIMARY_TAG_:
2594: return previousLongPrimary(ce);
2595: case CE_EXPANSION_TAG_: // always returns
2596: return previousExpansion(collator, ce);
2597: case CE_DIGIT_TAG_:
2598: ce = previousDigit(collator, ce, ch);
2599: break;
2600: case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2601: return previousHangul(collator, ch);
2602: case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2603: return IGNORABLE; // broken surrogate sequence
2604: case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2605: return previousSurrogate(ch);
2606: case CE_CJK_IMPLICIT_TAG_:
2607: // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2608: return previousImplicit(ch);
2609: case CE_IMPLICIT_TAG_: // everything that is not defined
2610: // UCA is filled with these. Tailorings are NOT_FOUND
2611: return previousImplicit(ch);
2612: case CE_CHARSET_TAG_: // this tag always returns
2613: return CE_NOT_FOUND_;
2614: default: // this tag always returns
2615: ce = IGNORABLE;
2616: }
2617: if (!RuleBasedCollator.isSpecial(ce)) {
2618: break;
2619: }
2620: }
2621: return ce;
2622: }
2623:
2624: /**
2625: * GET IMPLICIT PRIMARY WEIGHTS
2626: * @param cp codepoint
2627: * @param value is left justified primary key
2628: */
2629: // private static final int getImplicitPrimary(int cp)
2630: // {
2631: // cp = swapCJK(cp);
2632: //
2633: // //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
2634: // // we now have a range of numbers from 0 to 21FFFF.
2635: // // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
2636: // // we must leave a gap of 01 between all values of the last byte, so
2637: // // the last byte has 126 values (3 byte case)
2638: // // we shift so that HAN all has the same first primary, for
2639: // // compression.
2640: // // for the 4 byte case, we make the gap as large as we can fit.
2641: // // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
2642: // // Four byte forms (most supplementaries) are EF xx xx xx (with a gap
2643: // // of LAST2_MULTIPLIER == 14)
2644: //
2645: // int last0 = cp - RuleBasedCollator.IMPLICIT_4BYTE_BOUNDARY_;
2646: // if (last0 < 0) {
2647: // int last1 = cp / RuleBasedCollator.LAST_COUNT_;
2648: // last0 = cp % RuleBasedCollator.LAST_COUNT_;
2649: //
2650: // int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2651: // last1 %= RuleBasedCollator.OTHER_COUNT_;
2652: // return RuleBasedCollator.IMPLICIT_BASE_3BYTE_ + (last2 << 24)
2653: // + (last1 << 16)
2654: // + ((last0 * RuleBasedCollator.LAST_MULTIPLIER_) << 8);
2655: // }
2656: // else {
2657: // int last1 = last0 / RuleBasedCollator.LAST_COUNT2_;
2658: // last0 %= RuleBasedCollator.LAST_COUNT2_;
2659: //
2660: // int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2661: // last1 %= RuleBasedCollator.OTHER_COUNT_;
2662: //
2663: // int last3 = last2 / RuleBasedCollator.OTHER_COUNT_;
2664: // last2 %= RuleBasedCollator.OTHER_COUNT_;
2665: // return RuleBasedCollator.IMPLICIT_BASE_4BYTE_ + (last3 << 24)
2666: // + (last2 << 16) + (last1 << 8)
2667: // + (last0 * RuleBasedCollator.LAST2_MULTIPLIER_);
2668: // }
2669: // }
2670: // /**
2671: // * Swapping CJK characters for implicit ces
2672: // * @param cp codepoint CJK
2673: // * @return swapped result
2674: // */
2675: // private static final int swapCJK(int cp)
2676: // {
2677: // if (cp >= CJK_BASE_) {
2678: // if (cp < CJK_LIMIT_) {
2679: // return cp - CJK_BASE_;
2680: // }
2681: // if (cp < CJK_COMPAT_USED_BASE_) {
2682: // return cp + NON_CJK_OFFSET_;
2683: // }
2684: // if (cp < CJK_COMPAT_USED_LIMIT_) {
2685: // return cp - CJK_COMPAT_USED_BASE_ + (CJK_LIMIT_ - CJK_BASE_);
2686: // }
2687: // if (cp < CJK_B_BASE_) {
2688: // return cp + NON_CJK_OFFSET_;
2689: // }
2690: // if (cp < CJK_B_LIMIT_) {
2691: // return cp; // non-BMP-CJK
2692: // }
2693: // return cp + NON_CJK_OFFSET_; // non-CJK
2694: // }
2695: // if (cp < CJK_A_BASE_) {
2696: // return cp + NON_CJK_OFFSET_;
2697: // }
2698: // if (cp < CJK_A_LIMIT_) {
2699: // return cp - CJK_A_BASE_ + (CJK_LIMIT_ - CJK_BASE_)
2700: // + (CJK_COMPAT_USED_LIMIT_ - CJK_COMPAT_USED_BASE_);
2701: // }
2702: // return cp + NON_CJK_OFFSET_; // non-CJK
2703: // }
2704: /**
2705: * Gets a character from the source string at a given offset.
2706: * Handles both normal and iterative cases.
2707: * No error checking and does not access the normalization buffer
2708: * - caller beware!
2709: * @param offset offset from current position which character is to be
2710: * retrieved
2711: * @return character at current position + offset
2712: */
2713: private char peekCharacter(int offset) {
2714: if (offset != 0) {
2715: int currentoffset = m_source_.getIndex();
2716: m_source_.setIndex(currentoffset + offset);
2717: char result = (char) m_source_.current();
2718: m_source_.setIndex(currentoffset);
2719: return result;
2720: } else {
2721: return (char) m_source_.current();
2722: }
2723: }
2724:
2725: /**
2726: * Moves back 1 position in the source string. This is slightly less
2727: * complicated than previousChar in that it doesn't normalize while
2728: * moving back. Boundary checks are not performed.
2729: * This method is to be used with caution, with the assumption that
2730: * moving back one position will not exceed the source limits.
2731: * Use only with nextChar() and never call this API twice in a row without
2732: * nextChar() in the middle.
2733: */
2734: private void goBackOne() {
2735: if (m_bufferOffset_ >= 0) {
2736: m_bufferOffset_--;
2737: } else {
2738: m_source_.setIndex(m_source_.getIndex() - 1);
2739: }
2740: }
2741:
/**
 * Moves forward 1 position in the source string. This is slightly less
 * complicated than nextChar in that it doesn't normalize while
 * moving forward. Boundary checks are not performed.
 * This method is to be used with caution, with the assumption that
 * moving forward one position will not exceed the source limits.
 * Use only with previousChar() and never call this API twice in a row
 * without previousChar() in the middle.
 */
2751: private void goForwardOne() {
2752: if (m_bufferOffset_ < 0) {
2753: // we're working on the source and not normalizing. fast path.
2754: // note Thai pre-vowel reordering uses buffer too
2755: m_source_.setIndex(m_source_.getIndex() + 1);
2756: } else {
2757: // we are in the buffer, buffer offset will never be 0 here
2758: m_bufferOffset_++;
2759: }
2760: }
2761: }
|