0001: //##header
0002: /**
0003: *******************************************************************************
0004: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007:  */
package com.ibm.icu.text;
0008:
0009: import java.io.IOException;
0010: import java.text.CharacterIterator;
0011: import java.text.ParseException;
0012: import java.util.Arrays;
0013: import java.util.MissingResourceException;
0014:
0015: //#ifndef FOUNDATION
0016: import java.nio.ByteBuffer;
//#else
0017: //##import com.ibm.icu.impl.ByteBuffer;
0018: //#endif
0019:
0020: import com.ibm.icu.impl.BOCU;
0021: import com.ibm.icu.impl.ICUDebug;
0022: import com.ibm.icu.impl.ICUResourceBundle;
0023: import com.ibm.icu.impl.ImplicitCEGenerator;
0024: import com.ibm.icu.impl.IntTrie;
0025: import com.ibm.icu.impl.StringUCharacterIterator;
0026: import com.ibm.icu.impl.Trie;
0027: import com.ibm.icu.impl.TrieIterator;
0028: import com.ibm.icu.impl.Utility;
0029: import com.ibm.icu.lang.UCharacter;
0030: import com.ibm.icu.util.RangeValueIterator;
0031: import com.ibm.icu.util.ULocale;
0032: import com.ibm.icu.util.UResourceBundle;
0033: import com.ibm.icu.util.VersionInfo;
0034:
0035: /**
0036: * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
0037: * customization of the Collator via user-specified rule sets.
0038: * RuleBasedCollator is designed to be fully compliant to the <a
0039: * href="http://www.unicode.org/unicode/reports/tr10/"> Unicode
0040: * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
0041: *
0042: * <p>Users are strongly encouraged to read <a
0043: * href="http://icu.sourceforge.net/userguide/Collate_Intro.html">
0044: * the users guide</a> for more information about the collation
0045: * service before using this class.</p>
0046: *
0047: * <p>Create a RuleBasedCollator from a locale by calling the
0048: * getInstance(Locale) factory method in the base class Collator.
0049: * Collator.getInstance(Locale) creates a RuleBasedCollator object
0050: * based on the collation rules defined by the argument locale. If a
0051: * customized collation ordering or attributes are required, use the
0052: * RuleBasedCollator(String) constructor with the appropriate
0053: * rules. The customized RuleBasedCollator will base its ordering on
0054: * UCA, while re-adjusting the attributes and orders of the characters
0055: * in the specified rule accordingly.</p>
0056: *
0057: * <p>RuleBasedCollator provides correct collation orders for most
0058: * locales supported in ICU. If specific data for a locale is not
0059: * available, the order eventually falls back to the <a
0060: * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation
0061: * order </a>.</p>
0062: *
0063: * <p>For information about the collation rule syntax and details
0064: * about customization, please refer to the
0065: * <a href="http://icu.sourceforge.net/userguide/Collate_Customization.html">
0066: * Collation customization</a> section of the user's guide.</p>
0067: *
0068: * <p><strong>Note</strong> that there are some differences between
0069: * the Collation rule syntax used in Java and ICU4J:
0070: *
0071: * <ul>
0072: * <li>According to the JDK documentation:
0073: * <i>
0074: * <p>
0075: * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
0076: * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a
0077: * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the
0078: * range \U0EC0-\U0EC4 precedes a Lao consonant of the range
0079: * \U0E81-\U0EAE then the
0080: * vowel is placed after the consonant for collation purposes.
0081: * </p>
0082: * <p>
0083: * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
0084: * swapping is not turned on.
0085: * </p>
0086: * </i>
0087: * <p>
0088: * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
0089: * vowel-consonant swapping, since the UCA clearly states that it has to be
0090: * supported to ensure a correct sorting order. If a '!' is encountered, it is
0091: * ignored.
0092: * </p>
0093: * <li>As mentioned in the documentation of the base class Collator,
0094: * compatibility decomposition mode is not supported.
0095: * </ul>
0096: * <p>
0097: * <strong>Examples</strong>
0098: * </p>
0099: * <p>
0100: * Creating Customized RuleBasedCollators:
0101: * <blockquote>
0102: * <pre>
0103: * String simple = "& a < b < c < d";
0104: * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
0105: *
0106: * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
0107: * + "< f , F < g , G < h , H < i , I < j , "
0108: * + "J < k , K < l , L < m , M < n , N < "
0109: * + "o , O < p , P < q , Q < r , R < s , S < "
0110: * + "t , T < u , U < v , V < w , W < x , X "
0111: * + "< y , Y < z , Z < \u00E5 = a\u030A "
0112: * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
0113: * + ", \u00C6 < \u00F8 , \u00D8";
0114: * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
0115: * </pre>
0116: * </blockquote>
0117: *
0118: * Concatenating rules to combine <code>Collator</code>s:
0119: * <blockquote>
0120: * <pre>
0121: * // Create an en_US Collator object
0122: * RuleBasedCollator en_USCollator = (RuleBasedCollator)
0123: * Collator.getInstance(new Locale("en", "US", ""));
0124: * // Create a da_DK Collator object
0125: * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
0126: * Collator.getInstance(new Locale("da", "DK", ""));
0127: * // Combine the two
0128: * // First, get the collation rules from en_USCollator
0129: * String en_USRules = en_USCollator.getRules();
0130: * // Second, get the collation rules from da_DKCollator
0131: * String da_DKRules = da_DKCollator.getRules();
0132: * RuleBasedCollator newCollator =
0133: * new RuleBasedCollator(en_USRules + da_DKRules);
0134: * // newCollator has the combined rules
0135: * </pre>
0136: * </blockquote>
0137: *
0138: * Making changes to an existing RuleBasedCollator to create a new
0139: * <code>Collator</code> object, by appending changes to the existing rule:
0140: * <blockquote>
0141: * <pre>
0142: * // Create a new Collator object with additional rules
0143: * String addRules = "& C < ch, cH, Ch, CH";
0144: * RuleBasedCollator myCollator =
0145: * new RuleBasedCollator(en_USCollator.getRules() + addRules);
0146: * // myCollator contains the new rules
0147: * </pre>
0148: * </blockquote>
0149: *
0150: * How to change the order of non-spacing accents:
0151: * <blockquote>
0152: * <pre>
0153: * // old rule with main accents
0154: * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
0155: * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
0156: * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
0157: * + "; \u030B ; \u030C ; \u030D ; \u030E "
0158: * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
0159: * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
0160: * + "< b , B < c, C < e, E & C < d , D";
0161: * // change the order of accent characters
0162: * String addOn = "& \u0300 ; \u0308 ; \u0302";
0163: * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
0164: * </pre>
0165: * </blockquote>
0166: *
0167: * Putting in a new primary ordering before the default setting,
0168: * e.g. sort English characters before or after Japanese characters in the Japanese
0169: * <code>Collator</code>:
0170: * <blockquote>
0171: * <pre>
0172: * // get en_US Collator rules
0173: * RuleBasedCollator en_USCollator
0174: * = (RuleBasedCollator)Collator.getInstance(Locale.US);
0175: * // add a few Japanese characters to sort before English characters
0176: * // suppose the last character before the first base letter 'a' in
0177: * // the English collation rule is \u2212
0178: * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
0179: * + "\u3044";
0180: * RuleBasedCollator myJapaneseCollator
0181: * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
0182: * </pre>
0183: * </blockquote>
0184: * </p>
0185: * <p>
0186: * This class is not subclassable
0187: * </p>
0188: * @author Syn Wee Quek
0189: * @stable ICU 2.8
0190: */
0191: public final class RuleBasedCollator extends Collator {
0192: // public constructors ---------------------------------------------------
0193:
0194: /**
0195: * <p>
0196: * Constructor that takes the argument rules for
0197: * customization. The collator will be based on UCA,
0198: * with the attributes and re-ordering of the characters specified in the
0199: * argument rules.
0200: * </p>
0201: * <p>See the user guide's section on
0202: * <a href="http://icu.sourceforge.net/userguide/Collate_Customization.html">
0203: * Collation Customization</a> for details on the rule syntax.
0204: * </p>
0205: * @param rules the collation rules to build the collation table from.
0206: * @exception ParseException and IOException thrown. ParseException is thrown
0207: * when the argument rules have an invalid syntax; IOException is
0208: * thrown when an error occurred while reading internal data.
0209: * @stable ICU 2.8
0210: */
0211: public RuleBasedCollator(String rules) throws Exception {
0212: checkUCA();
0213: if (rules == null) {
0214: throw new IllegalArgumentException(
0215: "Collation rules can not be null");
0216: }
0217: init(rules);
0218: }
0219:
0220: // public methods --------------------------------------------------------
0221:
0222: /**
0223: * Clones the RuleBasedCollator
0224: * @return a new instance of this RuleBasedCollator object
0225: * @stable ICU 2.8
0226: */
0227: public Object clone() throws CloneNotSupportedException {
0228: RuleBasedCollator result = (RuleBasedCollator) super.clone();
0229: if (latinOneCEs_ != null) {
0230: result.m_reallocLatinOneCEs_ = true;
0231: }
0232: // since the collation data in the RuleBasedCollator does not change,
0233: // the clone can safely share this collator's fields
0234: result.initUtility(false); // let the new clone have its own util
0235: // iterators
0236: return result;
0237: }
0238:
0239: /**
0240: * Return a CollationElementIterator for the given String.
0241: * @see CollationElementIterator
0242: * @stable ICU 2.8
0243: */
0244: public CollationElementIterator getCollationElementIterator(
0245: String source) {
0246: return new CollationElementIterator(source, this);
0247: }
0248:
0249: /**
0250: * Return a CollationElementIterator for the given CharacterIterator.
0251: * The source iterator's integrity will be preserved since a new copy
0252: * will be created for use.
0253: * @see CollationElementIterator
0254: * @stable ICU 2.8
0255: */
0256: public CollationElementIterator getCollationElementIterator(
0257: CharacterIterator source) {
0258: CharacterIterator newsource = (CharacterIterator) source
0259: .clone();
0260: return new CollationElementIterator(newsource, this);
0261: }
0262:
0263: /**
0264: * Return a CollationElementIterator for the given UCharacterIterator.
0265: * The source iterator's integrity will be preserved since a new copy
0266: * will be created for use.
0267: * @see CollationElementIterator
0268: * @stable ICU 2.8
0269: */
0270: public CollationElementIterator getCollationElementIterator(
0271: UCharacterIterator source) {
0272: return new CollationElementIterator(source, this);
0273: }
0274:
0275: // public setters --------------------------------------------------------
0276:
0277: /**
0278: * Sets the Hiragana Quaternary mode to be on or off.
0279: * When the Hiragana Quaternary mode is turned on, the collator
0280: * positions Hiragana characters before all non-ignorable characters in
0281: * QUATERNARY strength. This is to produce a correct JIS collation order,
0282: * distinguishing between Katakana and Hiragana characters.
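 * <p>
 * A minimal usage sketch; the Japanese locale and the sample characters
 * below are illustrative assumptions, not part of this method's contract:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.JAPANESE);
 * coll.setStrength(Collator.QUATERNARY);
 * coll.setHiraganaQuaternary(true);
 * // at QUATERNARY strength the collator now places Hiragana before
 * // other non-ignorable characters, giving a JIS-style ordering
 * int order = coll.compare("\u3042", "\u30A2"); // Hiragana 'a' vs Katakana 'a'
 * </pre>
 * </blockquote>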
0283: * @param flag true if Hiragana Quaternary mode is to be on, false
0284: * otherwise
0285: * @see #setHiraganaQuaternaryDefault
0286: * @see #isHiraganaQuaternary
0287: * @stable ICU 2.8
0288: */
0289: public void setHiraganaQuaternary(boolean flag) {
0290: m_isHiragana4_ = flag;
0291: updateInternalState();
0292: }
0293:
0294: /**
0295: * Sets the Hiragana Quaternary mode to the initial mode set during
0296: * construction of the RuleBasedCollator.
0297: * See setHiraganaQuaternary(boolean) for more details.
0298: * @see #setHiraganaQuaternary(boolean)
0299: * @see #isHiraganaQuaternary
0300: * @stable ICU 2.8
0301: */
0302: public void setHiraganaQuaternaryDefault() {
0303: m_isHiragana4_ = m_defaultIsHiragana4_;
0304: updateInternalState();
0305: }
0306:
0307: /**
0308: * Sets whether uppercase characters sort before lowercase
0309: * characters or vice versa, in strength TERTIARY. The default
0310: * mode is false, and so lowercase characters sort before uppercase
0311: * characters.
0312: * If set to true, uppercase characters are sorted first.
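 * <p>
 * A minimal sketch; the English locale and sample strings are illustrative
 * assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * coll.setUpperCaseFirst(true);
 * // "Apple" is now expected to sort before "apple"
 * int order = coll.compare("Apple", "apple");
 * </pre>
 * </blockquote>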
0313: * @param upperfirst true to sort uppercase characters before
0314: * lowercase characters, false to sort lowercase
0315: * characters before uppercase characters
0316: * @see #isLowerCaseFirst
0317: * @see #isUpperCaseFirst
0318: * @see #setLowerCaseFirst
0319: * @see #setCaseFirstDefault
0320: * @stable ICU 2.8
0321: */
0322: public void setUpperCaseFirst(boolean upperfirst) {
0323: if (upperfirst) {
0324: if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
0325: latinOneRegenTable_ = true;
0326: }
0327: m_caseFirst_ = AttributeValue.UPPER_FIRST_;
0328: } else {
0329: if (m_caseFirst_ != AttributeValue.OFF_) {
0330: latinOneRegenTable_ = true;
0331: }
0332: m_caseFirst_ = AttributeValue.OFF_;
0333: }
0334: updateInternalState();
0335: }
0336:
0337: /**
0338: * Sets whether lower cased characters sort before upper cased
0339: * characters, in strength TERTIARY. The default
0340: * mode is false.
0341: * If set to true, the RuleBasedCollator sorts lower cased characters
0342: * before upper cased ones.
0343: * Otherwise, if set to false, the RuleBasedCollator ignores case
0344: * preferences.
0345: * @param lowerfirst true for sorting lower cased characters before
0346: * upper cased characters, false to ignore case
0347: * preferences.
0348: * @see #isLowerCaseFirst
0349: * @see #isUpperCaseFirst
0350: * @see #setUpperCaseFirst
0351: * @see #setCaseFirstDefault
0352: * @stable ICU 2.8
0353: */
0354: public void setLowerCaseFirst(boolean lowerfirst) {
0355: if (lowerfirst) {
0356: if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
0357: latinOneRegenTable_ = true;
0358: }
0359: m_caseFirst_ = AttributeValue.LOWER_FIRST_;
0360: } else {
0361: if (m_caseFirst_ != AttributeValue.OFF_) {
0362: latinOneRegenTable_ = true;
0363: }
0364: m_caseFirst_ = AttributeValue.OFF_;
0365: }
0366: updateInternalState();
0367: }
0368:
0369: /**
0370: * Sets the case first mode to the initial mode set during
0371: * construction of the RuleBasedCollator.
0372: * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
0373: * details.
0374: * @see #isLowerCaseFirst
0375: * @see #isUpperCaseFirst
0376: * @see #setLowerCaseFirst(boolean)
0377: * @see #setUpperCaseFirst(boolean)
0378: * @stable ICU 2.8
0379: */
0380: public final void setCaseFirstDefault() {
0381: if (m_caseFirst_ != m_defaultCaseFirst_) {
0382: latinOneRegenTable_ = true;
0383: }
0384: m_caseFirst_ = m_defaultCaseFirst_;
0385: updateInternalState();
0386: }
0387:
0388: /**
0389: * Sets the alternate handling mode to the initial mode set during
0390: * construction of the RuleBasedCollator.
0391: * See setAlternateHandling(boolean) for more details.
0392: * @see #setAlternateHandlingShifted(boolean)
0393: * @see #isAlternateHandlingShifted()
0394: * @stable ICU 2.8
0395: */
0396: public void setAlternateHandlingDefault() {
0397: m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
0398: updateInternalState();
0399: }
0400:
0401: /**
0402: * Sets the case level mode to the initial mode set during
0403: * construction of the RuleBasedCollator.
0404: * See setCaseLevel(boolean) for more details.
0405: * @see #setCaseLevel(boolean)
0406: * @see #isCaseLevel
0407: * @stable ICU 2.8
0408: */
0409: public void setCaseLevelDefault() {
0410: m_isCaseLevel_ = m_defaultIsCaseLevel_;
0411: updateInternalState();
0412: }
0413:
0414: /**
0415: * Sets the decomposition mode to the initial mode set during construction
0416: * of the RuleBasedCollator.
0417: * See setDecomposition(int) for more details.
0418: * @see #getDecomposition
0419: * @see #setDecomposition(int)
0420: * @stable ICU 2.8
0421: */
0422: public void setDecompositionDefault() {
0423: setDecomposition(m_defaultDecomposition_);
0424: updateInternalState();
0425: }
0426:
0427: /**
0428: * Sets the French collation mode to the initial mode set during
0429: * construction of the RuleBasedCollator.
0430: * See setFrenchCollation(boolean) for more details.
0431: * @see #isFrenchCollation
0432: * @see #setFrenchCollation(boolean)
0433: * @stable ICU 2.8
0434: */
0435: public void setFrenchCollationDefault() {
0436: if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
0437: latinOneRegenTable_ = true;
0438: }
0439: m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
0440: updateInternalState();
0441: }
0442:
0443: /**
0444: * Sets the collation strength to the initial mode set during the
0445: * construction of the RuleBasedCollator.
0446: * See setStrength(int) for more details.
0447: * @see #setStrength(int)
0448: * @see #getStrength
0449: * @stable ICU 2.8
0450: */
0451: public void setStrengthDefault() {
0452: setStrength(m_defaultStrength_);
0453: updateInternalState();
0454: }
0455:
0456: /**
0457: * Method to set numeric collation to its default value.
0458: * When numeric collation is turned on, this Collator generates a collation
0459: * key for the numeric value of substrings of digits. This is a way to get
0460: * '100' to sort AFTER '2'
0461: * @see #getNumericCollation
0462: * @see #setNumericCollation
0463: * @stable ICU 2.8
0464: */
0465: public void setNumericCollationDefault() {
0466: setNumericCollation(m_defaultIsNumericCollation_);
0467: updateInternalState();
0468: }
0469:
0470: /**
0471: * Sets the mode for the direction of SECONDARY weights to be used in
0472: * French collation.
0473: * The default value is false, which treats SECONDARY weights in the order
0474: * they appear.
0475: * If set to true, the SECONDARY weights will be sorted backwards.
0476: * See the section on
0477: * <a href="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
0478: * French collation</a> for more information.
0479: * @param flag true to set the French collation on, false to set it off
0480: * @stable ICU 2.8
0481: * @see #isFrenchCollation
0482: * @see #setFrenchCollationDefault
0483: */
0484: public void setFrenchCollation(boolean flag) {
0485: if (m_isFrenchCollation_ != flag) {
0486: latinOneRegenTable_ = true;
0487: }
0488: m_isFrenchCollation_ = flag;
0489: updateInternalState();
0490: }
0491:
0492: /**
0493: * Sets the alternate handling for QUATERNARY strength to be either
0494: * shifted or non-ignorable.
0495: * See the UCA definition on
0496: * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
0497: * Alternate Weighting</a>.
0498: * This attribute will only be effective when QUATERNARY strength is set.
0499: * The default value for this mode is false, corresponding to the
0500: * NON_IGNORABLE mode in UCA. In NON_IGNORABLE mode, the
0501: * RuleBasedCollator treats all the codepoints with non-ignorable
0502: * primary weights in the same way.
0503: * If the mode is set to true, the behaviour corresponds to SHIFTED as defined
0504: * in UCA; codepoints with PRIMARY orders that are equal to or
0505: * below the variable top value are ignored in PRIMARY order and
0506: * moved to the QUATERNARY order.
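 * <p>
 * A minimal sketch; the English locale and sample strings are illustrative
 * assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * coll.setStrength(Collator.QUATERNARY);
 * coll.setAlternateHandlingShifted(true);
 * // the hyphen is shifted down to the quaternary level, so
 * // "black-bird" and "blackbird" now differ only at QUATERNARY strength
 * int order = coll.compare("black-bird", "blackbird");
 * </pre>
 * </blockquote>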
0507: * @param shifted true if SHIFTED behaviour for alternate handling is
0508: * desired, false for the NON_IGNORABLE behaviour.
0509: * @see #isAlternateHandlingShifted
0510: * @see #setAlternateHandlingDefault
0511: * @stable ICU 2.8
0512: */
0513: public void setAlternateHandlingShifted(boolean shifted) {
0514: m_isAlternateHandlingShifted_ = shifted;
0515: updateInternalState();
0516: }
0517:
0518: /**
0519: * <p>
0520: * When case level is set to true, an additional weight is formed
0521: * between the SECONDARY and TERTIARY weight, known as the case level.
0522: * The case level is used to distinguish large and small Japanese Kana
0523: * characters. Case level could also be used in other situations,
0524: * for example to distinguish certain Pinyin characters.
0525: * The default value is false, which means the case level is not generated.
0526: * The contents of the case level are affected by the case first
0527: * mode. A simple way to ignore accent differences in a string is to set
0528: * the strength to PRIMARY and enable case level.
0529: * </p>
0530: * <p>
0531: * See the section on
0532: * <a href="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
0533: * case level</a> for more information.
0534: * </p>
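 * <p>
 * A minimal sketch of the accent-ignoring combination mentioned above;
 * the English locale and sample strings are illustrative assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * coll.setStrength(Collator.PRIMARY);
 * coll.setCaseLevel(true);
 * // accent differences are ignored, case differences are kept:
 * int sameLetters = coll.compare("role", "r\u00F4le"); // expected 0
 * int caseDiffers = coll.compare("role", "Role");      // expected non-zero
 * </pre>
 * </blockquote>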
0535: * @param flag true if case level sorting is required, false otherwise
0536: * @stable ICU 2.8
0537: * @see #setCaseLevelDefault
0538: * @see #isCaseLevel
0539: */
0540: public void setCaseLevel(boolean flag) {
0541: m_isCaseLevel_ = flag;
0542: updateInternalState();
0543: }
0544:
0545: /**
0546: * <p>
0547: * Sets this Collator's strength property. The strength property
0548: * determines the minimum level of difference considered significant
0549: * during comparison.
0550: * </p>
0551: * <p>See the Collator class description for an example of use.</p>
0552: * @param newStrength the new strength value.
0553: * @see #getStrength
0554: * @see #setStrengthDefault
0555: * @see #PRIMARY
0556: * @see #SECONDARY
0557: * @see #TERTIARY
0558: * @see #QUATERNARY
0559: * @see #IDENTICAL
0560: * @exception IllegalArgumentException If the new strength value is not one
0561: * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
0562: * @stable ICU 2.8
0563: */
0564: public void setStrength(int newStrength) {
0565: super.setStrength(newStrength);
0566: updateInternalState();
0567: }
0568:
0569: /**
0570: * <p>
0571: * Variable top is a two byte primary value which causes all the codepoints
0572: * with primary values that are less than or equal to the variable top to be
0573: * shifted when alternate handling is set to SHIFTED.
0574: * </p>
0575: * <p>
0576: * Sets the variable top to a collation element value of a string supplied.
0577: * </p>
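 * <p>
 * A minimal sketch; using the space character as the variable top is an
 * illustrative assumption:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * coll.setAlternateHandlingShifted(true);
 * int varTop = coll.setVariableTop(" ");
 * // only code points sorting at or below the space are now variable;
 * // the returned value can be re-applied later with setVariableTop(int)
 * </pre>
 * </blockquote>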
0578: * @param varTop one or more (if contraction) characters to which the
0579: * variable top should be set
0580: * @return an int value containing the value of the variable top in the upper 16
0581: * bits. Lower 16 bits are undefined.
0582: * @exception IllegalArgumentException is thrown if varTop argument is not
0583: * a valid variable top element. A variable top element is
0584: * invalid when
0585: * <ul>
0586: * <li>it is a contraction that does not exist in the
0587: * Collation order
0588: * <li>when the PRIMARY strength collation element for the
0589: * variable top has more than two bytes
0590: * <li>when the varTop argument is null or zero in length.
0591: * </ul>
0592: * @see #getVariableTop
0593: * @see RuleBasedCollator#setAlternateHandlingShifted
0594: * @stable ICU 2.6
0595: */
0596: public int setVariableTop(String varTop) {
0597: if (varTop == null || varTop.length() == 0) {
0598: throw new IllegalArgumentException(
0599: "Variable top argument string can not be null or zero in length.");
0600: }
0601: if (m_srcUtilIter_ == null) {
0602: initUtility(true);
0603: }
0604:
0605: m_srcUtilColEIter_.setText(varTop);
0606: int ce = m_srcUtilColEIter_.next();
0607:
0608: // here we check if we have consumed all characters
0609: // you can put in either one character or a contraction
0610: // you shouldn't put more...
0611: if (m_srcUtilColEIter_.getOffset() != varTop.length()
0612: || ce == CollationElementIterator.NULLORDER) {
0613: throw new IllegalArgumentException(
0614: "Variable top argument string is a contraction that does not exist "
0615: + "in the Collation order");
0616: }
0617:
0618: int nextCE = m_srcUtilColEIter_.next();
0619:
0620: if ((nextCE != CollationElementIterator.NULLORDER)
0621: && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
0622: throw new IllegalArgumentException(
0623: "Variable top argument string can only have a single collation "
0624: + "element that has less than or equal to two PRIMARY strength "
0625: + "bytes");
0626: }
0627:
0628: m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
0629:
0630: return ce & CE_PRIMARY_MASK_;
0631: }
0632:
0633: /**
0634: * Sets the variable top to a collation element value supplied.
0635: * Variable top is set to the upper 16 bits.
0636: * Lower 16 bits are ignored.
0637: * @param varTop Collation element value, as returned by setVariableTop or
0638: * getVariableTop
0639: * @see #getVariableTop
0640: * @see #setVariableTop(String)
0641: * @stable ICU 2.6
0642: */
0643: public void setVariableTop(int varTop) {
0644: m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
0645: }
0646:
0647: /**
0648: * When numeric collation is turned on, this Collator generates a collation
0649: * key for the numeric value of substrings of digits. This is a way to get
0650: * '100' to sort AFTER '2'
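 * <p>
 * A minimal sketch; the English locale and sample strings are illustrative
 * assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * coll.setNumericCollation(true);
 * // digit substrings now compare by numeric value, so "item2" is
 * // expected to sort before "item100"
 * int order = coll.compare("item2", "item100");
 * </pre>
 * </blockquote>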
0651: * @param flag true to turn numeric collation on and false to turn it off
0652: * @see #getNumericCollation
0653: * @see #setNumericCollationDefault
0654: * @stable ICU 2.8
0655: */
0656: public void setNumericCollation(boolean flag) {
0657: // sort substrings of digits as numbers
0658: m_isNumericCollation_ = flag;
0659: updateInternalState();
0660: }
0661:
0662: // public getters --------------------------------------------------------
0663:
0664: /**
0665: * Gets the collation rules for this RuleBasedCollator.
0666: * Equivalent to getRules(false), which returns only the tailoring rules.
0667: * @return returns the collation rules
0668: * @see #getRules(boolean)
0669: * @stable ICU 2.8
0670: */
0671: public String getRules() {
0672: return m_rules_;
0673: }
0674:
0675: /**
0676: * Returns the current rules. The argument defines whether the full rules
0677: * (UCA + tailoring) are returned or just the tailoring.
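 * <p>
 * A minimal sketch; <code>coll</code> stands for any RuleBasedCollator
 * instance:
 * </p>
 * <blockquote>
 * <pre>
 * String tailoringOnly = coll.getRules(false);
 * String fullRules = coll.getRules(true); // UCA rules + tailoring
 * </pre>
 * </blockquote>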
0678: * @param fullrules true if the rules that defines the full set of
0679: * collation order is required, otherwise false for returning only
0680: * the tailored rules
0681: * @return the current rules that define this Collator.
0682: * @see #getRules()
0683: * @stable ICU 2.6
0684: */
0685: public String getRules(boolean fullrules) {
0686: if (!fullrules) {
0687: return m_rules_;
0688: }
0689: // take the UCA rules and append real rules at the end
0690: return UCA_.m_rules_.concat(m_rules_);
0691: }
0692:
0693: /**
0694: * Gets a UnicodeSet that contains all the characters and sequences
0695: * tailored in this collator.
0696: * @return a UnicodeSet object containing all the
0697: * code points and sequences that may sort differently than
0698: * in the UCA.
0699: * @exception ParseException thrown when the collator's rules have an
0700: * invalid syntax.
0701: * @stable ICU 2.4
0702: */
0703: public UnicodeSet getTailoredSet() {
0704: try {
0705: CollationRuleParser src = new CollationRuleParser(
0706: getRules());
0707: return src.getTailoredSet();
0708: } catch (Exception e) {
0709: throw new IllegalStateException(
0710: "A tailoring rule should not "
0711: + "have errors. Something is quite wrong!");
0712: }
0713: }
0714:
0715: private class contContext {
0716: RuleBasedCollator coll;
0717: UnicodeSet contractions;
0718: UnicodeSet expansions;
0719: UnicodeSet removedContractions;
0720: boolean addPrefixes;
0721:
0722: contContext(RuleBasedCollator coll, UnicodeSet contractions,
0723: UnicodeSet expansions, UnicodeSet removedContractions,
0724: boolean addPrefixes) {
0725: this.coll = coll;
0726: this.contractions = contractions;
0727: this.expansions = expansions;
0728: this.removedContractions = removedContractions;
0729: this.addPrefixes = addPrefixes;
0730: }
0731: }
0732:
0733: private void addSpecial(contContext c, StringBuffer buffer, int CE) {
0734: StringBuffer b = new StringBuffer();
0735: int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
0736: int newCE = c.coll.m_contractionCE_[offset];
0737: // we might have a contraction that ends from previous level
0738: if (newCE != CollationElementIterator.CE_NOT_FOUND_) {
0739: if (isSpecial(CE)
0740: && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
0741: && isSpecial(newCE)
0742: && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
0743: && c.addPrefixes) {
0744: addSpecial(c, buffer, newCE);
0745: }
0746: if (buffer.length() > 1) {
0747: if (c.contractions != null) {
0748: c.contractions.add(buffer.toString());
0749: }
0750: if (c.expansions != null
0751: && isSpecial(CE)
0752: && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
0753: c.expansions.add(buffer.toString());
0754: }
0755: }
0756: }
0757:
0758: offset++;
0759: // check whether we're doing contraction or prefix
0760: if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_
0761: && c.addPrefixes) {
0762: while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
0763: b.delete(0, b.length());
0764: b.append(buffer);
0765: newCE = c.coll.m_contractionCE_[offset];
0766: b.insert(0, c.coll.m_contractionIndex_[offset]);
0767: if (isSpecial(newCE)
0768: && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
0769: addSpecial(c, b, newCE);
0770: } else {
0771: if (c.contractions != null) {
0772: c.contractions.add(b.toString());
0773: }
0774: if (c.expansions != null
0775: && isSpecial(newCE)
0776: && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
0777: c.expansions.add(b.toString());
0778: }
0779: }
0780: offset++;
0781: }
0782: } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
0783: while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
0784: b.delete(0, b.length());
0785: b.append(buffer);
0786: newCE = c.coll.m_contractionCE_[offset];
0787: b.append(c.coll.m_contractionIndex_[offset]);
0788: if (isSpecial(newCE)
0789: && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
0790: addSpecial(c, b, newCE);
0791: } else {
0792: if (c.contractions != null) {
0793: c.contractions.add(b.toString());
0794: }
0795: if (c.expansions != null
0796: && isSpecial(newCE)
0797: && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
0798: c.expansions.add(b.toString());
0799: }
0800: }
0801: offset++;
0802: }
0803: }
0804: }
0805:
0806: private void processSpecials(contContext c) {
0807: int internalBufferSize = 512;
0808: TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_);
0809: RangeValueIterator.Element element = new RangeValueIterator.Element();
0810: while (trieiterator.next(element)) {
0811: int start = element.start;
0812: int limit = element.limit;
0813: int CE = element.value;
0814: StringBuffer contraction = new StringBuffer(
0815: internalBufferSize);
0816:
0817: if (isSpecial(CE)) {
0818: if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
0819: while (start < limit) {
0820: // if there are suppressed contractions, we don't
0821: // want to add them.
0822: if (c.removedContractions != null
0823: && c.removedContractions
0824: .contains(start)) {
0825: start++;
0826: continue;
0827: }
0828: // we start our contraction from middle, since we don't know if it
0829: // will grow toward right or left
0830: contraction.append((char) start);
0831: addSpecial(c, contraction, CE);
0832: start++;
0833: }
0834: } else if (c.expansions != null
0835: && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
0836: while (start < limit) {
0837: c.expansions.add(start++);
0838: }
0839: }
0840: }
0841: }
0842: }
0843:
0844: /**
0845: * Gets UnicodeSets containing the contractions and/or expansions of this collator.
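 * <p>
 * A minimal sketch; the French locale is an illustrative assumption, and
 * the call is declared to throw Exception, so invoke it from code that
 * handles or declares it:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.FRENCH);
 * UnicodeSet contractions = new UnicodeSet();
 * UnicodeSet expansions = new UnicodeSet();
 * coll.getContractionsAndExpansions(contractions, expansions, true);
 * // contractions and expansions now hold the tailored sequences
 * </pre>
 * </blockquote>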
0846: * @param contractions if not null, set to contain contractions
0847: * @param expansions if not null, set to contain expansions
0848: * @param addPrefixes add the prefix contextual elements to contractions
0849: * @throws Exception
0850: * @draft ICU 3.4
0851: * @provisional This API might change or be removed in a future release.
0852: */
0853: public void getContractionsAndExpansions(UnicodeSet contractions,
0854: UnicodeSet expansions, boolean addPrefixes)
0855: throws Exception {
0856: if (contractions != null) {
0857: contractions.clear();
0858: }
0859: if (expansions != null) {
0860: expansions.clear();
0861: }
0862: int rulesLen = 0;
0863: String rules = getRules();
0864: try {
0865: CollationRuleParser src = new CollationRuleParser(rules);
0866: contContext c = new contContext(RuleBasedCollator.UCA_,
0867: contractions, expansions, src.m_removeSet_,
0868: addPrefixes);
0869:
0870: // Add the UCA contractions
0871: processSpecials(c);
0872: // This is collator specific. Add contractions from a collator
0873: c.coll = this;
0874: c.removedContractions = null;
0875: processSpecials(c);
0876: } catch (Exception e) {
0877: throw e;
0878: }
0879: }
0880:
0881: /**
0882: * <p>
0883: * Get a Collation key for the argument String source from this
0884: * RuleBasedCollator.
0885: * </p>
0886: * <p>
0887: * General recommendation: <br>
0888: * If comparisons are to be done on the same String multiple times, it is
0889: * more efficient to generate CollationKeys for the Strings and use
0890: * CollationKey.compareTo(CollationKey) for the comparisons.
0891: * If each String is compared only once, the method
0892: * RuleBasedCollator.compare(String, String) has better performance.
0893: * </p>
0894: * <p>
0895: * See the class documentation for an explanation about CollationKeys.
0896: * </p>
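 * <p>
 * A minimal sketch of the CollationKey recommendation above; the English
 * locale and sample strings are illustrative assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * CollationKey k1 = coll.getCollationKey("apple");
 * CollationKey k2 = coll.getCollationKey("Apple");
 * // reusing the keys avoids re-processing the strings on every comparison
 * int order = k1.compareTo(k2); // same sign as coll.compare("apple", "Apple")
 * </pre>
 * </blockquote>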
0897: * @param source the text String to be transformed into a collation key.
0898: * @return the CollationKey for the given String based on this
0899: * RuleBasedCollator's collation rules. If the source String is
0900: * null, a null CollationKey is returned.
0901: * @see CollationKey
0902: * @see #compare(String, String)
0903: * @see #getRawCollationKey
0904: * @stable ICU 2.8
0905: */
0906: public CollationKey getCollationKey(String source) {
0907: if (source == null) {
0908: return null;
0909: }
0910: m_utilRawCollationKey_ = getRawCollationKey(source,
0911: m_utilRawCollationKey_);
0912: return new CollationKey(source, m_utilRawCollationKey_);
0913: }
0914:
0915: /**
0916: * Gets the simpler form of a CollationKey for the String source following
0917: * the rules of this Collator and stores the result into the user provided
0918: * argument key.
0919: * If key has an internal byte array whose length is too small for the
0920: * result, the internal byte array will be grown to the exact required
0921: * size.
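 * <p>
 * A minimal sketch of reusing one key object; the English locale and the
 * sample string are illustrative assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * RawCollationKey reusable = new RawCollationKey();
 * reusable = coll.getRawCollationKey("apple", reusable);
 * // the same RawCollationKey instance can be passed in again for the
 * // next string, avoiding a new allocation per call
 * </pre>
 * </blockquote>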
0922: * @param source the text String to be transformed into a RawCollationKey
0923: * @param key output RawCollationKey to store results
0924: * @return If key is null, a new instance of RawCollationKey will be
0925: * created and returned, otherwise the user provided key will be
0926: * returned.
0927: * @see #getCollationKey
0928: * @see #compare(String, String)
0929: * @see RawCollationKey
0930: * @stable ICU 2.8
0931: */
0932: public RawCollationKey getRawCollationKey(String source,
0933: RawCollationKey key) {
0934: if (source == null) {
0935: return null;
0936: }
0937: int strength = getStrength();
0938: m_utilCompare0_ = m_isCaseLevel_;
0939: m_utilCompare1_ = true;
0940: m_utilCompare2_ = strength >= SECONDARY;
0941: m_utilCompare3_ = strength >= TERTIARY;
0942: m_utilCompare4_ = strength >= QUATERNARY;
0943: m_utilCompare5_ = strength == IDENTICAL;
0944:
0945: m_utilBytesCount0_ = 0;
0946: m_utilBytesCount1_ = 0;
0947: m_utilBytesCount2_ = 0;
0948: m_utilBytesCount3_ = 0;
0949: m_utilBytesCount4_ = 0;
0950: m_utilBytesCount5_ = 0;
0951: m_utilCount0_ = 0;
0952: m_utilCount1_ = 0;
0953: m_utilCount2_ = 0;
0954: m_utilCount3_ = 0;
0955: m_utilCount4_ = 0;
0956: m_utilCount5_ = 0;
0957: boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
0958: // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
0959: // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
0960: // high.
0961: int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1)
0962: & LAST_BYTE_MASK_;
0963: byte hiragana4 = 0;
0964: if (m_isHiragana4_ && m_utilCompare4_) {
0965: // allocate one more space for hiragana, value for hiragana
0966: hiragana4 = (byte) commonBottom4;
0967: commonBottom4++;
0968: }
0969:
0970: int bottomCount4 = 0xFF - commonBottom4;
0971: // If we need to normalize, we'll do it all at once at the beginning!
0972: if (m_utilCompare5_
0973: && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
0974: // if it is identical strength, we have to normalize the string to
0975: // NFD so that it will be appended correctly to the end of the sort
0976: // key
0977: source = Normalizer.decompose(source, false);
0978: } else if (getDecomposition() != NO_DECOMPOSITION
0979: && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) {
0980: // for the rest of the strength, if decomposition is on, FCD is
0981: // enough for us to work on.
0982: source = Normalizer.normalize(source, Normalizer.FCD);
0983: }
0984: getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
0985: bottomCount4);
0986: if (key == null) {
0987: key = new RawCollationKey();
0988: }
0989: getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
0990: return key;
0991: }
0992:
0993: /**
0994: * Return true if an uppercase character is sorted before the corresponding lowercase character.
0995: * See setUpperCaseFirst(boolean) for details.
0996: * @see #setUpperCaseFirst
0997: * @see #setLowerCaseFirst
0998: * @see #isLowerCaseFirst
0999: * @see #setCaseFirstDefault
1000: * @return true if upper cased characters are sorted before lower cased
1001: * characters, false otherwise
1002: * @stable ICU 2.8
1003: */
1004: public boolean isUpperCaseFirst() {
1005: return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
1006: }
1007:
1008: /**
1009: * Return true if a lowercase character is sorted before the corresponding uppercase character.
1010: * See setLowerCaseFirst(boolean) for details.
1011: * @see #setUpperCaseFirst
1012: * @see #setLowerCaseFirst
1013: * @see #isUpperCaseFirst
1014: * @see #setCaseFirstDefault
1015: * @return true if lower cased characters are sorted before upper cased
1016: * characters, false otherwise
1017: * @stable ICU 2.8
1018: */
1019: public boolean isLowerCaseFirst() {
1020: return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
1021: }
1022:
1023: /**
1024: * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
1025: * NON_IGNORABLE.
1026: * If return value is true, then the alternate handling attribute for the
1027: * Collator is SHIFTED. Otherwise if return value is false, then the
1028: * alternate handling attribute for the Collator is NON_IGNORABLE.
1029: * See setAlternateHandlingShifted(boolean) for more details.
1030: * @return true if the alternate handling is SHIFTED, false if it is NON_IGNORABLE
1031: * @see #setAlternateHandlingShifted(boolean)
1032: * @see #setAlternateHandlingDefault
1033: * @stable ICU 2.8
1034: */
1035: public boolean isAlternateHandlingShifted() {
1036: return m_isAlternateHandlingShifted_;
1037: }
1038:
1039: /**
1040: * Checks if case level is set to true.
1041: * See setCaseLevel(boolean) for details.
1042: * @return the case level mode
1043: * @see #setCaseLevelDefault
1044: * @see #isCaseLevel
1045: * @see #setCaseLevel(boolean)
1046: * @stable ICU 2.8
1047: */
1048: public boolean isCaseLevel() {
1049: return m_isCaseLevel_;
1050: }
1051:
1052: /**
1053: * Checks if French Collation is set to true.
1054: * See setFrenchCollation(boolean) for details.
1055: * @return true if French Collation is set to true, false otherwise
1056: * @see #setFrenchCollation(boolean)
1057: * @see #setFrenchCollationDefault
1058: * @stable ICU 2.8
1059: */
1060: public boolean isFrenchCollation() {
1061: return m_isFrenchCollation_;
1062: }
1063:
1064: /**
1065: * Checks if the Hiragana Quaternary mode is set on.
1066: * See setHiraganaQuaternary(boolean) for more details.
1067: * @return flag true if Hiragana Quaternary mode is on, false otherwise
1068: * @see #setHiraganaQuaternaryDefault
1069: * @see #setHiraganaQuaternary(boolean)
1070: * @stable ICU 2.8
1071: */
1072: public boolean isHiraganaQuaternary() {
1073: return m_isHiragana4_;
1074: }
1075:
1076: /**
1077: * Gets the variable top value of a Collator.
1078: * Lower 16 bits are undefined and should be ignored.
1079: * @return the variable top value of a Collator.
1080: * @see #setVariableTop
1081: * @stable ICU 2.6
1082: */
1083: public int getVariableTop() {
1084: return m_variableTopValue_ << 16;
1085: }
1086:
1087: /**
1088: * Method to retrieve the numeric collation value.
1089: * When numeric collation is turned on, this Collator generates a collation
1090: * key for the numeric value of substrings of digits. This is a way to get
1091: * '100' to sort AFTER '2'
1092: * @see #setNumericCollation
1093: * @see #setNumericCollationDefault
1094: * @return true if numeric collation is turned on, false otherwise
1095: * @stable ICU 2.8
1096: */
1097: public boolean getNumericCollation() {
1098: return m_isNumericCollation_;
1099: }
1100:
1101: // public other methods -------------------------------------------------
1102:
1103: /**
1104: * Compares the equality of two RuleBasedCollator objects.
1105: * RuleBasedCollator objects are equal if they have the same collation
1106: * rules and the same attributes.
1107: * @param obj the RuleBasedCollator to be compared to.
1108: * @return true if this RuleBasedCollator has exactly the same
1109: * collation behaviour as obj, false otherwise.
1110: * @stable ICU 2.8
1111: */
1112: public boolean equals(Object obj) {
1113: if (obj == null) {
1114: return false; // super does class check
1115: }
1116: if (this == obj) {
1117: return true;
1118: }
1119: if (getClass() != obj.getClass()) {
1120: return false;
1121: }
1122: RuleBasedCollator other = (RuleBasedCollator) obj;
1123: // all other non-transient information is also contained in rules.
1124: if (getStrength() != other.getStrength()
1125: || getDecomposition() != other.getDecomposition()
1126: || other.m_caseFirst_ != m_caseFirst_
1127: || other.m_caseSwitch_ != m_caseSwitch_
1128: || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_
1129: || other.m_isCaseLevel_ != m_isCaseLevel_
1130: || other.m_isFrenchCollation_ != m_isFrenchCollation_
1131: || other.m_isHiragana4_ != m_isHiragana4_) {
1132: return false;
1133: }
1134: boolean rules = m_rules_ == other.m_rules_;
1135: if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
1136: rules = m_rules_.equals(other.m_rules_);
1137: }
1138: if (!rules || !ICUDebug.enabled("collation")) {
1139: return rules;
1140: }
1141: if (m_addition3_ != other.m_addition3_
1142: || m_bottom3_ != other.m_bottom3_
1143: || m_bottomCount3_ != other.m_bottomCount3_
1144: || m_common3_ != other.m_common3_
1145: || m_isSimple3_ != other.m_isSimple3_
1146: || m_mask3_ != other.m_mask3_
1147: || m_minContractionEnd_ != other.m_minContractionEnd_
1148: || m_minUnsafe_ != other.m_minUnsafe_
1149: || m_top3_ != other.m_top3_
1150: || m_topCount3_ != other.m_topCount3_
1151: || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
1152: return false;
1153: }
1154: if (!m_trie_.equals(other.m_trie_)) {
1155: // we should use the trie iterator here, but then this part is
1156: // only used in the test.
1157: for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) {
1158: int v = m_trie_.getCodePointValue(i);
1159: int otherv = other.m_trie_.getCodePointValue(i);
1160: if (v != otherv) {
1161: int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
1162: if (mask == (otherv & 0xff000000)) {
1163: v &= 0xffffff;
1164: otherv &= 0xffffff;
1165: if (mask == 0xf1000000) {
1166: v -= (m_expansionOffset_ << 4);
1167: otherv -= (other.m_expansionOffset_ << 4);
1168: } else if (mask == 0xf2000000) {
1169: v -= m_contractionOffset_;
1170: otherv -= other.m_contractionOffset_;
1171: }
1172: if (v == otherv) {
1173: continue;
1174: }
1175: }
1176: return false;
1177: }
1178: }
1179: }
1180: if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)
1181: && Arrays.equals(m_contractionEnd_,
1182: other.m_contractionEnd_)
1183: && Arrays.equals(m_contractionIndex_,
1184: other.m_contractionIndex_)
1185: && Arrays.equals(m_expansion_, other.m_expansion_)
1186: && Arrays.equals(m_expansionEndCE_,
1187: other.m_expansionEndCE_)) {
1188: // not comparing paddings
1189: for (int i = 0; i < m_expansionEndCE_.length; i++) {
1190: if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
1191: return false;
1192: }
1193: }
1194: return true;
1195: }
1196: return false;
1197: }
1198:
1199: /**
1200: * Generates a unique hash code for this RuleBasedCollator.
1201: * @return the unique hash code for this Collator
1202: * @stable ICU 2.8
1203: */
1204: public int hashCode() {
1205: String rules = getRules();
1206: if (rules == null) {
1207: rules = "";
1208: }
1209: return rules.hashCode();
1210: }
1211:
1212: /**
1213: * <p>Compares the source text String to the target text String according to
1214: * the collation rules, strength and decomposition mode for this
1215: * RuleBasedCollator.
1216: * Returns an integer less than,
1217: * equal to or greater than zero depending on whether the source String is
1218: * less than, equal to or greater than the target String. See the Collator
1219: * class description for an example of use.
1220: * </p>
1221: * <p>
1222: * General recommendation: <br>
1223: * If comparisons are to be done on the same String multiple times, it is
1224: * more efficient to generate CollationKeys for the Strings and use
1225: * CollationKey.compareTo(CollationKey) for the comparisons.
1226: * If speed performance is critical and object instantiation is to be
1227: * reduced, further optimization may be achieved by generating a simpler
1228: * key of the form RawCollationKey and reusing this RawCollationKey
1229: * object with the method RuleBasedCollator.getRawCollationKey. Internal
1230: * byte representation can be directly accessed via RawCollationKey and
1231: * stored for future use. Like CollationKey, RawCollationKey provides a
1232: * method RawCollationKey.compareTo for key comparisons.
1233: * If each String is compared only once, the method
1234: * RuleBasedCollator.compare(String, String) has better performance.
1235: * </p>
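 * <p>
 * A minimal sketch of the two approaches; the English locale and sample
 * strings are illustrative assumptions:
 * </p>
 * <blockquote>
 * <pre>
 * RuleBasedCollator coll =
 *     (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
 * // one-off comparison: compare(String, String) is simplest
 * int once = coll.compare("apple", "Apple");
 * // repeated comparisons: reuse a RawCollationKey per string
 * RawCollationKey key = new RawCollationKey();
 * key = coll.getRawCollationKey("apple", key);
 * </pre>
 * </blockquote>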
1236: * @param source the source text String.
1237: * @param target the target text String.
1238: * @return Returns an integer value. Value is less than zero if source is
1239: * less than target, value is zero if source and target are equal,
1240: * value is greater than zero if source is greater than target.
1241: * @see CollationKey
1242: * @see #getCollationKey
1243: * @stable ICU 2.8
1244: */
1245: public int compare(String source, String target) {
1246: if (source == target) {
1247: return 0;
1248: }
1249:
1250: // Find the length of any leading portion that is equal
1251: int offset = getFirstUnmatchedOffset(source, target);
1252: //return compareRegular(source, target, offset);
1253: if (latinOneUse_) {
1254: if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_)
1255: || (offset < target.length() && target
1256: .charAt(offset) > ENDOFLATINONERANGE_)) {
1257: // source or target start with non-latin-1
1258: return compareRegular(source, target, offset);
1259: } else {
1260: return compareUseLatin1(source, target, offset);
1261: }
1262: } else {
1263: return compareRegular(source, target, offset);
1264: }
1265: }
1266:
1267: // package private inner interfaces --------------------------------------
1268:
1269: /**
1270: * Attribute values to be used when setting the Collator options
1271: */
1272: static interface AttributeValue {
1273: /**
1274: * Indicates that the default attribute value will be used.
1275: * See individual attribute for details on its default value.
1276: */
1277: static final int DEFAULT_ = -1;
1278: /**
1279: * Primary collation strength
1280: */
1281: static final int PRIMARY_ = Collator.PRIMARY;
1282: /**
1283: * Secondary collation strength
1284: */
1285: static final int SECONDARY_ = Collator.SECONDARY;
1286: /**
1287: * Tertiary collation strength
1288: */
1289: static final int TERTIARY_ = Collator.TERTIARY;
1290: /**
1291: * Default collation strength
1292: */
1293: static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
1294: /**
1295: * Internal use for strength checks in Collation elements
1296: */
1297: static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
1298: /**
1299: * Quaternary collation strength
1300: */
1301: static final int QUATERNARY_ = 3;
1302: /**
1303: * Identical collation strength
1304: */
1305: static final int IDENTICAL_ = Collator.IDENTICAL;
1306: /**
1307: * Internal use for strength checks
1308: */
1309: static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
1310: /**
1311: * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
1312: * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1313: */
1314: static final int OFF_ = 16;
1315: /**
1316: * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
1317: * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1318: */
1319: static final int ON_ = 17;
1320: /**
1321: * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
1322: */
1323: static final int SHIFTED_ = 20;
1324: /**
1325: * Valid for ALTERNATE_HANDLING. Alternate handling will be non
1326: * ignorable
1327: */
1328: static final int NON_IGNORABLE_ = 21;
1329: /**
1330: * Valid for CASE_FIRST - lower case sorts before upper case
1331: */
1332: static final int LOWER_FIRST_ = 24;
1333: /**
1334: * Upper case sorts before lower case
1335: */
1336: static final int UPPER_FIRST_ = 25;
1337: /**
1338: * Number of attribute values
1339: */
1340: static final int LIMIT_ = 29;
1341: }
1342:
1343: /**
1344: * Attributes that collation service understands. All the attributes can
1345: * take DEFAULT value, as well as the values specific to each one.
1346: */
1347: static interface Attribute {
1348: /**
1349: * Attribute for direction of secondary weights - used in French.
1350: * Acceptable values are ON, which results in secondary weights being
1351: * considered backwards and OFF which treats secondary weights in the
1352: * order they appear.
1353: */
1354: static final int FRENCH_COLLATION_ = 0;
1355: /**
1356: * Attribute for handling variable elements. Acceptable values are
1357: * NON_IGNORABLE (default) which treats all the codepoints with
1358: * non-ignorable primary weights in the same way, and SHIFTED which
1359: * causes codepoints with primary weights that are equal or below the
1360: * variable top value to be ignored on primary level and moved to the
1361: * quaternary level.
1362: */
1363: static final int ALTERNATE_HANDLING_ = 1;
1364: /**
1365: * Controls the ordering of upper and lower case letters. Acceptable
1366: * values are OFF (default), which orders upper and lower case letters
1367: * in accordance to their tertiary weights, UPPER_FIRST which forces
1368: * upper case letters to sort before lower case letters, and
1369: * LOWER_FIRST which does the opposite.
1370: */
1371: static final int CASE_FIRST_ = 2;
1372: /**
1373: * Controls whether an extra case level (positioned before the third
1374: * level) is generated or not. Acceptable values are OFF (default),
1375: * when case level is not generated, and ON which causes the case
1376: * level to be generated. Contents of the case level are affected by
1377: * the value of CASE_FIRST attribute. A simple way to ignore accent
1378: * differences in a string is to set the strength to PRIMARY and
1379: * enable case level.
1380: */
1381: static final int CASE_LEVEL_ = 3;
1382: /**
1383: * Controls whether the normalization check and necessary
1384: * normalizations are performed. When set to OFF (default) no
1385: * normalization check is performed. The correctness of the result is
1386: * guaranteed only if the input data is in so-called FCD form (see
1387: * users manual for more info). When set to ON, an incremental check
1388: * is performed to see whether the input data is in the FCD form. If
1389: * the data is not in the FCD form, incremental NFD normalization is
1390: * performed.
1391: */
1392: static final int NORMALIZATION_MODE_ = 4;
1393: /**
1394: * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
1395: * QUATERNARY or IDENTICAL. The usual strength for most locales
1396: * (except Japanese) is tertiary. Quaternary strength is useful when
1397: * combined with shifted setting for alternate handling attribute and
1398: * for JIS x 4061 collation, when it is used to distinguish between
1399: * Katakana and Hiragana (this is achieved by setting the
1400: * HIRAGANA_QUATERNARY mode to on). Otherwise, the quaternary level is
1401: * affected only by the number of non ignorable code points in the
1402: * string. Identical strength is rarely useful, as it amounts to
1403: * codepoints of the NFD form of the string.
1404: */
1405: static final int STRENGTH_ = 5;
1406: /**
1407: * When turned on, this attribute positions Hiragana before all
1408: * non-ignorables on quaternary level. This is a sneaky way to produce
1409: * JIS sort order.
1410: */
1411: static final int HIRAGANA_QUATERNARY_MODE_ = 6;
1412: /**
1413: * Attribute count
1414: */
1415: static final int LIMIT_ = 7;
1416: }
1417:
1418: /**
1419: * DataManipulate singleton
1420: */
1421: static class DataManipulate implements Trie.DataManipulate {
1422: // public methods ----------------------------------------------------
1423:
1424: /**
1425: * Internal method called to parse a lead surrogate's ce for the offset
1426: * to the next trail surrogate data.
1427: * @param ce collation element of the lead surrogate
1428: * @return data offset or 0 for the next trail surrogate
1429: * @stable ICU 2.8
1430: */
1431: public final int getFoldingOffset(int ce) {
1432: if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
1433: return (ce & 0xFFFFFF);
1434: }
1435: return 0;
1436: }
1437:
1438: /**
1439: * Get singleton object
1440: */
1441: public static final DataManipulate getInstance() {
1442: if (m_instance_ == null) {
1443: m_instance_ = new DataManipulate();
1444: }
1445: return m_instance_;
1446: }
1447:
1448: // private data member ----------------------------------------------
1449:
1450: /**
1451: * Singleton instance
1452: */
1453: private static DataManipulate m_instance_;
1454:
1455: // private constructor ----------------------------------------------
1456:
1457: /**
1458: * private to prevent initialization
1459: */
1460: private DataManipulate() {
1461: }
1462: }
1463:
1464: /**
1465: * UCAConstants
1466: */
1467: static final class UCAConstants {
1468: int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1469: int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1470: int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
1471: int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
1472: int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
1473: int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
1474: int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
1475: int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
1476: int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
1477: int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
1478: int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
1479: int FIRST_IMPLICIT_[] = new int[2];
1480: int LAST_IMPLICIT_[] = new int[2];
1481: int FIRST_TRAILING_[] = new int[2];
1482: int LAST_TRAILING_[] = new int[2];
1483: int PRIMARY_TOP_MIN_;
1484: int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
1485: int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
1486: int PRIMARY_TRAILING_MIN_; // 0xE8000000
1487: int PRIMARY_TRAILING_MAX_; // 0xF0000000
1488: int PRIMARY_SPECIAL_MIN_; // 0xE8000000
1489: int PRIMARY_SPECIAL_MAX_; // 0xF0000000
1490: }
1491:
1492: // package private data member -------------------------------------------
1493:
1494: static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
1495: static final byte BYTE_COMMON_ = (byte) 0x05;
1496: static final int COMMON_TOP_2_ = 0x86; // int for unsignedness
1497: static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
1498: /**
1499: * Case strength mask
1500: */
1501: static final int CE_CASE_BIT_MASK_ = 0xC0;
1502: static final int CE_TAG_SHIFT_ = 24;
1503: static final int CE_TAG_MASK_ = 0x0F000000;
1504:
1505: static final int CE_SPECIAL_FLAG_ = 0xF0000000;
1506: /**
1507: * Lead surrogate that is tailored and doesn't start a contraction
1508: */
1509: static final int CE_SURROGATE_TAG_ = 5;
1510: /**
1511: * Mask to get the primary strength of the collation element
1512: */
1513: static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
1514: /**
1515: * Mask to get the secondary strength of the collation element
1516: */
1517: static final int CE_SECONDARY_MASK_ = 0xFF00;
1518: /**
1519: * Mask to get the tertiary strength of the collation element
1520: */
1521: static final int CE_TERTIARY_MASK_ = 0xFF;
1522: /**
1523: * Primary strength shift
1524: */
1525: static final int CE_PRIMARY_SHIFT_ = 16;
1526: /**
1527: * Secondary strength shift
1528: */
1529: static final int CE_SECONDARY_SHIFT_ = 8;
1530: /**
1531: * Continuation marker
1532: */
1533: static final int CE_CONTINUATION_MARKER_ = 0xC0;
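/*
 * For illustration: a 32-bit collation element packs its weights as
 * PPPPSSTT, and the masks and shifts above take it apart. Assuming a
 * hypothetical CE value of 0x26270586:
 *
 *     int ce        = 0x26270586;
 *     int primary   = (ce & CE_PRIMARY_MASK_) >>> CE_PRIMARY_SHIFT_;    // 0x2627
 *     int secondary = (ce & CE_SECONDARY_MASK_) >> CE_SECONDARY_SHIFT_; // 0x05
 *     int tertiary  = ce & CE_TERTIARY_MASK_;                           // 0x86
 *
 * The top two bits of the tertiary byte double as the case bits
 * (CE_CASE_BIT_MASK_) and, when both are set, as the continuation marker
 * (CE_CONTINUATION_MARKER_), which is why later code masks them off before
 * using the tertiary weight.
 */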
1534:
1535: /**
1536: * Size of collator raw data headers and options before the expansion
1537: * data. This is used when expansion ces are to be retrieved. ICU4C uses
1538: * the expansion offset starting from UCollator.UColHeader, hence ICU4J
1539: * has to subtract that offset to get the right expansion ce offset. In
1540: * number of ints.
1541: */
1542: int m_expansionOffset_;
1543: /**
1544: * Size of collator raw data headers, options and expansions before
1545: * contraction data. This is used when contraction ces are to be retrieved.
1546: * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
1547: * ICU4J has to subtract that offset to get the right contraction ce
1548: * offset. In number of chars.
1549: */
1550: int m_contractionOffset_;
1551: /**
1552: * Flag indicator if Jamo is special
1553: */
1554: boolean m_isJamoSpecial_;
1555:
1556: // Collator options ------------------------------------------------------
1557:
1558: int m_defaultVariableTopValue_;
1559: boolean m_defaultIsFrenchCollation_;
1560: boolean m_defaultIsAlternateHandlingShifted_;
1561: int m_defaultCaseFirst_;
1562: boolean m_defaultIsCaseLevel_;
1563: int m_defaultDecomposition_;
1564: int m_defaultStrength_;
1565: boolean m_defaultIsHiragana4_;
1566: boolean m_defaultIsNumericCollation_;
1567:
1568: /**
1569: * Value of the variable top
1570: */
1571: int m_variableTopValue_;
1572: /**
1573: * Attribute for special Hiragana
1574: */
1575: boolean m_isHiragana4_;
1576: /**
1577: * Case sorting customization
1578: */
1579: int m_caseFirst_;
1580: /**
1581: * Numeric collation option
1582: */
1583: boolean m_isNumericCollation_;
1584:
1585: // end Collator options --------------------------------------------------
1586:
1587: /**
1588: * Expansion table
1589: */
1590: int m_expansion_[];
1591: /**
1592: * Contraction index table
1593: */
1594: char m_contractionIndex_[];
1595: /**
1596: * Contraction CE table
1597: */
1598: int m_contractionCE_[];
1599: /**
1600: * Data trie
1601: */
1602: IntTrie m_trie_;
1603: /**
1604: * Table to store all collation elements that are the last element of an
1605: * expansion. This is for use in StringSearch.
1606: */
1607: int m_expansionEndCE_[];
1608: /**
1609: * Table to store the maximum size of any expansions that end with the
1610: * corresponding collation element in m_expansionEndCE_. For use in
1611: * StringSearch too
1612: */
1613: byte m_expansionEndCEMaxSize_[];
1614: /**
1615: * Heuristic table to store information on whether a char character is
1616: * considered "unsafe". "Unsafe" characters are combining marks or those
1617: * belonging to some contraction sequence from the offset 1 onwards.
1618: * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
1619: * unsafe. If we have another contraction "ZA" with the one above, then
1620: * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1621: */
1622: byte m_unsafe_[];
1623: /**
1624: * Table to store information on whether a codepoint can occur as the last
1625: * character in a contraction
1626: */
1627: byte m_contractionEnd_[];
1628: /**
1629: * Original collation rules
1630: */
1631: String m_rules_;
1632: /**
1633: * The smallest "unsafe" codepoint
1634: */
1635: char m_minUnsafe_;
1636: /**
1637: * The smallest codepoint that could be the end of a contraction
1638: */
1639: char m_minContractionEnd_;
1640: /**
1641: * General version of the collator
1642: */
1643: VersionInfo m_version_;
1644: /**
1645: * UCA version
1646: */
1647: VersionInfo m_UCA_version_;
1648: /**
1649: * UCD version
1650: */
1651: VersionInfo m_UCD_version_;
1652:
1653: /**
1654: * The UCA collator, loaded from the binary collation data
1655: */
1656: static final RuleBasedCollator UCA_;
1657: /**
1658: * UCA Constants
1659: */
1660: static final UCAConstants UCA_CONSTANTS_;
1661: /**
1662: * Table for UCA and builder use
1663: */
1664: static final char UCA_CONTRACTIONS_[];
1665:
1666: private static boolean UCA_INIT_COMPLETE;
1667:
1668: /**
1669: * Implicit generator
1670: */
1671: static final ImplicitCEGenerator impCEGen_;
1672: // /**
1673: // * Implicit constants
1674: // */
1675: // static final int IMPLICIT_BASE_BYTE_;
1676: // static final int IMPLICIT_LIMIT_BYTE_;
1677: // static final int IMPLICIT_4BYTE_BOUNDARY_;
1678: // static final int LAST_MULTIPLIER_;
1679: // static final int LAST2_MULTIPLIER_;
1680: // static final int IMPLICIT_BASE_3BYTE_;
1681: // static final int IMPLICIT_BASE_4BYTE_;
1682: // static final int BYTES_TO_AVOID_ = 3;
1683: // static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
1684: // static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
1685: // /**
1686: // * Room for intervening, without expanding to 5 bytes
1687: // */
1688: // static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
1689: // static final int IMPLICIT_3BYTE_COUNT_ = 1;
1690: //
1691: static final byte SORT_LEVEL_TERMINATOR_ = 1;
1692:
1693: // These are values from the UCA required for
1694: // implicit generation and for suppressing sort key compression;
1695: // they should normally come from the UCA, but if one
1696: // is running without the UCA, that could be a problem.
1697: static final int maxRegularPrimary = 0xA0;
1698: static final int minImplicitPrimary = 0xE0;
1699: static final int maxImplicitPrimary = 0xE4;
1700:
1701: // block to initialise the UCA collation data
1702: static {
1703: // take pains to let static class init succeed, otherwise the class itself won't exist and
1704: // clients will get a NoClassDefFoundError. Instead, make the constructors fail if
1705: // we can't load the UCA data.
1706:
1707: RuleBasedCollator iUCA_ = null;
1708: UCAConstants iUCA_CONSTANTS_ = null;
1709: char iUCA_CONTRACTIONS_[] = null;
1710: ImplicitCEGenerator iimpCEGen_ = null;
1711: try {
1712: // !!! note what's going on here...
1713: // even though the static init of the class is not yet complete, we
1714: // instantiate an instance of the class. So we'd better be sure that
1715: // instantiation doesn't rely on the static initialization that's
1716: // not complete yet!
1717: iUCA_ = new RuleBasedCollator();
1718: iUCA_CONSTANTS_ = new UCAConstants();
1719: iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_,
1720: iUCA_CONSTANTS_);
1721:
1722: // called before doing canonical closure for the UCA.
1723: iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary,
1724: maxImplicitPrimary);
1725: //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
1726: iUCA_.init();
1727: ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
1728: .getBundleInstance(
1729: ICUResourceBundle.ICU_COLLATION_BASE_NAME,
1730: ULocale.ENGLISH);
1731: iUCA_.m_rules_ = (String) rb.getObject("UCARules");
1732: } catch (MissingResourceException ex) {
1733: // throw ex;
1734: } catch (IOException e) {
1735: // e.printStackTrace();
1736: // throw new MissingResourceException(e.getMessage(),"","");
1737: }
1738:
1739: UCA_ = iUCA_;
1740: UCA_CONSTANTS_ = iUCA_CONSTANTS_;
1741: UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
1742: impCEGen_ = iimpCEGen_;
1743:
1744: UCA_INIT_COMPLETE = true;
1745: }
1746:
1747: private static void checkUCA() throws MissingResourceException {
1748: if (UCA_INIT_COMPLETE && UCA_ == null) {
1749: throw new MissingResourceException(
1750: "Collator UCA data unavailable", "", "");
1751: }
1752: }
1753:
1754: // package private constructors ------------------------------------------
1755:
1756: /**
1757: * <p>Package-private constructor for use by subclasses.
1758: * Public access to creating Collators is handled by the API
1759: * Collator.getInstance() or RuleBasedCollator(String rules).
1760: * </p>
1761: * <p>
1762: * This constructor constructs the UCA collator internally
1763: * </p>
1764: */
1765: RuleBasedCollator() {
1766: checkUCA();
1767: initUtility(false);
1768: }
1769:
1770: /**
1771: * Constructs a RuleBasedCollator from the argument locale.
1772: * If no resource bundle is associated with the locale, UCA is used
1773: * instead.
1774: * @param locale
1775: */
1776: RuleBasedCollator(ULocale locale) {
1777: checkUCA();
1778: ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
1779: .getBundleInstance(
1780: ICUResourceBundle.ICU_COLLATION_BASE_NAME,
1781: locale);
1782: initUtility(false);
1783: if (rb != null) {
1784: try {
1785: // Use keywords, if supplied for lookup
1786: String collkey = locale.getKeywordValue("collation");
1787: if (collkey == null) {
1788: collkey = rb
1789: .getStringWithFallback("collations/default");
1790: }
1791:
1792: // collations/default will always give back a string naming
1793: // the keyword for the real collation data;
1794: // "collations/" + collkey would return null if collkey were null
1795: ICUResourceBundle elements = rb
1796: .getWithFallback("collations/" + collkey);
1797: if (elements != null) {
1798: // TODO: Determine actual & valid locale correctly
1799: ULocale uloc = rb.getULocale();
1800: setLocale(uloc, uloc);
1801:
1802: m_rules_ = elements.getString("Sequence");
1803: ByteBuffer buf = elements.get("%%CollationBin")
1804: .getBinary();
1805: // %%CollationBin
1806: if (buf != null) {
1807: // m_rules_ = (String)rules[1][1];
1808: byte map[] = buf.array();
1809: CollatorReader.initRBC(this , map);
1810: /*
1811: BufferedInputStream input =
1812: new BufferedInputStream(
1813: new ByteArrayInputStream(map));
1814: /*
1815: CollatorReader reader = new CollatorReader(input, false);
1816: if (map.length > MIN_BINARY_DATA_SIZE_) {
1817: reader.read(this, null);
1818: }
1819: else {
1820: reader.readHeader(this);
1821: reader.readOptions(this);
1822: // duplicating UCA_'s data
1823: setWithUCATables();
1824: }
1825: */
1826: // at this point, we have read in the collator
1827: // now we need to check whether the binary image has
1828: // the right UCA and other versions
1829: if (!m_UCA_version_.equals(UCA_.m_UCA_version_)
1830: || !m_UCD_version_
1831: .equals(UCA_.m_UCD_version_)) {
1832: init(m_rules_);
1833: return;
1834: }
1835: init();
1836: return;
1837: } else {
1838: // due to resource redirection ICUListResourceBundle does not
1839: // raise a missing resource error
1840: //throw new MissingResourceException("Could not get resource for constructing RuleBasedCollator","com.ibm.icu.impl.data.LocaleElements_"+locale.toString(), "%%CollationBin");
1841:
1842: init(m_rules_);
1843: return;
1844: }
1845: }
1846: } catch (Exception e) {
1847: // e.printStackTrace();
1848: // if it failed, use UCA.
1849: }
1850: }
1851: setWithUCAData();
1852: }
1853:
1854: // package private methods -----------------------------------------------
1855:
1856: /**
1857: * Sets this collator to use the tables in the UCA. Note that options are
1858: * not taken care of here.
1859: */
1860: final void setWithUCATables() {
1861: m_contractionOffset_ = UCA_.m_contractionOffset_;
1862: m_expansionOffset_ = UCA_.m_expansionOffset_;
1863: m_expansion_ = UCA_.m_expansion_;
1864: m_contractionIndex_ = UCA_.m_contractionIndex_;
1865: m_contractionCE_ = UCA_.m_contractionCE_;
1866: m_trie_ = UCA_.m_trie_;
1867: m_expansionEndCE_ = UCA_.m_expansionEndCE_;
1868: m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
1869: m_unsafe_ = UCA_.m_unsafe_;
1870: m_contractionEnd_ = UCA_.m_contractionEnd_;
1871: m_minUnsafe_ = UCA_.m_minUnsafe_;
1872: m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1873: }
1874:
1875: /**
1876: * Sets this collator to use all the options and tables in the UCA.
1877: */
1878: final void setWithUCAData() {
1879: latinOneFailed_ = true;
1880:
1881: m_addition3_ = UCA_.m_addition3_;
1882: m_bottom3_ = UCA_.m_bottom3_;
1883: m_bottomCount3_ = UCA_.m_bottomCount3_;
1884: m_caseFirst_ = UCA_.m_caseFirst_;
1885: m_caseSwitch_ = UCA_.m_caseSwitch_;
1886: m_common3_ = UCA_.m_common3_;
1887: m_contractionOffset_ = UCA_.m_contractionOffset_;
1888: setDecomposition(UCA_.getDecomposition());
1889: m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
1890: m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
1891: m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
1892: m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
1893: m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
1894: m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
1895: m_defaultStrength_ = UCA_.m_defaultStrength_;
1896: m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
1897: m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
1898: m_expansionOffset_ = UCA_.m_expansionOffset_;
1899: m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
1900: m_isCaseLevel_ = UCA_.m_isCaseLevel_;
1901: m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
1902: m_isHiragana4_ = UCA_.m_isHiragana4_;
1903: m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
1904: m_isSimple3_ = UCA_.m_isSimple3_;
1905: m_mask3_ = UCA_.m_mask3_;
1906: m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1907: m_minUnsafe_ = UCA_.m_minUnsafe_;
1908: m_rules_ = UCA_.m_rules_;
1909: setStrength(UCA_.getStrength());
1910: m_top3_ = UCA_.m_top3_;
1911: m_topCount3_ = UCA_.m_topCount3_;
1912: m_variableTopValue_ = UCA_.m_variableTopValue_;
1913: m_isNumericCollation_ = UCA_.m_isNumericCollation_;
1914: setWithUCATables();
1915: latinOneFailed_ = false;
1916: }
1917:
1918: /**
1919: * Test whether a char character is potentially "unsafe" for use as a
1920: * collation starting point. "Unsafe" characters are combining marks or
1921: * those belonging to some contraction sequence from the offset 1 onwards.
1922: * E.g. if "ABC" is the only contraction, then 'B' and
1923: * 'C' are considered unsafe. If we have another contraction "ZA" with
1924: * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1925: * @param ch character to be determined
1926: * @return true if ch is unsafe, false otherwise
1927: */
1928: final boolean isUnsafe(char ch) {
1929: if (ch < m_minUnsafe_) {
1930: return false;
1931: }
1932:
1933: if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
1934: if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
1935: // Surrogates are always considered unsafe.
1936: return true;
1937: }
1938: ch &= HEURISTIC_OVERFLOW_MASK_;
1939: ch += HEURISTIC_OVERFLOW_OFFSET_;
1940: }
1941: int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
1942: return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
1943: }
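/*
 * A minimal sketch of the lookup above, assuming the combining acute accent
 * U+0301 has its bit set in m_unsafe_ (DEFAULT_MIN_HEURISTIC_ = 0x300
 * suggests combining marks are flagged):
 *
 *     char ch    = 0x0301;
 *     int byteIx = ch >> HEURISTIC_SHIFT_;   // 0x0301 >> 3 = 0x60
 *     int bitIx  = ch & HEURISTIC_MASK_;     // 0x0301 & 7  = 1
 *     boolean unsafe = ((m_unsafe_[byteIx] >> bitIx) & 1) != 0;
 *
 * Characters at or above HEURISTIC_SIZE_ << HEURISTIC_SHIFT_ are folded with
 * HEURISTIC_OVERFLOW_MASK_ and offset by HEURISTIC_OVERFLOW_OFFSET_ before
 * the same lookup, so the table stays small at the cost of occasional false
 * positives.
 */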
1944:
1945: /**
1946: * Approximate determination if a char character is at a contraction end.
1947: * Guaranteed to be true if a character is at the end of a contraction;
1948: * otherwise the result is not deterministic.
1949: * @param ch character to be determined
1950: */
1951: final boolean isContractionEnd(char ch) {
1952: if (UTF16.isTrailSurrogate(ch)) {
1953: return true;
1954: }
1955:
1956: if (ch < m_minContractionEnd_) {
1957: return false;
1958: }
1959:
1960: if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
1961: ch &= HEURISTIC_OVERFLOW_MASK_;
1962: ch += HEURISTIC_OVERFLOW_OFFSET_;
1963: }
1964: int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
1965: return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
1966: }
1967:
1968: /**
1969: * Retrieve the tag of a special ce
1970: * @param ce ce to test
1971: * @return tag of ce
1972: */
1973: static int getTag(int ce) {
1974: return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
1975: }
1976:
1977: /**
1978: * Checking if ce is special
1979: * @param ce to check
1980: * @return true if ce is special
1981: */
1982: static boolean isSpecial(int ce) {
1983: return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
1984: }
1985:
1986: /**
1987: * Checks if the argument ce is a continuation
1988: * @param ce collation element to test
1989: * @return true if ce is a continuation
1990: */
1991: static final boolean isContinuation(int ce) {
1992: return ce != CollationElementIterator.NULLORDER
1993: && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
1994: }
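/*
 * For illustration: a continuation CE carries additional weight bytes for the
 * CE before it and is marked by having both bits of CE_CONTINUATION_TAG_
 * (0xC0) set in its lowest byte. Assuming a hypothetical continuation CE of
 * 0x070000C1:
 *
 *     (0x070000C1 & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_  // true
 *
 * Sort key generation strips these bits with CE_REMOVE_CONTINUATION_MASK_
 * before using the tertiary weight of a continuation.
 */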
1995:
1996: // private inner classes ------------------------------------------------
1997:
1998: // private variables -----------------------------------------------------
1999:
2000: /**
2001: * The smallest natural unsafe or contraction end char character before
2002: * tailoring.
2003: * This is a combining mark.
2004: */
2005: private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
2006: /**
2007: * Heuristic table size. Size is 32 bytes, 1 bit for each
2008: * latin 1 char, plus some power of two for hashing the rest of the chars.
2009: * Size in bytes.
2010: */
2011: private static final char HEURISTIC_SIZE_ = 1056;
2012: /**
2013: * Mask value down to "some power of two" - 1,
2014: * in number of bits, not number of bytes.
2015: */
2016: private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
2017: /**
2018: * Unsafe character shift
2019: */
2020: private static final int HEURISTIC_SHIFT_ = 3;
2021: /**
2022: * Offset added when a character is too large for the table; the character
2023: * has to be folded and then shifted up by this offset.
2024: */
2025: private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
2026: /**
2027: * Mask value to get offset in heuristic table.
2028: */
2029: private static final char HEURISTIC_MASK_ = 7;
2030:
2031: private int m_caseSwitch_;
2032: private int m_common3_;
2033: private int m_mask3_;
2034: /**
2035: * When switching case, we need to add or subtract different values.
2036: */
2037: private int m_addition3_;
2038: /**
2039: * Upper range when compressing
2040: */
2041: private int m_top3_;
2042: /**
2043: * Lower range when compressing
2044: */
2045: private int m_bottom3_;
2046: private int m_topCount3_;
2047: private int m_bottomCount3_;
2048: /**
2049: * Case first constants
2050: */
2051: private static final int CASE_SWITCH_ = 0xC0;
2052: private static final int NO_CASE_SWITCH_ = 0;
2053: /**
2054: * Case level constants
2055: */
2056: private static final int CE_REMOVE_CASE_ = 0x3F;
2057: private static final int CE_KEEP_CASE_ = 0xFF;
2058: /**
2059: * Case strength mask
2060: */
2061: private static final int CE_CASE_MASK_3_ = 0xFF;
2062: /**
2063: * Sortkey size factor. Values can be changed.
2064: */
2065: private static final double PROPORTION_2_ = 0.5;
2066: private static final double PROPORTION_3_ = 0.667;
2067:
2068: // These values come from the UCA ----------------------------------------
2069:
2070: /**
2071: * These constants are magic special byte values from the
2072: * fractional UCA
2073: */
2074: private static final byte BYTE_ZERO_ = 0x0;
2075: private static final byte BYTE_LEVEL_SEPARATOR_ = (byte) 0x01;
2076: private static final byte BYTE_SORTKEY_GLUE_ = (byte) 0x02;
2077: private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
2078: /*private*/static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
2079: private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
2080: static final byte CODAN_PLACEHOLDER = 0x24;
2081: private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte) 0x4C;
2082: private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x4D;
2083: private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
2084: private static final int TOTAL_2_ = COMMON_TOP_2_
2085: - COMMON_BOTTOM_2_ - 1;
2086: private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
2087: private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
2088: private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
2089: private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
2090: private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
2091: private static final int COMMON_BOTTOM_3_ = 0x05;
2092: private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
2093: private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
2094: private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
2095: private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
2096: private static final int COMMON_2_ = COMMON_BOTTOM_2_;
2097: private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
2098: private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
2099: private static final int COMMON_4_ = (byte) 0xFF;
2100:
2101: /**
2102: * Minimum size required for the binary collation data in bytes.
2103: * Size of the UCA header + size of options, counted in 4-byte ints
2104: */
2105: //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
2106: /**
2107: * If this collator is to generate only simple tertiaries for fast path
2108: */
2109: private boolean m_isSimple3_;
2110:
2111: /**
2112: * French collation sorting flag
2113: */
2114: private boolean m_isFrenchCollation_;
2115: /**
2116: * Flag indicating if shifted is requested for Quaternary alternate
2117: * handling. If this is not true, the default for alternate handling will
2118: * be non-ignorable.
2119: */
2120: private boolean m_isAlternateHandlingShifted_;
2121: /**
2122: * Extra case level for sorting
2123: */
2124: private boolean m_isCaseLevel_;
2125:
2126: private static final int SORT_BUFFER_INIT_SIZE_ = 128;
2127: private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
2128: private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
2129: private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
2130: private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
2131: private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
2132:
2133: private static final int CE_CONTINUATION_TAG_ = 0xC0;
2134: private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
2135:
2136: private static final int LAST_BYTE_MASK_ = 0xFF;
2137:
2138: private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
2139: private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
2140:
2141: private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
2142: private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;
2143:
2144: /**
2145: * CE buffer size
2146: */
2147: private static final int CE_BUFFER_SIZE_ = 512;
2148:
2149: // variables for Latin-1 processing
2150: boolean latinOneUse_ = false;
2151: boolean latinOneRegenTable_ = false;
2152: boolean latinOneFailed_ = false;
2153:
2154: int latinOneTableLen_ = 0;
2155: int latinOneCEs_[] = null;
2156: /**
2157: * Bunch of utility iterators
2158: */
2159: private StringUCharacterIterator m_srcUtilIter_;
2160: private CollationElementIterator m_srcUtilColEIter_;
2161: private StringUCharacterIterator m_tgtUtilIter_;
2162: private CollationElementIterator m_tgtUtilColEIter_;
2163: /**
2164: * Utility comparison flags
2165: */
2166: private boolean m_utilCompare0_;
2167: private boolean m_utilCompare1_;
2168: private boolean m_utilCompare2_;
2169: private boolean m_utilCompare3_;
2170: private boolean m_utilCompare4_;
2171: private boolean m_utilCompare5_;
2172: /**
2173: * Utility byte buffer
2174: */
2175: private byte m_utilBytes0_[];
2176: private byte m_utilBytes1_[];
2177: private byte m_utilBytes2_[];
2178: private byte m_utilBytes3_[];
2179: private byte m_utilBytes4_[];
2180: private byte m_utilBytes5_[];
2181: private RawCollationKey m_utilRawCollationKey_;
2182:
2183: private int m_utilBytesCount0_;
2184: private int m_utilBytesCount1_;
2185: private int m_utilBytesCount2_;
2186: private int m_utilBytesCount3_;
2187: private int m_utilBytesCount4_;
2188: private int m_utilBytesCount5_;
2189: private int m_utilCount0_;
2190: private int m_utilCount1_;
2191: private int m_utilCount2_;
2192: private int m_utilCount3_;
2193: private int m_utilCount4_;
2194: private int m_utilCount5_;
2195:
2196: private int m_utilFrenchStart_;
2197: private int m_utilFrenchEnd_;
2198:
2199: /**
2200: * CE buffers; they will be filled during the primary phase
2201: */
2202: private int m_srcUtilCEBuffer_[];
2203: private int m_tgtUtilCEBuffer_[];
2204: private int m_srcUtilCEBufferSize_;
2205: private int m_tgtUtilCEBufferSize_;
2206:
2207: private int m_srcUtilContOffset_;
2208: private int m_tgtUtilContOffset_;
2209:
2210: private int m_srcUtilOffset_;
2211: private int m_tgtUtilOffset_;
2212:
2213: // private methods -------------------------------------------------------
2214:
2215: private void init(String rules) throws Exception {
2216: setWithUCAData();
2217: CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(
2218: rules);
2219: builder.setRules(this );
2220: m_rules_ = rules;
2221: init();
2222: initUtility(false);
2223: }
2224:
2225: private final int compareRegular(String source, String target,
2226: int offset) {
2227: if (m_srcUtilIter_ == null) {
2228: initUtility(true);
2229: }
2230: int strength = getStrength();
2231: // setting up the collator parameters
2232: m_utilCompare0_ = m_isCaseLevel_;
2233: m_utilCompare1_ = true;
2234: m_utilCompare2_ = strength >= SECONDARY;
2235: m_utilCompare3_ = strength >= TERTIARY;
2236: m_utilCompare4_ = strength >= QUATERNARY;
2237: m_utilCompare5_ = strength == IDENTICAL;
2238: boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
2239: boolean doShift4 = m_isAlternateHandlingShifted_
2240: && m_utilCompare4_;
2241: boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
2242:
2243: if (doHiragana4 && doShift4) {
2244: String sourcesub = source.substring(offset);
2245: String targetsub = target.substring(offset);
2246: return compareBySortKeys(sourcesub, targetsub);
2247: }
2248:
2249: // This is the lowest primary value that will not be ignored if shifted
2250: int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16
2251: : 0;
2252: m_srcUtilCEBufferSize_ = 0;
2253: m_tgtUtilCEBufferSize_ = 0;
2254: int result = doPrimaryCompare(doHiragana4, lowestpvalue,
2255: source, target, offset);
2256: if (m_srcUtilCEBufferSize_ == -1
2257: && m_tgtUtilCEBufferSize_ == -1) {
2258: // since the cebuffer is cleared when we have determined that
2259: // either source is greater than target or vice versa, the return
2260: // result is the comparison result and not the hiragana result
2261: return result;
2262: }
2263:
2264: int hiraganaresult = result;
2265:
2266: if (m_utilCompare2_) {
2267: result = doSecondaryCompare(doFrench);
2268: if (result != 0) {
2269: return result;
2270: }
2271: }
2272: // doing the case bit
2273: if (m_utilCompare0_) {
2274: result = doCaseCompare();
2275: if (result != 0) {
2276: return result;
2277: }
2278: }
2279: // Tertiary level
2280: if (m_utilCompare3_) {
2281: result = doTertiaryCompare();
2282: if (result != 0) {
2283: return result;
2284: }
2285: }
2286:
2287: if (doShift4) { // checkQuad
2288: result = doQuaternaryCompare(lowestpvalue);
2289: if (result != 0) {
2290: return result;
2291: }
2292: } else if (doHiragana4 && hiraganaresult != 0) {
2293: // If we're fine on quaternaries, we might be different
2294: // on Hiragana. This, however, might fail us in shifted.
2295: return hiraganaresult;
2296: }
2297:
2298: // For IDENTICAL comparisons, we use a bitwise character comparison
2299: // as a tiebreaker if all else is equal.
2300: // Getting here should be quite rare: the strings are not identical
2301: // (that is checked first), yet they compared == through all other checks.
2302: if (m_utilCompare5_) {
2303: return doIdenticalCompare(source, target, offset, true);
2304: }
2305: return 0;
2306: }
2307:
2308: /**
2309: * Gets the 2 bytes of primary order and adds them to the primary byte array
2310: * @param ce current ce
2311: * @param notIsContinuation flag indicating that the current ce is not a
2312: * continuation ce
2313: * @param doShift flag indicating if ce is to be shifted
2314: * @param leadPrimary lead primary used for compression
2315: * @param commonBottom4 common byte value for Quaternary
2316: * @param bottomCount4 number of byte values used to compress runs of the common quaternary byte
2317: * @return the new lead primary for compression
2318: */
2319: private final int doPrimaryBytes(int ce, boolean notIsContinuation,
2320: boolean doShift, int leadPrimary, int commonBottom4,
2321: int bottomCount4) {
2322:
2323: int p2 = (ce >>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
2324: int p1 = ce >>> 8; // comparison
2325: if (doShift) {
2326: if (m_utilCount4_ > 0) {
2327: while (m_utilCount4_ > bottomCount4) {
2328: m_utilBytes4_ = append(m_utilBytes4_,
2329: m_utilBytesCount4_,
2330: (byte) (commonBottom4 + bottomCount4));
2331: m_utilBytesCount4_++;
2332: m_utilCount4_ -= bottomCount4;
2333: }
2334: m_utilBytes4_ = append(m_utilBytes4_,
2335: m_utilBytesCount4_,
2336: (byte) (commonBottom4 + (m_utilCount4_ - 1)));
2337: m_utilBytesCount4_++;
2338: m_utilCount4_ = 0;
2339: }
2340: // dealing with a variable and we're treating them as shifted
2341: // This is a shifted ignorable
2342: if (p1 != 0) {
2343: // we need to check this since we could be in continuation
2344: m_utilBytes4_ = append(m_utilBytes4_,
2345: m_utilBytesCount4_, (byte) p1);
2346: m_utilBytesCount4_++;
2347: }
2348: if (p2 != 0) {
2349: m_utilBytes4_ = append(m_utilBytes4_,
2350: m_utilBytesCount4_, (byte) p2);
2351: m_utilBytesCount4_++;
2352: }
2353: } else {
2354: // Note: This code assumes that the table is well built
2355: // i.e. not having 0 bytes where they are not supposed to be.
2356: // Usually, we'll have non-zero primary1 & primary2, except
2357: // in cases of LatinOne and friends, when primary2 can be
2358: // zero, giving a single byte primary in the sortkey calc
2359: if (p1 != CollationElementIterator.IGNORABLE) {
2360: if (notIsContinuation) {
2361: if (leadPrimary == p1) {
2362: m_utilBytes1_ = append(m_utilBytes1_,
2363: m_utilBytesCount1_, (byte) p2);
2364: m_utilBytesCount1_++;
2365: } else {
2366: if (leadPrimary != 0) {
2367: m_utilBytes1_ = append(
2368: m_utilBytes1_,
2369: m_utilBytesCount1_,
2370: ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_
2371: : BYTE_UNSHIFTED_MIN_));
2372: m_utilBytesCount1_++;
2373: }
2374: if (p2 == CollationElementIterator.IGNORABLE) {
2375: // one byter, not compressed
2376: m_utilBytes1_ = append(m_utilBytes1_,
2377: m_utilBytesCount1_, (byte) p1);
2378: m_utilBytesCount1_++;
2379: leadPrimary = 0;
2380: } else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
2381: || (p1 > maxRegularPrimary
2382: //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
2383: // >>> 24)
2384: && p1 < minImplicitPrimary
2385: //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
2386: // >>> 24)
2387: )) {
2388: // not compressible
2389: leadPrimary = 0;
2390: m_utilBytes1_ = append(m_utilBytes1_,
2391: m_utilBytesCount1_, (byte) p1);
2392: m_utilBytesCount1_++;
2393: m_utilBytes1_ = append(m_utilBytes1_,
2394: m_utilBytesCount1_, (byte) p2);
2395: m_utilBytesCount1_++;
2396: } else { // compress
2397: leadPrimary = p1;
2398: m_utilBytes1_ = append(m_utilBytes1_,
2399: m_utilBytesCount1_, (byte) p1);
2400: m_utilBytesCount1_++;
2401: m_utilBytes1_ = append(m_utilBytes1_,
2402: m_utilBytesCount1_, (byte) p2);
2403: m_utilBytesCount1_++;
2404: }
2405: }
2406: } else {
2407: // continuation, add primary to the key, no compression
2408: m_utilBytes1_ = append(m_utilBytes1_,
2409: m_utilBytesCount1_, (byte) p1);
2410: m_utilBytesCount1_++;
2411: if (p2 != CollationElementIterator.IGNORABLE) {
2412: m_utilBytes1_ = append(m_utilBytes1_,
2413: m_utilBytesCount1_, (byte) p2);
2414: // second part
2415: m_utilBytesCount1_++;
2416: }
2417: }
2418: }
2419: }
2420: return leadPrimary;
2421: }
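/*
 * A sketch of the lead-byte compression performed above, with hypothetical
 * two-byte primaries 0x5527 and 0x5528 (lead byte 0x55 lies in the
 * compressible range between BYTE_FIRST_NON_LATIN_PRIMARY_ and
 * maxRegularPrimary): the first CE writes 55 27 and remembers 0x55 as the
 * lead primary, the second CE shares that lead byte and writes only 28, so
 * the primary level becomes 55 27 28 instead of 55 27 55 28. When the lead
 * byte changes, a BYTE_UNSHIFTED_MAX_ or BYTE_UNSHIFTED_MIN_ byte is written
 * first so that compressed keys still order correctly against keys with a
 * different lead byte. Primaries below BYTE_FIRST_NON_LATIN_PRIMARY_, or in
 * the gap between maxRegularPrimary and minImplicitPrimary, are never
 * compressed.
 */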
2422:
2423: /**
2424: * Gets the secondary byte and adds it to the secondary byte array
2425: * @param ce current ce
2426: * @param notIsContinuation flag indicating that the current ce is not a
2427: * continuation ce
2428: * @param doFrench flag indicator if french sort is to be performed
2429: */
2430: private final void doSecondaryBytes(int ce,
2431: boolean notIsContinuation, boolean doFrench) {
2432: int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
2433: if (s != 0) {
2434: if (!doFrench) {
2435: // This is compression code.
2436: if (s == COMMON_2_ && notIsContinuation) {
2437: m_utilCount2_++;
2438: } else {
2439: if (m_utilCount2_ > 0) {
2440: if (s > COMMON_2_) { // not necessary for 4th level.
2441: while (m_utilCount2_ > TOP_COUNT_2_) {
2442: m_utilBytes2_ = append(
2443: m_utilBytes2_,
2444: m_utilBytesCount2_,
2445: (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
2446: m_utilBytesCount2_++;
2447: m_utilCount2_ -= TOP_COUNT_2_;
2448: }
2449: m_utilBytes2_ = append(
2450: m_utilBytes2_,
2451: m_utilBytesCount2_,
2452: (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1)));
2453: m_utilBytesCount2_++;
2454: } else {
2455: while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2456: m_utilBytes2_ = append(
2457: m_utilBytes2_,
2458: m_utilBytesCount2_,
2459: (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2460: m_utilBytesCount2_++;
2461: m_utilCount2_ -= BOTTOM_COUNT_2_;
2462: }
2463: m_utilBytes2_ = append(
2464: m_utilBytes2_,
2465: m_utilBytesCount2_,
2466: (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
2467: m_utilBytesCount2_++;
2468: }
2469: m_utilCount2_ = 0;
2470: }
2471: m_utilBytes2_ = append(m_utilBytes2_,
2472: m_utilBytesCount2_, (byte) s);
2473: m_utilBytesCount2_++;
2474: }
2475: } else {
2476: m_utilBytes2_ = append(m_utilBytes2_,
2477: m_utilBytesCount2_, (byte) s);
2478: m_utilBytesCount2_++;
2479: // Do the special handling for French secondaries
2480: // We need to get continuation elements and do intermediate
2481: // restore
2482: // abc1c2c3de with french secondaries needs to become edc1c2c3ba
2483: // NOT edc3c2c1ba
2484: if (notIsContinuation) {
2485: if (m_utilFrenchStart_ != -1) {
2486: // reverse secondaries from frenchStartPtr up to
2487: // frenchEndPtr
2488: reverseBuffer(m_utilBytes2_);
2489: m_utilFrenchStart_ = -1;
2490: }
2491: } else {
2492: if (m_utilFrenchStart_ == -1) {
2493: m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
2494: }
2495: m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
2496: }
2497: }
2498: }
2499: }
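/*
 * A worked example of the French secondary handling above, following the
 * abc1c2c3de comment: secondaries are collected here in forward order, and
 * doFrench() later copies them into the key back to front. A run of
 * continuation secondaries (c1 c2 c3) is reversed in place as soon as the
 * run ends, so that the later whole-level reversal restores its original
 * order: a b c1 c2 c3 d e ends up in the key as e d c1 c2 c3 b a rather
 * than e d c3 c2 c1 b a.
 */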
2500:
2501: /**
2502: * Reverse the argument buffer
2503: * @param buffer to reverse
2504: */
2505: private void reverseBuffer(byte buffer[]) {
2506: int start = m_utilFrenchStart_;
2507: int end = m_utilFrenchEnd_;
2508: while (start < end) {
2509: byte b = buffer[start];
2510: buffer[start++] = buffer[end];
2511: buffer[end--] = b;
2512: }
2513: }
2514:
2515: /**
2516: * Insert the case shifting byte if required
2517: * @param caseshift value
2518: * @return new caseshift value
2519: */
2520: private final int doCaseShift(int caseshift) {
2521: if (caseshift == 0) {
2522: m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
2523: SORT_CASE_BYTE_START_);
2524: m_utilBytesCount0_++;
2525: caseshift = SORT_CASE_SHIFT_START_;
2526: }
2527: return caseshift;
2528: }
2529:
2530: /**
2531: * Performs the casing sort
2532: * @param tertiary tertiary byte held in an int for easy comparison
2533: * @param notIsContinuation flag indicating that the current ce is not a
2534: * continuation ce
2535: * @param caseshift
2536: * @return the new value of case shift
2537: */
2538: private final int doCaseBytes(int tertiary,
2539: boolean notIsContinuation, int caseshift) {
2540: caseshift = doCaseShift(caseshift);
2541:
2542: if (notIsContinuation && tertiary != 0) {
2543: byte casebits = (byte) (tertiary & 0xC0);
2544: if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
2545: if (casebits == 0) {
2546: m_utilBytes0_[m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
2547: } else {
2548: // second bit
2549: caseshift = doCaseShift(caseshift - 1);
2550: m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
2551: }
2552: } else {
2553: if (casebits != 0) {
2554: m_utilBytes0_[m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
2555: // second bit
2556: caseshift = doCaseShift(caseshift);
2557: m_utilBytes0_[m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
2558: } else {
2559: caseshift--;
2560: }
2561: }
2562: }
2563:
2564: return caseshift;
2565: }
2566:
2567: /**
2568: * Gets the tertiary byte and adds it to the tertiary byte array
2569: * @param tertiary tertiary byte held in an int for easy comparison
2570: * @param notIsContinuation flag indicating that the current ce is not a
2571: * continuation ce
2572: */
2573: private final void doTertiaryBytes(int tertiary,
2574: boolean notIsContinuation) {
2575: if (tertiary != 0) {
2576: // This is compression code.
2577: // sequence size check is included in the if clause
2578: if (tertiary == m_common3_ && notIsContinuation) {
2579: m_utilCount3_++;
2580: } else {
2581: int common3 = m_common3_ & LAST_BYTE_MASK_;
2582: if (tertiary > common3
2583: && m_common3_ == COMMON_NORMAL_3_) {
2584: tertiary += m_addition3_;
2585: } else if (tertiary <= common3
2586: && m_common3_ == COMMON_UPPER_FIRST_3_) {
2587: tertiary -= m_addition3_;
2588: }
2589: if (m_utilCount3_ > 0) {
2590: if (tertiary > common3) {
2591: while (m_utilCount3_ > m_topCount3_) {
2592: m_utilBytes3_ = append(m_utilBytes3_,
2593: m_utilBytesCount3_,
2594: (byte) (m_top3_ - m_topCount3_));
2595: m_utilBytesCount3_++;
2596: m_utilCount3_ -= m_topCount3_;
2597: }
2598: m_utilBytes3_ = append(m_utilBytes3_,
2599: m_utilBytesCount3_,
2600: (byte) (m_top3_ - (m_utilCount3_ - 1)));
2601: m_utilBytesCount3_++;
2602: } else {
2603: while (m_utilCount3_ > m_bottomCount3_) {
2604: m_utilBytes3_ = append(
2605: m_utilBytes3_,
2606: m_utilBytesCount3_,
2607: (byte) (m_bottom3_ + m_bottomCount3_));
2608: m_utilBytesCount3_++;
2609: m_utilCount3_ -= m_bottomCount3_;
2610: }
2611: m_utilBytes3_ = append(
2612: m_utilBytes3_,
2613: m_utilBytesCount3_,
2614: (byte) (m_bottom3_ + (m_utilCount3_ - 1)));
2615: m_utilBytesCount3_++;
2616: }
2617: m_utilCount3_ = 0;
2618: }
2619: m_utilBytes3_ = append(m_utilBytes3_,
2620: m_utilBytesCount3_, (byte) tertiary);
2621: m_utilBytesCount3_++;
2622: }
2623: }
2624: }
2625:
2626: /**
2627: * Gets the Quaternary byte and adds it to the Quaternary byte array
2628: * @param isCodePointHiragana flag indicator if the previous codepoint
2629: * we dealt with was Hiragana
2630: * @param commonBottom4 smallest common Quaternary byte
2631: * @param bottomCount4 number of byte values used to compress runs of the common quaternary byte
2632: * @param hiragana4 hiragana Quaternary byte
2633: */
2634: private final void doQuaternaryBytes(boolean isCodePointHiragana,
2635: int commonBottom4, int bottomCount4, byte hiragana4) {
2636: if (isCodePointHiragana) { // This was Hiragana, need to note it
2637: if (m_utilCount4_ > 0) { // Close this part
2638: while (m_utilCount4_ > bottomCount4) {
2639: m_utilBytes4_ = append(m_utilBytes4_,
2640: m_utilBytesCount4_,
2641: (byte) (commonBottom4 + bottomCount4));
2642: m_utilBytesCount4_++;
2643: m_utilCount4_ -= bottomCount4;
2644: }
2645: m_utilBytes4_ = append(m_utilBytes4_,
2646: m_utilBytesCount4_,
2647: (byte) (commonBottom4 + (m_utilCount4_ - 1)));
2648: m_utilBytesCount4_++;
2649: m_utilCount4_ = 0;
2650: }
2651: m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2652: hiragana4); // Add the Hiragana
2653: m_utilBytesCount4_++;
2654: } else { // This wasn't Hiragana, so we can continue adding stuff
2655: m_utilCount4_++;
2656: }
2657: }
2658:
2659: /**
2660: * Iterates through the argument string for all ces.
2661: * Splits the ces into their relevant primaries, secondaries etc.
2662: * @param source normalized string
2663: * @param doFrench flag indicator if special handling of French has to be
2664: * done
2665: * @param hiragana4 quaternary byte used for Hiragana
2666: * @param commonBottom4 smallest common quaternary byte
2667: * @param bottomCount4 number of byte values used to compress runs of the common quaternary byte
2668: */
2669: private final void getSortKeyBytes(String source, boolean doFrench,
2670: byte hiragana4, int commonBottom4, int bottomCount4)
2671:
2672: {
2673: if (m_srcUtilIter_ == null) {
2674: initUtility(true);
2675: }
2676: int backupDecomposition = getDecomposition();
2677: setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
2678: m_srcUtilIter_.setText(source);
2679: m_srcUtilColEIter_.setText(m_srcUtilIter_);
2680: m_utilFrenchStart_ = -1;
2681: m_utilFrenchEnd_ = -1;
2682:
2683: // scriptorder not implemented yet
2684: // const uint8_t *scriptOrder = coll->scriptOrder;
2685:
2686: boolean doShift = false;
2687: boolean notIsContinuation = false;
2688:
2689: int leadPrimary = 0; // int for easier comparison
2690: int caseShift = 0;
2691:
2692: while (true) {
2693: int ce = m_srcUtilColEIter_.next();
2694: if (ce == CollationElementIterator.NULLORDER) {
2695: break;
2696: }
2697:
2698: if (ce == CollationElementIterator.IGNORABLE) {
2699: continue;
2700: }
2701:
2702: notIsContinuation = !isContinuation(ce);
2703:
2704: /*
2705: * if (notIsContinuation) {
2706: if (scriptOrder != NULL) {
2707: primary1 = scriptOrder[primary1];
2708: }
2709: }*/
2710: boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
2711: // actually we can just check that the first byte is 0
2712: // since weight generation fills the order in from the left (high) byte first
2713: boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_;
2714: doShift = (m_isAlternateHandlingShifted_
2715: && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0
2716: || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable));
2717: if (doShift && isPrimaryByteIgnorable) {
2718: // amendment to the UCA says that primary ignorables and other
2719: // ignorables should be removed if following a shifted code
2720: // point
2721: // if we were shifted and we got an ignorable code point
2722: // we should just completely ignore it
2723: continue;
2724: }
2725: leadPrimary = doPrimaryBytes(ce, notIsContinuation,
2726: doShift, leadPrimary, commonBottom4, bottomCount4);
2727: if (doShift) {
2728: continue;
2729: }
2730: if (m_utilCompare2_) {
2731: doSecondaryBytes(ce, notIsContinuation, doFrench);
2732: }
2733:
2734: int t = ce & LAST_BYTE_MASK_;
2735: if (!notIsContinuation) {
2736: t = ce & CE_REMOVE_CONTINUATION_MASK_;
2737: }
2738:
2739: if (m_utilCompare0_
2740: && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
2741: // do the case level if we need to. We don't want to calculate the
2742: // case level for primary ignorables if we have only primary strength and case level,
2743: // otherwise we would break the well-formedness of the CEs.
2744: caseShift = doCaseBytes(t, notIsContinuation, caseShift);
2745: } else if (notIsContinuation) {
2746: t ^= m_caseSwitch_;
2747: }
2748:
2749: t &= m_mask3_;
2750:
2751: if (m_utilCompare3_) {
2752: doTertiaryBytes(t, notIsContinuation);
2753: }
2754:
2755: if (m_utilCompare4_ && notIsContinuation) { // compare quad
2756: doQuaternaryBytes(
2757: m_srcUtilColEIter_.m_isCodePointHiragana_,
2758: commonBottom4, bottomCount4, hiragana4);
2759: }
2760: }
2761: setDecomposition(backupDecomposition); // reverts to original
2762: if (m_utilFrenchStart_ != -1) {
2763: // one last round of checks
2764: reverseBuffer(m_utilBytes2_);
2765: }
2766: }
2767:
2768: /**
2769: * From the individual strength byte results, the final compact sortkey
2770: * is calculated.
2771: * @param source text string
2772: * @param doFrench flag indicating that special handling of French has to
2773: * be done
2774: * @param commonBottom4 smallest common quaternary byte
2775: * @param bottomCount4 number of byte values used to compress runs of the common quaternary byte
2776: * @param key output RawCollationKey to store results, key cannot be null
2777: */
2778: private final void getSortKey(String source, boolean doFrench,
2779: int commonBottom4, int bottomCount4, RawCollationKey key) {
2780: // we have done all the CE's, now let's put them together to form
2781: // a key
2782: if (m_utilCompare2_) {
2783: doSecondary(doFrench);
2784: }
2785: // adding case level should be independent of secondary level
2786: if (m_utilCompare0_) {
2787: doCase();
2788: }
2789: if (m_utilCompare3_) {
2790: doTertiary();
2791: if (m_utilCompare4_) {
2792: doQuaternary(commonBottom4, bottomCount4);
2793: if (m_utilCompare5_) {
2794: doIdentical(source);
2795: }
2796:
2797: }
2798: }
2799: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2800: (byte) 0);
2801: m_utilBytesCount1_++;
2802:
2803: key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
2804: }
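/*
 * Sketch of the resulting key layout when all levels up to quaternary are
 * enabled: each level is separated by SORT_LEVEL_TERMINATOR_ (0x01) and the
 * whole key ends with a 0x00 byte, roughly
 *
 *     [primary] 01 [secondary] 01 [case] 01 [tertiary] 01 [quaternary] 00
 *
 * Levels that are switched off are simply absent, and an identical-strength
 * key appends a BOCSU-compressed copy of the source string after one more
 * terminator, just before the trailing 0x00.
 */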
2805:
2806: /**
2807: * Packs the French bytes
2808: */
2809: private final void doFrench() {
2810: for (int i = 0; i < m_utilBytesCount2_; i++) {
2811: byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
2812: // This is compression code.
2813: if (s == COMMON_2_) {
2814: ++m_utilCount2_;
2815: } else {
2816: if (m_utilCount2_ > 0) {
2817: // getting the unsigned value
2818: if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
2819: // not necessary for 4th level.
2820: while (m_utilCount2_ > TOP_COUNT_2_) {
2821: m_utilBytes1_ = append(
2822: m_utilBytes1_,
2823: m_utilBytesCount1_,
2824: (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
2825: m_utilBytesCount1_++;
2826: m_utilCount2_ -= TOP_COUNT_2_;
2827: }
2828: m_utilBytes1_ = append(
2829: m_utilBytes1_,
2830: m_utilBytesCount1_,
2831: (byte) (COMMON_TOP_2_ - (m_utilCount2_ - 1)));
2832: m_utilBytesCount1_++;
2833: } else {
2834: while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2835: m_utilBytes1_ = append(
2836: m_utilBytes1_,
2837: m_utilBytesCount1_,
2838: (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2839: m_utilBytesCount1_++;
2840: m_utilCount2_ -= BOTTOM_COUNT_2_;
2841: }
2842: m_utilBytes1_ = append(
2843: m_utilBytes1_,
2844: m_utilBytesCount1_,
2845: (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
2846: m_utilBytesCount1_++;
2847: }
2848: m_utilCount2_ = 0;
2849: }
2850: m_utilBytes1_ = append(m_utilBytes1_,
2851: m_utilBytesCount1_, s);
2852: m_utilBytesCount1_++;
2853: }
2854: }
2855: if (m_utilCount2_ > 0) {
2856: while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2857: m_utilBytes1_ = append(m_utilBytes1_,
2858: m_utilBytesCount1_,
2859: (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2860: m_utilBytesCount1_++;
2861: m_utilCount2_ -= BOTTOM_COUNT_2_;
2862: }
2863: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2864: (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
2865: m_utilBytesCount1_++;
2866: }
2867: }
2868:
2869: /**
2870: * Compacts the secondary bytes and stores them into the primary array
2871: * @param doFrench flag indicator that French has to be handled specially
2872: */
2873: private final void doSecondary(boolean doFrench) {
2874: if (m_utilCount2_ > 0) {
2875: while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2876: m_utilBytes2_ = append(m_utilBytes2_,
2877: m_utilBytesCount2_,
2878: (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2879: m_utilBytesCount2_++;
2880: m_utilCount2_ -= BOTTOM_COUNT_2_;
2881: }
2882: m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2883: (byte) (COMMON_BOTTOM_2_ + (m_utilCount2_ - 1)));
2884: m_utilBytesCount2_++;
2885: }
2886:
2887: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2888: SORT_LEVEL_TERMINATOR_);
2889: m_utilBytesCount1_++;
2890:
2891: if (doFrench) { // do the reverse copy
2892: doFrench();
2893: } else {
2894: if (m_utilBytes1_.length <= m_utilBytesCount1_
2895: + m_utilBytesCount2_) {
2896: m_utilBytes1_ = increase(m_utilBytes1_,
2897: m_utilBytesCount1_, m_utilBytesCount2_);
2898: }
2899: System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
2900: m_utilBytesCount1_, m_utilBytesCount2_);
2901: m_utilBytesCount1_ += m_utilBytesCount2_;
2902: }
2903: }
2904:
2905: /**
2906: * Increase buffer size
2907: * @param buffer array of bytes
2908: * @param size of the byte array
2909: * @param incrementsize size to increase
2910: * @return the new buffer
2911: */
2912: private static final byte[] increase(byte buffer[], int size,
2913: int incrementsize) {
2914: byte result[] = new byte[buffer.length + incrementsize];
2915: System.arraycopy(buffer, 0, result, 0, size);
2916: return result;
2917: }
2918:
2919: /**
2920: * Increase buffer size
2921: * @param buffer array of ints
2922: * @param size of the byte array
2923: * @param incrementsize size to increase
2924: * @return the new buffer
2925: */
2926: private static final int[] increase(int buffer[], int size,
2927: int incrementsize) {
2928: int result[] = new int[buffer.length + incrementsize];
2929: System.arraycopy(buffer, 0, result, 0, size);
2930: return result;
2931: }
2932:
2933: /**
2934: * Compacts the case bytes and stores them into the primary array
2935: */
2936: private final void doCase() {
2937: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2938: SORT_LEVEL_TERMINATOR_);
2939: m_utilBytesCount1_++;
2940: if (m_utilBytes1_.length <= m_utilBytesCount1_
2941: + m_utilBytesCount0_) {
2942: m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
2943: m_utilBytesCount0_);
2944: }
2945: System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_,
2946: m_utilBytesCount1_, m_utilBytesCount0_);
2947: m_utilBytesCount1_ += m_utilBytesCount0_;
2948: }
2949:
2950: /**
2951: * Compacts the tertiary bytes and stores them into the primary array
2952: */
2953: private final void doTertiary() {
2954: if (m_utilCount3_ > 0) {
2955: if (m_common3_ != COMMON_BOTTOM_3_) {
2956: while (m_utilCount3_ >= m_topCount3_) {
2957: m_utilBytes3_ = append(m_utilBytes3_,
2958: m_utilBytesCount3_,
2959: (byte) (m_top3_ - m_topCount3_));
2960: m_utilBytesCount3_++;
2961: m_utilCount3_ -= m_topCount3_;
2962: }
2963: m_utilBytes3_ = append(m_utilBytes3_,
2964: m_utilBytesCount3_,
2965: (byte) (m_top3_ - m_utilCount3_));
2966: m_utilBytesCount3_++;
2967: } else {
2968: while (m_utilCount3_ > m_bottomCount3_) {
2969: m_utilBytes3_ = append(m_utilBytes3_,
2970: m_utilBytesCount3_,
2971: (byte) (m_bottom3_ + m_bottomCount3_));
2972: m_utilBytesCount3_++;
2973: m_utilCount3_ -= m_bottomCount3_;
2974: }
2975: m_utilBytes3_ = append(m_utilBytes3_,
2976: m_utilBytesCount3_,
2977: (byte) (m_bottom3_ + (m_utilCount3_ - 1)));
2978: m_utilBytesCount3_++;
2979: }
2980: }
2981: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2982: SORT_LEVEL_TERMINATOR_);
2983: m_utilBytesCount1_++;
2984: if (m_utilBytes1_.length <= m_utilBytesCount1_
2985: + m_utilBytesCount3_) {
2986: m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
2987: m_utilBytesCount3_);
2988: }
2989: System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_,
2990: m_utilBytesCount1_, m_utilBytesCount3_);
2991: m_utilBytesCount1_ += m_utilBytesCount3_;
2992: }
2993:
2994: /**
2995: * Compacts the quaternary bytes and stores them into the primary array
2996: */
2997: private final void doQuaternary(int commonbottom4, int bottomcount4) {
2998: if (m_utilCount4_ > 0) {
2999: while (m_utilCount4_ > bottomcount4) {
3000: m_utilBytes4_ = append(m_utilBytes4_,
3001: m_utilBytesCount4_,
3002: (byte) (commonbottom4 + bottomcount4));
3003: m_utilBytesCount4_++;
3004: m_utilCount4_ -= bottomcount4;
3005: }
3006: m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
3007: (byte) (commonbottom4 + (m_utilCount4_ - 1)));
3008: m_utilBytesCount4_++;
3009: }
3010: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3011: SORT_LEVEL_TERMINATOR_);
3012: m_utilBytesCount1_++;
3013: if (m_utilBytes1_.length <= m_utilBytesCount1_
3014: + m_utilBytesCount4_) {
3015: m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3016: m_utilBytesCount4_);
3017: }
3018: System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_,
3019: m_utilBytesCount1_, m_utilBytesCount4_);
3020: m_utilBytesCount1_ += m_utilBytesCount4_;
3021: }
3022:
3023: /**
3024: * Deals with the identical sort.
3025: * Appends the BOCSU version of the source string to the end of the
3026: * byte buffer.
3027: * @param source text string
3028: */
3029: private final void doIdentical(String source) {
3030: int isize = BOCU.getCompressionLength(source);
3031: m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3032: SORT_LEVEL_TERMINATOR_);
3033: m_utilBytesCount1_++;
3034: if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
3035: m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3036: 1 + isize);
3037: }
3038: m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
3039: m_utilBytesCount1_);
3040: }
3041:
3042: /**
3043: * Gets the offset of the first unmatched characters in source and target.
3044: * This method returns the offset of the start of a contraction or a
3045: * combining sequence, if the first difference is in the middle of such a
3046: * sequence.
3047: * @param source string
3048: * @param target string
3049: * @return offset of the first unmatched characters in source and target.
3050: */
3051: private final int getFirstUnmatchedOffset(String source,
3052: String target) {
3053: int result = 0;
3054: int slength = source.length();
3055: int tlength = target.length();
3056: int minlength = slength;
3057: if (minlength > tlength) {
3058: minlength = tlength;
3059: }
3060: while (result < minlength
3061: && source.charAt(result) == target.charAt(result)) {
3062: result++;
3063: }
3064: if (result > 0) {
3065: // There is an identical portion at the beginning of the two
3066: // strings. If the identical portion ends within a contraction or a
3067: // combining character sequence, back up to the start of that
3068: // sequence.
3069: char schar = 0;
3070: char tchar = 0;
3071: if (result < minlength) {
3072: schar = source.charAt(result); // first differing chars
3073: tchar = target.charAt(result);
3074: } else {
3075: schar = source.charAt(minlength - 1);
3076: if (isUnsafe(schar)) {
3077: tchar = schar;
3078: } else if (slength == tlength) {
3079: return result;
3080: } else if (slength < tlength) {
3081: tchar = target.charAt(result);
3082: } else {
3083: schar = source.charAt(result);
3084: }
3085: }
3086: if (isUnsafe(schar) || isUnsafe(tchar)) {
3087: // We are stopped in the middle of a contraction or combining
3088: // sequence.
3089: // Look backwards through the string for the start of
3090: // the sequence
3091: // It doesn't matter which string we scan, since they are the
3092: // same in this region.
3093: do {
3094: result--;
3095: } while (result > 0 && isUnsafe(source.charAt(result)));
3096: }
3097: }
3098: return result;
3099: }
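/*
 * A worked example of the back-up logic above, reusing the hypothetical
 * "ABC" contraction from the isUnsafe() documentation: comparing "xABC"
 * with "xABD", the first difference is at offset 3 ('C' vs 'D'); since 'C'
 * is unsafe, the offset is walked back over the unsafe 'B' to offset 1, the
 * start of the contraction, before the caller restarts the collation
 * element comparison from there.
 */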
3100:
3101: /**
3102: * Appends a byte to an array of bytes and grows it if we run out of
3103: * space
3104: * @param array byte array to append to
3105: * @param appendindex index in the byte array at which to append
3106: * @param value to append
3107: * @return array if the array size can accommodate the new value, otherwise
3108: * a bigger array will be created and returned
3109: */
3110: private static final byte[] append(byte array[], int appendindex,
3111: byte value) {
3112: try {
3113: array[appendindex] = value;
3114: } catch (ArrayIndexOutOfBoundsException e) {
3115: array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
3116: array[appendindex] = value;
3117: }
3118: return array;
3119: }
3120:
3121: /**
3122: * This is a trick string compare function that uses sortkeys
3123: * to compare. It is used when compare gets in trouble and needs to bail
3124: * out.
3125: * @param source text string
3126: * @param target text string
3127: */
3128: private final int compareBySortKeys(String source, String target)
3129:
3130: {
3131: m_utilRawCollationKey_ = getRawCollationKey(source,
3132: m_utilRawCollationKey_);
3133: // this method is very seldom called
3134: RawCollationKey targetkey = getRawCollationKey(target, null);
3135: return m_utilRawCollationKey_.compareTo(targetkey);
3136: }
3137:
3138: /**
3139: * Performs the primary comparisons, and fills up the CE buffer at the
3140: * same time.
3141: * The return value toggles between the comparison result and the hiragana
3142: * result. If either the source is greater than the target or vice versa, the
3143: * return result is the comparison result, i.e. 1 or -1; furthermore, the
3144: * cebuffers will be cleared when that happens. If the primary comparisons
3145: * are equal, we'll have to continue with secondary comparison. In this case
3146: * the cebuffer will not be cleared and the return result will be the
3147: * hiragana result.
3148: * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
3149: * observed
3150: * @param lowestpvalue the lowest primary value that will not be ignored if
3151: * alternate handling is shifted
3152: * @param source text string
3153: * @param target text string
3154: * @param textoffset offset in text to start the comparison
3155: * @return comparison result if a primary difference is found, otherwise
3156: * hiragana result
3157: */
3158: private final int doPrimaryCompare(boolean doHiragana4,
3159: int lowestpvalue, String source, String target,
3160: int textoffset)
3161:
3162: {
3163: // Preparing the context objects for iterating over strings
3164: m_srcUtilIter_.setText(source);
3165: m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
3166: m_tgtUtilIter_.setText(target);
3167: m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);
3168:
3169: // Non shifted primary processing is quite simple
3170: if (!m_isAlternateHandlingShifted_) {
3171: int hiraganaresult = 0;
3172: while (true) {
3173: int sorder = 0;
3174: // We fetch CEs until we hit a non ignorable primary or end.
3175: do {
3176: sorder = m_srcUtilColEIter_.next();
3177: m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
3178: m_srcUtilCEBufferSize_, sorder);
3179: m_srcUtilCEBufferSize_++;
3180: sorder &= CE_PRIMARY_MASK_;
3181: } while (sorder == CollationElementIterator.IGNORABLE);
3182:
3183: int torder = 0;
3184: do {
3185: torder = m_tgtUtilColEIter_.next();
3186: m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
3187: m_tgtUtilCEBufferSize_, torder);
3188: m_tgtUtilCEBufferSize_++;
3189: torder &= CE_PRIMARY_MASK_;
3190: } while (torder == CollationElementIterator.IGNORABLE);
3191:
3192: // if both primaries are the same
3193: if (sorder == torder) {
3194: // and there are no more CEs, we advance to the next level
3195: // see if we are at the end of either string
3196: if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3197: if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) {
3198: return -1;
3199: }
3200: break;
3201: } else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3202: return 1;
3203: }
3204: if (doHiragana4
3205: && hiraganaresult == 0
3206: && m_srcUtilColEIter_.m_isCodePointHiragana_ != m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3207: if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
3208: hiraganaresult = -1;
3209: } else {
3210: hiraganaresult = 1;
3211: }
3212: }
3213: } else {
3214: // if two primaries are different, we are done
3215: return endPrimaryCompare(sorder, torder);
3216: }
3217: }
3218: // no primary difference... do the rest from the buffers
3219: return hiraganaresult;
3220: } else { // shifted - do a slightly more complicated processing :)
3221: while (true) {
3222: int sorder = getPrimaryShiftedCompareCE(
3223: m_srcUtilColEIter_, lowestpvalue, true);
3224: int torder = getPrimaryShiftedCompareCE(
3225: m_tgtUtilColEIter_, lowestpvalue, false);
3226: if (sorder == torder) {
3227: if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3228: break;
3229: } else {
3230: continue;
3231: }
3232: } else {
3233: return endPrimaryCompare(sorder, torder);
3234: }
3235: } // no primary difference... do the rest from the buffers
3236: }
3237: return 0;
3238: }
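// Note on the CE layout assumed above (a sketch; the authoritative values
// are the CE_*_MASK_ and CE_*_SHIFT_ constants): a 32-bit CE packs the
// primary weight in its upper bits, with the secondary and tertiary weights
// below it, so the primary is extracted with
//
//     int primary = ce & CE_PRIMARY_MASK_;    // keep only the primary bits
//     primary >>>= CE_PRIMARY_SHIFT_;         // unsigned value for ordering
//
// which is why the loops above mask with CE_PRIMARY_MASK_ and
// endPrimaryCompare() below uses >>> before the final < comparison.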
3239:
3240: /**
3241: * This is used only for primary strength when we know that sorder is
3242: * already different from torder.
3243: * Compares sorder and torder, returns -1 if sorder is less than torder.
3244: * Clears the cebuffer at the same time.
3245: * @param sorder source strength order
3246: * @param torder target strength order
3247: * @return the comparison result of sorder and torder
3248: */
3249: private final int endPrimaryCompare(int sorder, int torder) {
3250: // if we reach here, the ce offset accessed is the last ce
3251: // appended to the buffer
3252: boolean isSourceNullOrder = (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3253: boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3254: m_srcUtilCEBufferSize_ = -1;
3255: m_tgtUtilCEBufferSize_ = -1;
3256: if (isSourceNullOrder) {
3257: return -1;
3258: }
3259: if (isTargetNullOrder) {
3260: return 1;
3261: }
3262: // getting rid of the sign
3263: sorder >>>= CE_PRIMARY_SHIFT_;
3264: torder >>>= CE_PRIMARY_SHIFT_;
3265: if (sorder < torder) {
3266: return -1;
3267: }
3268: return 1;
3269: }
3270:
3271: /**
3272: * Calculates the next primary shifted value and fills up cebuffer with the
3273: * next non-ignorable ce.
3274: * @param coleiter collation element iterator
3275: * @param lowestpvalue lowest primary shifted value that will not be
3276: * ignored
3277: * @param isSrc true if the source ce buffer is to be filled, false if
3278: * the target ce buffer is to be filled
3279: * @return result next modified ce
3280: */
3281: private final int getPrimaryShiftedCompareCE(
3282: CollationElementIterator coleiter, int lowestpvalue,
3283: boolean isSrc)
3284:
3285: {
3286: boolean shifted = false;
3287: int result = CollationElementIterator.IGNORABLE;
3288: int cebuffer[] = m_srcUtilCEBuffer_;
3289: int cebuffersize = m_srcUtilCEBufferSize_;
3290: if (!isSrc) {
3291: cebuffer = m_tgtUtilCEBuffer_;
3292: cebuffersize = m_tgtUtilCEBufferSize_;
3293: }
3294: while (true) {
3295: result = coleiter.next();
3296: if (result == CollationElementIterator.NULLORDER) {
3297: cebuffer = append(cebuffer, cebuffersize, result);
3298: cebuffersize++;
3299: break;
3300: } else if (result == CollationElementIterator.IGNORABLE
3301: || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
3302: // UCA amendment - ignore ignorables that follow shifted code
3303: // points
3304: continue;
3305: } else if (isContinuation(result)) {
3306: if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
3307: // There is primary value
3308: if (shifted) {
3309: result = (result & CE_PRIMARY_MASK_)
3310: | CE_CONTINUATION_MARKER_;
3311: // preserve interesting continuation
3312: cebuffer = append(cebuffer, cebuffersize,
3313: result);
3314: cebuffersize++;
3315: continue;
3316: } else {
3317: cebuffer = append(cebuffer, cebuffersize,
3318: result);
3319: cebuffersize++;
3320: break;
3321: }
3322: } else { // Just lower level values
3323: if (!shifted) {
3324: cebuffer = append(cebuffer, cebuffersize,
3325: result);
3326: cebuffersize++;
3327: }
3328: }
3329: } else { // regular
3330: if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
3331: lowestpvalue) > 0) {
3332: cebuffer = append(cebuffer, cebuffersize, result);
3333: cebuffersize++;
3334: break;
3335: } else {
3336: if ((result & CE_PRIMARY_MASK_) != 0) {
3337: shifted = true;
3338: result &= CE_PRIMARY_MASK_;
3339: cebuffer = append(cebuffer, cebuffersize,
3340: result);
3341: cebuffersize++;
3342: continue;
3343: } else {
3344: cebuffer = append(cebuffer, cebuffersize,
3345: result);
3346: cebuffersize++;
3347: shifted = false;
3348: continue;
3349: }
3350: }
3351: }
3352: }
3353: if (isSrc) {
3354: m_srcUtilCEBuffer_ = cebuffer;
3355: m_srcUtilCEBufferSize_ = cebuffersize;
3356: } else {
3357: m_tgtUtilCEBuffer_ = cebuffer;
3358: m_tgtUtilCEBufferSize_ = cebuffersize;
3359: }
3360: result &= CE_PRIMARY_MASK_;
3361: return result;
3362: }
3363:
3364: /**
3365: * Appends an int to an array of ints, growing the array if we run out of
3366: * space
3367: * @param array array of ints to append to
3368: * @param appendindex index at which the value will be appended
3369: * @param value int value to append
3370: * @return array if its size did not have to be increased, otherwise a new
3371: * array will be returned
3372: */
3373: private static final int[] append(int array[], int appendindex,
3374: int value) {
3375: if (appendindex + 1 >= array.length) {
3376: array = increase(array, appendindex, CE_BUFFER_SIZE_);
3377: }
3378: array[appendindex] = value;
3379: return array;
3380: }
3381:
3382: /**
3383: * Does secondary strength comparison based on the collected ces.
3384: * @param doFrench flag indicates if French ordering is to be done
3385: * @return the secondary strength comparison result
3386: */
3387: private final int doSecondaryCompare(boolean doFrench) {
3388: // now re-examine the collected CEs
3389: if (!doFrench) { // normal
3390: int soffset = 0;
3391: int toffset = 0;
3392: while (true) {
3393: int sorder = CollationElementIterator.IGNORABLE;
3394: while (sorder == CollationElementIterator.IGNORABLE) {
3395: sorder = m_srcUtilCEBuffer_[soffset++]
3396: & CE_SECONDARY_MASK_;
3397: }
3398: int torder = CollationElementIterator.IGNORABLE;
3399: while (torder == CollationElementIterator.IGNORABLE) {
3400: torder = m_tgtUtilCEBuffer_[toffset++]
3401: & CE_SECONDARY_MASK_;
3402: }
3403:
3404: if (sorder == torder) {
3405: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3406: if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3407: return -1;
3408: }
3409: break;
3410: } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3411: return 1;
3412: }
3413: } else {
3414: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3415: return -1;
3416: }
3417: if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3418: return 1;
3419: }
3420: return (sorder < torder) ? -1 : 1;
3421: }
3422: }
3423: } else { // do the French
3424: m_srcUtilContOffset_ = 0;
3425: m_tgtUtilContOffset_ = 0;
3426: m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
3427: m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
3428: while (true) {
3429: int sorder = getSecondaryFrenchCE(true);
3430: int torder = getSecondaryFrenchCE(false);
3431: if (sorder == torder) {
3432: if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
3433: || (m_srcUtilOffset_ >= 0 && m_srcUtilCEBuffer_[m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) {
3434: break;
3435: }
3436: } else {
3437: return (sorder < torder) ? -1 : 1;
3438: }
3439: }
3440: }
3441: return 0;
3442: }
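// A commonly cited illustration of French (backwards) secondary ordering,
// assuming a French-tailored collator (sketch only):
//
//     normal secondary order:  cote < coté < côte < côté
//     French secondary order:  cote < côte < coté < côté
//
// With doFrench the last accent difference in the string is the most
// significant one, which is what the backwards scan above implements.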
3443:
3444: /**
3445: * Calculates the next secondary french CE.
3446: * @param isSrc flag indicator if we are calculating the src ces
3447: * @return result next modified ce
3448: */
3449: private final int getSecondaryFrenchCE(boolean isSrc) {
3450: int result = CollationElementIterator.IGNORABLE;
3451: int offset = m_srcUtilOffset_;
3452: int continuationoffset = m_srcUtilContOffset_;
3453: int cebuffer[] = m_srcUtilCEBuffer_;
3454: if (!isSrc) {
3455: offset = m_tgtUtilOffset_;
3456: continuationoffset = m_tgtUtilContOffset_;
3457: cebuffer = m_tgtUtilCEBuffer_;
3458: }
3459:
3460: while (result == CollationElementIterator.IGNORABLE
3461: && offset >= 0) {
3462: if (continuationoffset == 0) {
3463: result = cebuffer[offset];
3464: while (isContinuation(cebuffer[offset--])) {
3465: }
3466: // after this, result is at the start of the continuation,
3467: // and offset points before that
3468: if (isContinuation(cebuffer[offset + 1])) {
3469: // save offset for later
3470: continuationoffset = offset;
3471: offset += 2;
3472: }
3473: } else {
3474: result = cebuffer[offset++];
3475: if (!isContinuation(result)) {
3476: // we have finished with this continuation
3477: offset = continuationoffset;
3478: // reset the pointer to before continuation
3479: continuationoffset = 0;
3480: continue;
3481: }
3482: }
3483: result &= CE_SECONDARY_MASK_; // remove continuation bit
3484: }
3485: if (isSrc) {
3486: m_srcUtilOffset_ = offset;
3487: m_srcUtilContOffset_ = continuationoffset;
3488: } else {
3489: m_tgtUtilOffset_ = offset;
3490: m_tgtUtilContOffset_ = continuationoffset;
3491: }
3492: return result;
3493: }
3494:
3495: /**
3496: * Does case strength comparison based on the collected ces.
3497: * @return the case strength comparison result
3498: */
3499: private final int doCaseCompare() {
3500: int soffset = 0;
3501: int toffset = 0;
3502: while (true) {
3503: int sorder = CollationElementIterator.IGNORABLE;
3504: int torder = CollationElementIterator.IGNORABLE;
3505: while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3506: sorder = m_srcUtilCEBuffer_[soffset++];
3507: if (!isContinuation(sorder)
3508: && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3509: // primary ignorables should not be considered on the case level when the strength is primary
3510: // otherwise, the CEs stop being well-formed
3511: sorder &= CE_CASE_MASK_3_;
3512: sorder ^= m_caseSwitch_;
3513: } else {
3514: sorder = CollationElementIterator.IGNORABLE;
3515: }
3516: }
3517:
3518: while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3519: torder = m_tgtUtilCEBuffer_[toffset++];
3520: if (!isContinuation(torder)
3521: && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3522: // primary ignorables should not be considered on the case level when the strength is primary
3523: // otherwise, the CEs stop being well-formed
3524: torder &= CE_CASE_MASK_3_;
3525: torder ^= m_caseSwitch_;
3526: } else {
3527: torder = CollationElementIterator.IGNORABLE;
3528: }
3529: }
3530:
3531: sorder &= CE_CASE_BIT_MASK_;
3532: torder &= CE_CASE_BIT_MASK_;
3533: if (sorder == torder) {
3534: // checking end of strings
3535: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3536: if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3537: return -1;
3538: }
3539: break;
3540: } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3541: return 1;
3542: }
3543: } else {
3544: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3545: return -1;
3546: }
3547: if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3548: return 1;
3549: }
3550: return (sorder < torder) ? -1 : 1;
3551: }
3552: }
3553: return 0;
3554: }
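// Effect sketch (illustrative, assuming strings that are equal up to case
// and a collator that evaluates the case level): the caseFirst setting
// decides which case wins here, e.g.
//
//     UPPER_FIRST: "Apple" < "apple"
//     LOWER_FIRST: "apple" < "Apple"
//
// m_caseSwitch_ is XORed into the case bits above so that a single
// comparison loop handles both orderings.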
3555:
3556: /**
3557: * Does tertiary strength comparison based on the collected ces.
3558: * @return the tertiary strength comparison result
3559: */
3560: private final int doTertiaryCompare() {
3561: int soffset = 0;
3562: int toffset = 0;
3563: while (true) {
3564: int sorder = CollationElementIterator.IGNORABLE;
3565: int torder = CollationElementIterator.IGNORABLE;
3566: while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3567: sorder = m_srcUtilCEBuffer_[soffset++] & m_mask3_;
3568: if (!isContinuation(sorder)) {
3569: sorder ^= m_caseSwitch_;
3570: } else {
3571: sorder &= CE_REMOVE_CASE_;
3572: }
3573: }
3574:
3575: while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3576: torder = m_tgtUtilCEBuffer_[toffset++] & m_mask3_;
3577: if (!isContinuation(torder)) {
3578: torder ^= m_caseSwitch_;
3579: } else {
3580: torder &= CE_REMOVE_CASE_;
3581: }
3582: }
3583:
3584: if (sorder == torder) {
3585: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3586: if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3587: return -1;
3588: }
3589: break;
3590: } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3591: return 1;
3592: }
3593: } else {
3594: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3595: return -1;
3596: }
3597: if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3598: return 1;
3599: }
3600: return (sorder < torder) ? -1 : 1;
3601: }
3602: }
3603: return 0;
3604: }
3605:
3606: /**
3607: * Does quaternary strength comparison based on the collected ces.
3608: * @param lowestpvalue the lowest primary value that will not be ignored if
3609: * alternate handling is shifted
3610: * @return the quaternary strength comparison result
3611: */
3612: private final int doQuaternaryCompare(int lowestpvalue) {
3613: boolean sShifted = true;
3614: boolean tShifted = true;
3615: int soffset = 0;
3616: int toffset = 0;
3617: while (true) {
3618: int sorder = CollationElementIterator.IGNORABLE;
3619: int torder = CollationElementIterator.IGNORABLE;
3620: while (sorder == CollationElementIterator.IGNORABLE
3621: || (isContinuation(sorder) && !sShifted)) {
3622: sorder = m_srcUtilCEBuffer_[soffset++];
3623: if (isContinuation(sorder)) {
3624: if (!sShifted) {
3625: continue;
3626: }
3627: } else if (Utility
3628: .compareUnsigned(sorder, lowestpvalue) > 0
3629: || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
3630: // non continuation
3631: sorder = CE_PRIMARY_MASK_;
3632: sShifted = false;
3633: } else {
3634: sShifted = true;
3635: }
3636: }
3637: sorder >>>= CE_PRIMARY_SHIFT_;
3638: while (torder == CollationElementIterator.IGNORABLE
3639: || (isContinuation(torder) && !tShifted)) {
3640: torder = m_tgtUtilCEBuffer_[toffset++];
3641: if (isContinuation(torder)) {
3642: if (!tShifted) {
3643: continue;
3644: }
3645: } else if (Utility
3646: .compareUnsigned(torder, lowestpvalue) > 0
3647: || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
3648: // non continuation
3649: torder = CE_PRIMARY_MASK_;
3650: tShifted = false;
3651: } else {
3652: tShifted = true;
3653: }
3654: }
3655: torder >>>= CE_PRIMARY_SHIFT_;
3656:
3657: if (sorder == torder) {
3658: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3659: if (m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3660: return -1;
3661: }
3662: break;
3663: } else if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3664: return 1;
3665: }
3666: } else {
3667: if (m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3668: return -1;
3669: }
3670: if (m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3671: return 1;
3672: }
3673: return (sorder < torder) ? -1 : 1;
3674: }
3675: }
3676: return 0;
3677: }
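// Background sketch: with alternate handling set to SHIFTED, "variable"
// characters (those whose primaries do not exceed the variable top,
// typically spaces and punctuation) are ignored on levels 1-3 and only
// distinguish strings here, on the quaternary level. A commonly cited
// illustration:
//
//     "di Silva" = "di-Silva" = "diSilva"   at primary..tertiary strength
//     "di Silva" < "di-Silva" < "diSilva"   once quaternaries are compared
//
// The relative order of the variable characters follows their primaries.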
3678:
3679: /**
3680: * Internal function. Does byte level string compare. Used by strcoll if
3681: * strength == identical and strings are otherwise equal. This is a rare
3682: * case. Comparison must be done on NFD normalized strings. FCD is not good
3683: * enough.
3684: * @param source text
3685: * @param target text
3686: * @param offset of the first difference in the text strings
3687: * @param normalize flag indicating if we are to normalize the text before
3688: * comparison
3689: * @return 1 if source is greater than target, -1 less than and 0 if equals
3690: */
3691: private static final int doIdenticalCompare(String source,
3692: String target, int offset, boolean normalize)
3693:
3694: {
3695: if (normalize) {
3696: if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
3697: source = Normalizer.decompose(source, false);
3698: }
3699:
3700: if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) {
3701: target = Normalizer.decompose(target, false);
3702: }
3703: offset = 0;
3704: }
3705:
3706: return doStringCompare(source, target, offset);
3707: }
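// Why NFD rather than FCD: canonically equivalent strings must compare
// equal on the identical level even when their code point sequences differ.
// A small sketch (precomposed U+00C5 versus its decomposition):
//
//     String s = "\u00C5";   // Å, precomposed
//     String t = "A\u030A";  // A + combining ring above
//     // Normalizer.decompose(s, false) yields "A\u030A", so after
//     // normalization doStringCompare() sees identical code points.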
3708:
3709: /**
3710: * Compares strings for their code point order.
3711: * This comparison handles surrogate characters and places them after
3712: * all non-surrogate characters.
3713: * @param source text
3714: * @param target text
3715: * @param offset start offset for comparison
3716: * @return 1 if source is greater than target, -1 less than and 0 if equals
3717: */
3718: private static final int doStringCompare(String source,
3719: String target, int offset) {
3720: // compare identical prefixes - they do not need to be fixed up
3721: char schar = 0;
3722: char tchar = 0;
3723: int slength = source.length();
3724: int tlength = target.length();
3725: int minlength = Math.min(slength, tlength);
3726: while (offset < minlength) {
3727: schar = source.charAt(offset);
3728: tchar = target.charAt(offset++);
3729: if (schar != tchar) {
3730: break;
3731: }
3732: }
3733:
3734: if (schar == tchar && offset == minlength) {
3735: if (slength > minlength) {
3736: return 1;
3737: }
3738: if (tlength > minlength) {
3739: return -1;
3740: }
3741: return 0;
3742: }
3743:
3744: // if both values are in or above the surrogate range, fix them up.
3745: if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
3746: && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
3747: schar = fixupUTF16(schar);
3748: tchar = fixupUTF16(tchar);
3749: }
3750:
3751: // now schar and tchar are in UTF-32-compatible order
3752: return (schar < tchar) ? -1 : 1; // schar and tchar have to be different
3753: }
3754:
3755: /**
3756: * Rotate surrogates to the top to get code point order
3757: */
3758: private static final char fixupUTF16(char ch) {
3759: if (ch >= 0xe000) {
3760: ch -= 0x800;
3761: } else {
3762: ch += 0x2000;
3763: }
3764: return ch;
3765: }
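// Worked example of the rotation above (only reached when both chars are
// >= U+D800):
//
//     fixupUTF16('\uD800') == '\uF800'   // lead surrogate  + 0x2000
//     fixupUTF16('\uDFFF') == '\uFFFF'   // trail surrogate + 0x2000
//     fixupUTF16('\uE000') == '\uD800'   // BMP character   - 0x0800
//     fixupUTF16('\uFFFF') == '\uF7FF'   // BMP character   - 0x0800
//
// Surrogates therefore end up above all other BMP characters, matching the
// code point order of the supplementary characters they encode.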
3766:
3767: /**
3768: * Resets the internal case data members and compression values.
3769: */
3770: private void updateInternalState() {
3771: if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3772: m_caseSwitch_ = CASE_SWITCH_;
3773: } else {
3774: m_caseSwitch_ = NO_CASE_SWITCH_;
3775: }
3776:
3777: if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
3778: m_mask3_ = CE_REMOVE_CASE_;
3779: m_common3_ = COMMON_NORMAL_3_;
3780: m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
3781: m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
3782: m_bottom3_ = COMMON_BOTTOM_3_;
3783: } else {
3784: m_mask3_ = CE_KEEP_CASE_;
3785: m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
3786: if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3787: m_common3_ = COMMON_UPPER_FIRST_3_;
3788: m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
3789: m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
3790: } else {
3791: m_common3_ = COMMON_NORMAL_3_;
3792: m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
3793: m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
3794: }
3795: }
3796:
3797: // Set the compression values
3798: int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
3799: // we multiply a double by an int, but need only the int part
3800: m_topCount3_ = (int) (PROPORTION_3_ * total3);
3801: m_bottomCount3_ = total3 - m_topCount3_;
3802:
3803: if (!m_isCaseLevel_
3804: && getStrength() == AttributeValue.TERTIARY_
3805: && !m_isFrenchCollation_
3806: && !m_isAlternateHandlingShifted_) {
3807: m_isSimple3_ = true;
3808: } else {
3809: m_isSimple3_ = false;
3810: }
3811: if (!m_isCaseLevel_
3812: && getStrength() <= AttributeValue.TERTIARY_
3813: && !m_isNumericCollation_
3814: && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
3815: if (latinOneCEs_ == null || latinOneRegenTable_) {
3816: if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
3817: latinOneUse_ = true;
3818: } else {
3819: latinOneUse_ = false;
3820: latinOneFailed_ = true;
3821: }
3822: latinOneRegenTable_ = false;
3823: } else { // latin1Table exists and it doesn't need to be regenerated, just use it
3824: latinOneUse_ = true;
3825: }
3826: } else {
3827: latinOneUse_ = false;
3828: }
3829:
3830: }
3831:
3832: /**
3833: * Initializes the RuleBasedCollator
3834: */
3835: private final void init() {
3836: for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
3837: // Find the smallest unsafe char.
3838: if (isUnsafe(m_minUnsafe_)) {
3839: break;
3840: }
3841: }
3842:
3843: for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
3844: // Find the smallest contraction-ending char.
3845: if (isContractionEnd(m_minContractionEnd_)) {
3846: break;
3847: }
3848: }
3849: latinOneFailed_ = true;
3850: setStrength(m_defaultStrength_);
3851: setDecomposition(m_defaultDecomposition_);
3852: m_variableTopValue_ = m_defaultVariableTopValue_;
3853: m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
3854: m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
3855: m_isCaseLevel_ = m_defaultIsCaseLevel_;
3856: m_caseFirst_ = m_defaultCaseFirst_;
3857: m_isHiragana4_ = m_defaultIsHiragana4_;
3858: m_isNumericCollation_ = m_defaultIsNumericCollation_;
3859: latinOneFailed_ = false;
3860: updateInternalState();
3861: }
3862:
3863: /**
3864: * Initializes utility iterators and byte buffer used by compare
3865: */
3866: private final void initUtility(boolean allocate) {
3867: if (allocate) {
3868: if (m_srcUtilIter_ == null) {
3869: m_srcUtilIter_ = new StringUCharacterIterator();
3870: m_srcUtilColEIter_ = new CollationElementIterator(
3871: m_srcUtilIter_, this );
3872: m_tgtUtilIter_ = new StringUCharacterIterator();
3873: m_tgtUtilColEIter_ = new CollationElementIterator(
3874: m_tgtUtilIter_, this );
3875: m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
3876: m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
3877: m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
3878: m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
3879: m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
3880: m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
3881: m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
3882: }
3883: } else {
3884: m_srcUtilIter_ = null;
3885: m_srcUtilColEIter_ = null;
3886: m_tgtUtilIter_ = null;
3887: m_tgtUtilColEIter_ = null;
3888: m_utilBytes0_ = null;
3889: m_utilBytes1_ = null;
3890: m_utilBytes2_ = null;
3891: m_utilBytes3_ = null;
3892: m_utilBytes4_ = null;
3893: m_srcUtilCEBuffer_ = null;
3894: m_tgtUtilCEBuffer_ = null;
3895: }
3896: }
3897:
3898: // Consts for Latin-1 special processing
3899: private static final int ENDOFLATINONERANGE_ = 0xFF;
3900: private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
3901: private static final int BAIL_OUT_CE_ = 0xFF000000;
3902:
3903: /**
3904: * Generate latin-1 tables
3905: */
3906:
3907: private class shiftValues {
3908: int primShift = 24;
3909: int secShift = 24;
3910: int terShift = 24;
3911: }
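// Layout sketch of latinOneCEs_ (derived from the code below): three
// parallel tables of length latinOneTableLen_ packed into one int[],
// indexed per character ch in [0, ENDOFLATINONERANGE_] with a contraction
// area above that range:
//
//     latinOneCEs_[ch]                          // up to 4 primary bytes
//     latinOneCEs_[latinOneTableLen_ + ch]      // up to 4 secondary bytes
//     latinOneCEs_[2 * latinOneTableLen_ + ch]  // up to 4 tertiary bytes
//
// addLatinOneEntry() fills each int from the top byte down using the
// shiftValues counters (primShift/secShift/terShift start at 24 and drop by
// 8 per byte); if it runs out of room it stores BAIL_OUT_CE_ instead.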
3912:
3913: private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
3914: int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
3915: boolean reverseSecondary = false;
3916: if (!isContinuation(CE)) {
3917: tertiary = ((CE & m_mask3_));
3918: tertiary ^= m_caseSwitch_;
3919: reverseSecondary = true;
3920: } else {
3921: tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
3922: tertiary &= CE_REMOVE_CASE_;
3923: reverseSecondary = false;
3924: }
3925:
3926: secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
3927: primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
3928: primary1 = (CE >>> 8);
3929:
3930: if (primary1 != 0) {
3931: latinOneCEs_[ch] |= (primary1 << sh.primShift);
3932: sh.primShift -= 8;
3933: }
3934: if (primary2 != 0) {
3935: if (sh.primShift < 0) {
3936: latinOneCEs_[ch] = BAIL_OUT_CE_;
3937: latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
3938: latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
3939: return;
3940: }
3941: latinOneCEs_[ch] |= (primary2 << sh.primShift);
3942: sh.primShift -= 8;
3943: }
3944: if (secondary != 0) {
3945: if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
3946: latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
3947: latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
3948: } else { // normal case
3949: latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
3950: }
3951: sh.secShift -= 8;
3952: }
3953: if (tertiary != 0) {
3954: latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
3955: sh.terShift -= 8;
3956: }
3957: }
3958:
3959: private final void resizeLatinOneTable(int newSize) {
3960: int newTable[] = new int[3 * newSize];
3961: int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize
3962: : latinOneTableLen_);
3963: //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
3964: System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
3965: System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable,
3966: newSize, sizeToCopy);
3967: System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable,
3968: 2 * newSize, sizeToCopy);
3969: latinOneTableLen_ = newSize;
3970: latinOneCEs_ = newTable;
3971: }
3972:
3973: private final boolean setUpLatinOne() {
3974: if (latinOneCEs_ == null || m_reallocLatinOneCEs_) {
3975: latinOneCEs_ = new int[3 * LATINONETABLELEN_];
3976: latinOneTableLen_ = LATINONETABLELEN_;
3977: m_reallocLatinOneCEs_ = false;
3978: } else {
3979: Arrays.fill(latinOneCEs_, 0);
3980: }
3981: if (m_ContInfo_ == null) {
3982: m_ContInfo_ = new ContractionInfo();
3983: }
3984: char ch = 0;
3985: //StringBuffer sCh = new StringBuffer();
3986: //CollationElementIterator it = getCollationElementIterator(sCh.toString());
3987: CollationElementIterator it = getCollationElementIterator("");
3988:
3989: shiftValues s = new shiftValues();
3990: int CE = 0;
3991: char contractionOffset = ENDOFLATINONERANGE_ + 1;
3992:
3993: for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
3994: s.primShift = 24;
3995: s.secShift = 24;
3996: s.terShift = 24;
3997: if (ch < 0x100) {
3998: CE = m_trie_.getLatin1LinearValue(ch);
3999: } else {
4000: CE = m_trie_.getLeadValue(ch);
4001: if (CE == CollationElementIterator.CE_NOT_FOUND_) {
4002: CE = UCA_.m_trie_.getLeadValue(ch);
4003: }
4004: }
4005: if (!isSpecial(CE)) {
4006: addLatinOneEntry(ch, CE, s);
4007: } else {
4008: switch (RuleBasedCollator.getTag(CE)) {
4009: case CollationElementIterator.CE_EXPANSION_TAG_:
4010: case CollationElementIterator.CE_DIGIT_TAG_:
4011: //sCh.delete(0, sCh.length());
4012: //sCh.append(ch);
4013: //it.setText(sCh.toString());
4014: it.setText(UCharacter.toString(ch));
4015: while ((CE = it.next()) != CollationElementIterator.NULLORDER) {
4016: if (s.primShift < 0 || s.secShift < 0
4017: || s.terShift < 0) {
4018: latinOneCEs_[ch] = BAIL_OUT_CE_;
4019: latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4020: latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4021: break;
4022: }
4023: addLatinOneEntry(ch, CE, s);
4024: }
4025: break;
4026: case CollationElementIterator.CE_CONTRACTION_TAG_:
4027: // here is the trick
4028: // F2 is contraction. We do something very similar to contractions
4029: // but have two indices, one in the real contraction table and the
4030: // other into the area where we stuffed things. We hope there are not
4031: // many contractions (this should work for latin-1 tables).
4032: {
4033: if ((CE & 0x00FFF000) != 0) {
4034: latinOneFailed_ = true;
4035: return false;
4036: }
4037:
4038: int UCharOffset = (CE & 0xFFFFFF)
4039: - m_contractionOffset_; //getContractionOffset(CE)]
4040:
4041: CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
4042:
4043: latinOneCEs_[ch] = CE;
4044: latinOneCEs_[latinOneTableLen_ + ch] = CE;
4045: latinOneCEs_[2 * latinOneTableLen_ + ch] = CE;
4046:
4047: // We're going to jump into contraction table, pick the elements
4048: // and use them
4049: do {
4050: //CE = *(contractionCEs + (UCharOffset - contractionIndex));
4051: CE = m_contractionCE_[UCharOffset];
4052: if (isSpecial(CE)
4053: && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
4054: int i; /* general counter */
4055: //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */
4056: int offset = ((CE & 0xFFFFF0) >> 4)
4057: - m_expansionOffset_; //it.getExpansionOffset(this, CE);
4058: int size = CE & 0xF; // getExpansionCount(CE);
4059: //CE = *CEOffset++;
4060: if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
4061: for (i = 0; i < size; i++) {
4062: if (s.primShift < 0
4063: || s.secShift < 0
4064: || s.terShift < 0) {
4065: latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4066: latinOneCEs_[latinOneTableLen_
4067: + contractionOffset] = BAIL_OUT_CE_;
4068: latinOneCEs_[2
4069: * latinOneTableLen_
4070: + contractionOffset] = BAIL_OUT_CE_;
4071: break;
4072: }
4073: addLatinOneEntry(contractionOffset,
4074: m_expansion_[offset + i], s);
4075: }
4076: } else { /* else, we do */
4077: while (m_expansion_[offset] != 0) {
4078: if (s.primShift < 0
4079: || s.secShift < 0
4080: || s.terShift < 0) {
4081: latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4082: latinOneCEs_[latinOneTableLen_
4083: + contractionOffset] = BAIL_OUT_CE_;
4084: latinOneCEs_[2
4085: * latinOneTableLen_
4086: + contractionOffset] = BAIL_OUT_CE_;
4087: break;
4088: }
4089: addLatinOneEntry(contractionOffset,
4090: m_expansion_[offset++], s);
4091: }
4092: }
4093: contractionOffset++;
4094: } else if (!isSpecial(CE)) {
4095: addLatinOneEntry(contractionOffset++, CE, s);
4096: } else {
4097: latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4098: latinOneCEs_[latinOneTableLen_
4099: + contractionOffset] = BAIL_OUT_CE_;
4100: latinOneCEs_[2 * latinOneTableLen_
4101: + contractionOffset] = BAIL_OUT_CE_;
4102: contractionOffset++;
4103: }
4104: UCharOffset++;
4105: s.primShift = 24;
4106: s.secShift = 24;
4107: s.terShift = 24;
4108: if (contractionOffset == latinOneTableLen_) { // we need to reallocate
4109: resizeLatinOneTable(2 * latinOneTableLen_);
4110: }
4111: } while (m_contractionIndex_[UCharOffset] != 0xFFFF);
4112: }
4113: break;
4114: default:
4115: latinOneFailed_ = true;
4116: return false;
4117: }
4118: }
4119: }
4120: // compact table
4121: if (contractionOffset < latinOneTableLen_) {
4122: resizeLatinOneTable(contractionOffset);
4123: }
4124: return true;
4125: }
4126:
4127: private class ContractionInfo {
4128: int index;
4129: }
4130:
4131: ContractionInfo m_ContInfo_;
4132:
4133: private int getLatinOneContraction(int strength, int CE, String s) {
4134: //int strength, int CE, String s, Integer ind) {
4135: int len = s.length();
4136: //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
4137: int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
4138: int offset = 1;
4139: int latinOneOffset = (CE & 0x00FFF000) >>> 12;
4140: char schar = 0, tchar = 0;
4141:
4142: for (;;) {
4143: /*
4144: if(len == -1) {
4145: if(s[*index] == 0) { // end of string
4146: return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
4147: } else {
4148: schar = s[*index];
4149: }
4150: } else {
4151: */
4152: if (m_ContInfo_.index == len) {
4153: return (latinOneCEs_[strength * latinOneTableLen_
4154: + latinOneOffset]);
4155: } else {
4156: schar = s.charAt(m_ContInfo_.index);
4157: }
4158: //}
4159:
4160: while (schar > (tchar = m_contractionIndex_[UCharOffset
4161: + offset]/**(UCharOffset+offset)*/
4162: )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
4163: offset++;
4164: }
4165:
4166: if (schar == tchar) {
4167: m_ContInfo_.index++;
4168: return (latinOneCEs_[strength * latinOneTableLen_
4169: + latinOneOffset + offset]);
4170: } else {
4171: if (schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
4172: return BAIL_OUT_CE_;
4173: }
4174: // skip completely ignorables
4175: int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
4176: if (isZeroCE == 0) { // we have to ignore completely ignorables
4177: m_ContInfo_.index++;
4178: continue;
4179: }
4180:
4181: return (latinOneCEs_[strength * latinOneTableLen_
4182: + latinOneOffset]);
4183: }
4184: }
4185: }
4186:
4187: /**
4188: * This is a fast strcoll, geared towards text in Latin-1.
4189: * It supports contractions of size two, French secondaries
4190: * and case switching. You can use it with strengths primary
4191: * to tertiary. It does not support shifted and case level.
4192: * It relies on the table built by setUpLatinOne(). If it
4193: * doesn't understand something, it falls back to the regular
4194: * strcoll.
4195: */
4196: private final int compareUseLatin1(String source, String target,
4197: int startOffset) {
4198: int sLen = source.length();
4199: int tLen = target.length();
4200:
4201: int strength = getStrength();
4202:
4203: int sIndex = startOffset, tIndex = startOffset;
4204: char sChar = 0, tChar = 0;
4205: int sOrder = 0, tOrder = 0;
4206:
4207: boolean endOfSource = false;
4208:
4209: //uint32_t *elements = coll->latinOneCEs;
4210:
4211: boolean haveContractions = false; // if we have contractions in our string
4212: // we cannot do French secondary
4213:
4214: int offset = latinOneTableLen_;
4215:
4216: // Do the primary level
4217: primLoop: for (;;) {
4218: while (sOrder == 0) { // this loop skips primary ignorables
4219: // sOrder=getNextlatinOneCE(source);
4220: if (sIndex == sLen) {
4221: endOfSource = true;
4222: break;
4223: }
4224: sChar = source.charAt(sIndex++); //[sIndex++];
4225: //}
4226: if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4227: //fprintf(stderr, "R");
4228: return compareRegular(source, target, startOffset);
4229: }
4230: sOrder = latinOneCEs_[sChar];
4231: if (isSpecial(sOrder)) { // if we got a special
4232: // specials can basically be either contractions or bail-out signs. If we get anything
4233: // else, we'll bail out anyway
4234: if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4235: m_ContInfo_.index = sIndex;
4236: sOrder = getLatinOneContraction(0, sOrder,
4237: source);
4238: sIndex = m_ContInfo_.index;
4239: haveContractions = true; // if there are contractions, we cannot do French secondary
4240: // However, if there are contractions in the table, but we always use just one char,
4241: // we might be able to do French. This should be checked out.
4242: }
4243: if (isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
4244: //fprintf(stderr, "S");
4245: return compareRegular(source, target,
4246: startOffset);
4247: }
4248: }
4249: }
4250:
4251: while (tOrder == 0) { // this loop skips primary ignorables
4252: // tOrder=getNextlatinOneCE(target);
4253: if (tIndex == tLen) {
4254: if (endOfSource) {
4255: break primLoop;
4256: } else {
4257: return 1;
4258: }
4259: }
4260: tChar = target.charAt(tIndex++); //[tIndex++];
4261: if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4262: //fprintf(stderr, "R");
4263: return compareRegular(source, target, startOffset);
4264: }
4265: tOrder = latinOneCEs_[tChar];
4266: if (isSpecial(tOrder)) {
4267: // Handling specials, see the comments for source
4268: if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4269: m_ContInfo_.index = tIndex;
4270: tOrder = getLatinOneContraction(0, tOrder,
4271: target);
4272: tIndex = m_ContInfo_.index;
4273: haveContractions = true;
4274: }
4275: if (isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
4276: //fprintf(stderr, "S");
4277: return compareRegular(source, target,
4278: startOffset);
4279: }
4280: }
4281: }
4282: if (endOfSource) { // source is finished, but target is not, say the result.
4283: return -1;
4284: }
4285:
4286: if (sOrder == tOrder) { // if we have same CEs, we continue the loop
4287: sOrder = 0;
4288: tOrder = 0;
4289: continue;
4290: } else {
4291: // compare current top bytes
4292: if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4293: // top bytes differ, return difference
4294: if (sOrder >>> 8 < tOrder >>> 8) {
4295: return -1;
4296: } else {
4297: return 1;
4298: }
4299: // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
4300: // since we must return enum value
4301: }
4302:
4303: // top bytes match, continue with following bytes
4304: sOrder <<= 8;
4305: tOrder <<= 8;
4306: }
4307: }
4308:
4309: // after primary loop, we definitely know the sizes of strings,
4310: // so we set it and use simpler loop for secondaries and tertiaries
4311: //sLen = sIndex; tLen = tIndex;
4312: if (strength >= SECONDARY) {
4313: // adjust the table beginning
4314: //latinOneCEs_ += coll->latinOneTableLen;
4315: endOfSource = false;
4316:
4317: if (!m_isFrenchCollation_) { // non French
4318: // This loop is a simplified copy of primary loop
4319: // at this point we know that whole strings are latin-1, so we don't
4320: // check for that. We also know that we only have contractions as
4321: // specials.
4322: //sIndex = 0; tIndex = 0;
4323: sIndex = startOffset;
4324: tIndex = startOffset;
4325: secLoop: for (;;) {
4326: while (sOrder == 0) {
4327: if (sIndex == sLen) {
4328: endOfSource = true;
4329: break;
4330: }
4331: sChar = source.charAt(sIndex++); //[sIndex++];
4332: sOrder = latinOneCEs_[offset + sChar];
4333: if (isSpecial(sOrder)) {
4334: m_ContInfo_.index = sIndex;
4335: sOrder = getLatinOneContraction(1, sOrder,
4336: source);
4337: sIndex = m_ContInfo_.index;
4338: }
4339: }
4340:
4341: while (tOrder == 0) {
4342: if (tIndex == tLen) {
4343: if (endOfSource) {
4344: break secLoop;
4345: } else {
4346: return 1;
4347: }
4348: }
4349: tChar = target.charAt(tIndex++); //[tIndex++];
4350: tOrder = latinOneCEs_[offset + tChar];
4351: if (isSpecial(tOrder)) {
4352: m_ContInfo_.index = tIndex;
4353: tOrder = getLatinOneContraction(1, tOrder,
4354: target);
4355: tIndex = m_ContInfo_.index;
4356: }
4357: }
4358: if (endOfSource) {
4359: return -1;
4360: }
4361:
4362: if (sOrder == tOrder) {
4363: sOrder = 0;
4364: tOrder = 0;
4365: continue;
4366: } else {
4367: // see primary loop for comments on this
4368: if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4369: if (sOrder >>> 8 < tOrder >>> 8) {
4370: return -1;
4371: } else {
4372: return 1;
4373: }
4374: }
4375: sOrder <<= 8;
4376: tOrder <<= 8;
4377: }
4378: }
4379: } else { // French
4380: if (haveContractions) { // if we have contractions, we have to bail out
4381: // since we don't really know how to handle them here
4382: return compareRegular(source, target, startOffset);
4383: }
4384: // For French, we go backwards
4385: sIndex = sLen;
4386: tIndex = tLen;
4387: secFLoop: for (;;) {
4388: while (sOrder == 0) {
4389: if (sIndex == startOffset) {
4390: endOfSource = true;
4391: break;
4392: }
4393: sChar = source.charAt(--sIndex); //[--sIndex];
4394: sOrder = latinOneCEs_[offset + sChar];
4395: // don't even look for contractions
4396: }
4397:
4398: while (tOrder == 0) {
4399: if (tIndex == startOffset) {
4400: if (endOfSource) {
4401: break secFLoop;
4402: } else {
4403: return 1;
4404: }
4405: }
4406: tChar = target.charAt(--tIndex); //[--tIndex];
4407: tOrder = latinOneCEs_[offset + tChar];
4408: // don't even look for contractions
4409: }
4410: if (endOfSource) {
4411: return -1;
4412: }
4413:
4414: if (sOrder == tOrder) {
4415: sOrder = 0;
4416: tOrder = 0;
4417: continue;
4418: } else {
4419: // see the primary loop for comments
4420: if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4421: if (sOrder >>> 8 < tOrder >>> 8) {
4422: return -1;
4423: } else {
4424: return 1;
4425: }
4426: }
4427: sOrder <<= 8;
4428: tOrder <<= 8;
4429: }
4430: }
4431: }
4432: }
4433:
4434: if (strength >= TERTIARY) {
4435: // tertiary loop is the same as secondary (except no French)
4436: offset += latinOneTableLen_;
4437: //sIndex = 0; tIndex = 0;
4438: sIndex = startOffset;
4439: tIndex = startOffset;
4440: endOfSource = false;
4441: for (;;) {
4442: while (sOrder == 0) {
4443: if (sIndex == sLen) {
4444: endOfSource = true;
4445: break;
4446: }
4447: sChar = source.charAt(sIndex++); //[sIndex++];
4448: sOrder = latinOneCEs_[offset + sChar];
4449: if (isSpecial(sOrder)) {
4450: m_ContInfo_.index = sIndex;
4451: sOrder = getLatinOneContraction(2, sOrder,
4452: source);
4453: sIndex = m_ContInfo_.index;
4454: }
4455: }
4456: while (tOrder == 0) {
4457: if (tIndex == tLen) {
4458: if (endOfSource) {
4459: return 0; // if both strings are at the end, they are equal
4460: } else {
4461: return 1;
4462: }
4463: }
4464: tChar = target.charAt(tIndex++); //[tIndex++];
4465: tOrder = latinOneCEs_[offset + tChar];
4466: if (isSpecial(tOrder)) {
4467: m_ContInfo_.index = tIndex;
4468: tOrder = getLatinOneContraction(2, tOrder,
4469: target);
4470: tIndex = m_ContInfo_.index;
4471: }
4472: }
4473: if (endOfSource) {
4474: return -1;
4475: }
4476: if (sOrder == tOrder) {
4477: sOrder = 0;
4478: tOrder = 0;
4479: continue;
4480: } else {
4481: if (((sOrder ^ tOrder) & 0xff000000) != 0) {
4482: if (sOrder >>> 8 < tOrder >>> 8) {
4483: return -1;
4484: } else {
4485: return 1;
4486: }
4487: }
4488: sOrder <<= 8;
4489: tOrder <<= 8;
4490: }
4491: }
4492: }
4493: return 0;
4494: }
4495:
4496: /**
4497: * Get the version of this collator object.
4498: * @return the version object associated with this collator
4499: * @stable ICU 2.8
4500: */
4501: public VersionInfo getVersion() {
4502: /* RunTime version */
4503: int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
4504: /* Builder version*/
4505: int bdVersion = m_version_.getMajor();
4506:
4507: /* Charset Version. Need to get the version from cnv files
4508: * makeconv should populate cnv files with version and
4509: * an api has to be provided in ucnv.h to obtain this version
4510: */
4511: int csVersion = 0;
4512:
4513: /* combine the version info */
4514: int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;
4515:
4516: /* Tailoring rules */
4517: return VersionInfo.getInstance(cmbVersion >> 8,
4518: cmbVersion & 0xFF, m_version_.getMinor(),
4519: UCA_.m_UCA_version_.getMajor());
4520:
4521: // versionInfo[0] = (uint8_t)(cmbVersion>>8);
4522: // versionInfo[1] = (uint8_t)cmbVersion;
4523: // versionInfo[2] = coll->image->version[1];
4524: // versionInfo[3] = coll->UCA->image->UCAVersion[0];
4525: }
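// Worked example of the packing above: with rtVersion = 6, bdVersion = 8
// and csVersion = 0,
//
//     cmbVersion = ((6 << 11) | (8 << 6) | 0) & 0xFFFF   // = 0x3200
//     major = cmbVersion >> 8                            // = 0x32
//     minor = cmbVersion & 0xFF                          // = 0x00
//
// i.e. the runtime version occupies the top 5 bits, the builder version the
// next 5 bits and the charset version the low 6 bits of the combined value.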
4526:
4527: /**
4528: * Get the UCA version of this collator object.
4529: * @return the version object associated with this collator
4530: * @stable ICU 2.8
4531: */
4532: public VersionInfo getUCAVersion() {
4533: return UCA_.m_UCA_version_;
4534: }
4535:
4536: private transient boolean m_reallocLatinOneCEs_;
4537: }
|