0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */package com.ibm.icu.text;
0007:
0008: import java.io.IOException;
0009: import java.text.ParseException;
0010: import java.util.Hashtable;
0011: import java.util.Vector;
0012: import java.util.Arrays;
0013: import java.util.Enumeration;
0014:
0015: import com.ibm.icu.impl.TrieBuilder;
0016: import com.ibm.icu.impl.IntTrieBuilder;
0017: import com.ibm.icu.impl.TrieIterator;
0018: import com.ibm.icu.impl.Utility;
0019: import com.ibm.icu.impl.UCharacterProperty;
0020: import com.ibm.icu.lang.UCharacter;
0021: import com.ibm.icu.lang.UCharacterCategory;
0022: import com.ibm.icu.impl.NormalizerImpl;
0023: import com.ibm.icu.util.RangeValueIterator;
0024: import com.ibm.icu.util.VersionInfo;
0025:
0026: /**
0027: * Class for building a collator from a list of collation rules.
* This class uses CollationRuleParser.
0029: * @author Syn Wee Quek
0030: * @since release 2.2, June 11 2002
0031: * @draft 2.2
0032: */
0033: final class CollationParsedRuleBuilder {
0034: // package private constructors ------------------------------------------
0035:
0036: /**
0037: * Constructor
0038: * @param rules collation rules
0039: * @exception ParseException thrown when argument rules have an invalid
0040: * syntax
0041: */
    CollationParsedRuleBuilder(String rules) throws ParseException {
        // Parse the rule string into token lists up front; invalid rule
        // syntax surfaces here as a ParseException.
        m_parser_ = new CollationRuleParser(rules);
        m_parser_.assembleTokenList();
        // Shared collation element iterator over the root (UCA) collator,
        // reused by the build methods to fetch CEs for arbitrary strings.
        m_utilColEIter_ = RuleBasedCollator.UCA_
                .getCollationElementIterator("");
    }
0048:
0049: // package private inner classes -----------------------------------------
0050:
0051: /**
0052: * Inverse UCA wrapper
0053: */
    /**
     * Wrapper around the inverse UCA table. The inverse table maps in sorted
     * CE order, so tailoring code can locate the collation elements
     * immediately before or after a given CE at a given strength, and find
     * the weight gaps into which tailored CEs are fitted.
     */
    static class InverseUCA {
        // package private constructor ---------------------------------------

        InverseUCA() {
        }

        // package private data member ---------------------------------------

        /**
         * Flat array of inverse-UCA entries stored as triples: for entry i,
         * m_table_[3*i] is the CE, m_table_[3*i + 1] the continuation CE,
         * and m_table_[3*i + 2] refers into m_continuations_.
         */
        int m_table_[];
        /**
         * Array list of continuation characters
         */
        char m_continuations_[];

        /**
         * UCA version of inverse UCA table
         */
        VersionInfo m_UCA_version_;

        // package private method --------------------------------------------

        /**
         * Returns the previous inverse ces of the argument ces
         * @param ce ce to test
         * @param contce continuation ce to test
         * @param strength collation strength
         * @param prevresult an array to store the return results previous
         *            inverse ce and previous inverse continuation ce
         * @return index of the previous inverse CE in m_table_, or -1 if
         *         the argument CEs are not found
         */
        final int getInversePrevCE(int ce, int contce, int strength,
                int prevresult[]) {
            int result = findInverseCE(ce, contce);

            if (result < 0) {
                // CE is not in the inverse table at all
                prevresult[0] = CollationElementIterator.NULLORDER;
                return -1;
            }

            // keep only the weights significant at the requested strength
            ce &= STRENGTH_MASK_[strength];
            contce &= STRENGTH_MASK_[strength];

            prevresult[0] = ce;
            prevresult[1] = contce;

            // walk backwards until we reach an entry that differs from the
            // argument CEs at the requested strength (or hit index 0)
            while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
                    && (prevresult[1] & STRENGTH_MASK_[strength]) == contce
                    && result > 0) {
                // this condition should prevent falling off the edge of the
                // world
                // here, we end up in a singularity - zero
                prevresult[0] = m_table_[3 * (--result)];
                prevresult[1] = m_table_[3 * result + 1];
            }
            return result;
        }

        /**
         * Returns the strongest strength level at which the two CE pairs
         * differ: PRIMARY if the primaries differ, SECONDARY if only the
         * secondaries differ, TERTIARY otherwise (including equal CEs).
         */
        final int getCEStrengthDifference(int CE, int contCE,
                int prevCE, int prevContCE) {
            int strength = Collator.TERTIARY;
            // lower the strength while the CEs still differ when masked at
            // that strength; stop at PRIMARY (0)
            while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
                    && (strength != 0)) {
                strength--;
            }
            return strength;
        }

        /**
         * Unsigned comparison of two (CE, continuation CE) pairs, weight
         * level by weight level: primaries first, then secondaries, then
         * tertiaries. Non-continuation second CEs are treated as 0.
         * @return negative, zero or positive as source is less than, equal
         *         to, or greater than target
         */
        private int compareCEs(int source0, int source1, int target0,
                int target1) {
            int s1 = source0, s2, t1 = target0, t2;
            if (RuleBasedCollator.isContinuation(source1)) {
                s2 = source1;
            } else {
                s2 = 0;
            }
            if (RuleBasedCollator.isContinuation(target1)) {
                t2 = target1;
            } else {
                t2 = 0;
            }

            int s = 0, t = 0;
            if (s1 == t1 && s2 == t2) {
                return 0;
            }
            // combine the 16-bit primaries of CE and continuation CE into
            // one 32-bit primary weight and compare
            s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >> 16);
            t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >> 16);
            if (s == t) {
                // primaries equal: compare combined secondaries
                s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
                t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
                if (s == t) {
                    // secondaries equal: compare combined tertiaries
                    s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
                    t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
                    return Utility.compareUnsigned(s, t);
                } else {
                    return Utility.compareUnsigned(s, t);
                }
            } else {
                return Utility.compareUnsigned(s, t);
            }
        }

        /**
         * Finding the inverse CE of the argument CEs.
         * Binary search over the triples in m_table_.
         * @param ce CE to be tested
         * @param contce continuation CE
         * @return index of the matching entry; if no exact match exists,
         *         the last probed index is returned
         */
        int findInverseCE(int ce, int contce) {
            int bottom = 0;
            int top = m_table_.length / 3;
            int result = 0;

            while (bottom < top - 1) {
                result = (top + bottom) >> 1;
                int first = m_table_[3 * result];
                int second = m_table_[3 * result + 1];
                int comparison = compareCEs(first, second, ce, contce);
                if (comparison > 0) {
                    top = result;
                } else if (comparison < 0) {
                    bottom = result;
                } else {
                    break;
                }
            }

            return result;
        }

        /**
         * Getting gap offsets in the inverse UCA.
         * Fills listheader.m_gapsLo_/m_gapsHi_ with the merged low/high
         * weights (per strength) of the gap the tailored tokens must fit
         * into. Implicit and indirect base CEs are handled specially;
         * otherwise the gap boundaries come from the inverse table.
         * @param listheader parsed token lists
         * @exception Exception thrown when error occurs while finding the
         *            collation gaps
         */
        void getInverseGapPositions(
                CollationRuleParser.TokenListHeader listheader)
                throws Exception {
            // reset all the gaps
            CollationRuleParser.Token token = listheader.m_first_;
            int tokenstrength = token.m_strength_;

            for (int i = 0; i < 3; i++) {
                listheader.m_gapsHi_[3 * i] = 0;
                listheader.m_gapsHi_[3 * i + 1] = 0;
                listheader.m_gapsHi_[3 * i + 2] = 0;
                listheader.m_gapsLo_[3 * i] = 0;
                listheader.m_gapsLo_[3 * i + 1] = 0;
                listheader.m_gapsLo_[3 * i + 2] = 0;
                listheader.m_numStr_[i] = 0;
                listheader.m_fStrToken_[i] = null;
                listheader.m_lStrToken_[i] = null;
                listheader.m_pos_[i] = -1;
            }

            // the lead byte of the base CE tells whether it is in the
            // implicit primary range
            if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
                    && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
                // implicits - the gap low is the base CE itself; the gap
                // high is synthesized from the next raw implicit primary
                listheader.m_pos_[0] = 0;
                int t1 = listheader.m_baseCE_;
                int t2 = listheader.m_baseContCE_;
                listheader.m_gapsLo_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsLo_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsLo_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
                int primaryCE = t1
                        & RuleBasedCollator.CE_PRIMARY_MASK_
                        | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
                // advance to the implicit primary one step past the base
                primaryCE = RuleBasedCollator.impCEGen_
                        .getImplicitFromRaw(RuleBasedCollator.impCEGen_
                                .getRawFromImplicit(primaryCE) + 1);

                // 0x0505 supplies common secondary/tertiary weights
                t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_
                        | 0x0505;
                t2 = (primaryCE << 16)
                        & RuleBasedCollator.CE_PRIMARY_MASK_
                        | RuleBasedCollator.CE_CONTINUATION_MARKER_;

                // if (listheader.m_baseCE_ < 0xEF000000) {
                // // first implicits have three byte primaries, with a gap of
                // // one so we esentially need to add 2 to the top byte in
                // // listheader.m_baseContCE_
                // t2 += 0x02000000;
                // }
                // else {
                // // second implicits have four byte primaries, with a gap of
                // // IMPLICIT_LAST2_MULTIPLIER_
                // // Now, this guy is not really accessible here, so until we
                // // find a better way to pass it around, assume that the gap is 1
                // t2 += 0x00020000;
                // }
                listheader.m_gapsHi_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsHi_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsHi_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
            } else if (listheader.m_indirect_ == true
                    && listheader.m_nextCE_ != 0) {
                // indirect positioning (e.g. [top]): gap runs from the
                // base CE to the precomputed next CE
                listheader.m_pos_[0] = 0;
                int t1 = listheader.m_baseCE_;
                int t2 = listheader.m_baseContCE_;
                listheader.m_gapsLo_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsLo_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsLo_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
                t1 = listheader.m_nextCE_;
                t2 = listheader.m_nextContCE_;
                listheader.m_gapsHi_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsHi_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsHi_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
            } else {
                // regular case: for each strength present in the token
                // list, locate the next CE in the inverse table and record
                // the first/last tokens at that strength
                while (true) {
                    if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
                        listheader.m_pos_[tokenstrength] = getInverseNext(
                                listheader, tokenstrength);
                        if (listheader.m_pos_[tokenstrength] >= 0) {
                            listheader.m_fStrToken_[tokenstrength] = token;
                        } else {
                            // The CE must be implicit, since it's not in the
                            // table
                            // Error
                            throw new Exception(
                                    "Internal program error");
                        }
                    }

                    // skip over all tokens at this or weaker strength,
                    // tracking the last one seen per strength
                    while (token != null
                            && token.m_strength_ >= tokenstrength) {
                        if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
                            listheader.m_lStrToken_[tokenstrength] = token;
                        }
                        token = token.m_next_;
                    }
                    if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
                        // check if previous interval is the same and merge the
                        // intervals if it is so
                        if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
                            listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
                            listheader.m_fStrToken_[tokenstrength + 1] = null;
                            listheader.m_lStrToken_[tokenstrength + 1] = null;
                            listheader.m_pos_[tokenstrength + 1] = -1;
                        }
                    }
                    if (token != null) {
                        tokenstrength = token.m_strength_;
                    } else {
                        break;
                    }
                }
                // translate the found inverse-table positions into merged
                // low/high gap weights per strength
                for (int st = 0; st < 3; st++) {
                    int pos = listheader.m_pos_[st];
                    if (pos >= 0) {
                        int t1 = m_table_[3 * pos];
                        int t2 = m_table_[3 * pos + 1];
                        listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
                                Collator.PRIMARY);
                        listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1,
                                t2, Collator.SECONDARY);
                        listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
                                | (t2 & 0x3f) << 16;
                        //pos --;
                        //t1 = m_table_[3 * pos];
                        //t2 = m_table_[3 * pos + 1];
                        t1 = listheader.m_baseCE_;
                        t2 = listheader.m_baseContCE_;

                        listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
                                Collator.PRIMARY);
                        listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1,
                                t2, Collator.SECONDARY);
                        listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
                                | (t2 & 0x3f) << 16;
                    }
                }
            }
        }

        /**
         * Gets the next CE in the inverse table, i.e. the first entry after
         * the base CE that differs from it at the given strength. Also
         * stores that CE pair into listheader.m_nextCE_/m_nextContCE_.
         * NOTE(review): the forward walk has no explicit bound; it relies
         * on the table ending with entries that differ at every strength.
         * @param listheader token list header
         * @param strength collation strength
         * @return index of the next CE, or -1 if the base CE is not found
         */
        private final int getInverseNext(
                CollationRuleParser.TokenListHeader listheader,
                int strength) {
            int ce = listheader.m_baseCE_;
            int secondce = listheader.m_baseContCE_;
            int result = findInverseCE(ce, secondce);

            if (result < 0) {
                return -1;
            }

            // mask to the weights significant at this strength
            ce &= STRENGTH_MASK_[strength];
            secondce &= STRENGTH_MASK_[strength];

            int nextce = ce;
            int nextcontce = secondce;

            // scan forward until the masked weights change
            while ((nextce & STRENGTH_MASK_[strength]) == ce
                    && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
                nextce = m_table_[3 * (++result)];
                nextcontce = m_table_[3 * result + 1];
            }

            listheader.m_nextCE_ = nextce;
            listheader.m_nextContCE_ = nextcontce;

            return result;
        }
    }
0378:
0379: // package private data members ------------------------------------------
0380:
    /**
     * Inverse UCA table, loaded once by the static initializer below.
     */
    static final InverseUCA INVERSE_UCA_;

    /**
     * Error message used when the UCA and inverse UCA versions do not match.
     */
    private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";

    /**
     * Error message used when the UCA or inverse UCA could not be loaded.
     */
    private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
0395:
0396: /**
0397: * Initializing the inverse UCA
0398: */
0399: static {
0400: InverseUCA temp = null;
0401: try {
0402: temp = CollatorReader.getInverseUCA();
0403: } catch (IOException e) {
0404: }
0405: /*
0406: try
0407: {
0408: String invdat = "/com/ibm/icu/impl/data/invuca.icu";
0409: InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
0410: BufferedInputStream b = new BufferedInputStream(i, 110000);
0411: INVERSE_UCA_ = CollatorReader.readInverseUCA(b);
0412: b.close();
0413: i.close();
0414: }
0415: catch (Exception e)
0416: {
0417: e.printStackTrace();
0418: throw new RuntimeException(e.getMessage());
0419: }
0420: */
0421:
0422: if (temp != null && RuleBasedCollator.UCA_ != null) {
0423: if (!temp.m_UCA_version_
0424: .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
0425: throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
0426: }
0427: } else {
0428: throw new RuntimeException(UCA_NOT_INSTANTIATED_);
0429: }
0430:
0431: INVERSE_UCA_ = temp;
0432: }
0433:
0434: // package private methods -----------------------------------------------
0435:
0436: /**
0437: * Parse and sets the collation rules in the argument collator
0438: * @param collator to set
0439: * @exception Exception thrown when internal program error occurs
0440: */
0441: void setRules(RuleBasedCollator collator) throws Exception {
0442: if (m_parser_.m_resultLength_ > 0
0443: || m_parser_.m_removeSet_ != null) {
0444: // we have a set of rules, let's make something of it
0445: assembleTailoringTable(collator);
0446: } else { // no rules, but no error either must be only options
0447: // We will init the collator from UCA
0448: collator.setWithUCATables();
0449: }
0450: // And set only the options
0451: m_parser_.setDefaultOptionsInCollator(collator);
0452: }
0453:
0454: private void copyRangeFromUCA(BuildTable t, int start, int end) {
0455: int u = 0;
0456: for (u = start; u <= end; u++) {
0457: // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
0458: int CE = t.m_mapping_.getValue(u);
0459: if (CE == CE_NOT_FOUND_
0460: // this test is for contractions that are missing the starting
0461: // element. Looks like latin-1 should be done before
0462: // assembling the table, even if it results in more false
0463: // closure elements
0464: || (isContractionTableElement(CE) && getCE(
0465: t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
0466: //m_utilElement_.m_uchars_ = str.toString();
0467: m_utilElement_.m_uchars_ = UCharacter.toString(u);
0468: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
0469: m_utilElement_.m_prefix_ = 0;
0470: m_utilElement_.m_CELength_ = 0;
0471: m_utilColEIter_.setText(m_utilElement_.m_uchars_);
0472: while (CE != CollationElementIterator.NULLORDER) {
0473: CE = m_utilColEIter_.next();
0474: if (CE != CollationElementIterator.NULLORDER) {
0475: m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
0476: }
0477: }
0478: addAnElement(t, m_utilElement_);
0479: }
0480: }
0481: }
0482:
0483: /**
0484: * 2. Eliminate the negative lists by doing the following for each
0485: * non-null negative list:
0486: * o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
0487: * create new ListHeader X
0488: * o reverse the list, add to the end of X's positive list. Reset the
0489: * strength of the first item you add, based on the stronger strength
0490: * levels of the two lists.
0491: *
0492: * 3. For each ListHeader with a non-null positive list:
0493: * o Find all character strings with CEs between the baseCE and the
0494: * next/previous CE, at the strength of the first token. Add these to the
0495: * tailoring.
0496: * ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
0497: * tailoring has & x < z...
0498: * ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
0499: *
0500: * It is possible that this part should be done even while constructing list
0501: * The problem is that it is unknown what is going to be the strongest
0502: * weight.
0503: * So we might as well do it here
0504: * o Allocate CEs for each token in the list, based on the total number N
0505: * of the largest level difference, and the gap G between baseCE and nextCE
0506: * at that level. The relation * between the last item and nextCE is the
0507: * same as the strongest strength.
0508: * o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
0509: * ? There are 3 primary items: a, d, e. Fit them into the primary gap.
0510: * Then fit b and c into the secondary gap between a and d, then fit q
0511: * into the tertiary gap between b and c.
0512: * o Example: baseCE << b <<< q << c * nextCE(X,2)
0513: * ? There are 2 secondary items: b, c. Fit them into the secondary gap.
0514: * Then fit q into the tertiary gap between b and c.
0515: * o When incrementing primary values, we will not cross high byte
0516: * boundaries except where there is only a single-byte primary. That is
0517: * to ensure that the script reordering will continue to work.
0518: * @param collator the rule based collator to update
0519: * @exception Exception thrown when internal program error occurs
0520: */
    void assembleTailoringTable(RuleBasedCollator collator)
            throws Exception {

        // Step 1: allocate CE values for every token list.
        for (int i = 0; i < m_parser_.m_resultLength_; i++) {
            // now we need to generate the CEs
            // We stuff the initial value in the buffers, and increase the
            // appropriate buffer according to strength
            if (m_parser_.m_listHeader_[i].m_first_ != null) {
                // if there are any elements
                // due to the way parser works, subsequent tailorings
                // may remove all the elements from a sequence, therefore
                // leaving an empty tailoring sequence.
                initBuffers(m_parser_.m_listHeader_[i]);
            }
        }

        // Step 2: handle [variable top] — record its primary weight in the
        // options and unlink its token from the doubly-linked token list.
        if (m_parser_.m_variableTop_ != null) {
            // stuff the variable top value
            m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
            // remove it from the list
            if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first in list
                m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
            }
            if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
                // last in list
                m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
            }
            if (m_parser_.m_variableTop_.m_next_ != null) {
                m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
            }
            if (m_parser_.m_variableTop_.m_previous_ != null) {
                m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
            }
        }

        BuildTable t = new BuildTable(m_parser_);

        // After this, we have assigned CE values to all regular CEs now we
        // will go through list once more and resolve expansions, make
        // UCAElements structs and add them to table
        for (int i = 0; i < m_parser_.m_resultLength_; i++) {
            // now we need to generate the CEs
            // We stuff the initial value in the buffers, and increase the
            // appropriate buffer according to strength */
            createElements(t, m_parser_.m_listHeader_[i]);
        }

        m_utilElement_.clear();
        StringBuffer str = new StringBuffer();

        // add latin-1 stuff
        copyRangeFromUCA(t, 0, 0xFF);

        // add stuff for copying ([copy] ranges requested by the rules)
        if (m_parser_.m_copySet_ != null) {
            int i = 0;
            for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
                copyRangeFromUCA(t, m_parser_.m_copySet_
                        .getRangeStart(i), m_parser_.m_copySet_
                        .getRangeEnd(i));
            }
        }

        // copy contractions from the UCA - this is felt mostly for cyrillic
        // UCA_CONTRACTIONS_ is scanned in groups of three chars; a 0 first
        // char terminates the list.
        char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
        int offset = 0;
        while (conts[offset] != 0) {
            // tailoredCE = ucmpe32_get(t.m_mapping, *conts);
            int tailoredCE = t.m_mapping_.getValue(conts[offset]);
            if (tailoredCE != CE_NOT_FOUND_) {
                boolean needToAdd = true;
                // skip UCA contractions that the tailoring already covers
                if (isContractionTableElement(tailoredCE)) {
                    if (isTailored(t.m_contractions_, tailoredCE,
                            conts, offset + 1) == true) {
                        needToAdd = false;
                    }
                }
                // never re-add contractions the rules explicitly removed
                if (m_parser_.m_removeSet_ != null
                        && m_parser_.m_removeSet_
                                .contains(conts[offset])) {
                    needToAdd = false;
                }

                if (needToAdd == true) {
                    // we need to add if this contraction is not tailored.
                    m_utilElement_.m_prefix_ = 0;
                    m_utilElement_.m_prefixChars_ = null;
                    m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
                    str.delete(0, str.length());
                    str.append(conts[offset]);
                    str.append(conts[offset + 1]);
                    if (conts[offset + 2] != 0) {
                        // third char is optional (two-char contractions
                        // store a 0 there)
                        str.append(conts[offset + 2]);
                    }
                    m_utilElement_.m_uchars_ = str.toString();
                    m_utilElement_.m_CELength_ = 0;
                    m_utilColEIter_.setText(m_utilElement_.m_uchars_);
                    // collect the UCA CE sequence for the contraction
                    while (true) {
                        int CE = m_utilColEIter_.next();
                        if (CE != CollationElementIterator.NULLORDER) {
                            m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
                        } else {
                            break;
                        }
                    }
                    addAnElement(t, m_utilElement_);
                }
            } else if (m_parser_.m_removeSet_ != null
                    && m_parser_.m_removeSet_.contains(conts[offset])) {
                // removed code point: restore its plain UCA mapping
                copyRangeFromUCA(t, conts[offset], conts[offset]);
            }

            offset += 3;
        }

        // Add completely ignorable elements
        processUCACompleteIgnorables(t);

        // canonical closure
        canonicalClosure(t);

        // still need to produce compatibility closure
        assembleTable(t, collator);
    }
0645:
0646: // private inner classes -------------------------------------------------
0647:
0648: private static class CEGenerator {
0649: // package private data members --------------------------------------
0650:
0651: WeightRange m_ranges_[];
0652: int m_rangesLength_;
0653: int m_byteSize_;
0654: int m_start_;
0655: int m_limit_;
0656: int m_maxCount_;
0657: int m_count_;
0658: int m_current_;
0659: int m_fLow_; // forbidden Low
0660: int m_fHigh_; // forbidden High
0661:
0662: // package private constructor ---------------------------------------
0663:
0664: CEGenerator() {
0665: m_ranges_ = new WeightRange[7];
0666: for (int i = 6; i >= 0; i--) {
0667: m_ranges_[i] = new WeightRange();
0668: }
0669: }
0670: }
0671:
0672: private static class WeightRange implements Comparable {
0673: // public methods ----------------------------------------------------
0674:
0675: /**
0676: * Compares this object with target
0677: * @param target object to compare with
0678: * @return 0 if equals, 1 if this is > target, -1 otherwise
0679: */
0680: public int compareTo(Object target) {
0681: if (this == target) {
0682: return 0;
0683: }
0684: int tstart = ((WeightRange) target).m_start_;
0685: if (m_start_ == tstart) {
0686: return 0;
0687: }
0688: if (m_start_ > tstart) {
0689: return 1;
0690: }
0691: return -1;
0692: }
0693:
0694: /**
0695: * Initialize
0696: */
0697: public void clear() {
0698: m_start_ = 0;
0699: m_end_ = 0;
0700: m_length_ = 0;
0701: m_count_ = 0;
0702: m_length2_ = 0;
0703: m_count2_ = 0;
0704: }
0705:
0706: // package private data members --------------------------------------
0707:
0708: int m_start_;
0709: int m_end_;
0710: int m_length_;
0711: int m_count_;
0712: int m_length2_;
0713: int m_count2_;
0714:
0715: // package private constructor ---------------------------------------
0716:
0717: WeightRange() {
0718: clear();
0719: }
0720:
0721: /**
0722: * Copy constructor.
0723: * Cloneable is troublesome, needs to check for exception
0724: * @param source to clone
0725: */
0726: WeightRange(WeightRange source) {
0727: m_start_ = source.m_start_;
0728: m_end_ = source.m_end_;
0729: m_length_ = source.m_length_;
0730: m_count_ = source.m_count_;
0731: m_length2_ = source.m_length2_;
0732: m_count2_ = source.m_count2_;
0733: }
0734: }
0735:
0736: private static class MaxJamoExpansionTable {
0737: // package private data members --------------------------------------
0738:
0739: Vector m_endExpansionCE_;
0740: // vector of booleans
0741: Vector m_isV_;
0742: byte m_maxLSize_;
0743: byte m_maxVSize_;
0744: byte m_maxTSize_;
0745:
0746: // package private constructor ---------------------------------------
0747:
0748: MaxJamoExpansionTable() {
0749: m_endExpansionCE_ = new Vector();
0750: m_isV_ = new Vector();
0751: m_endExpansionCE_.add(new Integer(0));
0752: m_isV_.add(new Boolean(false));
0753: m_maxLSize_ = 1;
0754: m_maxVSize_ = 1;
0755: m_maxTSize_ = 1;
0756: }
0757:
0758: MaxJamoExpansionTable(MaxJamoExpansionTable table) {
0759: m_endExpansionCE_ = (Vector) table.m_endExpansionCE_
0760: .clone();
0761: m_isV_ = (Vector) table.m_isV_.clone();
0762: m_maxLSize_ = table.m_maxLSize_;
0763: m_maxVSize_ = table.m_maxVSize_;
0764: m_maxTSize_ = table.m_maxTSize_;
0765: }
0766: }
0767:
0768: private static class MaxExpansionTable {
0769: // package private constructor --------------------------------------
0770:
0771: MaxExpansionTable() {
0772: m_endExpansionCE_ = new Vector();
0773: m_expansionCESize_ = new Vector();
0774: m_endExpansionCE_.add(new Integer(0));
0775: m_expansionCESize_.add(new Byte((byte) 0));
0776: }
0777:
0778: MaxExpansionTable(MaxExpansionTable table) {
0779: m_endExpansionCE_ = (Vector) table.m_endExpansionCE_
0780: .clone();
0781: m_expansionCESize_ = (Vector) table.m_expansionCESize_
0782: .clone();
0783: }
0784:
0785: // package private data member --------------------------------------
0786:
0787: Vector m_endExpansionCE_;
0788: Vector m_expansionCESize_;
0789: }
0790:
0791: private static class BasicContractionTable {
0792: // package private constructors -------------------------------------
0793:
0794: BasicContractionTable() {
0795: m_CEs_ = new Vector();
0796: m_codePoints_ = new StringBuffer();
0797: }
0798:
0799: // package private data members -------------------------------------
0800:
0801: StringBuffer m_codePoints_;
0802: Vector m_CEs_;
0803: }
0804:
0805: private static class ContractionTable {
0806: // package private constructor --------------------------------------
0807:
0808: /**
0809: * Builds a contraction table
0810: * @param mapping
0811: */
0812: ContractionTable(IntTrieBuilder mapping) {
0813: m_mapping_ = mapping;
0814: m_elements_ = new Vector();
0815: m_CEs_ = new Vector();
0816: m_codePoints_ = new StringBuffer();
0817: m_offsets_ = new Vector();
0818: m_currentTag_ = CE_NOT_FOUND_TAG_;
0819: }
0820:
0821: /**
0822: * Copies a contraction table.
0823: * Not all data will be copied into their own object.
0824: * @param table
0825: */
0826: ContractionTable(ContractionTable table) {
0827: m_mapping_ = table.m_mapping_;
0828: m_elements_ = (Vector) table.m_elements_.clone();
0829: m_codePoints_ = new StringBuffer(table.m_codePoints_
0830: .toString());
0831: m_CEs_ = (Vector) table.m_CEs_.clone();
0832: m_offsets_ = (Vector) table.m_offsets_.clone();
0833: m_currentTag_ = table.m_currentTag_;
0834: }
0835:
0836: // package private data members ------------------------------------
0837:
0838: /**
0839: * Vector of BasicContractionTable
0840: */
0841: Vector m_elements_;
0842: IntTrieBuilder m_mapping_;
0843: StringBuffer m_codePoints_;
0844: Vector m_CEs_;
0845: Vector m_offsets_;
0846: int m_currentTag_;
0847: }
0848:
0849: private static final class BuildTable implements
0850: TrieBuilder.DataManipulate {
0851: // package private methods ------------------------------------------
0852:
0853: /**
0854: * For construction of the Trie tables.
0855: * Has to be labeled public
0856: * @param cp
0857: * @param offset
0858: * @return data offset or 0
0859: * @draft 2.2
0860: */
0861: public int getFoldedValue(int cp, int offset) {
0862: int limit = cp + 0x400;
0863: while (cp < limit) {
0864: int value = m_mapping_.getValue(cp);
0865: boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
0866: int tag = getCETag(value);
0867: if (inBlockZero == true) {
0868: cp += TrieBuilder.DATA_BLOCK_LENGTH;
0869: } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
0870: // These are values that are starting in either UCA
0871: // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
0872: // Presence of these tags means that there is nothing in
0873: // this position and that it should be skipped.
0874: return RuleBasedCollator.CE_SPECIAL_FLAG_
0875: | (CE_SURROGATE_TAG_ << 24) | offset;
0876: } else {
0877: ++cp;
0878: }
0879: }
0880: return 0;
0881: }
0882:
0883: // package private constructor --------------------------------------
0884:
0885: /**
0886: * Returns a table
0887: */
0888: BuildTable(CollationRuleParser parser) {
0889: m_collator_ = new RuleBasedCollator();
0890: m_collator_.setWithUCAData();
0891: MaxExpansionTable maxet = new MaxExpansionTable();
0892: MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
0893: m_options_ = parser.m_options_;
0894: m_expansions_ = new Vector();
0895: // Do your own mallocs for the structure, array and have linear
0896: // Latin 1
0897: int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
0898: | (CE_NOT_FOUND_TAG_ << 24);
0899: // temporary fix for jb3822, 0x100000 -> 30000
0900: m_mapping_ = new IntTrieBuilder(null, 0x30000,
0901: trieinitialvalue, trieinitialvalue, true);
0902: m_prefixLookup_ = new Hashtable();
0903: // uhash_open(prefixLookupHash, prefixLookupComp);
0904: m_contractions_ = new ContractionTable(m_mapping_);
0905: // copy UCA's maxexpansion and merge as we go along
0906: m_maxExpansions_ = maxet;
0907: // adding an extra initial value for easier manipulation
0908: for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
0909: maxet.m_endExpansionCE_.add(new Integer(
0910: RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
0911: maxet.m_expansionCESize_
0912: .add(new Byte(
0913: RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
0914: }
0915: m_maxJamoExpansions_ = maxjet;
0916:
0917: m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
0918: m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
0919: Arrays.fill(m_unsafeCP_, (byte) 0);
0920: Arrays.fill(m_contrEndCP_, (byte) 0);
0921: }
0922:
0923: /**
0924: * Duplicating a BuildTable.
0925: * Not all data will be duplicated into their own object.
0926: * @param table to clone
0927: */
0928: BuildTable(BuildTable table) {
0929: m_collator_ = table.m_collator_;
0930: m_mapping_ = new IntTrieBuilder(table.m_mapping_);
0931: m_expansions_ = (Vector) table.m_expansions_.clone();
0932: m_contractions_ = new ContractionTable(
0933: table.m_contractions_);
0934: m_contractions_.m_mapping_ = m_mapping_;
0935: m_options_ = table.m_options_;
0936: m_maxExpansions_ = new MaxExpansionTable(
0937: table.m_maxExpansions_);
0938: m_maxJamoExpansions_ = new MaxJamoExpansionTable(
0939: table.m_maxJamoExpansions_);
0940: m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
0941: System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
0942: m_unsafeCP_.length);
0943: m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
0944: System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
0945: m_contrEndCP_.length);
0946: }
0947:
// package private data members -------------------------------------

/** collator being built */
RuleBasedCollator m_collator_;
/** trie mapping code points to CEs or special tagged values */
IntTrieBuilder m_mapping_;
/** storage for expansion CEs */
Vector m_expansions_;
/** contraction data for this table; its m_mapping_ aliases ours */
ContractionTable m_contractions_;
// UCATableHeader image;
/** attribute options produced by the rule parser */
CollationRuleParser.OptionSet m_options_;
/** maximum expansion sizes, seeded from the UCA's and merged as we go */
MaxExpansionTable m_maxExpansions_;
/** maximum Jamo expansion sizes */
MaxJamoExpansionTable m_maxJamoExpansions_;
/** unsafe-codepoint bitset (1 bit per latin-1 char + hashed rest) */
byte m_unsafeCP_[];
/** bitset of code points that can end a contraction */
byte m_contrEndCP_[];
/** lookup of prefix starter elements seen so far */
Hashtable m_prefixLookup_;
}
0962:
0963: private static class Elements {
0964: // package private data members -------------------------------------
0965:
0966: String m_prefixChars_;
0967: int m_prefix_;
0968: String m_uchars_;
0969: /**
0970: * Working string
0971: */
0972: String m_cPoints_;
0973: /**
0974: * Offset to the working string
0975: */
0976: int m_cPointsOffset_;
0977: /**
0978: * These are collation elements - there could be more than one - in
0979: * case of expansion
0980: */
0981: int m_CEs_[];
0982: int m_CELength_;
0983: /**
0984: * This is the value element maps in original table
0985: */
0986: int m_mapCE_;
0987: int m_sizePrim_[];
0988: int m_sizeSec_[];
0989: int m_sizeTer_[];
0990: boolean m_variableTop_;
0991: boolean m_caseBit_;
0992:
0993: // package private constructors -------------------------------------
0994:
0995: /**
0996: * Package private constructor
0997: */
0998: Elements() {
0999: m_sizePrim_ = new int[128];
1000: m_sizeSec_ = new int[128];
1001: m_sizeTer_ = new int[128];
1002: m_CEs_ = new int[256];
1003: m_CELength_ = 0;
1004: }
1005:
1006: /**
1007: * Package private constructor
1008: */
1009: Elements(Elements element) {
1010: m_prefixChars_ = element.m_prefixChars_;
1011: m_prefix_ = element.m_prefix_;
1012: m_uchars_ = element.m_uchars_;
1013: m_cPoints_ = element.m_cPoints_;
1014: m_cPointsOffset_ = element.m_cPointsOffset_;
1015: m_CEs_ = element.m_CEs_;
1016: m_CELength_ = element.m_CELength_;
1017: m_mapCE_ = element.m_mapCE_;
1018: m_sizePrim_ = element.m_sizePrim_;
1019: m_sizeSec_ = element.m_sizeSec_;
1020: m_sizeTer_ = element.m_sizeTer_;
1021: m_variableTop_ = element.m_variableTop_;
1022: m_caseBit_ = element.m_caseBit_;
1023: }
1024:
1025: // package private methods -------------------------------------------
1026:
1027: /**
1028: * Initializing the elements
1029: */
1030: public void clear() {
1031: m_prefixChars_ = null;
1032: m_prefix_ = 0;
1033: m_uchars_ = null;
1034: m_cPoints_ = null;
1035: m_cPointsOffset_ = 0;
1036: m_CELength_ = 0;
1037: m_mapCE_ = 0;
1038: Arrays.fill(m_sizePrim_, 0);
1039: Arrays.fill(m_sizeSec_, 0);
1040: Arrays.fill(m_sizeTer_, 0);
1041: m_variableTop_ = false;
1042: m_caseBit_ = false;
1043: }
1044:
1045: /**
1046: * Hashcode calculation for token
1047: * @return the hashcode
1048: */
1049: public int hashCode() {
1050: String str = m_cPoints_.substring(m_cPointsOffset_);
1051: return str.hashCode();
1052: }
1053:
1054: /**
1055: * Equals calculation
1056: * @param target object to compare
1057: * @return true if target is the same as this object
1058: */
1059: public boolean equals(Object target) {
1060: if (target == this ) {
1061: return true;
1062: }
1063: if (target instanceof Elements) {
1064: Elements t = (Elements) target;
1065: int size = m_cPoints_.length() - m_cPointsOffset_;
1066: if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
1067: return t.m_cPoints_.regionMatches(
1068: t.m_cPointsOffset_, m_cPoints_,
1069: m_cPointsOffset_, size);
1070: }
1071: }
1072: return false;
1073: }
1074: }
1075:
// private data member ---------------------------------------------------

/**
 * Maximum strength used in CE building
 */
private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
/**
 * Maximum collation strength
 */
private static final int CE_STRENGTH_LIMIT_ = 16;
/**
 * Strength mask array, used in inverse UCA; index is the strength,
 * value keeps the bytes significant at that strength
 */
private static final int STRENGTH_MASK_[] = { 0xFFFF0000,
        0xFFFFFF00, 0xFFFFFFFF };
/**
 * CE tag for not found
 */
private static final int CE_NOT_FOUND_ = 0xF0000000;
/**
 * Tag value (stored in bits 24..27 of a special CE) for "not found"
 */
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
 * This code point results in an expansion
 */
private static final int CE_EXPANSION_TAG_ = 1;
/**
 * Start of a contraction
 */
private static final int CE_CONTRACTION_TAG_ = 2;
/**
 * Thai character - do the reordering
 */
private static final int CE_THAI_TAG_ = 3;
/**
 * Charset processing, not yet implemented
 */
private static final int CE_CHARSET_TAG_ = 4;
/**
 * Lead surrogate that is tailored and doesn't start a contraction
 */
private static final int CE_SURROGATE_TAG_ = 5;
/**
 * AC00-D7AF
 */
private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/**
 * D800-DBFF
 */
private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/**
 * DC00-DFFF
 */
private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/**
 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
 */
private static final int CE_CJK_IMPLICIT_TAG_ = 9;
/**
 * Implicit CE generation for other (non-CJK) unassigned code points
 */
private static final int CE_IMPLICIT_TAG_ = 10;
/**
 * Presumably marks CEs needing prefix (special) processing — not
 * visible in this chunk; verify against ICU4C's tag list
 */
private static final int CE_SPEC_PROC_TAG_ = 11;
/**
 * This is a three byte primary with starting secondaries and tertiaries.
 * It fits in a single 32 bit CE and is used instead of expansion to save
 * space without affecting the performance (hopefully)
 */
private static final int CE_LONG_PRIMARY_TAG_ = 12;
/**
 * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each
 * latin 1 char + some power of two for hashing the rest of the chars.
 * Size in bytes
 */
private static final int UNSAFECP_TABLE_SIZE_ = 1056;
/**
 * Mask value down to "some power of two" -1. Number of bits, not num of
 * bytes.
 */
private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
/**
 * Case values stored in bits 6..7 of a CE
 */
private static final int UPPER_CASE_ = 0x80;
private static final int MIXED_CASE_ = 0x40;
private static final int LOWER_CASE_ = 0x00;
/**
 * Initial table size
 */
private static final int INIT_TABLE_SIZE_ = 1028;
/**
 * Header size, copied from ICU4C, to be changed when that value changes
 */
private static final int HEADER_SIZE_ = 0xC4;
/**
 * Contraction table new element indicator
 */
private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
/**
 * Parser for the rules
 */
private CollationRuleParser m_parser_;
/**
 * Utility UCA collation element iterator
 */
private CollationElementIterator m_utilColEIter_;
/**
 * Utility data members: reusable scratch objects shared across build
 * passes to avoid repeated allocation (the builder is single-use)
 */
// one CE generator per basic strength (primary/secondary/tertiary)
private CEGenerator m_utilGens_[] = { new CEGenerator(),
        new CEGenerator(), new CEGenerator() };
// one CE part per basic strength
private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
// per-strength counters (see initBuffers)
private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
// reusable elements and token for hash lookups and table insertion
private Elements m_utilElement_ = new Elements();
private Elements m_utilElement2_ = new Elements();
private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
private int m_utilCountBuffer_[] = new int[6];
private long m_utilLongBuffer_[] = new long[5];
// weight range scratch arrays used during weight allocation
private WeightRange m_utilLowerWeightRange_[] = {
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange(), new WeightRange() };
private WeightRange m_utilUpperWeightRange_[] = {
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange(), new WeightRange() };
private WeightRange m_utilWeightRange_ = new WeightRange();
private char m_utilCharBuffer_[] = new char[256];
private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
private StringBuffer m_utilStringBuffer_ = new StringBuffer("");
1202:
1203: // private methods -------------------------------------------------------
1204:
1205: /**
1206: * @param listheader parsed rule tokens
1207: * @exception Exception thrown when internal error occurs
1208: */
1209: private void initBuffers(
1210: CollationRuleParser.TokenListHeader listheader)
1211: throws Exception {
1212: CollationRuleParser.Token token = listheader.m_last_;
1213: Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
1214:
1215: token.m_toInsert_ = 1;
1216: m_utilIntBuffer_[token.m_strength_] = 1;
1217: while (token.m_previous_ != null) {
1218: if (token.m_previous_.m_strength_ < token.m_strength_) {
1219: // going up
1220: m_utilIntBuffer_[token.m_strength_] = 0;
1221: m_utilIntBuffer_[token.m_previous_.m_strength_]++;
1222: } else if (token.m_previous_.m_strength_ > token.m_strength_) {
1223: // going down
1224: m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
1225: } else {
1226: m_utilIntBuffer_[token.m_strength_]++;
1227: }
1228: token = token.m_previous_;
1229: token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1230: }
1231:
1232: token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1233: INVERSE_UCA_.getInverseGapPositions(listheader);
1234:
1235: token = listheader.m_first_;
1236: int fstrength = Collator.IDENTICAL;
1237: int initstrength = Collator.IDENTICAL;
1238:
1239: m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(
1240: listheader.m_baseCE_, listheader.m_baseContCE_,
1241: Collator.PRIMARY);
1242: m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(
1243: listheader.m_baseCE_, listheader.m_baseContCE_,
1244: Collator.SECONDARY);
1245: m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(
1246: listheader.m_baseCE_, listheader.m_baseContCE_,
1247: Collator.TERTIARY);
1248: while (token != null) {
1249: fstrength = token.m_strength_;
1250: if (fstrength < initstrength) {
1251: initstrength = fstrength;
1252: if (listheader.m_pos_[fstrength] == -1) {
1253: while (listheader.m_pos_[fstrength] == -1
1254: && fstrength > 0) {
1255: fstrength--;
1256: }
1257: if (listheader.m_pos_[fstrength] == -1) {
1258: throw new Exception("Internal program error");
1259: }
1260: }
1261: if (initstrength == Collator.TERTIARY) {
1262: // starting with tertiary
1263: m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1264: m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
1265: m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
1266: m_utilGens_[Collator.TERTIARY],
1267: listheader.m_gapsLo_, listheader.m_gapsHi_,
1268: token, fstrength);
1269: } else if (initstrength == Collator.SECONDARY) {
1270: // secondaries
1271: m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1272: m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
1273: m_utilGens_[Collator.SECONDARY],
1274: listheader.m_gapsLo_, listheader.m_gapsHi_,
1275: token, fstrength);
1276: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1277: m_utilGens_[Collator.TERTIARY], token,
1278: Collator.TERTIARY);
1279: } else {
1280: // primaries
1281: m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
1282: m_utilGens_[Collator.PRIMARY],
1283: listheader.m_gapsLo_, listheader.m_gapsHi_,
1284: token, fstrength);
1285: m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1286: m_utilGens_[Collator.SECONDARY], token,
1287: Collator.SECONDARY);
1288: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1289: m_utilGens_[Collator.TERTIARY], token,
1290: Collator.TERTIARY);
1291: }
1292: } else {
1293: if (token.m_strength_ == Collator.TERTIARY) {
1294: m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
1295: } else if (token.m_strength_ == Collator.SECONDARY) {
1296: m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
1297: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1298: m_utilGens_[Collator.TERTIARY], token,
1299: Collator.TERTIARY);
1300: } else if (token.m_strength_ == Collator.PRIMARY) {
1301: m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
1302: m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1303: m_utilGens_[Collator.SECONDARY], token,
1304: Collator.SECONDARY);
1305: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1306: m_utilGens_[Collator.TERTIARY], token,
1307: Collator.TERTIARY);
1308: }
1309: }
1310: doCE(m_utilCEBuffer_, token);
1311: token = token.m_next_;
1312: }
1313: }
1314:
1315: /**
1316: * Get the next generated ce
1317: * @param g ce generator
1318: * @return next generated ce
1319: */
1320: private int getNextGenerated(CEGenerator g) {
1321: g.m_current_ = nextWeight(g);
1322: return g.m_current_;
1323: }
1324:
1325: /**
1326: * @param g CEGenerator
1327: * @param token rule token
1328: * @param strength
1329: * @return ce generator
1330: * @exception Exception thrown when internal error occurs
1331: */
1332: private int getSimpleCEGenerator(CEGenerator g,
1333: CollationRuleParser.Token token, int strength)
1334: throws Exception {
1335: int high, low, count = 1;
1336: int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
1337:
1338: if (strength == Collator.SECONDARY) {
1339: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1340: high = 0xFFFFFFFF;
1341: count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
1342: } else {
1343: low = RuleBasedCollator.BYTE_COMMON_ << 24; //0x05000000;
1344: high = 0x40000000;
1345: count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
1346: }
1347:
1348: if (token.m_next_ != null
1349: && token.m_next_.m_strength_ == strength) {
1350: count = token.m_next_.m_toInsert_;
1351: }
1352:
1353: g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1354: g.m_ranges_);
1355: g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
1356:
1357: if (g.m_rangesLength_ == 0) {
1358: throw new Exception("Internal program error");
1359: }
1360: return g.m_current_;
1361: }
1362:
1363: /**
1364: * Combines 2 ce into one with respect to the argument strength
1365: * @param ce1 first ce
1366: * @param ce2 second ce
1367: * @param strength strength to use
1368: * @return combined ce
1369: */
1370: private static int mergeCE(int ce1, int ce2, int strength) {
1371: int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
1372: if (strength == Collator.SECONDARY) {
1373: mask = RuleBasedCollator.CE_SECONDARY_MASK_;
1374: } else if (strength == Collator.PRIMARY) {
1375: mask = RuleBasedCollator.CE_PRIMARY_MASK_;
1376: }
1377: ce1 &= mask;
1378: ce2 &= mask;
1379: switch (strength) {
1380: case Collator.PRIMARY:
1381: return ce1 | ce2 >>> 16;
1382: case Collator.SECONDARY:
1383: return ce1 << 16 | ce2 << 8;
1384: default:
1385: return ce1 << 24 | ce2 << 16;
1386: }
1387: }
1388:
1389: /**
1390: * @param g CEGenerator
1391: * @param lows low gap array
1392: * @param highs high gap array
1393: * @param token rule token
1394: * @param fstrength
1395: * @exception Exception thrown when internal error occurs
1396: */
1397: private int getCEGenerator(CEGenerator g, int lows[], int highs[],
1398: CollationRuleParser.Token token, int fstrength)
1399: throws Exception {
1400: int strength = token.m_strength_;
1401: int low = lows[fstrength * 3 + strength];
1402: int high = highs[fstrength * 3 + strength];
1403: int maxbyte = 0;
1404: if (strength == Collator.TERTIARY) {
1405: maxbyte = 0x3F;
1406: } else if (strength == Collator.PRIMARY) {
1407: maxbyte = 0xFE;
1408: } else {
1409: maxbyte = 0xFF;
1410: }
1411:
1412: int count = token.m_toInsert_;
1413:
1414: if (Utility.compareUnsigned(low, high) >= 0
1415: && strength > Collator.PRIMARY) {
1416: int s = strength;
1417: while (true) {
1418: s--;
1419: if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
1420: if (strength == Collator.SECONDARY) {
1421: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1422: high = 0xFFFFFFFF;
1423: } else {
1424: // low = 0x02000000;
1425: // This needs to be checked - what if low is
1426: // not good...
1427: high = 0x40000000;
1428: }
1429: break;
1430: }
1431: if (s < 0) {
1432: throw new Exception("Internal program error");
1433: }
1434: }
1435: }
1436: if (low == 0) {
1437: low = 0x01000000;
1438: }
1439: if (strength == Collator.SECONDARY) { // similar as simple
1440: if (Utility.compareUnsigned(low,
1441: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
1442: && Utility.compareUnsigned(low,
1443: RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1444: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1445: }
1446: if (Utility.compareUnsigned(high,
1447: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
1448: && Utility.compareUnsigned(high,
1449: RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1450: high = RuleBasedCollator.COMMON_TOP_2_ << 24;
1451: }
1452: if (Utility.compareUnsigned(low,
1453: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
1454: g.m_rangesLength_ = allocateWeights(
1455: RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24,
1456: high, count, maxbyte, g.m_ranges_);
1457: g.m_current_ = nextWeight(g);
1458: //g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
1459: return g.m_current_;
1460: }
1461: }
1462:
1463: g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1464: g.m_ranges_);
1465: if (g.m_rangesLength_ == 0) {
1466: throw new Exception("Internal program error");
1467: }
1468: g.m_current_ = nextWeight(g);
1469: return g.m_current_;
1470: }
1471:
1472: /**
1473: * @param ceparts list of collation elements parts
1474: * @param token rule token
1475: * @exception Exception thrown when forming case bits for expansions fails
1476: */
1477: private void doCE(int ceparts[], CollationRuleParser.Token token)
1478: throws Exception {
1479: // this one makes the table and stuff
1480: // int noofbytes[] = new int[3];
1481: for (int i = 0; i < 3; i++) {
1482: // noofbytes[i] = countBytes(ceparts[i]);
1483: m_utilIntBuffer_[i] = countBytes(ceparts[i]);
1484: }
1485:
1486: // Here we have to pack CEs from parts
1487: int cei = 0;
1488: int value = 0;
1489:
1490: while ((cei << 1) < m_utilIntBuffer_[0]
1491: || cei < m_utilIntBuffer_[1]
1492: || cei < m_utilIntBuffer_[2]) {
1493: if (cei > 0) {
1494: value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
1495: } else {
1496: value = 0;
1497: }
1498:
1499: if ((cei << 1) < m_utilIntBuffer_[0]) {
1500: value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
1501: }
1502: if (cei < m_utilIntBuffer_[1]) {
1503: value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
1504: }
1505:
1506: if (cei < m_utilIntBuffer_[2]) {
1507: value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
1508: }
1509: token.m_CE_[cei] = value;
1510: cei++;
1511: }
1512: if (cei == 0) { // totally ignorable
1513: token.m_CELength_ = 1;
1514: token.m_CE_[0] = 0;
1515: } else { // there is at least something
1516: token.m_CELength_ = cei;
1517: }
1518:
1519: // Case bits handling for expansion
1520: if (token.m_CE_[0] != 0) { // case bits should be set only for non-ignorables
1521: int startoftokenrule = token.m_source_ & 0xFF;
1522: if ((token.m_source_ >>> 24) > 1) {
1523: // Do it manually
1524: int length = token.m_source_ >>> 24;
1525: String tokenstr = token.m_rules_.substring(
1526: startoftokenrule, startoftokenrule + length);
1527: token.m_CE_[0] |= getCaseBits(tokenstr);
1528: } else {
1529: // Copy it from the UCA
1530: int caseCE = getFirstCE(token.m_rules_
1531: .charAt(startoftokenrule));
1532: token.m_CE_[0] |= (caseCE & 0xC0);
1533: }
1534: }
1535: }
1536:
1537: /**
1538: * Count the number of non-zero bytes used in the ce
1539: * @param ce
1540: * @return number of non-zero bytes used in ce
1541: */
1542: private static final int countBytes(int ce) {
1543: int mask = 0xFFFFFFFF;
1544: int result = 0;
1545: while (mask != 0) {
1546: if ((ce & mask) != 0) {
1547: result++;
1548: }
1549: mask >>>= 8;
1550: }
1551: return result;
1552: }
1553:
1554: /**
1555: * We are ready to create collation elements
1556: * @param t build table to insert
1557: * @param lh rule token list header
1558: */
1559: private void createElements(BuildTable t,
1560: CollationRuleParser.TokenListHeader lh) {
1561: CollationRuleParser.Token tok = lh.m_first_;
1562: m_utilElement_.clear();
1563: while (tok != null) {
1564: // first, check if there are any expansions
1565: // if there are expansions, we need to do a little bit more
1566: // processing since parts of expansion can be tailored, while
1567: // others are not
1568: if (tok.m_expansion_ != 0) {
1569: int len = tok.m_expansion_ >>> 24;
1570: int currentSequenceLen = len;
1571: int expOffset = tok.m_expansion_ & 0x00FFFFFF;
1572: m_utilToken_.m_source_ = currentSequenceLen | expOffset;
1573: m_utilToken_.m_rules_ = m_parser_.m_source_;
1574:
1575: while (len > 0) {
1576: currentSequenceLen = len;
1577: while (currentSequenceLen > 0) {
1578: m_utilToken_.m_source_ = (currentSequenceLen << 24)
1579: | expOffset;
1580: CollationRuleParser.Token expt = (CollationRuleParser.Token) m_parser_.m_hashTable_
1581: .get(m_utilToken_);
1582: if (expt != null
1583: && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
1584: // expansion is tailored
1585: int noOfCEsToCopy = expt.m_CELength_;
1586: for (int j = 0; j < noOfCEsToCopy; j++) {
1587: tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
1588: }
1589: tok.m_expCELength_ += noOfCEsToCopy;
1590: // never try to add codepoints and CEs.
1591: // For some odd reason, it won't work.
1592: expOffset += currentSequenceLen; //noOfCEsToCopy;
1593: len -= currentSequenceLen; //noOfCEsToCopy;
1594: break;
1595: } else {
1596: currentSequenceLen--;
1597: }
1598: }
1599: if (currentSequenceLen == 0) {
1600: // couldn't find any tailored subsequence, will have to
1601: // get one from UCA. first, get the UChars from the
1602: // rules then pick CEs out until there is no more and
1603: // stuff them into expansion
1604: m_utilColEIter_.setText(m_parser_.m_source_
1605: .substring(expOffset, expOffset + 1));
1606: while (true) {
1607: int order = m_utilColEIter_.next();
1608: if (order == CollationElementIterator.NULLORDER) {
1609: break;
1610: }
1611: tok.m_expCE_[tok.m_expCELength_++] = order;
1612: }
1613: expOffset++;
1614: len--;
1615: }
1616: }
1617: } else {
1618: tok.m_expCELength_ = 0;
1619: }
1620:
1621: // set the ucaelement with obtained values
1622: m_utilElement_.m_CELength_ = tok.m_CELength_
1623: + tok.m_expCELength_;
1624:
1625: // copy CEs
1626: System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
1627: tok.m_CELength_);
1628: System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
1629: tok.m_CELength_, tok.m_expCELength_);
1630:
1631: // copy UChars
1632: // We kept prefix and source kind of together, as it is a kind of a
1633: // contraction.
1634: // However, now we have to slice the prefix off the main thing -
1635: m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
1636: m_utilElement_.m_cPointsOffset_ = 0; //el.m_uchars_;
1637: if (tok.m_prefix_ != 0) {
1638: // we will just copy the prefix here, and adjust accordingly in
1639: // the addPrefix function in ucol_elm. The reason is that we
1640: // need to add both composed AND decomposed elements to the
1641: // unsafe table.
1642: int size = tok.m_prefix_ >> 24;
1643: int offset = tok.m_prefix_ & 0x00FFFFFF;
1644: m_utilElement_.m_prefixChars_ = m_parser_.m_source_
1645: .substring(offset, offset + size);
1646: size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
1647: offset = (tok.m_source_ & 0x00FFFFFF)
1648: + (tok.m_prefix_ >> 24);
1649: m_utilElement_.m_uchars_ = m_parser_.m_source_
1650: .substring(offset, offset + size);
1651: } else {
1652: m_utilElement_.m_prefixChars_ = null;
1653: int offset = tok.m_source_ & 0x00FFFFFF;
1654: int size = tok.m_source_ >>> 24;
1655: m_utilElement_.m_uchars_ = m_parser_.m_source_
1656: .substring(offset, offset + size);
1657: }
1658: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
1659: for (int i = 0; i < m_utilElement_.m_cPoints_.length()
1660: - m_utilElement_.m_cPointsOffset_; i++) {
1661: if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
1662: t.m_collator_.m_isJamoSpecial_ = true;
1663: break;
1664: }
1665: }
1666:
1667: /***
1668:
1669: // Case bits handling
1670: m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F;
1671: // Clean the case bits field
1672: if (m_utilElement_.m_cPoints_.length()
1673: - m_utilElement_.m_cPointsOffset_ > 1) {
1674: // Do it manually
1675: m_utilElement_.m_CEs_[0]
1676: |= getCaseBits(m_utilElement_.m_cPoints_);
1677: }
1678: else {
1679: // Copy it from the UCA
1680: int caseCE = getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
1681: m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0);
1682: }
1683:
1684: ***/
1685: // and then, add it
1686: addAnElement(t, m_utilElement_);
1687: tok = tok.m_next_;
1688: }
1689: }
1690:
1691: /**
1692: * Testing if the string argument has case
1693: * @param src string
1694: * @return the case for this char array
1695: * @exception Exception thrown when internal program error occurs
1696: */
1697: private final int getCaseBits(String src) throws Exception {
1698: int uCount = 0;
1699: int lCount = 0;
1700: src = Normalizer.decompose(src, true);
1701: m_utilColEIter_.setText(src);
1702: for (int i = 0; i < src.length(); i++) {
1703: m_utilColEIter_.setText(src.substring(i, i + 1));
1704: int order = m_utilColEIter_.next();
1705: if (RuleBasedCollator.isContinuation(order)) {
1706: throw new Exception("Internal program error");
1707: }
1708: if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
1709: uCount++;
1710: } else {
1711: char ch = src.charAt(i);
1712: if (UCharacter.isLowerCase(ch)) {
1713: lCount++;
1714: } else {
1715: if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
1716: lCount++;
1717: }
1718: }
1719: }
1720: }
1721:
1722: if (uCount != 0 && lCount != 0) {
1723: return MIXED_CASE_;
1724: } else if (uCount != 0) {
1725: return UPPER_CASE_;
1726: } else {
1727: return LOWER_CASE_;
1728: }
1729: }
1730:
1731: /**
1732: * Converts a char to the uppercase Kana
1733: * @param ch character to convert
1734: * @return the converted Kana character
1735: */
1736: private static final char toLargeKana(char ch) {
1737: if (0x3042 < ch && ch < 0x30ef) { // Kana range
1738: switch (ch - 0x3000) {
1739: case 0x41:
1740: case 0x43:
1741: case 0x45:
1742: case 0x47:
1743: case 0x49:
1744: case 0x63:
1745: case 0x83:
1746: case 0x85:
1747: case 0x8E:
1748: case 0xA1:
1749: case 0xA3:
1750: case 0xA5:
1751: case 0xA7:
1752: case 0xA9:
1753: case 0xC3:
1754: case 0xE3:
1755: case 0xE5:
1756: case 0xEE:
1757: ch++;
1758: break;
1759: case 0xF5:
1760: ch = 0x30AB;
1761: break;
1762: case 0xF6:
1763: ch = 0x30B1;
1764: break;
1765: }
1766: }
1767: return ch;
1768: }
1769:
1770: /**
1771: * Converts a char to the lowercase Kana
1772: * @param ch character to convert
1773: * @return the converted Kana character
1774: */
1775: private static final char toSmallKana(char ch) {
1776: if (0x3042 < ch && ch < 0x30ef) { // Kana range
1777: switch (ch - 0x3000) {
1778: case 0x42:
1779: case 0x44:
1780: case 0x46:
1781: case 0x48:
1782: case 0x4A:
1783: case 0x64:
1784: case 0x84:
1785: case 0x86:
1786: case 0x8F:
1787: case 0xA2:
1788: case 0xA4:
1789: case 0xA6:
1790: case 0xA8:
1791: case 0xAA:
1792: case 0xC4:
1793: case 0xE4:
1794: case 0xE6:
1795: case 0xEF:
1796: ch--;
1797: break;
1798: case 0xAB:
1799: ch = 0x30F5;
1800: break;
1801: case 0xB1:
1802: ch = 0x30F6;
1803: break;
1804: }
1805: }
1806: return ch;
1807: }
1808:
1809: /**
1810: * This should be connected to special Jamo handling.
1811: */
1812: private int getFirstCE(char ch) {
1813: m_utilColEIter_.setText(UCharacter.toString(ch));
1814: return m_utilColEIter_.next();
1815: }
1816:
1817: /**
1818: * This adds a read element, while testing for existence
1819: * @param t build table
1820: * @param element
1821: * @return ce
1822: */
1823: private int addAnElement(BuildTable t, Elements element) {
1824: Vector expansions = t.m_expansions_;
1825: element.m_mapCE_ = 0;
1826:
1827: if (element.m_CELength_ == 1) {
1828: element.m_mapCE_ = element.m_CEs_[0];
1829:
1830: } else {
1831: // unfortunately, it looks like we have to look for a long primary
1832: // here since in canonical closure we are going to hit some long
1833: // primaries from the first phase, and they will come back as
1834: // continuations/expansions destroying the effect of the previous
1835: // opitimization. A long primary is a three byte primary with
1836: // starting secondaries and tertiaries. It can appear in long runs
1837: // of only primary differences (like east Asian tailorings) also,
1838: // it should not be an expansion, as expansions would break with
1839: // this
1840: if (element.m_CELength_ == 2 // a two CE expansion
1841: && RuleBasedCollator
1842: .isContinuation(element.m_CEs_[1])
1843: && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that has only primaries in continuation
1844: && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
1845: // a common secondary
1846: && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and a common tertiary
1847: ) {
1848: element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
1849: // a long primary special
1850: | (CE_LONG_PRIMARY_TAG_ << 24)
1851: // first and second byte of primary
1852: | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
1853: // third byte of primary
1854: | ((element.m_CEs_[1] >> 24) & 0xFF);
1855: } else {
1856: // omitting expansion offset in builder
1857: // (HEADER_SIZE_ >> 2)
1858: int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
1859: | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
1860: | (addExpansion(expansions, element.m_CEs_[0]) << 4)
1861: & 0xFFFFF0;
1862:
1863: for (int i = 1; i < element.m_CELength_; i++) {
1864: addExpansion(expansions, element.m_CEs_[i]);
1865: }
1866: if (element.m_CELength_ <= 0xF) {
1867: expansion |= element.m_CELength_;
1868: } else {
1869: addExpansion(expansions, 0);
1870: }
1871: element.m_mapCE_ = expansion;
1872: setMaxExpansion(
1873: element.m_CEs_[element.m_CELength_ - 1],
1874: (byte) element.m_CELength_, t.m_maxExpansions_);
1875: if (isJamo(element.m_cPoints_.charAt(0))) {
1876: t.m_collator_.m_isJamoSpecial_ = true;
1877: setMaxJamoExpansion(element.m_cPoints_.charAt(0),
1878: element.m_CEs_[element.m_CELength_ - 1],
1879: (byte) element.m_CELength_,
1880: t.m_maxJamoExpansions_);
1881: }
1882: }
1883: }
1884:
1885: // We treat digits differently - they are "uber special" and should be
1886: // processed differently if numeric collation is on.
1887: int uniChar = 0;
1888: if ((element.m_uchars_.length() == 2)
1889: && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
1890: uniChar = UCharacterProperty.getRawSupplementary(
1891: element.m_uchars_.charAt(0), element.m_uchars_
1892: .charAt(1));
1893: } else if (element.m_uchars_.length() == 1) {
1894: uniChar = element.m_uchars_.charAt(0);
1895: }
1896:
1897: // Here, we either have one normal CE OR mapCE is set. Therefore, we
1898: // stuff only one element to the expansion buffer. When we encounter a
1899: // digit and we don't do numeric collation, we will just pick the CE
1900: // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
1901: // && ucol_prv_getSpecialPrevCE). If we picked a special, further
1902: // processing will occur. If it's a simple CE, we'll return due
1903: // to how the loop is constructed.
1904: if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
1905: // prepare the element
1906: int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
1907: | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
1908: | 1;
1909: if (element.m_mapCE_ != 0) {
1910: // if there is an expansion, we'll pick it here
1911: expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
1912: } else {
1913: expansion |= (addExpansion(expansions,
1914: element.m_CEs_[0]) << 4);
1915: }
1916: element.m_mapCE_ = expansion;
1917: }
1918:
1919: // here we want to add the prefix structure.
1920: // I will try to process it as a reverse contraction, if possible.
1921: // prefix buffer is already reversed.
1922:
1923: if (element.m_prefixChars_ != null
1924: && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
1925: // We keep the seen prefix starter elements in a hashtable we need
1926: // it to be able to distinguish between the simple codepoints and
1927: // prefix starters. Also, we need to use it for canonical closure.
1928: m_utilElement2_.m_caseBit_ = element.m_caseBit_;
1929: m_utilElement2_.m_CELength_ = element.m_CELength_;
1930: m_utilElement2_.m_CEs_ = element.m_CEs_;
1931: m_utilElement2_.m_mapCE_ = element.m_mapCE_;
1932: //m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
1933: m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
1934: m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
1935: m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
1936: m_utilElement2_.m_variableTop_ = element.m_variableTop_;
1937: m_utilElement2_.m_prefix_ = element.m_prefix_;
1938: m_utilElement2_.m_prefixChars_ = Normalizer.compose(
1939: element.m_prefixChars_, false);
1940: m_utilElement2_.m_uchars_ = element.m_uchars_;
1941: m_utilElement2_.m_cPoints_ = element.m_cPoints_;
1942: m_utilElement2_.m_cPointsOffset_ = 0;
1943:
1944: if (t.m_prefixLookup_ != null) {
1945: Elements uCE = (Elements) t.m_prefixLookup_
1946: .get(element);
1947: if (uCE != null) {
1948: // there is already a set of code points here
1949: element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_,
1950: element);
1951: } else { // no code points, so this spot is clean
1952: element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_,
1953: element);
1954: uCE = new Elements(element);
1955: uCE.m_cPoints_ = uCE.m_uchars_;
1956: t.m_prefixLookup_.put(uCE, uCE);
1957: }
1958: if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
1959: .length()
1960: - element.m_prefix_
1961: || !m_utilElement2_.m_prefixChars_
1962: .regionMatches(0,
1963: element.m_prefixChars_,
1964: element.m_prefix_,
1965: m_utilElement2_.m_prefixChars_
1966: .length())) {
1967: // do it!
1968: m_utilElement2_.m_mapCE_ = addPrefix(t,
1969: element.m_mapCE_, m_utilElement2_);
1970: }
1971: }
1972: }
1973:
1974: // We need to use the canonical iterator here
1975: // the way we do it is to generate the canonically equivalent strings
1976: // for the contraction and then add the sequences that pass FCD check
1977: if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
1978: && !(element.m_cPoints_.length()
1979: - element.m_cPointsOffset_ == 2
1980: && UTF16.isLeadSurrogate(element.m_cPoints_
1981: .charAt(0)) && UTF16
1982: .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
1983: // this is a contraction, we should check whether a composed form
1984: // should also be included
1985: m_utilCanIter_.setSource(element.m_cPoints_);
1986: String source = m_utilCanIter_.next();
1987: while (source != null && source.length() > 0) {
1988: if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
1989: element.m_uchars_ = source;
1990: element.m_cPoints_ = element.m_uchars_;
1991: finalizeAddition(t, element);
1992: }
1993: source = m_utilCanIter_.next();
1994: }
1995:
1996: return element.m_mapCE_;
1997: } else {
1998: return finalizeAddition(t, element);
1999: }
2000: }
2001:
2002: /**
2003: * Adds an expansion ce to the expansion vector
2004: * @param expansions vector to add to
2005: * @param value of the expansion
2006: * @return the current position of the new element
2007: */
2008: private static final int addExpansion(Vector expansions, int value) {
2009: expansions.add(new Integer(value));
2010: return expansions.size() - 1;
2011: }
2012:
2013: /**
2014: * Looks for the maximum length of all expansion sequences ending with the
2015: * same collation element. The size required for maxexpansion and maxsize
2016: * is returned if the arrays are too small.
2017: * @param endexpansion the last expansion collation element to be added
2018: * @param expansionsize size of the expansion
2019: * @param maxexpansion data structure to store the maximum expansion data.
2020: * @returns size of the maxexpansion and maxsize used.
2021: */
2022: private static int setMaxExpansion(int endexpansion,
2023: byte expansionsize, MaxExpansionTable maxexpansion) {
2024: int start = 0;
2025: int limit = maxexpansion.m_endExpansionCE_.size();
2026: long unsigned = (long) endexpansion;
2027: unsigned &= 0xFFFFFFFFl;
2028:
2029: // using binary search to determine if last expansion element is
2030: // already in the array
2031: int result = -1;
2032: while (start < limit - 1) {
2033: int mid = start + ((limit - start) >> 1);
2034: long unsignedce = ((Integer) maxexpansion.m_endExpansionCE_
2035: .get(mid)).intValue();
2036: unsignedce &= 0xFFFFFFFFl;
2037: if (unsigned <= unsignedce) {
2038: limit = mid;
2039: } else {
2040: start = mid;
2041: }
2042: }
2043:
2044: if (((Integer) maxexpansion.m_endExpansionCE_.get(start))
2045: .intValue() == endexpansion) {
2046: result = start;
2047: } else if (((Integer) maxexpansion.m_endExpansionCE_.get(limit))
2048: .intValue() == endexpansion) {
2049: result = limit;
2050: }
2051: if (result > -1) {
2052: // found the ce in expansion, we'll just modify the size if it
2053: // is smaller
2054: Object currentsize = maxexpansion.m_expansionCESize_
2055: .get(result);
2056: if (((Byte) currentsize).byteValue() < expansionsize) {
2057: maxexpansion.m_expansionCESize_.set(result, new Byte(
2058: expansionsize));
2059: }
2060: } else {
2061: // we'll need to squeeze the value into the array. initial
2062: // implementation. shifting the subarray down by 1
2063: maxexpansion.m_endExpansionCE_.insertElementAt(new Integer(
2064: endexpansion), start + 1);
2065: maxexpansion.m_expansionCESize_.insertElementAt(new Byte(
2066: expansionsize), start + 1);
2067: }
2068: return maxexpansion.m_endExpansionCE_.size();
2069: }
2070:
2071: /**
2072: * Sets the maximum length of all jamo expansion sequences ending with the
2073: * same collation element. The size required for maxexpansion and maxsize
2074: * is returned if the arrays are too small.
2075: * @param ch the jamo codepoint
2076: * @param endexpansion the last expansion collation element to be added
2077: * @param expansionsize size of the expansion
2078: * @param maxexpansion data structure to store the maximum expansion data.
2079: * @returns size of the maxexpansion and maxsize used.
2080: */
2081: private static int setMaxJamoExpansion(char ch, int endexpansion,
2082: byte expansionsize, MaxJamoExpansionTable maxexpansion) {
2083: boolean isV = true;
2084: if (ch >= 0x1100 && ch <= 0x1112) {
2085: // determines L for Jamo, doesn't need to store this since it is
2086: // never at the end of a expansion
2087: if (maxexpansion.m_maxLSize_ < expansionsize) {
2088: maxexpansion.m_maxLSize_ = expansionsize;
2089: }
2090: return maxexpansion.m_endExpansionCE_.size();
2091: }
2092:
2093: if (ch >= 0x1161 && ch <= 0x1175) {
2094: // determines V for Jamo
2095: if (maxexpansion.m_maxVSize_ < expansionsize) {
2096: maxexpansion.m_maxVSize_ = expansionsize;
2097: }
2098: }
2099:
2100: if (ch >= 0x11A8 && ch <= 0x11C2) {
2101: isV = false;
2102: // determines T for Jamo
2103: if (maxexpansion.m_maxTSize_ < expansionsize) {
2104: maxexpansion.m_maxTSize_ = expansionsize;
2105: }
2106: }
2107:
2108: int pos = maxexpansion.m_endExpansionCE_.size();
2109: while (pos > 0) {
2110: pos--;
2111: if (((Integer) maxexpansion.m_endExpansionCE_.get(pos))
2112: .intValue() == endexpansion) {
2113: return maxexpansion.m_endExpansionCE_.size();
2114: }
2115: }
2116: maxexpansion.m_endExpansionCE_.add(new Integer(endexpansion));
2117: maxexpansion.m_isV_.add(new Boolean(isV));
2118:
2119: return maxexpansion.m_endExpansionCE_.size();
2120: }
2121:
/**
 * Adds a prefix (context preceding the mapped character) to the table.
 * <p>
 * The prefix is stored through the contraction machinery, tagged
 * CE_SPEC_PROC_TAG_ instead of CE_CONTRACTION_TAG_, and reversed because
 * prefixes are looked up backwards from the mapped character at runtime.
 * Note: element.m_cPoints_/m_cPointsOffset_ are temporarily redirected to
 * the prefix string while the contraction helpers run and restored before
 * returning.
 * @param t build table to update
 * @param CE collation element to add
 * @param element rule element to add
 * @return modified ce
 */
private int addPrefix(BuildTable t, int CE, Elements element) {
    // currently the longest prefix we're supporting in Japanese is two
    // characters long. Although this table could quite easily mimic
    // complete contraction stuff there is no good reason to make a general
    // solution, as it would require some error prone messing.
    ContractionTable contractions = t.m_contractions_;
    // save the code-point view of the element; it is redirected to the
    // prefix below and restored at the end of this method
    String oldCP = element.m_cPoints_;
    int oldCPOffset = element.m_cPointsOffset_;

    contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
    // here, we will normalize & add prefix to the table.
    int size = element.m_prefixChars_.length() - element.m_prefix_;
    for (int j = 1; j < size; j++) {
        // First add NFD prefix chars to unsafe CP hash table
        // Unless it is a trail surrogate, which is handled algoritmically
        // and shouldn't take up space in the table.
        char ch = element.m_prefixChars_.charAt(j
                + element.m_prefix_);
        if (!UTF16.isTrailSurrogate(ch)) {
            unsafeCPSet(t.m_unsafeCP_, ch);
        }
    }

    // StringBuffer reversed = new StringBuffer();
    m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
    for (int j = 0; j < size; j++) {
        // prefixes are going to be looked up backwards
        // therefore, we will promptly reverse the prefix buffer...
        int offset = element.m_prefixChars_.length() - j - 1;
        m_utilStringBuffer_.append(element.m_prefixChars_
                .charAt(offset));
    }
    element.m_prefixChars_ = m_utilStringBuffer_.toString();
    element.m_prefix_ = 0;

    // the first codepoint is also unsafe, as it forms a 'contraction' with
    // the prefix
    if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
        unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
    }

    // redirect the element's code-point view to the (reversed) prefix so
    // the generic contraction helpers operate on the prefix string
    element.m_cPoints_ = element.m_prefixChars_;
    element.m_cPointsOffset_ = element.m_prefix_;

    // Add the last char of the contraction to the contraction-end hash
    // table. unless it is a trail surrogate, which is handled
    // algorithmically and shouldn't be in the table
    if (!UTF16.isTrailSurrogate(element.m_cPoints_
            .charAt(element.m_cPoints_.length() - 1))) {
        ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
                .charAt(element.m_cPoints_.length() - 1));
    }
    // First we need to check if contractions starts with a surrogate
    // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);

    // If there are any Jamos in the contraction, we should turn on special
    // processing for Jamos
    if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
        t.m_collator_.m_isJamoSpecial_ = true;
    }
    // then we need to deal with it
    // we could aready have something in table - or we might not
    if (!isPrefix(CE)) {
        // the starter has no prefix chain yet: create a fresh
        // contraction-table element, anchored by the (char) 0 and
        // (char) 0xFFFF sentinel entries that both map to the old CE
        int firstContractionOffset = addContraction(contractions,
                CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
        int newCE = processContraction(contractions, element,
                CE_NOT_FOUND_);
        addContraction(contractions, firstContractionOffset,
                element.m_prefixChars_.charAt(element.m_prefix_),
                newCE);
        addContraction(contractions, firstContractionOffset,
                (char) 0xFFFF, CE);
        CE = constructSpecialCE(CE_SPEC_PROC_TAG_,
                firstContractionOffset);
    } else {
        // we are adding to existing contraction
        // there were already some elements in the table, so we need to add
        // a new contraction
        // Two things can happen here: either the codepoint is already in
        // the table, or it is not
        char ch = element.m_prefixChars_.charAt(element.m_prefix_);
        int position = findCP(contractions, CE, ch);
        if (position > 0) {
            // if it is we just continue down the chain
            int eCE = getCE(contractions, CE, position);
            int newCE = processContraction(contractions, element,
                    eCE);
            setContraction(contractions, CE, position, ch, newCE);
        } else {
            // if it isn't, we will have to create a new sequence
            processContraction(contractions, element, CE_NOT_FOUND_);
            insertContraction(contractions, CE, ch,
                    element.m_mapCE_);
        }
    }

    // restore the element's original code-point view
    element.m_cPoints_ = oldCP;
    element.m_cPointsOffset_ = oldCPOffset;

    return CE;
}
2231:
2232: /**
2233: * Checks if the argument ce is a contraction
2234: * @param CE collation element
2235: * @return true if argument ce is a contraction
2236: */
2237: private static final boolean isContraction(int CE) {
2238: return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
2239: }
2240:
2241: /**
2242: * Checks if the argument ce has a prefix
2243: * @param CE collation element
2244: * @return true if argument ce has a prefix
2245: */
2246: private static final boolean isPrefix(int CE) {
2247: return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
2248: }
2249:
2250: /**
2251: * Checks if the argument ce is special
2252: * @param CE collation element
2253: * @return true if argument ce is special
2254: */
2255: private static final boolean isSpecial(int CE) {
2256: return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
2257: }
2258:
/**
 * Extracts the special tag from a collation element.
 * (The previous javadoc was a copy-paste of isPrefix's and was wrong.)
 * @param CE collation element
 * @return the CE's tag bits, shifted down to a small integer
 */
private static final int getCETag(int CE) {
    return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
}
2267:
2268: /**
2269: * Gets the ce at position in contraction table
2270: * @param table contraction table
2271: * @param position offset to the contraction table
2272: * @return ce
2273: */
2274: private static final int getCE(ContractionTable table, int element,
2275: int position) {
2276: element &= 0xFFFFFF;
2277: BasicContractionTable tbl = getBasicContractionTable(table,
2278: element);
2279:
2280: if (tbl == null) {
2281: return CE_NOT_FOUND_;
2282: }
2283: if (position > tbl.m_CEs_.size() || position == -1) {
2284: return CE_NOT_FOUND_;
2285: } else {
2286: return ((Integer) tbl.m_CEs_.get(position)).intValue();
2287: }
2288: }
2289:
2290: /**
2291: * Sets the unsafe character
2292: * @param table unsafe table
2293: * @param c character to be added
2294: */
2295: private static final void unsafeCPSet(byte table[], char c) {
2296: int hash = c;
2297: if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2298: if (hash >= 0xd800 && hash <= 0xf8ff) {
2299: // Part of a surrogate, or in private use area.
2300: // These don't go in the table
2301: return;
2302: }
2303: hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2304: }
2305: table[hash >> 3] |= (1 << (hash & 7));
2306: }
2307:
2308: /**
2309: * Sets the contraction end character
2310: * @param table contraction end table
2311: * @param c character to be added
2312: */
2313: private static final void ContrEndCPSet(byte table[], char c) {
2314: int hash = c;
2315: if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2316: hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2317: }
2318: table[hash >> 3] |= (1 << (hash & 7));
2319: }
2320:
2321: /**
2322: * Adds more contractions in table. If element is non existant, it creates
2323: * on. Returns element handle
2324: * @param table contraction table
2325: * @param element offset to the contraction table
2326: * @param codePoint codepoint to add
2327: * @param value
2328: * @return collation element
2329: */
2330: private static int addContraction(ContractionTable table,
2331: int element, char codePoint, int value) {
2332: BasicContractionTable tbl = getBasicContractionTable(table,
2333: element);
2334: if (tbl == null) {
2335: tbl = addAContractionElement(table);
2336: element = table.m_elements_.size() - 1;
2337: }
2338:
2339: tbl.m_CEs_.add(new Integer(value));
2340: tbl.m_codePoints_.append(codePoint);
2341: return constructSpecialCE(table.m_currentTag_, element);
2342: }
2343:
2344: /**
2345: * Adds a contraction element to the table
2346: * @param table contraction table to update
2347: * @return contraction
2348: */
2349: private static BasicContractionTable addAContractionElement(
2350: ContractionTable table) {
2351: BasicContractionTable result = new BasicContractionTable();
2352: table.m_elements_.add(result);
2353: return result;
2354: }
2355:
2356: /**
2357: * Constructs a special ce
2358: * @param tag special tag
2359: * @param CE collation element
2360: * @return a contraction ce
2361: */
2362: private static final int constructSpecialCE(int tag, int CE) {
2363: return RuleBasedCollator.CE_SPECIAL_FLAG_
2364: | (tag << RuleBasedCollator.CE_TAG_SHIFT_)
2365: | (CE & 0xFFFFFF);
2366: }
2367:
/**
 * Recursively threads the remaining code points of element.m_cPoints_
 * (from m_cPointsOffset_ on) into the contraction table, reusing existing
 * chains where possible.
 * @param contractions contraction table
 * @param element contracting element; m_cPointsOffset_ is advanced and
 *        restored around each recursive step
 * @param existingCE CE currently occupying this spot (CE_NOT_FOUND_ when
 *        the spot is empty)
 * @return contraction ce for this level of the chain
 */
private static int processContraction(
        ContractionTable contractions, Elements element,
        int existingCE) {
    int firstContractionOffset = 0;
    // end of recursion: only one code point left
    if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
        if (isContractionTableElement(existingCE)
                && getCETag(existingCE) == contractions.m_currentTag_) {
            // spot already holds a chain of the same kind: overwrite
            // both sentinel entries ((char) 0 and (char) 0xFFFF) with
            // the element's final CE
            changeContraction(contractions, existingCE, (char) 0,
                    element.m_mapCE_);
            changeContraction(contractions, existingCE,
                    (char) 0xFFFF, element.m_mapCE_);
            return existingCE;
        } else {
            // can't do just that. existingCe might be a contraction,
            // meaning that we need to do another step
            return element.m_mapCE_;
        }
    }

    // this recursion currently feeds on the only element we have...
    // We will have to copy it in order to accomodate for both backward
    // and forward cycles
    // we encountered either an empty space or a non-contraction element
    // this means we are constructing a new contraction sequence
    element.m_cPointsOffset_++;
    if (!isContractionTableElement(existingCE)) {
        // start a new chain, anchored by the (char) 0 and
        // (char) 0xFFFF sentinel entries that keep the old CE
        firstContractionOffset = addContraction(contractions,
                CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0,
                existingCE);
        int newCE = processContraction(contractions, element,
                CE_NOT_FOUND_);
        addContraction(
                contractions,
                firstContractionOffset,
                element.m_cPoints_.charAt(element.m_cPointsOffset_),
                newCE);
        addContraction(contractions, firstContractionOffset,
                (char) 0xFFFF, existingCE);
        existingCE = constructSpecialCE(contractions.m_currentTag_,
                firstContractionOffset);
    } else {
        // we are adding to existing contraction
        // there were already some elements in the table, so we need to add
        // a new contraction
        // Two things can happen here: either the codepoint is already in
        // the table, or it is not
        int position = findCP(contractions, existingCE,
                element.m_cPoints_.charAt(element.m_cPointsOffset_));
        if (position > 0) {
            // if it is we just continue down the chain
            int eCE = getCE(contractions, existingCE, position);
            int newCE = processContraction(contractions, element,
                    eCE);
            setContraction(contractions, existingCE, position,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
        } else {
            // if it isn't, we will have to create a new sequence
            int newCE = processContraction(contractions, element,
                    CE_NOT_FOUND_);
            insertContraction(contractions, existingCE,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
        }
    }
    // undo the offset advance so the caller sees the element unchanged
    element.m_cPointsOffset_--;
    return existingCE;
}
2447:
2448: /**
2449: * Checks if CE belongs to the contraction table
2450: * @param CE collation element to test
2451: * @return true if CE belongs to the contraction table
2452: */
2453: private static final boolean isContractionTableElement(int CE) {
2454: return isSpecial(CE)
2455: && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
2456: }
2457:
2458: /**
2459: * Gets the codepoint
2460: * @param table contraction table
2461: * @param element offset to the contraction element in the table
2462: * @param codePoint code point to look for
2463: * @return the offset to the code point
2464: */
2465: private static int findCP(ContractionTable table, int element,
2466: char codePoint) {
2467: BasicContractionTable tbl = getBasicContractionTable(table,
2468: element);
2469: if (tbl == null) {
2470: return -1;
2471: }
2472:
2473: int position = 0;
2474: while (codePoint > tbl.m_codePoints_.charAt(position)) {
2475: position++;
2476: if (position > tbl.m_codePoints_.length()) {
2477: return -1;
2478: }
2479: }
2480: if (codePoint == tbl.m_codePoints_.charAt(position)) {
2481: return position;
2482: } else {
2483: return -1;
2484: }
2485: }
2486:
2487: /**
2488: * Gets the contraction element out of the contraction table
2489: * @param table contraction table
2490: * @param offset to the element in the contraction table
2491: * @return basic contraction element at offset in the contraction table
2492: */
2493: private static final BasicContractionTable getBasicContractionTable(
2494: ContractionTable table, int offset) {
2495: offset &= 0xFFFFFF;
2496: if (offset == 0xFFFFFF) {
2497: return null;
2498: }
2499: return (BasicContractionTable) table.m_elements_.get(offset);
2500: }
2501:
2502: /**
2503: * Changes the contraction element
2504: * @param table contraction table
2505: * @param element offset to the element in the contraction table
2506: * @param codePoint codepoint
2507: * @param newCE new collation element
2508: * @return basic contraction element at offset in the contraction table
2509: */
2510: private static final int changeContraction(ContractionTable table,
2511: int element, char codePoint, int newCE) {
2512: BasicContractionTable tbl = getBasicContractionTable(table,
2513: element);
2514: if (tbl == null) {
2515: return 0;
2516: }
2517: int position = 0;
2518: while (codePoint > tbl.m_codePoints_.charAt(position)) {
2519: position++;
2520: if (position > tbl.m_codePoints_.length()) {
2521: return CE_NOT_FOUND_;
2522: }
2523: }
2524: if (codePoint == tbl.m_codePoints_.charAt(position)) {
2525: tbl.m_CEs_.set(position, new Integer(newCE));
2526: return element & 0xFFFFFF;
2527: } else {
2528: return CE_NOT_FOUND_;
2529: }
2530: }
2531:
2532: /**
2533: * Sets a part of contraction sequence in table. If element is non
2534: * existant, it creates on. Returns element handle.
2535: * @param table contraction table
2536: * @param element offset to the contraction table
2537: * @param offset
2538: * @param codePoint contraction character
2539: * @param value ce value
2540: * @return new contraction ce
2541: */
2542: private static final int setContraction(ContractionTable table,
2543: int element, int offset, char codePoint, int value) {
2544: element &= 0xFFFFFF;
2545: BasicContractionTable tbl = getBasicContractionTable(table,
2546: element);
2547: if (tbl == null) {
2548: tbl = addAContractionElement(table);
2549: element = table.m_elements_.size() - 1;
2550: }
2551:
2552: tbl.m_CEs_.set(offset, new Integer(value));
2553: tbl.m_codePoints_.setCharAt(offset, codePoint);
2554: return constructSpecialCE(table.m_currentTag_, element);
2555: }
2556:
2557: /**
2558: * Inserts a part of contraction sequence in table. Sequences behind the
2559: * offset are moved back. If element is non existent, it creates on.
2560: * @param table contraction
2561: * @param element offset to the table contraction
2562: * @param codePoint code point
2563: * @param value collation element value
2564: * @return contraction collation element
2565: */
2566: private static final int insertContraction(ContractionTable table,
2567: int element, char codePoint, int value) {
2568: element &= 0xFFFFFF;
2569: BasicContractionTable tbl = getBasicContractionTable(table,
2570: element);
2571: if (tbl == null) {
2572: tbl = addAContractionElement(table);
2573: element = table.m_elements_.size() - 1;
2574: }
2575:
2576: int offset = 0;
2577: while (tbl.m_codePoints_.charAt(offset) < codePoint
2578: && offset < tbl.m_codePoints_.length()) {
2579: offset++;
2580: }
2581:
2582: tbl.m_CEs_.insertElementAt(new Integer(value), offset);
2583: tbl.m_codePoints_.insert(offset, codePoint);
2584:
2585: return constructSpecialCE(table.m_currentTag_, element);
2586: }
2587:
2588: /**
2589: * Finalize addition
2590: * @param t build table
2591: * @param element to add
2592: */
2593: private final static int finalizeAddition(BuildTable t,
2594: Elements element) {
2595: int CE = CE_NOT_FOUND_;
2596: // This should add a completely ignorable element to the
2597: // unsafe table, so that backward iteration will skip
2598: // over it when treating contractions.
2599: if (element.m_mapCE_ == 0) {
2600: for (int i = 0; i < element.m_cPoints_.length(); i++) {
2601: char ch = element.m_cPoints_.charAt(i);
2602: if (!UTF16.isTrailSurrogate(ch)) {
2603: unsafeCPSet(t.m_unsafeCP_, ch);
2604: }
2605: }
2606: }
2607:
2608: if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
2609: // we're adding a contraction
2610: int cp = UTF16.charAt(element.m_cPoints_,
2611: element.m_cPointsOffset_);
2612: CE = t.m_mapping_.getValue(cp);
2613: CE = addContraction(t, CE, element);
2614: } else {
2615: // easy case
2616: CE = t.m_mapping_.getValue(element.m_cPoints_
2617: .charAt(element.m_cPointsOffset_));
2618:
2619: if (CE != CE_NOT_FOUND_) {
2620: if (isContractionTableElement(CE)) {
2621: // adding a non contraction element (thai, expansion,
2622: // single) to already existing contraction
2623: if (!isPrefix(element.m_mapCE_)) {
2624: // we cannot reenter prefix elements - as we are going
2625: // to create a dead loop
2626: // Only expansions and regular CEs can go here...
2627: // Contractions will never happen in this place
2628: setContraction(t.m_contractions_, CE, 0,
2629: (char) 0, element.m_mapCE_);
2630: // This loop has to change the CE at the end of
2631: // contraction REDO!
2632: changeLastCE(t.m_contractions_, CE,
2633: element.m_mapCE_);
2634: }
2635: } else {
2636: t.m_mapping_.setValue(element.m_cPoints_
2637: .charAt(element.m_cPointsOffset_),
2638: element.m_mapCE_);
2639: }
2640: } else {
2641: t.m_mapping_.setValue(element.m_cPoints_
2642: .charAt(element.m_cPointsOffset_),
2643: element.m_mapCE_);
2644: }
2645: }
2646: return CE;
2647: }
2648:
/**
 * Adds a contraction mapping (first code point plus following characters)
 * to the build table.
 * <p>
 * Note regarding surrogate handling: We are interested only in the single
 * or leading surrogates in a contraction. If a surrogate is somewhere else
 * in the contraction, it is going to be handled as a pair of code units,
 * as it doesn't affect the performance AND handling surrogates specially
 * would complicate code way too much.
 * @param t build table to update
 * @param CE collation element currently mapped for the first code point
 * @param element rule element whose m_cPoints_ holds the contraction
 * @return the (possibly new special) CE for the first code point
 */
private static int addContraction(BuildTable t, int CE,
        Elements element) {
    ContractionTable contractions = t.m_contractions_;
    contractions.m_currentTag_ = CE_CONTRACTION_TAG_;

    // First we need to check if contractions starts with a surrogate
    int cp = UTF16.charAt(element.m_cPoints_, 0);
    int cpsize = 1;
    if (UCharacter.isSupplementary(cp)) {
        cpsize = 2;
    }
    if (cpsize < element.m_cPoints_.length()) {
        // This is a real contraction, if there are other characters after
        // the first
        int size = element.m_cPoints_.length()
                - element.m_cPointsOffset_;
        for (int j = 1; j < size; j++) {
            // First add contraction chars to unsafe CP hash table
            // Unless it is a trail surrogate, which is handled
            // algoritmically and shouldn't take up space in the table.
            if (!UTF16.isTrailSurrogate(element.m_cPoints_
                    .charAt(element.m_cPointsOffset_ + j))) {
                unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
                        .charAt(element.m_cPointsOffset_ + j));
            }
        }
        // Add the last char of the contraction to the contraction-end
        // hash table. unless it is a trail surrogate, which is handled
        // algorithmically and shouldn't be in the table
        if (!UTF16.isTrailSurrogate(element.m_cPoints_
                .charAt(element.m_cPoints_.length() - 1))) {
            ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
                    .charAt(element.m_cPoints_.length() - 1));
        }

        // If there are any Jamos in the contraction, we should turn on
        // special processing for Jamos
        if (isJamo(element.m_cPoints_
                .charAt(element.m_cPointsOffset_))) {
            t.m_collator_.m_isJamoSpecial_ = true;
        }
        // then we need to deal with it
        // we could aready have something in table - or we might not
        // skip past the first code point (1 or 2 code units) while the
        // rest of the contraction is threaded into the table
        element.m_cPointsOffset_ += cpsize;
        if (!isContraction(CE)) {
            // no contraction chain yet: create a fresh element,
            // anchored by the (char) 0 and (char) 0xFFFF sentinel
            // entries that both keep the old CE
            int firstContractionOffset = addContraction(
                    contractions, CONTRACTION_TABLE_NEW_ELEMENT_,
                    (char) 0, CE);
            int newCE = processContraction(contractions, element,
                    CE_NOT_FOUND_);
            addContraction(contractions, firstContractionOffset,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
            addContraction(contractions, firstContractionOffset,
                    (char) 0xFFFF, CE);
            CE = constructSpecialCE(CE_CONTRACTION_TAG_,
                    firstContractionOffset);
        } else {
            // we are adding to existing contraction
            // there were already some elements in the table, so we need to
            // add a new contraction
            // Two things can happen here: either the codepoint is already
            // in the table, or it is not
            int position = findCP(contractions, CE,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_));
            if (position > 0) {
                // if it is we just continue down the chain
                int eCE = getCE(contractions, CE, position);
                int newCE = processContraction(contractions,
                        element, eCE);
                setContraction(contractions, CE, position,
                        element.m_cPoints_
                                .charAt(element.m_cPointsOffset_),
                        newCE);
            } else {
                // if it isn't, we will have to create a new sequence
                int newCE = processContraction(contractions,
                        element, CE_NOT_FOUND_);
                insertContraction(contractions, CE,
                        element.m_cPoints_
                                .charAt(element.m_cPointsOffset_),
                        newCE);
            }
        }
        // restore the offset so the caller sees the element unchanged
        element.m_cPointsOffset_ -= cpsize;
        t.m_mapping_.setValue(cp, CE);
    } else if (!isContraction(CE)) {
        // this is just a surrogate, and there is no contraction
        t.m_mapping_.setValue(cp, element.m_mapCE_);
    } else {
        // fill out the first stage of the contraction with the surrogate
        // CE
        changeContraction(contractions, CE, (char) 0,
                element.m_mapCE_);
        changeContraction(contractions, CE, (char) 0xFFFF,
                element.m_mapCE_);
    }
    return CE;
}
2758:
2759: /**
2760: * this is for adding non contractions
2761: * @param table contraction table
2762: * @param element offset to the contraction table
2763: * @param value collation element value
2764: * @return new collation element
2765: */
2766: private static final int changeLastCE(ContractionTable table,
2767: int element, int value) {
2768: BasicContractionTable tbl = getBasicContractionTable(table,
2769: element);
2770: if (tbl == null) {
2771: return 0;
2772: }
2773:
2774: tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, new Integer(value));
2775: return constructSpecialCE(table.m_currentTag_,
2776: element & 0xFFFFFF);
2777: }
2778:
/**
 * Given a set of ranges calculated by allocWeights(), iterate through the
 * weights. Consumes one weight from the first range; when a range is
 * exhausted the remaining ranges are shifted up.
 * @param cegenerator object that contains ranges weight range array and
 *        its rangeCount
 * @return the next weight, or -1 when all ranges are exhausted
 */
private static int nextWeight(CEGenerator cegenerator) {
    if (cegenerator.m_rangesLength_ > 0) {
        // get maxByte from the .count field
        int maxByte = cegenerator.m_ranges_[0].m_count_;
        // get the next weight
        int weight = cegenerator.m_ranges_[0].m_start_;
        if (weight == cegenerator.m_ranges_[0].m_end_) {
            // this range is finished, remove it and move the following
            // ones up (System.arraycopy handles the overlapping copy)
            cegenerator.m_rangesLength_--;
            if (cegenerator.m_rangesLength_ > 0) {
                System.arraycopy(cegenerator.m_ranges_, 1,
                        cegenerator.m_ranges_, 0,
                        cegenerator.m_rangesLength_);
                cegenerator.m_ranges_[0].m_count_ = maxByte;
                // keep maxByte in ranges[0]
            }
        } else {
            // increment the weight for the next value
            cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
                    cegenerator.m_ranges_[0].m_length2_, maxByte);
        }
        return weight;
    }
    return -1;
}
2812:
2813: /**
2814: * Increment the collation weight
2815: * @param weight to increment
2816: * @param length
2817: * @param maxByte
2818: * @return new incremented weight
2819: */
2820: private static final int incWeight(int weight, int length,
2821: int maxByte) {
2822: while (true) {
2823: int b = getWeightByte(weight, length);
2824: if (b < maxByte) {
2825: return setWeightByte(weight, length, b + 1);
2826: } else {
2827: // roll over, set this byte to BYTE_FIRST_TAILORED_ and
2828: // increment the previous one
2829: weight = setWeightByte(weight, length,
2830: RuleBasedCollator.BYTE_FIRST_TAILORED_);
2831: --length;
2832: }
2833: }
2834: }
2835:
2836: /**
2837: * Gets the weight byte
2838: * @param weight
2839: * @param index
2840: * @return byte
2841: */
2842: private static final int getWeightByte(int weight, int index) {
2843: return (weight >> ((4 - index) << 3)) & 0xff;
2844: }
2845:
2846: /**
2847: * Set the weight byte in table
2848: * @param weight
2849: * @param index
2850: * @param b byte
2851: */
2852: private static final int setWeightByte(int weight, int index, int b) {
2853: index <<= 3;
2854: // 0xffffffff except a 00 "hole" for the index-th byte
2855: int mask = 0xffffffff >>> index;
2856: index = 32 - index;
2857: mask |= 0xffffff00 << index;
2858: return (weight & mask) | (b << index);
2859: }
2860:
/**
 * Calls getWeightRanges() and then determines heuristically which of the
 * resulting ranges to use for allocating n weights strictly between
 * (excluding) the two limit weights.
 * @param lowerLimit weight below all allocated weights
 * @param upperLimit weight above all allocated weights
 * @param n number of weights that must fit into the chosen ranges
 * @param maxByte largest usable byte value in a weight
 * @param ranges output array, sorted and ready for nextWeight();
 *        ranges[0].m_count_ is overloaded to carry maxByte on return
 * @return the number of usable ranges, or 0 if n weights cannot fit
 *         between the limits
 */
private int allocateWeights(int lowerLimit, int upperLimit, int n,
        int maxByte, WeightRange ranges[]) {
    // number of usable byte values 3..maxByte
    int countBytes = maxByte
            - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
    // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_
    // countBytes to the power of index, m_utilLongBuffer_ for unsignedness
    // gcc requires explicit initialization
    m_utilLongBuffer_[0] = 1;
    m_utilLongBuffer_[1] = countBytes;
    m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
    m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
    m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
    int rangeCount = getWeightRanges(lowerLimit, upperLimit,
            maxByte, countBytes, ranges);
    if (rangeCount <= 0) {
        return 0;
    }
    // what is the maximum number of weights with these ranges?
    // a range of m_length_ bytes can be extended by up to (4 - length)
    // further bytes, multiplying its capacity by countBytes each time
    long maxCount = 0;
    for (int i = 0; i < rangeCount; ++i) {
        maxCount += (long) ranges[i].m_count_
                * m_utilLongBuffer_[4 - ranges[i].m_length_];
    }
    if (maxCount < n) {
        return 0;
    }
    // set the length2 and count2 fields
    // (the working copies that lengthenRange() updates)
    for (int i = 0; i < rangeCount; ++i) {
        ranges[i].m_length2_ = ranges[i].m_length_;
        ranges[i].m_count2_ = ranges[i].m_count_;
    }
    // try until we find suitably large ranges
    while (true) {
        // get the smallest number of bytes in a range
        // (ranges come from getWeightRanges() shortest first)
        int minLength = ranges[0].m_length2_;
        // sum up the number of elements that fit into ranges of each byte
        // length
        Arrays.fill(m_utilCountBuffer_, 0);
        for (int i = 0; i < rangeCount; ++i) {
            m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
        }
        // now try to allocate n elements in the available short ranges
        if (n <= m_utilCountBuffer_[minLength]
                + m_utilCountBuffer_[minLength + 1]) {
            // trivial cases, use the first few ranges
            maxCount = 0;
            rangeCount = 0;
            do {
                maxCount += ranges[rangeCount].m_count2_;
                ++rangeCount;
            } while (n > maxCount);
            break;
        } else if (n <= ranges[0].m_count2_ * countBytes) {
            // easy case, just make this one range large enough by
            // lengthening it once more, possibly split it
            rangeCount = 1;
            // calculate how to split the range between maxLength-1
            // (count1) and maxLength (count2)
            long power_1 = m_utilLongBuffer_[minLength
                    - ranges[0].m_length_];
            long power = power_1 * countBytes;
            int count2 = (int) ((n + power - 1) / power);
            int count1 = ranges[0].m_count_ - count2;
            // split the range
            if (count1 < 1) {
                // lengthen the entire range to maxLength
                lengthenRange(ranges, 0, maxByte, countBytes);
            } else {
                // really split the range
                // create a new range with the end and initial and current
                // length of the old one
                rangeCount = 2;
                ranges[1].m_end_ = ranges[0].m_end_;
                ranges[1].m_length_ = ranges[0].m_length_;
                ranges[1].m_length2_ = minLength;
                // set the end of the first range according to count1
                int i = ranges[0].m_length_;
                int b = getWeightByte(ranges[0].m_start_, i)
                        + count1 - 1;
                // ranges[0].count and count1 may be >countBytes from
                // merging adjacent ranges; b > maxByte is possible
                if (b <= maxByte) {
                    ranges[0].m_end_ = setWeightByte(
                            ranges[0].m_start_, i, b);
                } else {
                    ranges[0].m_end_ = setWeightByte(incWeight(
                            ranges[0].m_start_, i - 1, maxByte), i,
                            b - countBytes);
                }
                // set the bytes in the end weight at length + 1..length2
                // to maxByte
                b = (maxByte << 24) | (maxByte << 16)
                        | (maxByte << 8) | maxByte; // this used to be 0xffffffff
                ranges[0].m_end_ = truncateWeight(ranges[0].m_end_,
                        i)
                        | (b >>> (i << 3))
                        & (b << ((4 - minLength) << 3));
                // set the start of the second range to immediately follow
                // the end of the first one
                ranges[1].m_start_ = incWeight(ranges[0].m_end_,
                        minLength, maxByte);
                // set the count values (informational)
                ranges[0].m_count_ = count1;
                ranges[1].m_count_ = count2;

                ranges[0].m_count2_ = (int) (count1 * power_1);
                // will be *countBytes when lengthened
                ranges[1].m_count2_ = (int) (count2 * power_1);

                // lengthen the second range to maxLength
                lengthenRange(ranges, 1, maxByte, countBytes);
            }
            break;
        }
        // no good match, lengthen all minLength ranges and iterate
        for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
            lengthenRange(ranges, i, maxByte, countBytes);
        }
    }

    if (rangeCount > 1) {
        // sort the ranges by weight values
        Arrays.sort(ranges, 0, rangeCount);
    }

    // set maxByte in ranges[0] for ucol_nextWeight()
    // (nextWeight() reads it back out of the .count field)
    ranges[0].m_count_ = maxByte;

    return rangeCount;
}
3002:
3003: /**
3004: * Updates the range length
3005: * @param range weight range array
3006: * @param offset to weight range array
3007: * @param maxByte
3008: * @param countBytes
3009: * @return new length
3010: */
3011: private static final int lengthenRange(WeightRange range[],
3012: int offset, int maxByte, int countBytes) {
3013: int length = range[offset].m_length2_ + 1;
3014: range[offset].m_start_ = setWeightTrail(range[offset].m_start_,
3015: length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
3016: range[offset].m_end_ = setWeightTrail(range[offset].m_end_,
3017: length, maxByte);
3018: range[offset].m_count2_ *= countBytes;
3019: range[offset].m_length2_ = length;
3020: return length;
3021: }
3022:
3023: /**
3024: * Gets the weight
3025: * @param weight
3026: * @param length
3027: * @param trail
3028: * @return new weight
3029: */
3030: private static final int setWeightTrail(int weight, int length,
3031: int trail) {
3032: length = (4 - length) << 3;
3033: return (weight & (0xffffff00 << length)) | (trail << length);
3034: }
3035:
/**
 * Takes two CE weights and calculates the possible ranges of weights
 * between the two limits, excluding them.
 * For weights with up to 4 bytes there are up to 2*4-1=7 ranges.
 * @param lowerLimit weight below the ranges (excluded)
 * @param upperLimit weight above the ranges (excluded)
 * @param maxByte largest usable byte value
 * @param countBytes number of usable byte values per position
 * @param ranges output array, filled shortest range first
 * @return the number of ranges written to ranges, 0 if the limits leave
 *         no room (equal, inverted, or lowerLimit a prefix of upperLimit)
 */
private int getWeightRanges(int lowerLimit, int upperLimit,
        int maxByte, int countBytes, WeightRange ranges[]) {
    // assume that both lowerLimit & upperLimit are not 0
    // get the lengths of the limits
    int lowerLength = lengthOfWeight(lowerLimit);
    int upperLength = lengthOfWeight(upperLimit);
    // weights compare as unsigned byte sequences
    if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
        return 0;
    }
    // check that neither is a prefix of the other
    if (lowerLength < upperLength) {
        if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
            return 0;
        }
    }
    // if the upper limit is a prefix of the lower limit then the earlier
    // test lowerLimit >= upperLimit has caught it
    // reset local variables
    // With the limit lengths of 1..4, there are up to 7 ranges for
    // allocation:
    // range minimum length
    // lower[4] 4
    // lower[3] 3
    // lower[2] 2
    // middle 1
    // upper[2] 2
    // upper[3] 3
    // upper[4] 4
    // We are now going to calculate up to 7 ranges.
    // Some of them will typically overlap, so we will then have to merge
    // and eliminate ranges.

    // We have to clean cruft from previous invocations
    // before doing anything. C++ already does that
    for (int length = 0; length < 5; length++) {
        m_utilLowerWeightRange_[length].clear();
        m_utilUpperWeightRange_[length].clear();
    }
    m_utilWeightRange_.clear();

    // walk up from the lower limit: at each byte length, the range of
    // weights strictly above the limit that share its shorter prefix
    int weight = lowerLimit;
    for (int length = lowerLength; length >= 2; --length) {
        m_utilLowerWeightRange_[length].clear();
        int trail = getWeightByte(weight, length);
        if (trail < maxByte) {
            m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
                    weight, length);
            m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(
                    weight, length, maxByte);
            m_utilLowerWeightRange_[length].m_length_ = length;
            m_utilLowerWeightRange_[length].m_count_ = maxByte
                    - trail;
        }
        weight = truncateWeight(weight, length - 1);
    }
    m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);

    weight = upperLimit;
    // [0] and [1] are not used - this simplifies indexing,
    // m_utilUpperWeightRange_

    // walk up from the upper limit symmetrically: weights strictly below
    // the limit that share its shorter prefix
    for (int length = upperLength; length >= 2; length--) {
        int trail = getWeightByte(weight, length);
        if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
            m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
                    weight, length,
                    RuleBasedCollator.BYTE_FIRST_TAILORED_);
            m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(
                    weight, length);
            m_utilUpperWeightRange_[length].m_length_ = length;
            m_utilUpperWeightRange_[length].m_count_ = trail
                    - RuleBasedCollator.BYTE_FIRST_TAILORED_;
        }
        weight = truncateWeight(weight, length - 1);
    }
    m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);

    // set the middle range
    m_utilWeightRange_.m_length_ = 1;
    if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
            m_utilWeightRange_.m_start_) >= 0) {
        // count of single-byte weights = difference of the top bytes + 1
        m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
    } else {
        // eliminate overlaps
        // remove the middle range
        m_utilWeightRange_.m_count_ = 0;
        // reduce or remove the lower ranges that go beyond upperLimit
        for (int length = 4; length >= 2; --length) {
            if (m_utilLowerWeightRange_[length].m_count_ > 0
                    && m_utilUpperWeightRange_[length].m_count_ > 0) {
                int start = m_utilUpperWeightRange_[length].m_start_;
                int end = m_utilLowerWeightRange_[length].m_end_;
                if (end >= start
                        || incWeight(end, length, maxByte) == start) {
                    // lower and upper ranges collide or are directly
                    // adjacent: merge these two and remove all shorter
                    // ranges
                    start = m_utilLowerWeightRange_[length].m_start_;
                    end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
                    // merging directly adjacent ranges needs to subtract
                    // the 0/1 gaps in between;
                    // it may result in a range with count>countBytes
                    m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
                            end, length)
                            - getWeightByte(start, length)
                            + 1
                            + countBytes
                            * (getWeightByte(end, length - 1) - getWeightByte(
                                    start, length - 1));
                    m_utilUpperWeightRange_[length].m_count_ = 0;
                    while (--length >= 2) {
                        m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
                    }
                    break;
                }
            }
        }
    }

    // copy the ranges, shortest first, into the result array
    int rangeCount = 0;
    if (m_utilWeightRange_.m_count_ > 0) {
        ranges[0] = new WeightRange(m_utilWeightRange_);
        rangeCount = 1;
    }
    for (int length = 2; length <= 4; ++length) {
        // copy upper first so that later the middle range is more likely
        // the first one to use
        if (m_utilUpperWeightRange_[length].m_count_ > 0) {
            ranges[rangeCount] = new WeightRange(
                    m_utilUpperWeightRange_[length]);
            ++rangeCount;
        }
        if (m_utilLowerWeightRange_[length].m_count_ > 0) {
            ranges[rangeCount] = new WeightRange(
                    m_utilLowerWeightRange_[length]);
            ++rangeCount;
        }
    }
    return rangeCount;
}
3189:
3190: /**
3191: * Truncates the weight with length
3192: * @param weight
3193: * @param length
3194: * @return truncated weight
3195: */
3196: private static final int truncateWeight(int weight, int length) {
3197: return weight & (0xffffffff << ((4 - length) << 3));
3198: }
3199:
3200: /**
3201: * Length of the weight
3202: * @param weight
3203: * @return length of the weight
3204: */
3205: private static final int lengthOfWeight(int weight) {
3206: if ((weight & 0xffffff) == 0) {
3207: return 1;
3208: } else if ((weight & 0xffff) == 0) {
3209: return 2;
3210: } else if ((weight & 0xff) == 0) {
3211: return 3;
3212: }
3213: return 4;
3214: }
3215:
3216: /**
3217: * Increment the weight trail
3218: * @param weight
3219: * @param length
3220: * @return new weight
3221: */
3222: private static final int incWeightTrail(int weight, int length) {
3223: return weight + (1 << ((4 - length) << 3));
3224: }
3225:
3226: /**
3227: * Decrement the weight trail
3228: * @param weight
3229: * @param length
3230: * @return new weight
3231: */
3232: private static int decWeightTrail(int weight, int length) {
3233: return weight - (1 << ((4 - length) << 3));
3234: }
3235:
3236: /**
3237: * Gets the codepoint
3238: * @param tbl contraction table
3239: * @param codePoint code point to look for
3240: * @return the offset to the code point
3241: */
3242: private static int findCP(BasicContractionTable tbl, char codePoint) {
3243: int position = 0;
3244: while (codePoint > tbl.m_codePoints_.charAt(position)) {
3245: position++;
3246: if (position > tbl.m_codePoints_.length()) {
3247: return -1;
3248: }
3249: }
3250: if (codePoint == tbl.m_codePoints_.charAt(position)) {
3251: return position;
3252: } else {
3253: return -1;
3254: }
3255: }
3256:
3257: /**
3258: * Finds a contraction ce
3259: * @param table
3260: * @param element
3261: * @param ch
3262: * @return ce
3263: */
3264: private static int findCE(ContractionTable table, int element,
3265: char ch) {
3266: if (table == null) {
3267: return CE_NOT_FOUND_;
3268: }
3269: BasicContractionTable tbl = getBasicContractionTable(table,
3270: element);
3271: if (tbl == null) {
3272: return CE_NOT_FOUND_;
3273: }
3274: int position = findCP(tbl, ch);
3275: if (position > tbl.m_CEs_.size() || position < 0) {
3276: return CE_NOT_FOUND_;
3277: }
3278: return ((Integer) tbl.m_CEs_.get(position)).intValue();
3279: }
3280:
3281: /**
3282: * Checks if the string is tailored in the contraction
3283: * @param table contraction table
3284: * @param element
3285: * @param array character array to check
3286: * @param offset array offset
3287: * @return true if it is tailored
3288: */
3289: private static boolean isTailored(ContractionTable table,
3290: int element, char array[], int offset) {
3291: while (array[offset] != 0) {
3292: element = findCE(table, element, array[offset]);
3293: if (element == CE_NOT_FOUND_) {
3294: return false;
3295: }
3296: if (!isContractionTableElement(element)) {
3297: return true;
3298: }
3299: offset++;
3300: }
3301: if (getCE(table, element, 0) != CE_NOT_FOUND_) {
3302: return true;
3303: } else {
3304: return false;
3305: }
3306: }
3307:
/**
 * Assembles the final RuleBasedCollator data from the build table:
 * flattens the contraction table, copies expansions, contractions, the
 * serialized trie, max-expansion tables, and the unsafe/contraction-end
 * character bitsets into the collator.
 * @param t build table holding the intermediate structures
 * @param collator collator to populate
 */
private void assembleTable(BuildTable t, RuleBasedCollator collator) {
    IntTrieBuilder mapping = t.m_mapping_;
    Vector expansions = t.m_expansions_;
    ContractionTable contractions = t.m_contractions_;
    MaxExpansionTable maxexpansion = t.m_maxExpansions_;

    // contraction offset has to be in since we are building on the
    // UCA contractions
    // int beforeContractions = (HEADER_SIZE_
    // + paddedsize(expansions.size() << 2)) >>> 1;
    collator.m_contractionOffset_ = 0;
    // flattens the contraction table and rewrites contraction CEs
    int contractionsSize = constructTable(contractions);

    // the following operation depends on the trie data. Therefore, we have
    // to do it before the trie is compacted
    // sets jamo expansions
    getMaxExpansionJamo(mapping, maxexpansion,
            t.m_maxJamoExpansions_, collator.m_isJamoSpecial_);

    // TODO: LATIN1 array is now in the utrie - it should be removed from
    // the calculation
    setAttributes(collator, t.m_options_);
    // copy expansions
    int size = expansions.size();
    collator.m_expansion_ = new int[size];
    for (int i = 0; i < size; i++) {
        collator.m_expansion_[i] = ((Integer) expansions.get(i))
                .intValue();
    }
    // contractions block
    if (contractionsSize != 0) {
        // copy contraction index
        collator.m_contractionIndex_ = new char[contractionsSize];
        contractions.m_codePoints_.getChars(0, contractionsSize,
                collator.m_contractionIndex_, 0);
        // copy contraction collation elements
        collator.m_contractionCE_ = new int[contractionsSize];
        for (int i = 0; i < contractionsSize; i++) {
            collator.m_contractionCE_[i] = ((Integer) contractions.m_CEs_
                    .get(i)).intValue();
        }
    }
    // copy mapping table
    collator.m_trie_ = mapping.serialize(t,
            RuleBasedCollator.DataManipulate.getInstance());
    // copy max expansion table
    // not copying the first element which is a dummy
    // to be in synch with icu4c's builder, we continue to use the
    // expansion offset
    // omitting expansion offset in builder
    collator.m_expansionOffset_ = 0;
    size = maxexpansion.m_endExpansionCE_.size();
    collator.m_expansionEndCE_ = new int[size - 1];
    for (int i = 1; i < size; i++) {
        collator.m_expansionEndCE_[i - 1] = ((Integer) maxexpansion.m_endExpansionCE_
                .get(i)).intValue();
    }
    collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
    for (int i = 1; i < size; i++) {
        collator.m_expansionEndCEMaxSize_[i - 1] = ((Byte) maxexpansion.m_expansionCESize_
                .get(i)).byteValue();
    }
    // Unsafe chars table. Finish it off, then copy it.
    unsafeCPAddCCNZ(t);
    // Or in unsafebits from UCA, making a combined table.
    for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
        t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
    }
    collator.m_unsafe_ = t.m_unsafeCP_;

    // Finish building Contraction Ending chars hash table and then copy it
    // out.
    // Or in unsafebits from UCA, making a combined table
    for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
        t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
    }
    collator.m_contractionEnd_ = t.m_contrEndCP_;
}
3391:
/**
 * Copies the parsed rule options into the collator's attributes.
 * @param collator collator whose attributes are to be set
 * @param option parsed option set to copy from
 */
private static final void setAttributes(RuleBasedCollator collator,
        CollationRuleParser.OptionSet option) {
    // NOTE(review): latinOneFailed_ is set true before and false after
    // the attribute updates — presumably to disable the latin-1 fast
    // path while attributes are in flux; confirm against the setters.
    collator.latinOneFailed_ = true;
    collator.m_caseFirst_ = option.m_caseFirst_;
    collator.setDecomposition(option.m_decomposition_);
    collator
            .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
    collator.setCaseLevel(option.m_isCaseLevel_);
    collator.setFrenchCollation(option.m_isFrenchCollation_);
    collator.m_isHiragana4_ = option.m_isHiragana4_;
    collator.setStrength(option.m_strength_);
    collator.m_variableTopValue_ = option.m_variableTopValue_;
    collator.latinOneFailed_ = false;
}
3411:
3412: /**
3413: * Constructing the contraction table
3414: * @param table contraction table
3415: * @return
3416: */
3417: private int constructTable(ContractionTable table) {
3418: // See how much memory we need
3419: int tsize = table.m_elements_.size();
3420: if (tsize == 0) {
3421: return 0;
3422: }
3423: table.m_offsets_.clear();
3424: int position = 0;
3425: for (int i = 0; i < tsize; i++) {
3426: table.m_offsets_.add(new Integer(position));
3427: position += ((BasicContractionTable) table.m_elements_
3428: .get(i)).m_CEs_.size();
3429: }
3430: table.m_CEs_.clear();
3431: table.m_codePoints_.delete(0, table.m_codePoints_.length());
3432: // Now stuff the things in
3433: StringBuffer cpPointer = table.m_codePoints_;
3434: Vector CEPointer = table.m_CEs_;
3435: for (int i = 0; i < tsize; i++) {
3436: BasicContractionTable bct = (BasicContractionTable) table.m_elements_
3437: .get(i);
3438: int size = bct.m_CEs_.size();
3439: char ccMax = 0;
3440: char ccMin = 255;
3441: int offset = CEPointer.size();
3442: CEPointer.add(bct.m_CEs_.get(0));
3443: for (int j = 1; j < size; j++) {
3444: char ch = bct.m_codePoints_.charAt(j);
3445: char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
3446: if (cc > ccMax) {
3447: ccMax = cc;
3448: }
3449: if (cc < ccMin) {
3450: ccMin = cc;
3451: }
3452: cpPointer.append(ch);
3453: CEPointer.add(bct.m_CEs_.get(j));
3454: }
3455: cpPointer.insert(offset, (char) (((ccMin == ccMax) ? 1
3456: : 0 << 8) | ccMax));
3457: for (int j = 0; j < size; j++) {
3458: if (isContractionTableElement(((Integer) CEPointer
3459: .get(offset + j)).intValue())) {
3460: int ce = ((Integer) CEPointer.get(offset + j))
3461: .intValue();
3462: CEPointer
3463: .set(
3464: offset + j,
3465: new Integer(
3466: constructSpecialCE(
3467: getCETag(ce),
3468: ((Integer) table.m_offsets_
3469: .get(getContractionOffset(ce)))
3470: .intValue())));
3471: }
3472: }
3473: }
3474:
3475: for (int i = 0; i <= 0x10FFFF; i++) {
3476: int CE = table.m_mapping_.getValue(i);
3477: if (isContractionTableElement(CE)) {
3478: CE = constructSpecialCE(getCETag(CE),
3479: ((Integer) table.m_offsets_
3480: .get(getContractionOffset(CE)))
3481: .intValue());
3482: table.m_mapping_.setValue(i, CE);
3483: }
3484: }
3485: return position;
3486: }
3487:
3488: /**
3489: * Get contraction offset
3490: * @param ce collation element
3491: * @return contraction offset
3492: */
3493: private static final int getContractionOffset(int ce) {
3494: return ce & 0xFFFFFF;
3495: }
3496:
3497: /**
3498: * Gets the maximum Jamo expansion
3499: * @param mapping trie table
3500: * @param maxexpansion maximum expansion table
3501: * @param maxjamoexpansion maximum jamo expansion table
3502: * @param jamospecial is jamo special?
3503: */
3504: private static void getMaxExpansionJamo(IntTrieBuilder mapping,
3505: MaxExpansionTable maxexpansion,
3506: MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
3507: int VBASE = 0x1161;
3508: int TBASE = 0x11A8;
3509: int VCOUNT = 21;
3510: int TCOUNT = 28;
3511: int v = VBASE + VCOUNT - 1;
3512: int t = TBASE + TCOUNT - 1;
3513:
3514: while (v >= VBASE) {
3515: int ce = mapping.getValue(v);
3516: if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3517: setMaxExpansion(ce, (byte) 2, maxexpansion);
3518: }
3519: v--;
3520: }
3521:
3522: while (t >= TBASE) {
3523: int ce = mapping.getValue(t);
3524: if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3525: setMaxExpansion(ce, (byte) 3, maxexpansion);
3526: }
3527: t--;
3528: }
3529: // According to the docs, 99% of the time, the Jamo will not be special
3530: if (jamospecial) {
3531: // gets the max expansion in all unicode characters
3532: int count = maxjamoexpansion.m_endExpansionCE_.size();
3533: byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
3534: + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
3535: byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
3536:
3537: while (count > 0) {
3538: count--;
3539: if (((Boolean) maxjamoexpansion.m_isV_.get(count))
3540: .booleanValue() == true) {
3541: setMaxExpansion(
3542: ((Integer) maxjamoexpansion.m_endExpansionCE_
3543: .get(count)).intValue(), maxVSize,
3544: maxexpansion);
3545: } else {
3546: setMaxExpansion(
3547: ((Integer) maxjamoexpansion.m_endExpansionCE_
3548: .get(count)).intValue(), maxTSize,
3549: maxexpansion);
3550: }
3551: }
3552: }
3553: }
3554:
3555: /**
3556: * To the UnsafeCP hash table, add all chars with combining class != 0
3557: * @param t build table
3558: */
3559: private static final void unsafeCPAddCCNZ(BuildTable t) {
3560:
3561: for (char c = 0; c < 0xffff; c++) {
3562: char fcd = NormalizerImpl.getFCD16(c);
3563: if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
3564: (UTF16.isLeadSurrogate(c) && fcd != 0)) {
3565: // c is a leading surrogate with some FCD data
3566: unsafeCPSet(t.m_unsafeCP_, c);
3567: }
3568: }
3569:
3570: if (t.m_prefixLookup_ != null) {
3571: Enumeration els = t.m_prefixLookup_.elements();
3572: while (els.hasMoreElements()) {
3573: Elements e = (Elements) els.nextElement();
3574: // codepoints here are in the NFD form. We need to add the
3575: // first code point of the NFC form to unsafe, because
3576: // strcoll needs to backup over them.
3577: // weiv: This is wrong! See the comment above.
3578: //String decomp = Normalizer.decompose(e.m_cPoints_, true);
3579: //unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
3580: // it should be:
3581: String comp = Normalizer.compose(e.m_cPoints_, false);
3582: unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
3583: }
3584: }
3585: }
3586:
/**
 * Creates canonical closure over one character-category range: for every
 * code point in [start, limit) that has a canonical decomposition which
 * the collator does not already treat as equal to the composed form, an
 * element for the composed form is added to the build table.
 * @param t build table to add elements to
 * @param collator collator used to compare composed vs. decomposed forms
 * @param colEl collation element iterator over that collator
 * @param start first code point of the range (inclusive)
 * @param limit end of the range (exclusive)
 * @param type character category of the range
 * @return always true
 */
private boolean enumCategoryRangeClosureCategory(BuildTable t,
        RuleBasedCollator collator, CollationElementIterator colEl,
        int start, int limit, int type) {
    if (type != UCharacterCategory.UNASSIGNED
            && type != UCharacterCategory.PRIVATE_USE) {
        // if the range is assigned - we might ommit more categories later

        for (int u32 = start; u32 < limit; u32++) {
            int noOfDec = NormalizerImpl.getDecomposition(u32,
                    false, m_utilCharBuffer_, 0, 256);
            if (noOfDec > 0) {
                // a positive length means the character HAS a canonical
                // decomposition (written into m_utilCharBuffer_)
                String comp = UCharacter.toString(u32);
                String decomp = new String(m_utilCharBuffer_, 0,
                        noOfDec);
                // only add closure elements where the collator does not
                // already consider the two forms equal
                if (!collator.equals(comp, decomp)) {
                    m_utilElement_.m_cPoints_ = decomp;
                    m_utilElement_.m_prefix_ = 0;
                    Elements prefix = (Elements) t.m_prefixLookup_
                            .get(m_utilElement_);
                    if (prefix == null) {
                        // no prefix entry: collect the CEs of the
                        // decomposed form and map the composed form to them
                        m_utilElement_.m_cPoints_ = comp;
                        m_utilElement_.m_prefix_ = 0;
                        m_utilElement_.m_prefixChars_ = null;
                        colEl.setText(decomp);
                        int ce = colEl.next();
                        m_utilElement_.m_CELength_ = 0;
                        while (ce != CollationElementIterator.NULLORDER) {
                            m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
                            ce = colEl.next();
                        }
                    } else {
                        m_utilElement_.m_cPoints_ = comp;
                        m_utilElement_.m_prefix_ = 0;
                        m_utilElement_.m_prefixChars_ = null;
                        m_utilElement_.m_CELength_ = 1;
                        m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
                        // This character uses a prefix. We have to add it
                        // to the unsafe table, as it decomposed form is
                        // already in. In Japanese, this happens for \u309e
                        // & \u30fe
                        // Since unsafeCPSet is static in ucol_elm, we are
                        // going to wrap it up in the unsafeCPAddCCNZ
                        // function
                    }
                    addAnElement(t, m_utilElement_);
                }
            }
        }
    }
    return true;
}
3649:
3650: /**
3651: * Determine if a character is a Jamo
3652: * @param ch character to test
3653: * @return true if ch is a Jamo, false otherwise
3654: */
3655: private static final boolean isJamo(char ch) {
3656: return (ch >= 0x1100 && ch <= 0x1112)
3657: || (ch >= 0x1175 && ch <= 0x1161)
3658: || (ch >= 0x11A8 && ch <= 0x11C2);
3659: }
3660:
3661: /**
3662: * Produces canonical closure
3663: */
3664: private void canonicalClosure(BuildTable t) {
3665: BuildTable temp = new BuildTable(t);
3666: assembleTable(temp, temp.m_collator_);
3667: // produce canonical closure
3668: CollationElementIterator coleiter = temp.m_collator_
3669: .getCollationElementIterator("");
3670: RangeValueIterator typeiter = UCharacter.getTypeIterator();
3671: RangeValueIterator.Element element = new RangeValueIterator.Element();
3672: while (typeiter.next(element)) {
3673: enumCategoryRangeClosureCategory(t, temp.m_collator_,
3674: coleiter, element.start, element.limit,
3675: element.value);
3676: }
3677: }
3678:
3679: private void processUCACompleteIgnorables(BuildTable t) {
3680: TrieIterator trieiterator = new TrieIterator(
3681: RuleBasedCollator.UCA_.m_trie_);
3682: RangeValueIterator.Element element = new RangeValueIterator.Element();
3683: while (trieiterator.next(element)) {
3684: int start = element.start;
3685: int limit = element.limit;
3686: if (element.value == 0) {
3687: while (start < limit) {
3688: int CE = t.m_mapping_.getValue(start);
3689: if (CE == CE_NOT_FOUND_) {
3690: m_utilElement_.m_prefix_ = 0;
3691: m_utilElement_.m_uchars_ = UCharacter
3692: .toString(start);
3693: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
3694: m_utilElement_.m_cPointsOffset_ = 0;
3695: m_utilElement_.m_CELength_ = 1;
3696: m_utilElement_.m_CEs_[0] = 0;
3697: addAnElement(t, m_utilElement_);
3698: }
3699: start++;
3700: }
3701: }
3702: }
3703: }
3704: }
|