0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */package com.ibm.icu.text;
0007:
0008: import java.io.IOException;
0009: import java.text.ParseException;
0010: import java.util.Hashtable;
0011: import java.util.Vector;
0012: import java.util.Arrays;
0013: import java.util.Enumeration;
0014:
0015: import com.ibm.icu.impl.TrieBuilder;
0016: import com.ibm.icu.impl.IntTrieBuilder;
0017: import com.ibm.icu.impl.TrieIterator;
0018: import com.ibm.icu.impl.Utility;
0019: import com.ibm.icu.impl.UCharacterProperty;
0020: import com.ibm.icu.lang.UCharacter;
0021: import com.ibm.icu.lang.UCharacterCategory;
0022: import com.ibm.icu.impl.NormalizerImpl;
0023: import com.ibm.icu.util.RangeValueIterator;
0024: import com.ibm.icu.util.VersionInfo;
0025:
0026: /**
0027: * Class for building a collator from a list of collation rules.
* This class uses CollationRuleParser.
0029: * @author Syn Wee Quek
0030: * @since release 2.2, June 11 2002
0031: * @draft 2.2
0032: */
0033: final class CollationParsedRuleBuilder {
0034: // package private constructors ------------------------------------------
0035:
0036: /**
0037: * Constructor
0038: * @param rules collation rules
0039: * @exception ParseException thrown when argument rules have an invalid
0040: * syntax
0041: */
    CollationParsedRuleBuilder(String rules) throws ParseException {
        // Parse the rule string into token lists up front; invalid rule
        // syntax surfaces here as a ParseException.
        m_parser_ = new CollationRuleParser(rules);
        m_parser_.assembleTokenList();
        // Shared collation element iterator over the root (UCA) collator,
        // reused by the build methods to fetch CEs for arbitrary strings.
        m_utilColEIter_ = RuleBasedCollator.UCA_
                .getCollationElementIterator("");
    }
0048:
0049: // package private inner classes -----------------------------------------
0050:
0051: /**
0052: * Inverse UCA wrapper
0053: */
    /**
     * Wrapper around the inverse UCA table. The inverse table maps in sorted
     * CE order, so tailoring code can locate the collation elements
     * immediately before or after a given CE at a given strength, and find
     * the weight gaps into which tailored CEs are fitted.
     */
    static class InverseUCA {
        // package private constructor ---------------------------------------

        InverseUCA() {
        }

        // package private data member ---------------------------------------

        /**
         * Flat array of inverse-UCA entries stored as triples: for entry i,
         * m_table_[3*i] is the CE, m_table_[3*i + 1] the continuation CE,
         * and m_table_[3*i + 2] refers into m_continuations_.
         */
        int m_table_[];
        /**
         * Array list of continuation characters
         */
        char m_continuations_[];

        /**
         * UCA version of inverse UCA table
         */
        VersionInfo m_UCA_version_;

        // package private method --------------------------------------------

        /**
         * Returns the previous inverse ces of the argument ces
         * @param ce ce to test
         * @param contce continuation ce to test
         * @param strength collation strength
         * @param prevresult an array to store the return results previous
         *            inverse ce and previous inverse continuation ce
         * @return index of the previous inverse CE in m_table_, or -1 if
         *         the argument CEs are not found
         */
        final int getInversePrevCE(int ce, int contce, int strength,
                int prevresult[]) {
            int result = findInverseCE(ce, contce);

            if (result < 0) {
                // CE is not in the inverse table at all
                prevresult[0] = CollationElementIterator.NULLORDER;
                return -1;
            }

            // keep only the weights significant at the requested strength
            ce &= STRENGTH_MASK_[strength];
            contce &= STRENGTH_MASK_[strength];

            prevresult[0] = ce;
            prevresult[1] = contce;

            // walk backwards until we reach an entry that differs from the
            // argument CEs at the requested strength (or hit index 0)
            while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
                    && (prevresult[1] & STRENGTH_MASK_[strength]) == contce
                    && result > 0) {
                // this condition should prevent falling off the edge of the
                // world
                // here, we end up in a singularity - zero
                prevresult[0] = m_table_[3 * (--result)];
                prevresult[1] = m_table_[3 * result + 1];
            }
            return result;
        }

        /**
         * Returns the strongest strength level at which the two CE pairs
         * differ: PRIMARY if the primaries differ, SECONDARY if only the
         * secondaries differ, TERTIARY otherwise (including equal CEs).
         */
        final int getCEStrengthDifference(int CE, int contCE,
                int prevCE, int prevContCE) {
            int strength = Collator.TERTIARY;
            // lower the strength while the CEs still differ when masked at
            // that strength; stop at PRIMARY (0)
            while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
                    && (strength != 0)) {
                strength--;
            }
            return strength;
        }

        /**
         * Unsigned comparison of two (CE, continuation CE) pairs, weight
         * level by weight level: primaries first, then secondaries, then
         * tertiaries. Non-continuation second CEs are treated as 0.
         * @return negative, zero or positive as source is less than, equal
         *         to, or greater than target
         */
        private int compareCEs(int source0, int source1, int target0,
                int target1) {
            int s1 = source0, s2, t1 = target0, t2;
            if (RuleBasedCollator.isContinuation(source1)) {
                s2 = source1;
            } else {
                s2 = 0;
            }
            if (RuleBasedCollator.isContinuation(target1)) {
                t2 = target1;
            } else {
                t2 = 0;
            }

            int s = 0, t = 0;
            if (s1 == t1 && s2 == t2) {
                return 0;
            }
            // combine the 16-bit primaries of CE and continuation CE into
            // one 32-bit primary weight and compare
            s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >> 16);
            t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >> 16);
            if (s == t) {
                // primaries equal: compare combined secondaries
                s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
                t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
                if (s == t) {
                    // secondaries equal: compare combined tertiaries
                    s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
                    t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
                    return Utility.compareUnsigned(s, t);
                } else {
                    return Utility.compareUnsigned(s, t);
                }
            } else {
                return Utility.compareUnsigned(s, t);
            }
        }

        /**
         * Finding the inverse CE of the argument CEs.
         * Binary search over the triples in m_table_.
         * @param ce CE to be tested
         * @param contce continuation CE
         * @return index of the matching entry; if no exact match exists,
         *         the last probed index is returned
         */
        int findInverseCE(int ce, int contce) {
            int bottom = 0;
            int top = m_table_.length / 3;
            int result = 0;

            while (bottom < top - 1) {
                result = (top + bottom) >> 1;
                int first = m_table_[3 * result];
                int second = m_table_[3 * result + 1];
                int comparison = compareCEs(first, second, ce, contce);
                if (comparison > 0) {
                    top = result;
                } else if (comparison < 0) {
                    bottom = result;
                } else {
                    break;
                }
            }

            return result;
        }

        /**
         * Getting gap offsets in the inverse UCA.
         * Fills listheader.m_gapsLo_/m_gapsHi_ with the merged low/high
         * weights (per strength) of the gap the tailored tokens must fit
         * into. Implicit and indirect base CEs are handled specially;
         * otherwise the gap boundaries come from the inverse table.
         * @param listheader parsed token lists
         * @exception Exception thrown when error occurs while finding the
         *            collation gaps
         */
        void getInverseGapPositions(
                CollationRuleParser.TokenListHeader listheader)
                throws Exception {
            // reset all the gaps
            CollationRuleParser.Token token = listheader.m_first_;
            int tokenstrength = token.m_strength_;

            for (int i = 0; i < 3; i++) {
                listheader.m_gapsHi_[3 * i] = 0;
                listheader.m_gapsHi_[3 * i + 1] = 0;
                listheader.m_gapsHi_[3 * i + 2] = 0;
                listheader.m_gapsLo_[3 * i] = 0;
                listheader.m_gapsLo_[3 * i + 1] = 0;
                listheader.m_gapsLo_[3 * i + 2] = 0;
                listheader.m_numStr_[i] = 0;
                listheader.m_fStrToken_[i] = null;
                listheader.m_lStrToken_[i] = null;
                listheader.m_pos_[i] = -1;
            }

            // the lead byte of the base CE tells whether it is in the
            // implicit primary range
            if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
                    && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
                // implicits - the gap low is the base CE itself; the gap
                // high is synthesized from the next raw implicit primary
                listheader.m_pos_[0] = 0;
                int t1 = listheader.m_baseCE_;
                int t2 = listheader.m_baseContCE_;
                listheader.m_gapsLo_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsLo_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsLo_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
                int primaryCE = t1
                        & RuleBasedCollator.CE_PRIMARY_MASK_
                        | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
                // advance to the implicit primary one step past the base
                primaryCE = RuleBasedCollator.impCEGen_
                        .getImplicitFromRaw(RuleBasedCollator.impCEGen_
                                .getRawFromImplicit(primaryCE) + 1);

                // 0x0505 supplies common secondary/tertiary weights
                t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_
                        | 0x0505;
                t2 = (primaryCE << 16)
                        & RuleBasedCollator.CE_PRIMARY_MASK_
                        | RuleBasedCollator.CE_CONTINUATION_MARKER_;

                // if (listheader.m_baseCE_ < 0xEF000000) {
                // // first implicits have three byte primaries, with a gap of
                // // one so we esentially need to add 2 to the top byte in
                // // listheader.m_baseContCE_
                // t2 += 0x02000000;
                // }
                // else {
                // // second implicits have four byte primaries, with a gap of
                // // IMPLICIT_LAST2_MULTIPLIER_
                // // Now, this guy is not really accessible here, so until we
                // // find a better way to pass it around, assume that the gap is 1
                // t2 += 0x00020000;
                // }
                listheader.m_gapsHi_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsHi_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsHi_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
            } else if (listheader.m_indirect_ == true
                    && listheader.m_nextCE_ != 0) {
                // indirect positioning (e.g. [top]): gap runs from the
                // base CE to the precomputed next CE
                listheader.m_pos_[0] = 0;
                int t1 = listheader.m_baseCE_;
                int t2 = listheader.m_baseContCE_;
                listheader.m_gapsLo_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsLo_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsLo_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
                t1 = listheader.m_nextCE_;
                t2 = listheader.m_nextContCE_;
                listheader.m_gapsHi_[0] = mergeCE(t1, t2,
                        Collator.PRIMARY);
                listheader.m_gapsHi_[1] = mergeCE(t1, t2,
                        Collator.SECONDARY);
                listheader.m_gapsHi_[2] = mergeCE(t1, t2,
                        Collator.TERTIARY);
            } else {
                // regular case: for each strength present in the token
                // list, locate the next CE in the inverse table and record
                // the first/last tokens at that strength
                while (true) {
                    if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
                        listheader.m_pos_[tokenstrength] = getInverseNext(
                                listheader, tokenstrength);
                        if (listheader.m_pos_[tokenstrength] >= 0) {
                            listheader.m_fStrToken_[tokenstrength] = token;
                        } else {
                            // The CE must be implicit, since it's not in the
                            // table
                            // Error
                            throw new Exception(
                                    "Internal program error");
                        }
                    }

                    // skip over all tokens at this or weaker strength,
                    // tracking the last one seen per strength
                    while (token != null
                            && token.m_strength_ >= tokenstrength) {
                        if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
                            listheader.m_lStrToken_[tokenstrength] = token;
                        }
                        token = token.m_next_;
                    }
                    if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
                        // check if previous interval is the same and merge the
                        // intervals if it is so
                        if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
                            listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
                            listheader.m_fStrToken_[tokenstrength + 1] = null;
                            listheader.m_lStrToken_[tokenstrength + 1] = null;
                            listheader.m_pos_[tokenstrength + 1] = -1;
                        }
                    }
                    if (token != null) {
                        tokenstrength = token.m_strength_;
                    } else {
                        break;
                    }
                }
                // translate the found inverse-table positions into merged
                // low/high gap weights per strength
                for (int st = 0; st < 3; st++) {
                    int pos = listheader.m_pos_[st];
                    if (pos >= 0) {
                        int t1 = m_table_[3 * pos];
                        int t2 = m_table_[3 * pos + 1];
                        listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
                                Collator.PRIMARY);
                        listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1,
                                t2, Collator.SECONDARY);
                        listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
                                | (t2 & 0x3f) << 16;
                        //pos --;
                        //t1 = m_table_[3 * pos];
                        //t2 = m_table_[3 * pos + 1];
                        t1 = listheader.m_baseCE_;
                        t2 = listheader.m_baseContCE_;

                        listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
                                Collator.PRIMARY);
                        listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1,
                                t2, Collator.SECONDARY);
                        listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
                                | (t2 & 0x3f) << 16;
                    }
                }
            }
        }

        /**
         * Gets the next CE in the inverse table, i.e. the first entry after
         * the base CE that differs from it at the given strength. Also
         * stores that CE pair into listheader.m_nextCE_/m_nextContCE_.
         * NOTE(review): the forward walk has no explicit bound; it relies
         * on the table ending with entries that differ at every strength.
         * @param listheader token list header
         * @param strength collation strength
         * @return index of the next CE, or -1 if the base CE is not found
         */
        private final int getInverseNext(
                CollationRuleParser.TokenListHeader listheader,
                int strength) {
            int ce = listheader.m_baseCE_;
            int secondce = listheader.m_baseContCE_;
            int result = findInverseCE(ce, secondce);

            if (result < 0) {
                return -1;
            }

            // mask to the weights significant at this strength
            ce &= STRENGTH_MASK_[strength];
            secondce &= STRENGTH_MASK_[strength];

            int nextce = ce;
            int nextcontce = secondce;

            // scan forward until the masked weights change
            while ((nextce & STRENGTH_MASK_[strength]) == ce
                    && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
                nextce = m_table_[3 * (++result)];
                nextcontce = m_table_[3 * result + 1];
            }

            listheader.m_nextCE_ = nextce;
            listheader.m_nextContCE_ = nextcontce;

            return result;
        }
    }
0378:
0379: // package private data members ------------------------------------------
0380:
    /**
     * Inverse UCA table, loaded once by the static initializer below.
     */
    static final InverseUCA INVERSE_UCA_;

    /**
     * Error message used when the UCA and inverse UCA versions do not match.
     */
    private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";

    /**
     * Error message used when the UCA or inverse UCA could not be loaded.
     */
    private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
0395:
0396: /**
0397: * Initializing the inverse UCA
0398: */
0399: static {
0400: InverseUCA temp = null;
0401: try {
0402: temp = CollatorReader.getInverseUCA();
0403: } catch (IOException e) {
0404: }
0405: /*
0406: try
0407: {
0408: String invdat = "/com/ibm/icu/impl/data/invuca.icu";
0409: InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
0410: BufferedInputStream b = new BufferedInputStream(i, 110000);
0411: INVERSE_UCA_ = CollatorReader.readInverseUCA(b);
0412: b.close();
0413: i.close();
0414: }
0415: catch (Exception e)
0416: {
0417: e.printStackTrace();
0418: throw new RuntimeException(e.getMessage());
0419: }
0420: */
0421:
0422: if (temp != null && RuleBasedCollator.UCA_ != null) {
0423: if (!temp.m_UCA_version_
0424: .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
0425: throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
0426: }
0427: } else {
0428: throw new RuntimeException(UCA_NOT_INSTANTIATED_);
0429: }
0430:
0431: INVERSE_UCA_ = temp;
0432: }
0433:
0434: // package private methods -----------------------------------------------
0435:
0436: /**
0437: * Parse and sets the collation rules in the argument collator
0438: * @param collator to set
0439: * @exception Exception thrown when internal program error occurs
0440: */
0441: void setRules(RuleBasedCollator collator) throws Exception {
0442: if (m_parser_.m_resultLength_ > 0
0443: || m_parser_.m_removeSet_ != null) {
0444: // we have a set of rules, let's make something of it
0445: assembleTailoringTable(collator);
0446: } else { // no rules, but no error either must be only options
0447: // We will init the collator from UCA
0448: collator.setWithUCATables();
0449: }
0450: // And set only the options
0451: m_parser_.setDefaultOptionsInCollator(collator);
0452: }
0453:
0454: private void copyRangeFromUCA(BuildTable t, int start, int end) {
0455: int u = 0;
0456: for (u = start; u <= end; u++) {
0457: // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
0458: int CE = t.m_mapping_.getValue(u);
0459: if (CE == CE_NOT_FOUND_
0460: // this test is for contractions that are missing the starting
0461: // element. Looks like latin-1 should be done before
0462: // assembling the table, even if it results in more false
0463: // closure elements
0464: || (isContractionTableElement(CE) && getCE(
0465: t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
0466: //m_utilElement_.m_uchars_ = str.toString();
0467: m_utilElement_.m_uchars_ = UCharacter.toString(u);
0468: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
0469: m_utilElement_.m_prefix_ = 0;
0470: m_utilElement_.m_CELength_ = 0;
0471: m_utilColEIter_.setText(m_utilElement_.m_uchars_);
0472: while (CE != CollationElementIterator.NULLORDER) {
0473: CE = m_utilColEIter_.next();
0474: if (CE != CollationElementIterator.NULLORDER) {
0475: m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
0476: }
0477: }
0478: addAnElement(t, m_utilElement_);
0479: }
0480: }
0481: }
0482:
0483: /**
0484: * 2. Eliminate the negative lists by doing the following for each
0485: * non-null negative list:
0486: * o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
0487: * create new ListHeader X
0488: * o reverse the list, add to the end of X's positive list. Reset the
0489: * strength of the first item you add, based on the stronger strength
0490: * levels of the two lists.
0491: *
0492: * 3. For each ListHeader with a non-null positive list:
0493: * o Find all character strings with CEs between the baseCE and the
0494: * next/previous CE, at the strength of the first token. Add these to the
0495: * tailoring.
0496: * ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
0497: * tailoring has & x < z...
0498: * ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
0499: *
0500: * It is possible that this part should be done even while constructing list
0501: * The problem is that it is unknown what is going to be the strongest
0502: * weight.
0503: * So we might as well do it here
0504: * o Allocate CEs for each token in the list, based on the total number N
0505: * of the largest level difference, and the gap G between baseCE and nextCE
0506: * at that level. The relation * between the last item and nextCE is the
0507: * same as the strongest strength.
0508: * o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
0509: * ? There are 3 primary items: a, d, e. Fit them into the primary gap.
0510: * Then fit b and c into the secondary gap between a and d, then fit q
0511: * into the tertiary gap between b and c.
0512: * o Example: baseCE << b <<< q << c * nextCE(X,2)
0513: * ? There are 2 secondary items: b, c. Fit them into the secondary gap.
0514: * Then fit q into the tertiary gap between b and c.
0515: * o When incrementing primary values, we will not cross high byte
0516: * boundaries except where there is only a single-byte primary. That is
0517: * to ensure that the script reordering will continue to work.
0518: * @param collator the rule based collator to update
0519: * @exception Exception thrown when internal program error occurs
0520: */
    void assembleTailoringTable(RuleBasedCollator collator)
            throws Exception {

        // Step 1: allocate CE values for every token list.
        for (int i = 0; i < m_parser_.m_resultLength_; i++) {
            // now we need to generate the CEs
            // We stuff the initial value in the buffers, and increase the
            // appropriate buffer according to strength
            if (m_parser_.m_listHeader_[i].m_first_ != null) {
                // if there are any elements
                // due to the way parser works, subsequent tailorings
                // may remove all the elements from a sequence, therefore
                // leaving an empty tailoring sequence.
                initBuffers(m_parser_.m_listHeader_[i]);
            }
        }

        // Step 2: handle [variable top] — record its primary weight in the
        // options and unlink its token from the doubly-linked token list.
        if (m_parser_.m_variableTop_ != null) {
            // stuff the variable top value
            m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
            // remove it from the list
            if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first in list
                m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
            }
            if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
                // last in list
                m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
            }
            if (m_parser_.m_variableTop_.m_next_ != null) {
                m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
            }
            if (m_parser_.m_variableTop_.m_previous_ != null) {
                m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
            }
        }

        BuildTable t = new BuildTable(m_parser_);

        // After this, we have assigned CE values to all regular CEs now we
        // will go through list once more and resolve expansions, make
        // UCAElements structs and add them to table
        for (int i = 0; i < m_parser_.m_resultLength_; i++) {
            // now we need to generate the CEs
            // We stuff the initial value in the buffers, and increase the
            // appropriate buffer according to strength */
            createElements(t, m_parser_.m_listHeader_[i]);
        }

        m_utilElement_.clear();
        StringBuffer str = new StringBuffer();

        // add latin-1 stuff
        copyRangeFromUCA(t, 0, 0xFF);

        // add stuff for copying ([copy] ranges requested by the rules)
        if (m_parser_.m_copySet_ != null) {
            int i = 0;
            for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
                copyRangeFromUCA(t, m_parser_.m_copySet_
                        .getRangeStart(i), m_parser_.m_copySet_
                        .getRangeEnd(i));
            }
        }

        // copy contractions from the UCA - this is felt mostly for cyrillic
        // UCA_CONTRACTIONS_ is scanned in groups of three chars; a 0 first
        // char terminates the list.
        char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
        int offset = 0;
        while (conts[offset] != 0) {
            // tailoredCE = ucmpe32_get(t.m_mapping, *conts);
            int tailoredCE = t.m_mapping_.getValue(conts[offset]);
            if (tailoredCE != CE_NOT_FOUND_) {
                boolean needToAdd = true;
                // skip UCA contractions that the tailoring already covers
                if (isContractionTableElement(tailoredCE)) {
                    if (isTailored(t.m_contractions_, tailoredCE,
                            conts, offset + 1) == true) {
                        needToAdd = false;
                    }
                }
                // never re-add contractions the rules explicitly removed
                if (m_parser_.m_removeSet_ != null
                        && m_parser_.m_removeSet_
                                .contains(conts[offset])) {
                    needToAdd = false;
                }

                if (needToAdd == true) {
                    // we need to add if this contraction is not tailored.
                    m_utilElement_.m_prefix_ = 0;
                    m_utilElement_.m_prefixChars_ = null;
                    m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
                    str.delete(0, str.length());
                    str.append(conts[offset]);
                    str.append(conts[offset + 1]);
                    if (conts[offset + 2] != 0) {
                        // third char is optional (two-char contractions
                        // store a 0 there)
                        str.append(conts[offset + 2]);
                    }
                    m_utilElement_.m_uchars_ = str.toString();
                    m_utilElement_.m_CELength_ = 0;
                    m_utilColEIter_.setText(m_utilElement_.m_uchars_);
                    // collect the UCA CE sequence for the contraction
                    while (true) {
                        int CE = m_utilColEIter_.next();
                        if (CE != CollationElementIterator.NULLORDER) {
                            m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
                        } else {
                            break;
                        }
                    }
                    addAnElement(t, m_utilElement_);
                }
            } else if (m_parser_.m_removeSet_ != null
                    && m_parser_.m_removeSet_.contains(conts[offset])) {
                // removed code point: restore its plain UCA mapping
                copyRangeFromUCA(t, conts[offset], conts[offset]);
            }

            offset += 3;
        }

        // Add completely ignorable elements
        processUCACompleteIgnorables(t);

        // canonical closure
        canonicalClosure(t);

        // still need to produce compatibility closure
        assembleTable(t, collator);
    }
0645:
0646: // private inner classes -------------------------------------------------
0647:
0648: private static class CEGenerator {
0649: // package private data members --------------------------------------
0650:
0651: WeightRange m_ranges_[];
0652: int m_rangesLength_;
0653: int m_byteSize_;
0654: int m_start_;
0655: int m_limit_;
0656: int m_maxCount_;
0657: int m_count_;
0658: int m_current_;
0659: int m_fLow_; // forbidden Low
0660: int m_fHigh_; // forbidden High
0661:
0662: // package private constructor ---------------------------------------
0663:
0664: CEGenerator() {
0665: m_ranges_ = new WeightRange[7];
0666: for (int i = 6; i >= 0; i--) {
0667: m_ranges_[i] = new WeightRange();
0668: }
0669: }
0670: }
0671:
0672: private static class WeightRange implements Comparable {
0673: // public methods ----------------------------------------------------
0674:
0675: /**
0676: * Compares this object with target
0677: * @param target object to compare with
0678: * @return 0 if equals, 1 if this is > target, -1 otherwise
0679: */
0680: public int compareTo(Object target) {
0681: if (this == target) {
0682: return 0;
0683: }
0684: int tstart = ((WeightRange) target).m_start_;
0685: if (m_start_ == tstart) {
0686: return 0;
0687: }
0688: if (m_start_ > tstart) {
0689: return 1;
0690: }
0691: return -1;
0692: }
0693:
0694: /**
0695: * Initialize
0696: */
0697: public void clear() {
0698: m_start_ = 0;
0699: m_end_ = 0;
0700: m_length_ = 0;
0701: m_count_ = 0;
0702: m_length2_ = 0;
0703: m_count2_ = 0;
0704: }
0705:
0706: // package private data members --------------------------------------
0707:
0708: int m_start_;
0709: int m_end_;
0710: int m_length_;
0711: int m_count_;
0712: int m_length2_;
0713: int m_count2_;
0714:
0715: // package private constructor ---------------------------------------
0716:
0717: WeightRange() {
0718: clear();
0719: }
0720:
0721: /**
0722: * Copy constructor.
0723: * Cloneable is troublesome, needs to check for exception
0724: * @param source to clone
0725: */
0726: WeightRange(WeightRange source) {
0727: m_start_ = source.m_start_;
0728: m_end_ = source.m_end_;
0729: m_length_ = source.m_length_;
0730: m_count_ = source.m_count_;
0731: m_length2_ = source.m_length2_;
0732: m_count2_ = source.m_count2_;
0733: }
0734: }
0735:
0736: private static class MaxJamoExpansionTable {
0737: // package private data members --------------------------------------
0738:
0739: Vector m_endExpansionCE_;
0740: // vector of booleans
0741: Vector m_isV_;
0742: byte m_maxLSize_;
0743: byte m_maxVSize_;
0744: byte m_maxTSize_;
0745:
0746: // package private constructor ---------------------------------------
0747:
0748: MaxJamoExpansionTable() {
0749: m_endExpansionCE_ = new Vector();
0750: m_isV_ = new Vector();
0751: m_endExpansionCE_.add(new Integer(0));
0752: m_isV_.add(new Boolean(false));
0753: m_maxLSize_ = 1;
0754: m_maxVSize_ = 1;
0755: m_maxTSize_ = 1;
0756: }
0757:
0758: MaxJamoExpansionTable(MaxJamoExpansionTable table) {
0759: m_endExpansionCE_ = (Vector) table.m_endExpansionCE_
0760: .clone();
0761: m_isV_ = (Vector) table.m_isV_.clone();
0762: m_maxLSize_ = table.m_maxLSize_;
0763: m_maxVSize_ = table.m_maxVSize_;
0764: m_maxTSize_ = table.m_maxTSize_;
0765: }
0766: }
0767:
0768: private static class MaxExpansionTable {
0769: // package private constructor --------------------------------------
0770:
0771: MaxExpansionTable() {
0772: m_endExpansionCE_ = new Vector();
0773: m_expansionCESize_ = new Vector();
0774: m_endExpansionCE_.add(new Integer(0));
0775: m_expansionCESize_.add(new Byte((byte) 0));
0776: }
0777:
0778: MaxExpansionTable(MaxExpansionTable table) {
0779: m_endExpansionCE_ = (Vector) table.m_endExpansionCE_
0780: .clone();
0781: m_expansionCESize_ = (Vector) table.m_expansionCESize_
0782: .clone();
0783: }
0784:
0785: // package private data member --------------------------------------
0786:
0787: Vector m_endExpansionCE_;
0788: Vector m_expansionCESize_;
0789: }
0790:
0791: private static class BasicContractionTable {
0792: // package private constructors -------------------------------------
0793:
0794: BasicContractionTable() {
0795: m_CEs_ = new Vector();
0796: m_codePoints_ = new StringBuffer();
0797: }
0798:
0799: // package private data members -------------------------------------
0800:
0801: StringBuffer m_codePoints_;
0802: Vector m_CEs_;
0803: }
0804:
0805: private static class ContractionTable {
0806: // package private constructor --------------------------------------
0807:
0808: /**
0809: * Builds a contraction table
0810: * @param mapping
0811: */
0812: ContractionTable(IntTrieBuilder mapping) {
0813: m_mapping_ = mapping;
0814: m_elements_ = new Vector();
0815: m_CEs_ = new Vector();
0816: m_codePoints_ = new StringBuffer();
0817: m_offsets_ = new Vector();
0818: m_currentTag_ = CE_NOT_FOUND_TAG_;
0819: }
0820:
0821: /**
0822: * Copies a contraction table.
0823: * Not all data will be copied into their own object.
0824: * @param table
0825: */
0826: ContractionTable(ContractionTable table) {
0827: m_mapping_ = table.m_mapping_;
0828: m_elements_ = (Vector) table.m_elements_.clone();
0829: m_codePoints_ = new StringBuffer(table.m_codePoints_
0830: .toString());
0831: m_CEs_ = (Vector) table.m_CEs_.clone();
0832: m_offsets_ = (Vector) table.m_offsets_.clone();
0833: m_currentTag_ = table.m_currentTag_;
0834: }
0835:
0836: // package private data members ------------------------------------
0837:
0838: /**
0839: * Vector of BasicContractionTable
0840: */
0841: Vector m_elements_;
0842: IntTrieBuilder m_mapping_;
0843: StringBuffer m_codePoints_;
0844: Vector m_CEs_;
0845: Vector m_offsets_;
0846: int m_currentTag_;
0847: }
0848:
0849: private static final class BuildTable implements
0850: TrieBuilder.DataManipulate {
0851: // package private methods ------------------------------------------
0852:
0853: /**
0854: * For construction of the Trie tables.
0855: * Has to be labeled public
0856: * @param cp
0857: * @param offset
0858: * @return data offset or 0
0859: * @draft 2.2
0860: */
0861: public int getFoldedValue(int cp, int offset) {
0862: int limit = cp + 0x400;
0863: while (cp < limit) {
0864: int value = m_mapping_.getValue(cp);
0865: boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
0866: int tag = getCETag(value);
0867: if (inBlockZero == true) {
0868: cp += TrieBuilder.DATA_BLOCK_LENGTH;
0869: } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
0870: // These are values that are starting in either UCA
0871: // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
0872: // Presence of these tags means that there is nothing in
0873: // this position and that it should be skipped.
0874: return RuleBasedCollator.CE_SPECIAL_FLAG_
0875: | (CE_SURROGATE_TAG_ << 24) | offset;
0876: } else {
0877: ++cp;
0878: }
0879: }
0880: return 0;
0881: }
0882:
0883: // package private constructor --------------------------------------
0884:
0885: /**
0886: * Returns a table
0887: */
0888: BuildTable(CollationRuleParser parser) {
0889: m_collator_ = new RuleBasedCollator();
0890: m_collator_.setWithUCAData();
0891: MaxExpansionTable maxet = new MaxExpansionTable();
0892: MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
0893: m_options_ = parser.m_options_;
0894: m_expansions_ = new Vector();
0895: // Do your own mallocs for the structure, array and have linear
0896: // Latin 1
0897: int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
0898: | (CE_NOT_FOUND_TAG_ << 24);
0899: // temporary fix for jb3822, 0x100000 -> 30000
0900: m_mapping_ = new IntTrieBuilder(null, 0x30000,
0901: trieinitialvalue, trieinitialvalue, true);
0902: m_prefixLookup_ = new Hashtable();
0903: // uhash_open(prefixLookupHash, prefixLookupComp);
0904: m_contractions_ = new ContractionTable(m_mapping_);
0905: // copy UCA's maxexpansion and merge as we go along
0906: m_maxExpansions_ = maxet;
0907: // adding an extra initial value for easier manipulation
0908: for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
0909: maxet.m_endExpansionCE_.add(new Integer(
0910: RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
0911: maxet.m_expansionCESize_
0912: .add(new Byte(
0913: RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
0914: }
0915: m_maxJamoExpansions_ = maxjet;
0916:
0917: m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
0918: m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
0919: Arrays.fill(m_unsafeCP_, (byte) 0);
0920: Arrays.fill(m_contrEndCP_, (byte) 0);
0921: }
0922:
0923: /**
0924: * Duplicating a BuildTable.
0925: * Not all data will be duplicated into their own object.
0926: * @param table to clone
0927: */
0928: BuildTable(BuildTable table) {
0929: m_collator_ = table.m_collator_;
0930: m_mapping_ = new IntTrieBuilder(table.m_mapping_);
0931: m_expansions_ = (Vector) table.m_expansions_.clone();
0932: m_contractions_ = new ContractionTable(
0933: table.m_contractions_);
0934: m_contractions_.m_mapping_ = m_mapping_;
0935: m_options_ = table.m_options_;
0936: m_maxExpansions_ = new MaxExpansionTable(
0937: table.m_maxExpansions_);
0938: m_maxJamoExpansions_ = new MaxJamoExpansionTable(
0939: table.m_maxJamoExpansions_);
0940: m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
0941: System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
0942: m_unsafeCP_.length);
0943: m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
0944: System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
0945: m_contrEndCP_.length);
0946: }
0947:
// package private data members -------------------------------------

/** collator being built */
RuleBasedCollator m_collator_;
/** trie mapping code points to CEs or special tagged values */
IntTrieBuilder m_mapping_;
/** storage for expansion CEs */
Vector m_expansions_;
/** contraction data for this table; its m_mapping_ aliases ours */
ContractionTable m_contractions_;
// UCATableHeader image;
/** attribute options produced by the rule parser */
CollationRuleParser.OptionSet m_options_;
/** maximum expansion sizes, seeded from the UCA's and merged as we go */
MaxExpansionTable m_maxExpansions_;
/** maximum Jamo expansion sizes */
MaxJamoExpansionTable m_maxJamoExpansions_;
/** unsafe-codepoint bitset (1 bit per latin-1 char + hashed rest) */
byte m_unsafeCP_[];
/** bitset of code points that can end a contraction */
byte m_contrEndCP_[];
/** lookup of prefix starter elements seen so far */
Hashtable m_prefixLookup_;
}
0962:
0963: private static class Elements {
0964: // package private data members -------------------------------------
0965:
0966: String m_prefixChars_;
0967: int m_prefix_;
0968: String m_uchars_;
0969: /**
0970: * Working string
0971: */
0972: String m_cPoints_;
0973: /**
0974: * Offset to the working string
0975: */
0976: int m_cPointsOffset_;
0977: /**
0978: * These are collation elements - there could be more than one - in
0979: * case of expansion
0980: */
0981: int m_CEs_[];
0982: int m_CELength_;
0983: /**
0984: * This is the value element maps in original table
0985: */
0986: int m_mapCE_;
0987: int m_sizePrim_[];
0988: int m_sizeSec_[];
0989: int m_sizeTer_[];
0990: boolean m_variableTop_;
0991: boolean m_caseBit_;
0992:
0993: // package private constructors -------------------------------------
0994:
0995: /**
0996: * Package private constructor
0997: */
0998: Elements() {
0999: m_sizePrim_ = new int[128];
1000: m_sizeSec_ = new int[128];
1001: m_sizeTer_ = new int[128];
1002: m_CEs_ = new int[256];
1003: m_CELength_ = 0;
1004: }
1005:
1006: /**
1007: * Package private constructor
1008: */
1009: Elements(Elements element) {
1010: m_prefixChars_ = element.m_prefixChars_;
1011: m_prefix_ = element.m_prefix_;
1012: m_uchars_ = element.m_uchars_;
1013: m_cPoints_ = element.m_cPoints_;
1014: m_cPointsOffset_ = element.m_cPointsOffset_;
1015: m_CEs_ = element.m_CEs_;
1016: m_CELength_ = element.m_CELength_;
1017: m_mapCE_ = element.m_mapCE_;
1018: m_sizePrim_ = element.m_sizePrim_;
1019: m_sizeSec_ = element.m_sizeSec_;
1020: m_sizeTer_ = element.m_sizeTer_;
1021: m_variableTop_ = element.m_variableTop_;
1022: m_caseBit_ = element.m_caseBit_;
1023: }
1024:
1025: // package private methods -------------------------------------------
1026:
1027: /**
1028: * Initializing the elements
1029: */
1030: public void clear() {
1031: m_prefixChars_ = null;
1032: m_prefix_ = 0;
1033: m_uchars_ = null;
1034: m_cPoints_ = null;
1035: m_cPointsOffset_ = 0;
1036: m_CELength_ = 0;
1037: m_mapCE_ = 0;
1038: Arrays.fill(m_sizePrim_, 0);
1039: Arrays.fill(m_sizeSec_, 0);
1040: Arrays.fill(m_sizeTer_, 0);
1041: m_variableTop_ = false;
1042: m_caseBit_ = false;
1043: }
1044:
1045: /**
1046: * Hashcode calculation for token
1047: * @return the hashcode
1048: */
1049: public int hashCode() {
1050: String str = m_cPoints_.substring(m_cPointsOffset_);
1051: return str.hashCode();
1052: }
1053:
1054: /**
1055: * Equals calculation
1056: * @param target object to compare
1057: * @return true if target is the same as this object
1058: */
1059: public boolean equals(Object target) {
1060: if (target == this ) {
1061: return true;
1062: }
1063: if (target instanceof Elements) {
1064: Elements t = (Elements) target;
1065: int size = m_cPoints_.length() - m_cPointsOffset_;
1066: if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
1067: return t.m_cPoints_.regionMatches(
1068: t.m_cPointsOffset_, m_cPoints_,
1069: m_cPointsOffset_, size);
1070: }
1071: }
1072: return false;
1073: }
1074: }
1075:
// private data member ---------------------------------------------------

/**
 * Maximum strength used in CE building
 */
private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
/**
 * Maximum collation strength
 */
private static final int CE_STRENGTH_LIMIT_ = 16;
/**
 * Strength mask array, used in inverse UCA; index is the strength,
 * value keeps the bytes significant at that strength
 */
private static final int STRENGTH_MASK_[] = { 0xFFFF0000,
        0xFFFFFF00, 0xFFFFFFFF };
/**
 * CE tag for not found
 */
private static final int CE_NOT_FOUND_ = 0xF0000000;
/**
 * Tag value (stored in bits 24..27 of a special CE) for "not found"
 */
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
 * This code point results in an expansion
 */
private static final int CE_EXPANSION_TAG_ = 1;
/**
 * Start of a contraction
 */
private static final int CE_CONTRACTION_TAG_ = 2;
/**
 * Thai character - do the reordering
 */
private static final int CE_THAI_TAG_ = 3;
/**
 * Charset processing, not yet implemented
 */
private static final int CE_CHARSET_TAG_ = 4;
/**
 * Lead surrogate that is tailored and doesn't start a contraction
 */
private static final int CE_SURROGATE_TAG_ = 5;
/**
 * AC00-D7AF
 */
private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/**
 * D800-DBFF
 */
private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/**
 * DC00-DFFF
 */
private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/**
 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
 */
private static final int CE_CJK_IMPLICIT_TAG_ = 9;
/**
 * Implicit CE generation for other (non-CJK) unassigned code points
 */
private static final int CE_IMPLICIT_TAG_ = 10;
/**
 * Presumably marks CEs needing prefix (special) processing — not
 * visible in this chunk; verify against ICU4C's tag list
 */
private static final int CE_SPEC_PROC_TAG_ = 11;
/**
 * This is a three byte primary with starting secondaries and tertiaries.
 * It fits in a single 32 bit CE and is used instead of expansion to save
 * space without affecting the performance (hopefully)
 */
private static final int CE_LONG_PRIMARY_TAG_ = 12;
/**
 * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each
 * latin 1 char + some power of two for hashing the rest of the chars.
 * Size in bytes
 */
private static final int UNSAFECP_TABLE_SIZE_ = 1056;
/**
 * Mask value down to "some power of two" -1. Number of bits, not num of
 * bytes.
 */
private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
/**
 * Case values stored in bits 6..7 of a CE
 */
private static final int UPPER_CASE_ = 0x80;
private static final int MIXED_CASE_ = 0x40;
private static final int LOWER_CASE_ = 0x00;
/**
 * Initial table size
 */
private static final int INIT_TABLE_SIZE_ = 1028;
/**
 * Header size, copied from ICU4C, to be changed when that value changes
 */
private static final int HEADER_SIZE_ = 0xC4;
/**
 * Contraction table new element indicator
 */
private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
/**
 * Parser for the rules
 */
private CollationRuleParser m_parser_;
/**
 * Utility UCA collation element iterator
 */
private CollationElementIterator m_utilColEIter_;
/**
 * Utility data members: reusable scratch objects shared across build
 * passes to avoid repeated allocation (the builder is single-use)
 */
// one CE generator per basic strength (primary/secondary/tertiary)
private CEGenerator m_utilGens_[] = { new CEGenerator(),
        new CEGenerator(), new CEGenerator() };
// one CE part per basic strength
private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
// per-strength counters (see initBuffers)
private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
// reusable elements and token for hash lookups and table insertion
private Elements m_utilElement_ = new Elements();
private Elements m_utilElement2_ = new Elements();
private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
private int m_utilCountBuffer_[] = new int[6];
private long m_utilLongBuffer_[] = new long[5];
// weight range scratch arrays used during weight allocation
private WeightRange m_utilLowerWeightRange_[] = {
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange(), new WeightRange() };
private WeightRange m_utilUpperWeightRange_[] = {
        new WeightRange(), new WeightRange(), new WeightRange(),
        new WeightRange(), new WeightRange() };
private WeightRange m_utilWeightRange_ = new WeightRange();
private char m_utilCharBuffer_[] = new char[256];
private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
private StringBuffer m_utilStringBuffer_ = new StringBuffer("");
1202:
1203: // private methods -------------------------------------------------------
1204:
1205: /**
1206: * @param listheader parsed rule tokens
1207: * @exception Exception thrown when internal error occurs
1208: */
1209: private void initBuffers(
1210: CollationRuleParser.TokenListHeader listheader)
1211: throws Exception {
1212: CollationRuleParser.Token token = listheader.m_last_;
1213: Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
1214:
1215: token.m_toInsert_ = 1;
1216: m_utilIntBuffer_[token.m_strength_] = 1;
1217: while (token.m_previous_ != null) {
1218: if (token.m_previous_.m_strength_ < token.m_strength_) {
1219: // going up
1220: m_utilIntBuffer_[token.m_strength_] = 0;
1221: m_utilIntBuffer_[token.m_previous_.m_strength_]++;
1222: } else if (token.m_previous_.m_strength_ > token.m_strength_) {
1223: // going down
1224: m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
1225: } else {
1226: m_utilIntBuffer_[token.m_strength_]++;
1227: }
1228: token = token.m_previous_;
1229: token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1230: }
1231:
1232: token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1233: INVERSE_UCA_.getInverseGapPositions(listheader);
1234:
1235: token = listheader.m_first_;
1236: int fstrength = Collator.IDENTICAL;
1237: int initstrength = Collator.IDENTICAL;
1238:
1239: m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(
1240: listheader.m_baseCE_, listheader.m_baseContCE_,
1241: Collator.PRIMARY);
1242: m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(
1243: listheader.m_baseCE_, listheader.m_baseContCE_,
1244: Collator.SECONDARY);
1245: m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(
1246: listheader.m_baseCE_, listheader.m_baseContCE_,
1247: Collator.TERTIARY);
1248: while (token != null) {
1249: fstrength = token.m_strength_;
1250: if (fstrength < initstrength) {
1251: initstrength = fstrength;
1252: if (listheader.m_pos_[fstrength] == -1) {
1253: while (listheader.m_pos_[fstrength] == -1
1254: && fstrength > 0) {
1255: fstrength--;
1256: }
1257: if (listheader.m_pos_[fstrength] == -1) {
1258: throw new Exception("Internal program error");
1259: }
1260: }
1261: if (initstrength == Collator.TERTIARY) {
1262: // starting with tertiary
1263: m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1264: m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
1265: m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
1266: m_utilGens_[Collator.TERTIARY],
1267: listheader.m_gapsLo_, listheader.m_gapsHi_,
1268: token, fstrength);
1269: } else if (initstrength == Collator.SECONDARY) {
1270: // secondaries
1271: m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1272: m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
1273: m_utilGens_[Collator.SECONDARY],
1274: listheader.m_gapsLo_, listheader.m_gapsHi_,
1275: token, fstrength);
1276: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1277: m_utilGens_[Collator.TERTIARY], token,
1278: Collator.TERTIARY);
1279: } else {
1280: // primaries
1281: m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
1282: m_utilGens_[Collator.PRIMARY],
1283: listheader.m_gapsLo_, listheader.m_gapsHi_,
1284: token, fstrength);
1285: m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1286: m_utilGens_[Collator.SECONDARY], token,
1287: Collator.SECONDARY);
1288: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1289: m_utilGens_[Collator.TERTIARY], token,
1290: Collator.TERTIARY);
1291: }
1292: } else {
1293: if (token.m_strength_ == Collator.TERTIARY) {
1294: m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
1295: } else if (token.m_strength_ == Collator.SECONDARY) {
1296: m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
1297: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1298: m_utilGens_[Collator.TERTIARY], token,
1299: Collator.TERTIARY);
1300: } else if (token.m_strength_ == Collator.PRIMARY) {
1301: m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
1302: m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1303: m_utilGens_[Collator.SECONDARY], token,
1304: Collator.SECONDARY);
1305: m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1306: m_utilGens_[Collator.TERTIARY], token,
1307: Collator.TERTIARY);
1308: }
1309: }
1310: doCE(m_utilCEBuffer_, token);
1311: token = token.m_next_;
1312: }
1313: }
1314:
1315: /**
1316: * Get the next generated ce
1317: * @param g ce generator
1318: * @return next generated ce
1319: */
1320: private int getNextGenerated(CEGenerator g) {
1321: g.m_current_ = nextWeight(g);
1322: return g.m_current_;
1323: }
1324:
1325: /**
1326: * @param g CEGenerator
1327: * @param token rule token
1328: * @param strength
1329: * @return ce generator
1330: * @exception Exception thrown when internal error occurs
1331: */
1332: private int getSimpleCEGenerator(CEGenerator g,
1333: CollationRuleParser.Token token, int strength)
1334: throws Exception {
1335: int high, low, count = 1;
1336: int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
1337:
1338: if (strength == Collator.SECONDARY) {
1339: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1340: high = 0xFFFFFFFF;
1341: count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
1342: } else {
1343: low = RuleBasedCollator.BYTE_COMMON_ << 24; //0x05000000;
1344: high = 0x40000000;
1345: count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
1346: }
1347:
1348: if (token.m_next_ != null
1349: && token.m_next_.m_strength_ == strength) {
1350: count = token.m_next_.m_toInsert_;
1351: }
1352:
1353: g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1354: g.m_ranges_);
1355: g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
1356:
1357: if (g.m_rangesLength_ == 0) {
1358: throw new Exception("Internal program error");
1359: }
1360: return g.m_current_;
1361: }
1362:
1363: /**
1364: * Combines 2 ce into one with respect to the argument strength
1365: * @param ce1 first ce
1366: * @param ce2 second ce
1367: * @param strength strength to use
1368: * @return combined ce
1369: */
1370: private static int mergeCE(int ce1, int ce2, int strength) {
1371: int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
1372: if (strength == Collator.SECONDARY) {
1373: mask = RuleBasedCollator.CE_SECONDARY_MASK_;
1374: } else if (strength == Collator.PRIMARY) {
1375: mask = RuleBasedCollator.CE_PRIMARY_MASK_;
1376: }
1377: ce1 &= mask;
1378: ce2 &= mask;
1379: switch (strength) {
1380: case Collator.PRIMARY:
1381: return ce1 | ce2 >>> 16;
1382: case Collator.SECONDARY:
1383: return ce1 << 16 | ce2 << 8;
1384: default:
1385: return ce1 << 24 | ce2 << 16;
1386: }
1387: }
1388:
1389: /**
1390: * @param g CEGenerator
1391: * @param lows low gap array
1392: * @param highs high gap array
1393: * @param token rule token
1394: * @param fstrength
1395: * @exception Exception thrown when internal error occurs
1396: */
1397: private int getCEGenerator(CEGenerator g, int lows[], int highs[],
1398: CollationRuleParser.Token token, int fstrength)
1399: throws Exception {
1400: int strength = token.m_strength_;
1401: int low = lows[fstrength * 3 + strength];
1402: int high = highs[fstrength * 3 + strength];
1403: int maxbyte = 0;
1404: if (strength == Collator.TERTIARY) {
1405: maxbyte = 0x3F;
1406: } else if (strength == Collator.PRIMARY) {
1407: maxbyte = 0xFE;
1408: } else {
1409: maxbyte = 0xFF;
1410: }
1411:
1412: int count = token.m_toInsert_;
1413:
1414: if (Utility.compareUnsigned(low, high) >= 0
1415: && strength > Collator.PRIMARY) {
1416: int s = strength;
1417: while (true) {
1418: s--;
1419: if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
1420: if (strength == Collator.SECONDARY) {
1421: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1422: high = 0xFFFFFFFF;
1423: } else {
1424: // low = 0x02000000;
1425: // This needs to be checked - what if low is
1426: // not good...
1427: high = 0x40000000;
1428: }
1429: break;
1430: }
1431: if (s < 0) {
1432: throw new Exception("Internal program error");
1433: }
1434: }
1435: }
1436: if (low == 0) {
1437: low = 0x01000000;
1438: }
1439: if (strength == Collator.SECONDARY) { // similar as simple
1440: if (Utility.compareUnsigned(low,
1441: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
1442: && Utility.compareUnsigned(low,
1443: RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1444: low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1445: }
1446: if (Utility.compareUnsigned(high,
1447: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
1448: && Utility.compareUnsigned(high,
1449: RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1450: high = RuleBasedCollator.COMMON_TOP_2_ << 24;
1451: }
1452: if (Utility.compareUnsigned(low,
1453: RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
1454: g.m_rangesLength_ = allocateWeights(
1455: RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24,
1456: high, count, maxbyte, g.m_ranges_);
1457: g.m_current_ = nextWeight(g);
1458: //g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
1459: return g.m_current_;
1460: }
1461: }
1462:
1463: g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1464: g.m_ranges_);
1465: if (g.m_rangesLength_ == 0) {
1466: throw new Exception("Internal program error");
1467: }
1468: g.m_current_ = nextWeight(g);
1469: return g.m_current_;
1470: }
1471:
1472: /**
1473: * @param ceparts list of collation elements parts
1474: * @param token rule token
1475: * @exception Exception thrown when forming case bits for expansions fails
1476: */
1477: private void doCE(int ceparts[], CollationRuleParser.Token token)
1478: throws Exception {
1479: // this one makes the table and stuff
1480: // int noofbytes[] = new int[3];
1481: for (int i = 0; i < 3; i++) {
1482: // noofbytes[i] = countBytes(ceparts[i]);
1483: m_utilIntBuffer_[i] = countBytes(ceparts[i]);
1484: }
1485:
1486: // Here we have to pack CEs from parts
1487: int cei = 0;
1488: int value = 0;
1489:
1490: while ((cei << 1) < m_utilIntBuffer_[0]
1491: || cei < m_utilIntBuffer_[1]
1492: || cei < m_utilIntBuffer_[2]) {
1493: if (cei > 0) {
1494: value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
1495: } else {
1496: value = 0;
1497: }
1498:
1499: if ((cei << 1) < m_utilIntBuffer_[0]) {
1500: value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
1501: }
1502: if (cei < m_utilIntBuffer_[1]) {
1503: value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
1504: }
1505:
1506: if (cei < m_utilIntBuffer_[2]) {
1507: value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
1508: }
1509: token.m_CE_[cei] = value;
1510: cei++;
1511: }
1512: if (cei == 0) { // totally ignorable
1513: token.m_CELength_ = 1;
1514: token.m_CE_[0] = 0;
1515: } else { // there is at least something
1516: token.m_CELength_ = cei;
1517: }
1518:
1519: // Case bits handling for expansion
1520: if (token.m_CE_[0] != 0) { // case bits should be set only for non-ignorables
1521: int startoftokenrule = token.m_source_ & 0xFF;
1522: if ((token.m_source_ >>> 24) > 1) {
1523: // Do it manually
1524: int length = token.m_source_ >>> 24;
1525: String tokenstr = token.m_rules_.substring(
1526: startoftokenrule, startoftokenrule + length);
1527: token.m_CE_[0] |= getCaseBits(tokenstr);
1528: } else {
1529: // Copy it from the UCA
1530: int caseCE = getFirstCE(token.m_rules_
1531: .charAt(startoftokenrule));
1532: token.m_CE_[0] |= (caseCE & 0xC0);
1533: }
1534: }
1535: }
1536:
1537: /**
1538: * Count the number of non-zero bytes used in the ce
1539: * @param ce
1540: * @return number of non-zero bytes used in ce
1541: */
1542: private static final int countBytes(int ce) {
1543: int mask = 0xFFFFFFFF;
1544: int result = 0;
1545: while (mask != 0) {
1546: if ((ce & mask) != 0) {
1547: result++;
1548: }
1549: mask >>>= 8;
1550: }
1551: return result;
1552: }
1553:
1554: /**
1555: * We are ready to create collation elements
1556: * @param t build table to insert
1557: * @param lh rule token list header
1558: */
1559: private void createElements(BuildTable t,
1560: CollationRuleParser.TokenListHeader lh) {
1561: CollationRuleParser.Token tok = lh.m_first_;
1562: m_utilElement_.clear();
1563: while (tok != null) {
1564: // first, check if there are any expansions
1565: // if there are expansions, we need to do a little bit more
1566: // processing since parts of expansion can be tailored, while
1567: // others are not
1568: if (tok.m_expansion_ != 0) {
1569: int len = tok.m_expansion_ >>> 24;
1570: int currentSequenceLen = len;
1571: int expOffset = tok.m_expansion_ & 0x00FFFFFF;
1572: m_utilToken_.m_source_ = currentSequenceLen | expOffset;
1573: m_utilToken_.m_rules_ = m_parser_.m_source_;
1574:
1575: while (len > 0) {
1576: currentSequenceLen = len;
1577: while (currentSequenceLen > 0) {
1578: m_utilToken_.m_source_ = (currentSequenceLen << 24)
1579: | expOffset;
1580: CollationRuleParser.Token expt = (CollationRuleParser.Token) m_parser_.m_hashTable_
1581: .get(m_utilToken_);
1582: if (expt != null
1583: && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
1584: // expansion is tailored
1585: int noOfCEsToCopy = expt.m_CELength_;
1586: for (int j = 0; j < noOfCEsToCopy; j++) {
1587: tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
1588: }
1589: tok.m_expCELength_ += noOfCEsToCopy;
1590: // never try to add codepoints and CEs.
1591: // For some odd reason, it won't work.
1592: expOffset += currentSequenceLen; //noOfCEsToCopy;
1593: len -= currentSequenceLen; //noOfCEsToCopy;
1594: break;
1595: } else {
1596: currentSequenceLen--;
1597: }
1598: }
1599: if (currentSequenceLen == 0) {
1600: // couldn't find any tailored subsequence, will have to
1601: // get one from UCA. first, get the UChars from the
1602: // rules then pick CEs out until there is no more and
1603: // stuff them into expansion
1604: m_utilColEIter_.setText(m_parser_.m_source_
1605: .substring(expOffset, expOffset + 1));
1606: while (true) {
1607: int order = m_utilColEIter_.next();
1608: if (order == CollationElementIterator.NULLORDER) {
1609: break;
1610: }
1611: tok.m_expCE_[tok.m_expCELength_++] = order;
1612: }
1613: expOffset++;
1614: len--;
1615: }
1616: }
1617: } else {
1618: tok.m_expCELength_ = 0;
1619: }
1620:
1621: // set the ucaelement with obtained values
1622: m_utilElement_.m_CELength_ = tok.m_CELength_
1623: + tok.m_expCELength_;
1624:
1625: // copy CEs
1626: System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
1627: tok.m_CELength_);
1628: System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
1629: tok.m_CELength_, tok.m_expCELength_);
1630:
1631: // copy UChars
1632: // We kept prefix and source kind of together, as it is a kind of a
1633: // contraction.
1634: // However, now we have to slice the prefix off the main thing -
1635: m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
1636: m_utilElement_.m_cPointsOffset_ = 0; //el.m_uchars_;
1637: if (tok.m_prefix_ != 0) {
1638: // we will just copy the prefix here, and adjust accordingly in
1639: // the addPrefix function in ucol_elm. The reason is that we
1640: // need to add both composed AND decomposed elements to the
1641: // unsafe table.
1642: int size = tok.m_prefix_ >> 24;
1643: int offset = tok.m_prefix_ & 0x00FFFFFF;
1644: m_utilElement_.m_prefixChars_ = m_parser_.m_source_
1645: .substring(offset, offset + size);
1646: size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
1647: offset = (tok.m_source_ & 0x00FFFFFF)
1648: + (tok.m_prefix_ >> 24);
1649: m_utilElement_.m_uchars_ = m_parser_.m_source_
1650: .substring(offset, offset + size);
1651: } else {
1652: m_utilElement_.m_prefixChars_ = null;
1653: int offset = tok.m_source_ & 0x00FFFFFF;
1654: int size = tok.m_source_ >>> 24;
1655: m_utilElement_.m_uchars_ = m_parser_.m_source_
1656: .substring(offset, offset + size);
1657: }
1658: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
1659: for (int i = 0; i < m_utilElement_.m_cPoints_.length()
1660: - m_utilElement_.m_cPointsOffset_; i++) {
1661: if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
1662: t.m_collator_.m_isJamoSpecial_ = true;
1663: break;
1664: }
1665: }
1666:
1667: /***
1668:
1669: // Case bits handling
1670: m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F;
1671: // Clean the case bits field
1672: if (m_utilElement_.m_cPoints_.length()
1673: - m_utilElement_.m_cPointsOffset_ > 1) {
1674: // Do it manually
1675: m_utilElement_.m_CEs_[0]
1676: |= getCaseBits(m_utilElement_.m_cPoints_);
1677: }
1678: else {
1679: // Copy it from the UCA
1680: int caseCE = getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
1681: m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0);
1682: }
1683:
1684: ***/
1685: // and then, add it
1686: addAnElement(t, m_utilElement_);
1687: tok = tok.m_next_;
1688: }
1689: }
1690:
1691: /**
1692: * Testing if the string argument has case
1693: * @param src string
1694: * @return the case for this char array
1695: * @exception Exception thrown when internal program error occurs
1696: */
1697: private final int getCaseBits(String src) throws Exception {
1698: int uCount = 0;
1699: int lCount = 0;
1700: src = Normalizer.decompose(src, true);
1701: m_utilColEIter_.setText(src);
1702: for (int i = 0; i < src.length(); i++) {
1703: m_utilColEIter_.setText(src.substring(i, i + 1));
1704: int order = m_utilColEIter_.next();
1705: if (RuleBasedCollator.isContinuation(order)) {
1706: throw new Exception("Internal program error");
1707: }
1708: if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
1709: uCount++;
1710: } else {
1711: char ch = src.charAt(i);
1712: if (UCharacter.isLowerCase(ch)) {
1713: lCount++;
1714: } else {
1715: if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
1716: lCount++;
1717: }
1718: }
1719: }
1720: }
1721:
1722: if (uCount != 0 && lCount != 0) {
1723: return MIXED_CASE_;
1724: } else if (uCount != 0) {
1725: return UPPER_CASE_;
1726: } else {
1727: return LOWER_CASE_;
1728: }
1729: }
1730:
1731: /**
1732: * Converts a char to the uppercase Kana
1733: * @param ch character to convert
1734: * @return the converted Kana character
1735: */
1736: private static final char toLargeKana(char ch) {
1737: if (0x3042 < ch && ch < 0x30ef) { // Kana range
1738: switch (ch - 0x3000) {
1739: case 0x41:
1740: case 0x43:
1741: case 0x45:
1742: case 0x47:
1743: case 0x49:
1744: case 0x63:
1745: case 0x83:
1746: case 0x85:
1747: case 0x8E:
1748: case 0xA1:
1749: case 0xA3:
1750: case 0xA5:
1751: case 0xA7:
1752: case 0xA9:
1753: case 0xC3:
1754: case 0xE3:
1755: case 0xE5:
1756: case 0xEE:
1757: ch++;
1758: break;
1759: case 0xF5:
1760: ch = 0x30AB;
1761: break;
1762: case 0xF6:
1763: ch = 0x30B1;
1764: break;
1765: }
1766: }
1767: return ch;
1768: }
1769:
1770: /**
1771: * Converts a char to the lowercase Kana
1772: * @param ch character to convert
1773: * @return the converted Kana character
1774: */
1775: private static final char toSmallKana(char ch) {
1776: if (0x3042 < ch && ch < 0x30ef) { // Kana range
1777: switch (ch - 0x3000) {
1778: case 0x42:
1779: case 0x44:
1780: case 0x46:
1781: case 0x48:
1782: case 0x4A:
1783: case 0x64:
1784: case 0x84:
1785: case 0x86:
1786: case 0x8F:
1787: case 0xA2:
1788: case 0xA4:
1789: case 0xA6:
1790: case 0xA8:
1791: case 0xAA:
1792: case 0xC4:
1793: case 0xE4:
1794: case 0xE6:
1795: case 0xEF:
1796: ch--;
1797: break;
1798: case 0xAB:
1799: ch = 0x30F5;
1800: break;
1801: case 0xB1:
1802: ch = 0x30F6;
1803: break;
1804: }
1805: }
1806: return ch;
1807: }
1808:
1809: /**
1810: * This should be connected to special Jamo handling.
1811: */
1812: private int getFirstCE(char ch) {
1813: m_utilColEIter_.setText(UCharacter.toString(ch));
1814: return m_utilColEIter_.next();
1815: }
1816:
1817: /**
1818: * This adds a read element, while testing for existence
1819: * @param t build table
1820: * @param element
1821: * @return ce
1822: */
1823: private int addAnElement(BuildTable t, Elements element) {
1824: Vector expansions = t.m_expansions_;
1825: element.m_mapCE_ = 0;
1826:
1827: if (element.m_CELength_ == 1) {
1828: element.m_mapCE_ = element.m_CEs_[0];
1829:
1830: } else {
1831: // unfortunately, it looks like we have to look for a long primary
1832: // here since in canonical closure we are going to hit some long
1833: // primaries from the first phase, and they will come back as
1834: // continuations/expansions destroying the effect of the previous
1835: // opitimization. A long primary is a three byte primary with
1836: // starting secondaries and tertiaries. It can appear in long runs
1837: // of only primary differences (like east Asian tailorings) also,
1838: // it should not be an expansion, as expansions would break with
1839: // this
1840: if (element.m_CELength_ == 2 // a two CE expansion
1841: && RuleBasedCollator
1842: .isContinuation(element.m_CEs_[1])
1843: && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that has only primaries in continuation
1844: && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
1845: // a common secondary
1846: && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and a common tertiary
1847: ) {
1848: element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
1849: // a long primary special
1850: | (CE_LONG_PRIMARY_TAG_ << 24)
1851: // first and second byte of primary
1852: | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
1853: // third byte of primary
1854: | ((element.m_CEs_[1] >> 24) & 0xFF);
1855: } else {
1856: // omitting expansion offset in builder
1857: // (HEADER_SIZE_ >> 2)
1858: int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
1859: | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
1860: | (addExpansion(expansions, element.m_CEs_[0]) << 4)
1861: & 0xFFFFF0;
1862:
1863: for (int i = 1; i < element.m_CELength_; i++) {
1864: addExpansion(expansions, element.m_CEs_[i]);
1865: }
1866: if (element.m_CELength_ <= 0xF) {
1867: expansion |= element.m_CELength_;
1868: } else {
1869: addExpansion(expansions, 0);
1870: }
1871: element.m_mapCE_ = expansion;
1872: setMaxExpansion(
1873: element.m_CEs_[element.m_CELength_ - 1],
1874: (byte) element.m_CELength_, t.m_maxExpansions_);
1875: if (isJamo(element.m_cPoints_.charAt(0))) {
1876: t.m_collator_.m_isJamoSpecial_ = true;
1877: setMaxJamoExpansion(element.m_cPoints_.charAt(0),
1878: element.m_CEs_[element.m_CELength_ - 1],
1879: (byte) element.m_CELength_,
1880: t.m_maxJamoExpansions_);
1881: }
1882: }
1883: }
1884:
1885: // We treat digits differently - they are "uber special" and should be
1886: // processed differently if numeric collation is on.
1887: int uniChar = 0;
1888: if ((element.m_uchars_.length() == 2)
1889: && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
1890: uniChar = UCharacterProperty.getRawSupplementary(
1891: element.m_uchars_.charAt(0), element.m_uchars_
1892: .charAt(1));
1893: } else if (element.m_uchars_.length() == 1) {
1894: uniChar = element.m_uchars_.charAt(0);
1895: }
1896:
1897: // Here, we either have one normal CE OR mapCE is set. Therefore, we
1898: // stuff only one element to the expansion buffer. When we encounter a
1899: // digit and we don't do numeric collation, we will just pick the CE
1900: // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
1901: // && ucol_prv_getSpecialPrevCE). If we picked a special, further
1902: // processing will occur. If it's a simple CE, we'll return due
1903: // to how the loop is constructed.
1904: if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
1905: // prepare the element
1906: int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
1907: | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
1908: | 1;
1909: if (element.m_mapCE_ != 0) {
1910: // if there is an expansion, we'll pick it here
1911: expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
1912: } else {
1913: expansion |= (addExpansion(expansions,
1914: element.m_CEs_[0]) << 4);
1915: }
1916: element.m_mapCE_ = expansion;
1917: }
1918:
1919: // here we want to add the prefix structure.
1920: // I will try to process it as a reverse contraction, if possible.
1921: // prefix buffer is already reversed.
1922:
1923: if (element.m_prefixChars_ != null
1924: && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
1925: // We keep the seen prefix starter elements in a hashtable we need
1926: // it to be able to distinguish between the simple codepoints and
1927: // prefix starters. Also, we need to use it for canonical closure.
1928: m_utilElement2_.m_caseBit_ = element.m_caseBit_;
1929: m_utilElement2_.m_CELength_ = element.m_CELength_;
1930: m_utilElement2_.m_CEs_ = element.m_CEs_;
1931: m_utilElement2_.m_mapCE_ = element.m_mapCE_;
1932: //m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
1933: m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
1934: m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
1935: m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
1936: m_utilElement2_.m_variableTop_ = element.m_variableTop_;
1937: m_utilElement2_.m_prefix_ = element.m_prefix_;
1938: m_utilElement2_.m_prefixChars_ = Normalizer.compose(
1939: element.m_prefixChars_, false);
1940: m_utilElement2_.m_uchars_ = element.m_uchars_;
1941: m_utilElement2_.m_cPoints_ = element.m_cPoints_;
1942: m_utilElement2_.m_cPointsOffset_ = 0;
1943:
1944: if (t.m_prefixLookup_ != null) {
1945: Elements uCE = (Elements) t.m_prefixLookup_
1946: .get(element);
1947: if (uCE != null) {
1948: // there is already a set of code points here
1949: element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_,
1950: element);
1951: } else { // no code points, so this spot is clean
1952: element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_,
1953: element);
1954: uCE = new Elements(element);
1955: uCE.m_cPoints_ = uCE.m_uchars_;
1956: t.m_prefixLookup_.put(uCE, uCE);
1957: }
1958: if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
1959: .length()
1960: - element.m_prefix_
1961: || !m_utilElement2_.m_prefixChars_
1962: .regionMatches(0,
1963: element.m_prefixChars_,
1964: element.m_prefix_,
1965: m_utilElement2_.m_prefixChars_
1966: .length())) {
1967: // do it!
1968: m_utilElement2_.m_mapCE_ = addPrefix(t,
1969: element.m_mapCE_, m_utilElement2_);
1970: }
1971: }
1972: }
1973:
1974: // We need to use the canonical iterator here
1975: // the way we do it is to generate the canonically equivalent strings
1976: // for the contraction and then add the sequences that pass FCD check
1977: if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
1978: && !(element.m_cPoints_.length()
1979: - element.m_cPointsOffset_ == 2
1980: && UTF16.isLeadSurrogate(element.m_cPoints_
1981: .charAt(0)) && UTF16
1982: .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
1983: // this is a contraction, we should check whether a composed form
1984: // should also be included
1985: m_utilCanIter_.setSource(element.m_cPoints_);
1986: String source = m_utilCanIter_.next();
1987: while (source != null && source.length() > 0) {
1988: if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
1989: element.m_uchars_ = source;
1990: element.m_cPoints_ = element.m_uchars_;
1991: finalizeAddition(t, element);
1992: }
1993: source = m_utilCanIter_.next();
1994: }
1995:
1996: return element.m_mapCE_;
1997: } else {
1998: return finalizeAddition(t, element);
1999: }
2000: }
2001:
2002: /**
2003: * Adds an expansion ce to the expansion vector
2004: * @param expansions vector to add to
2005: * @param value of the expansion
2006: * @return the current position of the new element
2007: */
2008: private static final int addExpansion(Vector expansions, int value) {
2009: expansions.add(new Integer(value));
2010: return expansions.size() - 1;
2011: }
2012:
2013: /**
2014: * Looks for the maximum length of all expansion sequences ending with the
2015: * same collation element. The size required for maxexpansion and maxsize
2016: * is returned if the arrays are too small.
2017: * @param endexpansion the last expansion collation element to be added
2018: * @param expansionsize size of the expansion
2019: * @param maxexpansion data structure to store the maximum expansion data.
2020: * @returns size of the maxexpansion and maxsize used.
2021: */
2022: private static int setMaxExpansion(int endexpansion,
2023: byte expansionsize, MaxExpansionTable maxexpansion) {
2024: int start = 0;
2025: int limit = maxexpansion.m_endExpansionCE_.size();
2026: long unsigned = (long) endexpansion;
2027: unsigned &= 0xFFFFFFFFl;
2028:
2029: // using binary search to determine if last expansion element is
2030: // already in the array
2031: int result = -1;
2032: while (start < limit - 1) {
2033: int mid = start + ((limit - start) >> 1);
2034: long unsignedce = ((Integer) maxexpansion.m_endExpansionCE_
2035: .get(mid)).intValue();
2036: unsignedce &= 0xFFFFFFFFl;
2037: if (unsigned <= unsignedce) {
2038: limit = mid;
2039: } else {
2040: start = mid;
2041: }
2042: }
2043:
2044: if (((Integer) maxexpansion.m_endExpansionCE_.get(start))
2045: .intValue() == endexpansion) {
2046: result = start;
2047: } else if (((Integer) maxexpansion.m_endExpansionCE_.get(limit))
2048: .intValue() == endexpansion) {
2049: result = limit;
2050: }
2051: if (result > -1) {
2052: // found the ce in expansion, we'll just modify the size if it
2053: // is smaller
2054: Object currentsize = maxexpansion.m_expansionCESize_
2055: .get(result);
2056: if (((Byte) currentsize).byteValue() < expansionsize) {
2057: maxexpansion.m_expansionCESize_.set(result, new Byte(
2058: expansionsize));
2059: }
2060: } else {
2061: // we'll need to squeeze the value into the array. initial
2062: // implementation. shifting the subarray down by 1
2063: maxexpansion.m_endExpansionCE_.insertElementAt(new Integer(
2064: endexpansion), start + 1);
2065: maxexpansion.m_expansionCESize_.insertElementAt(new Byte(
2066: expansionsize), start + 1);
2067: }
2068: return maxexpansion.m_endExpansionCE_.size();
2069: }
2070:
2071: /**
2072: * Sets the maximum length of all jamo expansion sequences ending with the
2073: * same collation element. The size required for maxexpansion and maxsize
2074: * is returned if the arrays are too small.
2075: * @param ch the jamo codepoint
2076: * @param endexpansion the last expansion collation element to be added
2077: * @param expansionsize size of the expansion
2078: * @param maxexpansion data structure to store the maximum expansion data.
2079: * @returns size of the maxexpansion and maxsize used.
2080: */
2081: private static int setMaxJamoExpansion(char ch, int endexpansion,
2082: byte expansionsize, MaxJamoExpansionTable maxexpansion) {
2083: boolean isV = true;
2084: if (ch >= 0x1100 && ch <= 0x1112) {
2085: // determines L for Jamo, doesn't need to store this since it is
2086: // never at the end of a expansion
2087: if (maxexpansion.m_maxLSize_ < expansionsize) {
2088: maxexpansion.m_maxLSize_ = expansionsize;
2089: }
2090: return maxexpansion.m_endExpansionCE_.size();
2091: }
2092:
2093: if (ch >= 0x1161 && ch <= 0x1175) {
2094: // determines V for Jamo
2095: if (maxexpansion.m_maxVSize_ < expansionsize) {
2096: maxexpansion.m_maxVSize_ = expansionsize;
2097: }
2098: }
2099:
2100: if (ch >= 0x11A8 && ch <= 0x11C2) {
2101: isV = false;
2102: // determines T for Jamo
2103: if (maxexpansion.m_maxTSize_ < expansionsize) {
2104: maxexpansion.m_maxTSize_ = expansionsize;
2105: }
2106: }
2107:
2108: int pos = maxexpansion.m_endExpansionCE_.size();
2109: while (pos > 0) {
2110: pos--;
2111: if (((Integer) maxexpansion.m_endExpansionCE_.get(pos))
2112: .intValue() == endexpansion) {
2113: return maxexpansion.m_endExpansionCE_.size();
2114: }
2115: }
2116: maxexpansion.m_endExpansionCE_.add(new Integer(endexpansion));
2117: maxexpansion.m_isV_.add(new Boolean(isV));
2118:
2119: return maxexpansion.m_endExpansionCE_.size();
2120: }
2121:
/**
 * Adds a prefix (context preceding the mapped character) to the table.
 * <p>
 * The prefix is stored through the contraction machinery, tagged
 * CE_SPEC_PROC_TAG_ instead of CE_CONTRACTION_TAG_, and reversed because
 * prefixes are looked up backwards from the mapped character at runtime.
 * Note: element.m_cPoints_/m_cPointsOffset_ are temporarily redirected to
 * the prefix string while the contraction helpers run and restored before
 * returning.
 * @param t build table to update
 * @param CE collation element to add
 * @param element rule element to add
 * @return modified ce
 */
private int addPrefix(BuildTable t, int CE, Elements element) {
    // currently the longest prefix we're supporting in Japanese is two
    // characters long. Although this table could quite easily mimic
    // complete contraction stuff there is no good reason to make a general
    // solution, as it would require some error prone messing.
    ContractionTable contractions = t.m_contractions_;
    // save the code-point view of the element; it is redirected to the
    // prefix below and restored at the end of this method
    String oldCP = element.m_cPoints_;
    int oldCPOffset = element.m_cPointsOffset_;

    contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
    // here, we will normalize & add prefix to the table.
    int size = element.m_prefixChars_.length() - element.m_prefix_;
    for (int j = 1; j < size; j++) {
        // First add NFD prefix chars to unsafe CP hash table
        // Unless it is a trail surrogate, which is handled algoritmically
        // and shouldn't take up space in the table.
        char ch = element.m_prefixChars_.charAt(j
                + element.m_prefix_);
        if (!UTF16.isTrailSurrogate(ch)) {
            unsafeCPSet(t.m_unsafeCP_, ch);
        }
    }

    // StringBuffer reversed = new StringBuffer();
    m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
    for (int j = 0; j < size; j++) {
        // prefixes are going to be looked up backwards
        // therefore, we will promptly reverse the prefix buffer...
        int offset = element.m_prefixChars_.length() - j - 1;
        m_utilStringBuffer_.append(element.m_prefixChars_
                .charAt(offset));
    }
    element.m_prefixChars_ = m_utilStringBuffer_.toString();
    element.m_prefix_ = 0;

    // the first codepoint is also unsafe, as it forms a 'contraction' with
    // the prefix
    if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
        unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
    }

    // redirect the element's code-point view to the (reversed) prefix so
    // the generic contraction helpers operate on the prefix string
    element.m_cPoints_ = element.m_prefixChars_;
    element.m_cPointsOffset_ = element.m_prefix_;

    // Add the last char of the contraction to the contraction-end hash
    // table. unless it is a trail surrogate, which is handled
    // algorithmically and shouldn't be in the table
    if (!UTF16.isTrailSurrogate(element.m_cPoints_
            .charAt(element.m_cPoints_.length() - 1))) {
        ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
                .charAt(element.m_cPoints_.length() - 1));
    }
    // First we need to check if contractions starts with a surrogate
    // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);

    // If there are any Jamos in the contraction, we should turn on special
    // processing for Jamos
    if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
        t.m_collator_.m_isJamoSpecial_ = true;
    }
    // then we need to deal with it
    // we could aready have something in table - or we might not
    if (!isPrefix(CE)) {
        // the starter has no prefix chain yet: create a fresh
        // contraction-table element, anchored by the (char) 0 and
        // (char) 0xFFFF sentinel entries that both map to the old CE
        int firstContractionOffset = addContraction(contractions,
                CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
        int newCE = processContraction(contractions, element,
                CE_NOT_FOUND_);
        addContraction(contractions, firstContractionOffset,
                element.m_prefixChars_.charAt(element.m_prefix_),
                newCE);
        addContraction(contractions, firstContractionOffset,
                (char) 0xFFFF, CE);
        CE = constructSpecialCE(CE_SPEC_PROC_TAG_,
                firstContractionOffset);
    } else {
        // we are adding to existing contraction
        // there were already some elements in the table, so we need to add
        // a new contraction
        // Two things can happen here: either the codepoint is already in
        // the table, or it is not
        char ch = element.m_prefixChars_.charAt(element.m_prefix_);
        int position = findCP(contractions, CE, ch);
        if (position > 0) {
            // if it is we just continue down the chain
            int eCE = getCE(contractions, CE, position);
            int newCE = processContraction(contractions, element,
                    eCE);
            setContraction(contractions, CE, position, ch, newCE);
        } else {
            // if it isn't, we will have to create a new sequence
            processContraction(contractions, element, CE_NOT_FOUND_);
            insertContraction(contractions, CE, ch,
                    element.m_mapCE_);
        }
    }

    // restore the element's original code-point view
    element.m_cPoints_ = oldCP;
    element.m_cPointsOffset_ = oldCPOffset;

    return CE;
}
2231:
2232: /**
2233: * Checks if the argument ce is a contraction
2234: * @param CE collation element
2235: * @return true if argument ce is a contraction
2236: */
2237: private static final boolean isContraction(int CE) {
2238: return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
2239: }
2240:
2241: /**
2242: * Checks if the argument ce has a prefix
2243: * @param CE collation element
2244: * @return true if argument ce has a prefix
2245: */
2246: private static final boolean isPrefix(int CE) {
2247: return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
2248: }
2249:
2250: /**
2251: * Checks if the argument ce is special
2252: * @param CE collation element
2253: * @return true if argument ce is special
2254: */
2255: private static final boolean isSpecial(int CE) {
2256: return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
2257: }
2258:
/**
 * Extracts the special tag from a collation element.
 * (The previous javadoc was a copy-paste of isPrefix's and was wrong.)
 * @param CE collation element
 * @return the CE's tag bits, shifted down to a small integer
 */
private static final int getCETag(int CE) {
    return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
}
2267:
2268: /**
2269: * Gets the ce at position in contraction table
2270: * @param table contraction table
2271: * @param position offset to the contraction table
2272: * @return ce
2273: */
2274: private static final int getCE(ContractionTable table, int element,
2275: int position) {
2276: element &= 0xFFFFFF;
2277: BasicContractionTable tbl = getBasicContractionTable(table,
2278: element);
2279:
2280: if (tbl == null) {
2281: return CE_NOT_FOUND_;
2282: }
2283: if (position > tbl.m_CEs_.size() || position == -1) {
2284: return CE_NOT_FOUND_;
2285: } else {
2286: return ((Integer) tbl.m_CEs_.get(position)).intValue();
2287: }
2288: }
2289:
2290: /**
2291: * Sets the unsafe character
2292: * @param table unsafe table
2293: * @param c character to be added
2294: */
2295: private static final void unsafeCPSet(byte table[], char c) {
2296: int hash = c;
2297: if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2298: if (hash >= 0xd800 && hash <= 0xf8ff) {
2299: // Part of a surrogate, or in private use area.
2300: // These don't go in the table
2301: return;
2302: }
2303: hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2304: }
2305: table[hash >> 3] |= (1 << (hash & 7));
2306: }
2307:
2308: /**
2309: * Sets the contraction end character
2310: * @param table contraction end table
2311: * @param c character to be added
2312: */
2313: private static final void ContrEndCPSet(byte table[], char c) {
2314: int hash = c;
2315: if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2316: hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2317: }
2318: table[hash >> 3] |= (1 << (hash & 7));
2319: }
2320:
2321: /**
2322: * Adds more contractions in table. If element is non existant, it creates
2323: * on. Returns element handle
2324: * @param table contraction table
2325: * @param element offset to the contraction table
2326: * @param codePoint codepoint to add
2327: * @param value
2328: * @return collation element
2329: */
2330: private static int addContraction(ContractionTable table,
2331: int element, char codePoint, int value) {
2332: BasicContractionTable tbl = getBasicContractionTable(table,
2333: element);
2334: if (tbl == null) {
2335: tbl = addAContractionElement(table);
2336: element = table.m_elements_.size() - 1;
2337: }
2338:
2339: tbl.m_CEs_.add(new Integer(value));
2340: tbl.m_codePoints_.append(codePoint);
2341: return constructSpecialCE(table.m_currentTag_, element);
2342: }
2343:
2344: /**
2345: * Adds a contraction element to the table
2346: * @param table contraction table to update
2347: * @return contraction
2348: */
2349: private static BasicContractionTable addAContractionElement(
2350: ContractionTable table) {
2351: BasicContractionTable result = new BasicContractionTable();
2352: table.m_elements_.add(result);
2353: return result;
2354: }
2355:
2356: /**
2357: * Constructs a special ce
2358: * @param tag special tag
2359: * @param CE collation element
2360: * @return a contraction ce
2361: */
2362: private static final int constructSpecialCE(int tag, int CE) {
2363: return RuleBasedCollator.CE_SPECIAL_FLAG_
2364: | (tag << RuleBasedCollator.CE_TAG_SHIFT_)
2365: | (CE & 0xFFFFFF);
2366: }
2367:
/**
 * Recursively threads the remaining code points of element.m_cPoints_
 * (from m_cPointsOffset_ on) into the contraction table, reusing existing
 * chains where possible.
 * @param contractions contraction table
 * @param element contracting element; m_cPointsOffset_ is advanced and
 *        restored around each recursive step
 * @param existingCE CE currently occupying this spot (CE_NOT_FOUND_ when
 *        the spot is empty)
 * @return contraction ce for this level of the chain
 */
private static int processContraction(
        ContractionTable contractions, Elements element,
        int existingCE) {
    int firstContractionOffset = 0;
    // end of recursion: only one code point left
    if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
        if (isContractionTableElement(existingCE)
                && getCETag(existingCE) == contractions.m_currentTag_) {
            // spot already holds a chain of the same kind: overwrite
            // both sentinel entries ((char) 0 and (char) 0xFFFF) with
            // the element's final CE
            changeContraction(contractions, existingCE, (char) 0,
                    element.m_mapCE_);
            changeContraction(contractions, existingCE,
                    (char) 0xFFFF, element.m_mapCE_);
            return existingCE;
        } else {
            // can't do just that. existingCe might be a contraction,
            // meaning that we need to do another step
            return element.m_mapCE_;
        }
    }

    // this recursion currently feeds on the only element we have...
    // We will have to copy it in order to accomodate for both backward
    // and forward cycles
    // we encountered either an empty space or a non-contraction element
    // this means we are constructing a new contraction sequence
    element.m_cPointsOffset_++;
    if (!isContractionTableElement(existingCE)) {
        // start a new chain, anchored by the (char) 0 and
        // (char) 0xFFFF sentinel entries that keep the old CE
        firstContractionOffset = addContraction(contractions,
                CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0,
                existingCE);
        int newCE = processContraction(contractions, element,
                CE_NOT_FOUND_);
        addContraction(
                contractions,
                firstContractionOffset,
                element.m_cPoints_.charAt(element.m_cPointsOffset_),
                newCE);
        addContraction(contractions, firstContractionOffset,
                (char) 0xFFFF, existingCE);
        existingCE = constructSpecialCE(contractions.m_currentTag_,
                firstContractionOffset);
    } else {
        // we are adding to existing contraction
        // there were already some elements in the table, so we need to add
        // a new contraction
        // Two things can happen here: either the codepoint is already in
        // the table, or it is not
        int position = findCP(contractions, existingCE,
                element.m_cPoints_.charAt(element.m_cPointsOffset_));
        if (position > 0) {
            // if it is we just continue down the chain
            int eCE = getCE(contractions, existingCE, position);
            int newCE = processContraction(contractions, element,
                    eCE);
            setContraction(contractions, existingCE, position,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
        } else {
            // if it isn't, we will have to create a new sequence
            int newCE = processContraction(contractions, element,
                    CE_NOT_FOUND_);
            insertContraction(contractions, existingCE,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
        }
    }
    // undo the offset advance so the caller sees the element unchanged
    element.m_cPointsOffset_--;
    return existingCE;
}
2447:
2448: /**
2449: * Checks if CE belongs to the contraction table
2450: * @param CE collation element to test
2451: * @return true if CE belongs to the contraction table
2452: */
2453: private static final boolean isContractionTableElement(int CE) {
2454: return isSpecial(CE)
2455: && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
2456: }
2457:
2458: /**
2459: * Gets the codepoint
2460: * @param table contraction table
2461: * @param element offset to the contraction element in the table
2462: * @param codePoint code point to look for
2463: * @return the offset to the code point
2464: */
2465: private static int findCP(ContractionTable table, int element,
2466: char codePoint) {
2467: BasicContractionTable tbl = getBasicContractionTable(table,
2468: element);
2469: if (tbl == null) {
2470: return -1;
2471: }
2472:
2473: int position = 0;
2474: while (codePoint > tbl.m_codePoints_.charAt(position)) {
2475: position++;
2476: if (position > tbl.m_codePoints_.length()) {
2477: return -1;
2478: }
2479: }
2480: if (codePoint == tbl.m_codePoints_.charAt(position)) {
2481: return position;
2482: } else {
2483: return -1;
2484: }
2485: }
2486:
2487: /**
2488: * Gets the contraction element out of the contraction table
2489: * @param table contraction table
2490: * @param offset to the element in the contraction table
2491: * @return basic contraction element at offset in the contraction table
2492: */
2493: private static final BasicContractionTable getBasicContractionTable(
2494: ContractionTable table, int offset) {
2495: offset &= 0xFFFFFF;
2496: if (offset == 0xFFFFFF) {
2497: return null;
2498: }
2499: return (BasicContractionTable) table.m_elements_.get(offset);
2500: }
2501:
2502: /**
2503: * Changes the contraction element
2504: * @param table contraction table
2505: * @param element offset to the element in the contraction table
2506: * @param codePoint codepoint
2507: * @param newCE new collation element
2508: * @return basic contraction element at offset in the contraction table
2509: */
2510: private static final int changeContraction(ContractionTable table,
2511: int element, char codePoint, int newCE) {
2512: BasicContractionTable tbl = getBasicContractionTable(table,
2513: element);
2514: if (tbl == null) {
2515: return 0;
2516: }
2517: int position = 0;
2518: while (codePoint > tbl.m_codePoints_.charAt(position)) {
2519: position++;
2520: if (position > tbl.m_codePoints_.length()) {
2521: return CE_NOT_FOUND_;
2522: }
2523: }
2524: if (codePoint == tbl.m_codePoints_.charAt(position)) {
2525: tbl.m_CEs_.set(position, new Integer(newCE));
2526: return element & 0xFFFFFF;
2527: } else {
2528: return CE_NOT_FOUND_;
2529: }
2530: }
2531:
2532: /**
2533: * Sets a part of contraction sequence in table. If element is non
2534: * existant, it creates on. Returns element handle.
2535: * @param table contraction table
2536: * @param element offset to the contraction table
2537: * @param offset
2538: * @param codePoint contraction character
2539: * @param value ce value
2540: * @return new contraction ce
2541: */
2542: private static final int setContraction(ContractionTable table,
2543: int element, int offset, char codePoint, int value) {
2544: element &= 0xFFFFFF;
2545: BasicContractionTable tbl = getBasicContractionTable(table,
2546: element);
2547: if (tbl == null) {
2548: tbl = addAContractionElement(table);
2549: element = table.m_elements_.size() - 1;
2550: }
2551:
2552: tbl.m_CEs_.set(offset, new Integer(value));
2553: tbl.m_codePoints_.setCharAt(offset, codePoint);
2554: return constructSpecialCE(table.m_currentTag_, element);
2555: }
2556:
2557: /**
2558: * Inserts a part of contraction sequence in table. Sequences behind the
2559: * offset are moved back. If element is non existent, it creates on.
2560: * @param table contraction
2561: * @param element offset to the table contraction
2562: * @param codePoint code point
2563: * @param value collation element value
2564: * @return contraction collation element
2565: */
2566: private static final int insertContraction(ContractionTable table,
2567: int element, char codePoint, int value) {
2568: element &= 0xFFFFFF;
2569: BasicContractionTable tbl = getBasicContractionTable(table,
2570: element);
2571: if (tbl == null) {
2572: tbl = addAContractionElement(table);
2573: element = table.m_elements_.size() - 1;
2574: }
2575:
2576: int offset = 0;
2577: while (tbl.m_codePoints_.charAt(offset) < codePoint
2578: && offset < tbl.m_codePoints_.length()) {
2579: offset++;
2580: }
2581:
2582: tbl.m_CEs_.insertElementAt(new Integer(value), offset);
2583: tbl.m_codePoints_.insert(offset, codePoint);
2584:
2585: return constructSpecialCE(table.m_currentTag_, element);
2586: }
2587:
2588: /**
2589: * Finalize addition
2590: * @param t build table
2591: * @param element to add
2592: */
2593: private final static int finalizeAddition(BuildTable t,
2594: Elements element) {
2595: int CE = CE_NOT_FOUND_;
2596: // This should add a completely ignorable element to the
2597: // unsafe table, so that backward iteration will skip
2598: // over it when treating contractions.
2599: if (element.m_mapCE_ == 0) {
2600: for (int i = 0; i < element.m_cPoints_.length(); i++) {
2601: char ch = element.m_cPoints_.charAt(i);
2602: if (!UTF16.isTrailSurrogate(ch)) {
2603: unsafeCPSet(t.m_unsafeCP_, ch);
2604: }
2605: }
2606: }
2607:
2608: if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
2609: // we're adding a contraction
2610: int cp = UTF16.charAt(element.m_cPoints_,
2611: element.m_cPointsOffset_);
2612: CE = t.m_mapping_.getValue(cp);
2613: CE = addContraction(t, CE, element);
2614: } else {
2615: // easy case
2616: CE = t.m_mapping_.getValue(element.m_cPoints_
2617: .charAt(element.m_cPointsOffset_));
2618:
2619: if (CE != CE_NOT_FOUND_) {
2620: if (isContractionTableElement(CE)) {
2621: // adding a non contraction element (thai, expansion,
2622: // single) to already existing contraction
2623: if (!isPrefix(element.m_mapCE_)) {
2624: // we cannot reenter prefix elements - as we are going
2625: // to create a dead loop
2626: // Only expansions and regular CEs can go here...
2627: // Contractions will never happen in this place
2628: setContraction(t.m_contractions_, CE, 0,
2629: (char) 0, element.m_mapCE_);
2630: // This loop has to change the CE at the end of
2631: // contraction REDO!
2632: changeLastCE(t.m_contractions_, CE,
2633: element.m_mapCE_);
2634: }
2635: } else {
2636: t.m_mapping_.setValue(element.m_cPoints_
2637: .charAt(element.m_cPointsOffset_),
2638: element.m_mapCE_);
2639: }
2640: } else {
2641: t.m_mapping_.setValue(element.m_cPoints_
2642: .charAt(element.m_cPointsOffset_),
2643: element.m_mapCE_);
2644: }
2645: }
2646: return CE;
2647: }
2648:
/**
 * Adds a contraction mapping (first code point plus following characters)
 * to the build table.
 * <p>
 * Note regarding surrogate handling: We are interested only in the single
 * or leading surrogates in a contraction. If a surrogate is somewhere else
 * in the contraction, it is going to be handled as a pair of code units,
 * as it doesn't affect the performance AND handling surrogates specially
 * would complicate code way too much.
 * @param t build table to update
 * @param CE collation element currently mapped for the first code point
 * @param element rule element whose m_cPoints_ holds the contraction
 * @return the (possibly new special) CE for the first code point
 */
private static int addContraction(BuildTable t, int CE,
        Elements element) {
    ContractionTable contractions = t.m_contractions_;
    contractions.m_currentTag_ = CE_CONTRACTION_TAG_;

    // First we need to check if contractions starts with a surrogate
    int cp = UTF16.charAt(element.m_cPoints_, 0);
    int cpsize = 1;
    if (UCharacter.isSupplementary(cp)) {
        cpsize = 2;
    }
    if (cpsize < element.m_cPoints_.length()) {
        // This is a real contraction, if there are other characters after
        // the first
        int size = element.m_cPoints_.length()
                - element.m_cPointsOffset_;
        for (int j = 1; j < size; j++) {
            // First add contraction chars to unsafe CP hash table
            // Unless it is a trail surrogate, which is handled
            // algoritmically and shouldn't take up space in the table.
            if (!UTF16.isTrailSurrogate(element.m_cPoints_
                    .charAt(element.m_cPointsOffset_ + j))) {
                unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
                        .charAt(element.m_cPointsOffset_ + j));
            }
        }
        // Add the last char of the contraction to the contraction-end
        // hash table. unless it is a trail surrogate, which is handled
        // algorithmically and shouldn't be in the table
        if (!UTF16.isTrailSurrogate(element.m_cPoints_
                .charAt(element.m_cPoints_.length() - 1))) {
            ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
                    .charAt(element.m_cPoints_.length() - 1));
        }

        // If there are any Jamos in the contraction, we should turn on
        // special processing for Jamos
        if (isJamo(element.m_cPoints_
                .charAt(element.m_cPointsOffset_))) {
            t.m_collator_.m_isJamoSpecial_ = true;
        }
        // then we need to deal with it
        // we could aready have something in table - or we might not
        // skip past the first code point (1 or 2 code units) while the
        // rest of the contraction is threaded into the table
        element.m_cPointsOffset_ += cpsize;
        if (!isContraction(CE)) {
            // no contraction chain yet: create a fresh element,
            // anchored by the (char) 0 and (char) 0xFFFF sentinel
            // entries that both keep the old CE
            int firstContractionOffset = addContraction(
                    contractions, CONTRACTION_TABLE_NEW_ELEMENT_,
                    (char) 0, CE);
            int newCE = processContraction(contractions, element,
                    CE_NOT_FOUND_);
            addContraction(contractions, firstContractionOffset,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_),
                    newCE);
            addContraction(contractions, firstContractionOffset,
                    (char) 0xFFFF, CE);
            CE = constructSpecialCE(CE_CONTRACTION_TAG_,
                    firstContractionOffset);
        } else {
            // we are adding to existing contraction
            // there were already some elements in the table, so we need to
            // add a new contraction
            // Two things can happen here: either the codepoint is already
            // in the table, or it is not
            int position = findCP(contractions, CE,
                    element.m_cPoints_
                            .charAt(element.m_cPointsOffset_));
            if (position > 0) {
                // if it is we just continue down the chain
                int eCE = getCE(contractions, CE, position);
                int newCE = processContraction(contractions,
                        element, eCE);
                setContraction(contractions, CE, position,
                        element.m_cPoints_
                                .charAt(element.m_cPointsOffset_),
                        newCE);
            } else {
                // if it isn't, we will have to create a new sequence
                int newCE = processContraction(contractions,
                        element, CE_NOT_FOUND_);
                insertContraction(contractions, CE,
                        element.m_cPoints_
                                .charAt(element.m_cPointsOffset_),
                        newCE);
            }
        }
        // restore the offset so the caller sees the element unchanged
        element.m_cPointsOffset_ -= cpsize;
        t.m_mapping_.setValue(cp, CE);
    } else if (!isContraction(CE)) {
        // this is just a surrogate, and there is no contraction
        t.m_mapping_.setValue(cp, element.m_mapCE_);
    } else {
        // fill out the first stage of the contraction with the surrogate
        // CE
        changeContraction(contractions, CE, (char) 0,
                element.m_mapCE_);
        changeContraction(contractions, CE, (char) 0xFFFF,
                element.m_mapCE_);
    }
    return CE;
}
2758:
2759: /**
2760: * this is for adding non contractions
2761: * @param table contraction table
2762: * @param element offset to the contraction table
2763: * @param value collation element value
2764: * @return new collation element
2765: */
2766: private static final int changeLastCE(ContractionTable table,
2767: int element, int value) {
2768: BasicContractionTable tbl = getBasicContractionTable(table,
2769: element);
2770: if (tbl == null) {
2771: return 0;
2772: }
2773:
2774: tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, new Integer(value));
2775: return constructSpecialCE(table.m_currentTag_,
2776: element & 0xFFFFFF);
2777: }
2778:
/**
 * Given a set of ranges calculated by allocWeights(), iterate through the
 * weights. Consumes one weight from the first range; when a range is
 * exhausted the remaining ranges are shifted up.
 * @param cegenerator object that contains ranges weight range array and
 *        its rangeCount
 * @return the next weight, or -1 when all ranges are exhausted
 */
private static int nextWeight(CEGenerator cegenerator) {
    if (cegenerator.m_rangesLength_ > 0) {
        // get maxByte from the .count field
        int maxByte = cegenerator.m_ranges_[0].m_count_;
        // get the next weight
        int weight = cegenerator.m_ranges_[0].m_start_;
        if (weight == cegenerator.m_ranges_[0].m_end_) {
            // this range is finished, remove it and move the following
            // ones up (System.arraycopy handles the overlapping copy)
            cegenerator.m_rangesLength_--;
            if (cegenerator.m_rangesLength_ > 0) {
                System.arraycopy(cegenerator.m_ranges_, 1,
                        cegenerator.m_ranges_, 0,
                        cegenerator.m_rangesLength_);
                cegenerator.m_ranges_[0].m_count_ = maxByte;
                // keep maxByte in ranges[0]
            }
        } else {
            // increment the weight for the next value
            cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
                    cegenerator.m_ranges_[0].m_length2_, maxByte);
        }
        return weight;
    }
    return -1;
}
2812:
2813: /**
2814: * Increment the collation weight
2815: * @param weight to increment
2816: * @param length
2817: * @param maxByte
2818: * @return new incremented weight
2819: */
2820: private static final int incWeight(int weight, int length,
2821: int maxByte) {
2822: while (true) {
2823: int b = getWeightByte(weight, length);
2824: if (b < maxByte) {
2825: return setWeightByte(weight, length, b + 1);
2826: } else {
2827: // roll over, set this byte to BYTE_FIRST_TAILORED_ and
2828: // increment the previous one
2829: weight = setWeightByte(weight, length,
2830: RuleBasedCollator.BYTE_FIRST_TAILORED_);
2831: --length;
2832: }
2833: }
2834: }
2835:
2836: /**
2837: * Gets the weight byte
2838: * @param weight
2839: * @param index
2840: * @return byte
2841: */
2842: private static final int getWeightByte(int weight, int index) {
2843: return (weight >> ((4 - index) << 3)) & 0xff;
2844: }
2845:
2846: /**
2847: * Set the weight byte in table
2848: * @param weight
2849: * @param index
2850: * @param b byte
2851: */
2852: private static final int setWeightByte(int weight, int index, int b) {
2853: index <<= 3;
2854: // 0xffffffff except a 00 "hole" for the index-th byte
2855: int mask = 0xffffffff >>> index;
2856: index = 32 - index;
2857: mask |= 0xffffff00 << index;
2858: return (weight & mask) | (b << index);
2859: }
2860:
/**
 * Calls getWeightRanges() and then determines heuristically which of the
 * resulting ranges to use for allocating n weights strictly between
 * (excluding) the two limit weights.
 * @param lowerLimit weight below all allocated weights
 * @param upperLimit weight above all allocated weights
 * @param n number of weights that must fit into the chosen ranges
 * @param maxByte largest usable byte value in a weight
 * @param ranges output array, sorted and ready for nextWeight();
 *        ranges[0].m_count_ is overloaded to carry maxByte on return
 * @return the number of usable ranges, or 0 if n weights cannot fit
 *         between the limits
 */
private int allocateWeights(int lowerLimit, int upperLimit, int n,
        int maxByte, WeightRange ranges[]) {
    // number of usable byte values 3..maxByte
    int countBytes = maxByte
            - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
    // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_
    // countBytes to the power of index, m_utilLongBuffer_ for unsignedness
    // gcc requires explicit initialization
    m_utilLongBuffer_[0] = 1;
    m_utilLongBuffer_[1] = countBytes;
    m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
    m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
    m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
    int rangeCount = getWeightRanges(lowerLimit, upperLimit,
            maxByte, countBytes, ranges);
    if (rangeCount <= 0) {
        return 0;
    }
    // what is the maximum number of weights with these ranges?
    // a range of m_length_ bytes can be extended by up to (4 - length)
    // further bytes, multiplying its capacity by countBytes each time
    long maxCount = 0;
    for (int i = 0; i < rangeCount; ++i) {
        maxCount += (long) ranges[i].m_count_
                * m_utilLongBuffer_[4 - ranges[i].m_length_];
    }
    if (maxCount < n) {
        return 0;
    }
    // set the length2 and count2 fields
    // (the working copies that lengthenRange() updates)
    for (int i = 0; i < rangeCount; ++i) {
        ranges[i].m_length2_ = ranges[i].m_length_;
        ranges[i].m_count2_ = ranges[i].m_count_;
    }
    // try until we find suitably large ranges
    while (true) {
        // get the smallest number of bytes in a range
        // (ranges come from getWeightRanges() shortest first)
        int minLength = ranges[0].m_length2_;
        // sum up the number of elements that fit into ranges of each byte
        // length
        Arrays.fill(m_utilCountBuffer_, 0);
        for (int i = 0; i < rangeCount; ++i) {
            m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
        }
        // now try to allocate n elements in the available short ranges
        if (n <= m_utilCountBuffer_[minLength]
                + m_utilCountBuffer_[minLength + 1]) {
            // trivial cases, use the first few ranges
            maxCount = 0;
            rangeCount = 0;
            do {
                maxCount += ranges[rangeCount].m_count2_;
                ++rangeCount;
            } while (n > maxCount);
            break;
        } else if (n <= ranges[0].m_count2_ * countBytes) {
            // easy case, just make this one range large enough by
            // lengthening it once more, possibly split it
            rangeCount = 1;
            // calculate how to split the range between maxLength-1
            // (count1) and maxLength (count2)
            long power_1 = m_utilLongBuffer_[minLength
                    - ranges[0].m_length_];
            long power = power_1 * countBytes;
            int count2 = (int) ((n + power - 1) / power);
            int count1 = ranges[0].m_count_ - count2;
            // split the range
            if (count1 < 1) {
                // lengthen the entire range to maxLength
                lengthenRange(ranges, 0, maxByte, countBytes);
            } else {
                // really split the range
                // create a new range with the end and initial and current
                // length of the old one
                rangeCount = 2;
                ranges[1].m_end_ = ranges[0].m_end_;
                ranges[1].m_length_ = ranges[0].m_length_;
                ranges[1].m_length2_ = minLength;
                // set the end of the first range according to count1
                int i = ranges[0].m_length_;
                int b = getWeightByte(ranges[0].m_start_, i)
                        + count1 - 1;
                // ranges[0].count and count1 may be >countBytes from
                // merging adjacent ranges; b > maxByte is possible
                if (b <= maxByte) {
                    ranges[0].m_end_ = setWeightByte(
                            ranges[0].m_start_, i, b);
                } else {
                    ranges[0].m_end_ = setWeightByte(incWeight(
                            ranges[0].m_start_, i - 1, maxByte), i,
                            b - countBytes);
                }
                // set the bytes in the end weight at length + 1..length2
                // to maxByte
                b = (maxByte << 24) | (maxByte << 16)
                        | (maxByte << 8) | maxByte; // this used to be 0xffffffff
                ranges[0].m_end_ = truncateWeight(ranges[0].m_end_,
                        i)
                        | (b >>> (i << 3))
                        & (b << ((4 - minLength) << 3));
                // set the start of the second range to immediately follow
                // the end of the first one
                ranges[1].m_start_ = incWeight(ranges[0].m_end_,
                        minLength, maxByte);
                // set the count values (informational)
                ranges[0].m_count_ = count1;
                ranges[1].m_count_ = count2;

                ranges[0].m_count2_ = (int) (count1 * power_1);
                // will be *countBytes when lengthened
                ranges[1].m_count2_ = (int) (count2 * power_1);

                // lengthen the second range to maxLength
                lengthenRange(ranges, 1, maxByte, countBytes);
            }
            break;
        }
        // no good match, lengthen all minLength ranges and iterate
        for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
            lengthenRange(ranges, i, maxByte, countBytes);
        }
    }

    if (rangeCount > 1) {
        // sort the ranges by weight values
        Arrays.sort(ranges, 0, rangeCount);
    }

    // set maxByte in ranges[0] for ucol_nextWeight()
    // (nextWeight() reads it back out of the .count field)
    ranges[0].m_count_ = maxByte;

    return rangeCount;
}
3002:
3003: /**
3004: * Updates the range length
3005: * @param range weight range array
3006: * @param offset to weight range array
3007: * @param maxByte
3008: * @param countBytes
3009: * @return new length
3010: */
3011: private static final int lengthenRange(WeightRange range[],
3012: int offset, int maxByte, int countBytes) {
3013: int length = range[offset].m_length2_ + 1;
3014: range[offset].m_start_ = setWeightTrail(range[offset].m_start_,
3015: length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
3016: range[offset].m_end_ = setWeightTrail(range[offset].m_end_,
3017: length, maxByte);
3018: range[offset].m_count2_ *= countBytes;
3019: range[offset].m_length2_ = length;
3020: return length;
3021: }
3022:
3023: /**
3024: * Gets the weight
3025: * @param weight
3026: * @param length
3027: * @param trail
3028: * @return new weight
3029: */
3030: private static final int setWeightTrail(int weight, int length,
3031: int trail) {
3032: length = (4 - length) << 3;
3033: return (weight & (0xffffff00 << length)) | (trail << length);
3034: }
3035:
/**
 * Takes two CE weights and calculates the possible ranges of weights
 * between the two limits, excluding them.
 * For weights with up to 4 bytes there are up to 2*4-1=7 ranges.
 * @param lowerLimit weight below the ranges (excluded)
 * @param upperLimit weight above the ranges (excluded)
 * @param maxByte largest usable byte value
 * @param countBytes number of usable byte values per position
 * @param ranges output array, filled shortest range first
 * @return the number of ranges written to ranges, 0 if the limits leave
 *         no room (equal, inverted, or lowerLimit a prefix of upperLimit)
 */
private int getWeightRanges(int lowerLimit, int upperLimit,
        int maxByte, int countBytes, WeightRange ranges[]) {
    // assume that both lowerLimit & upperLimit are not 0
    // get the lengths of the limits
    int lowerLength = lengthOfWeight(lowerLimit);
    int upperLength = lengthOfWeight(upperLimit);
    // weights compare as unsigned byte sequences
    if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
        return 0;
    }
    // check that neither is a prefix of the other
    if (lowerLength < upperLength) {
        if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
            return 0;
        }
    }
    // if the upper limit is a prefix of the lower limit then the earlier
    // test lowerLimit >= upperLimit has caught it
    // reset local variables
    // With the limit lengths of 1..4, there are up to 7 ranges for
    // allocation:
    // range minimum length
    // lower[4] 4
    // lower[3] 3
    // lower[2] 2
    // middle 1
    // upper[2] 2
    // upper[3] 3
    // upper[4] 4
    // We are now going to calculate up to 7 ranges.
    // Some of them will typically overlap, so we will then have to merge
    // and eliminate ranges.

    // We have to clean cruft from previous invocations
    // before doing anything. C++ already does that
    for (int length = 0; length < 5; length++) {
        m_utilLowerWeightRange_[length].clear();
        m_utilUpperWeightRange_[length].clear();
    }
    m_utilWeightRange_.clear();

    // walk up from the lower limit: at each byte length, the range of
    // weights strictly above the limit that share its shorter prefix
    int weight = lowerLimit;
    for (int length = lowerLength; length >= 2; --length) {
        m_utilLowerWeightRange_[length].clear();
        int trail = getWeightByte(weight, length);
        if (trail < maxByte) {
            m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
                    weight, length);
            m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(
                    weight, length, maxByte);
            m_utilLowerWeightRange_[length].m_length_ = length;
            m_utilLowerWeightRange_[length].m_count_ = maxByte
                    - trail;
        }
        weight = truncateWeight(weight, length - 1);
    }
    m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);

    weight = upperLimit;
    // [0] and [1] are not used - this simplifies indexing,
    // m_utilUpperWeightRange_

    // walk up from the upper limit symmetrically: weights strictly below
    // the limit that share its shorter prefix
    for (int length = upperLength; length >= 2; length--) {
        int trail = getWeightByte(weight, length);
        if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
            m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
                    weight, length,
                    RuleBasedCollator.BYTE_FIRST_TAILORED_);
            m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(
                    weight, length);
            m_utilUpperWeightRange_[length].m_length_ = length;
            m_utilUpperWeightRange_[length].m_count_ = trail
                    - RuleBasedCollator.BYTE_FIRST_TAILORED_;
        }
        weight = truncateWeight(weight, length - 1);
    }
    m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);

    // set the middle range
    m_utilWeightRange_.m_length_ = 1;
    if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
            m_utilWeightRange_.m_start_) >= 0) {
        // count of single-byte weights = difference of the top bytes + 1
        m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
    } else {
        // eliminate overlaps
        // remove the middle range
        m_utilWeightRange_.m_count_ = 0;
        // reduce or remove the lower ranges that go beyond upperLimit
        for (int length = 4; length >= 2; --length) {
            if (m_utilLowerWeightRange_[length].m_count_ > 0
                    && m_utilUpperWeightRange_[length].m_count_ > 0) {
                int start = m_utilUpperWeightRange_[length].m_start_;
                int end = m_utilLowerWeightRange_[length].m_end_;
                if (end >= start
                        || incWeight(end, length, maxByte) == start) {
                    // lower and upper ranges collide or are directly
                    // adjacent: merge these two and remove all shorter
                    // ranges
                    start = m_utilLowerWeightRange_[length].m_start_;
                    end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
                    // merging directly adjacent ranges needs to subtract
                    // the 0/1 gaps in between;
                    // it may result in a range with count>countBytes
                    m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
                            end, length)
                            - getWeightByte(start, length)
                            + 1
                            + countBytes
                            * (getWeightByte(end, length - 1) - getWeightByte(
                                    start, length - 1));
                    m_utilUpperWeightRange_[length].m_count_ = 0;
                    while (--length >= 2) {
                        m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
                    }
                    break;
                }
            }
        }
    }

    // copy the ranges, shortest first, into the result array
    int rangeCount = 0;
    if (m_utilWeightRange_.m_count_ > 0) {
        ranges[0] = new WeightRange(m_utilWeightRange_);
        rangeCount = 1;
    }
    for (int length = 2; length <= 4; ++length) {
        // copy upper first so that later the middle range is more likely
        // the first one to use
        if (m_utilUpperWeightRange_[length].m_count_ > 0) {
            ranges[rangeCount] = new WeightRange(
                    m_utilUpperWeightRange_[length]);
            ++rangeCount;
        }
        if (m_utilLowerWeightRange_[length].m_count_ > 0) {
            ranges[rangeCount] = new WeightRange(
                    m_utilLowerWeightRange_[length]);
            ++rangeCount;
        }
    }
    return rangeCount;
}
3189:
3190: /**
3191: * Truncates the weight with length
3192: * @param weight
3193: * @param length
3194: * @return truncated weight
3195: */
3196: private static final int truncateWeight(int weight, int length) {
3197: return weight & (0xffffffff << ((4 - length) << 3));
3198: }
3199:
3200: /**
3201: * Length of the weight
3202: * @param weight
3203: * @return length of the weight
3204: */
3205: private static final int lengthOfWeight(int weight) {
3206: if ((weight & 0xffffff) == 0) {
3207: return 1;
3208: } else if ((weight & 0xffff) == 0) {
3209: return 2;
3210: } else if ((weight & 0xff) == 0) {
3211: return 3;
3212: }
3213: return 4;
3214: }
3215:
3216: /**
3217: * Increment the weight trail
3218: * @param weight
3219: * @param length
3220: * @return new weight
3221: */
3222: private static final int incWeightTrail(int weight, int length) {
3223: return weight + (1 << ((4 - length) << 3));
3224: }
3225:
3226: /**
3227: * Decrement the weight trail
3228: * @param weight
3229: * @param length
3230: * @return new weight
3231: */
3232: private static int decWeightTrail(int weight, int length) {
3233: return weight - (1 << ((4 - length) << 3));
3234: }
3235:
3236: /**
3237: * Gets the codepoint
3238: * @param tbl contraction table
3239: * @param codePoint code point to look for
3240: * @return the offset to the code point
3241: */
3242: private static int findCP(BasicContractionTable tbl, char codePoint) {
3243: int position = 0;
3244: while (codePoint > tbl.m_codePoints_.charAt(position)) {
3245: position++;
3246: if (position > tbl.m_codePoints_.length()) {
3247: return -1;
3248: }
3249: }
3250: if (codePoint == tbl.m_codePoints_.charAt(position)) {
3251: return position;
3252: } else {
3253: return -1;
3254: }
3255: }
3256:
3257: /**
3258: * Finds a contraction ce
3259: * @param table
3260: * @param element
3261: * @param ch
3262: * @return ce
3263: */
3264: private static int findCE(ContractionTable table, int element,
3265: char ch) {
3266: if (table == null) {
3267: return CE_NOT_FOUND_;
3268: }
3269: BasicContractionTable tbl = getBasicContractionTable(table,
3270: element);
3271: if (tbl == null) {
3272: return CE_NOT_FOUND_;
3273: }
3274: int position = findCP(tbl, ch);
3275: if (position > tbl.m_CEs_.size() || position < 0) {
3276: return CE_NOT_FOUND_;
3277: }
3278: return ((Integer) tbl.m_CEs_.get(position)).intValue();
3279: }
3280:
3281: /**
3282: * Checks if the string is tailored in the contraction
3283: * @param table contraction table
3284: * @param element
3285: * @param array character array to check
3286: * @param offset array offset
3287: * @return true if it is tailored
3288: */
3289: private static boolean isTailored(ContractionTable table,
3290: int element, char array[], int offset) {
3291: while (array[offset] != 0) {
3292: element = findCE(table, element, array[offset]);
3293: if (element == CE_NOT_FOUND_) {
3294: return false;
3295: }
3296: if (!isContractionTableElement(element)) {
3297: return true;
3298: }
3299: offset++;
3300: }
3301: if (getCE(table, element, 0) != CE_NOT_FOUND_) {
3302: return true;
3303: } else {
3304: return false;
3305: }
3306: }
3307:
/**
 * Assembles the final RuleBasedCollator data from the build table:
 * flattens the contraction table, copies expansions, contractions, the
 * serialized trie, max-expansion tables, and the unsafe/contraction-end
 * character bitsets into the collator.
 * @param t build table holding the intermediate structures
 * @param collator collator to populate
 */
private void assembleTable(BuildTable t, RuleBasedCollator collator) {
    IntTrieBuilder mapping = t.m_mapping_;
    Vector expansions = t.m_expansions_;
    ContractionTable contractions = t.m_contractions_;
    MaxExpansionTable maxexpansion = t.m_maxExpansions_;

    // contraction offset has to be in since we are building on the
    // UCA contractions
    // int beforeContractions = (HEADER_SIZE_
    // + paddedsize(expansions.size() << 2)) >>> 1;
    collator.m_contractionOffset_ = 0;
    // flattens the contraction table and rewrites contraction CEs
    int contractionsSize = constructTable(contractions);

    // the following operation depends on the trie data. Therefore, we have
    // to do it before the trie is compacted
    // sets jamo expansions
    getMaxExpansionJamo(mapping, maxexpansion,
            t.m_maxJamoExpansions_, collator.m_isJamoSpecial_);

    // TODO: LATIN1 array is now in the utrie - it should be removed from
    // the calculation
    setAttributes(collator, t.m_options_);
    // copy expansions
    int size = expansions.size();
    collator.m_expansion_ = new int[size];
    for (int i = 0; i < size; i++) {
        collator.m_expansion_[i] = ((Integer) expansions.get(i))
                .intValue();
    }
    // contractions block
    if (contractionsSize != 0) {
        // copy contraction index
        collator.m_contractionIndex_ = new char[contractionsSize];
        contractions.m_codePoints_.getChars(0, contractionsSize,
                collator.m_contractionIndex_, 0);
        // copy contraction collation elements
        collator.m_contractionCE_ = new int[contractionsSize];
        for (int i = 0; i < contractionsSize; i++) {
            collator.m_contractionCE_[i] = ((Integer) contractions.m_CEs_
                    .get(i)).intValue();
        }
    }
    // copy mapping table
    collator.m_trie_ = mapping.serialize(t,
            RuleBasedCollator.DataManipulate.getInstance());
    // copy max expansion table
    // not copying the first element which is a dummy
    // to be in synch with icu4c's builder, we continue to use the
    // expansion offset
    // omitting expansion offset in builder
    collator.m_expansionOffset_ = 0;
    size = maxexpansion.m_endExpansionCE_.size();
    collator.m_expansionEndCE_ = new int[size - 1];
    for (int i = 1; i < size; i++) {
        collator.m_expansionEndCE_[i - 1] = ((Integer) maxexpansion.m_endExpansionCE_
                .get(i)).intValue();
    }
    collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
    for (int i = 1; i < size; i++) {
        collator.m_expansionEndCEMaxSize_[i - 1] = ((Byte) maxexpansion.m_expansionCESize_
                .get(i)).byteValue();
    }
    // Unsafe chars table. Finish it off, then copy it.
    unsafeCPAddCCNZ(t);
    // Or in unsafebits from UCA, making a combined table.
    for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
        t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
    }
    collator.m_unsafe_ = t.m_unsafeCP_;

    // Finish building Contraction Ending chars hash table and then copy it
    // out.
    // Or in unsafebits from UCA, making a combined table
    for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
        t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
    }
    collator.m_contractionEnd_ = t.m_contrEndCP_;
}
3391:
/**
 * Copies the parsed rule options into the collator's attributes.
 * @param collator collator whose attributes are to be set
 * @param option parsed option set to copy from
 */
private static final void setAttributes(RuleBasedCollator collator,
        CollationRuleParser.OptionSet option) {
    // NOTE(review): latinOneFailed_ is set true before and false after
    // the attribute updates — presumably to disable the latin-1 fast
    // path while attributes are in flux; confirm against the setters.
    collator.latinOneFailed_ = true;
    collator.m_caseFirst_ = option.m_caseFirst_;
    collator.setDecomposition(option.m_decomposition_);
    collator
            .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
    collator.setCaseLevel(option.m_isCaseLevel_);
    collator.setFrenchCollation(option.m_isFrenchCollation_);
    collator.m_isHiragana4_ = option.m_isHiragana4_;
    collator.setStrength(option.m_strength_);
    collator.m_variableTopValue_ = option.m_variableTopValue_;
    collator.latinOneFailed_ = false;
}
3411:
3412: /**
3413: * Constructing the contraction table
3414: * @param table contraction table
3415: * @return
3416: */
3417: private int constructTable(ContractionTable table) {
3418: // See how much memory we need
3419: int tsize = table.m_elements_.size();
3420: if (tsize == 0) {
3421: return 0;
3422: }
3423: table.m_offsets_.clear();
3424: int position = 0;
3425: for (int i = 0; i < tsize; i++) {
3426: table.m_offsets_.add(new Integer(position));
3427: position += ((BasicContractionTable) table.m_elements_
3428: .get(i)).m_CEs_.size();
3429: }
3430: table.m_CEs_.clear();
3431: table.m_codePoints_.delete(0, table.m_codePoints_.length());
3432: // Now stuff the things in
3433: StringBuffer cpPointer = table.m_codePoints_;
3434: Vector CEPointer = table.m_CEs_;
3435: for (int i = 0; i < tsize; i++) {
3436: BasicContractionTable bct = (BasicContractionTable) table.m_elements_
3437: .get(i);
3438: int size = bct.m_CEs_.size();
3439: char ccMax = 0;
3440: char ccMin = 255;
3441: int offset = CEPointer.size();
3442: CEPointer.add(bct.m_CEs_.get(0));
3443: for (int j = 1; j < size; j++) {
3444: char ch = bct.m_codePoints_.charAt(j);
3445: char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
3446: if (cc > ccMax) {
3447: ccMax = cc;
3448: }
3449: if (cc < ccMin) {
3450: ccMin = cc;
3451: }
3452: cpPointer.append(ch);
3453: CEPointer.add(bct.m_CEs_.get(j));
3454: }
3455: cpPointer.insert(offset, (char) (((ccMin == ccMax) ? 1
3456: : 0 << 8) | ccMax));
3457: for (int j = 0; j < size; j++) {
3458: if (isContractionTableElement(((Integer) CEPointer
3459: .get(offset + j)).intValue())) {
3460: int ce = ((Integer) CEPointer.get(offset + j))
3461: .intValue();
3462: CEPointer
3463: .set(
3464: offset + j,
3465: new Integer(
3466: constructSpecialCE(
3467: getCETag(ce),
3468: ((Integer) table.m_offsets_
3469: .get(getContractionOffset(ce)))
3470: .intValue())));
3471: }
3472: }
3473: }
3474:
3475: for (int i = 0; i <= 0x10FFFF; i++) {
3476: int CE = table.m_mapping_.getValue(i);
3477: if (isContractionTableElement(CE)) {
3478: CE = constructSpecialCE(getCETag(CE),
3479: ((Integer) table.m_offsets_
3480: .get(getContractionOffset(CE)))
3481: .intValue());
3482: table.m_mapping_.setValue(i, CE);
3483: }
3484: }
3485: return position;
3486: }
3487:
3488: /**
3489: * Get contraction offset
3490: * @param ce collation element
3491: * @return contraction offset
3492: */
3493: private static final int getContractionOffset(int ce) {
3494: return ce & 0xFFFFFF;
3495: }
3496:
3497: /**
3498: * Gets the maximum Jamo expansion
3499: * @param mapping trie table
3500: * @param maxexpansion maximum expansion table
3501: * @param maxjamoexpansion maximum jamo expansion table
3502: * @param jamospecial is jamo special?
3503: */
3504: private static void getMaxExpansionJamo(IntTrieBuilder mapping,
3505: MaxExpansionTable maxexpansion,
3506: MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
3507: int VBASE = 0x1161;
3508: int TBASE = 0x11A8;
3509: int VCOUNT = 21;
3510: int TCOUNT = 28;
3511: int v = VBASE + VCOUNT - 1;
3512: int t = TBASE + TCOUNT - 1;
3513:
3514: while (v >= VBASE) {
3515: int ce = mapping.getValue(v);
3516: if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3517: setMaxExpansion(ce, (byte) 2, maxexpansion);
3518: }
3519: v--;
3520: }
3521:
3522: while (t >= TBASE) {
3523: int ce = mapping.getValue(t);
3524: if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3525: setMaxExpansion(ce, (byte) 3, maxexpansion);
3526: }
3527: t--;
3528: }
3529: // According to the docs, 99% of the time, the Jamo will not be special
3530: if (jamospecial) {
3531: // gets the max expansion in all unicode characters
3532: int count = maxjamoexpansion.m_endExpansionCE_.size();
3533: byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
3534: + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
3535: byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
3536:
3537: while (count > 0) {
3538: count--;
3539: if (((Boolean) maxjamoexpansion.m_isV_.get(count))
3540: .booleanValue() == true) {
3541: setMaxExpansion(
3542: ((Integer) maxjamoexpansion.m_endExpansionCE_
3543: .get(count)).intValue(), maxVSize,
3544: maxexpansion);
3545: } else {
3546: setMaxExpansion(
3547: ((Integer) maxjamoexpansion.m_endExpansionCE_
3548: .get(count)).intValue(), maxTSize,
3549: maxexpansion);
3550: }
3551: }
3552: }
3553: }
3554:
3555: /**
3556: * To the UnsafeCP hash table, add all chars with combining class != 0
3557: * @param t build table
3558: */
3559: private static final void unsafeCPAddCCNZ(BuildTable t) {
3560:
3561: for (char c = 0; c < 0xffff; c++) {
3562: char fcd = NormalizerImpl.getFCD16(c);
3563: if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
3564: (UTF16.isLeadSurrogate(c) && fcd != 0)) {
3565: // c is a leading surrogate with some FCD data
3566: unsafeCPSet(t.m_unsafeCP_, c);
3567: }
3568: }
3569:
3570: if (t.m_prefixLookup_ != null) {
3571: Enumeration els = t.m_prefixLookup_.elements();
3572: while (els.hasMoreElements()) {
3573: Elements e = (Elements) els.nextElement();
3574: // codepoints here are in the NFD form. We need to add the
3575: // first code point of the NFC form to unsafe, because
3576: // strcoll needs to backup over them.
3577: // weiv: This is wrong! See the comment above.
3578: //String decomp = Normalizer.decompose(e.m_cPoints_, true);
3579: //unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
3580: // it should be:
3581: String comp = Normalizer.compose(e.m_cPoints_, false);
3582: unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
3583: }
3584: }
3585: }
3586:
/**
 * Creates canonical closure over one character-category range: for every
 * code point in [start, limit) that has a canonical decomposition which
 * the collator does not already treat as equal to the composed form, an
 * element for the composed form is added to the build table.
 * @param t build table to add elements to
 * @param collator collator used to compare composed vs. decomposed forms
 * @param colEl collation element iterator over that collator
 * @param start first code point of the range (inclusive)
 * @param limit end of the range (exclusive)
 * @param type character category of the range
 * @return always true
 */
private boolean enumCategoryRangeClosureCategory(BuildTable t,
        RuleBasedCollator collator, CollationElementIterator colEl,
        int start, int limit, int type) {
    if (type != UCharacterCategory.UNASSIGNED
            && type != UCharacterCategory.PRIVATE_USE) {
        // if the range is assigned - we might ommit more categories later

        for (int u32 = start; u32 < limit; u32++) {
            int noOfDec = NormalizerImpl.getDecomposition(u32,
                    false, m_utilCharBuffer_, 0, 256);
            if (noOfDec > 0) {
                // a positive length means the character HAS a canonical
                // decomposition (written into m_utilCharBuffer_)
                String comp = UCharacter.toString(u32);
                String decomp = new String(m_utilCharBuffer_, 0,
                        noOfDec);
                // only add closure elements where the collator does not
                // already consider the two forms equal
                if (!collator.equals(comp, decomp)) {
                    m_utilElement_.m_cPoints_ = decomp;
                    m_utilElement_.m_prefix_ = 0;
                    Elements prefix = (Elements) t.m_prefixLookup_
                            .get(m_utilElement_);
                    if (prefix == null) {
                        // no prefix entry: collect the CEs of the
                        // decomposed form and map the composed form to them
                        m_utilElement_.m_cPoints_ = comp;
                        m_utilElement_.m_prefix_ = 0;
                        m_utilElement_.m_prefixChars_ = null;
                        colEl.setText(decomp);
                        int ce = colEl.next();
                        m_utilElement_.m_CELength_ = 0;
                        while (ce != CollationElementIterator.NULLORDER) {
                            m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
                            ce = colEl.next();
                        }
                    } else {
                        m_utilElement_.m_cPoints_ = comp;
                        m_utilElement_.m_prefix_ = 0;
                        m_utilElement_.m_prefixChars_ = null;
                        m_utilElement_.m_CELength_ = 1;
                        m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
                        // This character uses a prefix. We have to add it
                        // to the unsafe table, as it decomposed form is
                        // already in. In Japanese, this happens for \u309e
                        // & \u30fe
                        // Since unsafeCPSet is static in ucol_elm, we are
                        // going to wrap it up in the unsafeCPAddCCNZ
                        // function
                    }
                    addAnElement(t, m_utilElement_);
                }
            }
        }
    }
    return true;
}
3649:
3650: /**
3651: * Determine if a character is a Jamo
3652: * @param ch character to test
3653: * @return true if ch is a Jamo, false otherwise
3654: */
3655: private static final boolean isJamo(char ch) {
3656: return (ch >= 0x1100 && ch <= 0x1112)
3657: || (ch >= 0x1175 && ch <= 0x1161)
3658: || (ch >= 0x11A8 && ch <= 0x11C2);
3659: }
3660:
3661: /**
3662: * Produces canonical closure
3663: */
3664: private void canonicalClosure(BuildTable t) {
3665: BuildTable temp = new BuildTable(t);
3666: assembleTable(temp, temp.m_collator_);
3667: // produce canonical closure
3668: CollationElementIterator coleiter = temp.m_collator_
3669: .getCollationElementIterator("");
3670: RangeValueIterator typeiter = UCharacter.getTypeIterator();
3671: RangeValueIterator.Element element = new RangeValueIterator.Element();
3672: while (typeiter.next(element)) {
3673: enumCategoryRangeClosureCategory(t, temp.m_collator_,
3674: coleiter, element.start, element.limit,
3675: element.value);
3676: }
3677: }
3678:
3679: private void processUCACompleteIgnorables(BuildTable t) {
3680: TrieIterator trieiterator = new TrieIterator(
3681: RuleBasedCollator.UCA_.m_trie_);
3682: RangeValueIterator.Element element = new RangeValueIterator.Element();
3683: while (trieiterator.next(element)) {
3684: int start = element.start;
3685: int limit = element.limit;
3686: if (element.value == 0) {
3687: while (start < limit) {
3688: int CE = t.m_mapping_.getValue(start);
3689: if (CE == CE_NOT_FOUND_) {
3690: m_utilElement_.m_prefix_ = 0;
3691: m_utilElement_.m_uchars_ = UCharacter
3692: .toString(start);
3693: m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
3694: m_utilElement_.m_cPointsOffset_ = 0;
3695: m_utilElement_.m_CELength_ = 1;
3696: m_utilElement_.m_CEs_[0] = 0;
3697: addAnElement(t, m_utilElement_);
3698: }
3699: start++;
3700: }
3701: }
3702: }
3703: }
3704: }
|