/**
 *******************************************************************************
 * Copyright (C) 1996-2006, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.text.ParseException;
import java.util.Hashtable;
import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.impl.UCharacterProperty;

/**
 * Class for parsing collation rules; produces a list of tokens that will be
 * turned into collation elements.
 * @author Syn Wee Quek
 * @since release 2.2, June 7 2002
 * @draft 2.2
 */
final class CollationRuleParser {
    // public data members ---------------------------------------------------

    // package private constructors ------------------------------------------

    /**
     * <p>CollationRuleParser constructor that takes the collation rules.
     * Please see the RuleBasedCollator class description for more details on
     * the collation rule syntax.</p>
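     * <p>For illustration only (not a complete grammar): '&amp;' resets the
     * insertion point, while '&lt;', '&lt;&lt;' and '&lt;&lt;&lt;' introduce
     * primary, secondary and tertiary differences respectively, e.g.</p>
     * <pre>
     * &amp;a &lt; b &lt;&lt; c &lt;&lt;&lt; d
     * </pre>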
     * @see java.util.Locale
     * @param rules the collation rules to build the collation table from.
     * @exception ParseException thrown when argument rules have an invalid
     *            syntax.
     * @draft 2.2
     */
    CollationRuleParser(String rules) throws ParseException {
        extractSetsFromRules(rules);
        m_source_ = new StringBuffer(Normalizer.decompose(rules, false).trim());
        m_rules_ = m_source_.toString();
        m_current_ = 0;
        m_extraCurrent_ = m_source_.length();
        m_variableTop_ = null;
        m_parsedToken_ = new ParsedToken();
        m_hashTable_ = new Hashtable();
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
        m_listHeader_ = new TokenListHeader[512];
        m_resultLength_ = 0;
        // call assembleTokenList() manually, so that we can
        // init a parser and manually parse tokens
        //assembleTokenList();
    }

    // package private inner classes -----------------------------------------

    /**
     * Collation options set
     */
    static class OptionSet {
        // package private constructor ---------------------------------------

        /**
         * Initializes the option set with the argument collator.
         * @param collator collator whose options to use
         */
        OptionSet(RuleBasedCollator collator) {
            m_variableTopValue_ = collator.m_variableTopValue_;
            m_isFrenchCollation_ = collator.isFrenchCollation();
            m_isAlternateHandlingShifted_
                = collator.isAlternateHandlingShifted();
            m_caseFirst_ = collator.m_caseFirst_;
            m_isCaseLevel_ = collator.isCaseLevel();
            m_decomposition_ = collator.getDecomposition();
            m_strength_ = collator.getStrength();
            m_isHiragana4_ = collator.m_isHiragana4_;
        }

        // package private data members --------------------------------------

        int m_variableTopValue_;
        boolean m_isFrenchCollation_;
        /**
         * Attribute for handling variable elements
         */
        boolean m_isAlternateHandlingShifted_;
        /**
         * who goes first, lower case or uppercase
         */
        int m_caseFirst_;
        /**
         * do we have an extra case level
         */
        boolean m_isCaseLevel_;
        /**
         * attribute for normalization
         */
        int m_decomposition_;
        /**
         * attribute for strength
         */
        int m_strength_;
        /**
         * attribute for special Hiragana
         */
        boolean m_isHiragana4_;
    }

    /**
     * List of tokens used by the collation rules
     */
    static class TokenListHeader {
        Token m_first_;
        Token m_last_;
        Token m_reset_;
        boolean m_indirect_;
        int m_baseCE_;
        int m_baseContCE_;
        int m_nextCE_;
        int m_nextContCE_;
        int m_previousCE_;
        int m_previousContCE_;
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }

    /**
     * Token wrapper for collation rules
     */
    static class Token {
        // package private data members ---------------------------------------

        int m_CE_[];
        int m_CELength_;
        int m_expCE_[];
        int m_expCELength_;
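        // m_source_, m_expansion_ and m_prefix_ each pack a reference to a
        // substring of m_rules_ as (length << 24) | offset; see hashCode()
        // and equals() below, and assembleTokenList() where they are set.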
        int m_source_;
        int m_expansion_;
        int m_prefix_;
        int m_strength_;
        int m_toInsert_;
        int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
        TokenListHeader m_listHeader_;
        Token m_previous_;
        Token m_next_;
        StringBuffer m_rules_;
        char m_flags_;

        // package private constructors ---------------------------------------

        Token() {
            m_CE_ = new int[128];
            m_expCE_ = new int[128];
            // TODO: this should also handle reverse
            m_polarity_ = TOKEN_POLARITY_POSITIVE_;
            m_next_ = null;
            m_previous_ = null;
            m_CELength_ = 0;
            m_expCELength_ = 0;
        }

        // package private methods --------------------------------------------

        /**
         * Hashcode calculation for token
         * @return the hashcode
         */
        public int hashCode() {
            int result = 0;
            int len = (m_source_ & 0xFF000000) >>> 24;
            int inc = ((len - 32) / 32) + 1;
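            // sample the source with a stride that grows with its length, so
            // that only a bounded number of characters feed the hash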

            int start = m_source_ & 0x00FFFFFF;
            int limit = start + len;

            while (start < limit) {
                result = (result * 37) + m_rules_.charAt(start);
                start += inc;
            }
            return result;
        }

        /**
         * Equals calculation
         * @param target object to compare
         * @return true if target is the same as this object
         */
        public boolean equals(Object target) {
            if (target == this) {
                return true;
            }
            if (target instanceof Token) {
                Token t = (Token) target;
                int sstart = m_source_ & 0x00FFFFFF;
                int tstart = t.m_source_ & 0x00FFFFFF;
                int slimit = (m_source_ & 0xFF000000) >> 24;
                int tlimit = (t.m_source_ & 0xFF000000) >> 24;

                int end = sstart + slimit - 1;

                if (m_source_ == 0 || t.m_source_ == 0) {
                    return false;
                }
                if (slimit != tlimit) {
                    return false;
                }
                if (m_source_ == t.m_source_) {
                    return true;
                }

                while (sstart < end
                       && m_rules_.charAt(sstart)
                          == t.m_rules_.charAt(tstart)) {
                    ++sstart;
                    ++tstart;
                }
                if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
                    return true;
                }
            }
            return false;
        }
    }

    // package private data member -------------------------------------------

    /**
     * Indicator that the token is a reset, i.e. '&' in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Number of token lists (entries used in m_listHeader_)
     */
    int m_resultLength_;
    /**
     * List of parsed tokens
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token
     */
    Token m_variableTop_;
    /**
     * Collation options
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters
     */
    StringBuffer m_source_;
    /**
     * Hash table to keep all tokens
     */
    Hashtable m_hashTable_;

    // package private method ------------------------------------------------

    void setDefaultOptionsInCollator(RuleBasedCollator collator) {
        collator.m_defaultStrength_ = m_options_.m_strength_;
        collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
        collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
        collator.m_defaultIsAlternateHandlingShifted_
            = m_options_.m_isAlternateHandlingShifted_;
        collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
        collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
        collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
        collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
    }

    // private inner classes -------------------------------------------------

    /**
     * This is a token that has been parsed but not yet processed. Used to
     * reduce the number of arguments in the parser
     */
    private static class ParsedToken {
        // private constructor ----------------------------------------------

        /**
         * Empty constructor
         */
        ParsedToken() {
            m_charsLen_ = 0;
            m_charsOffset_ = 0;
            m_extensionLen_ = 0;
            m_extensionOffset_ = 0;
            m_prefixLen_ = 0;
            m_prefixOffset_ = 0;
            m_flags_ = 0;
            m_strength_ = TOKEN_UNSET_;
        }

        // private data members ---------------------------------------------

        int m_strength_;
        int m_charsOffset_;
        int m_charsLen_;
        int m_extensionOffset_;
        int m_extensionLen_;
        int m_prefixOffset_;
        int m_prefixLen_;
        char m_flags_;
        char m_indirectIndex_;
    }

    /**
     * Boundary wrappers
     */
    private static class IndirectBoundaries {
        // package private constructor ---------------------------------------

        IndirectBoundaries(int startce[], int limitce[]) {
            // Set values for the top - TODO: once we have values for all the
            // indirects, we are going to initialize here.
            m_startCE_ = startce[0];
            m_startContCE_ = startce[1];
            if (limitce != null) {
                m_limitCE_ = limitce[0];
                m_limitContCE_ = limitce[1];
            } else {
                m_limitCE_ = 0;
                m_limitContCE_ = 0;
            }
        }

        // package private data members --------------------------------------

        int m_startCE_;
        int m_startContCE_;
        int m_limitCE_;
        int m_limitContCE_;
    }

    /**
     * Collation option rule tag
     */
    private static class TokenOption {
        // package private constructor ---------------------------------------

        TokenOption(String name, int attribute, String suboptions[],
                    int suboptionattributevalue[]) {
            m_name_ = name;
            m_attribute_ = attribute;
            m_subOptions_ = suboptions;
            m_subOptionAttributeValues_ = suboptionattributevalue;
        }

        // package private data member ---------------------------------------

        private String m_name_;
        private int m_attribute_;
        private String m_subOptions_[];
        private int m_subOptionAttributeValues_[];
    }

    // private variables -----------------------------------------------------

    /**
     * Current parsed token
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule
     */
    private String m_rules_;
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /**
     * Current offset in m_source
     */
    private int m_sourceLimit_;
    /**
     * Offset to m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /**
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token is not set yet
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /**
     * Indicator that the rule is in the > polarity, i.e. everything on the
     * right of the rule is less than
     */
    private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, i.e. everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * For use in parsing token options
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An
     * anchor point behaves in exactly the same way as a code point in a reset
     * would, except that it cannot be tailored. A range (currently we only
     * know of the [top] range) will explicitly set the upper bound for
     * generated CEs, thus allowing for better control over how many CEs can
     * be squeezed into the range without a performance penalty. In that
     * respect, we use [top] for tailoring of locales that use CJK characters.
     * Other indirect values are currently a pure convenience; they can be
     * used to ensure that the CEs will always be positioned in the same place
     * relative to a point with known properties (e.g. first primary
     * ignorable).
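     * <p>For illustration only, a reset on an indirect value could look like
     * the following, anchoring the tailored characters relative to the [top]
     * range rather than to a concrete code point:</p>
     * <pre>
     * &amp;[top] &lt; a &lt; b
     * </pre>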
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

    /**
     * Inverse UCA constants
     */
    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
    private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable]            last variable value
     * [last primary ignorable]   largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable]  largest CE for tertiary ignorable
     * [top]                      guaranteed to be above all implicit CEs,
     *                            for now and in the future (in 1.8)
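     * <p>A sketch of how such option tags might appear inline in rule text
     * (illustrative only; readAndSetOption() handles the full set declared
     * in RULES_OPTIONS_ below):</p>
     * <pre>
     * [alternate shifted][caseFirst upper]&amp;a &lt; b
     * </pre>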
     */
    private static final TokenOption RULES_OPTIONS_[];

    static {
        INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
        // UCOL_RESET_TOP_VALUE
        INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_, null);
        // UCOL_LAST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_, null);
        // UCOL_FIRST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_, null);
        // UCOL_LAST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_, null);
        // UCOL_FIRST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_, null);
        // UCOL_LAST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_, null);
        // UCOL_FIRST_VARIABLE
        INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_, null);
        // UCOL_LAST_VARIABLE
        INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_, null);
        // UCOL_FIRST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_, null);
        // UCOL_LAST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_IMPLICIT
        INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_, null);
        // UCOL_LAST_IMPLICIT
        INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
        // UCOL_FIRST_TRAILING
        INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_, null);
        // UCOL_LAST_TRAILING
        INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_, null);
        INDIRECT_BOUNDARIES_[14].m_limitCE_
            = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;

        RULES_OPTIONS_ = new TokenOption[19];
        String option[] = { "non-ignorable", "shifted" };
        int value[] = { RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
                        RuleBasedCollator.AttributeValue.SHIFTED_ };
        RULES_OPTIONS_[0] = new TokenOption("alternate",
            RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, option, value);
        option = new String[1];
        option[0] = "2";
        value = new int[1];
        value[0] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[1] = new TokenOption("backwards",
            RuleBasedCollator.Attribute.FRENCH_COLLATION_, option, value);
        String offonoption[] = new String[2];
        offonoption[0] = "off";
        offonoption[1] = "on";
        int offonvalue[] = new int[2];
        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
            RuleBasedCollator.Attribute.CASE_LEVEL_, offonoption, offonvalue);
        option = new String[3];
        option[0] = "lower";
        option[1] = "upper";
        option[2] = "off";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
        value[2] = RuleBasedCollator.AttributeValue.OFF_;
        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
            RuleBasedCollator.Attribute.CASE_FIRST_, option, value);
        RULES_OPTIONS_[4] = new TokenOption("normalization",
            RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
            offonoption, offonvalue);
        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
            RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
            offonoption, offonvalue);
        option = new String[5];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        option[3] = "4";
        option[4] = "I";
        value = new int[5];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
        RULES_OPTIONS_[6] = new TokenOption("strength",
            RuleBasedCollator.Attribute.STRENGTH_, option, value);
        RULES_OPTIONS_[7] = new TokenOption("variable top",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[8] = new TokenOption("rearrange",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        option = new String[3];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        RULES_OPTIONS_[9] = new TokenOption("before",
            RuleBasedCollator.Attribute.LIMIT_, option, value);
        RULES_OPTIONS_[10] = new TokenOption("top",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        String firstlastoption[] = new String[7];
        firstlastoption[0] = "primary";
        firstlastoption[1] = "secondary";
        firstlastoption[2] = "tertiary";
        firstlastoption[3] = "variable";
        firstlastoption[4] = "regular";
        firstlastoption[5] = "implicit";
        firstlastoption[6] = "trailing";

        int firstlastvalue[] = new int[7];
        Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);

        RULES_OPTIONS_[11] = new TokenOption("first",
            RuleBasedCollator.Attribute.LIMIT_, firstlastoption,
            firstlastvalue);
        RULES_OPTIONS_[12] = new TokenOption("last",
            RuleBasedCollator.Attribute.LIMIT_, firstlastoption,
            firstlastvalue);
        RULES_OPTIONS_[13] = new TokenOption("optimize",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[15] = new TokenOption("undefined",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[18] = new TokenOption("charset",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
    }

    /**
     * Utility data members
     */
    private Token m_utilToken_ = new Token();
    private CollationElementIterator m_UCAColEIter_ =
        RuleBasedCollator.UCA_.getCollationElementIterator("");
    private int m_utilCEBuffer_[] = new int[2];

    // private methods -------------------------------------------------------

    /**
     * Assembles the token list
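     * <p>For illustration only: for rules such as "&amp;a &lt; b &lt;&lt; c",
     * the reset on 'a' opens a new TokenListHeader whose base CEs are fished
     * out of the UCA for 'a', and the tokens for 'b' (primary difference) and
     * 'c' (secondary difference) are then inserted into that header's list in
     * strength order.</p>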
     * @exception ParseException thrown when rules syntax fails
     */
    int assembleTokenList() throws ParseException {
        Token lastToken = null;
        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        int expandNext = 0;

        while (m_current_ < sourcelimit) {
            m_parsedToken_.m_prefixOffset_ = 0;
            if (parseNextToken(lastToken == null) < 0) {
                // we have reached the end
                continue;
            }
            char specs = m_parsedToken_.m_flags_;
            boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
            boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
            int lastStrength = TOKEN_UNSET_;
            if (lastToken != null) {
                lastStrength = lastToken.m_strength_;
            }
            m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            // 4 Lookup each source in the CharsToToken map, and find a
            // sourcetoken
            Token sourceToken = (Token) m_hashTable_.get(m_utilToken_);
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                if (lastToken == null) {
                    // this means that rules haven't started properly
                    throwParseException(m_source_.toString(), 0);
                }
                // 6 Otherwise (when relation != reset)
                if (sourceToken == null) {
                    // If sourceToken is null, create new one
                    sourceToken = new Token();
                    sourceToken.m_rules_ = m_source_;
                    sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                            | m_parsedToken_.m_charsOffset_;
                    sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
                                            | m_parsedToken_.m_prefixOffset_;
                    // TODO: this should also handle reverse
                    sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
                    sourceToken.m_next_ = null;
                    sourceToken.m_previous_ = null;
                    sourceToken.m_CELength_ = 0;
                    sourceToken.m_expCELength_ = 0;
                    m_hashTable_.put(sourceToken, sourceToken);
                } else {
                    // we could have fished out a reset here
                    if (sourceToken.m_strength_ != TOKEN_RESET_
                        && lastToken != sourceToken) {
                        // otherwise remove sourceToken from where it was.
                        if (sourceToken.m_next_ != null) {
                            if (sourceToken.m_next_.m_strength_
                                > sourceToken.m_strength_) {
                                sourceToken.m_next_.m_strength_
                                    = sourceToken.m_strength_;
                            }
                            sourceToken.m_next_.m_previous_
                                = sourceToken.m_previous_;
                        } else {
                            sourceToken.m_listHeader_.m_last_
                                = sourceToken.m_previous_;
                        }
                        if (sourceToken.m_previous_ != null) {
                            sourceToken.m_previous_.m_next_
                                = sourceToken.m_next_;
                        } else {
                            sourceToken.m_listHeader_.m_first_
                                = sourceToken.m_next_;
                        }
                        sourceToken.m_next_ = null;
                        sourceToken.m_previous_ = null;
                    }
                }
                sourceToken.m_strength_ = m_parsedToken_.m_strength_;
                sourceToken.m_listHeader_ = lastToken.m_listHeader_;

                // 1. Find the strongest strength in each list, and set
                // strongestP and strongestN accordingly in the headers.
                if (lastStrength == TOKEN_RESET_
                    || sourceToken.m_listHeader_.m_first_ == null) {
                    // If LAST is a reset insert sourceToken in the list.
                    if (sourceToken.m_listHeader_.m_first_ == null) {
                        sourceToken.m_listHeader_.m_first_ = sourceToken;
                        sourceToken.m_listHeader_.m_last_ = sourceToken;
                    } else { // we need to find a place for us
                        // and we'll get in front of the same strength
                        if (sourceToken.m_listHeader_.m_first_.m_strength_
                            <= sourceToken.m_strength_) {
                            sourceToken.m_next_
                                = sourceToken.m_listHeader_.m_first_;
                            sourceToken.m_next_.m_previous_ = sourceToken;
                            sourceToken.m_listHeader_.m_first_ = sourceToken;
                            sourceToken.m_previous_ = null;
                        } else {
                            lastToken = sourceToken.m_listHeader_.m_first_;
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_last_
                                    = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken;
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        }
                    }
                } else {
                    // Otherwise (when LAST is not a reset)
                    // if polarity (LAST) == polarity(relation), insert
                    // sourceToken after LAST, otherwise insert before.
                    // when inserting after or before, search to the next
                    // position with the same strength in that direction.
                    // (This is called postpone insertion).
                    if (sourceToken != lastToken) {
                        if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            sourceToken.m_previous_ = lastToken;
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_last_
                                    = sourceToken;
                            }
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        } else {
                            while (lastToken.m_previous_ != null
                                   && lastToken.m_previous_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_previous_;
                            }
                            sourceToken.m_next_ = lastToken;
                            if (lastToken.m_previous_ != null) {
                                lastToken.m_previous_.m_next_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_first_
                                    = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken.m_previous_;
                            lastToken.m_previous_ = sourceToken;
                        }
                    } else { // repeated one thing twice in rules, stay with
                             // the stronger strength
                        if (lastStrength < sourceToken.m_strength_) {
                            sourceToken.m_strength_ = lastStrength;
                        }
                    }
                }
                // if the token was a variable top, we're going to put it in
                if (variableTop == true && m_variableTop_ == null) {
                    variableTop = false;
                    m_variableTop_ = sourceToken;
                }
                // Treat the expansions.
                // There are two types of expansions: explicit (x / y) and
                // reset based propagating expansions
                // (&abc * d * e <=> &ab * d / c * e / c)
                // if both of them are in effect for a token, they are
                // combined.
                sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                                           | m_parsedToken_.m_extensionOffset_;
                if (expandNext != 0) {
                    if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
                        // primary strength kills off the implicit expansion
                        expandNext = 0;
                    } else if (sourceToken.m_expansion_ == 0) {
                        // if there is no expansion, implicit is just added to
                        // the token
                        sourceToken.m_expansion_ = expandNext;
                    } else {
                        // there is both explicit and implicit expansion.
                        // We need to make a combination
                        int start = expandNext & 0xFFFFFF;
                        int size = expandNext >>> 24;
                        if (size > 0) {
                            m_source_.append(m_source_.substring(start,
                                                                 start + size));
                        }
                        start = m_parsedToken_.m_extensionOffset_;
                        m_source_.append(m_source_.substring(start,
                            start + m_parsedToken_.m_extensionLen_));
                        sourceToken.m_expansion_
                            = (size + m_parsedToken_.m_extensionLen_) << 24
                              | m_extraCurrent_;
                        m_extraCurrent_ += size
                                           + m_parsedToken_.m_extensionLen_;
                    }
                }
                // if the previous token was a reset before, the strength of
                // this token must match the strength of before. Otherwise we
                // have an undefined situation.
                // In other words, we currently have a kludge which we use to
                // represent &a >> x. This is written as &[before 2]a << x.
                if ((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
                    int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_)
                                         - 1;
                    if (beforeStrength != sourceToken.m_strength_) {
                        throwParseException(m_source_.toString(), m_current_);
                    }
                }
            } else {
                if (lastToken != null && lastStrength == TOKEN_RESET_) {
                    // if the previous token was also a reset, this means that
                    // we have two consecutive resets and we want to remove
                    // the previous one if empty
                    if (m_resultLength_ > 0
                        && m_listHeader_[m_resultLength_ - 1].m_first_
                           == null) {
                        m_resultLength_--;
                    }
                }
                if (sourceToken == null) {
                    // this is a reset, but it might still be somewhere in the
                    // tailoring, in shorter form
                    int searchCharsLen = m_parsedToken_.m_charsLen_;
                    while (searchCharsLen > 1 && sourceToken == null) {
                        searchCharsLen--;
                        // key = searchCharsLen << 24 | charsOffset;
                        m_utilToken_.m_source_ = searchCharsLen << 24
                                            | m_parsedToken_.m_charsOffset_;
                        m_utilToken_.m_rules_ = m_source_;
                        sourceToken = (Token) m_hashTable_.get(m_utilToken_);
                    }
                    if (sourceToken != null) {
                        expandNext = (m_parsedToken_.m_charsLen_
                                      - searchCharsLen) << 24
                                     | (m_parsedToken_.m_charsOffset_
                                        + searchCharsLen);
                    }
                }
                if ((specs & TOKEN_BEFORE_) != 0) {
                    if (top == false) {
                        // we're doing before & there is no indirection
                        int strength = (specs & TOKEN_BEFORE_) - 1;
                        if (sourceToken != null
                            && sourceToken.m_strength_ != TOKEN_RESET_) {
                            // this is a before that is already ordered in the
                            // UCA - so we need to get the previous with good
                            // strength
                            while (sourceToken.m_strength_ > strength
                                   && sourceToken.m_previous_ != null) {
                                sourceToken = sourceToken.m_previous_;
                            }
                            // here, either we hit the strength or NULL
                            if (sourceToken.m_strength_ == strength) {
                                if (sourceToken.m_previous_ != null) {
                                    sourceToken = sourceToken.m_previous_;
                                } else { // start of list
                                    sourceToken
                                        = sourceToken.m_listHeader_.m_reset_;
                                }
                            } else { // we hit NULL, we should be doing the
                                     // else part
                                sourceToken
                                    = sourceToken.m_listHeader_.m_reset_;
                                sourceToken = getVirginBefore(sourceToken,
                                                              strength);
                            }
                        } else {
                            sourceToken = getVirginBefore(sourceToken,
                                                          strength);
                        }
                    } else {
                        // this is both before and indirection
                        top = false;
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
                        // we need to do slightly more work. we need to get
                        // the baseCE using the inverse UCA & getPrevious. The
                        // next bound is not set, and will be decided in
                        // ucol_bld
                        int strength = (specs & TOKEN_BEFORE_) - 1;
                        int baseCE = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_].m_startCE_;
                        int baseContCE = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_].m_startContCE_;
                        int ce[] = new int[2];
                        if ((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_
                                 .PRIMARY_IMPLICIT_MIN_)
                            && (baseCE >>> 24 <= RuleBasedCollator
                                    .UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) {
                            /* implicits - */
                            int primary = baseCE
                                & RuleBasedCollator.CE_PRIMARY_MASK_
                                | (baseContCE
                                   & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
                            int raw = RuleBasedCollator.impCEGen_
                                          .getRawFromImplicit(primary);
                            int primaryCE = RuleBasedCollator.impCEGen_
                                                .getImplicitFromRaw(raw - 1);
                            ce[0] = primaryCE
                                    & RuleBasedCollator.CE_PRIMARY_MASK_
                                    | 0x0505;
                            ce[1] = (primaryCE << 16)
                                    & RuleBasedCollator.CE_PRIMARY_MASK_
                                    | RuleBasedCollator
                                          .CE_CONTINUATION_MARKER_;
                        } else {
                            CollationParsedRuleBuilder.InverseUCA invuca
                                = CollationParsedRuleBuilder.INVERSE_UCA_;
                            invuca.getInversePrevCE(baseCE, baseContCE,
                                                    strength, ce);
                        }
                        m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
                        m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;

                        sourceToken = new Token();
                        expandNext = initAReset(0, sourceToken);
                    }
                }
                // 5 If the relation is a reset:
                //   If sourceToken is null
                //     Create new list, create new sourceToken, make the
                //     baseCE from source, put the sourceToken in ListHeader
                //     of the new list
                if (sourceToken == null) {
                    if (m_listHeader_[m_resultLength_] == null) {
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
                    }
                    // 3 Consider each item: relation, source, and expansion:
                    //   e.g. ...< x / y ...
                    //   First convert all expansions into normal form.
                    //   Examples:
                    //   If "xy" doesn't occur earlier in the list or in the
                    //   UCA, convert &xy * c * d * ... into &x * c/y * d * ...
                    //   Note: reset values can never have expansions, although
                    //   they can cause the very next item to have one. They
                    //   may be contractions, if they are found earlier in the
                    //   list.
                    if (top == false) {
                        CollationElementIterator coleiter
                            = RuleBasedCollator.UCA_
                                  .getCollationElementIterator(
                                      m_source_.substring(
                                          m_parsedToken_.m_charsOffset_,
                                          m_parsedToken_.m_charsOffset_
                                          + m_parsedToken_.m_charsLen_));

                        int CE = coleiter.next();
                        // offset to the character in the full rule string
                        int expand = coleiter.getOffset()
                                     + m_parsedToken_.m_charsOffset_;
                        int SecondCE = coleiter.next();

                        m_listHeader_[m_resultLength_].m_baseCE_
                            = CE & 0xFFFFFF3F;
                        if (RuleBasedCollator.isContinuation(SecondCE)) {
                            m_listHeader_[m_resultLength_].m_baseContCE_
                                = SecondCE;
                        } else {
                            m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                        }
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = false;
                        sourceToken = new Token();
                        expandNext = initAReset(expand, sourceToken);
                    } else { // top == TRUE
                        top = false;
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
                        IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_];
                        m_listHeader_[m_resultLength_].m_baseCE_
                            = ib.m_startCE_;
                        m_listHeader_[m_resultLength_].m_baseContCE_
                            = ib.m_startContCE_;
                        m_listHeader_[m_resultLength_].m_nextCE_
                            = ib.m_limitCE_;
                        m_listHeader_[m_resultLength_].m_nextContCE_
                            = ib.m_limitContCE_;
                        sourceToken = new Token();
                        expandNext = initAReset(0, sourceToken);
                    }
                } else { // reset to something already in rules
                    top = false;
                }
            }
            // 7 After all this, set LAST to point to sourceToken, and goto
            // step 3.
            lastToken = sourceToken;
        }

        if (m_resultLength_ > 0
            && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
            m_resultLength_--;
        }
        return m_resultLength_;
    }

    /**
     * Formats and throws a ParseException
     * @param rules collation rule that failed
     * @param offset failed offset in rules
     * @throws ParseException with failure information
     */
    private static final void throwParseException(String rules, int offset)
        throws ParseException {
        // for pre-context
        String precontext = rules.substring(0, offset);
        String postcontext = rules.substring(offset, rules.length());
        StringBuffer error = new StringBuffer(
            "Parse error occurred in rule at offset ");
        error.append(offset);
        error.append("\n after the prefix \"");
        error.append(precontext);
        error.append("\" before the suffix \"");
        error.append(postcontext);
        error.append("\"");
        throw new ParseException(error.toString(), offset);
    }

    private final boolean doSetTop() {
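        // Writes a synthetic anchor into the extra space of m_source_: a
        // 0xFFFE marker followed by the indirect boundary's start CE (and its
        // continuation CE, if present), each CE split into two UTF-16 units.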
        m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
        m_source_.append((char) 0xFFFE);
        IndirectBoundaries ib
            = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
        m_source_.append((char) (ib.m_startCE_ >> 16));
        m_source_.append((char) (ib.m_startCE_ & 0xFFFF));
        m_extraCurrent_ += 3;
        if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_]
                .m_startContCE_ == 0) {
            m_parsedToken_.m_charsLen_ = 3;
        } else {
            m_source_.append((char) (INDIRECT_BOUNDARIES_[
                m_parsedToken_.m_indirectIndex_].m_startContCE_ >> 16));
            m_source_.append((char) (INDIRECT_BOUNDARIES_[
                m_parsedToken_.m_indirectIndex_].m_startContCE_ & 0xFFFF));
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_ = 5;
        }
        return true;
    }

    private static boolean isCharNewLine(char c) {
        switch (c) {
            case 0x000A: /* LF */
            case 0x000D: /* CR */
            case 0x000C: /* FF */
            case 0x0085: /* NEL */
            case 0x2028: /* LS */
            case 0x2029: /* PS */
                return true;
            default:
                return false;
        }
    }

    /**
     * Getting the next token
     *
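     * <p>For illustration only: in "&amp;a &lt; b", the relation characters
     * ('&amp;', '&lt;', '&lt;&lt;', '&lt;&lt;&lt;', ';', ',', '=') both set
     * the strength of the upcoming entry and terminate the previous one, so
     * successive calls yield a reset token for 'a' and then a primary
     * difference token for 'b'.</p>
     *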
     * @param startofrules
     *            flag indicating if we are at the start of rules
     * @return the offset of the rules
     * @exception ParseException
     *                thrown when rule parsing fails
     */
    private int parseNextToken(boolean startofrules) throws ParseException {
        // parsing part
        boolean variabletop = false;
        boolean top = false;
        boolean inchars = true;
        boolean inquote = false;
        boolean wasinquote = false;
        byte before = 0;
        boolean isescaped = false;
        int /*newcharslen = 0,*/ newextensionlen = 0;
        int /*charsoffset = 0,*/ extensionoffset = 0;
        int newstrength = TOKEN_UNSET_;

        m_parsedToken_.m_charsLen_ = 0;
        m_parsedToken_.m_charsOffset_ = 0;
        m_parsedToken_.m_prefixOffset_ = 0;
        m_parsedToken_.m_prefixLen_ = 0;
        m_parsedToken_.m_indirectIndex_ = 0;

        int limit = m_rules_.length();
        while (m_current_ < limit) {
            char ch = m_source_.charAt(m_current_);
            if (inquote) {
                if (ch == 0x0027) { // '\''
                    inquote = false;
                } else {
                    if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
                        if (m_parsedToken_.m_charsLen_ == 0) {
                            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                        }
                        m_parsedToken_.m_charsLen_++;
                    } else {
                        if (newextensionlen == 0) {
                            extensionoffset = m_extraCurrent_;
                        }
                        newextensionlen++;
                    }
                }
            } else if (isescaped) {
                isescaped = false;
                if (newstrength == TOKEN_UNSET_) {
                    throwParseException(m_rules_, m_current_);
                }
                if (ch != 0 && m_current_ != limit) {
                    if (inchars) {
                        if (m_parsedToken_.m_charsLen_ == 0) {
                            m_parsedToken_.m_charsOffset_ = m_current_;
                        }
                        m_parsedToken_.m_charsLen_++;
                    } else {
                        if (newextensionlen == 0) {
                            extensionoffset = m_current_;
                        }
                        newextensionlen++;
                    }
                }
            } else {
                if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
                    // Sets the strength for this entry
                    switch (ch) {
                        case 0x003D: // '='
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.IDENTICAL;
                            break;
                        case 0x002C: // ','
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.TERTIARY;
                            break;
                        case 0x003B: // ';'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.SECONDARY;
                            break;
                        case 0x003C: // '<'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // before this, do a scan to verify whether this
                            // is another strength
                            if (m_source_.charAt(m_current_ + 1) == 0x003C) {
                                m_current_++;
                                if (m_source_.charAt(m_current_ + 1)
                                    == 0x003C) {
                                    m_current_++; // three in a row!
                                    newstrength = Collator.TERTIARY;
                                } else { // two in a row
                                    newstrength = Collator.SECONDARY;
                                }
                            } else { // just one
                                newstrength = Collator.PRIMARY;
                            }
                            break;
                        case 0x0026: // '&'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = TOKEN_RESET_; // PatternEntry::RESET
                                                        // = 0
                            break;
                        case 0x005b: // '['
                            // options - read an option, analyze it
                            m_optionEnd_ = m_rules_.indexOf(0x005d,
                                                            m_current_);
                            if (m_optionEnd_ != -1) { // ']'
                                byte result = readAndSetOption();
                                m_current_ = m_optionEnd_;
                                if ((result & TOKEN_TOP_MASK_) != 0) {
                                    if (newstrength == TOKEN_RESET_) {
                                        top = doSetTop();
                                        if (before != 0) {
                                            // This is a combination of before
                                            // and indirection like
                                            // '&[before 2][first regular]<b'
                                            m_source_.append((char) 0x002d);
                                            m_source_.append((char) before);
                                            m_extraCurrent_ += 2;
                                            m_parsedToken_.m_charsLen_ += 2;
                                        }
                                        m_current_++;
                                        return doEndParseNextToken(
                                            newstrength, true,
                                            extensionoffset, newextensionlen,
                                            variabletop, before);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                } else if ((result
                                            & TOKEN_VARIABLE_TOP_MASK_) != 0) {
                                    if (newstrength != TOKEN_RESET_
                                        && newstrength != TOKEN_UNSET_) {
                                        variabletop = true;
                                        m_parsedToken_.m_charsOffset_
                                            = m_extraCurrent_;
                                        m_source_.append((char) 0xFFFF);
                                        m_extraCurrent_++;
                                        m_current_++;
                                        m_parsedToken_.m_charsLen_ = 1;
                                        return doEndParseNextToken(
                                            newstrength, top,
                                            extensionoffset, newextensionlen,
                                            variabletop, before);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                } else if ((result & TOKEN_BEFORE_) != 0) {
                                    if (newstrength == TOKEN_RESET_) {
                                        before = (byte) (result
                                                         & TOKEN_BEFORE_);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                }
                            }
                            break;
                        case 0x002F: // '/'
                            wasinquote = false; // if we were copying source
                                                // characters, we want to stop
                                                // now
                            inchars = false; // we're now processing expansion
                            break;
                        case 0x005C: // back slash for escaped chars
                            isescaped = true;
                            break;
                        // found a quote, we're gonna start copying
                        case 0x0027: // '\''
                            if (newstrength == TOKEN_UNSET_) {
                                // quote is illegal until we have a strength
                                throwParseException(m_rules_, m_current_);
                            }
                            inquote = true;
                            if (inchars) { // we're doing characters
                                if (wasinquote == false) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_extraCurrent_;
                                }
                                if (m_parsedToken_.m_charsLen_ != 0) {
                                    m_source_.append(m_source_.substring(
                                        m_current_
                                        - m_parsedToken_.m_charsLen_,
                                        m_current_));
                                    m_extraCurrent_
                                        += m_parsedToken_.m_charsLen_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            } else { // we're doing an expansion
                                if (wasinquote == false) {
                                    extensionoffset = m_extraCurrent_;
                                }
                                if (newextensionlen != 0) {
                                    m_source_.append(m_source_.substring(
                                        m_current_ - newextensionlen,
                                        m_current_));
                                    m_extraCurrent_ += newextensionlen;
                                }
                                newextensionlen++;
                            }
                            wasinquote = true;
                            m_current_++;
                            ch = m_source_.charAt(m_current_);
                            if (ch == 0x0027) { // copy the double quote
                                m_source_.append(ch);
                                m_extraCurrent_++;
                                inquote = false;
                            }
                            break;
                        // '@' is french only if the strength is not currently
                        // set - if it is, it's just a regular character in
                        // collation
                        case 0x0040: // '@'
                            if (newstrength == TOKEN_UNSET_) {
                                m_options_.m_isFrenchCollation_ = true;
                                break;
                            }
                        case 0x007C: // '|'
                            // this means we have actually been reading prefix
                            // part - we want to store read characters to the
                            // prefix part and continue reading the characters
                            // (proper way would be to restart reading the
                            // chars, but in that case we would have to
                            // complicate the token hasher, which I do not
                            // intend to play with. Instead, we will do
                            // prefixes when prefixes are due (before adding
                            // the elements).
                            m_parsedToken_.m_prefixOffset_
                                = m_parsedToken_.m_charsOffset_;
                            m_parsedToken_.m_prefixLen_
                                = m_parsedToken_.m_charsLen_;
                            if (inchars) { // we're doing characters
                                if (wasinquote == false) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_extraCurrent_;
                                }
                                if (m_parsedToken_.m_charsLen_ != 0) {
                                    String prefix = m_source_.substring(
                                        m_current_
                                        - m_parsedToken_.m_charsLen_,
                                        m_current_);
                                    m_source_.append(prefix);
                                    m_extraCurrent_
                                        += m_parsedToken_.m_charsLen_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            }
                            wasinquote = true;
                            do {
                                m_current_++;
                                ch = m_source_.charAt(m_current_);
                                // skip whitespace between '|' and the
                                // character
                            } while (UCharacterProperty.isRuleWhiteSpace(ch));
                            break;
                        case 0x0023: // '#' - this is a comment, skip
                                     // everything through the end of line
                            do {
                                m_current_++;
                                ch = m_source_.charAt(m_current_);
                            } while (!isCharNewLine(ch));
                            break;
                        case 0x0021: // '!' - ignoring java set thai
                                     // reordering
                            break;
                        default:
                            if (newstrength == TOKEN_UNSET_) {
                                throwParseException(m_rules_, m_current_);
                            }
                            if (isSpecialChar(ch) && (inquote == false)) {
                                throwParseException(m_rules_, m_current_);
                            }
                            if (ch == 0x0000 && m_current_ + 1 == limit) {
                                break;
                            }
                            if (inchars) {
                                if (m_parsedToken_.m_charsLen_ == 0) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_current_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            } else {
                                if (newextensionlen == 0) {
                                    extensionoffset = m_current_;
                                }
                                newextensionlen++;
                            }
                            break;
                    }
                }
            }
            if (wasinquote) {
                if (ch != 0x27) {
                    m_source_.append(ch);
                    m_extraCurrent_++;
                }
            }
            m_current_++;
        }
        return doEndParseNextToken(newstrength, top, extensionoffset,
                                   newextensionlen, variabletop, before);
    }

    /**
     * Ends the parsing of the next token, filling in m_parsedToken_.
     * @param newstrength new strength
     * @param top whether the token is an indirect [top] style reset
     * @param extensionoffset offset of the expansion characters
     * @param newextensionlen length of the expansion characters
     * @param variabletop whether variable top was set
     * @param before the before flag bits, if a before attribute was set
     * @return offset in rules, -1 for end of rules
     */
    private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
                                    boolean top, /*int charsoffset,*/
                                    int extensionoffset, int newextensionlen,
                                    boolean variabletop, int before)
        throws ParseException {
        if (newstrength == TOKEN_UNSET_) {
            return -1;
        }
        if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
            throwParseException(m_rules_, m_current_);
        }

        m_parsedToken_.m_strength_ = newstrength;
        //m_parsedToken_.m_charsOffset_ = charsoffset;
        //m_parsedToken_.m_charsLen_ = newcharslen;
        m_parsedToken_.m_extensionOffset_ = extensionoffset;
        m_parsedToken_.m_extensionLen_ = newextensionlen;
        m_parsedToken_.m_flags_ = (char)
            ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
             | (top ? TOKEN_TOP_MASK_ : 0) | before);
        return m_current_;
    }

    /**
     * Gets the token that sorts directly before the argument element, fishing
     * the anchor from the UCA when the element has not been tailored yet.
     * @param sourcetoken token to find the previous token for, or null to use
     *        the current parsed token
     * @param strength collation strength of the before relation
     * @return the token before the source token
     * @exception ParseException thrown when rules have the wrong syntax
     */
    private Token getVirginBefore(Token sourcetoken, int strength)
        throws ParseException {
        // this is a virgin before - we need to fish the anchor from the UCA
        if (sourcetoken != null) {
            int offset = sourcetoken.m_source_ & 0xFFFFFF;
            m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        } else {
            m_UCAColEIter_.setText(m_source_.substring(
                m_parsedToken_.m_charsOffset_,
                m_parsedToken_.m_charsOffset_ + 1));
        }

        int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
        int basecontce = m_UCAColEIter_.next();
        if (basecontce == CollationElementIterator.NULLORDER) {
            basecontce = 0;
        }

        int ch = 0;

        if ((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_
                                  .PRIMARY_IMPLICIT_MIN_)
            && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_
                                     .PRIMARY_IMPLICIT_MAX_)) {
            /* implicits - */
            int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_
                          | (basecontce
                             & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
            ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw - 1);
            int primaryCE = RuleBasedCollator.impCEGen_
                                .getImplicitFromRaw(raw - 1);
            m_utilCEBuffer_[0] = primaryCE
                                 & RuleBasedCollator.CE_PRIMARY_MASK_
                                 | 0x0505;
            m_utilCEBuffer_[1] = (primaryCE << 16)
                                 & RuleBasedCollator.CE_PRIMARY_MASK_
                                 | RuleBasedCollator.CE_CONTINUATION_MARKER_;

            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            m_source_.append('\uFFFE');
            m_source_.append((char) ch);
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_++;

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token) m_hashTable_.get(m_utilToken_);

            if (sourcetoken == null) {
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                    = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                        = m_utilCEBuffer_[1];
                } else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;

                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            }

        } else {

            // first ce and second ce m_utilCEBuffer_
            int invpos = CollationParsedRuleBuilder.INVERSE_UCA_
                             .getInversePrevCE(basece, basecontce, strength,
                                               m_utilCEBuffer_);
            // we got the previous CE. Now we need to see if the difference
            // between the two CEs is really of the requested strength.
            // if it's a bigger difference (we asked for secondary and got
            // primary), we need to modify the CE.
            if (CollationParsedRuleBuilder.INVERSE_UCA_
                    .getCEStrengthDifference(basece, basecontce,
                                             m_utilCEBuffer_[0],
                                             m_utilCEBuffer_[1]) < strength) {
                // adjust the strength
                // now we are in the situation where our baseCE should
                // actually be modified in order to get the CE in the right
                // position.
                if (strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[0] = basece - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[0] = basece - 0x02;
                }
                if (RuleBasedCollator.isContinuation(basecontce)) {
                    if (strength == Collator.SECONDARY) {
                        m_utilCEBuffer_[1] = basecontce - 0x0200;
                    } else { // strength == UCOL_TERTIARY
                        m_utilCEBuffer_[1] = basecontce - 0x02;
                    }
                }
            }

            /*
            // the code below relies on getting a code point from the inverse
            // table, in order to be able to merge the situations like
            // &x < 9 &[before 1]a < d. This won't work:
            // 1. There are many code points that have the same CE
            // 2. The CE to codepoint table (things pointed to by
            //    CETable[3*invPos+2]) are broken.
            // Also, in case when there is no equivalent strength before an
            // element, we have to actually construct one. For example,
            // &[before 2]a << x won't result in x << a, because the element
            // before a is a primary difference.
            ch = CollationParsedRuleBuilder.INVERSE_UCA_
                     .m_table_[3 * invpos + 2];
            if ((ch & INVERSE_SIZE_MASK_) != 0) {
                int offset = ch & INVERSE_OFFSET_MASK_;
                ch = CollationParsedRuleBuilder.INVERSE_UCA_
                         .m_continuations_[offset];
            }
            m_source_.append((char) ch);
            m_extraCurrent_++;
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
            m_parsedToken_.m_charsLen_ = 1;

            // We got an UCA before. However, this might have been tailored.
            // example:
            // &\u30ca = \u306a
            // &[before 3]\u306a<<<\u306a|\u309d

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token) m_hashTable_.get(m_utilToken_);
            */

            // Here is how it should be: a situation such as &[before 1]a < x
            // should be resolved exactly as if we wrote &a > x.
            // Therefore, it does not matter whether the UCA value before a
            // has been tailored. What does matter is whether the strength
            // between our element and the previous element is bigger than
            // requested. So, if CE < baseCE and we asked for &[before 2],
            // we have to construct the base CE.
            // If we found a tailored thing, we would have to use the UCA
            // value and construct a new reset token with a constructed name.
            //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
                // character to which we want to anchor is already tailored.
                // We need to construct a new token which will be the anchor point
                //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
                //m_source_.append(ch);
                //m_extraCurrent_++;
                //m_parsedToken_.m_charsLen_++;
            // grab the preceding "[before n]" text (10 characters) into the
            // token name, so the constructed reset token gets its own name
            m_parsedToken_.m_charsOffset_ -= 10;
            m_parsedToken_.m_charsLen_ += 10;
            m_listHeader_[m_resultLength_] = new TokenListHeader();
            m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F;
            if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1];
            } else {
                m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
            }
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = false;
            sourcetoken = new Token();
            initAReset(-1, sourcetoken);
            //}
        }
        return sourcetoken;
    }

    /**
     * Processing description:
     * 1. Build a m_listHeader_. Each list has a header, which contains two
     *    lists (positive and negative), a reset token, a baseCE, a nextCE,
     *    and a previousCE. The lists and the reset may be null.
     * 2. As you process, keep a LAST pointer that points to the last token
     *    you handled.
     * @param expand string offset of the expansion, -1 for null strings
     * @param targetToken token to update
     * @return expandnext offset
     * @throws ParseException thrown when the rule syntax is invalid
     */
    private int initAReset(int expand, Token targetToken)
            throws ParseException {
        if (m_resultLength_ == m_listHeader_.length - 1) {
            // grow the header array; tokens hold references to the
            // TokenListHeader objects themselves, so copying the array of
            // references is safe
            TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
            System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
            m_listHeader_ = temp;
        }
        // do the reset thing
        targetToken.m_rules_ = m_source_;
        targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                | m_parsedToken_.m_charsOffset_;
        targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                | m_parsedToken_.m_extensionOffset_;
        // keep the flags around so that we know about before
        targetToken.m_flags_ = m_parsedToken_.m_flags_;

        if (m_parsedToken_.m_prefixOffset_ != 0) {
            // a reset token may not carry a prefix
            throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
        }

        targetToken.m_prefix_ = 0;
        // TODO: this should also handle reverse
        targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
        targetToken.m_strength_ = TOKEN_RESET_;
        targetToken.m_next_ = null;
        targetToken.m_previous_ = null;
        targetToken.m_CELength_ = 0;
        targetToken.m_expCELength_ = 0;
        targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
        m_listHeader_[m_resultLength_].m_first_ = null;
        m_listHeader_[m_resultLength_].m_last_ = null;
        m_listHeader_[m_resultLength_].m_reset_ = targetToken;

        /* 3. Consider each item: relation, source, and expansion:
         *    e.g. ...< x / y ...
         *    First convert all expansions into normal form. Examples:
         *    If "xy" doesn't occur earlier in the list or in the UCA, convert
         *    &xy * c * d * ... into &x * c/y * d * ...
         *    Note: reset values can never have expansions, although they can
         *    cause the very next item to have one. They may be contractions,
         *    if they are found earlier in the list.
         */
        int result = 0;
        if (expand > 0) {
            // check to see if there is an expansion
            if (m_parsedToken_.m_charsLen_ > 1) {
                targetToken.m_source_ = ((expand - m_parsedToken_.m_charsOffset_) << 24)
                        | m_parsedToken_.m_charsOffset_;
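                // split the packed fields: the reset token keeps only the
                // characters before the expansion offset, while the value
                // returned below packs the expansion's length (high 8 bits)
                // and offset (low 24 bits), mirroring Token.m_source_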
                result = ((m_parsedToken_.m_charsLen_
                        + m_parsedToken_.m_charsOffset_ - expand) << 24)
                        | expand;
            }
        }

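        // register the reset token; it serves as both key and value in the
        // hash table, which presumes Token's equals/hashCode are defined
        // over its packed source characters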
        m_resultLength_++;
        m_hashTable_.put(targetToken, targetToken);
        return result;
    }

    /**
     * Checks if a character is special, i.e. one of the ASCII punctuation
     * characters that are significant in the collation rule syntax.
     * @param ch character to test
     * @return true if the character is special
     */
    private static final boolean isSpecialChar(char ch) {
        return (ch <= 0x002F && ch >= 0x0020)
                || (ch <= 0x003F && ch >= 0x003A)
                || (ch <= 0x0060 && ch >= 0x005B)
                || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
    }

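    /**
     * Reads a UnicodeSet pattern from the rule string, scanning forward from
     * start to the first '[' and on to the matching closing ']'. For example
     * (illustrative only, not from the original source): with the rules
     * "&amp;a &lt; b [copy [a-z]]" and start pointing into the copy option,
     * this returns the set [a-z].
     * @param source rule string containing the set pattern
     * @param start offset from which to scan for the pattern
     * @return the parsed UnicodeSet
     * @throws ParseException thrown when the pattern brackets are unbalanced
     */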
    private UnicodeSet readAndSetUnicodeSet(String source, int start)
            throws ParseException {
        while (source.charAt(start) != '[') { // advance until the first '['
            start++;
        }
        // Now we need a balanced run of '[]'. A set pattern can contain
        // nested '[]' pairs, so we track the nesting depth until the
        // matching closing ']' is found.
        int noOpenBraces = 1;
        int current = 1; // skip the opening bracket
        while (start + current < source.length() && noOpenBraces != 0) {
            if (source.charAt(start + current) == '[') {
                noOpenBraces++;
            } else if (source.charAt(start + current) == ']') { // closing bracket
                noOpenBraces--;
            }
            current++;
        }
        //int nextBrace = -1;

        if (noOpenBraces != 0
                || (/*nextBrace =*/ source
                        .indexOf("]", start + current) /*']'*/) == -1) {
            throwParseException(m_rules_, start);
        }
        return new UnicodeSet(source.substring(start, start + current)); //uset_openPattern(start, current);
    }

    /**
     * In C, optionarg is passed to the function by reference.
     * We use a private int to simulate this.
     */
    private int m_optionarg_ = 0;

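    /**
     * Looks up the option name that starts at the given offset in
     * RULES_OPTIONS_. On a match, m_optionarg_ is set to the offset of the
     * option's argument (leading whitespace skipped), or left at 0 when the
     * option has no argument.
     * @param rules rule string to scan
     * @param start offset of the first character of the option name
     * @param optionend offset just past the end of the option
     * @return index of the option in RULES_OPTIONS_, or -1 if none matches
     */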
    private int readOption(String rules, int start, int optionend) {
        m_optionarg_ = 0;
        int i = 0;
        while (i < RULES_OPTIONS_.length) {
            String option = RULES_OPTIONS_[i].m_name_;
            int optionlength = option.length();
            if (rules.length() > start + optionlength
                    && option.equalsIgnoreCase(rules.substring(start,
                            start + optionlength))) {
                if (optionend - start > optionlength) {
                    m_optionarg_ = start + optionlength;
                    // skip whitespace to the start of the option argument
                    while (m_optionarg_ < optionend
                            && UCharacter.isWhitespace(rules
                                    .charAt(m_optionarg_))) {
                        m_optionarg_++;
                    }
                }
                break;
            }
            i++;
        }
        if (i == RULES_OPTIONS_.length) {
            i = -1;
        }
        return i;
    }

    /**
     * Reads and sets collation options.
     * @return TOKEN_SUCCESS_MASK_, possibly combined with other
     *         TOKEN_*_MASK_ bits, if the option is parsed correctly
     * @exception ParseException thrown when an option in the rules is invalid
     */
    private byte readAndSetOption() throws ParseException {
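        // The branches below consume RULES_OPTIONS_ indices as follows:
        // 0-6 attribute options, 7 variable top, 8 rearrange, 9 before,
        // 10 top, 11-12 first/last, 13-14 copy/remove.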
        int start = m_current_ + 1; // skip opening '['
        int i = readOption(m_rules_, start, m_optionEnd_);

        int optionarg = m_optionarg_;

        if (i < 0) {
            throwParseException(m_rules_, start);
        }

        if (i < 7) {
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                    String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                    int size = optionarg + subname.length();
                    if (m_rules_.length() > size
                            && subname.equalsIgnoreCase(m_rules_
                                    .substring(optionarg, size))) {
                        setOptions(m_options_,
                                RULES_OPTIONS_[i].m_attribute_,
                                RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                        return TOKEN_SUCCESS_MASK_;
                    }
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 7) { // variable top
            return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
        } else if (i == 8) { // rearrange
            return TOKEN_SUCCESS_MASK_;
        } else if (i == 9) { // before
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                    String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                    int size = optionarg + subname.length();
                    if (m_rules_.length() > size
                            && subname.equalsIgnoreCase(m_rules_
                                    .substring(optionarg, size))) {
                        return (byte) (TOKEN_SUCCESS_MASK_
                                | (RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] + 1));
                    }
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 10) { // top
            // we will have an array of structures with limit CEs; the index
            // into this array is stored in m_parsedToken_.m_indirectIndex_
            m_parsedToken_.m_indirectIndex_ = 0;
            return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        } else if (i < 13) { // first, last
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                        && subname.equalsIgnoreCase(m_rules_.substring(
                                optionarg, size))) {
                    m_parsedToken_.m_indirectIndex_ = (char) (i - 10 + (j << 1));
                    return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 13 || i == 14) {
            // copy and remove are handled before normalization;
            // here we only need to move the end past the set
            int noOpenBraces = 1;
            m_current_++; // skip opening bracket
            while (m_current_ < m_source_.length() && noOpenBraces != 0) {
                if (m_source_.charAt(m_current_) == '[') {
                    noOpenBraces++;
                } else if (m_source_.charAt(m_current_) == ']') { // closing bracket
                    noOpenBraces--;
                }
                m_current_++;
            }
            m_optionEnd_ = m_current_ - 1;
            return TOKEN_SUCCESS_MASK_;
        } else {
            throwParseException(m_rules_, optionarg);
        }
        return TOKEN_SUCCESS_MASK_; // never reached: the branch above always throws
    }

    /**
     * Sets a collation option on the given option set.
     * @param optionset option set to update
     * @param attribute attribute type to set
     * @param value attribute value
     */
    private void setOptions(OptionSet optionset, int attribute, int value) {
        switch (attribute) {
            case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_:
                optionset.m_isHiragana4_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.FRENCH_COLLATION_:
                optionset.m_isFrenchCollation_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_:
                optionset.m_isAlternateHandlingShifted_ = (value == RuleBasedCollator.AttributeValue.SHIFTED_);
                break;
            case RuleBasedCollator.Attribute.CASE_FIRST_:
                optionset.m_caseFirst_ = value;
                break;
            case RuleBasedCollator.Attribute.CASE_LEVEL_:
                optionset.m_isCaseLevel_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.NORMALIZATION_MODE_:
                // ON maps to canonical decomposition; other values are
                // decomposition constants themselves
                if (value == RuleBasedCollator.AttributeValue.ON_) {
                    value = Collator.CANONICAL_DECOMPOSITION;
                }
                optionset.m_decomposition_ = value;
                break;
            case RuleBasedCollator.Attribute.STRENGTH_:
                optionset.m_strength_ = value;
                break;
            default:
                break;
        }
    }

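    /**
     * Collects the set of characters tailored by these rules: the rules are
     * tokenized, and for every non-reset token all canonically equivalent
     * FCD sequences of its characters are added to the set.
     * @return UnicodeSet of tailored characters and sequences
     * @throws ParseException thrown when the rule syntax is invalid
     */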
    UnicodeSet getTailoredSet() throws ParseException {
        boolean startOfRules = true;
        UnicodeSet tailored = new UnicodeSet();
        String pattern;
        CanonicalIterator it = new CanonicalIterator("");

        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        //int expandNext = 0;

        while (m_current_ < sourcelimit) {
            m_parsedToken_.m_prefixOffset_ = 0;
            if (parseNextToken(startOfRules) < 0) {
                // we have reached the end
                continue;
            }
            startOfRules = false;
            // The idea is to tokenize the rule set. For each non-reset token,
            // we add all the canonically equivalent FCD sequences.
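            // For example, if a token tailors "a\u0308" (a + combining
            // diaeresis), the precomposed form "\u00E4" is added as well,
            // since canonically equivalent sequences must sort identically.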
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                it.setSource(m_source_.substring(
                        m_parsedToken_.m_charsOffset_,
                        m_parsedToken_.m_charsOffset_
                                + m_parsedToken_.m_charsLen_));
                pattern = it.next();
                while (pattern != null) {
                    if (Normalizer.quickCheck(pattern, Normalizer.FCD, 0)
                            != Normalizer.NO) {
                        tailored.add(pattern);
                    }
                    pattern = it.next();
                }
            }
        }
        return tailored;
    }

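    /**
     * Pre-scans the raw rules (before normalization) for [copy ...] and
     * [remove ...] options and accumulates their UnicodeSets into
     * m_copySet_ and m_removeSet_ respectively.
     * @param rules the original collation rule string
     * @throws ParseException thrown when a set pattern is malformed
     */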
    private final void extractSetsFromRules(String rules)
            throws ParseException {
        int optionNumber = -1;
        int setStart = 0;
        int i = 0;
        while (i < rules.length()) {
            if (rules.charAt(i) == 0x005B) { // '['
                optionNumber = readOption(rules, i + 1, rules.length());
                setStart = m_optionarg_;
                if (optionNumber == 13) { // copy parts of the UCA to the tailoring
                    UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                    if (m_copySet_ == null) {
                        m_copySet_ = newSet;
                    } else {
                        m_copySet_.addAll(newSet);
                    }
                } else if (optionNumber == 14) { // remove
                    UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                    if (m_removeSet_ == null) {
                        m_removeSet_ = newSet;
                    } else {
                        m_removeSet_.addAll(newSet);
                    }
                }
            }
            i++;
        }
    }
}