/**
 *******************************************************************************
 * Copyright (C) 1996-2006, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.text.ParseException;
import java.util.Hashtable;
import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.impl.UCharacterProperty;

/**
 * Class for parsing collation rules; produces a list of tokens that will be
 * turned into collation elements.
 * @author Syn Wee Quek
 * @since release 2.2, June 7 2002
 * @draft 2.2
 */
final class CollationRuleParser {
    // public data members ---------------------------------------------------

    // package private constructors ------------------------------------------

    /**
     * <p>CollationRuleParser constructor that takes the collation rules.
     * Please see the RuleBasedCollator class description for more details on
     * the collation rule syntax.</p>
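     * <p>For illustration only (not a complete grammar): '&amp;' resets the
     * insertion point, while '&lt;', '&lt;&lt;' and '&lt;&lt;&lt;' introduce
     * primary, secondary and tertiary differences respectively, e.g.</p>
     * <pre>
     * &amp;a &lt; b &lt;&lt; c &lt;&lt;&lt; d
     * </pre>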
     * @see java.util.Locale
     * @param rules the collation rules to build the collation table from.
     * @exception ParseException thrown when argument rules have an invalid
     *            syntax.
     * @draft 2.2
     */
    CollationRuleParser(String rules) throws ParseException {
        extractSetsFromRules(rules);
        m_source_ = new StringBuffer(Normalizer.decompose(rules, false).trim());
        m_rules_ = m_source_.toString();
        m_current_ = 0;
        m_extraCurrent_ = m_source_.length();
        m_variableTop_ = null;
        m_parsedToken_ = new ParsedToken();
        m_hashTable_ = new Hashtable();
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
        m_listHeader_ = new TokenListHeader[512];
        m_resultLength_ = 0;
        // call assembleTokenList() manually, so that we can
        // init a parser and manually parse tokens
        //assembleTokenList();
    }

    // package private inner classes -----------------------------------------

    /**
     * Collation options set
     */
    static class OptionSet {
        // package private constructor ---------------------------------------

        /**
         * Initializes the option set with the argument collator.
         * @param collator collator whose options to use
         */
        OptionSet(RuleBasedCollator collator) {
            m_variableTopValue_ = collator.m_variableTopValue_;
            m_isFrenchCollation_ = collator.isFrenchCollation();
            m_isAlternateHandlingShifted_
                = collator.isAlternateHandlingShifted();
            m_caseFirst_ = collator.m_caseFirst_;
            m_isCaseLevel_ = collator.isCaseLevel();
            m_decomposition_ = collator.getDecomposition();
            m_strength_ = collator.getStrength();
            m_isHiragana4_ = collator.m_isHiragana4_;
        }

        // package private data members --------------------------------------

        int m_variableTopValue_;
        boolean m_isFrenchCollation_;
        /**
         * Attribute for handling variable elements
         */
        boolean m_isAlternateHandlingShifted_;
        /**
         * who goes first, lower case or uppercase
         */
        int m_caseFirst_;
        /**
         * do we have an extra case level
         */
        boolean m_isCaseLevel_;
        /**
         * attribute for normalization
         */
        int m_decomposition_;
        /**
         * attribute for strength
         */
        int m_strength_;
        /**
         * attribute for special Hiragana
         */
        boolean m_isHiragana4_;
    }

    /**
     * List of tokens used by the collation rules
     */
    static class TokenListHeader {
        Token m_first_;
        Token m_last_;
        Token m_reset_;
        boolean m_indirect_;
        int m_baseCE_;
        int m_baseContCE_;
        int m_nextCE_;
        int m_nextContCE_;
        int m_previousCE_;
        int m_previousContCE_;
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }

    /**
     * Token wrapper for collation rules
     */
    static class Token {
        // package private data members ---------------------------------------

        int m_CE_[];
        int m_CELength_;
        int m_expCE_[];
        int m_expCELength_;
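        // m_source_, m_expansion_ and m_prefix_ each pack a reference to a
        // substring of m_rules_ as (length << 24) | offset; see hashCode()
        // and equals() below, and assembleTokenList() where they are set.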
        int m_source_;
        int m_expansion_;
        int m_prefix_;
        int m_strength_;
        int m_toInsert_;
        int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
        TokenListHeader m_listHeader_;
        Token m_previous_;
        Token m_next_;
        StringBuffer m_rules_;
        char m_flags_;

        // package private constructors ---------------------------------------

        Token() {
            m_CE_ = new int[128];
            m_expCE_ = new int[128];
            // TODO: this should also handle reverse
            m_polarity_ = TOKEN_POLARITY_POSITIVE_;
            m_next_ = null;
            m_previous_ = null;
            m_CELength_ = 0;
            m_expCELength_ = 0;
        }

        // package private methods --------------------------------------------

        /**
         * Hashcode calculation for token
         * @return the hashcode
         */
        public int hashCode() {
            int result = 0;
            int len = (m_source_ & 0xFF000000) >>> 24;
            int inc = ((len - 32) / 32) + 1;
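            // sample the source with a stride that grows with its length, so
            // that only a bounded number of characters feed the hash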

            int start = m_source_ & 0x00FFFFFF;
            int limit = start + len;

            while (start < limit) {
                result = (result * 37) + m_rules_.charAt(start);
                start += inc;
            }
            return result;
        }

        /**
         * Equals calculation
         * @param target object to compare
         * @return true if target is the same as this object
         */
        public boolean equals(Object target) {
            if (target == this) {
                return true;
            }
            if (target instanceof Token) {
                Token t = (Token) target;
                int sstart = m_source_ & 0x00FFFFFF;
                int tstart = t.m_source_ & 0x00FFFFFF;
                int slimit = (m_source_ & 0xFF000000) >> 24;
                int tlimit = (t.m_source_ & 0xFF000000) >> 24;

                int end = sstart + slimit - 1;

                if (m_source_ == 0 || t.m_source_ == 0) {
                    return false;
                }
                if (slimit != tlimit) {
                    return false;
                }
                if (m_source_ == t.m_source_) {
                    return true;
                }

                while (sstart < end
                       && m_rules_.charAt(sstart)
                          == t.m_rules_.charAt(tstart)) {
                    ++sstart;
                    ++tstart;
                }
                if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
                    return true;
                }
            }
            return false;
        }
    }

    // package private data member -------------------------------------------

    /**
     * Indicator that the token is a reset, i.e. '&' in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Number of token lists (entries used in m_listHeader_)
     */
    int m_resultLength_;
    /**
     * List of parsed tokens
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token
     */
    Token m_variableTop_;
    /**
     * Collation options
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters
     */
    StringBuffer m_source_;
    /**
     * Hash table to keep all tokens
     */
    Hashtable m_hashTable_;

    // package private method ------------------------------------------------

    void setDefaultOptionsInCollator(RuleBasedCollator collator) {
        collator.m_defaultStrength_ = m_options_.m_strength_;
        collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
        collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
        collator.m_defaultIsAlternateHandlingShifted_
            = m_options_.m_isAlternateHandlingShifted_;
        collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
        collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
        collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
        collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
    }

    // private inner classes -------------------------------------------------

    /**
     * This is a token that has been parsed but not yet processed. Used to
     * reduce the number of arguments in the parser
     */
    private static class ParsedToken {
        // private constructor ----------------------------------------------

        /**
         * Empty constructor
         */
        ParsedToken() {
            m_charsLen_ = 0;
            m_charsOffset_ = 0;
            m_extensionLen_ = 0;
            m_extensionOffset_ = 0;
            m_prefixLen_ = 0;
            m_prefixOffset_ = 0;
            m_flags_ = 0;
            m_strength_ = TOKEN_UNSET_;
        }

        // private data members ---------------------------------------------

        int m_strength_;
        int m_charsOffset_;
        int m_charsLen_;
        int m_extensionOffset_;
        int m_extensionLen_;
        int m_prefixOffset_;
        int m_prefixLen_;
        char m_flags_;
        char m_indirectIndex_;
    }

    /**
     * Boundary wrappers
     */
    private static class IndirectBoundaries {
        // package private constructor ---------------------------------------

        IndirectBoundaries(int startce[], int limitce[]) {
            // Set values for the top - TODO: once we have values for all the
            // indirects, we are going to initialize here.
            m_startCE_ = startce[0];
            m_startContCE_ = startce[1];
            if (limitce != null) {
                m_limitCE_ = limitce[0];
                m_limitContCE_ = limitce[1];
            } else {
                m_limitCE_ = 0;
                m_limitContCE_ = 0;
            }
        }

        // package private data members --------------------------------------

        int m_startCE_;
        int m_startContCE_;
        int m_limitCE_;
        int m_limitContCE_;
    }

    /**
     * Collation option rule tag
     */
    private static class TokenOption {
        // package private constructor ---------------------------------------

        TokenOption(String name, int attribute, String suboptions[],
                    int suboptionattributevalue[]) {
            m_name_ = name;
            m_attribute_ = attribute;
            m_subOptions_ = suboptions;
            m_subOptionAttributeValues_ = suboptionattributevalue;
        }

        // package private data member ---------------------------------------

        private String m_name_;
        private int m_attribute_;
        private String m_subOptions_[];
        private int m_subOptionAttributeValues_[];
    }

    // private variables -----------------------------------------------------

    /**
     * Current parsed token
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule
     */
    private String m_rules_;
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /**
     * Current offset in m_source
     */
    private int m_sourceLimit_;
    /**
     * Offset to m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /**
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token is not set yet
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /**
     * Indicator that the rule is in the > polarity, i.e. everything on the
     * right of the rule is less than
     */
    private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, i.e. everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * For use in parsing token options
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An
     * anchor point behaves in exactly the same way as a code point in a reset
     * would, except that it cannot be tailored. A range (currently we only
     * know of the [top] range) will explicitly set the upper bound for
     * generated CEs, thus allowing for better control over how many CEs can
     * be squeezed into the range without a performance penalty. In that
     * respect, we use [top] for tailoring of locales that use CJK characters.
     * Other indirect values are currently a pure convenience; they can be
     * used to ensure that the CEs will always be positioned in the same place
     * relative to a point with known properties (e.g. first primary
     * ignorable).
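     * <p>For illustration only, a reset on an indirect value could look like
     * the following, anchoring the tailored characters relative to the [top]
     * range rather than to a concrete code point:</p>
     * <pre>
     * &amp;[top] &lt; a &lt; b
     * </pre>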
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

    /**
     * Inverse UCA constants
     */
    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
    private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable]            last variable value
     * [last primary ignorable]   largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable]  largest CE for tertiary ignorable
     * [top]                      guaranteed to be above all implicit CEs,
     *                            for now and in the future (in 1.8)
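     * <p>A sketch of how such option tags might appear inline in rule text
     * (illustrative only; readAndSetOption() handles the full set declared
     * in RULES_OPTIONS_ below):</p>
     * <pre>
     * [alternate shifted][caseFirst upper]&amp;a &lt; b
     * </pre>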
     */
    private static final TokenOption RULES_OPTIONS_[];

    static {
        INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
        // UCOL_RESET_TOP_VALUE
        INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_, null);
        // UCOL_LAST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_, null);
        // UCOL_FIRST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_, null);
        // UCOL_LAST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_, null);
        // UCOL_FIRST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_, null);
        // UCOL_LAST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_, null);
        // UCOL_FIRST_VARIABLE
        INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_, null);
        // UCOL_LAST_VARIABLE
        INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_, null);
        // UCOL_FIRST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_, null);
        // UCOL_LAST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_IMPLICIT
        INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_, null);
        // UCOL_LAST_IMPLICIT
        INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
        // UCOL_FIRST_TRAILING
        INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_, null);
        // UCOL_LAST_TRAILING
        INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
            RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_, null);
        INDIRECT_BOUNDARIES_[14].m_limitCE_
            = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;

        RULES_OPTIONS_ = new TokenOption[19];
        String option[] = { "non-ignorable", "shifted" };
        int value[] = { RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
                        RuleBasedCollator.AttributeValue.SHIFTED_ };
        RULES_OPTIONS_[0] = new TokenOption("alternate",
            RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, option, value);
        option = new String[1];
        option[0] = "2";
        value = new int[1];
        value[0] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[1] = new TokenOption("backwards",
            RuleBasedCollator.Attribute.FRENCH_COLLATION_, option, value);
        String offonoption[] = new String[2];
        offonoption[0] = "off";
        offonoption[1] = "on";
        int offonvalue[] = new int[2];
        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
            RuleBasedCollator.Attribute.CASE_LEVEL_, offonoption, offonvalue);
        option = new String[3];
        option[0] = "lower";
        option[1] = "upper";
        option[2] = "off";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
        value[2] = RuleBasedCollator.AttributeValue.OFF_;
        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
            RuleBasedCollator.Attribute.CASE_FIRST_, option, value);
        RULES_OPTIONS_[4] = new TokenOption("normalization",
            RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
            offonoption, offonvalue);
        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
            RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
            offonoption, offonvalue);
        option = new String[5];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        option[3] = "4";
        option[4] = "I";
        value = new int[5];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
        RULES_OPTIONS_[6] = new TokenOption("strength",
            RuleBasedCollator.Attribute.STRENGTH_, option, value);
        RULES_OPTIONS_[7] = new TokenOption("variable top",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[8] = new TokenOption("rearrange",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        option = new String[3];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        RULES_OPTIONS_[9] = new TokenOption("before",
            RuleBasedCollator.Attribute.LIMIT_, option, value);
        RULES_OPTIONS_[10] = new TokenOption("top",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        String firstlastoption[] = new String[7];
        firstlastoption[0] = "primary";
        firstlastoption[1] = "secondary";
        firstlastoption[2] = "tertiary";
        firstlastoption[3] = "variable";
        firstlastoption[4] = "regular";
        firstlastoption[5] = "implicit";
        firstlastoption[6] = "trailing";

        int firstlastvalue[] = new int[7];
        Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);

        RULES_OPTIONS_[11] = new TokenOption("first",
            RuleBasedCollator.Attribute.LIMIT_, firstlastoption,
            firstlastvalue);
        RULES_OPTIONS_[12] = new TokenOption("last",
            RuleBasedCollator.Attribute.LIMIT_, firstlastoption,
            firstlastvalue);
        RULES_OPTIONS_[13] = new TokenOption("optimize",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[15] = new TokenOption("undefined",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
        RULES_OPTIONS_[18] = new TokenOption("charset",
            RuleBasedCollator.Attribute.LIMIT_, null, null);
    }

    /**
     * Utility data members
     */
    private Token m_utilToken_ = new Token();
    private CollationElementIterator m_UCAColEIter_ =
        RuleBasedCollator.UCA_.getCollationElementIterator("");
    private int m_utilCEBuffer_[] = new int[2];

    // private methods -------------------------------------------------------

    /**
     * Assembles the token list
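     * <p>For illustration only: for rules such as "&amp;a &lt; b &lt;&lt; c",
     * the reset on 'a' opens a new TokenListHeader whose base CEs are fished
     * out of the UCA for 'a', and the tokens for 'b' (primary difference) and
     * 'c' (secondary difference) are then inserted into that header's list in
     * strength order.</p>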
     * @exception ParseException thrown when rules syntax fails
     */
    int assembleTokenList() throws ParseException {
        Token lastToken = null;
        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        int expandNext = 0;

        while (m_current_ < sourcelimit) {
            m_parsedToken_.m_prefixOffset_ = 0;
            if (parseNextToken(lastToken == null) < 0) {
                // we have reached the end
                continue;
            }
            char specs = m_parsedToken_.m_flags_;
            boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
            boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
            int lastStrength = TOKEN_UNSET_;
            if (lastToken != null) {
                lastStrength = lastToken.m_strength_;
            }
            m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            // 4 Lookup each source in the CharsToToken map, and find a
            // sourcetoken
            Token sourceToken = (Token) m_hashTable_.get(m_utilToken_);
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                if (lastToken == null) {
                    // this means that rules haven't started properly
                    throwParseException(m_source_.toString(), 0);
                }
                // 6 Otherwise (when relation != reset)
                if (sourceToken == null) {
                    // If sourceToken is null, create new one
                    sourceToken = new Token();
                    sourceToken.m_rules_ = m_source_;
                    sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                            | m_parsedToken_.m_charsOffset_;
                    sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
                                            | m_parsedToken_.m_prefixOffset_;
                    // TODO: this should also handle reverse
                    sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
                    sourceToken.m_next_ = null;
                    sourceToken.m_previous_ = null;
                    sourceToken.m_CELength_ = 0;
                    sourceToken.m_expCELength_ = 0;
                    m_hashTable_.put(sourceToken, sourceToken);
                } else {
                    // we could have fished out a reset here
                    if (sourceToken.m_strength_ != TOKEN_RESET_
                        && lastToken != sourceToken) {
                        // otherwise remove sourceToken from where it was.
                        if (sourceToken.m_next_ != null) {
                            if (sourceToken.m_next_.m_strength_
                                > sourceToken.m_strength_) {
                                sourceToken.m_next_.m_strength_
                                    = sourceToken.m_strength_;
                            }
                            sourceToken.m_next_.m_previous_
                                = sourceToken.m_previous_;
                        } else {
                            sourceToken.m_listHeader_.m_last_
                                = sourceToken.m_previous_;
                        }
                        if (sourceToken.m_previous_ != null) {
                            sourceToken.m_previous_.m_next_
                                = sourceToken.m_next_;
                        } else {
                            sourceToken.m_listHeader_.m_first_
                                = sourceToken.m_next_;
                        }
                        sourceToken.m_next_ = null;
                        sourceToken.m_previous_ = null;
                    }
                }
                sourceToken.m_strength_ = m_parsedToken_.m_strength_;
                sourceToken.m_listHeader_ = lastToken.m_listHeader_;

                // 1. Find the strongest strength in each list, and set
                // strongestP and strongestN accordingly in the headers.
                if (lastStrength == TOKEN_RESET_
                    || sourceToken.m_listHeader_.m_first_ == null) {
                    // If LAST is a reset insert sourceToken in the list.
                    if (sourceToken.m_listHeader_.m_first_ == null) {
                        sourceToken.m_listHeader_.m_first_ = sourceToken;
                        sourceToken.m_listHeader_.m_last_ = sourceToken;
                    } else { // we need to find a place for us
                        // and we'll get in front of the same strength
                        if (sourceToken.m_listHeader_.m_first_.m_strength_
                            <= sourceToken.m_strength_) {
                            sourceToken.m_next_
                                = sourceToken.m_listHeader_.m_first_;
                            sourceToken.m_next_.m_previous_ = sourceToken;
                            sourceToken.m_listHeader_.m_first_ = sourceToken;
                            sourceToken.m_previous_ = null;
                        } else {
                            lastToken = sourceToken.m_listHeader_.m_first_;
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_last_
                                    = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken;
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        }
                    }
                } else {
                    // Otherwise (when LAST is not a reset)
                    // if polarity (LAST) == polarity(relation), insert
                    // sourceToken after LAST, otherwise insert before.
                    // when inserting after or before, search to the next
                    // position with the same strength in that direction.
                    // (This is called postpone insertion).
                    if (sourceToken != lastToken) {
                        if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            sourceToken.m_previous_ = lastToken;
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_last_
                                    = sourceToken;
                            }
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        } else {
                            while (lastToken.m_previous_ != null
                                   && lastToken.m_previous_.m_strength_
                                      > sourceToken.m_strength_) {
                                lastToken = lastToken.m_previous_;
                            }
                            sourceToken.m_next_ = lastToken;
                            if (lastToken.m_previous_ != null) {
                                lastToken.m_previous_.m_next_ = sourceToken;
                            } else {
                                sourceToken.m_listHeader_.m_first_
                                    = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken.m_previous_;
                            lastToken.m_previous_ = sourceToken;
                        }
                    } else { // repeated one thing twice in rules, stay with
                             // the stronger strength
                        if (lastStrength < sourceToken.m_strength_) {
                            sourceToken.m_strength_ = lastStrength;
                        }
                    }
                }
                // if the token was a variable top, we're going to put it in
                if (variableTop == true && m_variableTop_ == null) {
                    variableTop = false;
                    m_variableTop_ = sourceToken;
                }
                // Treat the expansions.
                // There are two types of expansions: explicit (x / y) and
                // reset based propagating expansions
                // (&abc * d * e <=> &ab * d / c * e / c)
                // if both of them are in effect for a token, they are
                // combined.
                sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                                           | m_parsedToken_.m_extensionOffset_;
                if (expandNext != 0) {
                    if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
                        // primary strength kills off the implicit expansion
                        expandNext = 0;
                    } else if (sourceToken.m_expansion_ == 0) {
                        // if there is no expansion, implicit is just added to
                        // the token
                        sourceToken.m_expansion_ = expandNext;
                    } else {
                        // there is both explicit and implicit expansion.
                        // We need to make a combination
                        int start = expandNext & 0xFFFFFF;
                        int size = expandNext >>> 24;
                        if (size > 0) {
                            m_source_.append(m_source_.substring(start,
                                                                 start + size));
                        }
                        start = m_parsedToken_.m_extensionOffset_;
                        m_source_.append(m_source_.substring(start,
                            start + m_parsedToken_.m_extensionLen_));
                        sourceToken.m_expansion_
                            = (size + m_parsedToken_.m_extensionLen_) << 24
                              | m_extraCurrent_;
                        m_extraCurrent_ += size
                                           + m_parsedToken_.m_extensionLen_;
                    }
                }
                // if the previous token was a reset before, the strength of
                // this token must match the strength of before. Otherwise we
                // have an undefined situation.
                // In other words, we currently have a kludge which we use to
                // represent &a >> x. This is written as &[before 2]a << x.
                if ((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
                    int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_)
                                         - 1;
                    if (beforeStrength != sourceToken.m_strength_) {
                        throwParseException(m_source_.toString(), m_current_);
                    }
                }
            } else {
                if (lastToken != null && lastStrength == TOKEN_RESET_) {
                    // if the previous token was also a reset, this means that
                    // we have two consecutive resets and we want to remove
                    // the previous one if empty
                    if (m_resultLength_ > 0
                        && m_listHeader_[m_resultLength_ - 1].m_first_
                           == null) {
                        m_resultLength_--;
                    }
                }
                if (sourceToken == null) {
                    // this is a reset, but it might still be somewhere in the
                    // tailoring, in shorter form
                    int searchCharsLen = m_parsedToken_.m_charsLen_;
                    while (searchCharsLen > 1 && sourceToken == null) {
                        searchCharsLen--;
                        // key = searchCharsLen << 24 | charsOffset;
                        m_utilToken_.m_source_ = searchCharsLen << 24
                                            | m_parsedToken_.m_charsOffset_;
                        m_utilToken_.m_rules_ = m_source_;
                        sourceToken = (Token) m_hashTable_.get(m_utilToken_);
                    }
                    if (sourceToken != null) {
                        expandNext = (m_parsedToken_.m_charsLen_
                                      - searchCharsLen) << 24
                                     | (m_parsedToken_.m_charsOffset_
                                        + searchCharsLen);
                    }
                }
                if ((specs & TOKEN_BEFORE_) != 0) {
                    if (top == false) {
                        // we're doing before & there is no indirection
                        int strength = (specs & TOKEN_BEFORE_) - 1;
                        if (sourceToken != null
                            && sourceToken.m_strength_ != TOKEN_RESET_) {
                            // this is a before that is already ordered in the
                            // UCA - so we need to get the previous with good
                            // strength
                            while (sourceToken.m_strength_ > strength
                                   && sourceToken.m_previous_ != null) {
                                sourceToken = sourceToken.m_previous_;
                            }
                            // here, either we hit the strength or NULL
                            if (sourceToken.m_strength_ == strength) {
                                if (sourceToken.m_previous_ != null) {
                                    sourceToken = sourceToken.m_previous_;
                                } else { // start of list
                                    sourceToken
                                        = sourceToken.m_listHeader_.m_reset_;
                                }
                            } else { // we hit NULL, we should be doing the
                                     // else part
                                sourceToken
                                    = sourceToken.m_listHeader_.m_reset_;
                                sourceToken = getVirginBefore(sourceToken,
                                                              strength);
                            }
                        } else {
                            sourceToken = getVirginBefore(sourceToken,
                                                          strength);
                        }
                    } else {
                        // this is both before and indirection
                        top = false;
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
                        // we need to do slightly more work. we need to get
                        // the baseCE using the inverse UCA & getPrevious. The
                        // next bound is not set, and will be decided in
                        // ucol_bld
                        int strength = (specs & TOKEN_BEFORE_) - 1;
                        int baseCE = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_].m_startCE_;
                        int baseContCE = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_].m_startContCE_;
                        int ce[] = new int[2];
                        if ((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_
                                 .PRIMARY_IMPLICIT_MIN_)
                            && (baseCE >>> 24 <= RuleBasedCollator
                                    .UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) {
                            /* implicits - */
                            int primary = baseCE
                                & RuleBasedCollator.CE_PRIMARY_MASK_
                                | (baseContCE
                                   & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
                            int raw = RuleBasedCollator.impCEGen_
                                          .getRawFromImplicit(primary);
                            int primaryCE = RuleBasedCollator.impCEGen_
                                                .getImplicitFromRaw(raw - 1);
                            ce[0] = primaryCE
                                    & RuleBasedCollator.CE_PRIMARY_MASK_
                                    | 0x0505;
                            ce[1] = (primaryCE << 16)
                                    & RuleBasedCollator.CE_PRIMARY_MASK_
                                    | RuleBasedCollator
                                          .CE_CONTINUATION_MARKER_;
                        } else {
                            CollationParsedRuleBuilder.InverseUCA invuca
                                = CollationParsedRuleBuilder.INVERSE_UCA_;
                            invuca.getInversePrevCE(baseCE, baseContCE,
                                                    strength, ce);
                        }
                        m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
                        m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;

                        sourceToken = new Token();
                        expandNext = initAReset(0, sourceToken);
                    }
                }
                // 5 If the relation is a reset:
                //   If sourceToken is null
                //     Create new list, create new sourceToken, make the
                //     baseCE from source, put the sourceToken in ListHeader
                //     of the new list
                if (sourceToken == null) {
                    if (m_listHeader_[m_resultLength_] == null) {
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
                    }
                    // 3 Consider each item: relation, source, and expansion:
                    //   e.g. ...< x / y ...
                    //   First convert all expansions into normal form.
                    //   Examples:
                    //   If "xy" doesn't occur earlier in the list or in the
                    //   UCA, convert &xy * c * d * ... into &x * c/y * d * ...
                    //   Note: reset values can never have expansions, although
                    //   they can cause the very next item to have one. They
                    //   may be contractions, if they are found earlier in the
                    //   list.
                    if (top == false) {
                        CollationElementIterator coleiter
                            = RuleBasedCollator.UCA_
                                  .getCollationElementIterator(
                                      m_source_.substring(
                                          m_parsedToken_.m_charsOffset_,
                                          m_parsedToken_.m_charsOffset_
                                          + m_parsedToken_.m_charsLen_));

                        int CE = coleiter.next();
                        // offset to the character in the full rule string
                        int expand = coleiter.getOffset()
                                     + m_parsedToken_.m_charsOffset_;
                        int SecondCE = coleiter.next();

                        m_listHeader_[m_resultLength_].m_baseCE_
                            = CE & 0xFFFFFF3F;
                        if (RuleBasedCollator.isContinuation(SecondCE)) {
                            m_listHeader_[m_resultLength_].m_baseContCE_
                                = SecondCE;
                        } else {
                            m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                        }
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = false;
                        sourceToken = new Token();
                        expandNext = initAReset(expand, sourceToken);
                    } else { // top == TRUE
                        top = false;
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
                        IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
                            m_parsedToken_.m_indirectIndex_];
                        m_listHeader_[m_resultLength_].m_baseCE_
                            = ib.m_startCE_;
                        m_listHeader_[m_resultLength_].m_baseContCE_
                            = ib.m_startContCE_;
                        m_listHeader_[m_resultLength_].m_nextCE_
                            = ib.m_limitCE_;
                        m_listHeader_[m_resultLength_].m_nextContCE_
                            = ib.m_limitContCE_;
                        sourceToken = new Token();
                        expandNext = initAReset(0, sourceToken);
                    }
                } else { // reset to something already in rules
                    top = false;
                }
            }
            // 7 After all this, set LAST to point to sourceToken, and goto
            // step 3.
            lastToken = sourceToken;
        }

        if (m_resultLength_ > 0
            && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
            m_resultLength_--;
        }
        return m_resultLength_;
    }

    /**
     * Formats and throws a ParseException
     * @param rules collation rule that failed
     * @param offset failed offset in rules
     * @throws ParseException with failure information
     */
    private static final void throwParseException(String rules, int offset)
        throws ParseException {
        // for pre-context
        String precontext = rules.substring(0, offset);
        String postcontext = rules.substring(offset, rules.length());
        StringBuffer error = new StringBuffer(
            "Parse error occurred in rule at offset ");
        error.append(offset);
        error.append("\n after the prefix \"");
        error.append(precontext);
        error.append("\" before the suffix \"");
        error.append(postcontext);
        error.append("\"");
        throw new ParseException(error.toString(), offset);
    }

    private final boolean doSetTop() {
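        // Writes a synthetic anchor into the extra space of m_source_: a
        // 0xFFFE marker followed by the indirect boundary's start CE (and its
        // continuation CE, if present), each CE split into two UTF-16 units.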
        m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
        m_source_.append((char) 0xFFFE);
        IndirectBoundaries ib
            = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
        m_source_.append((char) (ib.m_startCE_ >> 16));
        m_source_.append((char) (ib.m_startCE_ & 0xFFFF));
        m_extraCurrent_ += 3;
        if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_]
                .m_startContCE_ == 0) {
            m_parsedToken_.m_charsLen_ = 3;
        } else {
            m_source_.append((char) (INDIRECT_BOUNDARIES_[
                m_parsedToken_.m_indirectIndex_].m_startContCE_ >> 16));
            m_source_.append((char) (INDIRECT_BOUNDARIES_[
                m_parsedToken_.m_indirectIndex_].m_startContCE_ & 0xFFFF));
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_ = 5;
        }
        return true;
    }

    private static boolean isCharNewLine(char c) {
        switch (c) {
            case 0x000A: /* LF */
            case 0x000D: /* CR */
            case 0x000C: /* FF */
            case 0x0085: /* NEL */
            case 0x2028: /* LS */
            case 0x2029: /* PS */
                return true;
            default:
                return false;
        }
    }

    /**
     * Getting the next token
     *
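     * <p>For illustration only: in "&amp;a &lt; b", the relation characters
     * ('&amp;', '&lt;', '&lt;&lt;', '&lt;&lt;&lt;', ';', ',', '=') both set
     * the strength of the upcoming entry and terminate the previous one, so
     * successive calls yield a reset token for 'a' and then a primary
     * difference token for 'b'.</p>
     *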
     * @param startofrules
     *            flag indicating if we are at the start of rules
     * @return the offset of the rules
     * @exception ParseException
     *                thrown when rule parsing fails
     */
    private int parseNextToken(boolean startofrules) throws ParseException {
        // parsing part
        boolean variabletop = false;
        boolean top = false;
        boolean inchars = true;
        boolean inquote = false;
        boolean wasinquote = false;
        byte before = 0;
        boolean isescaped = false;
        int /*newcharslen = 0,*/ newextensionlen = 0;
        int /*charsoffset = 0,*/ extensionoffset = 0;
        int newstrength = TOKEN_UNSET_;

        m_parsedToken_.m_charsLen_ = 0;
        m_parsedToken_.m_charsOffset_ = 0;
        m_parsedToken_.m_prefixOffset_ = 0;
        m_parsedToken_.m_prefixLen_ = 0;
        m_parsedToken_.m_indirectIndex_ = 0;

        int limit = m_rules_.length();
        while (m_current_ < limit) {
            char ch = m_source_.charAt(m_current_);
            if (inquote) {
                if (ch == 0x0027) { // '\''
                    inquote = false;
                } else {
                    if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
                        if (m_parsedToken_.m_charsLen_ == 0) {
                            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                        }
                        m_parsedToken_.m_charsLen_++;
                    } else {
                        if (newextensionlen == 0) {
                            extensionoffset = m_extraCurrent_;
                        }
                        newextensionlen++;
                    }
                }
            } else if (isescaped) {
                isescaped = false;
                if (newstrength == TOKEN_UNSET_) {
                    throwParseException(m_rules_, m_current_);
                }
                if (ch != 0 && m_current_ != limit) {
                    if (inchars) {
                        if (m_parsedToken_.m_charsLen_ == 0) {
                            m_parsedToken_.m_charsOffset_ = m_current_;
                        }
                        m_parsedToken_.m_charsLen_++;
                    } else {
                        if (newextensionlen == 0) {
                            extensionoffset = m_current_;
                        }
                        newextensionlen++;
                    }
                }
            } else {
                if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
                    // Sets the strength for this entry
                    switch (ch) {
                        case 0x003D: // '='
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.IDENTICAL;
                            break;
                        case 0x002C: // ','
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.TERTIARY;
                            break;
                        case 0x003B: // ';'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = Collator.SECONDARY;
                            break;
                        case 0x003C: // '<'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // if we start with strength, we'll reset to top
                            if (startofrules == true) {
                                m_parsedToken_.m_indirectIndex_ = 5;
                                top = doSetTop();
                                return doEndParseNextToken(TOKEN_RESET_, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            // before this, do a scan to verify whether this
                            // is another strength
                            if (m_source_.charAt(m_current_ + 1) == 0x003C) {
                                m_current_++;
                                if (m_source_.charAt(m_current_ + 1)
                                    == 0x003C) {
                                    m_current_++; // three in a row!
                                    newstrength = Collator.TERTIARY;
                                } else { // two in a row
                                    newstrength = Collator.SECONDARY;
                                }
                            } else { // just one
                                newstrength = Collator.PRIMARY;
                            }
                            break;
                        case 0x0026: // '&'
                            if (newstrength != TOKEN_UNSET_) {
                                return doEndParseNextToken(newstrength, top,
                                    extensionoffset, newextensionlen,
                                    variabletop, before);
                            }
                            newstrength = TOKEN_RESET_; // PatternEntry::RESET
                                                        // = 0
                            break;
                        case 0x005b: // '['
                            // options - read an option, analyze it
                            m_optionEnd_ = m_rules_.indexOf(0x005d,
                                                            m_current_);
                            if (m_optionEnd_ != -1) { // ']'
                                byte result = readAndSetOption();
                                m_current_ = m_optionEnd_;
                                if ((result & TOKEN_TOP_MASK_) != 0) {
                                    if (newstrength == TOKEN_RESET_) {
                                        top = doSetTop();
                                        if (before != 0) {
                                            // This is a combination of before
                                            // and indirection like
                                            // '&[before 2][first regular]<b'
                                            m_source_.append((char) 0x002d);
                                            m_source_.append((char) before);
                                            m_extraCurrent_ += 2;
                                            m_parsedToken_.m_charsLen_ += 2;
                                        }
                                        m_current_++;
                                        return doEndParseNextToken(
                                            newstrength, true,
                                            extensionoffset, newextensionlen,
                                            variabletop, before);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                } else if ((result
                                            & TOKEN_VARIABLE_TOP_MASK_) != 0) {
                                    if (newstrength != TOKEN_RESET_
                                        && newstrength != TOKEN_UNSET_) {
                                        variabletop = true;
                                        m_parsedToken_.m_charsOffset_
                                            = m_extraCurrent_;
                                        m_source_.append((char) 0xFFFF);
                                        m_extraCurrent_++;
                                        m_current_++;
                                        m_parsedToken_.m_charsLen_ = 1;
                                        return doEndParseNextToken(
                                            newstrength, top,
                                            extensionoffset, newextensionlen,
                                            variabletop, before);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                } else if ((result & TOKEN_BEFORE_) != 0) {
                                    if (newstrength == TOKEN_RESET_) {
                                        before = (byte) (result
                                                         & TOKEN_BEFORE_);
                                    } else {
                                        throwParseException(m_rules_,
                                                            m_current_);
                                    }
                                }
                            }
                            break;
                        case 0x002F: // '/'
                            wasinquote = false; // if we were copying source
                                                // characters, we want to stop
                                                // now
                            inchars = false; // we're now processing expansion
                            break;
                        case 0x005C: // back slash for escaped chars
                            isescaped = true;
                            break;
                        // found a quote, we're gonna start copying
                        case 0x0027: // '\''
                            if (newstrength == TOKEN_UNSET_) {
                                // quote is illegal until we have a strength
                                throwParseException(m_rules_, m_current_);
                            }
                            inquote = true;
                            if (inchars) { // we're doing characters
                                if (wasinquote == false) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_extraCurrent_;
                                }
                                if (m_parsedToken_.m_charsLen_ != 0) {
                                    m_source_.append(m_source_.substring(
                                        m_current_
                                        - m_parsedToken_.m_charsLen_,
                                        m_current_));
                                    m_extraCurrent_
                                        += m_parsedToken_.m_charsLen_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            } else { // we're doing an expansion
                                if (wasinquote == false) {
                                    extensionoffset = m_extraCurrent_;
                                }
                                if (newextensionlen != 0) {
                                    m_source_.append(m_source_.substring(
                                        m_current_ - newextensionlen,
                                        m_current_));
                                    m_extraCurrent_ += newextensionlen;
                                }
                                newextensionlen++;
                            }
                            wasinquote = true;
                            m_current_++;
                            ch = m_source_.charAt(m_current_);
                            if (ch == 0x0027) { // copy the double quote
                                m_source_.append(ch);
                                m_extraCurrent_++;
                                inquote = false;
                            }
                            break;
                        // '@' is french only if the strength is not currently
                        // set - if it is, it's just a regular character in
                        // collation
                        case 0x0040: // '@'
                            if (newstrength == TOKEN_UNSET_) {
                                m_options_.m_isFrenchCollation_ = true;
                                break;
                            }
                        case 0x007C: // '|'
                            // this means we have actually been reading prefix
                            // part - we want to store read characters to the
                            // prefix part and continue reading the characters
                            // (proper way would be to restart reading the
                            // chars, but in that case we would have to
                            // complicate the token hasher, which I do not
                            // intend to play with. Instead, we will do
                            // prefixes when prefixes are due (before adding
                            // the elements).
                            m_parsedToken_.m_prefixOffset_
                                = m_parsedToken_.m_charsOffset_;
                            m_parsedToken_.m_prefixLen_
                                = m_parsedToken_.m_charsLen_;
                            if (inchars) { // we're doing characters
                                if (wasinquote == false) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_extraCurrent_;
                                }
                                if (m_parsedToken_.m_charsLen_ != 0) {
                                    String prefix = m_source_.substring(
                                        m_current_
                                        - m_parsedToken_.m_charsLen_,
                                        m_current_);
                                    m_source_.append(prefix);
                                    m_extraCurrent_
                                        += m_parsedToken_.m_charsLen_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            }
                            wasinquote = true;
                            do {
                                m_current_++;
                                ch = m_source_.charAt(m_current_);
                                // skip whitespace between '|' and the
                                // character
                            } while (UCharacterProperty.isRuleWhiteSpace(ch));
                            break;
                        case 0x0023: // '#' - this is a comment, skip
                                     // everything through the end of line
                            do {
                                m_current_++;
                                ch = m_source_.charAt(m_current_);
                            } while (!isCharNewLine(ch));
                            break;
                        case 0x0021: // '!' - ignoring java set thai
                                     // reordering
                            break;
                        default:
                            if (newstrength == TOKEN_UNSET_) {
                                throwParseException(m_rules_, m_current_);
                            }
                            if (isSpecialChar(ch) && (inquote == false)) {
                                throwParseException(m_rules_, m_current_);
                            }
                            if (ch == 0x0000 && m_current_ + 1 == limit) {
                                break;
                            }
                            if (inchars) {
                                if (m_parsedToken_.m_charsLen_ == 0) {
                                    m_parsedToken_.m_charsOffset_
                                        = m_current_;
                                }
                                m_parsedToken_.m_charsLen_++;
                            } else {
                                if (newextensionlen == 0) {
                                    extensionoffset = m_current_;
                                }
                                newextensionlen++;
                            }
                            break;
                    }
                }
            }
            if (wasinquote) {
                if (ch != 0x27) {
                    m_source_.append(ch);
                    m_extraCurrent_++;
                }
            }
            m_current_++;
        }
        return doEndParseNextToken(newstrength, top, extensionoffset,
                                   newextensionlen, variabletop, before);
    }

    /**
     * Ends the parsing of the next token, filling in m_parsedToken_.
     * @param newstrength new strength
     * @param top whether the token is an indirect [top] style reset
     * @param extensionoffset offset of the expansion characters
     * @param newextensionlen length of the expansion characters
     * @param variabletop whether variable top was set
     * @param before the before flag bits, if a before attribute was set
     * @return offset in rules, -1 for end of rules
     */
    private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
                                    boolean top, /*int charsoffset,*/
                                    int extensionoffset, int newextensionlen,
                                    boolean variabletop, int before)
        throws ParseException {
        if (newstrength == TOKEN_UNSET_) {
            return -1;
        }
        if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
            throwParseException(m_rules_, m_current_);
        }

        m_parsedToken_.m_strength_ = newstrength;
        //m_parsedToken_.m_charsOffset_ = charsoffset;
        //m_parsedToken_.m_charsLen_ = newcharslen;
        m_parsedToken_.m_extensionOffset_ = extensionoffset;
        m_parsedToken_.m_extensionLen_ = newextensionlen;
        m_parsedToken_.m_flags_ = (char)
            ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
             | (top ? TOKEN_TOP_MASK_ : 0) | before);
        return m_current_;
    }

    /**
     * Gets the token that sorts directly before the argument element, fishing
     * the anchor from the UCA when the element has not been tailored yet.
     * @param sourcetoken token to find the previous token for, or null to use
     *        the current parsed token
     * @param strength collation strength of the before relation
     * @return the token before the source token
     * @exception ParseException thrown when rules have the wrong syntax
     */
    private Token getVirginBefore(Token sourcetoken, int strength)
        throws ParseException {
        // this is a virgin before - we need to fish the anchor from the UCA
        if (sourcetoken != null) {
            int offset = sourcetoken.m_source_ & 0xFFFFFF;
            m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        } else {
            m_UCAColEIter_.setText(m_source_.substring(
                m_parsedToken_.m_charsOffset_,
                m_parsedToken_.m_charsOffset_ + 1));
        }

        int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
        int basecontce = m_UCAColEIter_.next();
        if (basecontce == CollationElementIterator.NULLORDER) {
            basecontce = 0;
        }

        int ch = 0;

        if ((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_
                                  .PRIMARY_IMPLICIT_MIN_)
            && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_
                                     .PRIMARY_IMPLICIT_MAX_)) {
            /* implicits - */
            int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_
                          | (basecontce
                             & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
            ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw - 1);
            int primaryCE = RuleBasedCollator.impCEGen_
                                .getImplicitFromRaw(raw - 1);
            m_utilCEBuffer_[0] = primaryCE
                                 & RuleBasedCollator.CE_PRIMARY_MASK_
                                 | 0x0505;
            m_utilCEBuffer_[1] = (primaryCE << 16)
                                 & RuleBasedCollator.CE_PRIMARY_MASK_
                                 | RuleBasedCollator.CE_CONTINUATION_MARKER_;

            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            m_source_.append('\uFFFE');
            m_source_.append((char) ch);
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_++;

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token) m_hashTable_.get(m_utilToken_);

            if (sourcetoken == null) {
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                    = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                        = m_utilCEBuffer_[1];
                } else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;

                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            }

        } else {

            // first ce and second ce m_utilCEBuffer_
            int invpos = CollationParsedRuleBuilder.INVERSE_UCA_
                             .getInversePrevCE(basece, basecontce, strength,
                                               m_utilCEBuffer_);
            // we got the previous CE. Now we need to see if the difference
            // between the two CEs is really of the requested strength.
            // if it's a bigger difference (we asked for secondary and got
            // primary), we need to modify the CE.
            if (CollationParsedRuleBuilder.INVERSE_UCA_
                    .getCEStrengthDifference(basece, basecontce,
                                             m_utilCEBuffer_[0],
                                             m_utilCEBuffer_[1]) < strength) {
                // adjust the strength
                // now we are in the situation where our baseCE should
                // actually be modified in order to get the CE in the right
                // position.
                if (strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[0] = basece - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[0] = basece - 0x02;
                }
                if (RuleBasedCollator.isContinuation(basecontce)) {
                    if (strength == Collator.SECONDARY) {
                        m_utilCEBuffer_[1] = basecontce - 0x0200;
                    } else { // strength == UCOL_TERTIARY
                        m_utilCEBuffer_[1] = basecontce - 0x02;
                    }
                }
            }

            /*
            // the code below relies on getting a code point from the inverse
            // table, in order to be able to merge the situations like
            // &x < 9 &[before 1]a < d. This won't work:
            // 1. There are many code points that have the same CE
            // 2. The CE to codepoint table (things pointed to by
            //    CETable[3*invPos+2]) are broken.
            // Also, in case when there is no equivalent strength before an
            // element, we have to actually construct one. For example,
            // &[before 2]a << x won't result in x << a, because the element
            // before a is a primary difference.
            ch = CollationParsedRuleBuilder.INVERSE_UCA_
                     .m_table_[3 * invpos + 2];
            if ((ch & INVERSE_SIZE_MASK_) != 0) {
                int offset = ch & INVERSE_OFFSET_MASK_;
                ch = CollationParsedRuleBuilder.INVERSE_UCA_
                         .m_continuations_[offset];
            }
            m_source_.append((char) ch);
            m_extraCurrent_++;
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
            m_parsedToken_.m_charsLen_ = 1;

            // We got an UCA before. However, this might have been tailored.
            // example:
            // &\u30ca = \u306a
            // &[before 3]\u306a<<<\u306a|\u309d

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                     | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token) m_hashTable_.get(m_utilToken_);
            */

            // Here is how it should be: a situation such as &[before 1]a < x
            // should be resolved exactly as if we wrote &a > x.
            // Therefore, it does not matter whether the UCA value before a
            // has been tailored. What does matter is whether the strength
            // between our element and the previous element is bigger than
            // requested. So, if CE < baseCE and we asked for &[before 2],
            // we have to construct the base CE.
            // If we found a tailored thing, we would have to use the UCA
            // value and construct a new reset token with a constructed name.
            //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
                // character to which we want to anchor is already tailored.
                // We need to construct a new token which will be the anchor point
                //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
                //m_source_.append(ch);
                //m_extraCurrent_++;
                //m_parsedToken_.m_charsLen_++;
            // grab the preceding "[before n]" text (10 characters) into the
            // token name, so the constructed reset token gets its own name
            m_parsedToken_.m_charsOffset_ -= 10;
            m_parsedToken_.m_charsLen_ += 10;
            m_listHeader_[m_resultLength_] = new TokenListHeader();
            m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F;
            if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1];
            } else {
                m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
            }
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = false;
            sourcetoken = new Token();
            initAReset(-1, sourcetoken);
            //}
        }
        return sourcetoken;
    }

    /**
     * Processing description:
     * 1. Build a m_listHeader_. Each list has a header, which contains two
     *    lists (positive and negative), a reset token, a baseCE, a nextCE,
     *    and a previousCE. The lists and the reset may be null.
     * 2. As you process, keep a LAST pointer that points to the last token
     *    you handled.
     * @param expand string offset of the expansion, -1 for null strings
     * @param targetToken token to update
     * @return expandnext offset
     * @throws ParseException thrown when the rule syntax is invalid
     */
    private int initAReset(int expand, Token targetToken)
            throws ParseException {
        if (m_resultLength_ == m_listHeader_.length - 1) {
            // grow the header array; tokens hold references to the
            // TokenListHeader objects themselves, so copying the array of
            // references is safe
            TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
            System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
            m_listHeader_ = temp;
        }
        // do the reset thing
        targetToken.m_rules_ = m_source_;
        targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                | m_parsedToken_.m_charsOffset_;
        targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                | m_parsedToken_.m_extensionOffset_;
        // keep the flags around so that we know about before
        targetToken.m_flags_ = m_parsedToken_.m_flags_;

        if (m_parsedToken_.m_prefixOffset_ != 0) {
            // a reset token may not carry a prefix
            throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
        }

        targetToken.m_prefix_ = 0;
        // TODO: this should also handle reverse
        targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
        targetToken.m_strength_ = TOKEN_RESET_;
        targetToken.m_next_ = null;
        targetToken.m_previous_ = null;
        targetToken.m_CELength_ = 0;
        targetToken.m_expCELength_ = 0;
        targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
        m_listHeader_[m_resultLength_].m_first_ = null;
        m_listHeader_[m_resultLength_].m_last_ = null;
        m_listHeader_[m_resultLength_].m_reset_ = targetToken;

        /* 3. Consider each item: relation, source, and expansion:
         *    e.g. ...< x / y ...
         *    First convert all expansions into normal form. Examples:
         *    If "xy" doesn't occur earlier in the list or in the UCA, convert
         *    &xy * c * d * ... into &x * c/y * d * ...
         *    Note: reset values can never have expansions, although they can
         *    cause the very next item to have one. They may be contractions,
         *    if they are found earlier in the list.
         */
        int result = 0;
        if (expand > 0) {
            // check to see if there is an expansion
            if (m_parsedToken_.m_charsLen_ > 1) {
                targetToken.m_source_ = ((expand - m_parsedToken_.m_charsOffset_) << 24)
                        | m_parsedToken_.m_charsOffset_;
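                // split the packed fields: the reset token keeps only the
                // characters before the expansion offset, while the value
                // returned below packs the expansion's length (high 8 bits)
                // and offset (low 24 bits), mirroring Token.m_source_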
                result = ((m_parsedToken_.m_charsLen_
                        + m_parsedToken_.m_charsOffset_ - expand) << 24)
                        | expand;
            }
        }

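        // register the reset token; it serves as both key and value in the
        // hash table, which presumes Token's equals/hashCode are defined
        // over its packed source characters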
        m_resultLength_++;
        m_hashTable_.put(targetToken, targetToken);
        return result;
    }

    /**
     * Checks if a character is special, i.e. one of the ASCII punctuation
     * characters that are significant in the collation rule syntax.
     * @param ch character to test
     * @return true if the character is special
     */
    private static final boolean isSpecialChar(char ch) {
        return (ch <= 0x002F && ch >= 0x0020)
                || (ch <= 0x003F && ch >= 0x003A)
                || (ch <= 0x0060 && ch >= 0x005B)
                || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
    }

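    /**
     * Reads a UnicodeSet pattern from the rule string, scanning forward from
     * start to the first '[' and on to the matching closing ']'. For example
     * (illustrative only, not from the original source): with the rules
     * "&amp;a &lt; b [copy [a-z]]" and start pointing into the copy option,
     * this returns the set [a-z].
     * @param source rule string containing the set pattern
     * @param start offset from which to scan for the pattern
     * @return the parsed UnicodeSet
     * @throws ParseException thrown when the pattern brackets are unbalanced
     */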
    private UnicodeSet readAndSetUnicodeSet(String source, int start)
            throws ParseException {
        while (source.charAt(start) != '[') { // advance until the first '['
            start++;
        }
        // Now we need a balanced run of '[]'. A set pattern can contain
        // nested '[]' pairs, so we track the nesting depth until the
        // matching closing ']' is found.
        int noOpenBraces = 1;
        int current = 1; // skip the opening bracket
        while (start + current < source.length() && noOpenBraces != 0) {
            if (source.charAt(start + current) == '[') {
                noOpenBraces++;
            } else if (source.charAt(start + current) == ']') { // closing bracket
                noOpenBraces--;
            }
            current++;
        }
        //int nextBrace = -1;

        if (noOpenBraces != 0
                || (/*nextBrace =*/ source
                        .indexOf("]", start + current) /*']'*/) == -1) {
            throwParseException(m_rules_, start);
        }
        return new UnicodeSet(source.substring(start, start + current)); //uset_openPattern(start, current);
    }

    /**
     * In C, optionarg is passed to the function by reference.
     * We use a private int to simulate this.
     */
    private int m_optionarg_ = 0;

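    /**
     * Looks up the option name that starts at the given offset in
     * RULES_OPTIONS_. On a match, m_optionarg_ is set to the offset of the
     * option's argument (leading whitespace skipped), or left at 0 when the
     * option has no argument.
     * @param rules rule string to scan
     * @param start offset of the first character of the option name
     * @param optionend offset just past the end of the option
     * @return index of the option in RULES_OPTIONS_, or -1 if none matches
     */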
    private int readOption(String rules, int start, int optionend) {
        m_optionarg_ = 0;
        int i = 0;
        while (i < RULES_OPTIONS_.length) {
            String option = RULES_OPTIONS_[i].m_name_;
            int optionlength = option.length();
            if (rules.length() > start + optionlength
                    && option.equalsIgnoreCase(rules.substring(start,
                            start + optionlength))) {
                if (optionend - start > optionlength) {
                    m_optionarg_ = start + optionlength;
                    // skip whitespace to the start of the option argument
                    while (m_optionarg_ < optionend
                            && UCharacter.isWhitespace(rules
                                    .charAt(m_optionarg_))) {
                        m_optionarg_++;
                    }
                }
                break;
            }
            i++;
        }
        if (i == RULES_OPTIONS_.length) {
            i = -1;
        }
        return i;
    }

    /**
     * Reads and sets collation options.
     * @return TOKEN_SUCCESS_MASK_, possibly combined with other
     *         TOKEN_*_MASK_ bits, if the option is parsed correctly
     * @exception ParseException thrown when an option in the rules is invalid
     */
    private byte readAndSetOption() throws ParseException {
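        // The branches below consume RULES_OPTIONS_ indices as follows:
        // 0-6 attribute options, 7 variable top, 8 rearrange, 9 before,
        // 10 top, 11-12 first/last, 13-14 copy/remove.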
        int start = m_current_ + 1; // skip opening '['
        int i = readOption(m_rules_, start, m_optionEnd_);

        int optionarg = m_optionarg_;

        if (i < 0) {
            throwParseException(m_rules_, start);
        }

        if (i < 7) {
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                    String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                    int size = optionarg + subname.length();
                    if (m_rules_.length() > size
                            && subname.equalsIgnoreCase(m_rules_
                                    .substring(optionarg, size))) {
                        setOptions(m_options_,
                                RULES_OPTIONS_[i].m_attribute_,
                                RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                        return TOKEN_SUCCESS_MASK_;
                    }
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 7) { // variable top
            return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
        } else if (i == 8) { // rearrange
            return TOKEN_SUCCESS_MASK_;
        } else if (i == 9) { // before
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                    String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                    int size = optionarg + subname.length();
                    if (m_rules_.length() > size
                            && subname.equalsIgnoreCase(m_rules_
                                    .substring(optionarg, size))) {
                        return (byte) (TOKEN_SUCCESS_MASK_
                                | (RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] + 1));
                    }
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 10) { // top
            // we will have an array of structures with limit CEs; the index
            // into this array is stored in m_parsedToken_.m_indirectIndex_
            m_parsedToken_.m_indirectIndex_ = 0;
            return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        } else if (i < 13) { // first, last
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j++) {
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                        && subname.equalsIgnoreCase(m_rules_.substring(
                                optionarg, size))) {
                    m_parsedToken_.m_indirectIndex_ = (char) (i - 10 + (j << 1));
                    return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
                }
            }
            throwParseException(m_rules_, optionarg);
        } else if (i == 13 || i == 14) {
            // copy and remove are handled before normalization;
            // here we only need to move the end past the set
            int noOpenBraces = 1;
            m_current_++; // skip opening bracket
            while (m_current_ < m_source_.length() && noOpenBraces != 0) {
                if (m_source_.charAt(m_current_) == '[') {
                    noOpenBraces++;
                } else if (m_source_.charAt(m_current_) == ']') { // closing bracket
                    noOpenBraces--;
                }
                m_current_++;
            }
            m_optionEnd_ = m_current_ - 1;
            return TOKEN_SUCCESS_MASK_;
        } else {
            throwParseException(m_rules_, optionarg);
        }
        return TOKEN_SUCCESS_MASK_; // never reached: the branch above always throws
    }

    /**
     * Sets a collation option on the given option set.
     * @param optionset option set to update
     * @param attribute attribute type to set
     * @param value attribute value
     */
    private void setOptions(OptionSet optionset, int attribute, int value) {
        switch (attribute) {
            case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_:
                optionset.m_isHiragana4_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.FRENCH_COLLATION_:
                optionset.m_isFrenchCollation_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_:
                optionset.m_isAlternateHandlingShifted_ = (value == RuleBasedCollator.AttributeValue.SHIFTED_);
                break;
            case RuleBasedCollator.Attribute.CASE_FIRST_:
                optionset.m_caseFirst_ = value;
                break;
            case RuleBasedCollator.Attribute.CASE_LEVEL_:
                optionset.m_isCaseLevel_ = (value == RuleBasedCollator.AttributeValue.ON_);
                break;
            case RuleBasedCollator.Attribute.NORMALIZATION_MODE_:
                // ON maps to canonical decomposition; other values are
                // decomposition constants themselves
                if (value == RuleBasedCollator.AttributeValue.ON_) {
                    value = Collator.CANONICAL_DECOMPOSITION;
                }
                optionset.m_decomposition_ = value;
                break;
            case RuleBasedCollator.Attribute.STRENGTH_:
                optionset.m_strength_ = value;
                break;
            default:
                break;
        }
    }

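    /**
     * Collects the set of characters tailored by these rules: the rules are
     * tokenized, and for every non-reset token all canonically equivalent
     * FCD sequences of its characters are added to the set.
     * @return UnicodeSet of tailored characters and sequences
     * @throws ParseException thrown when the rule syntax is invalid
     */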
    UnicodeSet getTailoredSet() throws ParseException {
        boolean startOfRules = true;
        UnicodeSet tailored = new UnicodeSet();
        String pattern;
        CanonicalIterator it = new CanonicalIterator("");

        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        //int expandNext = 0;

        while (m_current_ < sourcelimit) {
            m_parsedToken_.m_prefixOffset_ = 0;
            if (parseNextToken(startOfRules) < 0) {
                // we have reached the end
                continue;
            }
            startOfRules = false;
            // The idea is to tokenize the rule set. For each non-reset token,
            // we add all the canonically equivalent FCD sequences.
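            // For example, if a token tailors "a\u0308" (a + combining
            // diaeresis), the precomposed form "\u00E4" is added as well,
            // since canonically equivalent sequences must sort identically.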
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                it.setSource(m_source_.substring(
                        m_parsedToken_.m_charsOffset_,
                        m_parsedToken_.m_charsOffset_
                                + m_parsedToken_.m_charsLen_));
                pattern = it.next();
                while (pattern != null) {
                    if (Normalizer.quickCheck(pattern, Normalizer.FCD, 0)
                            != Normalizer.NO) {
                        tailored.add(pattern);
                    }
                    pattern = it.next();
                }
            }
        }
        return tailored;
    }

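    /**
     * Pre-scans the raw rules (before normalization) for [copy ...] and
     * [remove ...] options and accumulates their UnicodeSets into
     * m_copySet_ and m_removeSet_ respectively.
     * @param rules the original collation rule string
     * @throws ParseException thrown when a set pattern is malformed
     */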
    private final void extractSetsFromRules(String rules)
            throws ParseException {
        int optionNumber = -1;
        int setStart = 0;
        int i = 0;
        while (i < rules.length()) {
            if (rules.charAt(i) == 0x005B) { // '['
                optionNumber = readOption(rules, i + 1, rules.length());
                setStart = m_optionarg_;
                if (optionNumber == 13) { // copy parts of the UCA to the tailoring
                    UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                    if (m_copySet_ == null) {
                        m_copySet_ = newSet;
                    } else {
                        m_copySet_.addAll(newSet);
                    }
                } else if (optionNumber == 14) { // remove
                    UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                    if (m_removeSet_ == null) {
                        m_removeSet_ = newSet;
                    } else {
                        m_removeSet_.addAll(newSet);
                    }
                }
            }
            i++;
        }
    }
}