0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */package com.ibm.icu.impl;
0007:
0008: import java.io.BufferedInputStream;
0009: import java.io.InputStream;
0010: import java.io.IOException;
0011: import java.util.Locale;
0012: import java.util.MissingResourceException;
0013:
0014: import com.ibm.icu.lang.UCharacter;
0015: import com.ibm.icu.lang.UCharacterCategory;
0016: import com.ibm.icu.lang.UProperty;
0017: import com.ibm.icu.text.BreakIterator;
0018: import com.ibm.icu.text.Normalizer;
0019: import com.ibm.icu.text.UCharacterIterator;
0020: import com.ibm.icu.text.UnicodeSet;
0021: import com.ibm.icu.text.UTF16;
0022: import com.ibm.icu.util.RangeValueIterator;
0023: import com.ibm.icu.util.ULocale;
0024: import com.ibm.icu.util.UResourceBundle;
0025: import com.ibm.icu.util.VersionInfo;
0026:
/**
 * <p>Internal class used for the Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, an array of char is
 * used to store the binary data.</p>
 * <p>UCharacterProperty also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 * @draft 2.1
 */
0042:
0043: public final class UCharacterProperty {
0044: // public data members -----------------------------------------------
0045:
0046: /**
0047: * Trie data
0048: */
0049: public CharTrie m_trie_;
0050: /**
0051: * Optimization
0052: * CharTrie index array
0053: */
0054: public char[] m_trieIndex_;
0055: /**
0056: * Optimization
0057: * CharTrie data array
0058: */
0059: public char[] m_trieData_;
/**
 * Optimization
 * CharTrie initial value
 */
0064: public int m_trieInitialValue_;
0065: /**
0066: * Unicode version
0067: */
0068: public VersionInfo m_unicodeVersion_;
0069: /**
0070: * Latin capital letter i with dot above
0071: */
0072: public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
/**
 * Latin small letter dotless i
 */
0076: public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
0077: /**
0078: * Latin lowercase i
0079: */
0080: public static final char LATIN_SMALL_LETTER_I_ = 0x69;
0081: /**
0082: * Character type mask
0083: */
0084: public static final int TYPE_MASK = 0x1F;
0085:
0086: // uprops.h enum UPropertySource --------------------------------------- ***
0087:
0088: /** No source, not a supported property. */
0089: public static final int SRC_NONE = 0;
0090: /** From uchar.c/uprops.icu main trie */
0091: public static final int SRC_CHAR = 1;
0092: /** From uchar.c/uprops.icu properties vectors trie */
0093: public static final int SRC_PROPSVEC = 2;
0094: /** Hangul_Syllable_Type, from uchar.c/uprops.icu */
0095: public static final int SRC_HST = 3;
0096: /** From unames.c/unames.icu */
0097: public static final int SRC_NAMES = 4;
0098: /** From unorm.cpp/unorm.icu */
0099: public static final int SRC_NORM = 5;
0100: /** From ucase.c/ucase.icu */
0101: public static final int SRC_CASE = 6;
0102: /** From ubidi_props.c/ubidi.icu */
0103: public static final int SRC_BIDI = 7;
0104: /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
0105: public static final int SRC_CHAR_AND_PROPSVEC = 8;
0106: /** One more than the highest UPropertySource (SRC_) constant. */
0107: public static final int SRC_COUNT = 9;
0108:
0109: // public methods ----------------------------------------------------
0110:
0111: /**
0112: * Java friends implementation
0113: */
0114: public void setIndexData(CharTrie.FriendAgent friendagent) {
0115: m_trieIndex_ = friendagent.getPrivateIndex();
0116: m_trieData_ = friendagent.getPrivateData();
0117: m_trieInitialValue_ = friendagent.getPrivateInitialValue();
0118: }
0119:
0120: /**
0121: * Gets the property value at the index.
0122: * This is optimized.
0123: * Note this is alittle different from CharTrie the index m_trieData_
0124: * is never negative.
0125: * @param ch code point whose property value is to be retrieved
0126: * @return property value of code point
0127: */
0128: public final int getProperty(int ch) {
0129: if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
0130: || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
0131: // BMP codepoint 0000..D7FF or DC00..FFFF
0132: // optimized
0133: try { // using try for ch < 0 is faster than using an if statement
0134: return m_trieData_[(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_)
0135: + (ch & Trie.INDEX_STAGE_3_MASK_)];
0136: } catch (ArrayIndexOutOfBoundsException e) {
0137: return m_trieInitialValue_;
0138: }
0139: }
0140: if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
0141: // lead surrogate D800..DBFF
0142: return m_trieData_[(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
0143: + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_)
0144: + (ch & Trie.INDEX_STAGE_3_MASK_)];
0145: }
0146: if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
0147: // supplementary code point 10000..10FFFF
0148: // look at the construction of supplementary characters
0149: // trail forms the ends of it.
0150: return m_trie_.getSurrogateValue(
0151: UTF16.getLeadSurrogate(ch),
0152: (char) (ch & Trie.SURROGATE_MASK_));
0153: }
0154: // ch is out of bounds
0155: // return m_dataOffset_ if there is an error, in this case we return
0156: // the default value: m_initialValue_
0157: // we cannot assume that m_initialValue_ is at offset 0
0158: // this is for optimization.
0159: return m_trieInitialValue_;
0160:
0161: // this all is an inlined form of return m_trie_.getCodePointValue(ch);
0162: }
0163:
0164: /**
0165: * Getting the signed numeric value of a character embedded in the property
0166: * argument
0167: * @param prop the character
0168: * @return signed numberic value
0169: */
0170: public static int getSignedValue(int prop) {
0171: return ((short) prop >> VALUE_SHIFT_);
0172: }
0173:
0174: /**
0175: * Getting the unsigned numeric value of a character embedded in the property
0176: * argument
0177: * @param prop the character
0178: * @return unsigned numberic value
0179: */
0180: ///CLOVER:OFF
0181: public static int getUnsignedValue(int prop) {
0182: return (prop >> VALUE_SHIFT_)
0183: & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
0184: }
0185:
0186: ///CLOVER:ON
0187:
0188: /* internal numeric pseudo-types for special encodings of numeric values */
0189: public static final int NT_FRACTION = 4; /* ==UCharacter.NumericType.COUNT, must not change unless binary format version changes */
0190: public static final int NT_LARGE = 5;
0191: public static final int NT_COUNT = 6;
0192:
0193: /**
0194: * Gets the unicode additional properties.
0195: * C version getUnicodeProperties.
0196: * @param codepoint codepoint whose additional properties is to be
0197: * retrieved
0198: * @param column
0199: * @return unicode properties
0200: */
0201: public int getAdditional(int codepoint, int column) {
0202: if (column == -1) {
0203: return getProperty(codepoint);
0204: }
0205: if (column < 0 || column >= m_additionalColumnsCount_) {
0206: return 0;
0207: }
0208: return m_additionalVectors_[m_additionalTrie_
0209: .getCodePointValue(codepoint)
0210: + column];
0211: }
0212:
0213: static final int MY_MASK = UCharacterProperty.TYPE_MASK
0214: & ((1 << UCharacterCategory.UPPERCASE_LETTER)
0215: | (1 << UCharacterCategory.LOWERCASE_LETTER)
0216: | (1 << UCharacterCategory.TITLECASE_LETTER)
0217: | (1 << UCharacterCategory.MODIFIER_LETTER) | (1 << UCharacterCategory.OTHER_LETTER));
0218:
0219: /**
0220: * <p>Get the "age" of the code point.</p>
0221: * <p>The "age" is the Unicode version when the code point was first
0222: * designated (as a non-character or for Private Use) or assigned a
0223: * character.</p>
0224: * <p>This can be useful to avoid emitting code points to receiving
0225: * processes that do not accept newer characters.</p>
0226: * <p>The data is from the UCD file DerivedAge.txt.</p>
0227: * <p>This API does not check the validity of the codepoint.</p>
0228: * @param codepoint The code point.
0229: * @return the Unicode version number
0230: * @draft ICU 2.1
0231: */
0232: public VersionInfo getAge(int codepoint) {
0233: int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
0234: return VersionInfo.getInstance((version >> FIRST_NIBBLE_SHIFT_)
0235: & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0);
0236: }
0237:
0238: private static final long UNSIGNED_INT_MASK = 0xffffffffL;
0239:
0240: private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
0241: private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
0242: private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
0243: private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
0244: private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
0245: private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
0246: /** Mask constant for multiple UCharCategory bits (Z Separators). */
0247: private static final int GC_Z_MASK = GC_ZS_MASK | GC_ZL_MASK
0248: | GC_ZP_MASK;
0249:
0250: /**
0251: * Checks if c is in
0252: * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
0253: * with space=\p{Whitespace} and Control=Cc.
0254: * Implements UCHAR_POSIX_GRAPH.
0255: * @internal
0256: */
0257: private static final boolean isgraphPOSIX(int c) {
0258: /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
0259: /* comparing ==0 returns FALSE for the categories mentioned */
0260: return (getMask(UCharacter.getType(c)) & (GC_CC_MASK
0261: | GC_CS_MASK | GC_CN_MASK | GC_Z_MASK)) == 0;
0262: }
0263:
0264: private static final class BinaryProperties {
0265: int column;
0266: long mask;
0267:
0268: public BinaryProperties(int column, long mask) {
0269: this .column = column;
0270: this .mask = mask;
0271: }
0272: }
0273:
0274: BinaryProperties[] binProps = {
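/*
 * column and mask values for binary properties from u_getUnicodeProperties().
 * Must be in order of corresponding UProperty,
 * and there must be exactly one entry per binary UProperty.
 * Entries with a zero mask hold an SRC_ source constant in the column slot
 * instead of a vector column (see hasBinaryProperty and getSource).
 */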
0280: new BinaryProperties(1, (1 << ALPHABETIC_PROPERTY_)),
0281: new BinaryProperties(1, (1 << ASCII_HEX_DIGIT_PROPERTY_)),
0282: new BinaryProperties(SRC_BIDI, 0), /* UCHAR_BIDI_CONTROL */
0283: new BinaryProperties(SRC_BIDI, 0), /* UCHAR_BIDI_MIRRORED */
0284: new BinaryProperties(1, (1 << DASH_PROPERTY_)),
0285: new BinaryProperties(1,
0286: (1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
0287: new BinaryProperties(1, (1 << DEPRECATED_PROPERTY_)),
0288: new BinaryProperties(1, (1 << DIACRITIC_PROPERTY_)),
0289: new BinaryProperties(1, (1 << EXTENDER_PROPERTY_)),
0290: new BinaryProperties(SRC_NORM, 0), /* UCHAR_FULL_COMPOSITION_EXCLUSION */
0291: new BinaryProperties(1, (1 << GRAPHEME_BASE_PROPERTY_)),
0292: new BinaryProperties(1, (1 << GRAPHEME_EXTEND_PROPERTY_)),
0293: new BinaryProperties(1, (1 << GRAPHEME_LINK_PROPERTY_)),
0294: new BinaryProperties(1, (1 << HEX_DIGIT_PROPERTY_)),
0295: new BinaryProperties(1, (1 << HYPHEN_PROPERTY_)),
0296: new BinaryProperties(1, (1 << ID_CONTINUE_PROPERTY_)),
0297: new BinaryProperties(1, (1 << ID_START_PROPERTY_)),
0298: new BinaryProperties(1, (1 << IDEOGRAPHIC_PROPERTY_)),
0299: new BinaryProperties(1,
0300: (1 << IDS_BINARY_OPERATOR_PROPERTY_)),
0301: new BinaryProperties(1,
0302: (1 << IDS_TRINARY_OPERATOR_PROPERTY_)),
0303: new BinaryProperties(SRC_BIDI, 0), /* UCHAR_JOIN_CONTROL */
0304: new BinaryProperties(1,
0305: (1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
0306: new BinaryProperties(SRC_CASE, 0), /* UCHAR_LOWERCASE */
0307: new BinaryProperties(1, (1 << MATH_PROPERTY_)),
0308: new BinaryProperties(1,
0309: (1 << NONCHARACTER_CODE_POINT_PROPERTY_)),
0310: new BinaryProperties(1, (1 << QUOTATION_MARK_PROPERTY_)),
0311: new BinaryProperties(1, (1 << RADICAL_PROPERTY_)),
0312: new BinaryProperties(SRC_CASE, 0), /* UCHAR_SOFT_DOTTED */
0313: new BinaryProperties(1,
0314: (1 << TERMINAL_PUNCTUATION_PROPERTY_)),
0315: new BinaryProperties(1, (1 << UNIFIED_IDEOGRAPH_PROPERTY_)),
0316: new BinaryProperties(SRC_CASE, 0), /* UCHAR_UPPERCASE */
0317: new BinaryProperties(1, (1 << WHITE_SPACE_PROPERTY_)),
0318: new BinaryProperties(1, (1 << XID_CONTINUE_PROPERTY_)),
0319: new BinaryProperties(1, (1 << XID_START_PROPERTY_)),
0320: new BinaryProperties(SRC_CASE, 0), /* UCHAR_CASE_SENSITIVE */
0321: new BinaryProperties(2, (1 << V2_S_TERM_PROPERTY_)),
0322: new BinaryProperties(2,
0323: (1 << V2_VARIATION_SELECTOR_PROPERTY_)),
0324: new BinaryProperties(SRC_NORM, 0), /* UCHAR_NFD_INERT */
0325: new BinaryProperties(SRC_NORM, 0), /* UCHAR_NFKD_INERT */
0326: new BinaryProperties(SRC_NORM, 0), /* UCHAR_NFC_INERT */
0327: new BinaryProperties(SRC_NORM, 0), /* UCHAR_NFKC_INERT */
0328: new BinaryProperties(SRC_NORM, 0), /* UCHAR_SEGMENT_STARTER */
0329: new BinaryProperties(2, (1 << V2_PATTERN_SYNTAX)),
0330: new BinaryProperties(2, (1 << V2_PATTERN_WHITE_SPACE)),
0331: new BinaryProperties(SRC_CHAR_AND_PROPSVEC, 0), /* UCHAR_POSIX_ALNUM */
0332: new BinaryProperties(SRC_CHAR, 0), /* UCHAR_POSIX_BLANK */
0333: new BinaryProperties(SRC_CHAR, 0), /* UCHAR_POSIX_GRAPH */
0334: new BinaryProperties(SRC_CHAR, 0), /* UCHAR_POSIX_PRINT */
0335: new BinaryProperties(SRC_CHAR, 0) /* UCHAR_POSIX_XDIGIT */
0336: };
0337:
0338: /**
0339: * <p>Check a binary Unicode property for a code point.</p>
0340: * <p>Unicode, especially in version 3.2, defines many more properties
0341: * than the original set in UnicodeData.txt.</p>
0342: * <p>This API is intended to reflect Unicode properties as defined in
0343: * the Unicode Character Database (UCD) and Unicode Technical Reports
0344: * (UTR).</p>
0345: * <p>For details about the properties see
0346: * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
0347: * <p>For names of Unicode properties see the UCD file
0348: * PropertyAliases.txt.</p>
0349: * <p>This API does not check the validity of the codepoint.</p>
0350: * <p>Important: If ICU is built with UCD files from Unicode versions
0351: * below 3.2, then properties marked with "new" are not or
0352: * not fully available.</p>
0353: * @param codepoint Code point to test.
0354: * @param property selector constant from com.ibm.icu.lang.UProperty,
0355: * identifies which binary property to check.
0356: * @return true or false according to the binary Unicode property value
0357: * for ch. Also false if property is out of bounds or if the
0358: * Unicode version does not have data for the property at all, or
0359: * not for this code point.
0360: * @see com.ibm.icu.lang.UProperty
0361: * @draft ICU 2.1
0362: */
0363:
0364: public boolean hasBinaryProperty(int codepoint, int property) {
0365: if (property < UProperty.BINARY_START
0366: || UProperty.BINARY_LIMIT <= property) {
0367: // not a known binary property
0368: return false;
0369: } else {
0370: long mask = binProps[property].mask;
0371: int column = binProps[property].column;
0372: if (mask != 0) {
0373: // systematic, directly stored properties
0374: return ((UNSIGNED_INT_MASK & getAdditional(codepoint,
0375: column)) & mask) != 0;
0376: } else {
0377: if (column == SRC_CASE) {
0378: /* case mapping properties */
0379: UCaseProps csp;
0380: try {
0381: csp = UCaseProps.getSingleton();
0382: } catch (IOException e) {
0383: return false;
0384: }
0385: switch (property) {
0386: case UProperty.LOWERCASE:
0387: return UCaseProps.LOWER == csp
0388: .getType(codepoint);
0389: case UProperty.UPPERCASE:
0390: return UCaseProps.UPPER == csp
0391: .getType(codepoint);
0392: case UProperty.SOFT_DOTTED:
0393: return csp.isSoftDotted(codepoint);
0394: case UProperty.CASE_SENSITIVE:
0395: return csp.isCaseSensitive(codepoint);
0396: default:
0397: break;
0398: }
0399: } else if (column == SRC_NORM) {
0400: /* normalization properties from unorm.icu */
0401: switch (property) {
0402: case UProperty.FULL_COMPOSITION_EXCLUSION:
0403: return NormalizerImpl
0404: .isFullCompositionExclusion(codepoint);
0405: case UProperty.NFD_INERT:
0406: return Normalizer.isNFSkippable(codepoint,
0407: Normalizer.NFD);
0408: case UProperty.NFKD_INERT:
0409: return Normalizer.isNFSkippable(codepoint,
0410: Normalizer.NFKD);
0411: case UProperty.NFC_INERT:
0412: return Normalizer.isNFSkippable(codepoint,
0413: Normalizer.NFC);
0414: case UProperty.NFKC_INERT:
0415: return Normalizer.isNFSkippable(codepoint,
0416: Normalizer.NFKC);
0417: case UProperty.SEGMENT_STARTER:
0418: return NormalizerImpl
0419: .isCanonSafeStart(codepoint);
0420: default:
0421: break;
0422: }
0423: } else if (column == SRC_BIDI) {
0424: /* bidi/shaping properties */
0425: UBiDiProps bdp;
0426: try {
0427: bdp = UBiDiProps.getSingleton();
0428: } catch (IOException e) {
0429: return false;
0430: }
0431: switch (property) {
0432: case UProperty.BIDI_MIRRORED:
0433: return bdp.isMirrored(codepoint);
0434: case UProperty.BIDI_CONTROL:
0435: return bdp.isBidiControl(codepoint);
0436: case UProperty.JOIN_CONTROL:
0437: return bdp.isJoinControl(codepoint);
0438: default:
0439: break;
0440: }
0441: } else if (column == SRC_CHAR) {
0442: switch (property) {
0443: case UProperty.POSIX_BLANK:
0444: // "horizontal space"
0445: if (codepoint <= 0x9f) {
0446: return codepoint == 9 || codepoint == 0x20; /* TAB or SPACE */
0447: } else {
0448: /* Zs */
0449: return UCharacter.getType(codepoint) == UCharacter.SPACE_SEPARATOR;
0450: }
0451: case UProperty.POSIX_GRAPH:
0452: return isgraphPOSIX(codepoint);
0453: case UProperty.POSIX_PRINT:
0454: /*
0455: * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
0456: *
0457: * The only cntrl character in graph+blank is TAB (in blank).
0458: * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
0459: */
0460: return (UCharacter.getType(codepoint) == UCharacter.SPACE_SEPARATOR)
0461: || isgraphPOSIX(codepoint);
0462: case UProperty.POSIX_XDIGIT:
0463: /* check ASCII and Fullwidth ASCII a-fA-F */
0464: if ((codepoint <= 0x66 && codepoint >= 0x41 && (codepoint <= 0x46 || codepoint >= 0x61))
0465: || (codepoint >= 0xff21
0466: && codepoint <= 0xff46 && (codepoint <= 0xff26 || codepoint >= 0xff41))) {
0467: return true;
0468: }
0469:
0470: return UCharacter.getType(codepoint) == UCharacter.DECIMAL_DIGIT_NUMBER;
0471: default:
0472: break;
0473: }
0474: } else if (column == SRC_CHAR_AND_PROPSVEC) {
0475: switch (property) {
0476: case UProperty.POSIX_ALNUM:
0477: return UCharacter.isUAlphabetic(codepoint)
0478: || UCharacter.isDigit(codepoint);
0479: default:
0480: break;
0481: }
0482: }
0483: }
0484: }
0485: return false;
0486: }
0487:
0488: public final int getSource(int which) {
0489: if (which < UProperty.BINARY_START) {
0490: return SRC_NONE; /* undefined */
0491: } else if (which < UProperty.BINARY_LIMIT) {
0492: if (binProps[which].mask != 0) {
0493: return SRC_PROPSVEC;
0494: } else {
0495: return binProps[which].column;
0496: }
0497: } else if (which < UProperty.INT_START) {
0498: return SRC_NONE; /* undefined */
0499: } else if (which < UProperty.INT_LIMIT) {
0500: switch (which) {
0501: case UProperty.GENERAL_CATEGORY:
0502: case UProperty.NUMERIC_TYPE:
0503: return SRC_CHAR;
0504:
0505: case UProperty.HANGUL_SYLLABLE_TYPE:
0506: return SRC_HST;
0507:
0508: case UProperty.CANONICAL_COMBINING_CLASS:
0509: case UProperty.NFD_QUICK_CHECK:
0510: case UProperty.NFKD_QUICK_CHECK:
0511: case UProperty.NFC_QUICK_CHECK:
0512: case UProperty.NFKC_QUICK_CHECK:
0513: case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
0514: case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
0515: return SRC_NORM;
0516:
0517: case UProperty.BIDI_CLASS:
0518: case UProperty.JOINING_GROUP:
0519: case UProperty.JOINING_TYPE:
0520: return SRC_BIDI;
0521:
0522: default:
0523: return SRC_PROPSVEC;
0524: }
0525: } else if (which < UProperty.STRING_START) {
0526: switch (which) {
0527: case UProperty.GENERAL_CATEGORY_MASK:
0528: case UProperty.NUMERIC_VALUE:
0529: return SRC_CHAR;
0530:
0531: default:
0532: return SRC_NONE;
0533: }
0534: } else if (which < UProperty.STRING_LIMIT) {
0535: switch (which) {
0536: case UProperty.AGE:
0537: return SRC_PROPSVEC;
0538:
0539: case UProperty.BIDI_MIRRORING_GLYPH:
0540: return SRC_BIDI;
0541:
0542: case UProperty.CASE_FOLDING:
0543: case UProperty.LOWERCASE_MAPPING:
0544: case UProperty.SIMPLE_CASE_FOLDING:
0545: case UProperty.SIMPLE_LOWERCASE_MAPPING:
0546: case UProperty.SIMPLE_TITLECASE_MAPPING:
0547: case UProperty.SIMPLE_UPPERCASE_MAPPING:
0548: case UProperty.TITLECASE_MAPPING:
0549: case UProperty.UPPERCASE_MAPPING:
0550: return SRC_CASE;
0551:
0552: case UProperty.ISO_COMMENT:
0553: case UProperty.NAME:
0554: case UProperty.UNICODE_1_NAME:
0555: return SRC_NAMES;
0556:
0557: default:
0558: return SRC_NONE;
0559: }
0560: } else {
0561: return SRC_NONE; /* undefined */
0562: }
0563: }
0564:
0565: /**
0566: * Forms a supplementary code point from the argument character<br>
0567: * Note this is for internal use hence no checks for the validity of the
0568: * surrogate characters are done
0569: * @param lead lead surrogate character
0570: * @param trail trailing surrogate character
0571: * @return code point of the supplementary character
0572: */
0573: public static int getRawSupplementary(char lead, char trail) {
0574: return (lead << LEAD_SURROGATE_SHIFT_) + trail
0575: + SURROGATE_OFFSET_;
0576: }
0577:
0578: /**
0579: * Loads the property data and initialize the UCharacterProperty instance.
0580: * @throws MissingResourceException when data is missing or data has been corrupted
0581: */
0582: public static UCharacterProperty getInstance() {
0583: if (INSTANCE_ == null) {
0584: try {
0585: INSTANCE_ = new UCharacterProperty();
0586: } catch (Exception e) {
0587: throw new MissingResourceException(e.getMessage(), "",
0588: "");
0589: }
0590: }
0591: return INSTANCE_;
0592: }
0593:
0594: /**
0595: * <p>
0596: * Unicode property names and property value names are compared
0597: * "loosely". Property[Value]Aliases.txt say:
0598: * <quote>
0599: * "With loose matching of property names, the case distinctions,
0600: * whitespace, and '_' are ignored."
0601: * </quote>
0602: * </p>
0603: * <p>
0604: * This function does just that, for ASCII (char *) name strings.
0605: * It is almost identical to ucnv_compareNames() but also ignores
0606: * ASCII White_Space characters (U+0009..U+000d).
0607: * </p>
0608: * @param name1 name to compare
0609: * @param name2 name to compare
0610: * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
0611: * if name1 is greater than name2.
0612: */
0613: /* to be implemented in 2.4
0614: * public static int comparePropertyNames(String name1, String name2)
0615: {
0616: int result = 0;
0617: int i1 = 0;
0618: int i2 = 0;
0619: while (true) {
0620: char ch1 = 0;
0621: char ch2 = 0;
0622: // Ignore delimiters '-', '_', and ASCII White_Space
0623: if (i1 < name1.length()) {
0624: ch1 = name1.charAt(i1 ++);
0625: }
0626: while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
0627: || ch1 == '\n' // synwee what is || ch1 == '\v'
0628: || ch1 == '\f' || ch1=='\r') {
0629: if (i1 < name1.length()) {
0630: ch1 = name1.charAt(i1 ++);
0631: }
0632: else {
0633: ch1 = 0;
0634: }
0635: }
0636: if (i2 < name2.length()) {
0637: ch2 = name2.charAt(i2 ++);
0638: }
0639: while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
0640: || ch2 == '\n' // synwee what is || ch1 == '\v'
0641: || ch2 == '\f' || ch2=='\r') {
0642: if (i2 < name2.length()) {
0643: ch2 = name2.charAt(i2 ++);
0644: }
0645: else {
0646: ch2 = 0;
0647: }
0648: }
0649:
0650: // If we reach the ends of both strings then they match
0651: if (ch1 == 0 && ch2 == 0) {
0652: return 0;
0653: }
0654:
0655: // Case-insensitive comparison
0656: if (ch1 != ch2) {
0657: result = Character.toLowerCase(ch1)
0658: - Character.toLowerCase(ch2);
0659: if (result != 0) {
0660: return result;
0661: }
0662: }
0663: }
0664: }
0665: */
0666:
0667: /**
0668: * Checks if the argument c is to be treated as a white space in ICU
0669: * rules. Usually ICU rule white spaces are ignored unless quoted.
0670: * Equivalent to test for Pattern_White_Space Unicode property.
0671: * Stable set of characters, won't change.
0672: * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
0673: * @param c codepoint to check
0674: * @return true if c is a ICU white space
0675: */
0676: public static boolean isRuleWhiteSpace(int c) {
0677: /* "white space" in the sense of ICU rule parsers
0678: This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
0679: See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
0680: U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
0681: Equivalent to test for Pattern_White_Space Unicode property.
0682: */
0683: return (c >= 0x0009 && c <= 0x2029 && (c <= 0x000D
0684: || c == 0x0020 || c == 0x0085 || c == 0x200E
0685: || c == 0x200F || c >= 0x2028));
0686: }
0687:
0688: /**
0689: * Get the the maximum values for some enum/int properties.
0690: * @return maximum values for the integer properties.
0691: */
0692: public int getMaxValues(int column) {
0693: // return m_maxBlockScriptValue_;
0694:
0695: switch (column) {
0696: case 0:
0697: return m_maxBlockScriptValue_;
0698: case 2:
0699: return m_maxJTGValue_;
0700: default:
0701: return 0;
0702: }
0703: }
0704:
0705: /**
0706: * Gets the type mask
0707: * @param type character type
0708: * @return mask
0709: */
0710: public static final int getMask(int type) {
0711: return 1 << type;
0712: }
0713:
0714: // protected variables -----------------------------------------------
0715:
0716: /**
0717: * Extra property trie
0718: */
0719: CharTrie m_additionalTrie_;
0720: /**
0721: * Extra property vectors, 1st column for age and second for binary
0722: * properties.
0723: */
0724: int m_additionalVectors_[];
0725: /**
0726: * Number of additional columns
0727: */
0728: int m_additionalColumnsCount_;
0729: /**
0730: * Maximum values for block, bits used as in vector word
0731: * 0
0732: */
0733: int m_maxBlockScriptValue_;
/**
 * Maximum values for the column 2 properties, as returned by
 * getMaxValues(2); presumably bits are used as in vector word 2
 * (TODO confirm against the uprops.icu format).
 */
0738: int m_maxJTGValue_;
0739: // private variables -------------------------------------------------
0740:
0741: /**
0742: * UnicodeData.txt property object
0743: */
0744: private static UCharacterProperty INSTANCE_ = null;
0745:
0746: /**
0747: * Default name of the datafile
0748: */
0749: private static final String DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE
0750: + "/uprops.icu";
0751:
0752: /**
0753: * Default buffer size of datafile
0754: */
0755: private static final int DATA_BUFFER_SIZE_ = 25000;
0756:
0757: /**
0758: * Numeric value shift
0759: */
0760: private static final int VALUE_SHIFT_ = 8;
0761:
0762: /**
0763: * Mask to be applied after shifting to obtain an unsigned numeric value
0764: */
0765: private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
0766:
0767: /**
0768: *
0769: */
0770: private static final int NUMERIC_TYPE_SHIFT = 5;
0771:
0772: /**
0773: * To get the last 5 bits out from a data type
0774: */
0775: private static final int LAST_5_BIT_MASK_ = 0x1F;
0776:
0777: /**
0778: * Shift 5 bits
0779: */
0780: private static final int SHIFT_5_ = 5;
0781: /**
0782: * Shift 10 bits
0783: */
0784: private static final int SHIFT_10_ = 10;
0785:
0786: /**
0787: * Shift value for lead surrogate to form a supplementary character.
0788: */
0789: private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
 * Offset to add to combined surrogate pair to avoid masking.
 */
0793: private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE
0794: - (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
0795: - UTF16.TRAIL_SURROGATE_MIN_VALUE;
0796: /**
0797: * Latin uppercase I
0798: */
0799: private static final char LATIN_CAPITAL_LETTER_I_ = 0x49;
0800: /**
0801: * Combining dot above
0802: */
0803: private static final char COMBINING_DOT_ABOVE_ = 0x307;
0804: /**
0805: * LATIN SMALL LETTER J
0806: */
0807: private static final int LATIN_SMALL_LETTER_J_ = 0x6a;
0808: /**
0809: * LATIN SMALL LETTER I WITH OGONEK
0810: */
0811: private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f;
0812: /**
0813: * LATIN SMALL LETTER I WITH TILDE BELOW
0814: */
0815: private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d;
0816: /**
0817: * LATIN SMALL LETTER I WITH DOT BELOW
0818: */
0819: private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb;
0820: /**
0821: * Combining class for combining mark above
0822: */
0823: private static final int COMBINING_MARK_ABOVE_CLASS_ = 230;
0824:
0825: /**
0826: * LATIN CAPITAL LETTER J
0827: */
0828: private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a;
0829:
0830: /**
0831: * LATIN CAPITAL LETTER I WITH OGONEK
0832: */
0833: private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e;
0834: /**
0835: * LATIN CAPITAL LETTER I WITH TILDE
0836: */
0837: private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128;
0838: /**
0839: * LATIN CAPITAL LETTER I WITH GRAVE
0840: */
0841: private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc;
0842: /**
0843: * LATIN CAPITAL LETTER I WITH ACUTE
0844: */
0845: private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd;
0846: /**
0847: * COMBINING GRAVE ACCENT
0848: */
0849: private static final int COMBINING_GRAVE_ACCENT_ = 0x300;
0850: /**
0851: * COMBINING ACUTE ACCENT
0852: */
0853: private static final int COMBINING_ACUTE_ACCENT_ = 0x301;
0854: /**
0855: * COMBINING TILDE
0856: */
0857: private static final int COMBINING_TILDE_ = 0x303;
0858: /**
0859: * Greek capital letter sigma
0860: */
0861: private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3;
0862: /**
0863: * Greek small letter sigma
0864: */
0865: private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3;
/**
 * Greek small letter final sigma (U+03C2); note that the constant's name
 * says "rho" although rho is U+03C1
 */
0869: private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2;
0870: /**
0871: * Hyphens
0872: */
0873: private static final int HYPHEN_ = 0x2010;
0874: private static final int SOFT_HYPHEN_ = 0xAD;
0875: /**
0876: * To get the last character out from a data type
0877: */
0878: private static final int LAST_CHAR_MASK_ = 0xFFFF;
0879: /**
0880: * To get the last byte out from a data type
0881: */
0882: private static final int LAST_BYTE_MASK_ = 0xFF;
0883: /**
0884: * Shift 16 bits
0885: */
0886: private static final int SHIFT_16_ = 16;
0887:
0888: // additional properties ----------------------------------------------
0889:
/**
 * Additional properties used in internal trie data
 */
/*
 * Properties in vector word 1
 * Each bit encodes one binary property.
 * The following constants represent the bit number, use 1 << XYZ_PROPERTY_
 * (written 1<<UPROPS_XYZ in ICU4C) to get the bit itself.
 * BINARY_1_TOP_PROPERTY_ (UPROPS_BINARY_1_TOP in ICU4C) must stay <= 32!
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 */
private static final int WHITE_SPACE_PROPERTY_ = 0;
private static final int BIDI_CONTROL_PROPERTY_ = 1;
private static final int JOIN_CONTROL_PROPERTY_ = 2;
private static final int DASH_PROPERTY_ = 3;
private static final int HYPHEN_PROPERTY_ = 4;
private static final int QUOTATION_MARK_PROPERTY_ = 5;
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 6;
private static final int MATH_PROPERTY_ = 7;
private static final int HEX_DIGIT_PROPERTY_ = 8;
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 9;
private static final int ALPHABETIC_PROPERTY_ = 10;
private static final int IDEOGRAPHIC_PROPERTY_ = 11;
private static final int DIACRITIC_PROPERTY_ = 12;
private static final int EXTENDER_PROPERTY_ = 13;
private static final int LOWERCASE_PROPERTY_ = 14;
private static final int UPPERCASE_PROPERTY_ = 15;
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 16;
private static final int GRAPHEME_EXTEND_PROPERTY_ = 17;
private static final int GRAPHEME_LINK_PROPERTY_ = 18;
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 19;
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 20;
private static final int RADICAL_PROPERTY_ = 21;
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 22;
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 23;
private static final int DEPRECATED_PROPERTY_ = 24;
private static final int SOFT_DOTTED_PROPERTY_ = 25;
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 26;
private static final int XID_START_PROPERTY_ = 27;
private static final int XID_CONTINUE_PROPERTY_ = 28;
private static final int ID_START_PROPERTY_ = 29;
private static final int ID_CONTINUE_PROPERTY_ = 30;
private static final int GRAPHEME_BASE_PROPERTY_ = 31;
// one more than the highest bit number above; a limit, not a property of its own
private static final int BINARY_1_TOP_PROPERTY_ = 32;
0937:
/**
 * First nibble shift: the width of a nibble in bits. Shifting a byte right
 * by this amount moves its upper four bits into the lower four.
 */
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
 * Last nibble mask: keeps the low four bits of a value
 */
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
 * Age value shift.
 * NOTE(review): presumably the bit offset of the age field inside a
 * properties vector word; the usage is not visible here - confirm.
 */
private static final int AGE_SHIFT_ = 24;

// boolean properties in vector word 2
// NOTE(review): presumably bit numbers to be used as 1 << V2_XYZ, like the
// vector word 1 constants - confirm against the code that reads them.
private static final int V2_S_TERM_PROPERTY_ = 24;
private static final int V2_VARIATION_SELECTOR_PROPERTY_ = 25;
private static final int V2_PATTERN_SYNTAX = 26; /* new in ICU 3.4 and Unicode 4.1 */
private static final int V2_PATTERN_WHITE_SPACE = 27;
0956:
0957: // private constructors --------------------------------------------------
0958:
0959: /**
0960: * Constructor
0961: * @exception IOException thrown when data reading fails or data corrupted
0962: */
0963: private UCharacterProperty() throws IOException {
0964: // jar access
0965: InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
0966: BufferedInputStream b = new BufferedInputStream(is,
0967: DATA_BUFFER_SIZE_);
0968: UCharacterPropertyReader reader = new UCharacterPropertyReader(
0969: b);
0970: reader.read(this );
0971: b.close();
0972:
0973: m_trie_.putIndexData(this );
0974: }
0975:
0976: // private methods -------------------------------------------------------
0977:
0978: /*
0979: * Compare additional properties to see if it has argument type
0980: * @param property 32 bit properties
0981: * @param type character type
0982: * @return true if property has type
0983: */
0984: /*private boolean compareAdditionalType(int property, int type)
0985: {
0986: return (property & (1 << type)) != 0;
0987: }*/
0988:
// property starts for UnicodeSet -------------------------------------- ***
// Named code points; addPropertyStarts() uses them to add range boundaries
// for properties that are hardcoded rather than enumerated from a trie.
private static final int TAB = 0x0009; // U+0009 CHARACTER TABULATION
private static final int LF = 0x000a; // U+000A LINE FEED
private static final int FF = 0x000c; // U+000C FORM FEED
private static final int CR = 0x000d; // U+000D CARRIAGE RETURN
private static final int U_A = 0x0041; // U+0041 LATIN CAPITAL LETTER A
private static final int U_F = 0x0046; // U+0046 LATIN CAPITAL LETTER F
private static final int U_Z = 0x005a; // U+005A LATIN CAPITAL LETTER Z
private static final int U_a = 0x0061; // U+0061 LATIN SMALL LETTER A
private static final int U_f = 0x0066; // U+0066 LATIN SMALL LETTER F
private static final int U_z = 0x007a; // U+007A LATIN SMALL LETTER Z
private static final int DEL = 0x007f; // U+007F DELETE
private static final int NL = 0x0085; // U+0085 NEXT LINE
private static final int NBSP = 0x00a0; // U+00A0 NO-BREAK SPACE
private static final int CGJ = 0x034f; // U+034F COMBINING GRAPHEME JOINER
private static final int FIGURESP = 0x2007; // U+2007 FIGURE SPACE
private static final int HAIRSP = 0x200a; // U+200A HAIR SPACE
private static final int ZWNJ = 0x200c; // U+200C ZERO WIDTH NON-JOINER
private static final int ZWJ = 0x200d; // U+200D ZERO WIDTH JOINER
private static final int RLM = 0x200f; // U+200F RIGHT-TO-LEFT MARK
private static final int NNBSP = 0x202f; // U+202F NARROW NO-BREAK SPACE
private static final int WJ = 0x2060; // U+2060 WORD JOINER
private static final int INHSWAP = 0x206a; // U+206A INHIBIT SYMMETRIC SWAPPING
private static final int NOMDIG = 0x206f; // U+206F NOMINAL DIGIT SHAPES
private static final int U_FW_A = 0xff21; // U+FF21 FULLWIDTH LATIN CAPITAL LETTER A
private static final int U_FW_F = 0xff26; // U+FF26 FULLWIDTH LATIN CAPITAL LETTER F
private static final int U_FW_Z = 0xff3a; // U+FF3A FULLWIDTH LATIN CAPITAL LETTER Z
private static final int U_FW_a = 0xff41; // U+FF41 FULLWIDTH LATIN SMALL LETTER A
private static final int U_FW_f = 0xff46; // U+FF46 FULLWIDTH LATIN SMALL LETTER F
private static final int U_FW_z = 0xff5a; // U+FF5A FULLWIDTH LATIN SMALL LETTER Z
private static final int ZWNBSP = 0xfeff; // U+FEFF ZERO WIDTH NO-BREAK SPACE
1020:
1021: /* for Hangul_Syllable_Type */
1022: public void uhst_addPropertyStarts(UnicodeSet set) {
1023: /* add code points with hardcoded properties, plus the ones following them */
1024:
1025: /*
1026: * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
1027: * First, we add fixed boundaries for the blocks of Jamos.
1028: * Then we check in loops to see where the current Unicode version
1029: * actually stops assigning such Jamos. We start each loop
1030: * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
1031: * (These have not changed since Unicode 2.)
1032: */
1033: int c, value, value2;
1034:
1035: set.add(0x1100);
1036: value = UCharacter.HangulSyllableType.LEADING_JAMO;
1037: for (c = 0x115a; c <= 0x115f; ++c) {
1038: value2 = UCharacter.getIntPropertyValue(c,
1039: UProperty.HANGUL_SYLLABLE_TYPE);
1040: if (value != value2) {
1041: value = value2;
1042: set.add(c);
1043: }
1044: }
1045:
1046: set.add(0x1160);
1047: value = UCharacter.HangulSyllableType.VOWEL_JAMO;
1048: for (c = 0x11a3; c <= 0x11a7; ++c) {
1049: value2 = UCharacter.getIntPropertyValue(c,
1050: UProperty.HANGUL_SYLLABLE_TYPE);
1051: if (value != value2) {
1052: value = value2;
1053: set.add(c);
1054: }
1055: }
1056:
1057: set.add(0x11a8);
1058: value = UCharacter.HangulSyllableType.TRAILING_JAMO;
1059: for (c = 0x11fa; c <= 0x11ff; ++c) {
1060: value2 = UCharacter.getIntPropertyValue(c,
1061: UProperty.HANGUL_SYLLABLE_TYPE);
1062: if (value != value2) {
1063: value = value2;
1064: set.add(c);
1065: }
1066: }
1067: }
1068:
1069: public UnicodeSet addPropertyStarts(UnicodeSet set) {
1070: /* add the start code point of each same-value range of the main trie */
1071: TrieIterator propsIter = new TrieIterator(m_trie_);
1072: RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
1073: while (propsIter.next(propsResult)) {
1074: set.add(propsResult.start);
1075: }
1076:
1077: /* add code points with hardcoded properties, plus the ones following them */
1078:
1079: /* add for u_isblank() */
1080: set.add(TAB);
1081: set.add(TAB + 1);
1082:
1083: /* add for IS_THAT_CONTROL_SPACE() */
1084: set.add(CR + 1); /* range TAB..CR */
1085: set.add(0x1c);
1086: set.add(0x1f + 1);
1087: set.add(NL);
1088: set.add(NL + 1);
1089:
1090: /* add for u_isIDIgnorable() what was not added above */
1091: set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
1092: set.add(HAIRSP);
1093: set.add(RLM + 1);
1094: set.add(INHSWAP);
1095: set.add(NOMDIG + 1);
1096: set.add(ZWNBSP);
1097: set.add(ZWNBSP + 1);
1098:
1099: /* add no-break spaces for u_isWhitespace() what was not added above */
1100: set.add(NBSP);
1101: set.add(NBSP + 1);
1102: set.add(FIGURESP);
1103: set.add(FIGURESP + 1);
1104: set.add(NNBSP);
1105: set.add(NNBSP + 1);
1106:
1107: /* add for u_charDigitValue() */
1108: // TODO remove when UCharacter.getHanNumericValue() is changed to just return
1109: // Unicode numeric values
1110: set.add(0x3007);
1111: set.add(0x3008);
1112: set.add(0x4e00);
1113: set.add(0x4e01);
1114: set.add(0x4e8c);
1115: set.add(0x4e8d);
1116: set.add(0x4e09);
1117: set.add(0x4e0a);
1118: set.add(0x56db);
1119: set.add(0x56dc);
1120: set.add(0x4e94);
1121: set.add(0x4e95);
1122: set.add(0x516d);
1123: set.add(0x516e);
1124: set.add(0x4e03);
1125: set.add(0x4e04);
1126: set.add(0x516b);
1127: set.add(0x516c);
1128: set.add(0x4e5d);
1129: set.add(0x4e5e);
1130:
1131: /* add for u_digit() */
1132: set.add(U_a);
1133: set.add(U_z + 1);
1134: set.add(U_A);
1135: set.add(U_Z + 1);
1136: set.add(U_FW_a);
1137: set.add(U_FW_z + 1);
1138: set.add(U_FW_A);
1139: set.add(U_FW_Z + 1);
1140:
1141: /* add for u_isxdigit() */
1142: set.add(U_f + 1);
1143: set.add(U_F + 1);
1144: set.add(U_FW_f + 1);
1145: set.add(U_FW_F + 1);
1146:
1147: /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
1148: set.add(WJ); /* range WJ..NOMDIG */
1149: set.add(0xfff0);
1150: set.add(0xfffb + 1);
1151: set.add(0xe0000);
1152: set.add(0xe0fff + 1);
1153:
1154: /* add for UCHAR_GRAPHEME_BASE and others */
1155: set.add(CGJ);
1156: set.add(CGJ + 1);
1157:
1158: return set; // for chaining
1159: }
1160:
1161: public void upropsvec_addPropertyStarts(UnicodeSet set) {
1162: /* add the start code point of each same-value range of the properties vectors trie */
1163: if (m_additionalColumnsCount_ > 0) {
1164: /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
1165: TrieIterator propsVectorsIter = new TrieIterator(
1166: m_additionalTrie_);
1167: RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
1168: while (propsVectorsIter.next(propsVectorsResult)) {
1169: set.add(propsVectorsResult.start);
1170: }
1171: }
1172: }
1173:
1174: /*----------------------------------------------------------------
1175: * Inclusions list
1176: *----------------------------------------------------------------*/
1177:
1178: /*
1179: * Return a set of characters for property enumeration.
1180: * The set implicitly contains 0x110000 as well, which is one more than the highest
1181: * Unicode code point.
1182: *
1183: * This set is used as an ordered list - its code points are ordered, and
1184: * consecutive code points (in Unicode code point order) in the set define a range.
1185: * For each two consecutive characters (start, limit) in the set,
1186: * all of the UCD/normalization and related properties for
1187: * all code points start..limit-1 are all the same,
1188: * except for character names and ISO comments.
1189: *
1190: * All Unicode code points U+0000..U+10ffff are covered by these ranges.
1191: * The ranges define a partition of the Unicode code space.
1192: * ICU uses the inclusions set to enumerate properties for generating
1193: * UnicodeSets containing all code points that have a certain property value.
1194: *
1195: * The Inclusion List is generated from the UCD. It is generated
1196: * by enumerating the data tries, and code points for hardcoded properties
1197: * are added as well.
1198: *
1199: * --------------------------------------------------------------------------
1200: *
1201: * The following are ideas for getting properties-unique code point ranges,
1202: * with possible optimizations beyond the current implementation.
1203: * These optimizations would require more code and be more fragile.
1204: * The current implementation generates one single list (set) for all properties.
1205: *
1206: * To enumerate properties efficiently, one needs to know ranges of
1207: * repetitive values, so that the value of only each start code point
1208: * can be applied to the whole range.
1209: * This information is in principle available in the uprops.icu/unorm.icu data.
1210: *
1211: * There are two obstacles:
1212: *
1213: * 1. Some properties are computed from multiple data structures,
1214: * making it necessary to get repetitive ranges by intersecting
1215: * ranges from multiple tries.
1216: *
1217: * 2. It is not economical to write code for getting repetitive ranges
1218: * that are precise for each of some 50 properties.
1219: *
1220: * Compromise ideas:
1221: *
1222: * - Get ranges per trie, not per individual property.
1223: * Each range contains the same values for a whole group of properties.
1224: * This would generate currently five range sets, two for uprops.icu tries
1225: * and three for unorm.icu tries.
1226: *
1227: * - Combine sets of ranges for multiple tries to get sufficient sets
1228: * for properties, e.g., the uprops.icu main and auxiliary tries
1229: * for all non-normalization properties.
1230: *
1231: * Ideas for representing ranges and combining them:
1232: *
1233: * - A UnicodeSet could hold just the start code points of ranges.
1234: * Multiple sets are easily combined by or-ing them together.
1235: *
1236: * - Alternatively, a UnicodeSet could hold each even-numbered range.
1237: * All ranges could be enumerated by using each start code point
1238: * (for the even-numbered ranges) as well as each limit (end+1) code point
1239: * (for the odd-numbered ranges).
1240: * It should be possible to combine two such sets by xor-ing them,
1241: * but no more than two.
1242: *
1243: * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
1244: * but the first one is certainly simpler and applicable for combining more than
1245: * two range sets.
1246: *
1247: * It is possible to combine all range sets for all uprops/unorm tries into one
1248: * set that can be used for all properties.
1249: * As an optimization, there could be less-combined range sets for certain
1250: * groups of properties.
1251: * The relationship of which less-combined range set to use for which property
1252: * depends on the implementation of the properties and must be hardcoded
1253: * - somewhat error-prone and higher maintenance but can be tested easily
1254: * by building property sets "the simple way" in test code.
1255: *
1256: * ---
1257: *
1258: * Do not use a UnicodeSet pattern because that causes infinite recursion;
1259: * UnicodeSet depends on the inclusions set.
1260: *
1261: * ---
1262: *
1263: * getInclusions() is commented out starting 2005-feb-12 because
1264: * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
1265: * and only for the relevant property source.
1266: */
1267: /*
1268: public UnicodeSet getInclusions() {
1269: UnicodeSet set = new UnicodeSet();
1270: NormalizerImpl.addPropertyStarts(set);
1271: addPropertyStarts(set);
1272: return set;
1273: }
1274: */
1275: }
|