0001: //##header
0002: /**
0003: *******************************************************************************
0004: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007: */package com.ibm.icu.lang;
0008:
0009: import java.io.IOException;
0010: import java.lang.ref.SoftReference;
0011: import java.util.HashMap;
0012: import java.util.Locale;
0013: import java.util.Map;
0014: import java.util.MissingResourceException;
0015:
0016: import com.ibm.icu.impl.UBiDiProps;
0017: import com.ibm.icu.impl.UCaseProps;
0018: import com.ibm.icu.impl.NormalizerImpl;
0019: import com.ibm.icu.impl.UCharacterUtility;
0020: import com.ibm.icu.impl.UCharacterName;
0021: import com.ibm.icu.impl.UCharacterNameChoice;
0022: import com.ibm.icu.impl.UPropertyAliases;
0023: import com.ibm.icu.lang.UCharacterEnums.*;
0024: import com.ibm.icu.text.BreakIterator;
0025: import com.ibm.icu.text.UTF16;
0026: import com.ibm.icu.impl.UCharacterProperty;
0027: import com.ibm.icu.util.RangeValueIterator;
0028: import com.ibm.icu.util.ULocale;
0029: import com.ibm.icu.util.ValueIterator;
0030: import com.ibm.icu.util.VersionInfo;
0031:
0032: /**
0033: * <p>
0034: * The UCharacter class provides extensions to the
0035: * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
0036: * java.lang.Character</a> class. These extensions provide support for
0037: * more Unicode properties and together with the <a href="../text/UTF16.html">UTF16</a>
0038: * class, provide support for supplementary characters (those with code
0039: * points above U+FFFF).
0040: * Each ICU release supports the latest version of Unicode available at that time.
0041: * </p>
0042: * <p>
0043: * Code points are represented in these APIs using ints. While it would be
0044: * more convenient in Java to have a separate primitive datatype for them,
0045: * ints suffice in the meantime.
0046: * </p>
0047: * <p>
0048: * To use this class please add the jar file icu4j.jar to the
0049: * class path, since it contains data files which supply the information used
0050: * by this file.<br>
0051: * E.g. In Windows <br>
0052: * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
0053: * Otherwise, another method would be to copy the files uprops.dat and
0054: * unames.icu from the icu4j source subdirectory
0055: * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
0056: * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
0057: * </p>
0058: * <p>
0059: * Aside from the additions for UTF-16 support, and the updated Unicode
0060: * properties, the main differences between UCharacter and Character are:
0061: * <ul>
0062: * <li> UCharacter is not designed to be a char wrapper and does not have
0063: * APIs that involve management of that single char.<br>
0064: * These include:
0065: * <ul>
0066: * <li> char charValue(),
0067: * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
0068: * </ul>
0069: * <li> UCharacter does not include Character APIs that are deprecated, nor
0070: * does it include the Java-specific character information, such as
0071: * boolean isJavaIdentifierPart(char ch).
0072: * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
0073: * values '10' - '35'. UCharacter also does this in digit and
0074: * getNumericValue, to adhere to the java semantics of these
0075: * methods. New methods unicodeDigit, and
0076: * getUnicodeNumericValue do not treat the above code points
0077: * as having numeric values. This is a semantic change from ICU4J 1.3.1.
0078: * </ul>
0079: * <p>
0080: * Further detailed differences can be determined from the program
0081: * <a href="http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
0082: * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
0083: * </p>
0084: * <p>
0085: * In addition to Java compatibility functions, which calculate derived properties,
0086: * this API provides low-level access to the Unicode Character Database.
0087: * </p>
0088: * <p>
0089: * Unicode assigns each code point (not just assigned character) values for
0090: * many properties.
0091: * Most of them are simple boolean flags, or constants from a small enumerated list.
0092: * For some properties, values are strings or other relatively more complex types.
0093: * </p>
0094: * <p>
0095: * For more information see
0096: * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
0097: * and the ICU User Guide chapter on Properties (http://icu.sourceforge.net/userguide/properties.html).
0098: * </p>
0099: * <p>
0100: * There are also functions that provide easy migration from C/POSIX functions
0101: * like isblank(). Their use is generally discouraged because the C/POSIX
0102: * standards do not define their semantics beyond the ASCII range, which means
0103: * that different implementations exhibit very different behavior.
0104: * Instead, Unicode properties should be used directly.
0105: * </p>
0106: * <p>
0107: * There are also only a few, broad C/POSIX character classes, and they tend
0108: * to be used for conflicting purposes. For example, the "isalpha()" class
0109: * is sometimes used to determine word boundaries, while a more sophisticated
0110: * approach would at least distinguish initial letters from continuation
0111: * characters (the latter including combining marks).
0112: * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
0113: * Another example: There is no "istitle()" class for titlecase characters.
0114: * </p>
0115: * <p>
0116: * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
0117: * ICU implements them according to the Standard Recommendations in
0118: * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
0119: * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
0120: * </p>
0121: * <p>
0122: * API access for C/POSIX character classes is as follows:
0123: * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
0124: * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
0125: * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
0126: * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
0127: * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
0128: * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
0129: * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
0130: * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
0131: * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
0132: * - cntrl: getType(c)==CONTROL
0133: * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
0134: * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
0135: * </p>
0136: * <p>
0137: * The C/POSIX character classes are also available in UnicodeSet patterns,
0138: * using patterns like [:graph:] or \p{graph}.
0139: * </p>
0140: * <p>
0141: * Note: There are several ICU (and Java) whitespace functions.
0142: * Comparison:
0143: * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
0144: * most of general categories "Z" (separators) + most whitespace ISO controls
0145: * (including no-break spaces, but excluding IS1..IS4 and ZWSP)
0146: * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
0147: * - isSpaceChar: just Z (including no-break spaces)
0148: * </p>
0149: * <p>
0150: * This class is not subclassable
0151: * </p>
0152: * @author Syn Wee Quek
0153: * @stable ICU 2.1
0154: * @see com.ibm.icu.lang.UCharacterEnums
0155: */
0156:
0157: public final class UCharacter implements ECharacterCategory,
0158: ECharacterDirection {
0159: // public inner classes ----------------------------------------------
0160:
0161: /**
0162: * A family of character subsets representing the character blocks in the
0163: * Unicode specification, generated from Unicode Data file Blocks.txt.
0164: * Character blocks generally define characters used for a specific script
0165: * or purpose. A character is contained by at most one Unicode block.
0166: * @stable ICU 2.4
0167: */
0168: public static final class UnicodeBlock extends Character.Subset {
0169: // blocks objects ---------------------------------------------------
0170:
0171: /**
0172: * @stable ICU 2.6
0173: */
0174: public static final UnicodeBlock NO_BLOCK = new UnicodeBlock(
0175: "NO_BLOCK", 0);
0176:
0177: /**
0178: * @stable ICU 2.4
0179: */
0180: public static final UnicodeBlock BASIC_LATIN = new UnicodeBlock(
0181: "BASIC_LATIN", 1);
0182: /**
0183: * @stable ICU 2.4
0184: */
0185: public static final UnicodeBlock LATIN_1_SUPPLEMENT = new UnicodeBlock(
0186: "LATIN_1_SUPPLEMENT", 2);
0187: /**
0188: * @stable ICU 2.4
0189: */
0190: public static final UnicodeBlock LATIN_EXTENDED_A = new UnicodeBlock(
0191: "LATIN_EXTENDED_A", 3);
0192: /**
0193: * @stable ICU 2.4
0194: */
0195: public static final UnicodeBlock LATIN_EXTENDED_B = new UnicodeBlock(
0196: "LATIN_EXTENDED_B", 4);
0197: /**
0198: * @stable ICU 2.4
0199: */
0200: public static final UnicodeBlock IPA_EXTENSIONS = new UnicodeBlock(
0201: "IPA_EXTENSIONS", 5);
0202: /**
0203: * @stable ICU 2.4
0204: */
0205: public static final UnicodeBlock SPACING_MODIFIER_LETTERS = new UnicodeBlock(
0206: "SPACING_MODIFIER_LETTERS", 6);
0207: /**
0208: * @stable ICU 2.4
0209: */
0210: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS = new UnicodeBlock(
0211: "COMBINING_DIACRITICAL_MARKS", 7);
0212: /**
0213: * Unicode 3.2 renames this block to "Greek and Coptic".
0214: * @stable ICU 2.4
0215: */
0216: public static final UnicodeBlock GREEK = new UnicodeBlock(
0217: "GREEK", 8);
0218: /**
0219: * @stable ICU 2.4
0220: */
0221: public static final UnicodeBlock CYRILLIC = new UnicodeBlock(
0222: "CYRILLIC", 9);
0223: /**
0224: * @stable ICU 2.4
0225: */
0226: public static final UnicodeBlock ARMENIAN = new UnicodeBlock(
0227: "ARMENIAN", 10);
0228: /**
0229: * @stable ICU 2.4
0230: */
0231: public static final UnicodeBlock HEBREW = new UnicodeBlock(
0232: "HEBREW", 11);
0233: /**
0234: * @stable ICU 2.4
0235: */
0236: public static final UnicodeBlock ARABIC = new UnicodeBlock(
0237: "ARABIC", 12);
0238: /**
0239: * @stable ICU 2.4
0240: */
0241: public static final UnicodeBlock SYRIAC = new UnicodeBlock(
0242: "SYRIAC", 13);
0243: /**
0244: * @stable ICU 2.4
0245: */
0246: public static final UnicodeBlock THAANA = new UnicodeBlock(
0247: "THAANA", 14);
0248: /**
0249: * @stable ICU 2.4
0250: */
0251: public static final UnicodeBlock DEVANAGARI = new UnicodeBlock(
0252: "DEVANAGARI", 15);
0253: /**
0254: * @stable ICU 2.4
0255: */
0256: public static final UnicodeBlock BENGALI = new UnicodeBlock(
0257: "BENGALI", 16);
0258: /**
0259: * @stable ICU 2.4
0260: */
0261: public static final UnicodeBlock GURMUKHI = new UnicodeBlock(
0262: "GURMUKHI", 17);
0263: /**
0264: * @stable ICU 2.4
0265: */
0266: public static final UnicodeBlock GUJARATI = new UnicodeBlock(
0267: "GUJARATI", 18);
0268: /**
0269: * @stable ICU 2.4
0270: */
0271: public static final UnicodeBlock ORIYA = new UnicodeBlock(
0272: "ORIYA", 19);
0273: /**
0274: * @stable ICU 2.4
0275: */
0276: public static final UnicodeBlock TAMIL = new UnicodeBlock(
0277: "TAMIL", 20);
0278: /**
0279: * @stable ICU 2.4
0280: */
0281: public static final UnicodeBlock TELUGU = new UnicodeBlock(
0282: "TELUGU", 21);
0283: /**
0284: * @stable ICU 2.4
0285: */
0286: public static final UnicodeBlock KANNADA = new UnicodeBlock(
0287: "KANNADA", 22);
0288: /**
0289: * @stable ICU 2.4
0290: */
0291: public static final UnicodeBlock MALAYALAM = new UnicodeBlock(
0292: "MALAYALAM", 23);
0293: /**
0294: * @stable ICU 2.4
0295: */
0296: public static final UnicodeBlock SINHALA = new UnicodeBlock(
0297: "SINHALA", 24);
0298: /**
0299: * @stable ICU 2.4
0300: */
0301: public static final UnicodeBlock THAI = new UnicodeBlock(
0302: "THAI", 25);
0303: /**
0304: * @stable ICU 2.4
0305: */
0306: public static final UnicodeBlock LAO = new UnicodeBlock("LAO",
0307: 26);
0308: /**
0309: * @stable ICU 2.4
0310: */
0311: public static final UnicodeBlock TIBETAN = new UnicodeBlock(
0312: "TIBETAN", 27);
0313: /**
0314: * @stable ICU 2.4
0315: */
0316: public static final UnicodeBlock MYANMAR = new UnicodeBlock(
0317: "MYANMAR", 28);
0318: /**
0319: * @stable ICU 2.4
0320: */
0321: public static final UnicodeBlock GEORGIAN = new UnicodeBlock(
0322: "GEORGIAN", 29);
0323: /**
0324: * @stable ICU 2.4
0325: */
0326: public static final UnicodeBlock HANGUL_JAMO = new UnicodeBlock(
0327: "HANGUL_JAMO", 30);
0328: /**
0329: * @stable ICU 2.4
0330: */
0331: public static final UnicodeBlock ETHIOPIC = new UnicodeBlock(
0332: "ETHIOPIC", 31);
0333: /**
0334: * @stable ICU 2.4
0335: */
0336: public static final UnicodeBlock CHEROKEE = new UnicodeBlock(
0337: "CHEROKEE", 32);
0338: /**
0339: * @stable ICU 2.4
0340: */
0341: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = new UnicodeBlock(
0342: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 33);
0343: /**
0344: * @stable ICU 2.4
0345: */
0346: public static final UnicodeBlock OGHAM = new UnicodeBlock(
0347: "OGHAM", 34);
0348: /**
0349: * @stable ICU 2.4
0350: */
0351: public static final UnicodeBlock RUNIC = new UnicodeBlock(
0352: "RUNIC", 35);
0353: /**
0354: * @stable ICU 2.4
0355: */
0356: public static final UnicodeBlock KHMER = new UnicodeBlock(
0357: "KHMER", 36);
0358: /**
0359: * @stable ICU 2.4
0360: */
0361: public static final UnicodeBlock MONGOLIAN = new UnicodeBlock(
0362: "MONGOLIAN", 37);
0363: /**
0364: * @stable ICU 2.4
0365: */
0366: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL = new UnicodeBlock(
0367: "LATIN_EXTENDED_ADDITIONAL", 38);
0368: /**
0369: * @stable ICU 2.4
0370: */
0371: public static final UnicodeBlock GREEK_EXTENDED = new UnicodeBlock(
0372: "GREEK_EXTENDED", 39);
0373: /**
0374: * @stable ICU 2.4
0375: */
0376: public static final UnicodeBlock GENERAL_PUNCTUATION = new UnicodeBlock(
0377: "GENERAL_PUNCTUATION", 40);
0378: /**
0379: * @stable ICU 2.4
0380: */
0381: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS = new UnicodeBlock(
0382: "SUPERSCRIPTS_AND_SUBSCRIPTS", 41);
0383: /**
0384: * @stable ICU 2.4
0385: */
0386: public static final UnicodeBlock CURRENCY_SYMBOLS = new UnicodeBlock(
0387: "CURRENCY_SYMBOLS", 42);
0388: /**
0389: * Unicode 3.2 renames this block to "Combining Diacritical Marks for
0390: * Symbols".
0391: * @stable ICU 2.4
0392: */
0393: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS = new UnicodeBlock(
0394: "COMBINING_MARKS_FOR_SYMBOLS", 43);
0395: /**
0396: * @stable ICU 2.4
0397: */
0398: public static final UnicodeBlock LETTERLIKE_SYMBOLS = new UnicodeBlock(
0399: "LETTERLIKE_SYMBOLS", 44);
0400: /**
0401: * @stable ICU 2.4
0402: */
0403: public static final UnicodeBlock NUMBER_FORMS = new UnicodeBlock(
0404: "NUMBER_FORMS", 45);
0405: /**
0406: * @stable ICU 2.4
0407: */
0408: public static final UnicodeBlock ARROWS = new UnicodeBlock(
0409: "ARROWS", 46);
0410: /**
0411: * @stable ICU 2.4
0412: */
0413: public static final UnicodeBlock MATHEMATICAL_OPERATORS = new UnicodeBlock(
0414: "MATHEMATICAL_OPERATORS", 47);
0415: /**
0416: * @stable ICU 2.4
0417: */
0418: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL = new UnicodeBlock(
0419: "MISCELLANEOUS_TECHNICAL", 48);
0420: /**
0421: * @stable ICU 2.4
0422: */
0423: public static final UnicodeBlock CONTROL_PICTURES = new UnicodeBlock(
0424: "CONTROL_PICTURES", 49);
0425: /**
0426: * @stable ICU 2.4
0427: */
0428: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION = new UnicodeBlock(
0429: "OPTICAL_CHARACTER_RECOGNITION", 50);
0430: /**
0431: * @stable ICU 2.4
0432: */
0433: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS = new UnicodeBlock(
0434: "ENCLOSED_ALPHANUMERICS", 51);
0435: /**
0436: * @stable ICU 2.4
0437: */
0438: public static final UnicodeBlock BOX_DRAWING = new UnicodeBlock(
0439: "BOX_DRAWING", 52);
0440: /**
0441: * @stable ICU 2.4
0442: */
0443: public static final UnicodeBlock BLOCK_ELEMENTS = new UnicodeBlock(
0444: "BLOCK_ELEMENTS", 53);
0445: /**
0446: * @stable ICU 2.4
0447: */
0448: public static final UnicodeBlock GEOMETRIC_SHAPES = new UnicodeBlock(
0449: "GEOMETRIC_SHAPES", 54);
0450: /**
0451: * @stable ICU 2.4
0452: */
0453: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS = new UnicodeBlock(
0454: "MISCELLANEOUS_SYMBOLS", 55);
0455: /**
0456: * @stable ICU 2.4
0457: */
0458: public static final UnicodeBlock DINGBATS = new UnicodeBlock(
0459: "DINGBATS", 56);
0460: /**
0461: * @stable ICU 2.4
0462: */
0463: public static final UnicodeBlock BRAILLE_PATTERNS = new UnicodeBlock(
0464: "BRAILLE_PATTERNS", 57);
0465: /**
0466: * @stable ICU 2.4
0467: */
0468: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT = new UnicodeBlock(
0469: "CJK_RADICALS_SUPPLEMENT", 58);
0470: /**
0471: * @stable ICU 2.4
0472: */
0473: public static final UnicodeBlock KANGXI_RADICALS = new UnicodeBlock(
0474: "KANGXI_RADICALS", 59);
0475: /**
0476: * @stable ICU 2.4
0477: */
0478: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS = new UnicodeBlock(
0479: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 60);
0480: /**
0481: * @stable ICU 2.4
0482: */
0483: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION = new UnicodeBlock(
0484: "CJK_SYMBOLS_AND_PUNCTUATION", 61);
0485: /**
0486: * @stable ICU 2.4
0487: */
0488: public static final UnicodeBlock HIRAGANA = new UnicodeBlock(
0489: "HIRAGANA", 62);
0490: /**
0491: * @stable ICU 2.4
0492: */
0493: public static final UnicodeBlock KATAKANA = new UnicodeBlock(
0494: "KATAKANA", 63);
0495: /**
0496: * @stable ICU 2.4
0497: */
0498: public static final UnicodeBlock BOPOMOFO = new UnicodeBlock(
0499: "BOPOMOFO", 64);
0500: /**
0501: * @stable ICU 2.4
0502: */
0503: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO = new UnicodeBlock(
0504: "HANGUL_COMPATIBILITY_JAMO", 65);
0505: /**
0506: * @stable ICU 2.4
0507: */
0508: public static final UnicodeBlock KANBUN = new UnicodeBlock(
0509: "KANBUN", 66);
0510: /**
0511: * @stable ICU 2.4
0512: */
0513: public static final UnicodeBlock BOPOMOFO_EXTENDED = new UnicodeBlock(
0514: "BOPOMOFO_EXTENDED", 67);
0515: /**
0516: * @stable ICU 2.4
0517: */
0518: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS = new UnicodeBlock(
0519: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 68);
0520: /**
0521: * @stable ICU 2.4
0522: */
0523: public static final UnicodeBlock CJK_COMPATIBILITY = new UnicodeBlock(
0524: "CJK_COMPATIBILITY", 69);
0525: /**
0526: * @stable ICU 2.4
0527: */
0528: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = new UnicodeBlock(
0529: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 70);
0530: /**
0531: * @stable ICU 2.4
0532: */
0533: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS = new UnicodeBlock(
0534: "CJK_UNIFIED_IDEOGRAPHS", 71);
0535: /**
0536: * @stable ICU 2.4
0537: */
0538: public static final UnicodeBlock YI_SYLLABLES = new UnicodeBlock(
0539: "YI_SYLLABLES", 72);
0540: /**
0541: * @stable ICU 2.4
0542: */
0543: public static final UnicodeBlock YI_RADICALS = new UnicodeBlock(
0544: "YI_RADICALS", 73);
0545: /**
0546: * @stable ICU 2.4
0547: */
0548: public static final UnicodeBlock HANGUL_SYLLABLES = new UnicodeBlock(
0549: "HANGUL_SYLLABLES", 74);
0550: /**
0551: * @stable ICU 2.4
0552: */
0553: public static final UnicodeBlock HIGH_SURROGATES = new UnicodeBlock(
0554: "HIGH_SURROGATES", 75);
0555: /**
0556: * @stable ICU 2.4
0557: */
0558: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES = new UnicodeBlock(
0559: "HIGH_PRIVATE_USE_SURROGATES", 76);
0560: /**
0561: * @stable ICU 2.4
0562: */
0563: public static final UnicodeBlock LOW_SURROGATES = new UnicodeBlock(
0564: "LOW_SURROGATES", 77);
0565: /**
0566: * Same as public static final int PRIVATE_USE.
0567: * Until Unicode 3.1.1; the corresponding block name was "Private Use";
0568: * and multiple code point ranges had this block.
0569: * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
0570: * and adds separate blocks for the supplementary PUAs.
0571: * @stable ICU 2.4
0572: */
0573: public static final UnicodeBlock PRIVATE_USE_AREA = new UnicodeBlock(
0574: "PRIVATE_USE_AREA", 78);
0575: /**
0576: * Same as public static final int PRIVATE_USE_AREA.
0577: * Until Unicode 3.1.1; the corresponding block name was "Private Use";
0578: * and multiple code point ranges had this block.
0579: * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
0580: * and adds separate blocks for the supplementary PUAs.
0581: * @stable ICU 2.4
0582: */
0583: public static final UnicodeBlock PRIVATE_USE = PRIVATE_USE_AREA;
0584: /**
0585: * @stable ICU 2.4
0586: */
0587: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS = new UnicodeBlock(
0588: "CJK_COMPATIBILITY_IDEOGRAPHS", 79);
0589: /**
0590: * @stable ICU 2.4
0591: */
0592: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS = new UnicodeBlock(
0593: "ALPHABETIC_PRESENTATION_FORMS", 80);
0594: /**
0595: * @stable ICU 2.4
0596: */
0597: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A = new UnicodeBlock(
0598: "ARABIC_PRESENTATION_FORMS_A", 81);
0599: /**
0600: * @stable ICU 2.4
0601: */
0602: public static final UnicodeBlock COMBINING_HALF_MARKS = new UnicodeBlock(
0603: "COMBINING_HALF_MARKS", 82);
0604: /**
0605: * @stable ICU 2.4
0606: */
0607: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS = new UnicodeBlock(
0608: "CJK_COMPATIBILITY_FORMS", 83);
0609: /**
0610: * @stable ICU 2.4
0611: */
0612: public static final UnicodeBlock SMALL_FORM_VARIANTS = new UnicodeBlock(
0613: "SMALL_FORM_VARIANTS", 84);
0614: /**
0615: * @stable ICU 2.4
0616: */
0617: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B = new UnicodeBlock(
0618: "ARABIC_PRESENTATION_FORMS_B", 85);
0619: /**
0620: * @stable ICU 2.4
0621: */
0622: public static final UnicodeBlock SPECIALS = new UnicodeBlock(
0623: "SPECIALS", 86);
0624: /**
0625: * @stable ICU 2.4
0626: */
0627: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS = new UnicodeBlock(
0628: "HALFWIDTH_AND_FULLWIDTH_FORMS", 87);
0629: /**
0630: * @stable ICU 2.4
0631: */
0632: public static final UnicodeBlock OLD_ITALIC = new UnicodeBlock(
0633: "OLD_ITALIC", 88);
0634: /**
0635: * @stable ICU 2.4
0636: */
0637: public static final UnicodeBlock GOTHIC = new UnicodeBlock(
0638: "GOTHIC", 89);
0639: /**
0640: * @stable ICU 2.4
0641: */
0642: public static final UnicodeBlock DESERET = new UnicodeBlock(
0643: "DESERET", 90);
0644: /**
0645: * @stable ICU 2.4
0646: */
0647: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS = new UnicodeBlock(
0648: "BYZANTINE_MUSICAL_SYMBOLS", 91);
0649: /**
0650: * @stable ICU 2.4
0651: */
0652: public static final UnicodeBlock MUSICAL_SYMBOLS = new UnicodeBlock(
0653: "MUSICAL_SYMBOLS", 92);
0654: /**
0655: * @stable ICU 2.4
0656: */
0657: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS = new UnicodeBlock(
0658: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 93);
0659: /**
0660: * @stable ICU 2.4
0661: */
0662: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = new UnicodeBlock(
0663: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 94);
0664: /**
0665: * @stable ICU 2.4
0666: */
0667: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = new UnicodeBlock(
0668: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 95);
0669: /**
0670: * @stable ICU 2.4
0671: */
0672: public static final UnicodeBlock TAGS = new UnicodeBlock(
0673: "TAGS", 96);
0674:
0675: // New blocks in Unicode 3.2
0676:
0677: /**
0678: * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
0679: * @stable ICU 2.4
0680: */
0681: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY = new UnicodeBlock(
0682: "CYRILLIC_SUPPLEMENTARY", 97);
0683: /**
0684: * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
0685: * @stable ICU 3.0
0686: */
0687: public static final UnicodeBlock CYRILLIC_SUPPLEMENT = new UnicodeBlock(
0688: "CYRILLIC_SUPPLEMENT", 97);
0689: /**
0690: * @stable ICU 2.4
0691: */
0692: public static final UnicodeBlock TAGALOG = new UnicodeBlock(
0693: "TAGALOG", 98);
0694: /**
0695: * @stable ICU 2.4
0696: */
0697: public static final UnicodeBlock HANUNOO = new UnicodeBlock(
0698: "HANUNOO", 99);
0699: /**
0700: * @stable ICU 2.4
0701: */
0702: public static final UnicodeBlock BUHID = new UnicodeBlock(
0703: "BUHID", 100);
0704: /**
0705: * @stable ICU 2.4
0706: */
0707: public static final UnicodeBlock TAGBANWA = new UnicodeBlock(
0708: "TAGBANWA", 101);
0709: /**
0710: * @stable ICU 2.4
0711: */
0712: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = new UnicodeBlock(
0713: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 102);
0714: /**
0715: * @stable ICU 2.4
0716: */
0717: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A = new UnicodeBlock(
0718: "SUPPLEMENTAL_ARROWS_A", 103);
0719: /**
0720: * @stable ICU 2.4
0721: */
0722: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B = new UnicodeBlock(
0723: "SUPPLEMENTAL_ARROWS_B", 104);
0724: /**
0725: * @stable ICU 2.4
0726: */
0727: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = new UnicodeBlock(
0728: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 105);
0729: /**
0730: * @stable ICU 2.4
0731: */
0732: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS = new UnicodeBlock(
0733: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 106);
0734: /**
0735: * @stable ICU 2.4
0736: */
0737: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS = new UnicodeBlock(
0738: "KATAKANA_PHONETIC_EXTENSIONS", 107);
0739: /**
0740: * @stable ICU 2.4
0741: */
0742: public static final UnicodeBlock VARIATION_SELECTORS = new UnicodeBlock(
0743: "VARIATION_SELECTORS", 108);
0744: /**
0745: * @stable ICU 2.4
0746: */
0747: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A = new UnicodeBlock(
0748: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 109);
0749: /**
0750: * @stable ICU 2.4
0751: */
0752: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B = new UnicodeBlock(
0753: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 110);
0754:
0755: /**
0756: * @stable ICU 2.6
0757: */
0758: public static final UnicodeBlock LIMBU = new UnicodeBlock(
0759: "LIMBU", 111);
0760: /**
0761: * @stable ICU 2.6
0762: */
0763: public static final UnicodeBlock TAI_LE = new UnicodeBlock(
0764: "TAI LE", 112);
0765: /**
0766: * @stable ICU 2.6
0767: */
0768: public static final UnicodeBlock KHMER_SYMBOLS = new UnicodeBlock(
0769: "KHMER SYMBOLS", 113);
0770:
0771: /**
0772: * @stable ICU 2.6
0773: */
0774: public static final UnicodeBlock PHONETIC_EXTENSIONS = new UnicodeBlock(
0775: "PHONETIC EXTENSIONS", 114);
0776:
0777: /**
0778: * @stable ICU 2.6
0779: */
0780: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS = new UnicodeBlock(
0781: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 115);
0782: /**
0783: * @stable ICU 2.6
0784: */
0785: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS = new UnicodeBlock(
0786: "YIJING_HEXAGRAM_SYMBOLS", 116);
0787: /**
0788: * @stable ICU 2.6
0789: */
0790: public static final UnicodeBlock LINEAR_B_SYLLABARY = new UnicodeBlock(
0791: "LINEAR_B_SYLLABARY", 117);
0792: /**
0793: * @stable ICU 2.6
0794: */
0795: public static final UnicodeBlock LINEAR_B_IDEOGRAMS = new UnicodeBlock(
0796: "LINEAR_B_IDEOGRAMS", 118);
0797: /**
0798: * @stable ICU 2.6
0799: */
0800: public static final UnicodeBlock AEGEAN_NUMBERS = new UnicodeBlock(
0801: "AEGEAN_NUMBERS", 119);
0802: /**
0803: * @stable ICU 2.6
0804: */
0805: public static final UnicodeBlock UGARITIC = new UnicodeBlock(
0806: "UGARITIC", 120);
0807: /**
0808: * @stable ICU 2.6
0809: */
0810: public static final UnicodeBlock SHAVIAN = new UnicodeBlock(
0811: "SHAVIAN", 121);
0812: /**
0813: * @stable ICU 2.6
0814: */
0815: public static final UnicodeBlock OSMANYA = new UnicodeBlock(
0816: "OSMANYA", 122);
0817: /**
0818: * @stable ICU 2.6
0819: */
0820: public static final UnicodeBlock CYPRIOT_SYLLABARY = new UnicodeBlock(
0821: "CYPRIOT_SYLLABARY", 123);
0822: /**
0823: * @stable ICU 2.6
0824: */
0825: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS = new UnicodeBlock(
0826: "TAI_XUAN_JING_SYMBOLS", 124);
0827:
0828: /**
0829: * @stable ICU 2.6
0830: */
0831: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT = new UnicodeBlock(
0832: "VARIATION_SELECTORS_SUPPLEMENT", 125);
0833:
0834: /* New blocks in Unicode 4.1 */
0835:
0836: /**
0837: * @draft ICU 3.4
0838: * @provisional This API might change or be removed in a future release.
0839: */
0840: public static final UnicodeBlock ANCIENT_GREEK_MUSICAL_NOTATION = new UnicodeBlock(
0841: "ANCIENT_GREEK_MUSICAL_NOTATION", 126); /*[1D200]*/
0842:
0843: /**
0844: * @draft ICU 3.4
0845: * @provisional This API might change or be removed in a future release.
0846: */
0847: public static final UnicodeBlock ANCIENT_GREEK_NUMBERS = new UnicodeBlock(
0848: "ANCIENT_GREEK_NUMBERS", 127); /*[10140]*/
0849:
0850: /**
0851: * @draft ICU 3.4
0852: * @provisional This API might change or be removed in a future release.
0853: */
0854: public static final UnicodeBlock ARABIC_SUPPLEMENT = new UnicodeBlock(
0855: "ARABIC_SUPPLEMENT", 128); /*[0750]*/
0856:
0857: /**
0858: * @draft ICU 3.4
0859: * @provisional This API might change or be removed in a future release.
0860: */
0861: public static final UnicodeBlock BUGINESE = new UnicodeBlock(
0862: "BUGINESE", 129); /*[1A00]*/
0863:
0864: /**
0865: * @draft ICU 3.4
0866: * @provisional This API might change or be removed in a future release.
0867: */
0868: public static final UnicodeBlock CJK_STROKES = new UnicodeBlock(
0869: "CJK_STROKES", 130); /*[31C0]*/
0870:
0871: /**
0872: * @draft ICU 3.4
0873: * @provisional This API might change or be removed in a future release.
0874: */
0875: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = new UnicodeBlock(
0876: "COMBINING_DIACRITICAL_MARKS_SUPPLEMENT", 131); /*[1DC0]*/
0877:
0878: /**
0879: * @draft ICU 3.4
0880: * @provisional This API might change or be removed in a future release.
0881: */
0882: public static final UnicodeBlock COPTIC = new UnicodeBlock(
0883: "COPTIC", 132); /*[2C80]*/
0884:
0885: /**
0886: * @draft ICU 3.4
0887: * @provisional This API might change or be removed in a future release.
0888: */
0889: public static final UnicodeBlock ETHIOPIC_EXTENDED = new UnicodeBlock(
0890: "ETHIOPIC_EXTENDED", 133); /*[2D80]*/
0891:
0892: /**
0893: * @draft ICU 3.4
0894: * @provisional This API might change or be removed in a future release.
0895: */
0896: public static final UnicodeBlock ETHIOPIC_SUPPLEMENT = new UnicodeBlock(
0897: "ETHIOPIC_SUPPLEMENT", 134); /*[1380]*/
0898:
0899: /**
0900: * @draft ICU 3.4
0901: * @provisional This API might change or be removed in a future release.
0902: */
0903: public static final UnicodeBlock GEORGIAN_SUPPLEMENT = new UnicodeBlock(
0904: "GEORGIAN_SUPPLEMENT", 135); /*[2D00]*/
0905:
0906: /**
0907: * @draft ICU 3.4
0908: * @provisional This API might change or be removed in a future release.
0909: */
0910: public static final UnicodeBlock GLAGOLITIC = new UnicodeBlock(
0911: "GLAGOLITIC", 136); /*[2C00]*/
0912:
0913: /**
0914: * @draft ICU 3.4
0915: * @provisional This API might change or be removed in a future release.
0916: */
0917: public static final UnicodeBlock KHAROSHTHI = new UnicodeBlock(
0918: "KHAROSHTHI", 137); /*[10A00]*/
0919:
0920: /**
0921: * @draft ICU 3.4
0922: * @provisional This API might change or be removed in a future release.
0923: */
0924: public static final UnicodeBlock MODIFIER_TONE_LETTERS = new UnicodeBlock(
0925: "MODIFIER_TONE_LETTERS", 138); /*[A700]*/
0926:
0927: /**
0928: * @draft ICU 3.4
0929: * @provisional This API might change or be removed in a future release.
0930: */
0931: public static final UnicodeBlock NEW_TAI_LUE = new UnicodeBlock(
0932: "NEW_TAI_LUE", 139); /*[1980]*/
0933:
0934: /**
0935: * @draft ICU 3.4
0936: * @provisional This API might change or be removed in a future release.
0937: */
0938: public static final UnicodeBlock OLD_PERSIAN = new UnicodeBlock(
0939: "OLD_PERSIAN", 140); /*[103A0]*/
0940:
0941: /**
0942: * @draft ICU 3.4
0943: * @provisional This API might change or be removed in a future release.
0944: */
0945: public static final UnicodeBlock PHONETIC_EXTENSIONS_SUPPLEMENT = new UnicodeBlock(
0946: "PHONETIC_EXTENSIONS_SUPPLEMENT", 141); /*[1D80]*/
0947:
0948: /**
0949: * @draft ICU 3.4
0950: * @provisional This API might change or be removed in a future release.
0951: */
0952: public static final UnicodeBlock SUPPLEMENTAL_PUNCTUATION = new UnicodeBlock(
0953: "SUPPLEMENTAL_PUNCTUATION", 142); /*[2E00]*/
0954:
0955: /**
0956: * @draft ICU 3.4
0957: * @provisional This API might change or be removed in a future release.
0958: */
0959: public static final UnicodeBlock SYLOTI_NAGRI = new UnicodeBlock(
0960: "SYLOTI_NAGRI", 143); /*[A800]*/
0961:
0962: /**
0963: * @draft ICU 3.4
0964: * @provisional This API might change or be removed in a future release.
0965: */
0966: public static final UnicodeBlock TIFINAGH = new UnicodeBlock(
0967: "TIFINAGH", 144); /*[2D30]*/
0968:
0969: /**
0970: * @draft ICU 3.4
0971: * @provisional This API might change or be removed in a future release.
0972: */
0973: public static final UnicodeBlock VERTICAL_FORMS = new UnicodeBlock(
0974: "VERTICAL_FORMS", 145); /*[FE10]*/
0975:
0976: /**
0977: * @draft ICU 3.6
0978: * @provisional This API might change or be removed in a future release.
0979: */
0980: public static final UnicodeBlock NKO = new UnicodeBlock("NKO",
0981: 146); /*[07C0]*/
0982: /**
0983: * @draft ICU 3.6
0984: * @provisional This API might change or be removed in a future release.
0985: */
0986: public static final UnicodeBlock BALINESE = new UnicodeBlock(
0987: "BALINESE", 147); /*[1B00]*/
0988: /**
0989: * @draft ICU 3.6
0990: * @provisional This API might change or be removed in a future release.
0991: */
0992: public static final UnicodeBlock LATIN_EXTENDED_C = new UnicodeBlock(
0993: "LATIN_EXTENDED_C", 148); /*[2C60]*/
0994: /**
0995: * @draft ICU 3.6
0996: * @provisional This API might change or be removed in a future release.
0997: */
0998: public static final UnicodeBlock LATIN_EXTENDED_D = new UnicodeBlock(
0999: "LATIN_EXTENDED_D", 149); /*[A720]*/
1000: /**
1001: * @draft ICU 3.6
1002: * @provisional This API might change or be removed in a future release.
1003: */
1004: public static final UnicodeBlock PHAGS_PA = new UnicodeBlock(
1005: "PHAGS_PA", 150); /*[A840]*/
1006: /**
1007: * @draft ICU 3.6
1008: * @provisional This API might change or be removed in a future release.
1009: */
1010: public static final UnicodeBlock PHOENICIAN = new UnicodeBlock(
1011: "PHOENICIAN", 151); /*[10900]*/
1012: /**
1013: * @draft ICU 3.6
1014: * @provisional This API might change or be removed in a future release.
1015: */
1016: public static final UnicodeBlock CUNEIFORM = new UnicodeBlock(
1017: "CUNEIFORM", 152); /*[12000]*/
1018: /**
1019: * @draft ICU 3.6
1020: * @provisional This API might change or be removed in a future release.
1021: */
1022: public static final UnicodeBlock CUNEIFORM_NUMBERS_AND_PUNCTUATION = new UnicodeBlock(
1023: "CUNEIFORM_NUMBERS_AND_PUNCTUATION", 153); /*[12400]*/
1024: /**
1025: * @draft ICU 3.6
1026: * @provisional This API might change or be removed in a future release.
1027: */
1028: public static final UnicodeBlock COUNTING_ROD_NUMERALS = new UnicodeBlock(
1029: "COUNTING_ROD_NUMERALS", 154); /*[1D360]*/
1030:
1031: /**
1032: * @stable ICU 2.4
1033: */
1034: public static final UnicodeBlock INVALID_CODE = new UnicodeBlock(
1035: "INVALID_CODE", -1);
1036:
1037: // block id corresponding to icu4c -----------------------------------
1038:
1039: /**
1040: * @stable ICU 2.4
1041: */
1042: public static final int INVALID_CODE_ID = -1;
1043: /**
1044: * @stable ICU 2.4
1045: */
1046: public static final int BASIC_LATIN_ID = 1;
1047: /**
1048: * @stable ICU 2.4
1049: */
1050: public static final int LATIN_1_SUPPLEMENT_ID = 2;
1051: /**
1052: * @stable ICU 2.4
1053: */
1054: public static final int LATIN_EXTENDED_A_ID = 3;
1055: /**
1056: * @stable ICU 2.4
1057: */
1058: public static final int LATIN_EXTENDED_B_ID = 4;
1059: /**
1060: * @stable ICU 2.4
1061: */
1062: public static final int IPA_EXTENSIONS_ID = 5;
1063: /**
1064: * @stable ICU 2.4
1065: */
1066: public static final int SPACING_MODIFIER_LETTERS_ID = 6;
1067: /**
1068: * @stable ICU 2.4
1069: */
1070: public static final int COMBINING_DIACRITICAL_MARKS_ID = 7;
1071: /**
1072: * Unicode 3.2 renames this block to "Greek and Coptic".
1073: * @stable ICU 2.4
1074: */
1075: public static final int GREEK_ID = 8;
1076: /**
1077: * @stable ICU 2.4
1078: */
1079: public static final int CYRILLIC_ID = 9;
1080: /**
1081: * @stable ICU 2.4
1082: */
1083: public static final int ARMENIAN_ID = 10;
1084: /**
1085: * @stable ICU 2.4
1086: */
1087: public static final int HEBREW_ID = 11;
1088: /**
1089: * @stable ICU 2.4
1090: */
1091: public static final int ARABIC_ID = 12;
1092: /**
1093: * @stable ICU 2.4
1094: */
1095: public static final int SYRIAC_ID = 13;
1096: /**
1097: * @stable ICU 2.4
1098: */
1099: public static final int THAANA_ID = 14;
1100: /**
1101: * @stable ICU 2.4
1102: */
1103: public static final int DEVANAGARI_ID = 15;
1104: /**
1105: * @stable ICU 2.4
1106: */
1107: public static final int BENGALI_ID = 16;
1108: /**
1109: * @stable ICU 2.4
1110: */
1111: public static final int GURMUKHI_ID = 17;
1112: /**
1113: * @stable ICU 2.4
1114: */
1115: public static final int GUJARATI_ID = 18;
1116: /**
1117: * @stable ICU 2.4
1118: */
1119: public static final int ORIYA_ID = 19;
1120: /**
1121: * @stable ICU 2.4
1122: */
1123: public static final int TAMIL_ID = 20;
1124: /**
1125: * @stable ICU 2.4
1126: */
1127: public static final int TELUGU_ID = 21;
1128: /**
1129: * @stable ICU 2.4
1130: */
1131: public static final int KANNADA_ID = 22;
1132: /**
1133: * @stable ICU 2.4
1134: */
1135: public static final int MALAYALAM_ID = 23;
1136: /**
1137: * @stable ICU 2.4
1138: */
1139: public static final int SINHALA_ID = 24;
1140: /**
1141: * @stable ICU 2.4
1142: */
1143: public static final int THAI_ID = 25;
1144: /**
1145: * @stable ICU 2.4
1146: */
1147: public static final int LAO_ID = 26;
1148: /**
1149: * @stable ICU 2.4
1150: */
1151: public static final int TIBETAN_ID = 27;
1152: /**
1153: * @stable ICU 2.4
1154: */
1155: public static final int MYANMAR_ID = 28;
1156: /**
1157: * @stable ICU 2.4
1158: */
1159: public static final int GEORGIAN_ID = 29;
1160: /**
1161: * @stable ICU 2.4
1162: */
1163: public static final int HANGUL_JAMO_ID = 30;
1164: /**
1165: * @stable ICU 2.4
1166: */
1167: public static final int ETHIOPIC_ID = 31;
1168: /**
1169: * @stable ICU 2.4
1170: */
1171: public static final int CHEROKEE_ID = 32;
1172: /**
1173: * @stable ICU 2.4
1174: */
1175: public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID = 33;
1176: /**
1177: * @stable ICU 2.4
1178: */
1179: public static final int OGHAM_ID = 34;
1180: /**
1181: * @stable ICU 2.4
1182: */
1183: public static final int RUNIC_ID = 35;
1184: /**
1185: * @stable ICU 2.4
1186: */
1187: public static final int KHMER_ID = 36;
1188: /**
1189: * @stable ICU 2.4
1190: */
1191: public static final int MONGOLIAN_ID = 37;
1192: /**
1193: * @stable ICU 2.4
1194: */
1195: public static final int LATIN_EXTENDED_ADDITIONAL_ID = 38;
1196: /**
1197: * @stable ICU 2.4
1198: */
1199: public static final int GREEK_EXTENDED_ID = 39;
1200: /**
1201: * @stable ICU 2.4
1202: */
1203: public static final int GENERAL_PUNCTUATION_ID = 40;
1204: /**
1205: * @stable ICU 2.4
1206: */
1207: public static final int SUPERSCRIPTS_AND_SUBSCRIPTS_ID = 41;
1208: /**
1209: * @stable ICU 2.4
1210: */
1211: public static final int CURRENCY_SYMBOLS_ID = 42;
1212: /**
1213: * Unicode 3.2 renames this block to "Combining Diacritical Marks for
1214: * Symbols".
1215: * @stable ICU 2.4
1216: */
1217: public static final int COMBINING_MARKS_FOR_SYMBOLS_ID = 43;
1218: /**
1219: * @stable ICU 2.4
1220: */
1221: public static final int LETTERLIKE_SYMBOLS_ID = 44;
1222: /**
1223: * @stable ICU 2.4
1224: */
1225: public static final int NUMBER_FORMS_ID = 45;
1226: /**
1227: * @stable ICU 2.4
1228: */
1229: public static final int ARROWS_ID = 46;
1230: /**
1231: * @stable ICU 2.4
1232: */
1233: public static final int MATHEMATICAL_OPERATORS_ID = 47;
1234: /**
1235: * @stable ICU 2.4
1236: */
1237: public static final int MISCELLANEOUS_TECHNICAL_ID = 48;
1238: /**
1239: * @stable ICU 2.4
1240: */
1241: public static final int CONTROL_PICTURES_ID = 49;
1242: /**
1243: * @stable ICU 2.4
1244: */
1245: public static final int OPTICAL_CHARACTER_RECOGNITION_ID = 50;
1246: /**
1247: * @stable ICU 2.4
1248: */
1249: public static final int ENCLOSED_ALPHANUMERICS_ID = 51;
1250: /**
1251: * @stable ICU 2.4
1252: */
1253: public static final int BOX_DRAWING_ID = 52;
1254: /**
1255: * @stable ICU 2.4
1256: */
1257: public static final int BLOCK_ELEMENTS_ID = 53;
1258: /**
1259: * @stable ICU 2.4
1260: */
1261: public static final int GEOMETRIC_SHAPES_ID = 54;
1262: /**
1263: * @stable ICU 2.4
1264: */
1265: public static final int MISCELLANEOUS_SYMBOLS_ID = 55;
1266: /**
1267: * @stable ICU 2.4
1268: */
1269: public static final int DINGBATS_ID = 56;
1270: /**
1271: * @stable ICU 2.4
1272: */
1273: public static final int BRAILLE_PATTERNS_ID = 57;
1274: /**
1275: * @stable ICU 2.4
1276: */
1277: public static final int CJK_RADICALS_SUPPLEMENT_ID = 58;
1278: /**
1279: * @stable ICU 2.4
1280: */
1281: public static final int KANGXI_RADICALS_ID = 59;
1282: /**
1283: * @stable ICU 2.4
1284: */
1285: public static final int IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID = 60;
1286: /**
1287: * @stable ICU 2.4
1288: */
1289: public static final int CJK_SYMBOLS_AND_PUNCTUATION_ID = 61;
1290: /**
1291: * @stable ICU 2.4
1292: */
1293: public static final int HIRAGANA_ID = 62;
1294: /**
1295: * @stable ICU 2.4
1296: */
1297: public static final int KATAKANA_ID = 63;
1298: /**
1299: * @stable ICU 2.4
1300: */
1301: public static final int BOPOMOFO_ID = 64;
1302: /**
1303: * @stable ICU 2.4
1304: */
1305: public static final int HANGUL_COMPATIBILITY_JAMO_ID = 65;
1306: /**
1307: * @stable ICU 2.4
1308: */
1309: public static final int KANBUN_ID = 66;
1310: /**
1311: * @stable ICU 2.4
1312: */
1313: public static final int BOPOMOFO_EXTENDED_ID = 67;
1314: /**
1315: * @stable ICU 2.4
1316: */
1317: public static final int ENCLOSED_CJK_LETTERS_AND_MONTHS_ID = 68;
1318: /**
1319: * @stable ICU 2.4
1320: */
1321: public static final int CJK_COMPATIBILITY_ID = 69;
1322: /**
1323: * @stable ICU 2.4
1324: */
1325: public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID = 70;
1326: /**
1327: * @stable ICU 2.4
1328: */
1329: public static final int CJK_UNIFIED_IDEOGRAPHS_ID = 71;
1330: /**
1331: * @stable ICU 2.4
1332: */
1333: public static final int YI_SYLLABLES_ID = 72;
1334: /**
1335: * @stable ICU 2.4
1336: */
1337: public static final int YI_RADICALS_ID = 73;
1338: /**
1339: * @stable ICU 2.4
1340: */
1341: public static final int HANGUL_SYLLABLES_ID = 74;
1342: /**
1343: * @stable ICU 2.4
1344: */
1345: public static final int HIGH_SURROGATES_ID = 75;
1346: /**
1347: * @stable ICU 2.4
1348: */
1349: public static final int HIGH_PRIVATE_USE_SURROGATES_ID = 76;
1350: /**
1351: * @stable ICU 2.4
1352: */
1353: public static final int LOW_SURROGATES_ID = 77;
1354: /**
1355: * Same as public static final int PRIVATE_USE.
1356: * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1357: * and multiple code point ranges had this block.
1358: * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1359: * and adds separate blocks for the supplementary PUAs.
1360: * @stable ICU 2.4
1361: */
1362: public static final int PRIVATE_USE_AREA_ID = 78;
1363: /**
1364: * Same as public static final int PRIVATE_USE_AREA.
1365: * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1366: * and multiple code point ranges had this block.
1367: * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1368: * and adds separate blocks for the supplementary PUAs.
1369: * @stable ICU 2.4
1370: */
1371: public static final int PRIVATE_USE_ID = PRIVATE_USE_AREA_ID;
1372: /**
1373: * @stable ICU 2.4
1374: */
1375: public static final int CJK_COMPATIBILITY_IDEOGRAPHS_ID = 79;
1376: /**
1377: * @stable ICU 2.4
1378: */
1379: public static final int ALPHABETIC_PRESENTATION_FORMS_ID = 80;
1380: /**
1381: * @stable ICU 2.4
1382: */
1383: public static final int ARABIC_PRESENTATION_FORMS_A_ID = 81;
1384: /**
1385: * @stable ICU 2.4
1386: */
1387: public static final int COMBINING_HALF_MARKS_ID = 82;
1388: /**
1389: * @stable ICU 2.4
1390: */
1391: public static final int CJK_COMPATIBILITY_FORMS_ID = 83;
1392: /**
1393: * @stable ICU 2.4
1394: */
1395: public static final int SMALL_FORM_VARIANTS_ID = 84;
1396: /**
1397: * @stable ICU 2.4
1398: */
1399: public static final int ARABIC_PRESENTATION_FORMS_B_ID = 85;
1400: /**
1401: * @stable ICU 2.4
1402: */
1403: public static final int SPECIALS_ID = 86;
1404: /**
1405: * @stable ICU 2.4
1406: */
1407: public static final int HALFWIDTH_AND_FULLWIDTH_FORMS_ID = 87;
1408: /**
1409: * @stable ICU 2.4
1410: */
1411: public static final int OLD_ITALIC_ID = 88;
1412: /**
1413: * @stable ICU 2.4
1414: */
1415: public static final int GOTHIC_ID = 89;
1416: /**
1417: * @stable ICU 2.4
1418: */
1419: public static final int DESERET_ID = 90;
1420: /**
1421: * @stable ICU 2.4
1422: */
1423: public static final int BYZANTINE_MUSICAL_SYMBOLS_ID = 91;
1424: /**
1425: * @stable ICU 2.4
1426: */
1427: public static final int MUSICAL_SYMBOLS_ID = 92;
1428: /**
1429: * @stable ICU 2.4
1430: */
1431: public static final int MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID = 93;
1432: /**
1433: * @stable ICU 2.4
1434: */
1435: public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID = 94;
1436: /**
1437: * @stable ICU 2.4
1438: */
1439: public static final int CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID = 95;
1440: /**
1441: * @stable ICU 2.4
1442: */
1443: public static final int TAGS_ID = 96;
1444:
1445: // New blocks in Unicode 3.2
1446:
1447: /**
1448: * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1449: * @stable ICU 2.4
1450: */
1451: public static final int CYRILLIC_SUPPLEMENTARY_ID = 97;
1452: /**
1453: * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1454: * @stable ICU 3.0
1455: */
1456:
1457: public static final int CYRILLIC_SUPPLEMENT_ID = 97;
1458: /**
1459: * @stable ICU 2.4
1460: */
1461: public static final int TAGALOG_ID = 98;
1462: /**
1463: * @stable ICU 2.4
1464: */
1465: public static final int HANUNOO_ID = 99;
1466: /**
1467: * @stable ICU 2.4
1468: */
1469: public static final int BUHID_ID = 100;
1470: /**
1471: * @stable ICU 2.4
1472: */
1473: public static final int TAGBANWA_ID = 101;
1474: /**
1475: * @stable ICU 2.4
1476: */
1477: public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID = 102;
1478: /**
1479: * @stable ICU 2.4
1480: */
1481: public static final int SUPPLEMENTAL_ARROWS_A_ID = 103;
1482: /**
1483: * @stable ICU 2.4
1484: */
1485: public static final int SUPPLEMENTAL_ARROWS_B_ID = 104;
1486: /**
1487: * @stable ICU 2.4
1488: */
1489: public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID = 105;
1490: /**
1491: * @stable ICU 2.4
1492: */
1493: public static final int SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID = 106;
1494: /**
1495: * @stable ICU 2.4
1496: */
1497: public static final int KATAKANA_PHONETIC_EXTENSIONS_ID = 107;
1498: /**
1499: * @stable ICU 2.4
1500: */
1501: public static final int VARIATION_SELECTORS_ID = 108;
1502: /**
1503: * @stable ICU 2.4
1504: */
1505: public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID = 109;
1506: /**
1507: * @stable ICU 2.4
1508: */
1509: public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID = 110;
1510:
1511: /**
1512: * @stable ICU 2.6
1513: */
1514: public static final int LIMBU_ID = 111; /*[1900]*/
1515: /**
1516: * @stable ICU 2.6
1517: */
1518: public static final int TAI_LE_ID = 112; /*[1950]*/
1519: /**
1520: * @stable ICU 2.6
1521: */
1522: public static final int KHMER_SYMBOLS_ID = 113; /*[19E0]*/
1523: /**
1524: * @stable ICU 2.6
1525: */
1526: public static final int PHONETIC_EXTENSIONS_ID = 114; /*[1D00]*/
1527: /**
1528: * @stable ICU 2.6
1529: */
1530: public static final int MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID = 115; /*[2B00]*/
1531: /**
1532: * @stable ICU 2.6
1533: */
1534: public static final int YIJING_HEXAGRAM_SYMBOLS_ID = 116; /*[4DC0]*/
1535: /**
1536: * @stable ICU 2.6
1537: */
1538: public static final int LINEAR_B_SYLLABARY_ID = 117; /*[10000]*/
1539: /**
1540: * @stable ICU 2.6
1541: */
1542: public static final int LINEAR_B_IDEOGRAMS_ID = 118; /*[10080]*/
1543: /**
1544: * @stable ICU 2.6
1545: */
1546: public static final int AEGEAN_NUMBERS_ID = 119; /*[10100]*/
1547: /**
1548: * @stable ICU 2.6
1549: */
1550: public static final int UGARITIC_ID = 120; /*[10380]*/
1551: /**
1552: * @stable ICU 2.6
1553: */
1554: public static final int SHAVIAN_ID = 121; /*[10450]*/
1555: /**
1556: * @stable ICU 2.6
1557: */
1558: public static final int OSMANYA_ID = 122; /*[10480]*/
1559: /**
1560: * @stable ICU 2.6
1561: */
1562: public static final int CYPRIOT_SYLLABARY_ID = 123; /*[10800]*/
1563: /**
1564: * @stable ICU 2.6
1565: */
1566: public static final int TAI_XUAN_JING_SYMBOLS_ID = 124; /*[1D300]*/
1567: /**
1568: * @stable ICU 2.6
1569: */
1570: public static final int VARIATION_SELECTORS_SUPPLEMENT_ID = 125; /*[E0100]*/
1571:
1572: /* New blocks in Unicode 4.1 */
1573:
1574: /**
1575: * @draft ICU 3.4
1576: * @provisional This API might change or be removed in a future release.
1577: */
1578: public static final int ANCIENT_GREEK_MUSICAL_NOTATION_ID = 126; /*[1D200]*/
1579:
1580: /**
1581: * @draft ICU 3.4
1582: * @provisional This API might change or be removed in a future release.
1583: */
1584: public static final int ANCIENT_GREEK_NUMBERS_ID = 127; /*[10140]*/
1585:
1586: /**
1587: * @draft ICU 3.4
1588: * @provisional This API might change or be removed in a future release.
1589: */
1590: public static final int ARABIC_SUPPLEMENT_ID = 128; /*[0750]*/
1591:
1592: /**
1593: * @draft ICU 3.4
1594: * @provisional This API might change or be removed in a future release.
1595: */
1596: public static final int BUGINESE_ID = 129; /*[1A00]*/
1597:
1598: /**
1599: * @draft ICU 3.4
1600: * @provisional This API might change or be removed in a future release.
1601: */
1602: public static final int CJK_STROKES_ID = 130; /*[31C0]*/
1603:
1604: /**
1605: * @draft ICU 3.4
1606: * @provisional This API might change or be removed in a future release.
1607: */
1608: public static final int COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID = 131; /*[1DC0]*/
1609:
1610: /**
1611: * @draft ICU 3.4
1612: * @provisional This API might change or be removed in a future release.
1613: */
1614: public static final int COPTIC_ID = 132; /*[2C80]*/
1615:
1616: /**
1617: * @draft ICU 3.4
1618: * @provisional This API might change or be removed in a future release.
1619: */
1620: public static final int ETHIOPIC_EXTENDED_ID = 133; /*[2D80]*/
1621:
1622: /**
1623: * @draft ICU 3.4
1624: * @provisional This API might change or be removed in a future release.
1625: */
1626: public static final int ETHIOPIC_SUPPLEMENT_ID = 134; /*[1380]*/
1627:
1628: /**
1629: * @draft ICU 3.4
1630: * @provisional This API might change or be removed in a future release.
1631: */
1632: public static final int GEORGIAN_SUPPLEMENT_ID = 135; /*[2D00]*/
1633:
1634: /**
1635: * @draft ICU 3.4
1636: * @provisional This API might change or be removed in a future release.
1637: */
1638: public static final int GLAGOLITIC_ID = 136; /*[2C00]*/
1639:
1640: /**
1641: * @draft ICU 3.4
1642: * @provisional This API might change or be removed in a future release.
1643: */
1644: public static final int KHAROSHTHI_ID = 137; /*[10A00]*/
1645:
1646: /**
1647: * @draft ICU 3.4
1648: * @provisional This API might change or be removed in a future release.
1649: */
1650: public static final int MODIFIER_TONE_LETTERS_ID = 138; /*[A700]*/
1651:
1652: /**
1653: * @draft ICU 3.4
1654: * @provisional This API might change or be removed in a future release.
1655: */
1656: public static final int NEW_TAI_LUE_ID = 139; /*[1980]*/
1657:
1658: /**
1659: * @draft ICU 3.4
1660: * @provisional This API might change or be removed in a future release.
1661: */
1662: public static final int OLD_PERSIAN_ID = 140; /*[103A0]*/
1663:
1664: /**
1665: * @draft ICU 3.4
1666: * @provisional This API might change or be removed in a future release.
1667: */
1668: public static final int PHONETIC_EXTENSIONS_SUPPLEMENT_ID = 141; /*[1D80]*/
1669:
1670: /**
1671: * @draft ICU 3.4
1672: * @provisional This API might change or be removed in a future release.
1673: */
1674: public static final int SUPPLEMENTAL_PUNCTUATION_ID = 142; /*[2E00]*/
1675:
1676: /**
1677: * @draft ICU 3.4
1678: * @provisional This API might change or be removed in a future release.
1679: */
1680: public static final int SYLOTI_NAGRI_ID = 143; /*[A800]*/
1681:
1682: /**
1683: * @draft ICU 3.4
1684: * @provisional This API might change or be removed in a future release.
1685: */
1686: public static final int TIFINAGH_ID = 144; /*[2D30]*/
1687:
1688: /**
1689: * @draft ICU 3.4
1690: * @provisional This API might change or be removed in a future release.
1691: */
1692: public static final int VERTICAL_FORMS_ID = 145; /*[FE10]*/
1693:
1694: /* New blocks in Unicode 5.0 */
1695:
1696: /**
1697: * @draft ICU 3.6
1698: * @provisional This API might change or be removed in a future release.
1699: */
1700: public static final int NKO_ID = 146; /*[07C0]*/
1701: /**
1702: * @draft ICU 3.6
1703: * @provisional This API might change or be removed in a future release.
1704: */
1705: public static final int BALINESE_ID = 147; /*[1B00]*/
1706: /**
1707: * @draft ICU 3.6
1708: * @provisional This API might change or be removed in a future release.
1709: */
1710: public static final int LATIN_EXTENDED_C_ID = 148; /*[2C60]*/
1711: /**
1712: * @draft ICU 3.6
1713: * @provisional This API might change or be removed in a future release.
1714: */
1715: public static final int LATIN_EXTENDED_D_ID = 149; /*[A720]*/
1716: /**
1717: * @draft ICU 3.6
1718: * @provisional This API might change or be removed in a future release.
1719: */
1720: public static final int PHAGS_PA_ID = 150; /*[A840]*/
1721: /**
1722: * @draft ICU 3.6
1723: * @provisional This API might change or be removed in a future release.
1724: */
1725: public static final int PHOENICIAN_ID = 151; /*[10900]*/
1726: /**
1727: * @draft ICU 3.6
1728: * @provisional This API might change or be removed in a future release.
1729: */
1730: public static final int CUNEIFORM_ID = 152; /*[12000]*/
1731: /**
1732: * @draft ICU 3.6
1733: * @provisional This API might change or be removed in a future release.
1734: */
1735: public static final int CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID = 153; /*[12400]*/
1736: /**
1737: * @draft ICU 3.6
1738: * @provisional This API might change or be removed in a future release.
1739: */
1740: public static final int COUNTING_ROD_NUMERALS_ID = 154; /*[1D360]*/
1741:
1742: /**
1743: * @stable ICU 2.4
1744: */
1745: public static final int COUNT = 155;
1746:
1747: // public methods --------------------------------------------------
1748:
1749: /**
1750: * Gets the only instance of the UnicodeBlock with the argument ID.
1751: * If no such ID exists, a INVALID_CODE UnicodeBlock will be returned.
1752: * @param id UnicodeBlock ID
1753: * @return the only instance of the UnicodeBlock with the argument ID
1754: * if it exists, otherwise a INVALID_CODE UnicodeBlock will be
1755: * returned.
1756: * @stable ICU 2.4
1757: */
1758: public static UnicodeBlock getInstance(int id) {
1759: if (id >= 0 && id < BLOCKS_.length) {
1760: return BLOCKS_[id];
1761: }
1762: return INVALID_CODE;
1763: }
1764:
1765: /**
1766: * Returns the Unicode allocation block that contains the code point,
1767: * or null if the code point is not a member of a defined block.
1768: * @param ch code point to be tested
1769: * @return the Unicode allocation block that contains the code point
1770: * @stable ICU 2.4
1771: */
1772: public static UnicodeBlock of(int ch) {
1773: if (ch > MAX_VALUE) {
1774: return INVALID_CODE;
1775: }
1776:
1777: return UnicodeBlock.getInstance((PROPERTY_.getAdditional(
1778: ch, 0) & BLOCK_MASK_) >> BLOCK_SHIFT_);
1779: }
1780:
1781: /**
1782: * Internal function returning of(ch).getID().
1783: *
1784: * @param ch
1785: * @return numeric block value
1786: * @internal
1787: */
1788: static int idOf(int ch) {
1789: if (ch < 0 || ch > MAX_VALUE) {
1790: return -1;
1791: }
1792:
1793: return (PROPERTY_.getAdditional(ch, 0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
1794: }
1795:
1796: /**
1797: * Cover the JDK 1.5 API. Return the Unicode block with the
1798: * given name. <br/><b>Note</b>: Unlike JDK 1.5, this only matches
1799: * against the official UCD name and the Java block name
1800: * (ignoring case).
1801: * @param blockName the name of the block to match
1802: * @return the UnicodeBlock with that name
1803: * @throws IllegalArgumentException if the blockName could not be matched
1804: * @stable ICU 3.0
1805: */
1806: public static final UnicodeBlock forName(String blockName) {
1807: Map m = null;
1808: if (mref != null) {
1809: m = (Map) mref.get();
1810: }
1811: if (m == null) {
1812: m = new HashMap(BLOCKS_.length);
1813: for (int i = 0; i < BLOCKS_.length; ++i) {
1814: UnicodeBlock b = BLOCKS_[i];
1815: String name = getPropertyValueName(UProperty.BLOCK,
1816: b.getID(), UProperty.NameChoice.LONG);
1817: m.put(name.toUpperCase(), b);
1818: m.put(name.replace('_', ' ').toUpperCase(), b);
1819: m.put(b.toString().toUpperCase(), b);
1820: }
1821: mref = new SoftReference(m);
1822: }
1823: UnicodeBlock b = (UnicodeBlock) m.get(blockName
1824: .toUpperCase());
1825: if (b == null) {
1826: throw new IllegalArgumentException();
1827: }
1828: return b;
1829: }
1830:
1831: private static SoftReference mref;
1832:
1833: /**
1834: * Returns the type ID of this Unicode block
1835: * @return integer type ID of this Unicode block
1836: * @stable ICU 2.4
1837: */
1838: public int getID() {
1839: return m_id_;
1840: }
1841:
1842: // private data members ---------------------------------------------
1843:
1844: /**
1845: * Array of UnicodeBlocks, for easy access in getInstance(int)
1846: */
1847: private final static UnicodeBlock BLOCKS_[] = { NO_BLOCK,
1848: BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A,
1849: LATIN_EXTENDED_B, IPA_EXTENSIONS,
1850: SPACING_MODIFIER_LETTERS, COMBINING_DIACRITICAL_MARKS,
1851: GREEK, CYRILLIC, ARMENIAN, HEBREW, ARABIC, SYRIAC,
1852: THAANA, DEVANAGARI, BENGALI, GURMUKHI, GUJARATI, ORIYA,
1853: TAMIL, TELUGU, KANNADA, MALAYALAM, SINHALA, THAI, LAO,
1854: TIBETAN, MYANMAR, GEORGIAN, HANGUL_JAMO, ETHIOPIC,
1855: CHEROKEE, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, OGHAM,
1856: RUNIC, KHMER, MONGOLIAN, LATIN_EXTENDED_ADDITIONAL,
1857: GREEK_EXTENDED, GENERAL_PUNCTUATION,
1858: SUPERSCRIPTS_AND_SUBSCRIPTS, CURRENCY_SYMBOLS,
1859: COMBINING_MARKS_FOR_SYMBOLS, LETTERLIKE_SYMBOLS,
1860: NUMBER_FORMS, ARROWS, MATHEMATICAL_OPERATORS,
1861: MISCELLANEOUS_TECHNICAL, CONTROL_PICTURES,
1862: OPTICAL_CHARACTER_RECOGNITION, ENCLOSED_ALPHANUMERICS,
1863: BOX_DRAWING, BLOCK_ELEMENTS, GEOMETRIC_SHAPES,
1864: MISCELLANEOUS_SYMBOLS, DINGBATS, BRAILLE_PATTERNS,
1865: CJK_RADICALS_SUPPLEMENT, KANGXI_RADICALS,
1866: IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
1867: CJK_SYMBOLS_AND_PUNCTUATION, HIRAGANA, KATAKANA,
1868: BOPOMOFO, HANGUL_COMPATIBILITY_JAMO, KANBUN,
1869: BOPOMOFO_EXTENDED, ENCLOSED_CJK_LETTERS_AND_MONTHS,
1870: CJK_COMPATIBILITY, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
1871: CJK_UNIFIED_IDEOGRAPHS, YI_SYLLABLES, YI_RADICALS,
1872: HANGUL_SYLLABLES, HIGH_SURROGATES,
1873: HIGH_PRIVATE_USE_SURROGATES, LOW_SURROGATES,
1874: PRIVATE_USE_AREA, CJK_COMPATIBILITY_IDEOGRAPHS,
1875: ALPHABETIC_PRESENTATION_FORMS,
1876: ARABIC_PRESENTATION_FORMS_A, COMBINING_HALF_MARKS,
1877: CJK_COMPATIBILITY_FORMS, SMALL_FORM_VARIANTS,
1878: ARABIC_PRESENTATION_FORMS_B, SPECIALS,
1879: HALFWIDTH_AND_FULLWIDTH_FORMS, OLD_ITALIC, GOTHIC,
1880: DESERET, BYZANTINE_MUSICAL_SYMBOLS, MUSICAL_SYMBOLS,
1881: MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
1882: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
1883: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, TAGS,
1884: CYRILLIC_SUPPLEMENT, TAGALOG, HANUNOO, BUHID, TAGBANWA,
1885: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
1886: SUPPLEMENTAL_ARROWS_A, SUPPLEMENTAL_ARROWS_B,
1887: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
1888: SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
1889: KATAKANA_PHONETIC_EXTENSIONS, VARIATION_SELECTORS,
1890: SUPPLEMENTARY_PRIVATE_USE_AREA_A,
1891: SUPPLEMENTARY_PRIVATE_USE_AREA_B, LIMBU, TAI_LE,
1892: KHMER_SYMBOLS, PHONETIC_EXTENSIONS,
1893: MISCELLANEOUS_SYMBOLS_AND_ARROWS,
1894: YIJING_HEXAGRAM_SYMBOLS, LINEAR_B_SYLLABARY,
1895: LINEAR_B_IDEOGRAMS, AEGEAN_NUMBERS, UGARITIC, SHAVIAN,
1896: OSMANYA, CYPRIOT_SYLLABARY, TAI_XUAN_JING_SYMBOLS,
1897: VARIATION_SELECTORS_SUPPLEMENT,
1898:
1899: /* New blocks in Unicode 4.1 */
1900: ANCIENT_GREEK_MUSICAL_NOTATION, ANCIENT_GREEK_NUMBERS,
1901: ARABIC_SUPPLEMENT, BUGINESE, CJK_STROKES,
1902: COMBINING_DIACRITICAL_MARKS_SUPPLEMENT, COPTIC,
1903: ETHIOPIC_EXTENDED, ETHIOPIC_SUPPLEMENT,
1904: GEORGIAN_SUPPLEMENT, GLAGOLITIC, KHAROSHTHI,
1905: MODIFIER_TONE_LETTERS, NEW_TAI_LUE, OLD_PERSIAN,
1906: PHONETIC_EXTENSIONS_SUPPLEMENT,
1907: SUPPLEMENTAL_PUNCTUATION, SYLOTI_NAGRI, TIFINAGH,
1908: VERTICAL_FORMS, NKO, BALINESE, LATIN_EXTENDED_C,
1909: LATIN_EXTENDED_D, PHAGS_PA, PHOENICIAN, CUNEIFORM,
1910: CUNEIFORM_NUMBERS_AND_PUNCTUATION,
1911: COUNTING_ROD_NUMERALS };
1912:
1913: static {
1914: if (COUNT != BLOCKS_.length) {
1915: throw new java.lang.IllegalStateException(
1916: "UnicodeBlock fields are inconsistent!");
1917: }
1918: }
1919: /**
1920: * Identification code for this UnicodeBlock
1921: */
1922: private int m_id_;
1923:
1924: // private constructor ----------------------------------------------
1925:
1926: /**
1927: * UnicodeBlock constructor
1928: * @param name name of this UnicodeBlock
1929: * @param id unique id of this UnicodeBlock
1930: * @exception NullPointerException if name is <code>null</code>
1931: */
1932: private UnicodeBlock(String name, int id) {
1933: super (name);
1934: m_id_ = id;
1935: }
1936: }
1937:
/**
 * East Asian Width constants.
 * The values are consecutive integers starting at 0; {@link #COUNT} is
 * the number of values defined here.
 * @see UProperty#EAST_ASIAN_WIDTH
 * @see UCharacter#getIntPropertyValue
 * @stable ICU 2.4
 */
public interface EastAsianWidth {
    // Interface fields are implicitly public, static and final.

    /**
     * East Asian Width value NEUTRAL.
     * @stable ICU 2.4
     */
    int NEUTRAL = 0;

    /**
     * East Asian Width value AMBIGUOUS.
     * @stable ICU 2.4
     */
    int AMBIGUOUS = 1;

    /**
     * East Asian Width value HALFWIDTH.
     * @stable ICU 2.4
     */
    int HALFWIDTH = 2;

    /**
     * East Asian Width value FULLWIDTH.
     * @stable ICU 2.4
     */
    int FULLWIDTH = 3;

    /**
     * East Asian Width value NARROW.
     * @stable ICU 2.4
     */
    int NARROW = 4;

    /**
     * East Asian Width value WIDE.
     * @stable ICU 2.4
     */
    int WIDE = 5;

    /**
     * Number of East Asian Width values (one more than the highest value).
     * @stable ICU 2.4
     */
    int COUNT = 6;
}
1974:
/**
 * Decomposition Type constants.
 * The values are consecutive integers starting at 0; {@link #COUNT} is
 * the number of values defined here.
 * @see UProperty#DECOMPOSITION_TYPE
 * @stable ICU 2.4
 */
public interface DecompositionType {
    // Interface fields are implicitly public, static and final.

    /**
     * Decomposition type value NONE.
     * @stable ICU 2.4
     */
    int NONE = 0;

    /**
     * Decomposition type value CANONICAL.
     * @stable ICU 2.4
     */
    int CANONICAL = 1;

    /**
     * Decomposition type value COMPAT.
     * @stable ICU 2.4
     */
    int COMPAT = 2;

    /**
     * Decomposition type value CIRCLE.
     * @stable ICU 2.4
     */
    int CIRCLE = 3;

    /**
     * Decomposition type value FINAL.
     * @stable ICU 2.4
     */
    int FINAL = 4;

    /**
     * Decomposition type value FONT.
     * @stable ICU 2.4
     */
    int FONT = 5;

    /**
     * Decomposition type value FRACTION.
     * @stable ICU 2.4
     */
    int FRACTION = 6;

    /**
     * Decomposition type value INITIAL.
     * @stable ICU 2.4
     */
    int INITIAL = 7;

    /**
     * Decomposition type value ISOLATED.
     * @stable ICU 2.4
     */
    int ISOLATED = 8;

    /**
     * Decomposition type value MEDIAL.
     * @stable ICU 2.4
     */
    int MEDIAL = 9;

    /**
     * Decomposition type value NARROW.
     * @stable ICU 2.4
     */
    int NARROW = 10;

    /**
     * Decomposition type value NOBREAK.
     * @stable ICU 2.4
     */
    int NOBREAK = 11;

    /**
     * Decomposition type value SMALL.
     * @stable ICU 2.4
     */
    int SMALL = 12;

    /**
     * Decomposition type value SQUARE.
     * @stable ICU 2.4
     */
    int SQUARE = 13;

    /**
     * Decomposition type value SUB.
     * @stable ICU 2.4
     */
    int SUB = 14;

    /**
     * Decomposition type value SUPER.
     * @stable ICU 2.4
     */
    int SUPER = 15;

    /**
     * Decomposition type value VERTICAL.
     * @stable ICU 2.4
     */
    int VERTICAL = 16;

    /**
     * Decomposition type value WIDE.
     * @stable ICU 2.4
     */
    int WIDE = 17;

    /**
     * Number of decomposition type values (one more than the highest value).
     * @stable ICU 2.4
     */
    int COUNT = 18;
}
2058:
2059: /**
2060: * Joining Type constants.
2061: * @see UProperty#JOINING_TYPE
2062: * @stable ICU 2.4
2063: */
2064: public static interface JoiningType {
2065: /**
2066: * @stable ICU 2.4
2067: */
2068: public static final int NON_JOINING = 0;
2069: /**
2070: * @stable ICU 2.4
2071: */
2072: public static final int JOIN_CAUSING = 1;
2073: /**
2074: * @stable ICU 2.4
2075: */
2076: public static final int DUAL_JOINING = 2;
2077: /**
2078: * @stable ICU 2.4
2079: */
2080: public static final int LEFT_JOINING = 3;
2081: /**
2082: * @stable ICU 2.4
2083: */
2084: public static final int RIGHT_JOINING = 4;
2085: /**
2086: * @stable ICU 2.4
2087: */
2088: public static final int TRANSPARENT = 5;
2089: /**
2090: * @stable ICU 2.4
2091: */
2092: public static final int COUNT = 6;
2093: }
2094:
2095: /**
2096: * Joining Group constants.
2097: * @see UProperty#JOINING_GROUP
2098: * @stable ICU 2.4
2099: */
2100: public static interface JoiningGroup {
2101: /**
2102: * @stable ICU 2.4
2103: */
2104: public static final int NO_JOINING_GROUP = 0;
2105: /**
2106: * @stable ICU 2.4
2107: */
2108: public static final int AIN = 1;
2109: /**
2110: * @stable ICU 2.4
2111: */
2112: public static final int ALAPH = 2;
2113: /**
2114: * @stable ICU 2.4
2115: */
2116: public static final int ALEF = 3;
2117: /**
2118: * @stable ICU 2.4
2119: */
2120: public static final int BEH = 4;
2121: /**
2122: * @stable ICU 2.4
2123: */
2124: public static final int BETH = 5;
2125: /**
2126: * @stable ICU 2.4
2127: */
2128: public static final int DAL = 6;
2129: /**
2130: * @stable ICU 2.4
2131: */
2132: public static final int DALATH_RISH = 7;
2133: /**
2134: * @stable ICU 2.4
2135: */
2136: public static final int E = 8;
2137: /**
2138: * @stable ICU 2.4
2139: */
2140: public static final int FEH = 9;
2141: /**
2142: * @stable ICU 2.4
2143: */
2144: public static final int FINAL_SEMKATH = 10;
2145: /**
2146: * @stable ICU 2.4
2147: */
2148: public static final int GAF = 11;
2149: /**
2150: * @stable ICU 2.4
2151: */
2152: public static final int GAMAL = 12;
2153: /**
2154: * @stable ICU 2.4
2155: */
2156: public static final int HAH = 13;
2157: /**
2158: * @stable ICU 2.4
2159: */
2160: public static final int HAMZA_ON_HEH_GOAL = 14;
2161: /**
2162: * @stable ICU 2.4
2163: */
2164: public static final int HE = 15;
2165: /**
2166: * @stable ICU 2.4
2167: */
2168: public static final int HEH = 16;
2169: /**
2170: * @stable ICU 2.4
2171: */
2172: public static final int HEH_GOAL = 17;
2173: /**
2174: * @stable ICU 2.4
2175: */
2176: public static final int HETH = 18;
2177: /**
2178: * @stable ICU 2.4
2179: */
2180: public static final int KAF = 19;
2181: /**
2182: * @stable ICU 2.4
2183: */
2184: public static final int KAPH = 20;
2185: /**
2186: * @stable ICU 2.4
2187: */
2188: public static final int KNOTTED_HEH = 21;
2189: /**
2190: * @stable ICU 2.4
2191: */
2192: public static final int LAM = 22;
2193: /**
2194: * @stable ICU 2.4
2195: */
2196: public static final int LAMADH = 23;
2197: /**
2198: * @stable ICU 2.4
2199: */
2200: public static final int MEEM = 24;
2201: /**
2202: * @stable ICU 2.4
2203: */
2204: public static final int MIM = 25;
2205: /**
2206: * @stable ICU 2.4
2207: */
2208: public static final int NOON = 26;
2209: /**
2210: * @stable ICU 2.4
2211: */
2212: public static final int NUN = 27;
2213: /**
2214: * @stable ICU 2.4
2215: */
2216: public static final int PE = 28;
2217: /**
2218: * @stable ICU 2.4
2219: */
2220: public static final int QAF = 29;
2221: /**
2222: * @stable ICU 2.4
2223: */
2224: public static final int QAPH = 30;
2225: /**
2226: * @stable ICU 2.4
2227: */
2228: public static final int REH = 31;
2229: /**
2230: * @stable ICU 2.4
2231: */
2232: public static final int REVERSED_PE = 32;
2233: /**
2234: * @stable ICU 2.4
2235: */
2236: public static final int SAD = 33;
2237: /**
2238: * @stable ICU 2.4
2239: */
2240: public static final int SADHE = 34;
2241: /**
2242: * @stable ICU 2.4
2243: */
2244: public static final int SEEN = 35;
2245: /**
2246: * @stable ICU 2.4
2247: */
2248: public static final int SEMKATH = 36;
2249: /**
2250: * @stable ICU 2.4
2251: */
2252: public static final int SHIN = 37;
2253: /**
2254: * @stable ICU 2.4
2255: */
2256: public static final int SWASH_KAF = 38;
2257: /**
2258: * @stable ICU 2.4
2259: */
2260: public static final int SYRIAC_WAW = 39;
2261: /**
2262: * @stable ICU 2.4
2263: */
2264: public static final int TAH = 40;
2265: /**
2266: * @stable ICU 2.4
2267: */
2268: public static final int TAW = 41;
2269: /**
2270: * @stable ICU 2.4
2271: */
2272: public static final int TEH_MARBUTA = 42;
2273: /**
2274: * @stable ICU 2.4
2275: */
2276: public static final int TETH = 43;
2277: /**
2278: * @stable ICU 2.4
2279: */
2280: public static final int WAW = 44;
2281: /**
2282: * @stable ICU 2.4
2283: */
2284: public static final int YEH = 45;
2285: /**
2286: * @stable ICU 2.4
2287: */
2288: public static final int YEH_BARREE = 46;
2289: /**
2290: * @stable ICU 2.4
2291: */
2292: public static final int YEH_WITH_TAIL = 47;
2293: /**
2294: * @stable ICU 2.4
2295: */
2296: public static final int YUDH = 48;
2297: /**
2298: * @stable ICU 2.4
2299: */
2300: public static final int YUDH_HE = 49;
2301: /**
2302: * @stable ICU 2.4
2303: */
2304: public static final int ZAIN = 50;
2305: /**
2306: * @stable ICU 2.6
2307: */
2308: public static final int FE = 51;
2309: /**
2310: * @stable ICU 2.6
2311: */
2312: public static final int KHAPH = 52;
2313: /**
2314: * @stable ICU 2.6
2315: */
2316: public static final int ZHAIN = 53;
2317: /**
2318: * @stable ICU 2.4
2319: */
2320: public static final int COUNT = 54;
2321: }
2322:
2323: /**
2324: * Grapheme Cluster Break constants.
2325: * @see UProperty#GRAPHEME_CLUSTER_BREAK
2326: * @draft ICU 3.4
2327: * @provisional This API might change or be removed in a future release.
2328: */
2329: public static interface GraphemeClusterBreak {
2330: /**
2331: * @draft ICU 3.4
2332: * @provisional This API might change or be removed in a future release.
2333: */
2334: public static final int OTHER = 0;
2335: /**
2336: * @draft ICU 3.4
2337: * @provisional This API might change or be removed in a future release.
2338: */
2339: public static final int CONTROL = 1;
2340: /**
2341: * @draft ICU 3.4
2342: * @provisional This API might change or be removed in a future release.
2343: */
2344: public static final int CR = 2;
2345: /**
2346: * @draft ICU 3.4
2347: * @provisional This API might change or be removed in a future release.
2348: */
2349: public static final int EXTEND = 3;
2350: /**
2351: * @draft ICU 3.4
2352: * @provisional This API might change or be removed in a future release.
2353: */
2354: public static final int L = 4;
2355: /**
2356: * @draft ICU 3.4
2357: * @provisional This API might change or be removed in a future release.
2358: */
2359: public static final int LF = 5;
2360: /**
2361: * @draft ICU 3.4
2362: * @provisional This API might change or be removed in a future release.
2363: */
2364: public static final int LV = 6;
2365: /**
2366: * @draft ICU 3.4
2367: * @provisional This API might change or be removed in a future release.
2368: */
2369: public static final int LVT = 7;
2370: /**
2371: * @draft ICU 3.4
2372: * @provisional This API might change or be removed in a future release.
2373: */
2374: public static final int T = 8;
2375: /**
2376: * @draft ICU 3.4
2377: * @provisional This API might change or be removed in a future release.
2378: */
2379: public static final int V = 9;
2380: /**
2381: * @draft ICU 3.4
2382: * @provisional This API might change or be removed in a future release.
2383: */
2384: public static final int COUNT = 10;
2385: }
2386:
2387: /**
2388: * Word Break constants.
2389: * @see UProperty#WORD_BREAK
2390: * @draft ICU 3.4
2391: * @provisional This API might change or be removed in a future release.
2392: */
2393: public static interface WordBreak {
2394: /**
2395: * @draft ICU 3.4
2396: * @provisional This API might change or be removed in a future release.
2397: */
2398: public static final int OTHER = 0;
2399: /**
2400: * @draft ICU 3.4
2401: * @provisional This API might change or be removed in a future release.
2402: */
2403: public static final int ALETTER = 1;
2404: /**
2405: * @draft ICU 3.4
2406: * @provisional This API might change or be removed in a future release.
2407: */
2408: public static final int FORMAT = 2;
2409: /**
2410: * @draft ICU 3.4
2411: * @provisional This API might change or be removed in a future release.
2412: */
2413: public static final int KATAKANA = 3;
2414: /**
2415: * @draft ICU 3.4
2416: * @provisional This API might change or be removed in a future release.
2417: */
2418: public static final int MIDLETTER = 4;
2419: /**
2420: * @draft ICU 3.4
2421: * @provisional This API might change or be removed in a future release.
2422: */
2423: public static final int MIDNUM = 5;
2424: /**
2425: * @draft ICU 3.4
2426: * @provisional This API might change or be removed in a future release.
2427: */
2428: public static final int NUMERIC = 6;
2429: /**
2430: * @draft ICU 3.4
2431: * @provisional This API might change or be removed in a future release.
2432: */
2433: public static final int EXTENDNUMLET = 7;
2434: /**
2435: * @draft ICU 3.4
2436: * @provisional This API might change or be removed in a future release.
2437: */
2438: public static final int COUNT = 8;
2439: }
2440:
2441: /**
2442: * Sentence Break constants.
2443: * @see UProperty#SENTENCE_BREAK
2444: * @draft ICU 3.4
2445: * @provisional This API might change or be removed in a future release.
2446: */
2447: public static interface SentenceBreak {
2448: /**
2449: * @draft ICU 3.4
2450: * @provisional This API might change or be removed in a future release.
2451: */
2452: public static final int OTHER = 0;
2453: /**
2454: * @draft ICU 3.4
2455: * @provisional This API might change or be removed in a future release.
2456: */
2457: public static final int ATERM = 1;
2458: /**
2459: * @draft ICU 3.4
2460: * @provisional This API might change or be removed in a future release.
2461: */
2462: public static final int CLOSE = 2;
2463: /**
2464: * @draft ICU 3.4
2465: * @provisional This API might change or be removed in a future release.
2466: */
2467: public static final int FORMAT = 3;
2468: /**
2469: * @draft ICU 3.4
2470: * @provisional This API might change or be removed in a future release.
2471: */
2472: public static final int LOWER = 4;
2473: /**
2474: * @draft ICU 3.4
2475: * @provisional This API might change or be removed in a future release.
2476: */
2477: public static final int NUMERIC = 5;
2478: /**
2479: * @draft ICU 3.4
2480: * @provisional This API might change or be removed in a future release.
2481: */
2482: public static final int OLETTER = 6;
2483: /**
2484: * @draft ICU 3.4
2485: * @provisional This API might change or be removed in a future release.
2486: */
2487: public static final int SEP = 7;
2488: /**
2489: * @draft ICU 3.4
2490: * @provisional This API might change or be removed in a future release.
2491: */
2492: public static final int SP = 8;
2493: /**
2494: * @draft ICU 3.4
2495: * @provisional This API might change or be removed in a future release.
2496: */
2497: public static final int STERM = 9;
2498: /**
2499: * @draft ICU 3.4
2500: * @provisional This API might change or be removed in a future release.
2501: */
2502: public static final int UPPER = 10;
2503: /**
2504: * @draft ICU 3.4
2505: * @provisional This API might change or be removed in a future release.
2506: */
2507: public static final int COUNT = 11;
2508: }
2509:
2510: /**
2511: * Line Break constants.
2512: * @see UProperty#LINE_BREAK
2513: * @stable ICU 2.4
2514: */
2515: public static interface LineBreak {
2516: /**
2517: * @stable ICU 2.4
2518: */
2519: public static final int UNKNOWN = 0;
2520: /**
2521: * @stable ICU 2.4
2522: */
2523: public static final int AMBIGUOUS = 1;
2524: /**
2525: * @stable ICU 2.4
2526: */
2527: public static final int ALPHABETIC = 2;
2528: /**
2529: * @stable ICU 2.4
2530: */
2531: public static final int BREAK_BOTH = 3;
2532: /**
2533: * @stable ICU 2.4
2534: */
2535: public static final int BREAK_AFTER = 4;
2536: /**
2537: * @stable ICU 2.4
2538: */
2539: public static final int BREAK_BEFORE = 5;
2540: /**
2541: * @stable ICU 2.4
2542: */
2543: public static final int MANDATORY_BREAK = 6;
2544: /**
2545: * @stable ICU 2.4
2546: */
2547: public static final int CONTINGENT_BREAK = 7;
2548: /**
2549: * @stable ICU 2.4
2550: */
2551: public static final int CLOSE_PUNCTUATION = 8;
2552: /**
2553: * @stable ICU 2.4
2554: */
2555: public static final int COMBINING_MARK = 9;
2556: /**
2557: * @stable ICU 2.4
2558: */
2559: public static final int CARRIAGE_RETURN = 10;
2560: /**
2561: * @stable ICU 2.4
2562: */
2563: public static final int EXCLAMATION = 11;
2564: /**
2565: * @stable ICU 2.4
2566: */
2567: public static final int GLUE = 12;
2568: /**
2569: * @stable ICU 2.4
2570: */
2571: public static final int HYPHEN = 13;
2572: /**
2573: * @stable ICU 2.4
2574: */
2575: public static final int IDEOGRAPHIC = 14;
2576: /**
2577: * @see #INSEPARABLE
2578: * @stable ICU 2.4
2579: */
2580: public static final int INSEPERABLE = 15;
2581: /**
2582: * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
2583: * @stable ICU 3.0
2584: */
2585: public static final int INSEPARABLE = 15;
2586: /**
2587: * @stable ICU 2.4
2588: */
2589: public static final int INFIX_NUMERIC = 16;
2590: /**
2591: * @stable ICU 2.4
2592: */
2593: public static final int LINE_FEED = 17;
2594: /**
2595: * @stable ICU 2.4
2596: */
2597: public static final int NONSTARTER = 18;
2598: /**
2599: * @stable ICU 2.4
2600: */
2601: public static final int NUMERIC = 19;
2602: /**
2603: * @stable ICU 2.4
2604: */
2605: public static final int OPEN_PUNCTUATION = 20;
2606: /**
2607: * @stable ICU 2.4
2608: */
2609: public static final int POSTFIX_NUMERIC = 21;
2610: /**
2611: * @stable ICU 2.4
2612: */
2613: public static final int PREFIX_NUMERIC = 22;
2614: /**
2615: * @stable ICU 2.4
2616: */
2617: public static final int QUOTATION = 23;
2618: /**
2619: * @stable ICU 2.4
2620: */
2621: public static final int COMPLEX_CONTEXT = 24;
2622: /**
2623: * @stable ICU 2.4
2624: */
2625: public static final int SURROGATE = 25;
2626: /**
2627: * @stable ICU 2.4
2628: */
2629: public static final int SPACE = 26;
2630: /**
2631: * @stable ICU 2.4
2632: */
2633: public static final int BREAK_SYMBOLS = 27;
2634: /**
2635: * @stable ICU 2.4
2636: */
2637: public static final int ZWSPACE = 28;
2638:
2639: /**
2640: * @stable ICU 2.6
2641: */
2642: public static final int NEXT_LINE = 29; /*[NL]*//* from here on: new in Unicode 4/ICU 2.6 */
2643:
2644: /**
2645: * @stable ICU 2.6
2646: */
2647: public static final int WORD_JOINER = 30; /*[WJ]*/
2648:
2649: /* from here on: new in Unicode 4.1/ICU 3.4 */
2650:
2651: /**
2652: * @draft ICU 3.4
2653: * @provisional This API might change or be removed in a future release.
2654: */
2655: public static final int H2 = 31;
2656: /**
2657: * @draft ICU 3.4
2658: * @provisional This API might change or be removed in a future release.
2659: */
2660: public static final int H3 = 32;
2661: /**
2662: * @draft ICU 3.4
2663: * @provisional This API might change or be removed in a future release.
2664: */
2665: public static final int JL = 33;
2666: /**
2667: * @draft ICU 3.4
2668: * @provisional This API might change or be removed in a future release.
2669: */
2670: public static final int JT = 34;
2671: /**
2672: * @draft ICU 3.4
2673: * @provisional This API might change or be removed in a future release.
2674: */
2675: public static final int JV = 35;
2676:
2677: /**
2678: * @stable ICU 2.4
2679: */
2680: public static final int COUNT = 36;
2681: }
2682:
2683: /**
2684: * Numeric Type constants.
2685: * @see UProperty#NUMERIC_TYPE
2686: * @stable ICU 2.4
2687: */
2688: public static interface NumericType {
2689: /**
2690: * @stable ICU 2.4
2691: */
2692: public static final int NONE = 0;
2693: /**
2694: * @stable ICU 2.4
2695: */
2696: public static final int DECIMAL = 1;
2697: /**
2698: * @stable ICU 2.4
2699: */
2700: public static final int DIGIT = 2;
2701: /**
2702: * @stable ICU 2.4
2703: */
2704: public static final int NUMERIC = 3;
2705: /**
2706: * @stable ICU 2.4
2707: */
2708: public static final int COUNT = 4;
2709: }
2710:
2711: /**
2712: * Hangul Syllable Type constants.
2713: *
2714: * @see UProperty#HANGUL_SYLLABLE_TYPE
2715: * @stable ICU 2.6
2716: */
2717: public static interface HangulSyllableType {
2718: /**
2719: * @stable ICU 2.6
2720: */
2721: public static final int NOT_APPLICABLE = 0; /*[NA]*//*See note !!*/
2722: /**
2723: * @stable ICU 2.6
2724: */
2725: public static final int LEADING_JAMO = 1; /*[L]*/
2726: /**
2727: * @stable ICU 2.6
2728: */
2729: public static final int VOWEL_JAMO = 2; /*[V]*/
2730: /**
2731: * @stable ICU 2.6
2732: */
2733: public static final int TRAILING_JAMO = 3; /*[T]*/
2734: /**
2735: * @stable ICU 2.6
2736: */
2737: public static final int LV_SYLLABLE = 4; /*[LV]*/
2738: /**
2739: * @stable ICU 2.6
2740: */
2741: public static final int LVT_SYLLABLE = 5; /*[LVT]*/
2742: /**
2743: * @stable ICU 2.6
2744: */
2745: public static final int COUNT = 6;
2746: }
2747:
2748: // public data members -----------------------------------------------
2749:
/**
 * The lowest Unicode code point value.
 * Defined as <code>UTF16.CODEPOINT_MIN_VALUE</code>.
 * @stable ICU 2.1
 */
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;

/**
 * The highest Unicode code point value (scalar value) according to the
 * Unicode Standard.
 * This is a 21-bit value (21 bits, rounded up).<br>
 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
 * @stable ICU 2.1
 */
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;

/**
 * The minimum value for Supplementary code points.
 * Defined as <code>UTF16.SUPPLEMENTARY_MIN_VALUE</code>.
 * @stable ICU 2.1
 */
public static final int SUPPLEMENTARY_MIN_VALUE = UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Unicode value used when translating into Unicode encoding form and there
 * is no existing character (U+FFFD).
 * @stable ICU 2.1
 */
public static final int REPLACEMENT_CHAR = '\uFFFD';

/**
 * Special value that is returned by getUnicodeNumericValue(int) when no
 * numeric value is defined for a code point.
 * @stable ICU 2.4
 * @see #getUnicodeNumericValue
 */
public static final double NO_NUMERIC_VALUE = -123456789;

/**
 * Compatibility constant for Java Character's MIN_RADIX.
 * @draft ICU 3.4
 * @provisional This API might change or be removed in a future release.
 */
public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;

/**
 * Compatibility constant for Java Character's MAX_RADIX.
 * @draft ICU 3.4
 * @provisional This API might change or be removed in a future release.
 */
public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
2799:
2800: // public methods ----------------------------------------------------
2801:
2802: /**
2803: * Retrieves the numeric value of a decimal digit code point.
2804: * <br>This method observes the semantics of
2805: * <code>java.lang.Character.digit()</code>. Note that this
2806: * will return positive values for code points for which isDigit
2807: * returns false, just like java.lang.Character.
2808: * <br><em>Semantic Change:</em> In release 1.3.1 and
2809: * prior, this did not treat the European letters as having a
2810: * digit value, and also treated numeric letters and other numbers as
2811: * digits.
2812: * This has been changed to conform to the java semantics.
2813: * <br>A code point is a valid digit if and only if:
2814: * <ul>
2815: * <li>ch is a decimal digit or one of the european letters, and
2816: * <li>the value of ch is less than the specified radix.
2817: * </ul>
2818: * @param ch the code point to query
2819: * @param radix the radix
2820: * @return the numeric value represented by the code point in the
2821: * specified radix, or -1 if the code point is not a decimal digit
2822: * or if its value is too large for the radix
2823: * @stable ICU 2.1
2824: */
2825: public static int digit(int ch, int radix) {
2826: // when ch is out of bounds getProperty == 0
2827: int props = getProperty(ch);
2828: int value;
2829: if (getNumericType(props) == NumericType.DECIMAL) {
2830: value = UCharacterProperty.getUnsignedValue(props);
2831: } else {
2832: value = getEuropeanDigit(ch);
2833: }
2834: return (0 <= value && value < radix) ? value : -1;
2835: }
2836:
2837: /**
2838: * Retrieves the numeric value of a decimal digit code point.
2839: * <br>This is a convenience overload of <code>digit(int, int)</code>
2840: * that provides a decimal radix.
2841: * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
2842: * treated numeric letters and other numbers as digits. This has
2843: * been changed to conform to the java semantics.
2844: * @param ch the code point to query
2845: * @return the numeric value represented by the code point,
2846: * or -1 if the code point is not a decimal digit or if its
2847: * value is too large for a decimal radix
2848: * @stable ICU 2.1
2849: */
2850: public static int digit(int ch) {
2851: int props = getProperty(ch);
2852: if (getNumericType(props) == NumericType.DECIMAL) {
2853: return UCharacterProperty.getUnsignedValue(props);
2854: } else {
2855: return -1;
2856: }
2857: }
2858:
2859: /**
2860: * Returns the numeric value of the code point as a nonnegative
2861: * integer.
2862: * <br>If the code point does not have a numeric value, then -1 is returned.
2863: * <br>
2864: * If the code point has a numeric value that cannot be represented as a
2865: * nonnegative integer (for example, a fractional value), then -2 is
2866: * returned.
2867: * @param ch the code point to query
2868: * @return the numeric value of the code point, or -1 if it has no numeric
2869: * value, or -2 if it has a numeric value that cannot be represented as a
2870: * nonnegative integer
2871: * @stable ICU 2.1
2872: */
2873: public static int getNumericValue(int ch) {
2874: // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
2875: int props = PROPERTY_.getProperty(ch);
2876: int numericType = getNumericType(props);
2877:
2878: if (numericType == 0) {
2879: return getEuropeanDigit(ch);
2880: }
2881: if (numericType == UCharacterProperty.NT_FRACTION
2882: || numericType >= UCharacterProperty.NT_COUNT) {
2883: return -2;
2884: }
2885:
2886: int numericValue = UCharacterProperty.getUnsignedValue(props);
2887:
2888: if (numericType < NumericType.COUNT) {
2889: /* normal type, the value is stored directly */
2890: return numericValue;
2891: } else /* numericType==NT_LARGE */{
2892: /* large value with exponent */
2893: long numValue;
2894: int mant, exp;
2895:
2896: mant = numericValue >> LARGE_MANT_SHIFT;
2897: exp = numericValue & LARGE_EXP_MASK;
2898: if (mant == 0) {
2899: mant = 1;
2900: exp += LARGE_EXP_OFFSET_EXTRA;
2901: } else if (mant > 9) {
2902: return -2; /* reserved mantissa value */
2903: } else {
2904: exp += LARGE_EXP_OFFSET;
2905: }
2906: if (exp > 9) {
2907: return -2;
2908: }
2909:
2910: numValue = mant;
2911:
2912: /* multiply by 10^exp without math.h */
2913: while (exp >= 4) {
2914: numValue *= 10000.;
2915: exp -= 4;
2916: }
2917: switch (exp) {
2918: case 3:
2919: numValue *= 1000.;
2920: break;
2921: case 2:
2922: numValue *= 100.;
2923: break;
2924: case 1:
2925: numValue *= 10.;
2926: break;
2927: case 0:
2928: default:
2929: break;
2930: }
2931: if (numValue <= Integer.MAX_VALUE) {
2932: return (int) numValue;
2933: } else {
2934: return -2;
2935: }
2936: }
2937: }
2938:
2939: /**
2940: * <p>Get the numeric value for a Unicode code point as defined in the
2941: * Unicode Character Database.</p>
2942: * <p>A "double" return type is necessary because some numeric values are
2943: * fractions, negative, or too large for int.</p>
2944: * <p>For characters without any numeric values in the Unicode Character
2945: * Database, this function will return NO_NUMERIC_VALUE.</p>
2946: * <p><em>API Change:</em> In release 2.2 and prior, this API has a
2947: * return type int and returns -1 when the argument ch does not have a
2948: * corresponding numeric value. This has been changed to synch with ICU4C
2949: * </p>
2950: * This corresponds to the ICU4C function u_getNumericValue.
2951: * @param ch Code point to get the numeric value for.
2952: * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
2953: * @stable ICU 2.4
2954: */
2955: public static double getUnicodeNumericValue(int ch) {
2956: // equivalent to c version double u_getNumericValue(UChar32 c)
2957: int props = PROPERTY_.getProperty(ch);
2958: int numericType = getNumericType(props);
2959:
2960: if (numericType == 0
2961: || numericType >= UCharacterProperty.NT_COUNT) {
2962: return NO_NUMERIC_VALUE;
2963: }
2964:
2965: int numericValue = UCharacterProperty.getUnsignedValue(props);
2966:
2967: if (numericType < NumericType.COUNT) {
2968: /* normal type, the value is stored directly */
2969: return numericValue;
2970: } else if (numericType == UCharacterProperty.NT_FRACTION) {
2971: /* fraction value */
2972: int numerator, denominator;
2973:
2974: numerator = numericValue >> FRACTION_NUM_SHIFT;
2975: denominator = (numericValue & FRACTION_DEN_MASK)
2976: + FRACTION_DEN_OFFSET;
2977:
2978: if (numerator == 0) {
2979: numerator = -1;
2980: }
2981: return (double) numerator / (double) denominator;
2982: } else /* numericType==NT_LARGE */{
2983: /* large value with exponent */
2984: double numValue;
2985: int mant, exp;
2986:
2987: mant = numericValue >> LARGE_MANT_SHIFT;
2988: exp = numericValue & LARGE_EXP_MASK;
2989: if (mant == 0) {
2990: mant = 1;
2991: exp += LARGE_EXP_OFFSET_EXTRA;
2992: } else if (mant > 9) {
2993: return NO_NUMERIC_VALUE; /* reserved mantissa value */
2994: } else {
2995: exp += LARGE_EXP_OFFSET;
2996: }
2997:
2998: numValue = mant;
2999:
3000: /* multiply by 10^exp without math.h */
3001: while (exp >= 4) {
3002: numValue *= 10000.;
3003: exp -= 4;
3004: }
3005: switch (exp) {
3006: case 3:
3007: numValue *= 1000.;
3008: break;
3009: case 2:
3010: numValue *= 100.;
3011: break;
3012: case 1:
3013: numValue *= 10.;
3014: break;
3015: case 0:
3016: default:
3017: break;
3018: }
3019:
3020: return numValue;
3021: }
3022: }
3023:
3024: /**
3025: * Compatibility override of Java deprecated method. This
3026: * method will always remain deprecated. Delegates to
3027: * java.lang.Character.isSpace.
3028: * @param ch the code point
3029: * @return true if the code point is a space character as
3030: * defined by java.lang.Character.isSpace.
3031: * @deprecated ICU 3.4 (Java)
3032: */
3033: public static boolean isSpace(int ch) {
3034: return ch <= 0x20
3035: && (ch == 0x20 || ch == 0x09 || ch == 0x0a
3036: || ch == 0x0c || ch == 0x0d);
3037: }
3038:
3039: /**
3040: * Returns a value indicating a code point's Unicode category.
3041: * Up-to-date Unicode implementation of java.lang.Character.getType()
3042: * except for the above mentioned code points that had their category
3043: * changed.<br>
3044: * Return results are constants from the interface
3045: * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
3046: * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
3047: * those returned by java.lang.Character.getType. UCharacterCategory values
3048: * match the ones used in ICU4C, while java.lang.Character type
3049: * values, though similar, skip the value 17.</p>
3050: * @param ch code point whose type is to be determined
3051: * @return category which is a value of UCharacterCategory
3052: * @stable ICU 2.1
3053: */
3054: public static int getType(int ch) {
3055: return getProperty(ch) & UCharacterProperty.TYPE_MASK;
3056: }
3057:
3058: /**
3059: * Determines if a code point has a defined meaning in the up-to-date
3060: * Unicode standard.
3061: * E.g. supplementary code points though allocated space are not defined in
3062: * Unicode yet.<br>
3063: * Up-to-date Unicode implementation of java.lang.Character.isDefined()
3064: * @param ch code point to be determined if it is defined in the most
3065: * current version of Unicode
3066: * @return true if this code point is defined in unicode
3067: * @stable ICU 2.1
3068: */
3069: public static boolean isDefined(int ch) {
3070: return getType(ch) != 0;
3071: }
3072:
3073: /**
3074: * Determines if a code point is a Java digit.
3075: * <br>This method observes the semantics of
3076: * <code>java.lang.Character.isDigit()</code>. It returns true for decimal
3077: * digits only.
3078: * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this treated
3079: * numeric letters and other numbers as digits.
3080: * This has been changed to conform to the java semantics.
3081: * @param ch code point to query
3082: * @return true if this code point is a digit
3083: * @stable ICU 2.1
3084: */
3085: public static boolean isDigit(int ch) {
3086: return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
3087: }
3088:
3089: /**
3090: * Determines if the specified code point is an ISO control character.
3091: * A code point is considered to be an ISO control character if it is in
3092: * the range \u0000 through \u001F or in the range \u007F through
3093: * \u009F.<br>
3094: * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
3095: * @param ch code point to determine if it is an ISO control character
3096: * @return true if code point is a ISO control character
3097: * @stable ICU 2.1
3098: */
3099: public static boolean isISOControl(int ch) {
3100: return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_
3101: && ((ch <= UNIT_SEPARATOR_) || (ch >= DELETE_));
3102: }
3103:
3104: /**
3105: * Determines if the specified code point is a letter.
3106: * Up-to-date Unicode implementation of java.lang.Character.isLetter()
3107: * @param ch code point to determine if it is a letter
3108: * @return true if code point is a letter
3109: * @stable ICU 2.1
3110: */
3111: public static boolean isLetter(int ch) {
3112: // if props == 0, it will just fall through and return false
3113: return ((1 << getType(ch)) & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3114: | (1 << UCharacterCategory.LOWERCASE_LETTER)
3115: | (1 << UCharacterCategory.TITLECASE_LETTER)
3116: | (1 << UCharacterCategory.MODIFIER_LETTER) | (1 << UCharacterCategory.OTHER_LETTER))) != 0;
3117: }
3118:
3119: /**
3120: * Determines if the specified code point is a letter or digit.
3121: * Note this method, unlike java.lang.Character does not regard the ascii
3122: * characters 'A' - 'Z' and 'a' - 'z' as digits.
3123: * @param ch code point to determine if it is a letter or a digit
3124: * @return true if code point is a letter or a digit
3125: * @stable ICU 2.1
3126: */
3127: public static boolean isLetterOrDigit(int ch) {
3128: return ((1 << getType(ch)) & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3129: | (1 << UCharacterCategory.LOWERCASE_LETTER)
3130: | (1 << UCharacterCategory.TITLECASE_LETTER)
3131: | (1 << UCharacterCategory.MODIFIER_LETTER)
3132: | (1 << UCharacterCategory.OTHER_LETTER) | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER))) != 0;
3133: }
3134:
3135: /**
3136: * Compatibility override of Java deprecated method. This
3137: * method will always remain deprecated. Delegates to
3138: * java.lang.Character.isJavaIdentifierStart.
3139: * @param cp the code point
3140: * @return true if the code point can start a java identifier.
3141: * @deprecated ICU 3.4 (Java)
3142: */
3143: public static boolean isJavaLetter(int cp) {
3144: return isJavaIdentifierStart(cp);
3145: }
3146:
3147: /**
3148: * Compatibility override of Java deprecated method. This
3149: * method will always remain deprecated. Delegates to
3150: * java.lang.Character.isJavaIdentifierPart.
3151: * @param cp the code point
3152: * @return true if the code point can continue a java identifier.
3153: * @deprecated ICU 3.4 (Java)
3154: */
3155: public static boolean isJavaLetterOrDigit(int cp) {
3156: return isJavaIdentifierPart(cp);
3157: }
3158:
3159: /**
3160: * Compatibility override of Java method, delegates to
3161: * java.lang.Character.isJavaIdentifierStart.
3162: * @param cp the code point
3163: * @return true if the code point can start a java identifier.
3164: * @draft ICU 3.4
3165: * @provisional This API might change or be removed in a future release.
3166: */
3167: public static boolean isJavaIdentifierStart(int cp) {
3168: // note, downcast to char for jdk 1.4 compatibility
3169: return java.lang.Character.isJavaIdentifierStart((char) cp);
3170: }
3171:
3172: /**
3173: * Compatibility override of Java method, delegates to
3174: * java.lang.Character.isJavaIdentifierPart.
3175: * @param cp the code point
3176: * @return true if the code point can continue a java identifier.
3177: * @draft ICU 3.4
3178: * @provisional This API might change or be removed in a future release.
3179: */
3180: public static boolean isJavaIdentifierPart(int cp) {
3181: // note, downcast to char for jdk 1.4 compatibility
3182: return java.lang.Character.isJavaIdentifierPart((char) cp);
3183: }
3184:
3185: /**
3186: * Determines if the specified code point is a lowercase character.
3187: * UnicodeData only contains case mappings for code points where they are
3188: * one-to-one mappings; it also omits information about context-sensitive
3189: * case mappings.<br> For more information about Unicode case mapping
3190: * please refer to the
3191: * <a href=http://www.unicode.org/unicode/reports/tr21/>Technical report
3192: * #21</a>.<br>
3193: * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
3194: * @param ch code point to determine if it is in lowercase
3195: * @return true if code point is a lowercase character
3196: * @stable ICU 2.1
3197: */
3198: public static boolean isLowerCase(int ch) {
3199: // if props == 0, it will just fall through and return false
3200: return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
3201: }
3202:
3203: /**
3204: * Determines if the specified code point is a white space character.
* A code point is considered to be a whitespace character if and only
3206: * if it satisfies one of the following criteria:
3207: * <ul>
3208: * <li> It is a Unicode space separator (category "Zs"), but is not
3209: * a no-break space (\u00A0 or \u202F or \uFEFF).
3210: * <li> It is a Unicode line separator (category "Zl").
3211: * <li> It is a Unicode paragraph separator (category "Zp").
3212: * <li> It is \u0009, HORIZONTAL TABULATION.
3213: * <li> It is \u000A, LINE FEED.
3214: * <li> It is \u000B, VERTICAL TABULATION.
3215: * <li> It is \u000C, FORM FEED.
3216: * <li> It is \u000D, CARRIAGE RETURN.
3217: * <li> It is \u001C, FILE SEPARATOR.
3218: * <li> It is \u001D, GROUP SEPARATOR.
3219: * <li> It is \u001E, RECORD SEPARATOR.
3220: * <li> It is \u001F, UNIT SEPARATOR.
3221: * </ul>
3222: *
3223: * This API tries to synch to the semantics of the Java API,
3224: * java.lang.Character.isWhitespace().
3225: * @param ch code point to determine if it is a white space
3226: * @return true if the specified code point is a white space character
3227: * @stable ICU 2.1
3228: */
3229: public static boolean isWhitespace(int ch) {
3230: // exclude no-break spaces
3231: // if props == 0, it will just fall through and return false
3232: return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
3233: | (1 << UCharacterCategory.LINE_SEPARATOR) | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0
3234: && (ch != NO_BREAK_SPACE_)
3235: && (ch != NARROW_NO_BREAK_SPACE_)
3236: && (ch != ZERO_WIDTH_NO_BREAK_SPACE_)
3237: // TAB VT LF FF CR FS GS RS US NL are all control characters
3238: // that are white spaces.
3239: || (ch >= 0x9 && ch <= 0xd)
3240: || (ch >= 0x1c && ch <= 0x1f);
3241: }
3242:
3243: /**
3244: * Determines if the specified code point is a Unicode specified space
3245: * character, i.e. if code point is in the category Zs, Zl and Zp.
3246: * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
3247: * @param ch code point to determine if it is a space
3248: * @return true if the specified code point is a space character
3249: * @stable ICU 2.1
3250: */
3251: public static boolean isSpaceChar(int ch) {
3252: // if props == 0, it will just fall through and return false
3253: return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
3254: | (1 << UCharacterCategory.LINE_SEPARATOR) | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0;
3255: }
3256:
3257: /**
3258: * Determines if the specified code point is a titlecase character.
3259: * UnicodeData only contains case mappings for code points where they are
3260: * one-to-one mappings; it also omits information about context-sensitive
3261: * case mappings.<br>
3262: * For more information about Unicode case mapping please refer to the
3263: * <a href=http://www.unicode.org/unicode/reports/tr21/>
3264: * Technical report #21</a>.<br>
3265: * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
3266: * @param ch code point to determine if it is in title case
3267: * @return true if the specified code point is a titlecase character
3268: * @stable ICU 2.1
3269: */
3270: public static boolean isTitleCase(int ch) {
3271: // if props == 0, it will just fall through and return false
3272: return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
3273: }
3274:
3275: /**
3276: * Determines if the specified code point may be any part of a Unicode
3277: * identifier other than the starting character.
3278: * A code point may be part of a Unicode identifier if and only if it is
3279: * one of the following:
3280: * <ul>
3281: * <li> Lu Uppercase letter
3282: * <li> Ll Lowercase letter
3283: * <li> Lt Titlecase letter
3284: * <li> Lm Modifier letter
3285: * <li> Lo Other letter
3286: * <li> Nl Letter number
3287: * <li> Pc Connecting punctuation character
3288: * <li> Nd decimal number
3289: * <li> Mc Spacing combining mark
3290: * <li> Mn Non-spacing mark
3291: * <li> Cf formatting code
3292: * </ul>
3293: * Up-to-date Unicode implementation of
3294: * java.lang.Character.isUnicodeIdentifierPart().<br>
3295: * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
* @param ch code point to determine if it can be part of a Unicode
* identifier
3298: * @return true if code point is any character belonging a unicode
3299: * identifier suffix after the first character
3300: * @stable ICU 2.1
3301: */
3302: public static boolean isUnicodeIdentifierPart(int ch) {
3303: // if props == 0, it will just fall through and return false
3304: // cat == format
3305: return ((1 << getType(ch)) & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3306: | (1 << UCharacterCategory.LOWERCASE_LETTER)
3307: | (1 << UCharacterCategory.TITLECASE_LETTER)
3308: | (1 << UCharacterCategory.MODIFIER_LETTER)
3309: | (1 << UCharacterCategory.OTHER_LETTER)
3310: | (1 << UCharacterCategory.LETTER_NUMBER)
3311: | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
3312: | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
3313: | (1 << UCharacterCategory.COMBINING_SPACING_MARK) | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
3314: || isIdentifierIgnorable(ch);
3315: }
3316:
3317: /**
3318: * Determines if the specified code point is permissible as the first
3319: * character in a Unicode identifier.
3320: * A code point may start a Unicode identifier if it is of type either
3321: * <ul>
3322: * <li> Lu Uppercase letter
3323: * <li> Ll Lowercase letter
3324: * <li> Lt Titlecase letter
3325: * <li> Lm Modifier letter
3326: * <li> Lo Other letter
3327: * <li> Nl Letter number
3328: * </ul>
3329: * Up-to-date Unicode implementation of
3330: * java.lang.Character.isUnicodeIdentifierStart().<br>
3331: * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3332: * @param ch code point to determine if it can start a Unicode identifier
3333: * @return true if code point is the first character belonging a unicode
3334: * identifier
3335: * @stable ICU 2.1
3336: */
3337: public static boolean isUnicodeIdentifierStart(int ch) {
3338: /*int cat = getType(ch);*/
3339: // if props == 0, it will just fall through and return false
3340: return ((1 << getType(ch)) & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3341: | (1 << UCharacterCategory.LOWERCASE_LETTER)
3342: | (1 << UCharacterCategory.TITLECASE_LETTER)
3343: | (1 << UCharacterCategory.MODIFIER_LETTER)
3344: | (1 << UCharacterCategory.OTHER_LETTER) | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
3345: }
3346:
3347: /**
3348: * Determines if the specified code point should be regarded as an
3349: * ignorable character in a Unicode identifier.
3350: * A character is ignorable in the Unicode standard if it is of the type
3351: * Cf, Formatting code.<br>
3352: * Up-to-date Unicode implementation of
3353: * java.lang.Character.isIdentifierIgnorable().<br>
3354: * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3355: * @param ch code point to be determined if it can be ignored in a Unicode
3356: * identifier.
3357: * @return true if the code point is ignorable
3358: * @stable ICU 2.1
3359: */
3360: public static boolean isIdentifierIgnorable(int ch) {
3361: // see java.lang.Character.isIdentifierIgnorable() on range of
3362: // ignorable characters.
3363: if (ch <= 0x9f) {
3364: return isISOControl(ch)
3365: && !((ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f));
3366: }
3367: return getType(ch) == UCharacterCategory.FORMAT;
3368: }
3369:
3370: /**
3371: * Determines if the specified code point is an uppercase character.
3372: * UnicodeData only contains case mappings for code point where they are
3373: * one-to-one mappings; it also omits information about context-sensitive
3374: * case mappings.<br>
3375: * For language specific case conversion behavior, use
3376: * toUpperCase(locale, str). <br>
3377: * For example, the case conversion for dot-less i and dotted I in Turkish,
3378: * or for final sigma in Greek.
3379: * For more information about Unicode case mapping please refer to the
3380: * <a href=http://www.unicode.org/unicode/reports/tr21/>
3381: * Technical report #21</a>.<br>
3382: * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
3383: * @param ch code point to determine if it is in uppercase
3384: * @return true if the code point is an uppercase character
3385: * @stable ICU 2.1
3386: */
3387: public static boolean isUpperCase(int ch) {
3388: // if props == 0, it will just fall through and return false
3389: return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
3390: }
3391:
3392: /**
3393: * The given code point is mapped to its lowercase equivalent; if the code
3394: * point has no lowercase equivalent, the code point itself is returned.
3395: * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
3396: *
3397: * <p>This function only returns the simple, single-code point case mapping.
3398: * Full case mappings should be used whenever possible because they produce
3399: * better results by working on whole strings.
3400: * They take into account the string context and the language and can map
3401: * to a result string with a different length as appropriate.
3402: * Full case mappings are applied by the case mapping functions
3403: * that take String parameters rather than code points (int).
3404: * See also the User Guide chapter on C/POSIX migration:
3405: * http://icu.sourceforge.net/userguide/posix.html#case_mappings
3406: *
3407: * @param ch code point whose lowercase equivalent is to be retrieved
3408: * @return the lowercase equivalent code point
3409: * @stable ICU 2.1
3410: */
3411: public static int toLowerCase(int ch) {
3412: return gCsp.tolower(ch);
3413: }
3414:
3415: /**
3416: * Converts argument code point and returns a String object representing
3417: * the code point's value in UTF16 format.
3418: * The result is a string whose length is 1 for non-supplementary code
3419: * points, 2 otherwise.<br>
* com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
3421: * function.<br>
3422: * Up-to-date Unicode implementation of java.lang.Character.toString()
3423: * @param ch code point
* @return string representation of the code point, null if the code point is
* outside the range MIN_VALUE to MAX_VALUE
3426: * @stable ICU 2.1
3427: */
3428: public static String toString(int ch) {
3429: if (ch < MIN_VALUE || ch > MAX_VALUE) {
3430: return null;
3431: }
3432:
3433: if (ch < SUPPLEMENTARY_MIN_VALUE) {
3434: return String.valueOf((char) ch);
3435: }
3436:
3437: StringBuffer result = new StringBuffer();
3438: result.append(UTF16.getLeadSurrogate(ch));
3439: result.append(UTF16.getTrailSurrogate(ch));
3440: return result.toString();
3441: }
3442:
3443: /**
3444: * Converts the code point argument to titlecase.
3445: * If no titlecase is available, the uppercase is returned. If no uppercase
3446: * is available, the code point itself is returned.
3447: * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
3448: *
3449: * <p>This function only returns the simple, single-code point case mapping.
3450: * Full case mappings should be used whenever possible because they produce
3451: * better results by working on whole strings.
3452: * They take into account the string context and the language and can map
3453: * to a result string with a different length as appropriate.
3454: * Full case mappings are applied by the case mapping functions
3455: * that take String parameters rather than code points (int).
3456: * See also the User Guide chapter on C/POSIX migration:
3457: * http://icu.sourceforge.net/userguide/posix.html#case_mappings
3458: *
3459: * @param ch code point whose title case is to be retrieved
3460: * @return titlecase code point
3461: * @stable ICU 2.1
3462: */
3463: public static int toTitleCase(int ch) {
3464: return gCsp.totitle(ch);
3465: }
3466:
3467: /**
3468: * Converts the character argument to uppercase.
3469: * If no uppercase is available, the character itself is returned.
3470: * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
3471: *
3472: * <p>This function only returns the simple, single-code point case mapping.
3473: * Full case mappings should be used whenever possible because they produce
3474: * better results by working on whole strings.
3475: * They take into account the string context and the language and can map
3476: * to a result string with a different length as appropriate.
3477: * Full case mappings are applied by the case mapping functions
3478: * that take String parameters rather than code points (int).
3479: * See also the User Guide chapter on C/POSIX migration:
3480: * http://icu.sourceforge.net/userguide/posix.html#case_mappings
3481: *
3482: * @param ch code point whose uppercase is to be retrieved
3483: * @return uppercase code point
3484: * @stable ICU 2.1
3485: */
3486: public static int toUpperCase(int ch) {
3487: return gCsp.toupper(ch);
3488: }
3489:
3490: // extra methods not in java.lang.Character --------------------------
3491:
3492: /**
3493: * Determines if the code point is a supplementary character.
* A code point is a supplementary character if and only if it is greater
* than or equal to <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
* and not greater than MAX_VALUE.
3496: * @param ch code point to be determined if it is in the supplementary
3497: * plane
3498: * @return true if code point is a supplementary character
3499: * @stable ICU 2.1
3500: */
3501: public static boolean isSupplementary(int ch) {
3502: return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE
3503: && ch <= UCharacter.MAX_VALUE;
3504: }
3505:
3506: /**
3507: * Determines if the code point is in the BMP plane.
3508: * @param ch code point to be determined if it is not a supplementary
3509: * character
3510: * @return true if code point is not a supplementary character
3511: * @stable ICU 2.1
3512: */
3513: public static boolean isBMP(int ch) {
3514: return (ch >= 0 && ch <= LAST_CHAR_MASK_);
3515: }
3516:
3517: /**
3518: * Determines whether the specified code point is a printable character
3519: * according to the Unicode standard.
3520: * @param ch code point to be determined if it is printable
3521: * @return true if the code point is a printable character
3522: * @stable ICU 2.1
3523: */
3524: public static boolean isPrintable(int ch) {
3525: int cat = getType(ch);
3526: // if props == 0, it will just fall through and return false
3527: return (cat != UCharacterCategory.UNASSIGNED
3528: && cat != UCharacterCategory.CONTROL
3529: && cat != UCharacterCategory.FORMAT
3530: && cat != UCharacterCategory.PRIVATE_USE
3531: && cat != UCharacterCategory.SURROGATE && cat != UCharacterCategory.GENERAL_OTHER_TYPES);
3532: }
3533:
3534: /**
3535: * Determines whether the specified code point is of base form.
3536: * A code point of base form does not graphically combine with preceding
3537: * characters, and is neither a control nor a format character.
3538: * @param ch code point to be determined if it is of base form
3539: * @return true if the code point is of base form
3540: * @stable ICU 2.1
3541: */
3542: public static boolean isBaseForm(int ch) {
3543: int cat = getType(ch);
3544: // if props == 0, it will just fall through and return false
3545: return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER
3546: || cat == UCharacterCategory.OTHER_NUMBER
3547: || cat == UCharacterCategory.LETTER_NUMBER
3548: || cat == UCharacterCategory.UPPERCASE_LETTER
3549: || cat == UCharacterCategory.LOWERCASE_LETTER
3550: || cat == UCharacterCategory.TITLECASE_LETTER
3551: || cat == UCharacterCategory.MODIFIER_LETTER
3552: || cat == UCharacterCategory.OTHER_LETTER
3553: || cat == UCharacterCategory.NON_SPACING_MARK
3554: || cat == UCharacterCategory.ENCLOSING_MARK
3555: || cat == UCharacterCategory.COMBINING_SPACING_MARK;
3556: }
3557:
3558: /**
3559: * Returns the Bidirection property of a code point.
3560: * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
3561: * property.<br>
3562: * Result returned belongs to the interface
3563: * <a href=UCharacterDirection.html>UCharacterDirection</a>
3564: * @param ch the code point to be determined its direction
3565: * @return direction constant from UCharacterDirection.
3566: * @stable ICU 2.1
3567: */
3568: public static int getDirection(int ch) {
3569: return gBdp.getClass(ch);
3570: }
3571:
3572: /**
3573: * Determines whether the code point has the "mirrored" property.
3574: * This property is set for characters that are commonly used in
3575: * Right-To-Left contexts and need to be displayed with a "mirrored"
3576: * glyph.
3577: * @param ch code point whose mirror is to be determined
3578: * @return true if the code point has the "mirrored" property
3579: * @stable ICU 2.1
3580: */
3581: public static boolean isMirrored(int ch) {
3582: return gBdp.isMirrored(ch);
3583: }
3584:
3585: /**
3586: * Maps the specified code point to a "mirror-image" code point.
3587: * For code points with the "mirrored" property, implementations sometimes
3588: * need a "poor man's" mapping to another code point such that the default
3589: * glyph may serve as the mirror-image of the default glyph of the
3590: * specified code point.<br>
3591: * This is useful for text conversion to and from codepages with visual
3592: * order, and for displays without glyph selection capabilities.
3593: * @param ch code point whose mirror is to be retrieved
3594: * @return another code point that may serve as a mirror-image substitute,
3595: * or ch itself if there is no such mapping or ch does not have the
3596: * "mirrored" property
3597: * @stable ICU 2.1
3598: */
3599: public static int getMirror(int ch) {
3600: return gBdp.getMirror(ch);
3601: }
3602:
3603: /**
3604: * Gets the combining class of the argument codepoint
3605: * @param ch code point whose combining is to be retrieved
3606: * @return the combining class of the codepoint
3607: * @stable ICU 2.1
3608: */
3609: public static int getCombiningClass(int ch) {
3610: if (ch < MIN_VALUE || ch > MAX_VALUE) {
3611: throw new IllegalArgumentException(
3612: "Codepoint out of bounds");
3613: }
3614: return NormalizerImpl.getCombiningClass(ch);
3615: }
3616:
3617: /**
3618: * A code point is illegal if and only if
3619: * <ul>
3620: * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3621: * <li> A surrogate value, 0xD800 to 0xDFFF
3622: * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3623: * </ul>
3624: * Note: legal does not mean that it is assigned in this version of Unicode.
3625: * @param ch code point to determine if it is a legal code point by itself
3626: * @return true if and only if legal.
3627: * @stable ICU 2.1
3628: */
3629: public static boolean isLegal(int ch) {
3630: if (ch < MIN_VALUE) {
3631: return false;
3632: }
3633: if (ch < UTF16.SURROGATE_MIN_VALUE) {
3634: return true;
3635: }
3636: if (ch <= UTF16.SURROGATE_MAX_VALUE) {
3637: return false;
3638: }
3639: if (UCharacterUtility.isNonCharacter(ch)) {
3640: return false;
3641: }
3642: return (ch <= MAX_VALUE);
3643: }
3644:
3645: /**
3646: * A string is legal iff all its code points are legal.
3647: * A code point is illegal if and only if
3648: * <ul>
3649: * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3650: * <li> A surrogate value, 0xD800 to 0xDFFF
3651: * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3652: * </ul>
3653: * Note: legal does not mean that it is assigned in this version of Unicode.
* @param str containing code points to examine
3655: * @return true if and only if legal.
3656: * @stable ICU 2.1
3657: */
3658: public static boolean isLegal(String str) {
3659: int size = str.length();
3660: int codepoint;
3661: for (int i = 0; i < size; i++) {
3662: codepoint = UTF16.charAt(str, i);
3663: if (!isLegal(codepoint)) {
3664: return false;
3665: }
3666: if (isSupplementary(codepoint)) {
3667: i++;
3668: }
3669: }
3670: return true;
3671: }
3672:
3673: /**
3674: * Gets the version of Unicode data used.
3675: * @return the unicode version number used
3676: * @stable ICU 2.1
3677: */
3678: public static VersionInfo getUnicodeVersion() {
3679: return PROPERTY_.m_unicodeVersion_;
3680: }
3681:
3682: /**
3683: * Retrieve the most current Unicode name of the argument code point, or
3684: * null if the character is unassigned or outside the range
3685: * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
3686: * <br>
3687: * Note calling any methods related to code point names, e.g. get*Name*()
3688: * incurs a one-time initialisation cost to construct the name tables.
3689: * @param ch the code point for which to get the name
3690: * @return most current Unicode name
3691: * @stable ICU 2.1
3692: */
3693: public static String getName(int ch) {
3694: if (NAME_ == null) {
3695: throw new MissingResourceException(
3696: "Could not load unames.icu", "", "");
3697: }
3698: return NAME_
3699: .getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
3700: }
3701:
3702: /**
3703: * Gets the names for each of the characters in a string
3704: * @param s string to format
3705: * @param separator string to go between names
3706: * @return string of names
3707: * @internal
3708: * @deprecated This API is ICU internal only.
3709: */
3710: public static String getName(String s, String separator) {
3711: if (s.length() == 1) { // handle common case
3712: return getName(s.charAt(0));
3713: }
3714: int cp;
3715: StringBuffer sb = new StringBuffer();
3716: for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
3717: cp = UTF16.charAt(s, i);
3718: if (i != 0)
3719: sb.append(separator);
3720: sb.append(UCharacter.getName(cp));
3721: }
3722: return sb.toString();
3723: }
3724:
3725: /**
3726: * Retrieve the earlier version 1.0 Unicode name of the argument code
3727: * point, or null if the character is unassigned or outside the range
3728: * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
3729: * <br>
3730: * Note calling any methods related to code point names, e.g. get*Name*()
3731: * incurs a one-time initialisation cost to construct the name tables.
3732: * @param ch the code point for which to get the name
3733: * @return version 1.0 Unicode name
3734: * @stable ICU 2.1
3735: */
3736: public static String getName1_0(int ch) {
3737: if (NAME_ == null) {
3738: throw new MissingResourceException(
3739: "Could not load unames.icu", "", "");
3740: }
3741: return NAME_.getName(ch,
3742: UCharacterNameChoice.UNICODE_10_CHAR_NAME);
3743: }
3744:
3745: /**
3746: * <p>Retrieves a name for a valid codepoint. Unlike, getName(int) and
3747: * getName1_0(int), this method will return a name even for codepoints that
3748: * are not assigned a name in UnicodeData.txt.
3749: * </p>
3750: * The names are returned in the following order.
3751: * <ul>
3752: * <li> Most current Unicode name if there is any
3753: * <li> Unicode 1.0 name if there is any
3754: * <li> Extended name in the form of
3755: * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-fffe>
3756: * </ul>
3757: * Note calling any methods related to code point names, e.g. get*Name*()
3758: * incurs a one-time initialisation cost to construct the name tables.
3759: * @param ch the code point for which to get the name
3760: * @return a name for the argument codepoint
3761: * @stable ICU 2.6
3762: */
3763: public static String getExtendedName(int ch) {
3764: if (NAME_ == null) {
3765: throw new MissingResourceException(
3766: "Could not load unames.icu", "", "");
3767: }
3768: return NAME_.getName(ch,
3769: UCharacterNameChoice.EXTENDED_CHAR_NAME);
3770: }
3771:
3772: /**
3773: * Get the ISO 10646 comment for a character.
3774: * The ISO 10646 comment is an informative field in the Unicode Character
3775: * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list.
3776: * @param ch The code point for which to get the ISO comment.
3777: * It must be <code>0<=c<=0x10ffff</code>.
3778: * @return The ISO comment, or null if there is no comment for this
3779: * character.
3780: * @stable ICU 2.4
3781: */
3782: public static String getISOComment(int ch) {
3783: if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) {
3784: return null;
3785: }
3786: if (NAME_ == null) {
3787: throw new MissingResourceException(
3788: "Could not load unames.icu", "", "");
3789: }
3790: String result = NAME_.getGroupName(ch,
3791: UCharacterNameChoice.ISO_COMMENT_);
3792: return result;
3793: }
3794:
3795: /**
3796: * <p>Find a Unicode code point by its most current Unicode name and
3797: * return its code point value. All Unicode names are in uppercase.</p>
3798: * Note calling any methods related to code point names, e.g. get*Name*()
3799: * incurs a one-time initialisation cost to construct the name tables.
3800: * @param name most current Unicode character name whose code point is to
3801: * be returned
3802: * @return code point or -1 if name is not found
3803: * @stable ICU 2.1
3804: */
3805: public static int getCharFromName(String name) {
3806: if (NAME_ == null) {
3807: throw new MissingResourceException(
3808: "Could not load unames.icu", "", "");
3809: }
3810: return NAME_.getCharFromName(
3811: UCharacterNameChoice.UNICODE_CHAR_NAME, name);
3812: }
3813:
3814: /**
3815: * <p>Find a Unicode character by its version 1.0 Unicode name and return
3816: * its code point value. All Unicode names are in uppercase.</p>
3817: * Note calling any methods related to code point names, e.g. get*Name*()
3818: * incurs a one-time initialisation cost to construct the name tables.
* @param name Unicode 1.0 code point name whose code point is to
* be returned
3821: * @return code point or -1 if name is not found
3822: * @stable ICU 2.1
3823: */
3824: public static int getCharFromName1_0(String name) {
3825: if (NAME_ == null) {
3826: throw new MissingResourceException(
3827: "Could not load unames.icu", "", "");
3828: }
3829: return NAME_.getCharFromName(
3830: UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
3831: }
3832:
3833: /**
* <p>Find a Unicode character by any of its names and return its code
* point value. All Unicode names are in uppercase.
3836: * Extended names are all lowercase except for numbers and are contained
3837: * within angle brackets.</p>
3838: * The names are searched in the following order
3839: * <ul>
3840: * <li> Most current Unicode name if there is any
3841: * <li> Unicode 1.0 name if there is any
3842: * <li> Extended name in the form of
3843: * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-FFFE>
3844: * </ul>
3845: * Note calling any methods related to code point names, e.g. get*Name*()
3846: * incurs a one-time initialisation cost to construct the name tables.
3847: * @param name codepoint name
3848: * @return code point associated with the name or -1 if the name is not
3849: * found.
3850: * @stable ICU 2.6
3851: */
3852: public static int getCharFromExtendedName(String name) {
3853: if (NAME_ == null) {
3854: throw new MissingResourceException(
3855: "Could not load unames.icu", "", "");
3856: }
3857: return NAME_.getCharFromName(
3858: UCharacterNameChoice.EXTENDED_CHAR_NAME, name);
3859: }
3860:
3861: /**
3862: * Return the Unicode name for a given property, as given in the
3863: * Unicode database file PropertyAliases.txt. Most properties
3864: * have more than one name. The nameChoice determines which one
3865: * is returned.
3866: *
3867: * In addition, this function maps the property
3868: * UProperty.GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
3869: * "General_Category_Mask". These names are not in
3870: * PropertyAliases.txt.
3871: *
3872: * @param property UProperty selector.
3873: *
3874: * @param nameChoice UProperty.NameChoice selector for which name
3875: * to get. All properties have a long name. Most have a short
3876: * name, but some do not. Unicode allows for additional names; if
3877: * present these will be returned by UProperty.NameChoice.LONG + i,
3878: * where i=1, 2,...
3879: *
3880: * @return a name, or null if Unicode explicitly defines no name
3881: * ("n/a") for a given property/nameChoice. If a given nameChoice
3882: * throws an exception, then all larger values of nameChoice will
3883: * throw an exception. If null is returned for a given
3884: * nameChoice, then other nameChoice values may return non-null
3885: * results.
3886: *
3887: * @exception IllegalArgumentException thrown if property or
3888: * nameChoice are invalid.
3889: *
3890: * @see UProperty
3891: * @see UProperty.NameChoice
3892: * @stable ICU 2.4
3893: */
3894: public static String getPropertyName(int property, int nameChoice) {
3895: return PNAMES_.getPropertyName(property, nameChoice);
3896: }
3897:
3898: /**
3899: * Return the UProperty selector for a given property name, as
3900: * specified in the Unicode database file PropertyAliases.txt.
3901: * Short, long, and any other variants are recognized.
3902: *
3903: * In addition, this function maps the synthetic names "gcm" /
3904: * "General_Category_Mask" to the property
3905: * UProperty.GENERAL_CATEGORY_MASK. These names are not in
3906: * PropertyAliases.txt.
3907: *
3908: * @param propertyAlias the property name to be matched. The name
3909: * is compared using "loose matching" as described in
3910: * PropertyAliases.txt.
3911: *
3912: * @return a UProperty enum.
3913: *
3914: * @exception IllegalArgumentException thrown if propertyAlias
3915: * is not recognized.
3916: *
3917: * @see UProperty
3918: * @stable ICU 2.4
3919: */
3920: public static int getPropertyEnum(String propertyAlias) {
3921: return PNAMES_.getPropertyEnum(propertyAlias);
3922: }
3923:
3924: /**
3925: * Return the Unicode name for a given property value, as given in
3926: * the Unicode database file PropertyValueAliases.txt. Most
3927: * values have more than one name. The nameChoice determines
3928: * which one is returned.
3929: *
3930: * Note: Some of the names in PropertyValueAliases.txt can only be
3931: * retrieved using UProperty.GENERAL_CATEGORY_MASK, not
3932: * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
3933: * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
3934: * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
3935: *
3936: * @param property UProperty selector constant.
3937: * UProperty.INT_START <= property < UProperty.INT_LIMIT or
3938: * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
3939: * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
3940: * If out of range, null is returned.
3941: *
3942: * @param value selector for a value for the given property. In
3943: * general, valid values range from 0 up to some maximum. There
3944: * are a few exceptions: (1.) UProperty.BLOCK values begin at the
3945: * non-zero value BASIC_LATIN.getID(). (2.)
3946: * UProperty.CANONICAL_COMBINING_CLASS values are not contiguous
3947: * and range from 0..240. (3.) UProperty.GENERAL_CATEGORY_MASK values
3948: * are mask values produced by left-shifting 1 by
3949: * UCharacter.getType(). This allows grouped categories such as
3950: * [:L:] to be represented. Mask values are non-contiguous.
3951: *
3952: * @param nameChoice UProperty.NameChoice selector for which name
3953: * to get. All values have a long name. Most have a short name,
3954: * but some do not. Unicode allows for additional names; if
3955: * present these will be returned by UProperty.NameChoice.LONG + i,
3956: * where i=1, 2,...
3957: *
3958: * @return a name, or null if Unicode explicitly defines no name
3959: * ("n/a") for a given property/value/nameChoice. If a given
3960: * nameChoice throws an exception, then all larger values of
3961: * nameChoice will throw an exception. If null is returned for a
3962: * given nameChoice, then other nameChoice values may return
3963: * non-null results.
3964: *
3965: * @exception IllegalArgumentException thrown if property, value,
3966: * or nameChoice are invalid.
3967: *
3968: * @see UProperty
3969: * @see UProperty.NameChoice
3970: * @stable ICU 2.4
3971: */
3972: public static String getPropertyValueName(int property, int value,
3973: int nameChoice) {
3974: if (property == UProperty.CANONICAL_COMBINING_CLASS
3975: && value >= UCharacter
3976: .getIntPropertyMinValue(UProperty.CANONICAL_COMBINING_CLASS)
3977: && value <= UCharacter
3978: .getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)
3979: && nameChoice >= 0
3980: && nameChoice < UProperty.NameChoice.COUNT) {
3981: // this is hard coded for the valid cc
3982: // because PropertyValueAliases.txt does not contain all of them
3983: try {
3984: return PNAMES_.getPropertyValueName(property, value,
3985: nameChoice);
3986: } catch (IllegalArgumentException e) {
3987: return null;
3988: }
3989: }
3990: return PNAMES_
3991: .getPropertyValueName(property, value, nameChoice);
3992: }
3993:
3994: /**
3995: * Return the property value integer for a given value name, as
3996: * specified in the Unicode database file PropertyValueAliases.txt.
3997: * Short, long, and any other variants are recognized.
3998: *
3999: * Note: Some of the names in PropertyValueAliases.txt will only be
4000: * recognized with UProperty.GENERAL_CATEGORY_MASK, not
4001: * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
4002: * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
4003: * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
4004: *
4005: * @param property UProperty selector constant.
4006: * UProperty.INT_START <= property < UProperty.INT_LIMIT or
4007: * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
4008: * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
4009: * Only these properties can be enumerated.
4010: *
4011: * @param valueAlias the value name to be matched. The name is
4012: * compared using "loose matching" as described in
4013: * PropertyValueAliases.txt.
4014: *
4015: * @return a value integer. Note: UProperty.GENERAL_CATEGORY
4016: * values are mask values produced by left-shifting 1 by
4017: * UCharacter.getType(). This allows grouped categories such as
4018: * [:L:] to be represented.
4019: *
4020: * @see UProperty
4021: * @throws IllegalArgumentException if property is not a valid UProperty
4022: * selector
4023: * @stable ICU 2.4
4024: */
4025: public static int getPropertyValueEnum(int property,
4026: String valueAlias) {
4027: return PNAMES_.getPropertyValueEnum(property, valueAlias);
4028: }
4029:
4030: /**
4031: * Returns a code point corresponding to the two UTF16 characters.
4032: * @param lead the lead char
4033: * @param trail the trail char
4034: * @return code point if surrogate characters are valid.
4035: * @exception IllegalArgumentException thrown when argument characters do
4036: * not form a valid codepoint
4037: * @stable ICU 2.1
4038: */
4039: public static int getCodePoint(char lead, char trail) {
4040: if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
4041: && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE
4042: && trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
4043: && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
4044: return UCharacterProperty.getRawSupplementary(lead, trail);
4045: }
4046: throw new IllegalArgumentException(
4047: "Illegal surrogate characters");
4048: }
4049:
4050: /**
4051: * Returns the code point corresponding to the UTF16 character.
4052: * @param char16 the UTF16 character
4053: * @return code point if argument is a valid character.
4054: * @exception IllegalArgumentException thrown when char16 is not a valid
4055: * codepoint
4056: * @stable ICU 2.1
4057: */
4058: public static int getCodePoint(char char16) {
4059: if (UCharacter.isLegal(char16)) {
4060: return char16;
4061: }
4062: throw new IllegalArgumentException("Illegal codepoint");
4063: }
4064:
4065: /**
4066: * Implementation of UCaseProps.ContextIterator, iterates over a String.
4067: * See ustrcase.c/utf16_caseContextIterator().
4068: */
4069: private static class StringContextIterator implements
4070: UCaseProps.ContextIterator {
4071: /**
4072: * Constructor.
4073: * @param s String to iterate over.
4074: */
4075: StringContextIterator(String s) {
4076: this .s = s;
4077: limit = s.length();
4078: cpStart = cpLimit = index = 0;
4079: dir = 0;
4080: }
4081:
4082: /**
4083: * Set the iteration limit for nextCaseMapCP() to an index within the string.
4084: * If the limit parameter is negative or past the string, then the
4085: * string length is restored as the iteration limit.
4086: *
4087: * This limit does not affect the next() function which always
4088: * iterates to the very end of the string.
4089: *
4090: * @param lim The iteration limit.
4091: */
4092: public void setLimit(int lim) {
4093: if (0 <= lim && lim <= s.length()) {
4094: limit = lim;
4095: } else {
4096: limit = s.length();
4097: }
4098: }
4099:
4100: /**
4101: * Iterate forward through the string to fetch the next code point
4102: * to be case-mapped, and set the context indexes for it.
4103: * Performance optimization, to save on function calls and redundant
4104: * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
4105: *
4106: * When the iteration limit is reached (and -1 is returned),
4107: * getCPStart() will be at the iteration limit.
4108: *
4109: * Iteration with next() does not affect the position for nextCaseMapCP().
4110: *
4111: * @return The next code point to be case-mapped, or <0 when the iteration is done.
4112: */
4113: public int nextCaseMapCP() {
4114: cpStart = cpLimit;
4115: if (cpLimit < limit) {
4116: int c = s.charAt(cpLimit++);
4117: if (UTF16.LEAD_SURROGATE_MIN_VALUE <= c
4118: || c <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
4119: char c2;
4120: if (c <= UTF16.LEAD_SURROGATE_MAX_VALUE
4121: && cpLimit < limit
4122: && UTF16.TRAIL_SURROGATE_MIN_VALUE <= (c2 = s
4123: .charAt(cpLimit))
4124: && c2 <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
4125: // supplementary code point
4126: ++cpLimit;
4127: c = UCharacterProperty.getRawSupplementary(
4128: (char) c, c2);
4129: // else unpaired surrogate code point
4130: }
4131: // else BMP code point
4132: }
4133: return c;
4134: } else {
4135: return -1;
4136: }
4137: }
4138:
4139: /**
4140: * Get the start of the code point that was last returned
4141: * by nextCaseMapCP().
4142: */
4143: public int getCPStart() {
4144: return cpStart;
4145: }
4146:
4147: // implement UCaseProps.ContextIterator
4148: public void reset(int dir) {
4149: if (dir > 0) {
4150: /* reset for forward iteration */
4151: this .dir = 1;
4152: index = cpLimit;
4153: } else if (dir < 0) {
4154: /* reset for backward iteration */
4155: this .dir = -1;
4156: index = cpStart;
4157: } else {
4158: // not a valid direction
4159: this .dir = 0;
4160: index = 0;
4161: }
4162: }
4163:
4164: public int next() {
4165: int c;
4166:
4167: if (dir > 0 && index < s.length()) {
4168: c = UTF16.charAt(s, index);
4169: index += UTF16.getCharCount(c);
4170: return c;
4171: } else if (dir < 0 && index > 0) {
4172: c = UTF16.charAt(s, index - 1);
4173: index -= UTF16.getCharCount(c);
4174: return c;
4175: }
4176: return -1;
4177: }
4178:
4179: // variables
4180: protected String s;
4181: protected int index, limit, cpStart, cpLimit;
4182: protected int dir; // 0=initial state >0=forward <0=backward
4183: }
4184:
4185: /**
4186: * Gets uppercase version of the argument string.
4187: * Casing is dependent on the default locale and context-sensitive.
4188: * @param str source string to be performed on
4189: * @return uppercase version of the argument string
4190: * @stable ICU 2.1
4191: */
4192: public static String toUpperCase(String str) {
4193: return toUpperCase(ULocale.getDefault(), str);
4194: }
4195:
4196: /**
4197: * Gets lowercase version of the argument string.
4198: * Casing is dependent on the default locale and context-sensitive
4199: * @param str source string to be performed on
4200: * @return lowercase version of the argument string
4201: * @stable ICU 2.1
4202: */
4203: public static String toLowerCase(String str) {
4204: return toLowerCase(ULocale.getDefault(), str);
4205: }
4206:
4207: /**
4208: * <p>Gets the titlecase version of the argument string.</p>
4209: * <p>Position for titlecasing is determined by the argument break
4210: * iterator, hence the user can customize the break iterator for
4211: * a specialized titlecasing. In this case only the forward iteration
4212: * needs to be implemented.
4213: * If the break iterator passed in is null, the default Unicode algorithm
4214: * will be used to determine the titlecase positions.
4215: * </p>
4216: * <p>Only positions returned by the break iterator will be title cased,
4217: * character in between the positions will all be in lower case.</p>
4218: * <p>Casing is dependent on the default locale and context-sensitive</p>
4219: * @param str source string to be performed on
4220: * @param breakiter break iterator to determine the positions in which
4221: * the character should be title cased.
4222: * @return titlecase version of the argument string
4223: * @stable ICU 2.6
4224: */
4225: public static String toTitleCase(String str, BreakIterator breakiter) {
4226: return toTitleCase(ULocale.getDefault(), str, breakiter);
4227: }
4228:
4229: /**
4230: * Gets uppercase version of the argument string.
4231: * Casing is dependent on the argument locale and context-sensitive.
4232: * @param locale which string is to be converted in
4233: * @param str source string to be performed on
4234: * @return uppercase version of the argument string
4235: * @stable ICU 2.1
4236: */
4237: public static String toUpperCase(Locale locale, String str) {
4238: return toUpperCase(ULocale.forLocale(locale), str);
4239: }
4240:
4241: /**
4242: * Gets uppercase version of the argument string.
4243: * Casing is dependent on the argument locale and context-sensitive.
4244: * @param locale which string is to be converted in
4245: * @param str source string to be performed on
4246: * @return uppercase version of the argument string
4247: * @draft ICU 3.2
4248: * @provisional This API might change or be removed in a future release.
4249: */
4250: public static String toUpperCase(ULocale locale, String str) {
4251: StringContextIterator iter = new StringContextIterator(str);
4252: StringBuffer result = new StringBuffer(str.length());
4253: int[] locCache = new int[1];
4254: int c;
4255:
4256: if (locale == null) {
4257: locale = ULocale.getDefault();
4258: }
4259: locCache[0] = 0;
4260:
4261: while ((c = iter.nextCaseMapCP()) >= 0) {
4262: c = gCsp.toFullUpper(c, iter, result, locale, locCache);
4263:
4264: /* decode the result */
4265: if (c < 0) {
4266: /* (not) original code point */
4267: c = ~c;
4268: } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
4269: /* mapping already appended to result */
4270: continue;
4271: /* } else { append single-code point mapping */
4272: }
4273: if (c <= 0xffff) {
4274: result.append((char) c);
4275: } else {
4276: UTF16.append(result, c);
4277: }
4278: }
4279: return result.toString();
4280: }
4281:
4282: /**
4283: * Gets lowercase version of the argument string.
4284: * Casing is dependent on the argument locale and context-sensitive
4285: * @param locale which string is to be converted in
4286: * @param str source string to be performed on
4287: * @return lowercase version of the argument string
4288: * @stable ICU 2.1
4289: */
4290: public static String toLowerCase(Locale locale, String str) {
4291: return toLowerCase(ULocale.forLocale(locale), str);
4292: }
4293:
4294: /**
4295: * Gets lowercase version of the argument string.
4296: * Casing is dependent on the argument locale and context-sensitive
4297: * @param locale which string is to be converted in
4298: * @param str source string to be performed on
4299: * @return lowercase version of the argument string
4300: * @draft ICU 3.2
4301: * @provisional This API might change or be removed in a future release.
4302: */
4303: public static String toLowerCase(ULocale locale, String str) {
4304: StringContextIterator iter = new StringContextIterator(str);
4305: StringBuffer result = new StringBuffer(str.length());
4306: int[] locCache = new int[1];
4307: int c;
4308:
4309: if (locale == null) {
4310: locale = ULocale.getDefault();
4311: }
4312: locCache[0] = 0;
4313:
4314: while ((c = iter.nextCaseMapCP()) >= 0) {
4315: c = gCsp.toFullLower(c, iter, result, locale, locCache);
4316:
4317: /* decode the result */
4318: if (c < 0) {
4319: /* (not) original code point */
4320: c = ~c;
4321: } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
4322: /* mapping already appended to result */
4323: continue;
4324: /* } else { append single-code point mapping */
4325: }
4326: if (c <= 0xffff) {
4327: result.append((char) c);
4328: } else {
4329: UTF16.append(result, c);
4330: }
4331: }
4332: return result.toString();
4333: }
4334:
4335: /**
4336: * <p>Gets the titlecase version of the argument string.</p>
4337: * <p>Position for titlecasing is determined by the argument break
4338: * iterator, hence the user can customize the break iterator for
4339: * a specialized titlecasing. In this case only the forward iteration
4340: * needs to be implemented.
4341: * If the break iterator passed in is null, the default Unicode algorithm
4342: * will be used to determine the titlecase positions.
4343: * </p>
4344: * <p>Only positions returned by the break iterator will be title cased,
4345: * character in between the positions will all be in lower case.</p>
4346: * <p>Casing is dependent on the argument locale and context-sensitive</p>
4347: * @param locale which string is to be converted in
4348: * @param str source string to be performed on
4349: * @param breakiter break iterator to determine the positions in which
4350: * the character should be title cased.
4351: * @return titlecase version of the argument string
4352: * @stable ICU 2.6
4353: */
4354: public static String toTitleCase(Locale locale, String str,
4355: BreakIterator breakiter) {
4356: return toTitleCase(ULocale.forLocale(locale), str, breakiter);
4357: }
4358:
4359: /**
4360: * <p>Gets the titlecase version of the argument string.</p>
4361: * <p>Position for titlecasing is determined by the argument break
4362: * iterator, hence the user can customize the break iterator for
4363: * a specialized titlecasing. In this case only the forward iteration
4364: * needs to be implemented.
4365: * If the break iterator passed in is null, the default Unicode algorithm
4366: * will be used to determine the titlecase positions.
4367: * </p>
4368: * <p>Only positions returned by the break iterator will be title cased,
4369: * character in between the positions will all be in lower case.</p>
4370: * <p>Casing is dependent on the argument locale and context-sensitive</p>
4371: * @param locale which string is to be converted in
4372: * @param str source string to be performed on
4373: * @param titleIter break iterator to determine the positions in which
4374: * the character should be title cased.
4375: * @return titlecase version of the argument string
4376: * @draft ICU 3.2
4377: * @provisional This API might change or be removed in a future release.
4378: */
4379: public static String toTitleCase(ULocale locale, String str,
4380: BreakIterator titleIter) {
4381: StringContextIterator iter = new StringContextIterator(str);
4382: StringBuffer result = new StringBuffer(str.length());
4383: int[] locCache = new int[1];
4384: int c, srcLength = str.length();
4385:
4386: if (locale == null) {
4387: locale = ULocale.getDefault();
4388: }
4389: locCache[0] = 0;
4390:
4391: if (titleIter == null) {
4392: titleIter = BreakIterator.getWordInstance(locale);
4393: }
4394: titleIter.setText(str);
4395:
4396: int prev, titleStart, index;
4397: boolean isFirstIndex;
4398:
4399: /* set up local variables */
4400: prev = 0;
4401: isFirstIndex = true;
4402:
4403: /* titlecasing loop */
4404: while (prev < srcLength) {
4405: /* find next index where to titlecase */
4406: if (isFirstIndex) {
4407: isFirstIndex = false;
4408: index = titleIter.first();
4409: } else {
4410: index = titleIter.next();
4411: }
4412: if (index == BreakIterator.DONE || index > srcLength) {
4413: index = srcLength;
4414: }
4415:
4416: /*
4417: * Unicode 4 & 5 section 3.13 Default Case Operations:
4418: *
4419: * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
4420: * #29, "Text Boundaries." Between each pair of word boundaries, find the first
4421: * cased character F. If F exists, map F to default_title(F); then map each
4422: * subsequent character C to default_lower(C).
4423: *
4424: * In this implementation, segment [prev..index[ into 3 parts:
4425: * a) uncased characters (copy as-is) [prev..titleStart[
4426: * b) first case letter (titlecase) [titleStart..titleLimit[
4427: * c) subsequent characters (lowercase) [titleLimit..index[
4428: */
4429: if (prev < index) {
4430: /* find and copy uncased characters [prev..titleStart[ */
4431: iter.setLimit(index);
4432: while ((c = iter.nextCaseMapCP()) >= 0
4433: && UCaseProps.NONE == gCsp.getType(c)) {
4434: }
4435: titleStart = iter.getCPStart();
4436: if (prev < titleStart) {
4437: // TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
4438: result.append(str.substring(prev, titleStart));
4439: }
4440:
4441: if (titleStart < index) {
4442: /* titlecase c which is from titleStart */
4443: c = gCsp.toFullTitle(c, iter, result, locale,
4444: locCache);
4445:
4446: /* decode the result and lowercase up to index */
4447: for (;;) {
4448: if (c < 0) {
4449: /* (not) original code point */
4450: c = ~c;
4451: if (c <= 0xffff) {
4452: result.append((char) c);
4453: } else {
4454: UTF16.append(result, c);
4455: }
4456: } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
4457: /* mapping already appended to result */
4458: } else {
4459: /* append single-code point mapping */
4460: if (c <= 0xffff) {
4461: result.append((char) c);
4462: } else {
4463: UTF16.append(result, c);
4464: }
4465: }
4466:
4467: if ((c = iter.nextCaseMapCP()) >= 0) {
4468: c = gCsp.toFullLower(c, iter, result,
4469: locale, locCache);
4470: } else {
4471: break;
4472: }
4473: }
4474: }
4475: }
4476:
4477: prev = index;
4478: }
4479: return result.toString();
4480: }
4481:
4482: /**
4483: * The given character is mapped to its case folding equivalent according
4484: * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4485: * folding equivalent, the character itself is returned.
4486: *
4487: * <p>This function only returns the simple, single-code point case mapping.
4488: * Full case mappings should be used whenever possible because they produce
4489: * better results by working on whole strings.
4490: * They can map to a result string with a different length as appropriate.
4491: * Full case mappings are applied by the case mapping functions
4492: * that take String parameters rather than code points (int).
4493: * See also the User Guide chapter on C/POSIX migration:
4494: * http://icu.sourceforge.net/userguide/posix.html#case_mappings
4495: *
4496: * @param ch the character to be converted
4497: * @param defaultmapping Indicates if all mappings defined in
4498: * CaseFolding.txt is to be used, otherwise the
4499: * mappings for dotted I and dotless i marked with
4500: * 'I' in CaseFolding.txt will be skipped.
4501: * @return the case folding equivalent of the character, if
4502: * any; otherwise the character itself.
4503: * @see #foldCase(String, boolean)
4504: * @stable ICU 2.1
4505: */
4506: public static int foldCase(int ch, boolean defaultmapping) {
4507: return foldCase(ch, defaultmapping ? FOLD_CASE_DEFAULT
4508: : FOLD_CASE_EXCLUDE_SPECIAL_I);
4509: }
4510:
4511: /**
4512: * The given string is mapped to its case folding equivalent according to
4513: * UnicodeData.txt and CaseFolding.txt; if any character has no case
4514: * folding equivalent, the character itself is returned.
4515: * "Full", multiple-code point case folding mappings are returned here.
4516: * For "simple" single-code point mappings use the API
4517: * foldCase(int ch, boolean defaultmapping).
4518: * @param str the String to be converted
4519: * @param defaultmapping Indicates if all mappings defined in
4520: * CaseFolding.txt is to be used, otherwise the
4521: * mappings for dotted I and dotless i marked with
4522: * 'I' in CaseFolding.txt will be skipped.
4523: * @return the case folded version of the string; characters without a
4524: * case folding equivalent are kept unchanged.
4525: * @see #foldCase(int, boolean)
4526: * @stable ICU 2.1
4527: */
4528: public static String foldCase(String str, boolean defaultmapping) {
4529: return foldCase(str, defaultmapping ? FOLD_CASE_DEFAULT
4530: : FOLD_CASE_EXCLUDE_SPECIAL_I);
4531: }
4532:
4533: /**
4534: * Option value for case folding: use default mappings defined in CaseFolding.txt.
4535: * @stable ICU 2.6
4536: */
4537: public static final int FOLD_CASE_DEFAULT = 0x0000;
4538: /**
4539: * Option value for case folding: exclude the mappings for dotted I
4540: * and dotless i marked with 'I' in CaseFolding.txt.
4541: * @stable ICU 2.6
4542: */
4543: public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
4544:
4545: /**
4546: * The given character is mapped to its case folding equivalent according
4547: * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4548: * folding equivalent, the character itself is returned.
4549: *
4550: * <p>This function only returns the simple, single-code point case mapping.
4551: * Full case mappings should be used whenever possible because they produce
4552: * better results by working on whole strings.
4553: * They can map to a result string with a different length as appropriate.
4554: * Full case mappings are applied by the case mapping functions
4555: * that take String parameters rather than code points (int).
4556: * See also the User Guide chapter on C/POSIX migration:
4557: * http://icu.sourceforge.net/userguide/posix.html#case_mappings
4558: *
4559: * @param ch the character to be converted
4560: * @param options A bit set for special processing. Currently the recognised options are
4561: * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4562: * @return the case folding equivalent of the character, if
4563: * any; otherwise the character itself.
4564: * @see #foldCase(String, boolean)
4565: * @stable ICU 2.6
4566: */
4567: public static int foldCase(int ch, int options) {
4568: return gCsp.fold(ch, options);
4569: }
4570:
4571: /**
4572: * The given string is mapped to its case folding equivalent according to
4573: * UnicodeData.txt and CaseFolding.txt; if any character has no case
4574: * folding equivalent, the character itself is returned.
4575: * "Full", multiple-code point case folding mappings are returned here.
4576: * For "simple" single-code point mappings use the API
4577: * foldCase(int ch, boolean defaultmapping).
4578: * @param str the String to be converted
4579: * @param options A bit set for special processing. Currently the recognised options are
4580: * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4581: * @return the case folded version of the string; characters without a
4582: * case folding equivalent are kept unchanged.
4583: * @see #foldCase(int, boolean)
4584: * @stable ICU 2.6
4585: */
4586: public static final String foldCase(String str, int options) {
4587: StringBuffer result = new StringBuffer(str.length());
4588: int c, i, length;
4589:
4590: length = str.length();
4591: for (i = 0; i < length;) {
4592: c = UTF16.charAt(str, i);
4593: i += UTF16.getCharCount(c);
4594: c = gCsp.toFullFolding(c, result, options);
4595:
4596: /* decode the result */
4597: if (c < 0) {
4598: /* (not) original code point */
4599: c = ~c;
4600: } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
4601: /* mapping already appended to result */
4602: continue;
4603: /* } else { append single-code point mapping */
4604: }
4605: if (c <= 0xffff) {
4606: result.append((char) c);
4607: } else {
4608: UTF16.append(result, c);
4609: }
4610: }
4611: return result.toString();
4612: }
4613:
4614: /**
4615: * Return numeric value of Han code points.
4616: * <br> This returns the value of Han 'numeric' code points,
4617: * including those for zero, ten, hundred, thousand, ten thousand,
4618: * and hundred million.
4619: * This includes both the standard and 'checkwriting'
4620: * characters, the 'big circle' zero character, and the standard
4621: * zero character.
4622: * @param ch code point to query
4623: * @return value if it is a Han 'numeric character,' otherwise return -1.
4624: * @stable ICU 2.4
4625: */
4626: public static int getHanNumericValue(int ch) {
4627: // TODO: Are these all covered by Unicode numeric value data?
4628: switch (ch) {
4629: case IDEOGRAPHIC_NUMBER_ZERO_:
4630: case CJK_IDEOGRAPH_COMPLEX_ZERO_:
4631: return 0; // Han Zero
4632: case CJK_IDEOGRAPH_FIRST_:
4633: case CJK_IDEOGRAPH_COMPLEX_ONE_:
4634: return 1; // Han One
4635: case CJK_IDEOGRAPH_SECOND_:
4636: case CJK_IDEOGRAPH_COMPLEX_TWO_:
4637: return 2; // Han Two
4638: case CJK_IDEOGRAPH_THIRD_:
4639: case CJK_IDEOGRAPH_COMPLEX_THREE_:
4640: return 3; // Han Three
4641: case CJK_IDEOGRAPH_FOURTH_:
4642: case CJK_IDEOGRAPH_COMPLEX_FOUR_:
4643: return 4; // Han Four
4644: case CJK_IDEOGRAPH_FIFTH_:
4645: case CJK_IDEOGRAPH_COMPLEX_FIVE_:
4646: return 5; // Han Five
4647: case CJK_IDEOGRAPH_SIXTH_:
4648: case CJK_IDEOGRAPH_COMPLEX_SIX_:
4649: return 6; // Han Six
4650: case CJK_IDEOGRAPH_SEVENTH_:
4651: case CJK_IDEOGRAPH_COMPLEX_SEVEN_:
4652: return 7; // Han Seven
4653: case CJK_IDEOGRAPH_EIGHTH_:
4654: case CJK_IDEOGRAPH_COMPLEX_EIGHT_:
4655: return 8; // Han Eight
4656: case CJK_IDEOGRAPH_NINETH_:
4657: case CJK_IDEOGRAPH_COMPLEX_NINE_:
4658: return 9; // Han Nine
4659: case CJK_IDEOGRAPH_TEN_:
4660: case CJK_IDEOGRAPH_COMPLEX_TEN_:
4661: return 10;
4662: case CJK_IDEOGRAPH_HUNDRED_:
4663: case CJK_IDEOGRAPH_COMPLEX_HUNDRED_:
4664: return 100;
4665: case CJK_IDEOGRAPH_THOUSAND_:
4666: case CJK_IDEOGRAPH_COMPLEX_THOUSAND_:
4667: return 1000;
4668: case CJK_IDEOGRAPH_TEN_THOUSAND_:
4669: return 10000;
4670: case CJK_IDEOGRAPH_HUNDRED_MILLION_:
4671: return 100000000;
4672: }
4673: return -1; // no value
4674: }
4675:
4676: /**
4677: * <p>Gets an iterator for character types, iterating over codepoints.</p>
4678: * Example of use:<br>
4679: * <pre>
4680: * RangeValueIterator iterator = UCharacter.getTypeIterator();
4681: * RangeValueIterator.Element element = new RangeValueIterator.Element();
4682: * while (iterator.next(element)) {
4683: * System.out.println("Codepoint \\u" +
4684: * Integer.toHexString(element.start) +
4685: * " to codepoint \\u" +
4686: * Integer.toHexString(element.limit - 1) +
4687: * " has the character type " +
4688: * element.value);
4689: * }
4690: * </pre>
4691: * @return an iterator
4692: * @stable ICU 2.6
4693: */
4694: public static RangeValueIterator getTypeIterator() {
4695: return new UCharacterTypeIterator(PROPERTY_);
4696: }
4697:
4698: /**
4699: * <p>Gets an iterator for character names, iterating over codepoints.</p>
4700: * <p>This API only gets the iterator for the modern, most up-to-date
4701: * Unicode names. For older 1.0 Unicode names use getName1_0Iterator() or
4702: * for extended names use getExtendedNameIterator().</p>
4703: * Example of use:<br>
4704: * <pre>
4705: * ValueIterator iterator = UCharacter.getNameIterator();
4706: * ValueIterator.Element element = new ValueIterator.Element();
4707: * while (iterator.next(element)) {
4708: * System.out.println("Codepoint \\u" +
4709: * Integer.toHexString(element.codepoint) +
4710: * " has the name " + (String)element.value);
4711: * }
4712: * </pre>
4713: * <p>The maximal range which the name iterator iterates is from
4714: * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
4715: * @return an iterator
4716: * @stable ICU 2.6
4717: */
4718: public static ValueIterator getNameIterator() {
4719: if (NAME_ == null) {
4720: throw new RuntimeException("Could not load unames.icu");
4721: }
4722: return new UCharacterNameIterator(NAME_,
4723: UCharacterNameChoice.UNICODE_CHAR_NAME);
4724: }
4725:
4726: /**
4727: * <p>Gets an iterator for character names, iterating over codepoints.</p>
4728: * <p>This API only gets the iterator for the older 1.0 Unicode names.
4729: * For modern, most up-to-date Unicode names use getNameIterator() or
4730: * for extended names use getExtendedNameIterator().</p>
4731: * Example of use:<br>
4732: * <pre>
4733: * ValueIterator iterator = UCharacter.get1_0NameIterator();
4734: * ValueIterator.Element element = new ValueIterator.Element();
4735: * while (iterator.next(element)) {
4736: * System.out.println("Codepoint \\u" +
4737: * Integer.toHexString(element.codepoint) +
4738: * " has the name " + (String)element.value);
4739: * }
4740: * </pre>
4741: * <p>The maximal range which the name iterator iterates is from
4742: * @return an iterator
4743: * @stable ICU 2.6
4744: */
4745: public static ValueIterator getName1_0Iterator() {
4746: if (NAME_ == null) {
4747: throw new RuntimeException("Could not load unames.icu");
4748: }
4749: return new UCharacterNameIterator(NAME_,
4750: UCharacterNameChoice.UNICODE_10_CHAR_NAME);
4751: }
4752:
4753: /**
4754: * <p>Gets an iterator for character names, iterating over codepoints.</p>
4755: * <p>This API only gets the iterator for the extended names.
4756: * For modern, most up-to-date Unicode names use getNameIterator() or
4757: * for older 1.0 Unicode names use get1_0NameIterator().</p>
4758: * Example of use:<br>
4759: * <pre>
4760: * ValueIterator iterator = UCharacter.getExtendedNameIterator();
4761: * ValueIterator.Element element = new ValueIterator.Element();
4762: * while (iterator.next(element)) {
4763: * System.out.println("Codepoint \\u" +
4764: * Integer.toHexString(element.codepoint) +
4765: * " has the name " + (String)element.value);
4766: * }
4767: * </pre>
4768: * <p>The maximal range which the name iterator iterates is from
4769: * @return an iterator
4770: * @stable ICU 2.6
4771: */
4772: public static ValueIterator getExtendedNameIterator() {
4773: if (NAME_ == null) {
4774: throw new MissingResourceException(
4775: "Could not load unames.icu", "", "");
4776: }
4777: return new UCharacterNameIterator(NAME_,
4778: UCharacterNameChoice.EXTENDED_CHAR_NAME);
4779: }
4780:
4781: /**
4782: * <p>Get the "age" of the code point.</p>
4783: * <p>The "age" is the Unicode version when the code point was first
4784: * designated (as a non-character or for Private Use) or assigned a
4785: * character.
4786: * <p>This can be useful to avoid emitting code points to receiving
4787: * processes that do not accept newer characters.</p>
4788: * <p>The data is from the UCD file DerivedAge.txt.</p>
4789: * @param ch The code point.
4790: * @return the Unicode version number
4791: * @stable ICU 2.6
4792: */
4793: public static VersionInfo getAge(int ch) {
4794: if (ch < MIN_VALUE || ch > MAX_VALUE) {
4795: throw new IllegalArgumentException(
4796: "Codepoint out of bounds");
4797: }
4798: return PROPERTY_.getAge(ch);
4799: }
4800:
4801: /**
4802: * <p>Check a binary Unicode property for a code point.</p>
4803: * <p>Unicode, especially in version 3.2, defines many more properties
4804: * than the original set in UnicodeData.txt.</p>
4805: * <p>This API is intended to reflect Unicode properties as defined in
4806: * the Unicode Character Database (UCD) and Unicode Technical Reports
4807: * (UTR).</p>
4808: * <p>For details about the properties see
4809: * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
4810: * <p>For names of Unicode properties see the UCD file
4811: * PropertyAliases.txt.</p>
4812: * <p>This API does not check the validity of the codepoint.</p>
4813: * <p>Important: If ICU is built with UCD files from Unicode versions
4814: * below 3.2, then properties marked with "new" are not or
4815: * not fully available.</p>
4816: * @param ch code point to test.
4817: * @param property selector constant from com.ibm.icu.lang.UProperty,
4818: * identifies which binary property to check.
4819: * @return true or false according to the binary Unicode property value
4820: * for ch. Also false if property is out of bounds or if the
4821: * Unicode version does not have data for the property at all, or
4822: * not for this code point.
4823: * @see com.ibm.icu.lang.UProperty
4824: * @stable ICU 2.6
4825: */
4826: public static boolean hasBinaryProperty(int ch, int property) {
4827: if (ch < MIN_VALUE || ch > MAX_VALUE) {
4828: throw new IllegalArgumentException(
4829: "Codepoint out of bounds");
4830: }
4831: return PROPERTY_.hasBinaryProperty(ch, property);
4832: }
4833:
4834: /**
4835: * <p>Check if a code point has the Alphabetic Unicode property.</p>
4836: * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
4837: * <p>Different from UCharacter.isLetter(ch)!</p>
4838: * @stable ICU 2.6
4839: * @param ch codepoint to be tested
4840: */
4841: public static boolean isUAlphabetic(int ch) {
4842: return hasBinaryProperty(ch, UProperty.ALPHABETIC);
4843: }
4844:
4845: /**
4846: * <p>Check if a code point has the Lowercase Unicode property.</p>
4847: * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
4848: * <p>This is different from UCharacter.isLowerCase(ch)!</p>
4849: * @param ch codepoint to be tested
4850: * @stable ICU 2.6
4851: */
4852: public static boolean isULowercase(int ch) {
4853: return hasBinaryProperty(ch, UProperty.LOWERCASE);
4854: }
4855:
4856: /**
4857: * <p>Check if a code point has the Uppercase Unicode property.</p>
4858: * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
4859: * <p>This is different from UCharacter.isUpperCase(ch)!</p>
4860: * @param ch codepoint to be tested
4861: * @stable ICU 2.6
4862: */
4863: public static boolean isUUppercase(int ch) {
4864: return hasBinaryProperty(ch, UProperty.UPPERCASE);
4865: }
4866:
4867: /**
4868: * <p>Check if a code point has the White_Space Unicode property.</p>
4869: * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
4870: * <p>This is different from both UCharacter.isSpace(ch) and
4871: * UCharacter.isWhitespace(ch)!</p>
4872: * @param ch codepoint to be tested
4873: * @stable ICU 2.6
4874: */
4875: public static boolean isUWhiteSpace(int ch) {
4876: return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
4877: }
4878:
4879: /**
4880: * <p>Gets the property value for an Unicode property type of a code point.
4881: * Also returns binary and mask property values.</p>
4882: * <p>Unicode, especially in version 3.2, defines many more properties than
4883: * the original set in UnicodeData.txt.</p>
4884: * <p>The properties APIs are intended to reflect Unicode properties as
4885: * defined in the Unicode Character Database (UCD) and Unicode Technical
4886: * Reports (UTR). For details about the properties see
4887: * http://www.unicode.org/.</p>
4888: * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
4889: * </p>
4890: * <pre>
4891: * Sample usage:
4892: * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
4893: * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
4894: * boolean b = (ideo == 1) ? true : false;
4895: * </pre>
4896: * @param ch code point to test.
4897: * @param type UProperty selector constant, identifies which binary
4898: * property to check. Must be
4899: * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
4900: * UProperty.INT_START <= type < UProperty.INT_LIMIT or
4901: * UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
4902: * @return numeric value that is directly the property value or,
4903: * for enumerated properties, corresponds to the numeric value of
4904: * the enumerated constant of the respective property value
4905: * enumeration type (cast to enum type if necessary).
4906: * Returns 0 or 1 (for false / true) for binary Unicode properties.
4907: * Returns a bit-mask for mask properties.
4908: * Returns 0 if 'type' is out of bounds or if the Unicode version
4909: * does not have data for the property at all, or not for this code
4910: * point.
4911: * @see UProperty
4912: * @see #hasBinaryProperty
4913: * @see #getIntPropertyMinValue
4914: * @see #getIntPropertyMaxValue
4915: * @see #getUnicodeVersion
4916: * @stable ICU 2.4
4917: */
4918: public static int getIntPropertyValue(int ch, int type) {
4919: if (type < UProperty.BINARY_START) {
4920: return 0; // undefined
4921: } else if (type < UProperty.BINARY_LIMIT) {
4922: return hasBinaryProperty(ch, type) ? 1 : 0;
4923: } else if (type < UProperty.INT_START) {
4924: return 0; // undefined
4925: } else if (type < UProperty.INT_LIMIT) {
4926: //int result = 0;
4927: switch (type) {
4928: case UProperty.BIDI_CLASS:
4929: return getDirection(ch);
4930: case UProperty.BLOCK:
4931: return UnicodeBlock.idOf(ch);
4932: case UProperty.CANONICAL_COMBINING_CLASS:
4933: return getCombiningClass(ch);
4934: case UProperty.DECOMPOSITION_TYPE:
4935: return PROPERTY_.getAdditional(ch, 2)
4936: & DECOMPOSITION_TYPE_MASK_;
4937: case UProperty.EAST_ASIAN_WIDTH:
4938: return (PROPERTY_.getAdditional(ch, 0) & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
4939: case UProperty.GENERAL_CATEGORY:
4940: return getType(ch);
4941: case UProperty.JOINING_GROUP:
4942: return gBdp.getJoiningGroup(ch);
4943: case UProperty.JOINING_TYPE:
4944: return gBdp.getJoiningType(ch);
4945: case UProperty.LINE_BREAK:
4946: return (int) (PROPERTY_.getAdditional(ch, 0) & LINE_BREAK_MASK_) >> LINE_BREAK_SHIFT_;
4947: case UProperty.NUMERIC_TYPE:
4948: type = getNumericType(PROPERTY_.getProperty(ch));
4949: if (type > NumericType.NUMERIC) {
4950: /* keep internal variants of NumericType.NUMERIC from becoming visible */
4951: type = NumericType.NUMERIC;
4952: }
4953: return type;
4954: case UProperty.SCRIPT:
4955: return UScript.getScript(ch);
4956: case UProperty.HANGUL_SYLLABLE_TYPE:
4957: /* purely algorithmic; hardcode known characters, check for assigned new ones */
4958: if (ch < NormalizerImpl.JAMO_L_BASE) {
4959: /* NA */
4960: } else if (ch <= 0x11ff) {
4961: /* Jamo range */
4962: if (ch <= 0x115f) {
4963: /* Jamo L range, HANGUL CHOSEONG ... */
4964: if (ch == 0x115f
4965: || ch <= 0x1159
4966: || getType(ch) == UCharacterCategory.OTHER_LETTER) {
4967: return HangulSyllableType.LEADING_JAMO;
4968: }
4969: } else if (ch <= 0x11a7) {
4970: /* Jamo V range, HANGUL JUNGSEONG ... */
4971: if (ch <= 0x11a2
4972: || getType(ch) == UCharacterCategory.OTHER_LETTER) {
4973: return HangulSyllableType.VOWEL_JAMO;
4974: }
4975: } else {
4976: /* Jamo T range */
4977: if (ch <= 0x11f9
4978: || getType(ch) == UCharacterCategory.OTHER_LETTER) {
4979: return HangulSyllableType.TRAILING_JAMO;
4980: }
4981: }
4982: } else if ((ch -= NormalizerImpl.HANGUL_BASE) < 0) {
4983: /* NA */
4984: } else if (ch < NormalizerImpl.HANGUL_COUNT) {
4985: /* Hangul syllable */
4986: return ch % NormalizerImpl.JAMO_T_COUNT == 0 ? HangulSyllableType.LV_SYLLABLE
4987: : HangulSyllableType.LVT_SYLLABLE;
4988: }
4989: return 0; /* NA */
4990:
4991: case UProperty.NFD_QUICK_CHECK:
4992: case UProperty.NFKD_QUICK_CHECK:
4993: case UProperty.NFC_QUICK_CHECK:
4994: case UProperty.NFKC_QUICK_CHECK:
4995: return NormalizerImpl.quickCheck(ch,
4996: (type - UProperty.NFD_QUICK_CHECK) + 2); // 2=UNORM_NFD
4997: case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
4998: return NormalizerImpl.getFCD16(ch) >> 8;
4999: case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
5000: return NormalizerImpl.getFCD16(ch) & 0xff;
5001: case UProperty.GRAPHEME_CLUSTER_BREAK:
5002: return (int) (PROPERTY_.getAdditional(ch, 2) & GCB_MASK) >> GCB_SHIFT;
5003: case UProperty.SENTENCE_BREAK:
5004: return (int) (PROPERTY_.getAdditional(ch, 2) & SB_MASK) >> SB_SHIFT;
5005: case UProperty.WORD_BREAK:
5006: return (int) (PROPERTY_.getAdditional(ch, 2) & WB_MASK) >> WB_SHIFT;
5007: default:
5008:
5009: return 0; /* undefined */
5010: }
5011: } else if (type == UProperty.GENERAL_CATEGORY_MASK) {
5012: return UCharacterProperty.getMask(getType(ch));
5013: }
5014: return 0; // undefined
5015: }
5016:
5017: /**
5018: * Returns a string version of the property value.
5019: * @param propertyEnum
5020: * @param codepoint
5021: * @param nameChoice
5022: * @return value as string
5023: * @internal
5024: * @deprecated This API is ICU internal only.
5025: */
5026: public static String getStringPropertyValue(int propertyEnum,
5027: int codepoint, int nameChoice) {
5028: // TODO some of these are less efficient, since a string is forced!
5029: if ((propertyEnum >= UProperty.BINARY_START && propertyEnum < UProperty.BINARY_LIMIT)
5030: || (propertyEnum >= UProperty.INT_START && propertyEnum < UProperty.INT_LIMIT)) {
5031: return getPropertyValueName(propertyEnum,
5032: getIntPropertyValue(codepoint, propertyEnum),
5033: nameChoice);
5034: }
5035: if (propertyEnum == UProperty.NUMERIC_VALUE) {
5036: return String.valueOf(getUnicodeNumericValue(codepoint));
5037: }
5038: // otherwise must be string property
5039: switch (propertyEnum) {
5040: case UProperty.AGE:
5041: return getAge(codepoint).toString();
5042: case UProperty.ISO_COMMENT:
5043: return getISOComment(codepoint);
5044: case UProperty.BIDI_MIRRORING_GLYPH:
5045: return UTF16.valueOf(getMirror(codepoint));
5046: case UProperty.CASE_FOLDING:
5047: return foldCase(UTF16.valueOf(codepoint), true);
5048: case UProperty.LOWERCASE_MAPPING:
5049: return toLowerCase(UTF16.valueOf(codepoint));
5050: case UProperty.NAME:
5051: return getName(codepoint);
5052: case UProperty.SIMPLE_CASE_FOLDING:
5053: return UTF16.valueOf(foldCase(codepoint, true));
5054: case UProperty.SIMPLE_LOWERCASE_MAPPING:
5055: return UTF16.valueOf(toLowerCase(codepoint));
5056: case UProperty.SIMPLE_TITLECASE_MAPPING:
5057: return UTF16.valueOf(toTitleCase(codepoint));
5058: case UProperty.SIMPLE_UPPERCASE_MAPPING:
5059: return UTF16.valueOf(toUpperCase(codepoint));
5060: case UProperty.TITLECASE_MAPPING:
5061: return toTitleCase(UTF16.valueOf(codepoint), null);
5062: case UProperty.UNICODE_1_NAME:
5063: return getName1_0(codepoint);
5064: case UProperty.UPPERCASE_MAPPING:
5065: return toUpperCase(UTF16.valueOf(codepoint));
5066: }
5067: throw new IllegalArgumentException("Illegal Property Enum");
5068: }
5069:
5070: /**
5071: * Get the minimum value for an integer/binary Unicode property type.
5072: * Can be used together with UCharacter.getIntPropertyMaxValue(int)
5073: * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5074: * @param type UProperty selector constant, identifies which binary
5075: * property to check. Must be
5076: * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5077: * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5078: * @return Minimum value returned by UCharacter.getIntPropertyValue(int)
5079: * for a Unicode property. 0 if the property
5080: * selector 'type' is out of range.
5081: * @see UProperty
5082: * @see #hasBinaryProperty
5083: * @see #getUnicodeVersion
5084: * @see #getIntPropertyMaxValue
5085: * @see #getIntPropertyValue
5086: * @stable ICU 2.4
5087: */
5088: public static int getIntPropertyMinValue(int type) {
5089:
5090: return 0; // undefined; and: all other properties have a minimum value
5091: // of 0
5092: }
5093:
5094: /**
5095: * Get the maximum value for an integer/binary Unicode property.
5096: * Can be used together with UCharacter.getIntPropertyMinValue(int)
5097: * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5098: * Examples for min/max values (for Unicode 3.2):
5099: * <ul>
5100: * <li> UProperty.BIDI_CLASS: 0/18 (UCharacterDirection.LEFT_TO_RIGHT/UCharacterDirection.BOUNDARY_NEUTRAL)
5101: * <li> UProperty.SCRIPT: 0/45 (UScript.COMMON/UScript.TAGBANWA)
5102: * <li> UProperty.IDEOGRAPHIC: 0/1 (false/true)
5103: * </ul>
5104: * For undefined UProperty constant values, min/max values will be 0/-1.
5105: * @param type UProperty selector constant, identifies which binary
5106: * property to check. Must be
5107: * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5108: * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5109: * @return Maximum value returned by u_getIntPropertyValue for a Unicode
5110: * property. <= 0 if the property selector 'type' is out of range.
5111: * @see UProperty
5112: * @see #hasBinaryProperty
5113: * @see #getUnicodeVersion
5114: * @see #getIntPropertyMaxValue
5115: * @see #getIntPropertyValue
5116: * @stable ICU 2.4
5117: */
5118: public static int getIntPropertyMaxValue(int type) {
5119: if (type < UProperty.BINARY_START) {
5120: return -1; // undefined
5121: } else if (type < UProperty.BINARY_LIMIT) {
5122: return 1; // maximum TRUE for all binary properties
5123: } else if (type < UProperty.INT_START) {
5124: return -1; // undefined
5125: } else if (type < UProperty.INT_LIMIT) {
5126: switch (type) {
5127: case UProperty.BIDI_CLASS:
5128: case UProperty.JOINING_GROUP:
5129: case UProperty.JOINING_TYPE:
5130: return gBdp.getMaxValue(type);
5131: case UProperty.BLOCK:
5132: return (PROPERTY_.getMaxValues(0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
5133: case UProperty.CANONICAL_COMBINING_CLASS:
5134: case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
5135: case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
5136: return 0xff; // TODO do we need to be more precise,
5137: // getting the actual maximum?
5138: case UProperty.DECOMPOSITION_TYPE:
5139: return PROPERTY_.getMaxValues(2)
5140: & DECOMPOSITION_TYPE_MASK_;
5141: case UProperty.EAST_ASIAN_WIDTH:
5142: return (PROPERTY_.getMaxValues(0) & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
5143: case UProperty.GENERAL_CATEGORY:
5144: return UCharacterCategory.CHAR_CATEGORY_COUNT - 1;
5145: case UProperty.LINE_BREAK:
5146: return (PROPERTY_.getMaxValues(0) & LINE_BREAK_MASK_) >> LINE_BREAK_SHIFT_;
5147: case UProperty.NUMERIC_TYPE:
5148: return NumericType.COUNT - 1;
5149: case UProperty.SCRIPT:
5150: return PROPERTY_.getMaxValues(0) & SCRIPT_MASK_;
5151: case UProperty.HANGUL_SYLLABLE_TYPE:
5152: return HangulSyllableType.COUNT - 1;
5153: case UProperty.NFD_QUICK_CHECK:
5154: case UProperty.NFKD_QUICK_CHECK:
5155: return 1; // YES -- these are never "maybe", only "no" or "yes"
5156: case UProperty.NFC_QUICK_CHECK:
5157: case UProperty.NFKC_QUICK_CHECK:
5158: return 2; // MAYBE
5159: case UProperty.GRAPHEME_CLUSTER_BREAK:
5160: return (PROPERTY_.getMaxValues(2) & GCB_MASK) >> GCB_SHIFT;
5161: case UProperty.SENTENCE_BREAK:
5162: return (PROPERTY_.getMaxValues(2) & SB_MASK) >> SB_SHIFT;
5163: case UProperty.WORD_BREAK:
5164: return (PROPERTY_.getMaxValues(2) & WB_MASK) >> WB_SHIFT;
5165: default:
5166: return -1; // undefined
5167: }
5168:
5169: }
5170: return -1; // undefined
5171: }
5172:
5173: /**
5174: * Provide the java.lang.Character forDigit API, for convenience.
5175: * @stable ICU 3.0
5176: */
5177: public static char forDigit(int digit, int radix) {
5178: return java.lang.Character.forDigit(digit, radix);
5179: }
5180:
5181: // JDK 1.5 API coverage
5182:
/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MIN_HIGH_SURROGATE; defined as
 * UTF16.LEAD_SURROGATE_MIN_VALUE.
 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MAX_HIGH_SURROGATE; defined as
 * UTF16.LEAD_SURROGATE_MAX_VALUE.
 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MIN_LOW_SURROGATE; defined as
 * UTF16.TRAIL_SURROGATE_MIN_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MAX_LOW_SURROGATE; defined as
 * UTF16.TRAIL_SURROGATE_MAX_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MIN_SURROGATE; defined as
 * UTF16.SURROGATE_MIN_VALUE.
 * @see UTF16#SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MAX_SURROGATE; defined as
 * UTF16.SURROGATE_MAX_VALUE.
 * @see UTF16#SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
 * defined as UTF16.SUPPLEMENTARY_MIN_VALUE.
 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
 * @stable ICU 3.0
 */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MAX_CODE_POINT; defined as
 * UTF16.CODEPOINT_MAX_VALUE.
 * @see UTF16#CODEPOINT_MAX_VALUE
 * @stable ICU 3.0
 */
public static final int MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Counterpart of java.lang.Character.MIN_CODE_POINT; defined as
 * UTF16.CODEPOINT_MIN_VALUE.
 * @see UTF16#CODEPOINT_MIN_VALUE
 * @stable ICU 3.0
 */
public static final int MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
5245:
5246: /**
5247: * Cover the JDK 1.5 API, for convenience.
5248: * @param cp the code point to check
5249: * @return true if cp is a valid code point
5250: * @stable ICU 3.0
5251: */
5252: public static final boolean isValidCodePoint(int cp) {
5253: return cp >= 0 && cp <= MAX_CODE_POINT;
5254: }
5255:
5256: /**
5257: * Cover the JDK 1.5 API, for convenience.
5258: * @param cp the code point to check
5259: * @return true if cp is a supplementary code point
5260: * @stable ICU 3.0
5261: */
5262: public static final boolean isSupplementaryCodePoint(int cp) {
5263: return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
5264: && cp <= UTF16.CODEPOINT_MAX_VALUE;
5265: }
5266:
5267: /**
5268: * Cover the JDK 1.5 API, for convenience.
5269: * @param ch the char to check
5270: * @return true if ch is a high (lead) surrogate
5271: * @stable ICU 3.0
5272: */
5273: public static boolean isHighSurrogate(char ch) {
5274: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
5275: }
5276:
5277: /**
5278: * Cover the JDK 1.5 API, for convenience.
5279: * @param ch the char to check
5280: * @return true if ch is a low (trail) surrogate
5281: * @stable ICU 3.0
5282: */
5283: public static boolean isLowSurrogate(char ch) {
5284: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
5285: }
5286:
5287: /**
5288: * Cover the JDK 1.5 API, for convenience. Return true if the chars
5289: * form a valid surrogate pair.
5290: * @param high the high (lead) char
5291: * @param low the low (trail) char
5292: * @return true if high, low form a surrogate pair
5293: * @stable ICU 3.0
5294: */
5295: public static final boolean isSurrogatePair(char high, char low) {
5296: return isHighSurrogate(high) && isLowSurrogate(low);
5297: }
5298:
5299: /**
5300: * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
5301: * to represent the code point. This does not check the
5302: * code point for validity.
5303: * @param cp the code point to check
5304: * @return the number of chars needed to represent the code point
5305: * @see UTF16#getCharCount
5306: * @stable ICU 3.0
5307: */
5308: public static int charCount(int cp) {
5309: return UTF16.getCharCount(cp);
5310: }
5311:
5312: /**
5313: * Cover the JDK 1.5 API, for convenience. Return the code point represented by
5314: * the characters. This does not check the surrogate pair for validity.
5315: * @param high the high (lead) surrogate
5316: * @param low the low (trail) surrogate
5317: * @return the code point formed by the surrogate pair
5318: * @stable ICU 3.0
5319: */
5320: public static final int toCodePoint(char high, char low) {
5321: return UCharacterProperty.getRawSupplementary(high, low);
5322: }
5323:
5324: /**
5325: * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5326: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5327: * API. This examines only the characters at index and index+1.
5328: * @param seq the characters to check
5329: * @param index the index of the first or only char forming the code point
5330: * @return the code point at the index
5331: * @stable ICU 3.0
5332: */
5333: //#ifndef FOUNDATION
5334: public static final int codePointAt(CharSequence seq, int index) {
5335: //#else
5336: //## public static final int codePointAt(String seq, int index) {
5337: //#endif
5338: char c1 = seq.charAt(index++);
5339: if (isHighSurrogate(c1)) {
5340: if (index < seq.length()) {
5341: char c2 = seq.charAt(index);
5342: if (isLowSurrogate(c2)) {
5343: return toCodePoint(c1, c2);
5344: }
5345: }
5346: }
5347: return c1;
5348: }
5349:
5350: //#ifdef FOUNDATION
5351: //## public static final int codePointAt(StringBuffer seq, int index) {
5352: //## return codePointAt(seq.toString(), index);
5353: //## }
5354: //#endif
5355:
5356: /**
5357: * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5358: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5359: * API. This examines only the characters at index and index+1.
5360: * @param text the characters to check
5361: * @param index the index of the first or only char forming the code point
5362: * @return the code point at the index
5363: * @stable ICU 3.0
5364: */
5365: public static final int codePointAt(char[] text, int index) {
5366: char c1 = text[index++];
5367: if (isHighSurrogate(c1)) {
5368: if (index < text.length) {
5369: char c2 = text[index];
5370: if (isLowSurrogate(c2)) {
5371: return toCodePoint(c1, c2);
5372: }
5373: }
5374: }
5375: return c1;
5376: }
5377:
5378: /**
5379: * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5380: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5381: * API. This examines only the characters at index and index+1.
5382: * @param text the characters to check
5383: * @param index the index of the first or only char forming the code point
5384: * @param limit the limit of the valid text
5385: * @return the code point at the index
5386: * @stable ICU 3.0
5387: */
5388: public static final int codePointAt(char[] text, int index,
5389: int limit) {
5390: if (index >= limit || limit > text.length) {
5391: throw new IndexOutOfBoundsException();
5392: }
5393: char c1 = text[index++];
5394: if (isHighSurrogate(c1)) {
5395: if (index < limit) {
5396: char c2 = text[index];
5397: if (isLowSurrogate(c2)) {
5398: return toCodePoint(c1, c2);
5399: }
5400: }
5401: }
5402: return c1;
5403: }
5404:
5405: /**
5406: * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5407: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5408: * API. This examines only the characters at index-1 and index-2.
5409: * @param seq the characters to check
5410: * @param index the index after the last or only char forming the code point
5411: * @return the code point before the index
5412: * @stable ICU 3.0
5413: */
5414: //#ifndef FOUNDATION
5415: public static final int codePointBefore(CharSequence seq, int index) {
5416: //#else
5417: //## public static final int codePointBefore(String seq, int index) {
5418: //#endif
5419: char c2 = seq.charAt(--index);
5420: if (isLowSurrogate(c2)) {
5421: if (index > 0) {
5422: char c1 = seq.charAt(--index);
5423: if (isHighSurrogate(c1)) {
5424: return toCodePoint(c1, c2);
5425: }
5426: }
5427: }
5428: return c2;
5429: }
5430:
5431: //#ifdef FOUNDATION
5432: //## public static final int codePointBefore(StringBuffer seq, int index) {
5433: //## return codePointBefore(seq.toString(), index);
5434: //## }
5435: //#endif
5436:
5437: /**
5438: * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5439: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5440: * API. This examines only the characters at index-1 and index-2.
5441: * @param text the characters to check
5442: * @param index the index after the last or only char forming the code point
5443: * @return the code point before the index
5444: * @stable ICU 3.0
5445: */
5446: public static final int codePointBefore(char[] text, int index) {
5447: char c2 = text[--index];
5448: if (isLowSurrogate(c2)) {
5449: if (index > 0) {
5450: char c1 = text[--index];
5451: if (isHighSurrogate(c1)) {
5452: return toCodePoint(c1, c2);
5453: }
5454: }
5455: }
5456: return c2;
5457: }
5458:
5459: /**
5460: * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5461: * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5462: * API. This examines only the characters at index-1 and index-2.
5463: * @param text the characters to check
5464: * @param index the index after the last or only char forming the code point
5465: * @param limit the start of the valid text
5466: * @return the code point before the index
5467: * @stable ICU 3.0
5468: */
5469: public static final int codePointBefore(char[] text, int index,
5470: int limit) {
5471: if (index <= limit || limit < 0) {
5472: throw new IndexOutOfBoundsException();
5473: }
5474: char c2 = text[--index];
5475: if (isLowSurrogate(c2)) {
5476: if (index > limit) {
5477: char c1 = text[--index];
5478: if (isHighSurrogate(c1)) {
5479: return toCodePoint(c1, c2);
5480: }
5481: }
5482: }
5483: return c2;
5484: }
5485:
5486: /**
5487: * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
5488: * code point into the destination at the given index.
5489: * @param cp the code point to convert
5490: * @param dst the destination array into which to put the char(s) representing the code point
5491: * @param dstIndex the index at which to put the first (or only) char
5492: * @return the count of the number of chars written (1 or 2)
5493: * @throws IllegalArgumentException if cp is not a valid code point
5494: * @stable ICU 3.0
5495: */
5496: public static final int toChars(int cp, char[] dst, int dstIndex) {
5497: if (cp >= 0) {
5498: if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5499: dst[dstIndex] = (char) cp;
5500: return 1;
5501: }
5502: if (cp <= MAX_CODE_POINT) {
5503: dst[dstIndex] = UTF16.getLeadSurrogate(cp);
5504: dst[dstIndex + 1] = UTF16.getTrailSurrogate(cp);
5505: return 2;
5506: }
5507: }
5508: throw new IllegalArgumentException();
5509: }
5510:
5511: /**
5512: * Cover the JDK 1.5 API, for convenience. Returns a char array
5513: * representing the code point.
5514: * @param cp the code point to convert
5515: * @return an array containing the char(s) representing the code point
5516: * @throws IllegalArgumentException if cp is not a valid code point
5517: * @stable ICU 3.0
5518: */
5519: public static final char[] toChars(int cp) {
5520: if (cp >= 0) {
5521: if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5522: return new char[] { (char) cp };
5523: }
5524: if (cp <= MAX_CODE_POINT) {
5525: return new char[] { UTF16.getLeadSurrogate(cp),
5526: UTF16.getTrailSurrogate(cp) };
5527: }
5528: }
5529: throw new IllegalArgumentException();
5530: }
5531:
5532: /**
5533: * Cover the JDK API, for convenience. Return a byte representing the directionality of
5534: * the character.
5535: * <br/><b>Note</b>: Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for undefined or
5536: * out-of-bounds characters. <br/><b>Note</b>: The return value must be
5537: * tested using the constants defined in {@link UCharacterEnums.ECharacterDirection}
5538: * since the values are different from the ones defined by <code>java.lang.Character</code>.
5539: * @param cp the code point to check
5540: * @return the directionality of the code point
5541: * @see #getDirection
5542: * @stable ICU 3.0
5543: */
5544: public static byte getDirectionality(int cp) {
5545: return (byte) getDirection(cp);
5546: }
5547:
5548: /**
5549: * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5550: * @param text the characters to check
5551: * @param start the start of the range
5552: * @param limit the limit of the range
5553: * @return the number of code points in the range
5554: * @stable ICU 3.0
5555: */
5556: //#ifndef FOUNDATION
5557: public static int codePointCount(CharSequence text, int start,
5558: int limit) {
5559: //#else
5560: //## public static int codePointCount(String text, int start, int limit) {
5561: //#endif
5562: if (start < 0 || limit < start || limit > text.length()) {
5563: throw new IndexOutOfBoundsException("start (" + start
5564: + ") or limit (" + limit
5565: + ") invalid or out of range 0, " + text.length());
5566: }
5567:
5568: int len = limit - start;
5569: while (limit > start) {
5570: char ch = text.charAt(--limit);
5571: while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE
5572: && limit > start) {
5573: ch = text.charAt(--limit);
5574: if (ch >= MIN_HIGH_SURROGATE
5575: && ch <= MAX_HIGH_SURROGATE) {
5576: --len;
5577: break;
5578: }
5579: }
5580: }
5581: return len;
5582: }
5583:
5584: /**
5585: * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5586: * @param text the characters to check
5587: * @param start the start of the range
5588: * @param limit the limit of the range
5589: * @return the number of code points in the range
5590: * @stable ICU 3.0
5591: */
5592: public static int codePointCount(char[] text, int start, int limit) {
5593: if (start < 0 || limit < start || limit > text.length) {
5594: throw new IndexOutOfBoundsException("start (" + start
5595: + ") or limit (" + limit
5596: + ") invalid or out of range 0, " + text.length);
5597: }
5598:
5599: int len = limit - start;
5600: while (limit > start) {
5601: char ch = text[--limit];
5602: while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE
5603: && limit > start) {
5604: ch = text[--limit];
5605: if (ch >= MIN_HIGH_SURROGATE
5606: && ch <= MAX_HIGH_SURROGATE) {
5607: --len;
5608: break;
5609: }
5610: }
5611: }
5612: return len;
5613: }
5614:
5615: /**
5616: * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
5617: * @param text the characters to check
5618: * @param index the index to adjust
5619: * @param codePointOffset the number of code points by which to offset the index
5620: * @return the adjusted index
5621: * @stable ICU 3.0
5622: */
5623: //#ifndef FOUNDATION
5624: public static int offsetByCodePoints(CharSequence text, int index,
5625: int codePointOffset) {
5626: //#else
5627: //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
5628: //#endif
5629: if (index < 0 || index > text.length()) {
5630: throw new IndexOutOfBoundsException("index ( " + index
5631: + ") out of range 0, " + text.length());
5632: }
5633:
5634: if (codePointOffset < 0) {
5635: while (++codePointOffset <= 0) {
5636: char ch = text.charAt(--index);
5637: while (ch >= MIN_LOW_SURROGATE
5638: && ch <= MAX_LOW_SURROGATE && index > 0) {
5639: ch = text.charAt(--index);
5640: if (ch < MIN_HIGH_SURROGATE
5641: || ch > MAX_HIGH_SURROGATE) {
5642: if (++codePointOffset > 0) {
5643: return index + 1;
5644: }
5645: }
5646: }
5647: }
5648: } else {
5649: int limit = text.length();
5650: while (--codePointOffset >= 0) {
5651: char ch = text.charAt(index++);
5652: while (ch >= MIN_HIGH_SURROGATE
5653: && ch <= MAX_HIGH_SURROGATE && index < limit) {
5654: ch = text.charAt(index++);
5655: if (ch < MIN_LOW_SURROGATE
5656: || ch > MAX_LOW_SURROGATE) {
5657: if (--codePointOffset < 0) {
5658: return index - 1;
5659: }
5660: }
5661: }
5662: }
5663: }
5664:
5665: return index;
5666: }
5667:
5668: /**
5669: * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
5670: * @param text the characters to check
5671: * @param start the start of the range to check
5672: * @param count the length of the range to check
5673: * @param index the index to adjust
5674: * @param codePointOffset the number of code points by which to offset the index
5675: * @return the adjusted index
5676: * @stable ICU 3.0
5677: */
5678: public static int offsetByCodePoints(char[] text, int start,
5679: int count, int index, int codePointOffset) {
5680: int limit = start + count;
5681: if (start < 0 || limit < start || limit > text.length
5682: || index < start || index > limit) {
5683: throw new IndexOutOfBoundsException("index ( " + index
5684: + ") out of range " + start + ", " + limit
5685: + " in array 0, " + text.length);
5686: }
5687:
5688: if (codePointOffset < 0) {
5689: while (++codePointOffset <= 0) {
5690: char ch = text[--index];
5691: if (index < start) {
5692: throw new IndexOutOfBoundsException("index ( "
5693: + index + ") < start (" + start + ")");
5694: }
5695: while (ch >= MIN_LOW_SURROGATE
5696: && ch <= MAX_LOW_SURROGATE && index > start) {
5697: ch = text[--index];
5698: if (ch < MIN_HIGH_SURROGATE
5699: || ch > MAX_HIGH_SURROGATE) {
5700: if (++codePointOffset > 0) {
5701: return index + 1;
5702: }
5703: }
5704: }
5705: }
5706: } else {
5707: while (--codePointOffset >= 0) {
5708: char ch = text[index++];
5709: if (index > limit) {
5710: throw new IndexOutOfBoundsException("index ( "
5711: + index + ") > limit (" + limit + ")");
5712: }
5713: while (ch >= MIN_HIGH_SURROGATE
5714: && ch <= MAX_HIGH_SURROGATE && index < limit) {
5715: ch = text[index++];
5716: if (ch < MIN_LOW_SURROGATE
5717: || ch > MAX_LOW_SURROGATE) {
5718: if (--codePointOffset < 0) {
5719: return index - 1;
5720: }
5721: }
5722: }
5723: }
5724: }
5725:
5726: return index;
5727: }
5728:
5729: // protected data members --------------------------------------------
5730:
/**
 * Database storing the sets of character names.
 * Populated by the static initializer that follows.
 */
static UCharacterName NAME_ = null;

/**
 * Singleton object encapsulating the imported pnames.icu property aliases.
 * Populated by the static initializer that follows.
 */
static UPropertyAliases PNAMES_ = null;

// Static initializer: creates the property alias object and obtains the
// character name database.
static {
    try {
        PNAMES_ = new UPropertyAliases();
        NAME_ = UCharacterName.getInstance();
    } catch (IOException e) {
        // The I/O failure is rethrown as a MissingResourceException that
        // carries only the original message; class and key are left empty.
        // NOTE(review): an earlier comment at this spot said NOT to throw,
        // because ICU might be built modularly without names.icu and
        // pnames.icu. The code does throw, so such a build would fail
        // while this class is being initialized -- confirm which
        // behavior is intended.
        throw new MissingResourceException(e.getMessage(), "", "");
    }
}
5755:
5756: // private variables -------------------------------------------------
5757:
/**
 * Database storing the sets of character properties.
 */
private static final UCharacterProperty PROPERTY_;
/**
 * Reference to PROPERTY_.m_trieIndex_, held in its own static field
 * for optimization.
 */
private static final char[] PROPERTY_TRIE_INDEX_;
/**
 * Reference to PROPERTY_.m_trieData_, held in its own static field
 * for optimization.
 */
private static final char[] PROPERTY_TRIE_DATA_;
/**
 * Value of PROPERTY_.m_trieInitialValue_, held in its own static field
 * for optimization.
 */
private static final int PROPERTY_INITIAL_VALUE_;

/** Case properties: the UCaseProps singleton, or the dummy if loading failed. */
private static final UCaseProps gCsp;
/** Bidi properties: the UBiDiProps singleton, or the dummy if loading failed. */
private static final UBiDiProps gBdp;

// Static initializer for the character property database and the
// case and bidi property objects.
static {
    try {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
    } catch (Exception e) {
        // any failure here is reported as a missing resource; only the
        // message of the original exception is kept
        throw new MissingResourceException(e.getMessage(), "", "");
    }

    /*
     * In ICU4J 3.2, most Unicode properties were loaded from uprops.icu.
     * ICU4J 3.4 adds ucase.icu for case mapping properties and
     * ubidi.icu for bidi/shaping properties and
     * removes case/bidi/shaping properties from uprops.icu.
     *
     * Loading of uprops.icu was always done during class loading of UCharacter.class.
     * In order to maintain performance for all such properties,
     * ucase.icu and ubidi.icu are also loaded during class loading of UCharacter.class.
     * It will not fail if they are missing.
     * These data items are loaded early to avoid having to synchronize access to them,
     * for thread safety and performance.
     *
     * We try to load these data items at most once.
     * If it works, we use the resulting singleton object.
     * If it fails, then we get a dummy object, which always works unless
     * we are seriously out of memory.
     * After UCharacter.class loading, we have a never-changing pointer to either the
     * real singleton or the dummy.
     *
     * This approach is used in Unicode properties APIs that
     * do not have a service object and also do not have an error code parameter.
     * Other API implementations get the singleton themselves
     * (synchronized), store it in the service object, and report errors.
     */
    UCaseProps csp;
    try {
        csp = UCaseProps.getSingleton();
    } catch (IOException e) {
        // fall back to the dummy object instead of failing class initialization
        csp = UCaseProps.getDummy();
    }
    gCsp = csp;

    UBiDiProps bdp;
    try {
        bdp = UBiDiProps.getSingleton();
    } catch (IOException e) {
        // fall back to the dummy object instead of failing class initialization
        bdp = UBiDiProps.getDummy();
    }
    gBdp = bdp;
}
5824:
/**
 * Mask that keeps the low 16 bits (the last char) of a value
 */
private static final int LAST_CHAR_MASK_ = 0xFFFF;

/**
 * Mask that keeps the low 8 bits (the last byte) of a value
 */
private static final int LAST_BYTE_MASK_ = 0xFF;

/**
 * Shift distance of 16 bits
 */
private static final int SHIFT_16_ = 16;

/**
 * Shift distance of 24 bits
 */
private static final int SHIFT_24_ = 24;

/**
 * Decimal radix
 */
private static final int DECIMAL_RADIX_ = 10;
5849:
/**
 * No-break space code point (U+00A0)
 */
private static final int NO_BREAK_SPACE_ = 0xA0;

/**
 * Narrow no-break space code point (U+202F)
 */
private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;

/**
 * Zero width no-break space code point (U+FEFF)
 */
private static final int ZERO_WIDTH_NO_BREAK_SPACE_ = 0xFEFF;

/**
 * Ideographic number zero code point (U+3007)
 */
private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
5869:
5870: /**
5871: * CJK Ideograph, First code point
5872: */
5873: private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
5874:
5875: /**
5876: * CJK Ideograph, Second code point
5877: */
5878: private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
5879:
5880: /**
5881: * CJK Ideograph, Third code point
5882: */
5883: private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
5884:
5885: /**
5886: * CJK Ideograph, Fourth code point
5887: */
5888: private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;
5889:
5890: /**
5891: * CJK Ideograph, FIFTH code point
5892: */
5893: private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
5894:
5895: /**
5896: * CJK Ideograph, Sixth code point
5897: */
5898: private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
5899:
5900: /**
5901: * CJK Ideograph, Seventh code point
5902: */
5903: private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
5904:
5905: /**
5906: * CJK Ideograph, Eighth code point
5907: */
5908: private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
5909:
5910: /**
5911: * CJK Ideograph, Nineth code point
5912: */
5913: private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
5914:
/**
 * Application Program Command code point (U+009F)
 */
private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;

/**
 * Unit separator code point (U+001F)
 */
private static final int UNIT_SEPARATOR_ = 0x001F;

/**
 * Delete code point (U+007F)
 */
private static final int DELETE_ = 0x007F;
/**
 * Upper limit of the first ISO control character range, 0x0 - 0x1F
 */
private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;
/**
 * Shift to get the numeric type
 */
private static final int NUMERIC_TYPE_SHIFT_ = 5;
/**
 * Mask to get the numeric type: three bits starting at bit
 * NUMERIC_TYPE_SHIFT_ (bits 7..5)
 */
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;

/* encoding of fractional and large numbers */
private static final int MAX_SMALL_NUMBER = 0xff;

private static final int FRACTION_NUM_SHIFT = 3; /* numerator: bits 7..3 */
private static final int FRACTION_DEN_MASK = 7; /* denominator: bits 2..0 */

private static final int FRACTION_MAX_NUM = 31;
private static final int FRACTION_DEN_OFFSET = 2; /* denominator values are 2..9 */

private static final int FRACTION_MIN_DEN = FRACTION_DEN_OFFSET;
/* largest denominator: 2 + 7 = 9 */
private static final int FRACTION_MAX_DEN = FRACTION_MIN_DEN
                                            + FRACTION_DEN_MASK;

private static final int LARGE_MANT_SHIFT = 4; /* mantissa: bits 7..4 */
private static final int LARGE_EXP_MASK = 0xf; /* exponent: bits 3..0 */
private static final int LARGE_EXP_OFFSET = 2; /* regular exponents 2..17 */
private static final int LARGE_EXP_OFFSET_EXTRA = 18; /* extra large exponents 18..33 */

private static final int LARGE_MIN_EXP = LARGE_EXP_OFFSET;
/* largest regular exponent: 2 + 15 = 17 */
private static final int LARGE_MAX_EXP = LARGE_MIN_EXP
                                         + LARGE_EXP_MASK;
/* largest extra large exponent: 18 + 15 = 33 */
private static final int LARGE_MAX_EXP_EXTRA = LARGE_EXP_OFFSET_EXTRA
                                               + LARGE_EXP_MASK;
5965:
5966: /**
5967: * Han digit characters
5968: */
5969: private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
5970: private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
5971: private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
5972: private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
5973: private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
5974: private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
5975: private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
5976: private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
5977: private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
5978: private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
5979: private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
5980: private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
5981: private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
5982: private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
5983: private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
5984: private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
5985: private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x824c;
5986: private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
5987:
/**
 * Zero Width Non Joiner code point (U+200C).
 * Equivalent to icu4c ZWNJ.
 */
private static final int ZERO_WIDTH_NON_JOINER_ = 0x200c;
/**
 * Zero Width Joiner code point (U+200D).
 * Equivalent to icu4c ZWJ.
 */
private static final int ZERO_WIDTH_JOINER_ = 0x200d;
5998:
/*
 * Properties in vector word 2
 * Bits
 * 31..24 More binary properties (see UCharacterProperty)
 * 23..19 reserved
 * 18..14 Sentence Break
 * 13..10 Word Break
 *  9.. 5 Grapheme Cluster Break
 *  4.. 0 Decomposition Type
 */
/* Sentence Break: bits 18..14 */
private static final int SB_MASK = 0x0007c000;
private static final int SB_SHIFT = 14;

/* Word Break: bits 13..10 */
private static final int WB_MASK = 0x00003c00;
private static final int WB_SHIFT = 10;

/* Grapheme Cluster Break: bits 9..5 */
private static final int GCB_MASK = 0x000003e0;
private static final int GCB_SHIFT = 5;

/**
 * Integer properties mask for decomposition type (bits 4..0, so no shift
 * is needed).
 * Equivalent to icu4c UPROPS_DT_MASK.
 */
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

/*
 * Properties in vector word 0
 * Bits
 * 31..24 DerivedAge version major/minor one nibble each (see UCharacterProperty)
 * 23..18 Line Break
 * 17..15 East Asian Width
 * 14.. 7 UBlockCode
 *  6.. 0 UScriptCode
 */

/**
 * Integer properties mask for East Asian cell width (bits 17..15).
 * Equivalent to icu4c UPROPS_EA_MASK
 */
private static final int EAST_ASIAN_MASK_ = 0x00038000;
/**
 * Integer properties shift for East Asian cell width.
 * Equivalent to icu4c UPROPS_EA_SHIFT
 */
private static final int EAST_ASIAN_SHIFT_ = 15;
/**
 * Integer properties mask for line breaks (bits 23..18).
 * Equivalent to icu4c UPROPS_LB_MASK
 */
private static final int LINE_BREAK_MASK_ = 0x00FC0000;
/**
 * Integer properties shift for line breaks.
 * Equivalent to icu4c UPROPS_LB_SHIFT
 */
private static final int LINE_BREAK_SHIFT_ = 18;
/**
 * Integer properties mask for blocks (bits 14..7).
 * Equivalent to icu4c UPROPS_BLOCK_MASK
 */
private static final int BLOCK_MASK_ = 0x00007f80;
/**
 * Integer properties shift for blocks.
 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
 */
private static final int BLOCK_SHIFT_ = 7;
/**
 * Integer properties mask for scripts (bits 6..0, so no shift is needed).
 * Equivalent to icu4c UPROPS_SCRIPT_MASK
 */
private static final int SCRIPT_MASK_ = 0x0000007f;
6069:
6070: // private constructor -----------------------------------------------
6071: ///CLOVER:OFF
/**
 * Private no-argument constructor. It does nothing and exists only to
 * prevent instantiation from outside this class.
 */
private UCharacter() {
}
6077:
6078: ///CLOVER:ON
6079: // private methods ---------------------------------------------------
6080:
6081: /**
6082: * Getting the digit values of characters like 'A' - 'Z', normal,
6083: * half-width and full-width. This method assumes that the other digit
6084: * characters are checked by the calling method.
6085: * @param ch character to test
6086: * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
6087: * its corresponding digit will be returned.
6088: */
6089: private static int getEuropeanDigit(int ch) {
6090: if ((ch > 0x7a && ch < 0xff21) || ch < 0x41
6091: || (ch > 0x5a && ch < 0x61) || ch > 0xff5a
6092: || (ch > 0xff31 && ch < 0xff41)) {
6093: return -1;
6094: }
6095: if (ch <= 0x7a) {
6096: // ch >= 0x41 or ch < 0x61
6097: return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
6098: }
6099: // ch >= 0xff21
6100: if (ch <= 0xff3a) {
6101: return ch + 10 - 0xff21;
6102: }
6103: // ch >= 0xff41 && ch <= 0xff5a
6104: return ch + 10 - 0xff41;
6105: }
6106:
6107: /**
6108: * Gets the numeric type of the property argument
6109: * @param props 32 bit property
6110: * @return the numeric type
6111: */
6112: private static int getNumericType(int props) {
6113: return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
6114: }
6115:
6116: /**
6117: * Gets the property value at the index.
6118: * This is optimized.
6119: * Note this is alittle different from CharTrie the index m_trieData_
6120: * is never negative.
6121: * This is a duplicate of UCharacterProperty.getProperty. For optimization
6122: * purposes, this method calls the trie data directly instead of through
6123: * UCharacterProperty.getProperty.
6124: * @param ch code point whose property value is to be retrieved
6125: * @return property value of code point
6126: * @stable ICU 2.6
6127: */
6128: private static final int getProperty(int ch) {
6129: if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
6130: || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
6131: // BMP codepoint 0000..D7FF or DC00..FFFF
6132: try { // using try for ch < 0 is faster than using an if statement
6133: return PROPERTY_TRIE_DATA_[(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
6134: + (ch & 0x1f)];
6135: } catch (ArrayIndexOutOfBoundsException e) {
6136: return PROPERTY_INITIAL_VALUE_;
6137: }
6138: }
6139: if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
6140: // lead surrogate D800..DBFF
6141: return PROPERTY_TRIE_DATA_[(PROPERTY_TRIE_INDEX_[(0x2800 >> 5)
6142: + (ch >> 5)] << 2)
6143: + (ch & 0x1f)];
6144: }
6145: // for optimization
6146: if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
6147: // supplementary code point 10000..10FFFF
6148: // look at the construction of supplementary characters
6149: // trail forms the ends of it.
6150: return PROPERTY_.m_trie_.getSurrogateValue(UTF16
6151: .getLeadSurrogate(ch), (char) (ch & 0x3ff));
6152: }
6153: // return m_dataOffset_ if there is an error, in this case we return
6154: // the default value: m_initialValue_
6155: // we cannot assume that m_initialValue_ is at offset 0
6156: // this is for optimization.
6157: return PROPERTY_INITIAL_VALUE_;
6158: }
6159: }
|