0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 2000-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007: package com.ibm.icu.text;
0008:
0009: import com.ibm.icu.impl.NormalizerImpl;
0010: import com.ibm.icu.impl.UCharacterProperty;
0011: import com.ibm.icu.lang.UCharacter;
0012: import com.ibm.icu.util.VersionInfo;
0013:
0014: import java.text.CharacterIterator;
0015: import com.ibm.icu.impl.Utility;
0016:
0017: /**
0018: * Unicode Normalization
0019: *
0020: * <h2>Unicode normalization API</h2>
0021: *
0022: * <code>normalize</code> transforms Unicode text into an equivalent composed or
0023: * decomposed form, allowing for easier sorting and searching of text.
0024: * <code>normalize</code> supports the standard normalization forms described in
0025: * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
0026: * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
0027: *
0028: * Characters with accents or other adornments can be encoded in
0029: * several different ways in Unicode. For example, take the character A-acute.
0030: * In Unicode, this can be encoded as a single character (the
0031: * "composed" form):
0032: *
0033: * <p>
0034: * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
0035: * </p>
0036: *
0037: * or as two separate characters (the "decomposed" form):
0038: *
0039: * <p>
0040: * 0041 LATIN CAPITAL LETTER A
0041: * 0301 COMBINING ACUTE ACCENT
0042: * </p>
0043: *
0044: * To a user of your program, however, both of these sequences should be
0045: * treated as the same "user-level" character "A with acute accent". When you
0046: * are searching or comparing text, you must ensure that these two sequences are
0047: * treated equivalently. In addition, you must handle characters with more than
0048: * one accent. Sometimes the order of a character's combining accents is
0049: * significant, while in other cases accent sequences in different orders are
0050: * really equivalent.
0051: *
0052: * Similarly, the string "ffi" can be encoded as three separate letters:
0053: *
0054: * <p>
0055: * 0066 LATIN SMALL LETTER F
0056: * 0066 LATIN SMALL LETTER F
0057: * 0069 LATIN SMALL LETTER I
0058: * </p>
0059: *
0060: * or as the single character
0061: *
0062: * <p>
0063: * FB03 LATIN SMALL LIGATURE FFI
0064: * </p>
0065: *
0066: * The ffi ligature is not a distinct semantic character, and strictly speaking
0067: * it shouldn't be in Unicode at all, but it was included for compatibility
0068: * with existing character sets that already provided it. The Unicode standard
0069: * identifies such characters by giving them "compatibility" decompositions
0070: * into the corresponding semantic characters. When sorting and searching, you
0071: * will often want to use these mappings.
0072: *
0073: * <code>normalize</code> helps solve these problems by transforming text into
0074: * the canonical composed and decomposed forms as shown in the first example
0075: * above. In addition, you can have it perform compatibility decompositions so
0076: * that you can treat compatibility characters the same as their equivalents.
0077: * Finally, <code>normalize</code> rearranges accents into the proper canonical
0078: * order, so that you do not have to worry about accent rearrangement on your
0079: * own.
0080: *
0081: * Form FCD, "Fast C or D", is also designed for collation.
0082: * It makes it possible to work on strings that are not necessarily normalized
0083: * with an algorithm (like in collation) that works under "canonical closure",
0084: * i.e., it treats precomposed characters and their decomposed equivalents the
0085: * same.
0086: *
0087: * It is not a normalization form because it does not provide for uniqueness of
0088: * representation. Multiple strings may be canonically equivalent (their NFDs
0089: * are identical) and may all conform to FCD without being identical themselves.
0090: *
0091: * The form is defined such that the "raw decomposition", the recursive
0092: * canonical decomposition of each character, results in a string that is
0093: * canonically ordered. This means that precomposed characters are allowed for
0094: * as long as their decompositions do not need canonical reordering.
0095: *
0096: * Its advantage for a process like collation is that all NFD and most NFC texts
0097: * - and many unnormalized texts - already conform to FCD and do not need to be
0098: * normalized (NFD) for such a process. The FCD quick check will return YES for
0099: * most strings in practice.
0100: *
0101: * normalize(FCD) may be implemented with NFD.
0102: *
0103: * For more details on FCD see the collation design document:
0104: * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
0105: *
0106: * ICU collation performs either NFD or FCD normalization automatically if
0107: * normalization is turned on for the collator object. Beyond collation and
0108: * string search, normalized strings may be useful for string equivalence
0109: * comparisons, transliteration/transcription, unique representations, etc.
0110: *
0111: * The W3C generally recommends exchanging texts in NFC.
0112: * Note also that most legacy character encodings use only precomposed forms and
0113: * often do not encode any combining marks by themselves. For conversion to such
0114: * character encodings the Unicode text needs to be normalized to NFC.
0115: * For more usage examples, see the Unicode Standard Annex.
0116: * @stable ICU 2.8
0117: */
0118:
0119: public final class Normalizer implements Cloneable {
0120:
0121: //-------------------------------------------------------------------------
0122: // Private data
0123: //-------------------------------------------------------------------------
// Internal character buffer (duplicated by clone()); initially sized for 100 chars.
0124: private char[] buffer = new char[100];
// Indexes into buffer, all starting at 0.
// NOTE(review): presumably start / current position / limit of the buffered text --
// the code that uses them is outside this section; confirm.
0125: private int bufferStart = 0;
0126: private int bufferPos = 0;
0127: private int bufferLimit = 0;
0128:
0129: // This tells us what the bits in the "mode" object mean.
// NOTE(review): no use of these bit constants is visible in this section, and the
// public Mode constants are constructed with the plain values 1-6 -- confirm that
// these are still needed.
0130: private static final int COMPAT_BIT = 1;
0131: private static final int DECOMP_BIT = 2;
0132: private static final int COMPOSE_BIT = 4;
0133:
0134: // The input text and our position in it
0135: private UCharacterIterator text; // assigned by each of the constructors
0136: private Mode mode = NFC; // normalization mode; the constructors replace this initial value
0137: private int options = 0; // normalization option bits; 0 means no options
// NOTE(review): presumably positions within text; their use is outside this section -- confirm.
0138: private int currentIndex;
0139: private int nextIndex;
0140:
0141: /**
0142: * Options bit set value to select Unicode 3.2 normalization
0143: * (except NormalizationCorrections).
0144: * At most one Unicode version can be selected at a time.
* Pass it as the options argument of the constructors or of
* {@link #compose(String, boolean, int)}; use 0 for the default behavior.
0145: * @stable ICU 2.6
0146: */
0147: public static final int UNICODE_3_2 = 0x20;
0148:
0149: /**
0150: * Constant indicating that the end of the iteration has been reached.
0151: * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
0152: * @stable ICU 2.8
0153: */
0154: public static final int DONE = UCharacterIterator.DONE;
0155:
0156: /**
0157: * Constants for normalization modes.
0158: * @stable ICU 2.8
0159: */
0160: public static class Mode {
0161: private int modeValue;
0162:
0163: private Mode(int value) {
0164: modeValue = value;
0165: }
0166:
0167: /**
0168: * This method is used for method dispatch
0169: * @stable ICU 2.6
0170: */
0171: protected int normalize(char[] src, int srcStart, int srcLimit,
0172: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0173: int srcLen = (srcLimit - srcStart);
0174: int destLen = (destLimit - destStart);
0175: if (srcLen > destLen) {
0176: return srcLen;
0177: }
0178: System.arraycopy(src, srcStart, dest, destStart, srcLen);
0179: return srcLen;
0180: }
0181:
0182: /**
0183: * This method is used for method dispatch
0184: * @stable ICU 2.6
0185: */
0186: protected int normalize(char[] src, int srcStart, int srcLimit,
0187: char[] dest, int destStart, int destLimit, int options) {
0188: return normalize(src, srcStart, srcLimit, dest, destStart,
0189: destLimit, NormalizerImpl.getNX(options));
0190: }
0191:
0192: /**
0193: * This method is used for method dispatch
0194: * @stable ICU 2.6
0195: */
0196: protected String normalize(String src, int options) {
0197: return src;
0198: }
0199:
0200: /**
0201: * This method is used for method dispatch
0202: * @stable ICU 2.8
0203: */
0204: protected int getMinC() {
0205: return -1;
0206: }
0207:
0208: /**
0209: * This method is used for method dispatch
0210: * @stable ICU 2.8
0211: */
0212: protected int getMask() {
0213: return -1;
0214: }
0215:
0216: /**
0217: * This method is used for method dispatch
0218: * @stable ICU 2.8
0219: */
0220: protected IsPrevBoundary getPrevBoundary() {
0221: return null;
0222: }
0223:
0224: /**
0225: * This method is used for method dispatch
0226: * @stable ICU 2.8
0227: */
0228: protected IsNextBoundary getNextBoundary() {
0229: return null;
0230: }
0231:
0232: /**
0233: * This method is used for method dispatch
0234: * @stable ICU 2.6
0235: */
0236: protected QuickCheckResult quickCheck(char[] src, int start,
0237: int limit, boolean allowMaybe, UnicodeSet nx) {
0238: if (allowMaybe) {
0239: return MAYBE;
0240: }
0241: return NO;
0242: }
0243:
0244: /**
0245: * This method is used for method dispatch
0246: * @stable ICU 2.8
0247: */
0248: protected boolean isNFSkippable(int c) {
0249: return true;
0250: }
0251: }
0252:
0253: /**
0254: * No decomposition/composition.
0255: * @stable ICU 2.8
0256: */
0257: public static final Mode NONE = new Mode(1);
0258:
0259: /**
0260: * Canonical decomposition.
0261: * @stable ICU 2.8
0262: */
0263: public static final Mode NFD = new NFDMode(2);
0264:
0265: private static final class NFDMode extends Mode {
0266: private NFDMode(int value) {
0267: super (value);
0268: }
0269:
0270: protected int normalize(char[] src, int srcStart, int srcLimit,
0271: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0272: int[] trailCC = new int[1];
0273: return NormalizerImpl.decompose(src, srcStart, srcLimit,
0274: dest, destStart, destLimit, false, trailCC, nx);
0275: }
0276:
0277: protected String normalize(String src, int options) {
0278: return decompose(src, false);
0279: }
0280:
0281: protected int getMinC() {
0282: return NormalizerImpl.MIN_WITH_LEAD_CC;
0283: }
0284:
0285: protected IsPrevBoundary getPrevBoundary() {
0286: return new IsPrevNFDSafe();
0287: }
0288:
0289: protected IsNextBoundary getNextBoundary() {
0290: return new IsNextNFDSafe();
0291: }
0292:
0293: protected int getMask() {
0294: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFD);
0295: }
0296:
0297: protected QuickCheckResult quickCheck(char[] src, int start,
0298: int limit, boolean allowMaybe, UnicodeSet nx) {
0299: return NormalizerImpl
0300: .quickCheck(
0301: src,
0302: start,
0303: limit,
0304: NormalizerImpl
0305: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE),
0306: NormalizerImpl.QC_NFD, 0, allowMaybe, nx);
0307: }
0308:
0309: protected boolean isNFSkippable(int c) {
0310: return NormalizerImpl.isNFSkippable(c, this ,
0311: (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFD));
0312: }
0313: }
0314:
0315: /**
0316: * Compatibility decomposition.
0317: * @stable ICU 2.8
0318: */
0319: public static final Mode NFKD = new NFKDMode(3);
0320:
0321: private static final class NFKDMode extends Mode {
0322: private NFKDMode(int value) {
0323: super (value);
0324: }
0325:
0326: protected int normalize(char[] src, int srcStart, int srcLimit,
0327: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0328: int[] trailCC = new int[1];
0329: return NormalizerImpl.decompose(src, srcStart, srcLimit,
0330: dest, destStart, destLimit, true, trailCC, nx);
0331: }
0332:
0333: protected String normalize(String src, int options) {
0334: return decompose(src, true);
0335: }
0336:
0337: protected int getMinC() {
0338: return NormalizerImpl.MIN_WITH_LEAD_CC;
0339: }
0340:
0341: protected IsPrevBoundary getPrevBoundary() {
0342: return new IsPrevNFDSafe();
0343: }
0344:
0345: protected IsNextBoundary getNextBoundary() {
0346: return new IsNextNFDSafe();
0347: }
0348:
0349: protected int getMask() {
0350: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKD);
0351: }
0352:
0353: protected QuickCheckResult quickCheck(char[] src, int start,
0354: int limit, boolean allowMaybe, UnicodeSet nx) {
0355: return NormalizerImpl
0356: .quickCheck(
0357: src,
0358: start,
0359: limit,
0360: NormalizerImpl
0361: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE),
0362: NormalizerImpl.QC_NFKD,
0363: NormalizerImpl.OPTIONS_COMPAT, allowMaybe,
0364: nx);
0365: }
0366:
0367: protected boolean isNFSkippable(int c) {
0368: return NormalizerImpl.isNFSkippable(c, this ,
0369: (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKD));
0370: }
0371: }
0372:
0373: /**
0374: * Canonical decomposition followed by canonical composition.
0375: * @stable ICU 2.8
0376: */
0377: public static final Mode NFC = new NFCMode(4);
0378:
0379: private static final class NFCMode extends Mode {
0380: private NFCMode(int value) {
0381: super (value);
0382: }
0383:
0384: protected int normalize(char[] src, int srcStart, int srcLimit,
0385: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0386: return NormalizerImpl.compose(src, srcStart, srcLimit,
0387: dest, destStart, destLimit, 0, nx);
0388: }
0389:
0390: protected String normalize(String src, int options) {
0391: return compose(src, false, options);
0392: }
0393:
0394: protected int getMinC() {
0395: return NormalizerImpl
0396: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE);
0397: }
0398:
0399: protected IsPrevBoundary getPrevBoundary() {
0400: return new IsPrevTrueStarter();
0401: }
0402:
0403: protected IsNextBoundary getNextBoundary() {
0404: return new IsNextTrueStarter();
0405: }
0406:
0407: protected int getMask() {
0408: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFC);
0409: }
0410:
0411: protected QuickCheckResult quickCheck(char[] src, int start,
0412: int limit, boolean allowMaybe, UnicodeSet nx) {
0413: return NormalizerImpl
0414: .quickCheck(
0415: src,
0416: start,
0417: limit,
0418: NormalizerImpl
0419: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE),
0420: NormalizerImpl.QC_NFC, 0, allowMaybe, nx);
0421: }
0422:
0423: protected boolean isNFSkippable(int c) {
0424: return NormalizerImpl
0425: .isNFSkippable(
0426: c,
0427: this ,
0428: (NormalizerImpl.CC_MASK
0429: | NormalizerImpl.COMBINES_ANY | (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)));
0430: }
0431: };
0432:
0433: /**
0434: * Default normalization.
0435: * @stable ICU 2.8
0436: */
0437: public static final Mode DEFAULT = NFC;
0438:
0439: /**
0440: * Compatibility decomposition followed by canonical composition.
0441: * @stable ICU 2.8
0442: */
0443: public static final Mode NFKC = new NFKCMode(5);
0444:
0445: private static final class NFKCMode extends Mode {
0446: private NFKCMode(int value) {
0447: super (value);
0448: }
0449:
0450: protected int normalize(char[] src, int srcStart, int srcLimit,
0451: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0452: return NormalizerImpl.compose(src, srcStart, srcLimit,
0453: dest, destStart, destLimit,
0454: NormalizerImpl.OPTIONS_COMPAT, nx);
0455: }
0456:
0457: protected String normalize(String src, int options) {
0458: return compose(src, true, options);
0459: }
0460:
0461: protected int getMinC() {
0462: return NormalizerImpl
0463: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE);
0464: }
0465:
0466: protected IsPrevBoundary getPrevBoundary() {
0467: return new IsPrevTrueStarter();
0468: }
0469:
0470: protected IsNextBoundary getNextBoundary() {
0471: return new IsNextTrueStarter();
0472: }
0473:
0474: protected int getMask() {
0475: return (NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFKC);
0476: }
0477:
0478: protected QuickCheckResult quickCheck(char[] src, int start,
0479: int limit, boolean allowMaybe, UnicodeSet nx) {
0480: return NormalizerImpl
0481: .quickCheck(
0482: src,
0483: start,
0484: limit,
0485: NormalizerImpl
0486: .getFromIndexesArr(NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE),
0487: NormalizerImpl.QC_NFKC,
0488: NormalizerImpl.OPTIONS_COMPAT, allowMaybe,
0489: nx);
0490: }
0491:
0492: protected boolean isNFSkippable(int c) {
0493: return NormalizerImpl
0494: .isNFSkippable(
0495: c,
0496: this ,
0497: (NormalizerImpl.CC_MASK
0498: | NormalizerImpl.COMBINES_ANY | (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)));
0499: }
0500: };
0501:
0502: /**
0503: * "Fast C or D" form.
0504: * @stable ICU 2.8
0505: */
0506: public static final Mode FCD = new FCDMode(6);
0507:
0508: private static final class FCDMode extends Mode {
0509: private FCDMode(int value) {
0510: super (value);
0511: }
0512:
0513: protected int normalize(char[] src, int srcStart, int srcLimit,
0514: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
0515: return NormalizerImpl.makeFCD(src, srcStart, srcLimit,
0516: dest, destStart, destLimit, nx);
0517: }
0518:
0519: protected String normalize(String src, int options) {
0520: return makeFCD(src, options);
0521: }
0522:
0523: protected int getMinC() {
0524: return NormalizerImpl.MIN_WITH_LEAD_CC;
0525: }
0526:
0527: protected IsPrevBoundary getPrevBoundary() {
0528: return new IsPrevNFDSafe();
0529: }
0530:
0531: protected IsNextBoundary getNextBoundary() {
0532: return new IsNextNFDSafe();
0533: }
0534:
0535: protected int getMask() {
0536: return NormalizerImpl.CC_MASK | NormalizerImpl.QC_NFD;
0537: }
0538:
0539: protected QuickCheckResult quickCheck(char[] src, int start,
0540: int limit, boolean allowMaybe, UnicodeSet nx) {
0541: return NormalizerImpl.checkFCD(src, start, limit, nx) ? YES
0542: : NO;
0543: }
0544:
0545: protected boolean isNFSkippable(int c) {
0546: /* FCD: skippable if lead cc==0 and trail cc<=1 */
0547: return (NormalizerImpl.getFCD16(c) > 1);
0548: }
0549: };
0550:
0551: /**
0552: * Null operation for use with the {@link #Normalizer constructors}
0553: * and the static {@link #normalize normalize} method. This value tells
0554: * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
0555: * from the underlying String or CharacterIterator. If you have code which
0556: * requires raw text at some times and normalized text at others, you can
0557: * use <tt>NO_OP</tt> for the cases where you want raw text, rather
0558: * than having a separate code path that bypasses <tt>Normalizer</tt>
0559: * altogether.
0560: * <p>
0561: * @see #setMode
0562: * @deprecated ICU 2.8. Use Normalizer.NONE
0563: * @see #NONE
0564: */
0565: public static final Mode NO_OP = NONE;
0566:
0567: /**
0568: * Canonical decomposition followed by canonical composition. Used with the
0569: * {@link #Normalizer constructors} and the static
0570: * {@link #normalize normalize} method to determine the operation to be
0571: * performed.
0572: * <p>
0573: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0574: * off, this operation produces output that is in
0575: * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Normalization
0576: * Form</a>
0577: * <b>C</b>.
0578: * <p>
0579: * @see #setMode
0580: * @deprecated ICU 2.8. Use Normalizer.NFC
0581: * @see #NFC
0582: */
0583: public static final Mode COMPOSE = NFC;
0584:
0585: /**
0586: * Compatibility decomposition followed by canonical composition.
0587: * Used with the {@link #Normalizer constructors} and the static
0588: * {@link #normalize normalize} method to determine the operation to be
0589: * performed.
0590: * <p>
0591: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0592: * off, this operation produces output that is in
0593: * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Normalization
0594: * Form</a>
0595: * <b>KC</b>.
0596: * <p>
0597: * @see #setMode
0598: * @deprecated ICU 2.8. Use Normalizer.NFKC
0599: * @see #NFKC
0600: */
0601: public static final Mode COMPOSE_COMPAT = NFKC;
0602:
0603: /**
0604: * Canonical decomposition. This value is passed to the
0605: * {@link #Normalizer constructors} and the static
0606: * {@link #normalize normalize}
0607: * method to determine the operation to be performed.
0608: * <p>
0609: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0610: * off, this operation produces output that is in
0611: * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Normalization
0612: * Form</a>
0613: * <b>D</b>.
0614: * <p>
0615: * @see #setMode
0616: * @deprecated ICU 2.8. Use Normalizer.NFD
0617: * @see #NFD
0618: */
0619: public static final Mode DECOMP = NFD;
0620:
0621: /**
0622: * Compatibility decomposition. This value is passed to the
0623: * {@link #Normalizer constructors} and the static
0624: * {@link #normalize normalize}
0625: * method to determine the operation to be performed.
0626: * <p>
0627: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0628: * off, this operation produces output that is in
0629: * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Normalization
0630: * Form</a>
0631: * <b>KD</b>.
0632: * <p>
0633: * @see #setMode
0634: * @deprecated ICU 2.8. Use Normalizer.NFKD
0635: * @see #NFKD
0636: */
0637: public static final Mode DECOMP_COMPAT = NFKD;
0638:
0639: /**
0640: * Option to disable Hangul/Jamo composition and decomposition.
0641: * This option applies to Korean text,
0642: * which can be represented either in the Jamo alphabet or in Hangul
0643: * characters, which are really just two or three Jamo combined
0644: * into one visual glyph. Since Jamo takes up more storage space than
0645: * Hangul, applications that process only Hangul text may wish to turn
0646: * this option on when decomposing text.
0647: * <p>
0648: * The Unicode standard treats Hangul to Jamo conversion as a
0649: * canonical decomposition, so this option must be turned <b>off</b> if you
0650: * wish to transform strings into one of the standard
0651: * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
0652: * Unicode Normalization Forms</a>.
0653: * <p>
0654: * @see #setOption
0655: * @deprecated ICU 2.8. This option is no longer supported. TODO: check with Ram
0656: */
0657: public static final int IGNORE_HANGUL = 0x0001;
0658:
0659: /**
0660: * Result values for quickCheck().
0661: * For details see Unicode Technical Report 15.
0662: * @stable ICU 2.8
0663: */
0664: public static final class QuickCheckResult {
0665: private int resultValue;
0666:
0667: private QuickCheckResult(int value) {
0668: resultValue = value;
0669: }
0670: }
0671:
0672: /**
0673: * Indicates that string is not in the normalized format
0674: * @stable ICU 2.8
0675: */
0676: public static final QuickCheckResult NO = new QuickCheckResult(0);
0677:
0678: /**
0679: * Indicates that string is in the normalized format
0680: * @stable ICU 2.8
0681: */
0682: public static final QuickCheckResult YES = new QuickCheckResult(1);
0683:
0684: /**
0685: * Indicates it cannot be determined if string is in the normalized
0686: * format without further thorough checks.
0687: * @stable ICU 2.8
0688: */
0689: public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
0690:
0691: /**
0692: * Option bit for compare:
0693: * Case sensitively compare the strings
* (same value as {@link UCharacter#FOLD_CASE_DEFAULT}).
0694: * @stable ICU 2.8
0695: */
0696: public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT;
0697:
0698: /**
0699: * Option bit for compare:
0700: * Both input strings are assumed to fulfill FCD conditions.
* This is bit 17 (0x20000) of the compare options.
0701: * @stable ICU 2.8
0702: */
0703: public static final int INPUT_IS_FCD = 0x20000;
0704:
0705: /**
0706: * Option bit for compare:
0707: * Perform case-insensitive comparison.
* This is bit 16 (0x10000) of the compare options.
0708: * @stable ICU 2.8
0709: */
0710: public static final int COMPARE_IGNORE_CASE = 0x10000;
0711:
0712: /**
0713: * Option bit for compare:
0714: * Compare strings in code point order instead of code unit order.
* This is bit 15 (0x8000) of the compare options.
0715: * @stable ICU 2.8
0716: */
0717: public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
0718:
0719: /**
0720: * Option value for case folding: exclude the mappings for dotted I
0721: * and dotless i marked with 'I' in CaseFolding.txt
* (same value as {@link UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I}).
0722: * @stable ICU 2.8
0723: */
0724: public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
0725:
0726: /**
0727: * Lowest-order bit number of compare() options bits corresponding to
0728: * normalization options bits.
0729: *
0730: * The options parameter for compare() uses most bits for
0731: * itself and for various comparison and folding flags.
0732: * The most significant bits, however, are shifted down and passed on
0733: * to the normalization implementation.
0734: * (That is, from compare(..., options, ...),
0735: * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
0736: * internal normalization functions.)
* Following that rule, a normalization option such as {@link #UNICODE_3_2}
* would presumably be requested from compare() as
* <code>UNICODE_3_2 &lt;&lt; COMPARE_NORM_OPTIONS_SHIFT</code>; compare() itself
* is outside this section, so confirm against its documentation.
0737: *
0738: * @see #compare
0739: * @stable ICU 2.6
0740: */
0741: public static final int COMPARE_NORM_OPTIONS_SHIFT = 20;
0742:
0743: //-------------------------------------------------------------------------
0744: // Constructors
0745: //-------------------------------------------------------------------------
0746:
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of a given string.
 * <p>
 * The string is wrapped in a {@link UCharacterIterator}; the mode and the
 * option bits are stored as given.
 *
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public Normalizer(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    this.mode = mode;
    options = opt;
}
0770:
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of the given text.
 * <p>
 * The iterator is cloned before it is wrapped, so this object works on
 * its own copy of the caller's iterator.
 *
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    this.mode = mode;
    options = opt;
}
0792:
/**
 * Creates a new <tt>Normalizer</tt> object for iterating over the
 * normalized form of the given text.
 * <p>
 * The iterator is cloned, so this object works on its own copy of the
 * caller's iterator.
 *
 * @param iter    The input text to be normalized. The normalization
 *                will start at the beginning of the string.
 * @param mode    The normalization mode.
 * @param options The normalization options, ORed together (0 for no options).
 * @exception IllegalStateException if the iterator cannot be cloned
 * @stable ICU 2.6
 */
public Normalizer(UCharacterIterator iter, Mode mode, int options) {
    UCharacterIterator privateCopy;
    try {
        privateCopy = (UCharacterIterator) iter.clone();
    } catch (CloneNotSupportedException e) {
        // A failed clone aborts construction with an unchecked exception.
        throw new IllegalStateException(e.toString());
    }
    this.text = privateCopy;
    this.mode = mode;
    this.options = options;
}
0813:
0814: /**
0815: * Clones this <tt>Normalizer</tt> object. All properties of this
0816: * object are duplicated in the new object, including the cloning of any
0817: * {@link CharacterIterator} that was passed in to the constructor
0818: * or to {@link #setText(CharacterIterator) setText}.
0819: * However, the text storage underlying
0820: * the <tt>CharacterIterator</tt> is not duplicated unless the
0821: * iterator's <tt>clone</tt> method does so.
0822: * @stable ICU 2.8
0823: */
0824: public Object clone() {
0825: try {
0826: Normalizer copy = (Normalizer) super .clone();
0827: copy.text = (UCharacterIterator) text.clone();
0828: //clone the internal buffer
0829: if (buffer != null) {
0830: copy.buffer = new char[buffer.length];
0831: System.arraycopy(buffer, 0, copy.buffer, 0,
0832: buffer.length);
0833: }
0834: return copy;
0835: } catch (CloneNotSupportedException e) {
0836: throw new IllegalStateException(e.toString());
0837: }
0838: }
0839:
0840: //--------------------------------------------------------------------------
0841: // Static Utility methods
0842: //--------------------------------------------------------------------------
0843:
0844: /**
0845: * Compose a string.
0846: * The string will be composed to according the the specified mode.
0847: * @param str The string to compose.
0848: * @param compat If true the string will be composed accoding to
0849: * NFKC rules and if false will be composed according to
0850: * NFC rules.
0851: * @return String The composed string
0852: * @stable ICU 2.8
0853: */
0854: public static String compose(String str, boolean compat) {
0855: return compose(str, compat, 0);
0856: }
0857:
0858: /**
0859: * Compose a string.
0860: * The string will be composed to according the the specified mode.
0861: * @param str The string to compose.
0862: * @param compat If true the string will be composed accoding to
0863: * NFKC rules and if false will be composed according to
0864: * NFC rules.
0865: * @param options The only recognized option is UNICODE_3_2
0866: * @return String The composed string
0867: * @stable ICU 2.6
0868: */
0869: public static String compose(String str, boolean compat, int options) {
0870:
0871: char[] dest = new char[str.length() * MAX_BUF_SIZE_COMPOSE];
0872: int destSize = 0;
0873: char[] src = str.toCharArray();
0874: UnicodeSet nx = NormalizerImpl.getNX(options);
0875:
0876: /* reset options bits that should only be set here or inside compose() */
0877: options &= ~(NormalizerImpl.OPTIONS_SETS_MASK
0878: | NormalizerImpl.OPTIONS_COMPAT | NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
0879:
0880: if (compat) {
0881: options |= NormalizerImpl.OPTIONS_COMPAT;
0882: }
0883:
0884: for (;;) {
0885: destSize = NormalizerImpl.compose(src, 0, src.length, dest,
0886: 0, dest.length, options, nx);
0887: if (destSize <= dest.length) {
0888: return new String(dest, 0, destSize);
0889: } else {
0890: dest = new char[destSize];
0891: }
0892: }
0893: }
0894:
0895: /**
0896: * Compose a string.
0897: * The string will be composed to according the the specified mode.
0898: * @param source The char array to compose.
0899: * @param target A char buffer to receive the normalized text.
0900: * @param compat If true the char array will be composed accoding to
0901: * NFKC rules and if false will be composed according to
0902: * NFC rules.
0903: * @param options The normalization options, ORed together (0 for no options).
0904: * @return int The total buffer size needed;if greater than length of
0905: * result, the output was truncated.
0906: * @exception IndexOutOfBoundsException if target.length is less than the
0907: * required length
0908: * @stable ICU 2.6
0909: */
0910: public static int compose(char[] source, char[] target,
0911: boolean compat, int options) {
0912: UnicodeSet nx = NormalizerImpl.getNX(options);
0913:
0914: /* reset options bits that should only be set here or inside compose() */
0915: options &= ~(NormalizerImpl.OPTIONS_SETS_MASK
0916: | NormalizerImpl.OPTIONS_COMPAT | NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
0917:
0918: if (compat) {
0919: options |= NormalizerImpl.OPTIONS_COMPAT;
0920: }
0921:
0922: int length = NormalizerImpl.compose(source, 0, source.length,
0923: target, 0, target.length, options, nx);
0924: if (length <= target.length) {
0925: return length;
0926: } else {
0927: throw new IndexOutOfBoundsException(Integer
0928: .toString(length));
0929: }
0930: }
0931:
0932: /**
0933: * Compose a string.
0934: * The string will be composed to according the the specified mode.
0935: * @param src The char array to compose.
0936: * @param srcStart Start index of the source
0937: * @param srcLimit Limit index of the source
0938: * @param dest The char buffer to fill in
0939: * @param destStart Start index of the destination buffer
0940: * @param destLimit End index of the destination buffer
0941: * @param compat If true the char array will be composed accoding to
0942: * NFKC rules and if false will be composed according to
0943: * NFC rules.
0944: * @param options The normalization options, ORed together (0 for no options).
0945: * @return int The total buffer size needed;if greater than length of
0946: * result, the output was truncated.
0947: * @exception IndexOutOfBoundsException if target.length is less than the
0948: * required length
0949: * @stable ICU 2.6
0950: */
0951: public static int compose(char[] src, int srcStart, int srcLimit,
0952: char[] dest, int destStart, int destLimit, boolean compat,
0953: int options) {
0954: UnicodeSet nx = NormalizerImpl.getNX(options);
0955:
0956: /* reset options bits that should only be set here or inside compose() */
0957: options &= ~(NormalizerImpl.OPTIONS_SETS_MASK
0958: | NormalizerImpl.OPTIONS_COMPAT | NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
0959:
0960: if (compat) {
0961: options |= NormalizerImpl.OPTIONS_COMPAT;
0962: }
0963:
0964: int length = NormalizerImpl.compose(src, srcStart, srcLimit,
0965: dest, destStart, destLimit, options, nx);
0966: if (length <= (destLimit - destStart)) {
0967: return length;
0968: } else {
0969: throw new IndexOutOfBoundsException(Integer
0970: .toString(length));
0971: }
0972: }
0973:
// Buffer-size multiplier; presumably sizes the initial output buffer of the
// String-based compose path -- TODO confirm against its use site.
private static final int MAX_BUF_SIZE_COMPOSE = 2;
// Multiplier applied to the input length to size the first output buffer in
// decompose(String, boolean, int), makeFCD and the String-returning
// concatenate overloads.
private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
0976:
0977: /**
0978: * Decompose a string.
0979: * The string will be decomposed to according the the specified mode.
0980: * @param str The string to decompose.
0981: * @param compat If true the string will be decomposed accoding to NFKD
0982: * rules and if false will be decomposed according to NFD
0983: * rules.
0984: * @return String The decomposed string
0985: * @stable ICU 2.8
0986: */
0987: public static String decompose(String str, boolean compat) {
0988: return decompose(str, compat, 0);
0989: }
0990:
0991: /**
0992: * Decompose a string.
0993: * The string will be decomposed to according the the specified mode.
0994: * @param str The string to decompose.
0995: * @param compat If true the string will be decomposed accoding to NFKD
0996: * rules and if false will be decomposed according to NFD
0997: * rules.
0998: * @param options The normalization options, ORed together (0 for no options).
0999: * @return String The decomposed string
1000: * @stable ICU 2.6
1001: */
1002: public static String decompose(String str, boolean compat,
1003: int options) {
1004:
1005: char[] dest = new char[str.length() * MAX_BUF_SIZE_DECOMPOSE];
1006: int[] trailCC = new int[1];
1007: int destSize = 0;
1008: UnicodeSet nx = NormalizerImpl.getNX(options);
1009: for (;;) {
1010: destSize = NormalizerImpl.decompose(str.toCharArray(), 0,
1011: str.length(), dest, 0, dest.length, compat,
1012: trailCC, nx);
1013: if (destSize <= dest.length) {
1014: return new String(dest, 0, destSize);
1015: } else {
1016: dest = new char[destSize];
1017: }
1018: }
1019:
1020: }
1021:
1022: /**
1023: * Decompose a string.
1024: * The string will be decomposed to according the the specified mode.
1025: * @param source The char array to decompose.
1026: * @param target A char buffer to receive the normalized text.
1027: * @param compat If true the char array will be decomposed accoding to NFKD
1028: * rules and if false will be decomposed according to
1029: * NFD rules.
1030: * @return int The total buffer size needed;if greater than length of
1031: * result,the output was truncated.
1032: * @param options The normalization options, ORed together (0 for no options).
1033: * @exception IndexOutOfBoundsException if the target capacity is less than
1034: * the required length
1035: * @stable ICU 2.6
1036: */
1037: public static int decompose(char[] source, char[] target,
1038: boolean compat, int options) {
1039: int[] trailCC = new int[1];
1040: UnicodeSet nx = NormalizerImpl.getNX(options);
1041: int length = NormalizerImpl.decompose(source, 0, source.length,
1042: target, 0, target.length, compat, trailCC, nx);
1043: if (length <= target.length) {
1044: return length;
1045: } else {
1046: throw new IndexOutOfBoundsException(Integer
1047: .toString(length));
1048: }
1049: }
1050:
1051: /**
1052: * Decompose a string.
1053: * The string will be decomposed to according the the specified mode.
1054: * @param src The char array to compose.
1055: * @param srcStart Start index of the source
1056: * @param srcLimit Limit index of the source
1057: * @param dest The char buffer to fill in
1058: * @param destStart Start index of the destination buffer
1059: * @param destLimit End index of the destination buffer
1060: * @param compat If true the char array will be decomposed accoding to NFKD
1061: * rules and if false will be decomposed according to
1062: * NFD rules.
1063: * @param options The normalization options, ORed together (0 for no options).
1064: * @return int The total buffer size needed;if greater than length of
1065: * result,the output was truncated.
1066: * @exception IndexOutOfBoundsException if the target capacity is less than
1067: * the required length
1068: * @stable ICU 2.6
1069: */
1070: public static int decompose(char[] src, int srcStart, int srcLimit,
1071: char[] dest, int destStart, int destLimit, boolean compat,
1072: int options) {
1073: int[] trailCC = new int[1];
1074: UnicodeSet nx = NormalizerImpl.getNX(options);
1075: int length = NormalizerImpl.decompose(src, srcStart, srcLimit,
1076: dest, destStart, destLimit, compat, trailCC, nx);
1077: if (length <= (destLimit - destStart)) {
1078: return length;
1079: } else {
1080: throw new IndexOutOfBoundsException(Integer
1081: .toString(length));
1082: }
1083: }
1084:
1085: private static String makeFCD(String src, int options) {
1086: int srcLen = src.length();
1087: char[] dest = new char[MAX_BUF_SIZE_DECOMPOSE * srcLen];
1088: int length = 0;
1089: UnicodeSet nx = NormalizerImpl.getNX(options);
1090: for (;;) {
1091: length = NormalizerImpl.makeFCD(src.toCharArray(), 0,
1092: srcLen, dest, 0, dest.length, nx);
1093: if (length <= dest.length) {
1094: return new String(dest, 0, length);
1095: } else {
1096: dest = new char[length];
1097: }
1098: }
1099: }
1100:
1101: /**
1102: * Normalizes a <tt>String</tt> using the given normalization operation.
1103: * <p>
1104: * The <tt>options</tt> parameter specifies which optional
1105: * <tt>Normalizer</tt> features are to be enabled for this operation.
1106: * Currently the only available option is {@link #UNICODE_3_2}.
1107: * If you want the default behavior corresponding to one of the standard
1108: * Unicode Normalization Forms, use 0 for this argument.
1109: * <p>
1110: * @param str the input string to be normalized.
1111: * @param mode the normalization mode
1112: * @param options the optional features to be enabled.
1113: * @return String the normalized string
1114: * @stable ICU 2.6
1115: */
1116: public static String normalize(String str, Mode mode, int options) {
1117: return mode.normalize(str, options);
1118: }
1119:
1120: /**
1121: * Normalize a string.
1122: * The string will be normalized according the the specified normalization
1123: * mode and options.
1124: * @param src The string to normalize.
1125: * @param mode The normalization mode; one of Normalizer.NONE,
1126: * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
1127: * Normalizer.NFKD, Normalizer.DEFAULT
1128: * @return the normalized string
1129: * @stable ICU 2.8
1130: *
1131: */
1132: public static String normalize(String src, Mode mode) {
1133: return normalize(src, mode, 0);
1134: }
1135:
1136: /**
1137: * Normalize a string.
1138: * The string will be normalized according the the specified normalization
1139: * mode and options.
1140: * @param source The char array to normalize.
1141: * @param target A char buffer to receive the normalized text.
1142: * @param mode The normalization mode; one of Normalizer.NONE,
1143: * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
1144: * Normalizer.NFKD, Normalizer.DEFAULT
1145: * @param options The normalization options, ORed together (0 for no options).
1146: * @return int The total buffer size needed;if greater than length of
1147: * result, the output was truncated.
1148: * @exception IndexOutOfBoundsException if the target capacity is less
1149: * than the required length
1150: * @stable ICU 2.6
1151: */
1152: public static int normalize(char[] source, char[] target,
1153: Mode mode, int options) {
1154: int length = normalize(source, 0, source.length, target, 0,
1155: target.length, mode, options);
1156: if (length <= target.length) {
1157: return length;
1158: } else {
1159: throw new IndexOutOfBoundsException(Integer
1160: .toString(length));
1161: }
1162: }
1163:
1164: /**
1165: * Normalize a string.
1166: * The string will be normalized according the the specified normalization
1167: * mode and options.
1168: * @param src The char array to compose.
1169: * @param srcStart Start index of the source
1170: * @param srcLimit Limit index of the source
1171: * @param dest The char buffer to fill in
1172: * @param destStart Start index of the destination buffer
1173: * @param destLimit End index of the destination buffer
1174: * @param mode The normalization mode; one of Normalizer.NONE,
1175: * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
1176: * Normalizer.NFKD, Normalizer.DEFAULT
1177: * @param options The normalization options, ORed together (0 for no options).
1178: * @return int The total buffer size needed;if greater than length of
1179: * result, the output was truncated.
1180: * @exception IndexOutOfBoundsException if the target capacity is
1181: * less than the required length
1182: * @stable ICU 2.6
1183: */
1184: public static int normalize(char[] src, int srcStart, int srcLimit,
1185: char[] dest, int destStart, int destLimit, Mode mode,
1186: int options) {
1187: int length = mode.normalize(src, srcStart, srcLimit, dest,
1188: destStart, destLimit, options);
1189:
1190: if (length <= (destLimit - destStart)) {
1191: return length;
1192: } else {
1193: throw new IndexOutOfBoundsException(Integer
1194: .toString(length));
1195: }
1196: }
1197:
1198: /**
1199: * Normalize a codepoint accoding to the given mode
1200: * @param char32 The input string to be normalized.
1201: * @param mode The normalization mode
1202: * @param options Options for use with exclusion set an tailored Normalization
1203: * The only option that is currently recognized is UNICODE_3_2
1204: * @return String The normalized string
1205: * @stable ICU 2.6
1206: * @see #UNICODE_3_2
1207: */
1208: // TODO: actually do the optimization when the guts of Normalizer are
1209: // upgraded --has just dumb implementation for now
1210: public static String normalize(int char32, Mode mode, int options) {
1211: return normalize(UTF16.valueOf(char32), mode, options);
1212: }
1213:
1214: /**
1215: * Conveinience method to normalize a codepoint accoding to the given mode
1216: * @param char32 The input string to be normalized.
1217: * @param mode The normalization mode
1218: * @return String The normalized string
1219: * @see #UNICODE_3_2
1220: * @stable ICU 2.6
1221: */
1222: // TODO: actually do the optimization when the guts of Normalizer are
1223: // upgraded --has just dumb implementation for now
1224: public static String normalize(int char32, Mode mode) {
1225: return normalize(UTF16.valueOf(char32), mode, 0);
1226: }
1227:
1228: /**
1229: * Convenience method.
1230: *
1231: * @param source string for determining if it is in a normalized format
1232: * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
1233: * Normalizer.NFKC,Normalizer.NFKD)
1234: * @return Return code to specify if the text is normalized or not
1235: * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1236: * @stable ICU 2.8
1237: */
1238: public static QuickCheckResult quickCheck(String source, Mode mode) {
1239: return mode.quickCheck(source.toCharArray(), 0,
1240: source.length(), true, null);
1241: }
1242:
1243: /**
1244: * Convenience method.
1245: *
1246: * @param source string for determining if it is in a normalized format
1247: * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
1248: * Normalizer.NFKC,Normalizer.NFKD)
1249: * @param options Options for use with exclusion set an tailored Normalization
1250: * The only option that is currently recognized is UNICODE_3_2
1251: * @return Return code to specify if the text is normalized or not
1252: * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1253: * @stable ICU 2.6
1254: */
1255: public static QuickCheckResult quickCheck(String source, Mode mode,
1256: int options) {
1257: return mode.quickCheck(source.toCharArray(), 0,
1258: source.length(), true, NormalizerImpl.getNX(options));
1259: }
1260:
1261: /**
1262: * Convenience method.
1263: *
1264: * @param source Array of characters for determining if it is in a
1265: * normalized format
1266: * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
1267: * Normalizer.NFKC,Normalizer.NFKD)
1268: * @param options Options for use with exclusion set an tailored Normalization
1269: * The only option that is currently recognized is UNICODE_3_2
1270: * @return Return code to specify if the text is normalized or not
1271: * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
1272: * @stable ICU 2.6
1273: */
1274: public static QuickCheckResult quickCheck(char[] source, Mode mode,
1275: int options) {
1276: return mode.quickCheck(source, 0, source.length, true,
1277: NormalizerImpl.getNX(options));
1278: }
1279:
1280: /**
1281: * Performing quick check on a string, to quickly determine if the string is
1282: * in a particular normalization format.
1283: * Three types of result can be returned Normalizer.YES, Normalizer.NO or
1284: * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
1285: * string is in the desired normalized format, Normalizer.NO determines that
1286: * argument string is not in the desired normalized format. A
1287: * Normalizer.MAYBE result indicates that a more thorough check is required,
1288: * the user may have to put the string in its normalized form and compare
1289: * the results.
1290: *
1291: * @param source string for determining if it is in a normalized format
1292: * @param start the start index of the source
1293: * @param limit the limit index of the source it is equal to the length
1294: * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
1295: * Normalizer.NFKC,Normalizer.NFKD)
1296: * @param options Options for use with exclusion set an tailored Normalization
1297: * The only option that is currently recognized is UNICODE_3_2
1298: * @return Return code to specify if the text is normalized or not
1299: * (Normalizer.YES, Normalizer.NO or
1300: * Normalizer.MAYBE)
1301: * @stable ICU 2.6
1302: */
1303:
1304: public static QuickCheckResult quickCheck(char[] source, int start,
1305: int limit, Mode mode, int options) {
1306: return mode.quickCheck(source, start, limit, true,
1307: NormalizerImpl.getNX(options));
1308: }
1309:
1310: //-------------------------------------------------------------------------
1311: // Internal methods (for now)
1312: //-------------------------------------------------------------------------
1313:
1314: /**
1315: * Test if a string is in a given normalization form.
1316: * This is semantically equivalent to source.equals(normalize(source, mode)).
1317: *
1318: * Unlike quickCheck(), this function returns a definitive result,
1319: * never a "maybe".
1320: * For NFD, NFKD, and FCD, both functions work exactly the same.
1321: * For NFC and NFKC where quickCheck may return "maybe", this function will
1322: * perform further tests to arrive at a true/false result.
1323: * @param src The input array of characters to be checked to see if
1324: * it is normalized
1325: * @param start The strart index in the source
1326: * @param limit The limit index in the source
1327: * @param mode the normalization mode
1328: * @param options Options for use with exclusion set an tailored Normalization
1329: * The only option that is currently recognized is UNICODE_3_2
1330: * @return Boolean value indicating whether the source string is in the
1331: * "mode" normalization form
1332: * @stable ICU 2.6
1333: */
1334: public static boolean isNormalized(char[] src, int start,
1335: int limit, Mode mode, int options) {
1336: return (mode.quickCheck(src, start, limit, false,
1337: NormalizerImpl.getNX(options)) == YES);
1338: }
1339:
1340: /**
1341: * Convenience Method
1342: * @param str the input string to be checked to see if it is
1343: * normalized
1344: * @param mode the normalization mode
1345: * @param options Options for use with exclusion set an tailored Normalization
1346: * The only option that is currently recognized is UNICODE_3_2
1347: * @see #isNormalized
1348: * @stable ICU 2.6
1349: */
1350: public static boolean isNormalized(String str, Mode mode,
1351: int options) {
1352: return (mode.quickCheck(str.toCharArray(), 0, str.length(),
1353: false, NormalizerImpl.getNX(options)) == YES);
1354: }
1355:
1356: /**
1357: * Convenience Method
1358: * @param char32 the input code point to be checked to see if it is
1359: * normalized
1360: * @param mode the normalization mode
1361: * @param options Options for use with exclusion set an tailored Normalization
1362: * The only option that is currently recognized is UNICODE_3_2
1363: *
1364: * @see #isNormalized
1365: * @stable ICU 2.6
1366: */
1367: // TODO: actually do the optimization when the guts of Normalizer are
1368: // upgraded --has just dumb implementation for now
1369: public static boolean isNormalized(int char32, Mode mode,
1370: int options) {
1371: return isNormalized(UTF16.valueOf(char32), mode, options);
1372: }
1373:
1374: /**
1375: * Compare two strings for canonical equivalence.
1376: * Further options include case-insensitive comparison and
1377: * code point order (as opposed to code unit order).
1378: *
1379: * Canonical equivalence between two strings is defined as their normalized
1380: * forms (NFD or NFC) being identical.
1381: * This function compares strings incrementally instead of normalizing
1382: * (and optionally case-folding) both strings entirely,
1383: * improving performance significantly.
1384: *
1385: * Bulk normalization is only necessary if the strings do not fulfill the
1386: * FCD conditions. Only in this case, and only if the strings are relatively
1387: * long, is memory allocated temporarily.
1388: * For FCD strings and short non-FCD strings there is no memory allocation.
1389: *
1390: * Semantically, this is equivalent to
1391: * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1392: * where code point order and foldCase are all optional.
1393: *
1394: * @param s1 First source character array.
1395: * @param s1Start start index of source
1396: * @param s1Limit limit of the source
1397: *
1398: * @param s2 Second source character array.
1399: * @param s2Start start index of the source
1400: * @param s2Limit limit of the source
1401: *
1402: * @param options A bit set of options:
1403: * - FOLD_CASE_DEFAULT or 0 is used for default options:
1404: * Case-sensitive comparison in code unit order, and the input strings
1405: * are quick-checked for FCD.
1406: *
1407: * - INPUT_IS_FCD
1408: * Set if the caller knows that both s1 and s2 fulfill the FCD
1409: * conditions.If not set, the function will quickCheck for FCD
1410: * and normalize if necessary.
1411: *
1412: * - COMPARE_CODE_POINT_ORDER
1413: * Set to choose code point order instead of code unit order
1414: *
1415: * - COMPARE_IGNORE_CASE
1416: * Set to compare strings case-insensitively using case folding,
1417: * instead of case-sensitively.
1418: * If set, then the following case folding options are used.
1419: *
1420: *
1421: * @return <0 or 0 or >0 as usual for string comparisons
1422: *
1423: * @see #normalize
1424: * @see #FCD
1425: * @stable ICU 2.8
1426: */
1427: public static int compare(char[] s1, int s1Start, int s1Limit,
1428: char[] s2, int s2Start, int s2Limit, int options) {
1429: return internalCompare(s1, s1Start, s1Limit, s2, s2Start,
1430: s2Limit, options);
1431: }
1432:
1433: /**
1434: * Compare two strings for canonical equivalence.
1435: * Further options include case-insensitive comparison and
1436: * code point order (as opposed to code unit order).
1437: * Convenience method.
1438: *
1439: * @param s1 First source string.
1440: * @param s2 Second source string.
1441: *
1442: * @param options A bit set of options:
1443: * - FOLD_CASE_DEFAULT or 0 is used for default options:
1444: * Case-sensitive comparison in code unit order, and the input strings
1445: * are quick-checked for FCD.
1446: *
1447: * - INPUT_IS_FCD
1448: * Set if the caller knows that both s1 and s2 fulfill the FCD
1449: * conditions. If not set, the function will quickCheck for FCD
1450: * and normalize if necessary.
1451: *
1452: * - COMPARE_CODE_POINT_ORDER
1453: * Set to choose code point order instead of code unit order
1454: *
1455: * - COMPARE_IGNORE_CASE
1456: * Set to compare strings case-insensitively using case folding,
1457: * instead of case-sensitively.
1458: * If set, then the following case folding options are used.
1459: *
1460: * @return <0 or 0 or >0 as usual for string comparisons
1461: *
1462: * @see #normalize
1463: * @see #FCD
1464: * @stable ICU 2.8
1465: */
1466: public static int compare(String s1, String s2, int options) {
1467:
1468: return compare(s1.toCharArray(), 0, s1.length(), s2
1469: .toCharArray(), 0, s2.length(), options);
1470: }
1471:
1472: /**
1473: * Compare two strings for canonical equivalence.
1474: * Further options include case-insensitive comparison and
1475: * code point order (as opposed to code unit order).
1476: * Convenience method.
1477: *
1478: * @param s1 First source string.
1479: * @param s2 Second source string.
1480: *
1481: * @param options A bit set of options:
1482: * - FOLD_CASE_DEFAULT or 0 is used for default options:
1483: * Case-sensitive comparison in code unit order, and the input strings
1484: * are quick-checked for FCD.
1485: *
1486: * - INPUT_IS_FCD
1487: * Set if the caller knows that both s1 and s2 fulfill the FCD
1488: * conditions. If not set, the function will quickCheck for FCD
1489: * and normalize if necessary.
1490: *
1491: * - COMPARE_CODE_POINT_ORDER
1492: * Set to choose code point order instead of code unit order
1493: *
1494: * - COMPARE_IGNORE_CASE
1495: * Set to compare strings case-insensitively using case folding,
1496: * instead of case-sensitively.
1497: * If set, then the following case folding options are used.
1498: *
1499: * @return <0 or 0 or >0 as usual for string comparisons
1500: *
1501: * @see #normalize
1502: * @see #FCD
1503: * @stable ICU 2.8
1504: */
1505: public static int compare(char[] s1, char[] s2, int options) {
1506: return compare(s1, 0, s1.length, s2, 0, s2.length, options);
1507: }
1508:
1509: /**
1510: * Convenience method that can have faster implementation
1511: * by not allocating buffers.
1512: * @param char32a the first code point to be checked against the
1513: * @param char32b the second code point
1514: * @param options A bit set of options
1515: * @stable ICU 2.8
1516: */
1517: // TODO: actually do the optimization when the guts of Normalizer are
1518: // upgraded --has just dumb implementation for now
1519: public static int compare(int char32a, int char32b, int options) {
1520: return compare(UTF16.valueOf(char32a), UTF16.valueOf(char32b),
1521: options);
1522: }
1523:
1524: /**
1525: * Convenience method that can have faster implementation
1526: * by not allocating buffers.
1527: * @param char32a the first code point to be checked against
1528: * @param str2 the second string
1529: * @param options A bit set of options
1530: * @stable ICU 2.8
1531: */
1532: // TODO: actually do the optimization when the guts of Normalizer are
1533: // upgraded --has just dumb implementation for now
1534: public static int compare(int char32a, String str2, int options) {
1535: return compare(UTF16.valueOf(char32a), str2, options);
1536: }
1537:
1538: /**
1539: * Concatenate normalized strings, making sure that the result is normalized
1540: * as well.
1541: *
1542: * If both the left and the right strings are in
1543: * the normalization form according to "mode",
1544: * then the result will be
1545: *
1546: * <code>
1547: * dest=normalize(left+right, mode)
1548: * </code>
1549: *
1550: * With the input strings already being normalized,
1551: * this function will use next() and previous()
1552: * to find the adjacent end pieces of the input strings.
1553: * Only the concatenation of these end pieces will be normalized and
1554: * then concatenated with the remaining parts of the input strings.
1555: *
1556: * It is allowed to have dest==left to avoid copying the entire left string.
1557: *
1558: * @param left Left source array, may be same as dest.
1559: * @param leftStart start in the left array.
1560: * @param leftLimit limit in the left array (==length)
1561: * @param right Right source array.
1562: * @param rightStart start in the right array.
1563: * @param rightLimit limit in the right array (==length)
1564: * @param dest The output buffer; can be null if destStart==destLimit==0
1565: * for pure preflighting.
1566: * @param destStart start in the destination array
1567: * @param destLimit limit in the destination array (==length)
1568: * @param mode The normalization mode.
1569: * @param options The normalization options, ORed together (0 for no options).
1570: * @return Length of output (number of chars) when successful or
1571: * IndexOutOfBoundsException
1572: * @exception IndexOutOfBoundsException whose message has the string
1573: * representation of destination capacity required.
1574: * @see #normalize
1575: * @see #next
1576: * @see #previous
1577: * @exception IndexOutOfBoundsException if target capacity is less than the
1578: * required length
1579: * @stable ICU 2.8
1580: */
1581: /* Concatenation of normalized strings --------------------------------- */
1582:
1583: public static int concatenate(char[] left, int leftStart,
1584: int leftLimit, char[] right, int rightStart,
1585: int rightLimit, char[] dest, int destStart, int destLimit,
1586: Normalizer.Mode mode, int options) {
1587:
1588: UCharacterIterator iter;
1589:
1590: int leftBoundary, rightBoundary, destLength;
1591:
1592: if (dest == null) {
1593: throw new IllegalArgumentException();
1594: }
1595:
1596: /* check for overlapping right and destination */
1597: if (right == dest && rightStart < destLimit
1598: && destStart < rightLimit) {
1599: throw new IllegalArgumentException(
1600: "overlapping right and dst ranges");
1601: }
1602:
1603: /* allow left==dest */
1604:
1605: /*
1606: * Input: left[0..leftLength[ + right[0..rightLength[
1607: *
1608: * Find normalization-safe boundaries leftBoundary and rightBoundary
1609: * and copy the end parts together:
1610: * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
1611: *
1612: * dest=left[0..leftBoundary[ +
1613: * normalize(buffer) +
1614: * right[rightBoundary..rightLength[
1615: */
1616:
1617: /*
1618: * find a normalization boundary at the end of the left string
1619: * and copy the end part into the buffer
1620: */
1621:
1622: iter = UCharacterIterator.getInstance(left, leftStart,
1623: leftLimit);
1624:
1625: iter.setIndex(iter.getLength()); /* end of left string */
1626: char[] buffer = new char[100];
1627: int bufferLength;
1628: bufferLength = previous(iter, buffer, 0, buffer.length, mode,
1629: false, null, options);
1630:
1631: leftBoundary = iter.getIndex();
1632:
1633: if (bufferLength > buffer.length) {
1634: char[] newBuf = new char[buffer.length * 2];
1635: buffer = newBuf;
1636: newBuf = null; // null the reference for GC
1637: /* just copy from the left string: we know the boundary already */
1638: System.arraycopy(left, leftBoundary, buffer, 0,
1639: bufferLength);
1640: }
1641:
1642: /*
1643: * find a normalization boundary at the beginning of the right string
1644: * and concatenate the beginning part to the buffer
1645: */
1646:
1647: iter = UCharacterIterator.getInstance(right, rightStart,
1648: rightLimit);
1649:
1650: rightBoundary = next(iter, buffer, bufferLength, buffer.length
1651: - bufferLength, mode, false, null, options);
1652:
1653: if (bufferLength > buffer.length) {
1654: char[] newBuf = new char[buffer.length * 2];
1655: buffer = newBuf;
1656: newBuf = null; // null the reference for GC
1657: /* just copy from the right string: we know the boundary already */
1658: System.arraycopy(right, rightBoundary, buffer,
1659: bufferLength, rightBoundary);
1660: }
1661:
1662: bufferLength += rightBoundary;
1663:
1664: /* copy left[0..leftBoundary[ to dest */
1665: if (left != dest && leftBoundary > 0 && (destLimit) > 0) {
1666: System.arraycopy(left, 0, dest, 0, Math.min(leftBoundary,
1667: destLimit));
1668: }
1669: destLength = leftBoundary;
1670:
1671: /* concatenate the normalization of the buffer to dest */
1672: if (destLimit > destLength) {
1673: destLength += Normalizer.normalize(buffer, 0, bufferLength,
1674: dest, destLength, destLimit, mode, options);
1675:
1676: } else {
1677: destLength += Normalizer.normalize(buffer, 0, bufferLength,
1678: null, 0, 0, mode, options);
1679: }
1680:
1681: /* concatenate right[rightBoundary..rightLength[ to dest */
1682: rightStart += rightBoundary;
1683: int rightLength = (rightLimit - rightStart);
1684: if (rightLength > 0 && destLimit > destLength) {
1685: System.arraycopy(right, rightStart, dest, destLength, Math
1686: .min(rightLength, destLength));
1687: }
1688: destLength += rightLength;
1689:
1690: if (destLength <= (destLimit - destStart)) {
1691: return destLength;
1692: } else {
1693: throw new IndexOutOfBoundsException(Integer
1694: .toString(destLength));
1695: }
1696: }
1697:
1698: /**
1699: * Concatenate normalized strings, making sure that the result is normalized
1700: * as well.
1701: *
1702: * If both the left and the right strings are in
1703: * the normalization form according to "mode",
1704: * then the result will be
1705: *
1706: * <code>
1707: * dest=normalize(left+right, mode)
1708: * </code>
1709: *
1710: * For details see concatenate
1711: *
1712: * @param left Left source string.
1713: * @param right Right source string.
1714: * @param mode The normalization mode.
1715: * @param options The normalization options, ORed together (0 for no options).
1716: * @return result
1717: *
1718: * @see #concatenate
1719: * @see #normalize
1720: * @see #next
1721: * @see #previous
1722: * @see #concatenate
1723: * @stable ICU 2.8
1724: */
1725: public static String concatenate(char[] left, char[] right,
1726: Mode mode, int options) {
1727: char[] result = new char[(left.length + right.length)
1728: * MAX_BUF_SIZE_DECOMPOSE];
1729: for (;;) {
1730:
1731: int length = concatenate(left, 0, left.length, right, 0,
1732: right.length, result, 0, result.length, mode,
1733: options);
1734: if (length <= result.length) {
1735: return new String(result, 0, length);
1736: } else {
1737: result = new char[length];
1738: }
1739: }
1740: }
1741:
1742: /**
1743: * Concatenate normalized strings, making sure that the result is normalized
1744: * as well.
1745: *
1746: * If both the left and the right strings are in
1747: * the normalization form according to "mode",
1748: * then the result will be
1749: *
1750: * <code>
1751: * dest=normalize(left+right, mode)
1752: * </code>
1753: *
1754: * For details see concatenate
1755: *
1756: * @param left Left source string.
1757: * @param right Right source string.
1758: * @param mode The normalization mode.
1759: * @param options The normalization options, ORed together (0 for no options).
1760: * @return result
1761: *
1762: * @see #concatenate
1763: * @see #normalize
1764: * @see #next
1765: * @see #previous
1766: * @see #concatenate
1767: * @stable ICU 2.8
1768: */
1769: public static String concatenate(String left, String right,
1770: Mode mode, int options) {
1771: char[] result = new char[(left.length() + right.length())
1772: * MAX_BUF_SIZE_DECOMPOSE];
1773: for (;;) {
1774:
1775: int length = concatenate(left.toCharArray(), 0, left
1776: .length(), right.toCharArray(), 0, right.length(),
1777: result, 0, result.length, mode, options);
1778: if (length <= result.length) {
1779: return new String(result, 0, length);
1780: } else {
1781: result = new char[length];
1782: }
1783: }
1784: }
1785:
/**
 * Gets the FC_NFKC closure set from the normalization data.
 * This is a direct delegation to NormalizerImpl.getFC_NFKC_Closure.
 * @param c The code point whose closure set is to be retrieved
 * @param dest The char array to receive the closure set
 * @return the value returned by NormalizerImpl.getFC_NFKC_Closure; the
 *         String-returning overload of this method treats it as the
 *         length of the closure set, which may exceed dest.length
 * @internal
 * @deprecated This API is ICU internal only.
 */
public static int getFC_NFKC_Closure(int c, char[] dest) {
    return NormalizerImpl.getFC_NFKC_Closure(c, dest);
}
1796:
1797: /**
1798: * Gets the FC_NFKC closure set from the normalization data
1799: * @param c The the code point whose closure set is to be retrieved
1800: * @return String representation of the closure set
1801: * @internal
1802: * @deprecated This API is ICU internal only.
1803: */
1804: public static String getFC_NFKC_Closure(int c) {
1805: char[] dest = new char[10];
1806: for (;;) {
1807: int length = getFC_NFKC_Closure(c, dest);
1808: if (length <= dest.length) {
1809: return new String(dest, 0, length);
1810: } else {
1811: dest = new char[length];
1812: }
1813: }
1814: }
1815:
1816: //-------------------------------------------------------------------------
1817: // Iteration API
1818: //-------------------------------------------------------------------------
1819:
1820: /**
1821: * Return the current character in the normalized text->
1822: * @return The codepoint as an int
1823: * @stable ICU 2.8
1824: */
1825: public int current() {
1826: if (bufferPos < bufferLimit || nextNormalize()) {
1827: return getCodePointAt(bufferPos);
1828: } else {
1829: return DONE;
1830: }
1831: }
1832:
1833: /**
1834: * Return the next character in the normalized text and advance
1835: * the iteration position by one. If the end
1836: * of the text has already been reached, {@link #DONE} is returned.
1837: * @return The codepoint as an int
1838: * @stable ICU 2.8
1839: */
1840: public int next() {
1841: if (bufferPos < bufferLimit || nextNormalize()) {
1842: int c = getCodePointAt(bufferPos);
1843: bufferPos += (c > 0xFFFF) ? 2 : 1;
1844: return c;
1845: } else {
1846: return DONE;
1847: }
1848: }
1849:
1850: /**
1851: * Return the previous character in the normalized text and decrement
1852: * the iteration position by one. If the beginning
1853: * of the text has already been reached, {@link #DONE} is returned.
1854: * @return The codepoint as an int
1855: * @stable ICU 2.8
1856: */
1857: public int previous() {
1858: if (bufferPos > 0 || previousNormalize()) {
1859: int c = getCodePointAt(bufferPos - 1);
1860: bufferPos -= (c > 0xFFFF) ? 2 : 1;
1861: return c;
1862: } else {
1863: return DONE;
1864: }
1865: }
1866:
1867: /**
1868: * Reset the index to the beginning of the text.
1869: * This is equivalent to setIndexOnly(startIndex)).
1870: * @stable ICU 2.8
1871: */
1872: public void reset() {
1873: text.setIndex(0);
1874: currentIndex = nextIndex = 0;
1875: clearBuffer();
1876: }
1877:
1878: /**
1879: * Set the iteration position in the input text that is being normalized,
1880: * without any immediate normalization.
1881: * After setIndexOnly(), getIndex() will return the same index that is
1882: * specified here.
1883: *
1884: * @param index the desired index in the input text.
1885: * @stable ICU 2.8
1886: */
1887: public void setIndexOnly(int index) {
1888: text.setIndex(index);
1889: currentIndex = nextIndex = index; // validates index
1890: clearBuffer();
1891: }
1892:
/**
 * Set the iteration position in the input text that is being normalized
 * and return the first normalized character at that position.
 * This calls setIndexOnly(index) and then current().
 * <p>
 * <b>Note:</b> This method sets the position in the <em>input</em> text,
 * while {@link #next} and {@link #previous} iterate through characters
 * in the normalized <em>output</em>. This means that there is not
 * necessarily a one-to-one correspondence between characters returned
 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
 * returned from <tt>setIndex</tt> and {@link #getIndex}.
 * <p>
 * @param index the desired index in the input text.
 *
 * @return the first normalized character that is the result of iterating
 *         forward starting at the given index, as a codepoint.
 *
 * @throws IllegalArgumentException if the given index is less than
 *         {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 * @deprecated ICU 3.2
 * @obsolete ICU 3.2
 */
///CLOVER:OFF
public int setIndex(int index) {
    setIndexOnly(index);
    return current();
}
1920:
1921: ///CLOVER:ON
/**
 * Retrieve the index of the start of the input text. This is the begin
 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating.
 * @deprecated ICU 2.2. Use startIndex() instead.
 * @return the start index of the input text; this implementation
 *         always returns 0
 * @see #startIndex
 */
public int getBeginIndex() {
    return 0;
}
1933:
/**
 * Retrieve the index of the end of the input text. This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 * @deprecated ICU 2.2. Use endIndex() instead.
 * @return the end index of the input text, as reported by endIndex()
 * @see #endIndex
 */
public int getEndIndex() {
    return endIndex();
}
1945:
/**
 * Return the first character in the normalized text. This resets
 * the <tt>Normalizer's</tt> position to the beginning of the text
 * and then reads one character with next(), so the iteration
 * position ends up after the returned character.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
public int first() {
    reset();
    return next();
}
1956:
1957: /**
1958: * Return the last character in the normalized text-> This resets
1959: * the <tt>Normalizer's</tt> position to be just before the
1960: * the input text corresponding to that normalized character.
1961: * @return The codepoint as an int
1962: * @stable ICU 2.8
1963: */
1964: public int last() {
1965: text.setToLimit();
1966: currentIndex = nextIndex = text.getIndex();
1967: clearBuffer();
1968: return previous();
1969: }
1970:
1971: /**
1972: * Retrieve the current iteration position in the input text that is
1973: * being normalized. This method is useful in applications such as
1974: * searching, where you need to be able to determine the position in
1975: * the input text that corresponds to a given normalized output character.
1976: * <p>
1977: * <b>Note:</b> This method sets the position in the <em>input</em>, while
1978: * {@link #next} and {@link #previous} iterate through characters in the
1979: * <em>output</em>. This means that there is not necessarily a one-to-one
1980: * correspondence between characters returned by <tt>next</tt> and
1981: * <tt>previous</tt> and the indices passed to and returned from
1982: * <tt>setIndex</tt> and {@link #getIndex}.
1983: * @return The current iteration position
1984: * @stable ICU 2.8
1985: */
1986: public int getIndex() {
1987: if (bufferPos < bufferLimit) {
1988: return currentIndex;
1989: } else {
1990: return nextIndex;
1991: }
1992: }
1993:
/**
 * Retrieve the index of the start of the input text. This is the begin
 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating.
 * @return the start index of the input text; this implementation
 *         always returns 0
 * @stable ICU 2.8
 */
public int startIndex() {
    return 0;
}
2004:
/**
 * Retrieve the index of the end of the input text. This is the end index
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 * @return the end index of the input text, taken from the length
 *         reported by the underlying text iterator
 * @stable ICU 2.8
 */
public int endIndex() {
    return text.getLength();
}
2015:
2016: //-------------------------------------------------------------------------
2017: // Property access methods
2018: //-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b> If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #first},
 * {@link #last}, etc. after calling <tt>setMode</tt>.
 * <p>
 * @param newMode the new mode for this <tt>Normalizer</tt>.
 * The supported modes are:
 * <ul>
 *  <li>{@link #COMPOSE}        - Unicode canonical decomposition
 *                                  followed by canonical composition.
 *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
 *                                  followed by canonical composition.
 *  <li>{@link #DECOMP}         - Unicode canonical decomposition
 *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
 *  <li>{@link #NO_OP}          - Do nothing but return characters
 *                                  from the underlying input text.
 * </ul>
 *
 * @see #getMode
 * @stable ICU 2.8
 */
public void setMode(Mode newMode) {
    // Only the field is replaced; the internal buffer is not cleared here.
    mode = newMode;
}
2048:
/**
 * Return the basic operation performed by this <tt>Normalizer</tt>.
 *
 * @return the mode most recently stored in this object's mode field
 * @see #setMode
 * @stable ICU 2.8
 */
public Mode getMode() {
    return mode;
}
2058:
2059: /**
2060: * Set options that affect this <tt>Normalizer</tt>'s operation.
2061: * Options do not change the basic composition or decomposition operation
2062: * that is being performed , but they control whether
2063: * certain optional portions of the operation are done.
2064: * Currently the only available option is:
2065: * <p>
2066: * <ul>
2067: * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
2068: * </ul>
2069: * <p>
2070: * @param option the option whose value is to be set.
2071: * @param value the new setting for the option. Use <tt>true</tt> to
2072: * turn the option on and <tt>false</tt> to turn it off.
2073: *
2074: * @see #getOption
2075: * @stable ICU 2.6
2076: */
2077: public void setOption(int option, boolean value) {
2078: if (value) {
2079: options |= option;
2080: } else {
2081: options &= (~option);
2082: }
2083: }
2084:
2085: /**
2086: * Determine whether an option is turned on or off.
2087: * <p>
2088: * @see #setOption
2089: * @stable ICU 2.6
2090: */
2091: public int getOption(int option) {
2092: if ((options & option) != 0) {
2093: return 1;
2094: } else {
2095: return 0;
2096: }
2097: }
2098:
/**
 * Gets the underlying text storage.
 * This delegates to the underlying text iterator's getText(char[]).
 * @param fillIn the char buffer to fill the UTF-16 units.
 *         The length of the buffer should be equal to the length of the
 *         underlying text storage
 * @return the value returned by the underlying iterator's
 *         getText(fillIn) (presumably the number of units written --
 *         confirm against UCharacterIterator)
 * @throws IndexOutOfBoundsException presumably when fillIn is too small
 *         for the text; the check is made by the underlying iterator
 * @see #getLength
 * @stable ICU 2.8
 */
public int getText(char[] fillIn) {
    return text.getText(fillIn);
}
2111:
/**
 * Gets the length of the underlying text storage, as reported by the
 * underlying text iterator.
 * @return the length
 * @stable ICU 2.8
 */
public int getLength() {
    return text.getLength();
}
2120:
/**
 * Returns the text under iteration as a string, as produced by the
 * underlying text iterator's getText().
 * @return a copy of the text under iteration.
 * @stable ICU 2.8
 */
public String getText() {
    return text.getText();
}
2129:
2130: /**
2131: * Set the input text over which this <tt>Normalizer</tt> will iterate.
2132: * The iteration position is set to the beginning of the input text->
2133: * @param newText The new string to be normalized.
2134: * @stable ICU 2.8
2135: */
2136: public void setText(StringBuffer newText) {
2137:
2138: UCharacterIterator newIter = UCharacterIterator
2139: .getInstance(newText);
2140: if (newIter == null) {
2141: throw new IllegalStateException(
2142: "Could not create a new UCharacterIterator");
2143: }
2144: text = newIter;
2145: reset();
2146: }
2147:
2148: /**
2149: * Set the input text over which this <tt>Normalizer</tt> will iterate.
2150: * The iteration position is set to the beginning of the input text->
2151: * @param newText The new string to be normalized.
2152: * @stable ICU 2.8
2153: */
2154: public void setText(char[] newText) {
2155:
2156: UCharacterIterator newIter = UCharacterIterator
2157: .getInstance(newText);
2158: if (newIter == null) {
2159: throw new IllegalStateException(
2160: "Could not create a new UCharacterIterator");
2161: }
2162: text = newIter;
2163: reset();
2164: }
2165:
2166: /**
2167: * Set the input text over which this <tt>Normalizer</tt> will iterate.
2168: * The iteration position is set to the beginning of the input text->
2169: * @param newText The new string to be normalized.
2170: * @stable ICU 2.8
2171: */
2172: public void setText(String newText) {
2173:
2174: UCharacterIterator newIter = UCharacterIterator
2175: .getInstance(newText);
2176: if (newIter == null) {
2177: throw new IllegalStateException(
2178: "Could not create a new UCharacterIterator");
2179: }
2180: text = newIter;
2181: reset();
2182: }
2183:
2184: /**
2185: * Set the input text over which this <tt>Normalizer</tt> will iterate.
2186: * The iteration position is set to the beginning of the input text->
2187: * @param newText The new string to be normalized.
2188: * @stable ICU 2.8
2189: */
2190: public void setText(CharacterIterator newText) {
2191:
2192: UCharacterIterator newIter = UCharacterIterator
2193: .getInstance(newText);
2194: if (newIter == null) {
2195: throw new IllegalStateException(
2196: "Could not create a new UCharacterIterator");
2197: }
2198: text = newIter;
2199: reset();
2200: }
2201:
2202: /**
2203: * Set the input text over which this <tt>Normalizer</tt> will iterate.
2204: * The iteration position is set to the beginning of the string.
2205: * @param newText The new string to be normalized.
2206: * @stable ICU 2.8
2207: */
2208: public void setText(UCharacterIterator newText) {
2209: try {
2210: UCharacterIterator newIter = (UCharacterIterator) newText
2211: .clone();
2212: if (newIter == null) {
2213: throw new IllegalStateException(
2214: "Could not create a new UCharacterIterator");
2215: }
2216: text = newIter;
2217: reset();
2218: } catch (CloneNotSupportedException e) {
2219: throw new IllegalStateException(
2220: "Could not clone the UCharacterIterator");
2221: }
2222: }
2223:
2224: //-------------------------------------------------------------------------
2225: // Private utility methods
2226: //-------------------------------------------------------------------------
2227:
2228: /* backward iteration --------------------------------------------------- */
2229:
2230: /*
2231: * read backwards and get norm32
2232: * return 0 if the character is <minC
2233: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
2234: * surrogate but read second!)
2235: */
2236:
2237: private static long getPrevNorm32(UCharacterIterator src,
2238: int/*unsigned*/minC, int/*unsigned*/mask, char[] chars) {
2239: long norm32;
2240: int ch = 0;
2241: /* need src.hasPrevious() */
2242: if ((ch = src.previous()) == UCharacterIterator.DONE) {
2243: return 0;
2244: }
2245: chars[0] = (char) ch;
2246: chars[1] = 0;
2247:
2248: /* check for a surrogate before getting norm32 to see if we need to
2249: * predecrement further */
2250: if (chars[0] < minC) {
2251: return 0;
2252: } else if (!UTF16.isSurrogate(chars[0])) {
2253: return NormalizerImpl.getNorm32(chars[0]);
2254: } else if (UTF16.isLeadSurrogate(chars[0])
2255: || (src.getIndex() == 0)) {
2256: /* unpaired surrogate */
2257: chars[1] = (char) src.current();
2258: return 0;
2259: } else if (UTF16.isLeadSurrogate(chars[1] = (char) src
2260: .previous())) {
2261: norm32 = NormalizerImpl.getNorm32(chars[1]);
2262: if ((norm32 & mask) == 0) {
2263: /* all surrogate pairs with this lead surrogate have irrelevant
2264: * data */
2265: return 0;
2266: } else {
2267: /* norm32 must be a surrogate special */
2268: return NormalizerImpl.getNorm32FromSurrogatePair(
2269: norm32, chars[0]);
2270: }
2271: } else {
2272: /* unpaired second surrogate, undo the c2=src.previous() movement */
2273: src.moveIndex(1);
2274: return 0;
2275: }
2276: }
2277:
2278: private interface IsPrevBoundary {
2279: public boolean isPrevBoundary(UCharacterIterator src,
2280: int/*unsigned*/minC, int/*unsigned*/mask, char[] chars);
2281: }
2282:
2283: private static final class IsPrevNFDSafe implements IsPrevBoundary {
2284: /*
2285: * for NF*D:
2286: * read backwards and check if the lead combining class is 0
2287: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
2288: * surrogate but read second!)
2289: */
2290: public boolean isPrevBoundary(UCharacterIterator src,
2291: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
2292: char[] chars) {
2293:
2294: return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
2295: ccOrQCMask, chars), ccOrQCMask, ccOrQCMask
2296: & NormalizerImpl.QC_MASK);
2297: }
2298: }
2299:
2300: private static final class IsPrevTrueStarter implements
2301: IsPrevBoundary {
2302: /*
2303: * read backwards and check if the character is (or its decomposition
2304: * begins with) a "true starter" (cc==0 and NF*C_YES)
2305: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
2306: * surrogate but read second!)
2307: */
2308: public boolean isPrevBoundary(UCharacterIterator src,
2309: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
2310: char[] chars) {
2311: long norm32;
2312: int/*unsigned*/decompQCMask;
2313:
2314: decompQCMask = (ccOrQCMask << 2) & 0xf; /*decomposition quick check mask*/
2315: norm32 = getPrevNorm32(src, minC,
2316: ccOrQCMask | decompQCMask, chars);
2317: return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask,
2318: decompQCMask);
2319: }
2320: }
2321:
2322: private static int findPreviousIterationBoundary(
2323: UCharacterIterator src, IsPrevBoundary obj,
2324: int/*unsigned*/minC, int/*mask*/mask, char[] buffer,
2325: int[] startIndex) {
2326: char[] chars = new char[2];
2327: boolean isBoundary;
2328:
2329: /* fill the buffer from the end backwards */
2330: startIndex[0] = buffer.length;
2331: chars[0] = 0;
2332: while (src.getIndex() > 0
2333: && chars[0] != UCharacterIterator.DONE) {
2334: isBoundary = obj.isPrevBoundary(src, minC, mask, chars);
2335:
2336: /* always write this character to the front of the buffer */
2337: /* make sure there is enough space in the buffer */
2338: if (startIndex[0] < (chars[1] == 0 ? 1 : 2)) {
2339:
2340: // grow the buffer
2341: char[] newBuf = new char[buffer.length * 2];
2342: /* move the current buffer contents up */
2343: System
2344: .arraycopy(
2345: buffer,
2346: startIndex[0],
2347: newBuf,
2348: newBuf.length
2349: - (buffer.length - startIndex[0]),
2350: buffer.length - startIndex[0]);
2351: //adjust the startIndex
2352: startIndex[0] += newBuf.length - buffer.length;
2353:
2354: buffer = newBuf;
2355: newBuf = null;
2356:
2357: }
2358:
2359: buffer[--startIndex[0]] = chars[0];
2360: if (chars[1] != 0) {
2361: buffer[--startIndex[0]] = chars[1];
2362: }
2363:
2364: /* stop if this just-copied character is a boundary */
2365: if (isBoundary) {
2366: break;
2367: }
2368: }
2369:
2370: /* return the length of the buffer contents */
2371: return buffer.length - startIndex[0];
2372: }
2373:
2374: private static int previous(UCharacterIterator src, char[] dest,
2375: int destStart, int destLimit, Mode mode,
2376: boolean doNormalize, boolean[] pNeededToNormalize,
2377: int options) {
2378:
2379: IsPrevBoundary isPreviousBoundary;
2380: int destLength, bufferLength;
2381: int/*unsigned*/mask;
2382:
2383: int c, c2;
2384:
2385: char minC;
2386: int destCapacity = destLimit - destStart;
2387: destLength = 0;
2388:
2389: if (pNeededToNormalize != null) {
2390: pNeededToNormalize[0] = false;
2391: }
2392: minC = (char) mode.getMinC();
2393: mask = mode.getMask();
2394: isPreviousBoundary = mode.getPrevBoundary();
2395:
2396: if (isPreviousBoundary == null) {
2397: destLength = 0;
2398: if ((c = src.previous()) >= 0) {
2399: destLength = 1;
2400: if (UTF16.isTrailSurrogate((char) c)) {
2401: c2 = src.previous();
2402: if (c2 != UCharacterIterator.DONE) {
2403: if (UTF16.isLeadSurrogate((char) c2)) {
2404: if (destCapacity >= 2) {
2405: dest[1] = (char) c; // trail surrogate
2406: destLength = 2;
2407: }
2408: // lead surrogate to be written below
2409: c = c2;
2410: } else {
2411: src.moveIndex(1);
2412: }
2413: }
2414: }
2415:
2416: if (destCapacity > 0) {
2417: dest[0] = (char) c;
2418: }
2419: }
2420: return destLength;
2421: }
2422:
2423: char[] buffer = new char[100];
2424: int[] startIndex = new int[1];
2425: bufferLength = findPreviousIterationBoundary(src,
2426: isPreviousBoundary, minC, mask, buffer, startIndex);
2427: if (bufferLength > 0) {
2428: if (doNormalize) {
2429: destLength = Normalizer.normalize(buffer,
2430: startIndex[0], startIndex[0] + bufferLength,
2431: dest, destStart, destLimit, mode, options);
2432:
2433: if (pNeededToNormalize != null) {
2434: pNeededToNormalize[0] = (boolean) (destLength != bufferLength || Utility
2435: .arrayRegionMatches(buffer, 0, dest,
2436: destStart, destLimit));
2437: }
2438: } else {
2439: /* just copy the source characters */
2440: if (destCapacity > 0) {
2441: System
2442: .arraycopy(
2443: buffer,
2444: startIndex[0],
2445: dest,
2446: 0,
2447: (bufferLength < destCapacity) ? bufferLength
2448: : destCapacity);
2449: }
2450: }
2451: }
2452:
2453: return destLength;
2454: }
2455:
2456: /* forward iteration ---------------------------------------------------- */
2457: /*
2458: * read forward and check if the character is a next-iteration boundary
2459: * if c2!=0 then (c, c2) is a surrogate pair
2460: */
2461: private interface IsNextBoundary {
2462: boolean isNextBoundary(UCharacterIterator src,
2463: int/*unsigned*/minC, int/*unsigned*/mask, int[] chars);
2464: }
2465:
2466: /*
2467: * read forward and get norm32
2468: * return 0 if the character is <minC
2469: * if c2!=0 then (c2, c) is a surrogate pair
2470: * always reads complete characters
2471: */
2472: private static long /*unsigned*/getNextNorm32(
2473: UCharacterIterator src, int/*unsigned*/minC,
2474: int/*unsigned*/mask, int[] chars) {
2475: long norm32;
2476:
2477: /* need src.hasNext() to be true */
2478: chars[0] = src.next();
2479: chars[1] = 0;
2480:
2481: if (chars[0] < minC) {
2482: return 0;
2483: }
2484:
2485: norm32 = NormalizerImpl.getNorm32((char) chars[0]);
2486: if (UTF16.isLeadSurrogate((char) chars[0])) {
2487: if (src.current() != UCharacterIterator.DONE
2488: && UTF16.isTrailSurrogate((char) (chars[1] = src
2489: .current()))) {
2490: src.moveIndex(1); /* skip the c2 surrogate */
2491: if ((norm32 & mask) == 0) {
2492: /* irrelevant data */
2493: return 0;
2494: } else {
2495: /* norm32 must be a surrogate special */
2496: return NormalizerImpl.getNorm32FromSurrogatePair(
2497: norm32, (char) chars[1]);
2498: }
2499: } else {
2500: /* unmatched surrogate */
2501: return 0;
2502: }
2503: }
2504: return norm32;
2505: }
2506:
2507: /*
2508: * for NF*D:
2509: * read forward and check if the lead combining class is 0
2510: * if c2!=0 then (c, c2) is a surrogate pair
2511: */
2512: private static final class IsNextNFDSafe implements IsNextBoundary {
2513: public boolean isNextBoundary(UCharacterIterator src,
2514: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
2515: int[] chars) {
2516: return NormalizerImpl.isNFDSafe(getNextNorm32(src, minC,
2517: ccOrQCMask, chars), ccOrQCMask, ccOrQCMask
2518: & NormalizerImpl.QC_MASK);
2519: }
2520: }
2521:
2522: /*
2523: * for NF*C:
2524: * read forward and check if the character is (or its decomposition begins
2525: * with) a "true starter" (cc==0 and NF*C_YES)
2526: * if c2!=0 then (c, c2) is a surrogate pair
2527: */
2528: private static final class IsNextTrueStarter implements
2529: IsNextBoundary {
2530: public boolean isNextBoundary(UCharacterIterator src,
2531: int/*unsigned*/minC, int/*unsigned*/ccOrQCMask,
2532: int[] chars) {
2533: long norm32;
2534: int/*unsigned*/decompQCMask;
2535:
2536: decompQCMask = (ccOrQCMask << 2) & 0xf; /*decomposition quick check mask*/
2537: norm32 = getNextNorm32(src, minC,
2538: ccOrQCMask | decompQCMask, chars);
2539: return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask,
2540: decompQCMask);
2541: }
2542: }
2543:
2544: private static int findNextIterationBoundary(
2545: UCharacterIterator src, IsNextBoundary obj,
2546: int/*unsigned*/minC, int/*unsigned*/mask, char[] buffer) {
2547: int[] chars = new int[2];
2548: int bufferIndex = 0;
2549:
2550: if (src.current() == UCharacterIterator.DONE) {
2551: return 0;
2552: }
2553: /* get one character and ignore its properties */
2554: chars[0] = src.next();
2555: buffer[0] = (char) chars[0];
2556: bufferIndex = 1;
2557:
2558: if (UTF16.isLeadSurrogate((char) chars[0])
2559: && src.current() != UCharacterIterator.DONE) {
2560: if (UTF16.isTrailSurrogate((char) (chars[1] = src.next()))) {
2561: buffer[bufferIndex++] = (char) chars[1];
2562: } else {
2563: src.moveIndex(-1); /* back out the non-trail-surrogate */
2564: }
2565: }
2566:
2567: /* get all following characters until we see a boundary */
2568: /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
2569: * is part of the string */
2570: while (src.current() != UCharacterIterator.DONE) {
2571: if (obj.isNextBoundary(src, minC, mask, chars)) {
2572: /* back out the latest movement to stop at the boundary */
2573: src.moveIndex(chars[1] == 0 ? -1 : -2);
2574: break;
2575: } else {
2576: if (bufferIndex + (chars[1] == 0 ? 1 : 2) <= buffer.length) {
2577: buffer[bufferIndex++] = (char) chars[0];
2578: if (chars[1] != 0) {
2579: buffer[bufferIndex++] = (char) chars[1];
2580: }
2581: } else {
2582: char[] newBuf = new char[buffer.length * 2];
2583: System.arraycopy(buffer, 0, newBuf, 0, bufferIndex);
2584: buffer = newBuf;
2585: buffer[bufferIndex++] = (char) chars[0];
2586: if (chars[1] != 0) {
2587: buffer[bufferIndex++] = (char) chars[1];
2588: }
2589: }
2590: }
2591: }
2592:
2593: /* return the length of the buffer contents */
2594: return bufferIndex;
2595: }
2596:
2597: private static int next(UCharacterIterator src, char[] dest,
2598: int destStart, int destLimit, Normalizer.Mode mode,
2599: boolean doNormalize, boolean[] pNeededToNormalize,
2600: int options) {
2601:
2602: IsNextBoundary isNextBoundary;
2603: int /*unsigned*/mask;
2604: int /*unsigned*/bufferLength;
2605: int c, c2;
2606: char minC;
2607: int destCapacity = destLimit - destStart;
2608: int destLength = 0;
2609:
2610: if (pNeededToNormalize != null) {
2611: pNeededToNormalize[0] = false;
2612: }
2613:
2614: minC = (char) mode.getMinC();
2615: mask = mode.getMask();
2616: isNextBoundary = mode.getNextBoundary();
2617:
2618: if (isNextBoundary == null) {
2619: destLength = 0;
2620: c = src.next();
2621: if (c != UCharacterIterator.DONE) {
2622: destLength = 1;
2623: if (UTF16.isLeadSurrogate((char) c)) {
2624: c2 = src.next();
2625: if (c2 != UCharacterIterator.DONE) {
2626: if (UTF16.isTrailSurrogate((char) c2)) {
2627: if (destCapacity >= 2) {
2628: dest[1] = (char) c2; // trail surrogate
2629: destLength = 2;
2630: }
2631: // lead surrogate to be written below
2632: } else {
2633: src.moveIndex(-1);
2634: }
2635: }
2636: }
2637:
2638: if (destCapacity > 0) {
2639: dest[0] = (char) c;
2640: }
2641: }
2642: return destLength;
2643: }
2644:
2645: char[] buffer = new char[100];
2646: int[] startIndex = new int[1];
2647:
2648: bufferLength = findNextIterationBoundary(src, isNextBoundary,
2649: minC, mask, buffer);
2650: if (bufferLength > 0) {
2651: if (doNormalize) {
2652: destLength = mode.normalize(buffer, startIndex[0],
2653: bufferLength, dest, destStart, destLimit,
2654: options);
2655:
2656: if (pNeededToNormalize != null) {
2657: pNeededToNormalize[0] = (boolean) (destLength != bufferLength || Utility
2658: .arrayRegionMatches(buffer, startIndex[0],
2659: dest, destStart, destLength));
2660: }
2661: } else {
2662: /* just copy the source characters */
2663: if (destCapacity > 0) {
2664: System.arraycopy(buffer, 0, dest, destStart, Math
2665: .min(bufferLength, destCapacity));
2666: }
2667:
2668: }
2669: }
2670: return destLength;
2671: }
2672:
/* Marks the internal buffer as empty by setting bufferLimit, bufferStart
 * and bufferPos to 0; the buffer array itself is not touched. */
private void clearBuffer() {
    bufferLimit = bufferStart = bufferPos = 0;
}
2676:
2677: private boolean nextNormalize() {
2678:
2679: clearBuffer();
2680: currentIndex = nextIndex;
2681: text.setIndex(nextIndex);
2682:
2683: bufferLimit = next(text, buffer, bufferStart, buffer.length,
2684: mode, true, null, options);
2685:
2686: nextIndex = text.getIndex();
2687: return (bufferLimit > 0);
2688: }
2689:
2690: private boolean previousNormalize() {
2691:
2692: clearBuffer();
2693: nextIndex = currentIndex;
2694: text.setIndex(currentIndex);
2695: bufferLimit = previous(text, buffer, bufferStart,
2696: buffer.length, mode, true, null, options);
2697:
2698: currentIndex = text.getIndex();
2699: bufferPos = bufferLimit;
2700: return bufferLimit > 0;
2701: }
2702:
2703: private int getCodePointAt(int index) {
2704: if (UTF16.isSurrogate(buffer[index])) {
2705: if (UTF16.isLeadSurrogate(buffer[index])) {
2706: if ((index + 1) < bufferLimit
2707: && UTF16.isTrailSurrogate(buffer[index + 1])) {
2708: return UCharacterProperty.getRawSupplementary(
2709: buffer[index], buffer[index + 1]);
2710: }
2711: } else if (UTF16.isTrailSurrogate(buffer[index])) {
2712: if (index > 0
2713: && UTF16.isLeadSurrogate(buffer[index - 1])) {
2714: return UCharacterProperty.getRawSupplementary(
2715: buffer[index - 1], buffer[index]);
2716: }
2717: }
2718: }
2719: return buffer[index];
2720:
2721: }
2722:
/**
 * Internal API.
 * Delegates to the given mode's isNFSkippable(c).
 * @param c the code point to test
 * @param mode the normalization mode whose isNFSkippable is consulted
 * @return the result of mode.isNFSkippable(c)
 * @internal
 * @deprecated This API is ICU internal only.
 */
public static boolean isNFSkippable(int c, Mode mode) {
    return mode.isNFSkippable(c);
}
2731:
2732: private static int internalCompare(char[] s1, int s1Start,
2733: int s1Limit, char[] s2, int s2Start, int s2Limit,
2734: int options) {
2735:
2736: char[] fcd1 = new char[300];
2737: char[] fcd2 = new char[300];
2738:
2739: Normalizer.Mode mode;
2740: int result;
2741:
2742: if (s1 == null || s1Start < 0 || s1Limit < 0 || s2 == null
2743: || s2Start < 0 || s2Limit < 0 || s1Limit < s1Start
2744: || s2Limit < s2Start) {
2745:
2746: throw new IllegalArgumentException();
2747: }
2748:
2749: UnicodeSet nx = NormalizerImpl
2750: .getNX((int) (options >> Normalizer.COMPARE_NORM_OPTIONS_SHIFT));
2751: options |= NormalizerImpl.COMPARE_EQUIV;
2752: result = 0;
2753:
2754: /*
2755: * UAX #21 Case Mappings, as fixed for Unicode version 4
2756: * (see Jitterbug 2021), defines a canonical caseless match as
2757: *
2758: * A string X is a canonical caseless match
2759: * for a string Y if and only if
2760: * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
2761: *
2762: * For better performance, we check for FCD (or let the caller tell us that
2763: * both strings are in FCD) for the inner normalization.
2764: * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
2765: * case-folding preserves the FCD-ness of a string.
2766: * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
2767: * when there is a difference.
2768: *
2769: * Exception: When using the Turkic case-folding option, we do perform
2770: * full NFD first. This is because in the Turkic case precomposed characters
2771: * with 0049 capital I or 0069 small i fold differently whether they
2772: * are first decomposed or not, so an FCD check - a check only for
2773: * canonical order - is not sufficient.
2774: */
2775: if ((options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) > 0) {
2776: mode = Normalizer.NFD;
2777: options &= ~Normalizer.INPUT_IS_FCD;
2778: } else {
2779: mode = Normalizer.FCD;
2780: }
2781: if ((options & Normalizer.INPUT_IS_FCD) == 0) {
2782: char[] dest;
2783: int fcdLen1, fcdLen2;
2784: boolean isFCD1, isFCD2;
2785:
2786: // check if s1 and/or s2 fulfill the FCD conditions
2787: isFCD1 = Normalizer.YES == mode.quickCheck(s1, s1Start,
2788: s1Limit, true, nx);
2789: isFCD2 = Normalizer.YES == mode.quickCheck(s2, s2Start,
2790: s2Limit, true, nx);
2791: /*
2792: * ICU 2.4 had a further optimization:
2793: * If both strings were not in FCD, then they were both NFD'ed,
2794: * and the COMPARE_EQUIV option was turned off.
2795: * It is not entirely clear that this is valid with the current
2796: * definition of the canonical caseless match.
2797: * Therefore, ICU 2.6 removes that optimization.
2798: */
2799:
2800: if (!isFCD1) {
2801: fcdLen1 = mode.normalize(s1, 0, s1.length, fcd1, 0,
2802: fcd1.length, nx);
2803:
2804: if (fcdLen1 > fcd1.length) {
2805: dest = new char[fcdLen1];
2806: fcdLen1 = mode.normalize(s1, 0, s1.length, dest, 0,
2807: dest.length, nx);
2808: s1 = dest;
2809: } else {
2810: s1 = fcd1;
2811: }
2812: s1Limit = fcdLen1;
2813: s1Start = 0;
2814: }
2815:
2816: if (!isFCD2) {
2817: fcdLen2 = mode.normalize(s2, s2Start, s2Limit, fcd2, 0,
2818: fcd2.length, nx);
2819:
2820: if (fcdLen2 > fcd2.length) {
2821: dest = new char[fcdLen2];
2822: fcdLen2 = mode.normalize(s2, s2Start, s2Limit,
2823: dest, 0, dest.length, nx);
2824: s2 = dest;
2825: } else {
2826: s2 = fcd2;
2827: }
2828: s2Limit = fcdLen2;
2829: s2Start = 0;
2830: }
2831:
2832: }
2833:
2834: result = NormalizerImpl.cmpEquivFold(s1, s1Start, s1Limit, s2,
2835: s2Start, s2Limit, options);
2836: return result;
2837: }
2838:
2839: /**
2840: * Fetches the Unicode version burned into the Normalization data file
2841: * @return VersionInfo version information of the normalizer
2842: * @internal
2843: * @deprecated This API is ICU internal only.
2844: */
2845: static VersionInfo getUnicodeVersion() {
2846: return NormalizerImpl.getUnicodeVersion();
2847: }
2848: }
|