0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007:
0008: package com.ibm.icu.impl;
0009:
0010: import java.io.ByteArrayInputStream;
0011: import java.io.IOException;
0012: import java.io.BufferedInputStream;
0013: import java.io.InputStream;
0014: import java.util.MissingResourceException;
0015:
0016: import com.ibm.icu.text.Normalizer;
0017: import com.ibm.icu.text.UTF16;
0018: import com.ibm.icu.text.UnicodeSet;
0019: import com.ibm.icu.text.UnicodeSetIterator;
0020: import com.ibm.icu.util.RangeValueIterator;
0021: import com.ibm.icu.util.VersionInfo;
0022: import com.ibm.icu.lang.UCharacter;
0023:
0024: /**
0025: * @version 1.0
0026: * @author Ram Viswanadha
0027: */
0028: public final class NormalizerImpl {
// Singleton instance; constructing it loads the normalization data.
static final NormalizerImpl IMPL;

static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // MissingResourceException offers no public constructor that takes a
        // cause, so chain the original failure explicitly; otherwise the
        // stack trace of the real problem is lost.
        MissingResourceException mre =
                new MissingResourceException(e.getMessage(), "", "");
        mre.initCause(e);
        throw mre;
    }
}
0039:
0040: static final int UNSIGNED_BYTE_MASK = 0xFF;
0041: static final long UNSIGNED_INT_MASK = 0xffffffffL;
0042: /*
0043: * This new implementation of the normalization code loads its data from
0044: * unorm.icu, which is generated with the gennorm tool.
0045: * The format of that file is described at the end of this file.
0046: */
0047: private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE
0048: + "/unorm.icu";
0049:
0050: // norm32 value constants
0051:
0052: // quick check flags 0..3 set mean "no" for their forms
0053: public static final int QC_NFC = 0x11; /* no|maybe */
0054: public static final int QC_NFKC = 0x22; /* no|maybe */
0055: public static final int QC_NFD = 4; /* no */
0056: public static final int QC_NFKD = 8; /* no */
0057:
0058: public static final int QC_ANY_NO = 0xf;
0059:
0060: /* quick check flags 4..5 mean "maybe" for their forms;
0061: * test flags>=QC_MAYBE
0062: */
0063: public static final int QC_MAYBE = 0x10;
0064: public static final int QC_ANY_MAYBE = 0x30;
0065:
0066: public static final int QC_MASK = 0x3f;
0067:
0068: private static final int COMBINES_FWD = 0x40;
0069: private static final int COMBINES_BACK = 0x80;
0070: public static final int COMBINES_ANY = 0xc0;
0071: // UnicodeData.txt combining class in bits 15.
0072: private static final int CC_SHIFT = 8;
0073: public static final int CC_MASK = 0xff00;
0074: // 16 bits for the index to UChars and other extra data
0075: private static final int EXTRA_SHIFT = 16;
0076: // start of surrogate specials after shift
0077: private static final int EXTRA_INDEX_TOP = 0xfc00;
0078:
0079: private static final int EXTRA_SURROGATE_MASK = 0x3ff;
0080: private static final int EXTRA_SURROGATE_TOP = 0x3f0; /* hangul etc. */
0081:
0082: private static final int EXTRA_HANGUL = EXTRA_SURROGATE_TOP;
0083: private static final int EXTRA_JAMO_L = EXTRA_SURROGATE_TOP + 1;/* ### not used */
0084: private static final int EXTRA_JAMO_V = EXTRA_SURROGATE_TOP + 2;
0085: private static final int EXTRA_JAMO_T = EXTRA_SURROGATE_TOP + 3;
0086:
0087: /* norm32 value constants using >16 bits */
0088: private static final long MIN_SPECIAL = (long) (0xfc000000 & UNSIGNED_INT_MASK);
0089: private static final long SURROGATES_TOP = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0090: private static final long MIN_HANGUL = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0091: private static final long MIN_JAMO_V = (long) (0xfff20000 & UNSIGNED_INT_MASK);
0092: private static final long JAMO_V_TOP = (long) (0xfff30000 & UNSIGNED_INT_MASK);
0093:
0094: /* indexes[] value names */
0095: /* number of bytes in normalization trie */
0096: static final int INDEX_TRIE_SIZE = 0;
0097: /* number of chars in extra data */
0098: static final int INDEX_CHAR_COUNT = 1;
0099: /* number of uint16_t words for combining data */
0100: static final int INDEX_COMBINE_DATA_COUNT = 2;
0101: /* number of code points that combine forward */
0102: static final int INDEX_COMBINE_FWD_COUNT = 3;
0103: /* number of code points that combine forward and backward */
0104: static final int INDEX_COMBINE_BOTH_COUNT = 4;
0105: /* number of code points that combine backward */
0106: static final int INDEX_COMBINE_BACK_COUNT = 5;
0107: /* first code point with quick check NFC NO/MAYBE */
0108: public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
0109: /* first code point with quick check NFKC NO/MAYBE */
0110: public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
0111: /* first code point with quick check NFD NO/MAYBE */
0112: public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
0113: /* first code point with quick check NFKD NO/MAYBE */
0114: public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
0115: /* number of bytes in FCD trie */
0116: static final int INDEX_FCD_TRIE_SIZE = 10;
0117: /* number of bytes in the auxiliary trie */
0118: static final int INDEX_AUX_TRIE_SIZE = 11;
0119: /* number of uint16_t in the array of serialized USet */
0120: static final int INDEX_CANON_SET_COUNT = 12;
0121: /* changing this requires a new formatVersion */
0122: static final int INDEX_TOP = 32;
0123:
0124: /* AUX constants */
0125: /* value constants for auxTrie */
0126: private static final int AUX_UNSAFE_SHIFT = 11;
0127: private static final int AUX_COMP_EX_SHIFT = 10;
0128: private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
0129:
0130: private static final int AUX_MAX_FNC = ((int) 1 << AUX_COMP_EX_SHIFT);
0131: private static final int AUX_UNSAFE_MASK = (int) ((1 << AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
0132: private static final int AUX_FNC_MASK = (int) ((AUX_MAX_FNC - 1) & UNSIGNED_INT_MASK);
0133: private static final int AUX_COMP_EX_MASK = (int) ((1 << AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
0134: private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK & 1) << AUX_NFC_SKIPPABLE_F_SHIFT);
0135:
0136: /* canonStartSets[0..31] contains indexes for what is in the array */
0137: /* number of uint16_t in canonical starter sets */
0138: static final int SET_INDEX_CANON_SETS_LENGTH = 0;
0139: /* number of uint16_t in the BMP search table (contains pairs) */
0140: static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
0141: /* number of uint16_t in the supplementary search table(contains triplets)*/
0142: static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
0143: /* changing this requires a new formatVersion */
0144: static final int SET_INDEX_TOP = 32;
0145:
0146: static final int CANON_SET_INDICIES_INDEX = 0;
0147: static final int CANON_SET_START_SETS_INDEX = 1;
0148: static final int CANON_SET_BMP_TABLE_INDEX = 2;
0149: static final int CANON_SET_SUPP_TABLE_INDEX = 3;
0150: /* 14 bit indexes to canonical USerializedSets */
0151: static final int CANON_SET_MAX_CANON_SETS = 0x4000;
0152: /* single-code point BMP sets are encoded directly in the search table
0153: * except if result=0x4000..0x7fff
0154: */
0155: static final int CANON_SET_BMP_MASK = 0xc000;
0156: static final int CANON_SET_BMP_IS_INDEX = 0x4000;
0157:
0158: private static final int MAX_BUFFER_SIZE = 20;
0159:
0160: /**
0161: * Internal option for cmpEquivFold() for decomposing.
0162: * If not set, just do strcasecmp().
0163: * @internal
0164: */
0165: public static final int COMPARE_EQUIV = 0x80000;
0166:
0167: /*******************************/
0168:
0169: /* Wrappers for Trie implementations */
0170: static final class NormTrieImpl implements Trie.DataManipulate {
0171: static IntTrie normTrie = null;
0172:
0173: /**
0174: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0175: * data the index array offset of the indexes for that lead surrogate.
0176: * @param property data value for a surrogate from the trie, including
0177: * the folding offset
0178: * @return data offset or 0 if there is no data for the lead surrogate
0179: */
0180: /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
0181: public int getFoldingOffset(int value) {
0182: return BMP_INDEX_LENGTH
0183: + ((value >> (EXTRA_SHIFT - SURROGATE_BLOCK_BITS)) & (0x3ff << SURROGATE_BLOCK_BITS));
0184: }
0185:
0186: }
0187:
0188: static final class FCDTrieImpl implements Trie.DataManipulate {
0189: static CharTrie fcdTrie = null;
0190:
0191: /**
0192: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0193: * data the index array offset of the indexes for that lead surrogate.
0194: * @param property data value for a surrogate from the trie, including
0195: * the folding offset
0196: * @return data offset or 0 if there is no data for the lead surrogate
0197: */
0198: /* fcdTrie: the folding offset is the lead FCD value itself */
0199: public int getFoldingOffset(int value) {
0200: return value;
0201: }
0202: }
0203:
0204: static final class AuxTrieImpl implements Trie.DataManipulate {
0205: static CharTrie auxTrie = null;
0206:
0207: /**
0208: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0209: * data the index array offset of the indexes for that lead surrogate.
0210: * @param property data value for a surrogate from the trie, including
0211: * the folding offset
0212: * @return data offset or 0 if there is no data for the lead surrogate
0213: */
0214: /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
0215: public int getFoldingOffset(int value) {
0216: return (int) (value & AUX_FNC_MASK) << SURROGATE_BLOCK_BITS;
0217: }
0218: }
0219:
0220: /****************************************************/
0221:
0222: private static FCDTrieImpl fcdTrieImpl;
0223: private static NormTrieImpl normTrieImpl;
0224: private static AuxTrieImpl auxTrieImpl;
0225: private static int[] indexes;
0226: private static char[] combiningTable;
0227: private static char[] extraData;
0228: private static Object[] canonStartSets;
0229:
0230: private static boolean isDataLoaded;
0231: private static boolean isFormatVersion_2_1;
0232: private static boolean isFormatVersion_2_2;
0233: private static byte[] unicodeVersion;
0234:
0235: /**
0236: * Default buffer size of datafile
0237: */
0238: private static final int DATA_BUFFER_SIZE = 25000;
0239:
0240: /**
0241: * FCD check: everything below this code point is known to have a 0
0242: * lead combining class
0243: */
0244: public static final int MIN_WITH_LEAD_CC = 0x300;
0245:
0246: /**
0247: * Bit 7 of the length byte for a decomposition string in extra data is
0248: * a flag indicating whether the decomposition string is
0249: * preceded by a 16-bit word with the leading and trailing cc
0250: * of the decomposition (like for A-umlaut);
0251: * if not, then both cc's are zero (like for compatibility ideographs).
0252: */
0253: private static final int DECOMP_FLAG_LENGTH_HAS_CC = 0x80;
0254: /**
0255: * Bits 6..0 of the length byte contain the actual length.
0256: */
0257: private static final int DECOMP_LENGTH_MASK = 0x7f;
0258:
0259: /** Length of the BMP portion of the index (stage 1) array. */
0260: private static final int BMP_INDEX_LENGTH = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
0261: /** Number of bits of a trail surrogate that are used in index table
0262: * lookups.
0263: */
0264: private static final int SURROGATE_BLOCK_BITS = 10 - Trie.INDEX_STAGE_1_SHIFT_;
0265:
0266: // public utility
0267: public static int getFromIndexesArr(int index) {
0268: return indexes[index];
0269: }
0270:
0271: // protected constructor ---------------------------------------------
0272:
0273: /**
0274: * Constructor
0275: * @exception thrown when data reading fails or data corrupted
0276: */
0277: private NormalizerImpl() throws IOException {
0278: //data should be loaded only once
0279: if (!isDataLoaded) {
0280:
0281: // jar access
0282: InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
0283: BufferedInputStream b = new BufferedInputStream(i,
0284: DATA_BUFFER_SIZE);
0285: NormalizerDataReader reader = new NormalizerDataReader(b);
0286:
0287: // read the indexes
0288: indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
0289:
0290: byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
0291:
0292: int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
0293: combiningTable = new char[combiningTableTop];
0294:
0295: int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
0296: extraData = new char[extraDataTop];
0297:
0298: byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
0299: byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
0300: canonStartSets = new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
0301:
0302: fcdTrieImpl = new FCDTrieImpl();
0303: normTrieImpl = new NormTrieImpl();
0304: auxTrieImpl = new AuxTrieImpl();
0305:
0306: // load the rest of the data data and initialize the data members
0307: reader.read(normBytes, fcdBytes, auxBytes, extraData,
0308: combiningTable, canonStartSets);
0309:
0310: NormTrieImpl.normTrie = new IntTrie(
0311: new ByteArrayInputStream(normBytes), normTrieImpl);
0312: FCDTrieImpl.fcdTrie = new CharTrie(
0313: new ByteArrayInputStream(fcdBytes), fcdTrieImpl);
0314: AuxTrieImpl.auxTrie = new CharTrie(
0315: new ByteArrayInputStream(auxBytes), auxTrieImpl);
0316:
0317: // we reached here without any exceptions so the data is fully
0318: // loaded set the variable to true
0319: isDataLoaded = true;
0320:
0321: // get the data format version
0322: byte[] formatVersion = reader.getDataFormatVersion();
0323:
0324: isFormatVersion_2_1 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 1));
0325: isFormatVersion_2_2 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 2));
0326: unicodeVersion = reader.getUnicodeVersion();
0327: b.close();
0328: }
0329: }
0330:
0331: /* ---------------------------------------------------------------------- */
0332:
0333: /* Korean Hangul and Jamo constants */
0334:
0335: public static final int JAMO_L_BASE = 0x1100; /* "lead" jamo */
0336: public static final int JAMO_V_BASE = 0x1161; /* "vowel" jamo */
0337: public static final int JAMO_T_BASE = 0x11a7; /* "trail" jamo */
0338:
0339: public static final int HANGUL_BASE = 0xac00;
0340:
0341: public static final int JAMO_L_COUNT = 19;
0342: public static final int JAMO_V_COUNT = 21;
0343: public static final int JAMO_T_COUNT = 28;
0344: public static final int HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT
0345: * JAMO_T_COUNT;
0346:
0347: private static boolean isHangulWithoutJamoT(char c) {
0348: c -= HANGUL_BASE;
0349: return c < HANGUL_COUNT && c % JAMO_T_COUNT == 0;
0350: }
0351:
0352: /* norm32 helpers */
0353:
0354: /* is this a norm32 with a regular index? */
0355: private static boolean isNorm32Regular(long norm32) {
0356: return norm32 < MIN_SPECIAL;
0357: }
0358:
0359: /* is this a norm32 with a special index for a lead surrogate? */
0360: private static boolean isNorm32LeadSurrogate(long norm32) {
0361: return MIN_SPECIAL <= norm32 && norm32 < SURROGATES_TOP;
0362: }
0363:
0364: /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
0365: private static boolean isNorm32HangulOrJamo(long norm32) {
0366: return norm32 >= MIN_HANGUL;
0367: }
0368:
0369: /*
0370: * Given isNorm32HangulOrJamo(),
0371: * is this a Hangul syllable or a Jamo?
0372: */
0373: ///CLOVER:OFF
0374: private static boolean isHangulJamoNorm32HangulOrJamoL(long norm32) {
0375: return norm32 < MIN_JAMO_V;
0376: }
0377:
0378: ///CLOVER:ON
0379:
0380: /*
0381: * Given norm32 for Jamo V or T,
0382: * is this a Jamo V?
0383: */
0384: private static boolean isJamoVTNorm32JamoV(long norm32) {
0385: return norm32 < JAMO_V_TOP;
0386: }
0387:
0388: /* data access primitives ----------------------------------------------- */
0389:
0390: public static long/*unsigned*/getNorm32(char c) {
0391: return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie
0392: .getLeadValue(c)));
0393: }
0394:
0395: public static long/*unsigned*/getNorm32FromSurrogatePair(
0396: long norm32, char c2) {
0397: /*
0398: * the surrogate index in norm32 stores only the number of the surrogate
0399: * index block see gennorm/store.c/getFoldedNormValue()
0400: */
0401: return ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie
0402: .getTrailValue((int) norm32, c2));
0403: }
0404:
0405: ///CLOVER:OFF
0406: private static long getNorm32(int c) {
0407: return (UNSIGNED_INT_MASK & (NormTrieImpl.normTrie
0408: .getCodePointValue(c)));
0409: }
0410:
0411: private static long getNorm32(int c, int mask) {
0412: long/*unsigned*/norm32 = getNorm32(UTF16.getLeadSurrogate(c));
0413: if (((norm32 & mask) > 0) && isNorm32LeadSurrogate(norm32)) {
0414: /* c is a lead surrogate, get the real norm32 */
0415: norm32 = getNorm32FromSurrogatePair(norm32, UTF16
0416: .getTrailSurrogate(c));
0417: }
0418: return norm32;
0419: }
0420:
0421: ///CLOVER:ON
0422:
0423: /*
0424: * get a norm32 from text with complete code points
0425: * (like from decompositions)
0426: */
0427: private static long/*unsigned*/getNorm32(char[] p, int start,
0428: int/*unsigned*/mask) {
0429: long/*unsigned*/norm32 = getNorm32(p[start]);
0430: if (((norm32 & mask) > 0) && isNorm32LeadSurrogate(norm32)) {
0431: /* *p is a lead surrogate, get the real norm32 */
0432: norm32 = getNorm32FromSurrogatePair(norm32, p[start + 1]);
0433: }
0434: return norm32;
0435: }
0436:
0437: public static VersionInfo getUnicodeVersion() {
0438: return VersionInfo
0439: .getInstance(unicodeVersion[0], unicodeVersion[1],
0440: unicodeVersion[2], unicodeVersion[3]);
0441: }
0442:
0443: public static char getFCD16(char c) {
0444: return FCDTrieImpl.fcdTrie.getLeadValue(c);
0445: }
0446:
0447: public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
0448: /* the surrogate index in fcd16 is an absolute offset over the
0449: * start of stage 1
0450: * */
0451: return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
0452: }
0453:
0454: public static int getFCD16(int c) {
0455: return FCDTrieImpl.fcdTrie.getCodePointValue(c);
0456: }
0457:
0458: private static int getExtraDataIndex(long norm32) {
0459: return (int) (norm32 >> EXTRA_SHIFT);
0460: }
0461:
0462: private static final class DecomposeArgs {
0463: int /*unsigned byte*/cc;
0464: int /*unsigned byte*/trailCC;
0465: int length;
0466: }
0467:
0468: /**
0469: *
0470: * get the canonical or compatibility decomposition for one character
0471: *
0472: * @return index into the extraData array
0473: */
0474: private static int/*index*/decompose(long/*unsigned*/norm32,
0475: int/*unsigned*/qcMask, DecomposeArgs args) {
0476: int p = getExtraDataIndex(norm32);
0477: args.length = extraData[p++];
0478:
0479: if ((norm32 & qcMask & QC_NFKD) != 0 && args.length >= 0x100) {
0480: /* use compatibility decomposition, skip canonical data */
0481: p += ((args.length >> 7) & 1)
0482: + (args.length & DECOMP_LENGTH_MASK);
0483: args.length >>= 8;
0484: }
0485:
0486: if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0487: /* get the lead and trail cc's */
0488: char bothCCs = extraData[p++];
0489: args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0490: args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0491: } else {
0492: /* lead and trail cc's are both 0 */
0493: args.cc = args.trailCC = 0;
0494: }
0495:
0496: args.length &= DECOMP_LENGTH_MASK;
0497: return p;
0498: }
0499:
0500: /**
0501: * get the canonical decomposition for one character
0502: * @return index into the extraData array
0503: */
0504: private static int decompose(long/*unsigned*/norm32,
0505: DecomposeArgs args) {
0506:
0507: int p = getExtraDataIndex(norm32);
0508: args.length = extraData[p++];
0509:
0510: if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0511: /* get the lead and trail cc's */
0512: char bothCCs = extraData[p++];
0513: args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0514: args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0515: } else {
0516: /* lead and trail cc's are both 0 */
0517: args.cc = args.trailCC = 0;
0518: }
0519:
0520: args.length &= DECOMP_LENGTH_MASK;
0521: return p;
0522: }
0523:
0524: private static final class NextCCArgs {
0525: char[] source;
0526: int next;
0527: int limit;
0528: char c;
0529: char c2;
0530: }
0531:
0532: /*
0533: * get the combining class of (c, c2)= args.source[args.next++]
0534: * before: args.next<args.limit after: args.next<=args.limit
0535: * if only one code unit is used, then c2==0
0536: */
0537: private static int /*unsigned byte*/getNextCC(NextCCArgs args) {
0538: long /*unsigned*/norm32;
0539:
0540: args.c = args.source[args.next++];
0541:
0542: norm32 = getNorm32(args.c);
0543: if ((norm32 & CC_MASK) == 0) {
0544: args.c2 = 0;
0545: return 0;
0546: } else {
0547: if (!isNorm32LeadSurrogate(norm32)) {
0548: args.c2 = 0;
0549: } else {
0550: /* c is a lead surrogate, get the real norm32 */
0551: if (args.next != args.limit
0552: && UTF16
0553: .isTrailSurrogate(args.c2 = args.source[args.next])) {
0554: ++args.next;
0555: norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
0556: } else {
0557: args.c2 = 0;
0558: return 0;
0559: }
0560: }
0561:
0562: return (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
0563: }
0564: }
0565:
0566: private static final class PrevArgs {
0567: char[] src;
0568: int start;
0569: int current;
0570: char c;
0571: char c2;
0572: }
0573:
0574: /*
0575: * read backwards and get norm32
0576: * return 0 if the character is <minC
0577: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
0578: * surrogate but read second!)
0579: */
0580: private static long /*unsigned*/getPrevNorm32(PrevArgs args,
0581: int/*unsigned*/minC, int/*unsigned*/mask) {
0582: long/*unsigned*/norm32;
0583:
0584: args.c = args.src[--args.current];
0585: args.c2 = 0;
0586:
0587: /* check for a surrogate before getting norm32 to see if we need to
0588: * predecrement further
0589: */
0590: if (args.c < minC) {
0591: return 0;
0592: } else if (!UTF16.isSurrogate(args.c)) {
0593: return getNorm32(args.c);
0594: } else if (UTF16.isLeadSurrogate(args.c)) {
0595: /* unpaired first surrogate */
0596: return 0;
0597: } else if (args.current != args.start
0598: && UTF16
0599: .isLeadSurrogate(args.c2 = args.src[args.current - 1])) {
0600: --args.current;
0601: norm32 = getNorm32(args.c2);
0602:
0603: if ((norm32 & mask) == 0) {
0604: /* all surrogate pairs with this lead surrogate have
0605: * only irrelevant data
0606: */
0607: return 0;
0608: } else {
0609: /* norm32 must be a surrogate special */
0610: return getNorm32FromSurrogatePair(norm32, args.c);
0611: }
0612: } else {
0613: /* unpaired second surrogate */
0614: args.c2 = 0;
0615: return 0;
0616: }
0617: }
0618:
0619: /*
0620: * get the combining class of (c, c2)=*--p
0621: * before: start<p after: start<=p
0622: */
0623: private static int /*unsigned byte*/getPrevCC(PrevArgs args) {
0624:
0625: return (int) ((UNSIGNED_BYTE_MASK) & (getPrevNorm32(args,
0626: MIN_WITH_LEAD_CC, CC_MASK) >> CC_SHIFT));
0627: }
0628:
0629: /*
0630: * is this a safe boundary character for NF*D?
0631: * (lead cc==0)
0632: */
0633: public static boolean isNFDSafe(long/*unsigned*/norm32,
0634: int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0635: if ((norm32 & ccOrQCMask) == 0) {
0636: return true; /* cc==0 and no decomposition: this is NF*D safe */
0637: }
0638:
0639: /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
0640: if (isNorm32Regular(norm32) && (norm32 & decompQCMask) != 0) {
0641: DecomposeArgs args = new DecomposeArgs();
0642: /* decomposes, get everything from the variable-length extra data */
0643: decompose(norm32, decompQCMask, args);
0644: return args.cc == 0;
0645: } else {
0646: /* no decomposition (or Hangul), test the cc directly */
0647: return (norm32 & CC_MASK) == 0;
0648: }
0649: }
0650:
0651: /*
0652: * is this (or does its decomposition begin with) a "true starter"?
0653: * (cc==0 and NF*C_YES)
0654: */
0655: public static boolean isTrueStarter(long/*unsigned*/norm32,
0656: int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0657: if ((norm32 & ccOrQCMask) == 0) {
0658: return true; /* this is a true starter (could be Hangul or Jamo L)*/
0659: }
0660:
0661: /* inspect its decomposition - not a Hangul or a surrogate here */
0662: if ((norm32 & decompQCMask) != 0) {
0663: int p; /* index into extra data array */
0664: DecomposeArgs args = new DecomposeArgs();
0665: /* decomposes, get everything from the variable-length extra data */
0666: p = decompose(norm32, decompQCMask, args);
0667:
0668: if (args.cc == 0) {
0669: int/*unsigned*/qcMask = ccOrQCMask & QC_MASK;
0670:
0671: /* does it begin with NFC_YES? */
0672: if ((getNorm32(extraData, p, qcMask) & qcMask) == 0) {
0673: /* yes, the decomposition begins with a true starter */
0674: return true;
0675: }
0676: }
0677: }
0678: return false;
0679: }
0680:
0681: /* reorder UTF-16 in-place ---------------------------------------------- */
0682:
0683: /**
0684: * simpler, single-character version of mergeOrdered() -
0685: * bubble-insert one single code point into the preceding string
0686: * which is already canonically ordered
0687: * (c, c2) may or may not yet have been inserted at src[current]..src[p]
0688: *
0689: * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
0690: *
0691: * before: src[start]..src[current] is already ordered, and
0692: * src[current]..src[p] may or may not hold (c, c2) but
0693: * must be exactly the same length as (c, c2)
0694: * after: src[start]..src[p] is ordered
0695: *
0696: * @return the trailing combining class
0697: */
0698: private static int/*unsigned byte*/insertOrdered(char[] source,
0699: int start, int current, int p, char c, char c2,
0700: int/*unsigned byte*/cc) {
0701: int back, preBack;
0702: int r;
0703: int prevCC, trailCC = cc;
0704:
0705: if (start < current && cc != 0) {
0706: // search for the insertion point where cc>=prevCC
0707: preBack = back = current;
0708: PrevArgs prevArgs = new PrevArgs();
0709: prevArgs.current = current;
0710: prevArgs.start = start;
0711: prevArgs.src = source;
0712: // get the prevCC
0713: prevCC = getPrevCC(prevArgs);
0714: preBack = prevArgs.current;
0715:
0716: if (cc < prevCC) {
0717: // this will be the last code point, so keep its cc
0718: trailCC = prevCC;
0719: back = preBack;
0720: while (start < preBack) {
0721: prevCC = getPrevCC(prevArgs);
0722: preBack = prevArgs.current;
0723: if (cc >= prevCC) {
0724: break;
0725: }
0726: back = preBack;
0727: }
0728:
0729: // this is where we are right now with all these indicies:
0730: // [start]..[pPreBack] 0..? code points that we can ignore
0731: // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
0732: // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
0733: // [current]..[p] 1 code point (c, c2) with cc
0734:
0735: // move the code units in between up
0736: r = p;
0737: do {
0738: source[--r] = source[--current];
0739: } while (back != current);
0740: }
0741: }
0742:
0743: // insert (c, c2)
0744: source[current] = c;
0745: if (c2 != 0) {
0746: source[(current + 1)] = c2;
0747: }
0748:
0749: // we know the cc of the last code point
0750: return trailCC;
0751: }
0752:
0753: /**
0754: * merge two UTF-16 string parts together
0755: * to canonically order (order by combining classes) their concatenation
0756: *
0757: * the two strings may already be adjacent, so that the merging is done
0758: * in-place if the two strings are not adjacent, then the buffer holding the
0759: * first one must be large enough
0760: * the second string may or may not be ordered in itself
0761: *
0762: * before: [start]..[current] is already ordered, and
0763: * [next]..[limit] may be ordered in itself, but
0764: * is not in relation to [start..current[
0765: * after: [start..current+(limit-next)[ is ordered
0766: *
0767: * the algorithm is a simple bubble-sort that takes the characters from
0768: * src[next++] and inserts them in correct combining class order into the
0769: * preceding part of the string
0770: *
0771: * since this function is called much less often than the single-code point
0772: * insertOrdered(), it just uses that for easier maintenance
0773: *
0774: * @return the trailing combining class
0775: */
0776: private static int /*unsigned byte*/mergeOrdered(char[] source,
0777: int start, int current, char[] data, int next, int limit,
0778: boolean isOrdered) {
0779: int r;
0780: int /*unsigned byte*/cc, trailCC = 0;
0781: boolean adjacent;
0782:
0783: adjacent = current == next;
0784: NextCCArgs ncArgs = new NextCCArgs();
0785: ncArgs.source = data;
0786: ncArgs.next = next;
0787: ncArgs.limit = limit;
0788:
0789: if (start != current || !isOrdered) {
0790:
0791: while (ncArgs.next < ncArgs.limit) {
0792: cc = getNextCC(ncArgs);
0793: if (cc == 0) {
0794: // does not bubble back
0795: trailCC = 0;
0796: if (adjacent) {
0797: current = ncArgs.next;
0798: } else {
0799: data[current++] = ncArgs.c;
0800: if (ncArgs.c2 != 0) {
0801: data[current++] = ncArgs.c2;
0802: }
0803: }
0804: if (isOrdered) {
0805: break;
0806: } else {
0807: start = current;
0808: }
0809: } else {
0810: r = current + (ncArgs.c2 == 0 ? 1 : 2);
0811: trailCC = insertOrdered(source, start, current, r,
0812: ncArgs.c, ncArgs.c2, cc);
0813: current = r;
0814: }
0815: }
0816: }
0817:
0818: if (ncArgs.next == ncArgs.limit) {
0819: // we know the cc of the last code point
0820: return trailCC;
0821: } else {
0822: if (!adjacent) {
0823: // copy the second string part
0824: do {
0825: source[current++] = data[ncArgs.next++];
0826: } while (ncArgs.next != ncArgs.limit);
0827: ncArgs.limit = current;
0828: }
0829: PrevArgs prevArgs = new PrevArgs();
0830: prevArgs.src = data;
0831: prevArgs.start = start;
0832: prevArgs.current = ncArgs.limit;
0833: return getPrevCC(prevArgs);
0834: }
0835:
0836: }
0837:
0838: private static int /*unsigned byte*/mergeOrdered(char[] source,
0839: int start, int current, char[] data, final int next,
0840: final int limit) {
0841: return mergeOrdered(source, start, current, data, next, limit,
0842: true);
0843: }
0844:
0845: public static boolean checkFCD(char[] src, int srcStart,
0846: int srcLimit, UnicodeSet nx) {
0847:
0848: char fcd16, c, c2;
0849: int prevCC = 0, cc;
0850: int i = srcStart, length = srcLimit;
0851:
0852: for (;;) {
0853: for (;;) {
0854: if (i == length) {
0855: return true;
0856: } else if ((c = src[i++]) < MIN_WITH_LEAD_CC) {
0857: prevCC = (int) -c;
0858: } else if ((fcd16 = getFCD16(c)) == 0) {
0859: prevCC = 0;
0860: } else {
0861: break;
0862: }
0863: }
0864:
0865: // check one above-minimum, relevant code unit
0866: if (UTF16.isLeadSurrogate(c)) {
0867: // c is a lead surrogate, get the real fcd16
0868: if (i != length && UTF16.isTrailSurrogate(c2 = src[i])) {
0869: ++i;
0870: fcd16 = getFCD16FromSurrogatePair(fcd16, c2);
0871: } else {
0872: c2 = 0;
0873: fcd16 = 0;
0874: }
0875: } else {
0876: c2 = 0;
0877: }
0878:
0879: if (nx_contains(nx, c, c2)) {
0880: prevCC = 0; /* excluded: fcd16==0 */
0881: continue;
0882: }
0883:
0884: // prevCC has values from the following ranges:
0885: // 0..0xff -the previous trail combining class
0886: // <0 -the negative value of the previous code unit;
0887: // that code unit was <MIN_WITH_LEAD_CC and its getFCD16()
0888: // was deferred so that average text is checked faster
0889: //
0890:
0891: // check the combining order
0892: cc = (int) (fcd16 >> 8);
0893: if (cc != 0) {
0894: if (prevCC < 0) {
0895: // the previous character was <_NORM_MIN_WITH_LEAD_CC,
0896: // we need to get its trail cc
0897: //
0898: if (!nx_contains(nx, (int) -prevCC)) {
0899: prevCC = (int) (FCDTrieImpl.fcdTrie
0900: .getBMPValue((char) -prevCC) & 0xff);
0901: } else {
0902: prevCC = 0; /* excluded: fcd16==0 */
0903: }
0904:
0905: }
0906:
0907: if (cc < prevCC) {
0908: return false;
0909: }
0910: }
0911: prevCC = (int) (fcd16 & 0xff);
0912: }
0913: }
0914:
0915: public static Normalizer.QuickCheckResult quickCheck(char[] src,
0916: int srcStart, int srcLimit, int minNoMaybe, int qcMask,
0917: int options, boolean allowMaybe, UnicodeSet nx) {
0918:
0919: int ccOrQCMask;
0920: long norm32;
0921: char c, c2;
0922: char cc, prevCC;
0923: long qcNorm32;
0924: Normalizer.QuickCheckResult result;
0925: ComposePartArgs args = new ComposePartArgs();
0926: char[] buffer;
0927: int start = srcStart;
0928:
0929: if (!isDataLoaded) {
0930: return Normalizer.MAYBE;
0931: }
0932: // initialize
0933: ccOrQCMask = CC_MASK | qcMask;
0934: result = Normalizer.YES;
0935: prevCC = 0;
0936:
0937: for (;;) {
0938: for (;;) {
0939: if (srcStart == srcLimit) {
0940: return result;
0941: } else if ((c = src[srcStart++]) >= minNoMaybe
0942: && ((norm32 = getNorm32(c)) & ccOrQCMask) != 0) {
0943: break;
0944: }
0945: prevCC = 0;
0946: }
0947:
0948: // check one above-minimum, relevant code unit
0949: if (isNorm32LeadSurrogate(norm32)) {
0950: // c is a lead surrogate, get the real norm32
0951: if (srcStart != srcLimit
0952: && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
0953: ++srcStart;
0954: norm32 = getNorm32FromSurrogatePair(norm32, c2);
0955: } else {
0956: norm32 = 0;
0957: c2 = 0;
0958: }
0959: } else {
0960: c2 = 0;
0961: }
0962: if (nx_contains(nx, c, c2)) {
0963: /* excluded: norm32==0 */
0964: norm32 = 0;
0965: }
0966:
0967: // check the combining order
0968: cc = (char) ((norm32 >> CC_SHIFT) & 0xFF);
0969: if (cc != 0 && cc < prevCC) {
0970: return Normalizer.NO;
0971: }
0972: prevCC = cc;
0973:
0974: // check for "no" or "maybe" quick check flags
0975: qcNorm32 = norm32 & qcMask;
0976: if ((qcNorm32 & QC_ANY_NO) >= 1) {
0977: result = Normalizer.NO;
0978: break;
0979: } else if (qcNorm32 != 0) {
0980: // "maybe" can only occur for NFC and NFKC
0981: if (allowMaybe) {
0982: result = Normalizer.MAYBE;
0983: } else {
0984: // normalize a section around here to see if it is really
0985: // normalized or not
0986: int prevStarter;
0987: int/*unsigned*/decompQCMask;
0988:
0989: decompQCMask = (qcMask << 2) & 0xf; // decomposition quick check mask
0990:
0991: // find the previous starter
0992:
0993: // set prevStarter to the beginning of the current character
0994: prevStarter = srcStart - 1;
0995: if (UTF16.isTrailSurrogate(src[prevStarter])) {
0996: // safe because unpaired surrogates do not result
0997: // in "maybe"
0998: --prevStarter;
0999: }
1000:
1001: prevStarter = findPreviousStarter(src, start,
1002: prevStarter, ccOrQCMask, decompQCMask,
1003: (char) minNoMaybe);
1004:
1005: // find the next true starter in [src..limit[ - modifies
1006: // src to point to the next starter
1007: srcStart = findNextStarter(src, srcStart, srcLimit,
1008: qcMask, decompQCMask, (char) minNoMaybe);
1009:
1010: //set the args for compose part
1011: args.prevCC = prevCC;
1012:
1013: // decompose and recompose [prevStarter..src[
1014: buffer = composePart(args, prevStarter, src,
1015: srcStart, srcLimit, options, nx);
1016:
1017: // compare the normalized version with the original
1018: if (0 != strCompare(buffer, 0, args.length, src,
1019: prevStarter, srcStart, false)) {
1020: result = Normalizer.NO; // normalization differs
1021: break;
1022: }
1023:
1024: // continue after the next starter
1025: }
1026: }
1027: }
1028: return result;
1029: }
1030:
1031: //------------------------------------------------------
1032: // make NFD & NFKD
1033: //------------------------------------------------------
1034: public static int getDecomposition(int c /*UTF-32*/,
1035: boolean compat, char[] dest, int destStart, int destCapacity) {
1036:
1037: if ((UNSIGNED_INT_MASK & c) <= 0x10ffff) {
1038: long /*unsigned*/norm32;
1039: int qcMask;
1040: int minNoMaybe;
1041: int length;
1042:
1043: // initialize
1044: if (!compat) {
1045: minNoMaybe = (int) indexes[INDEX_MIN_NFD_NO_MAYBE];
1046: qcMask = QC_NFD;
1047: } else {
1048: minNoMaybe = (int) indexes[INDEX_MIN_NFKD_NO_MAYBE];
1049: qcMask = QC_NFKD;
1050: }
1051:
1052: if (c < minNoMaybe) {
1053: // trivial case
1054: if (destCapacity > 0) {
1055: dest[0] = (char) c;
1056: }
1057: return -1;
1058: }
1059:
1060: /* data lookup */
1061: norm32 = getNorm32(c);
1062: if ((norm32 & qcMask) == 0) {
1063: /* simple case: no decomposition */
1064: if (c <= 0xffff) {
1065: if (destCapacity > 0) {
1066: dest[0] = (char) c;
1067: }
1068: return -1;
1069: } else {
1070: if (destCapacity >= 2) {
1071: dest[0] = UTF16.getLeadSurrogate(c);
1072: dest[1] = UTF16.getTrailSurrogate(c);
1073: }
1074: return -2;
1075: }
1076: } else if (isNorm32HangulOrJamo(norm32)) {
1077: /* Hangul syllable: decompose algorithmically */
1078: char c2;
1079:
1080: c -= HANGUL_BASE;
1081:
1082: c2 = (char) (c % JAMO_T_COUNT);
1083: c /= JAMO_T_COUNT;
1084: if (c2 > 0) {
1085: if (destCapacity >= 3) {
1086: dest[2] = (char) (JAMO_T_BASE + c2);
1087: }
1088: length = 3;
1089: } else {
1090: length = 2;
1091: }
1092:
1093: if (destCapacity >= 2) {
1094: dest[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1095: dest[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1096: }
1097: return length;
1098: } else {
1099: /* c decomposes, get everything from the variable-length extra
1100: * data
1101: */
1102: int p, limit;
1103: DecomposeArgs args = new DecomposeArgs();
1104: /* the index into extra data array*/
1105: p = decompose(norm32, qcMask, args);
1106: if (args.length <= destCapacity) {
1107: limit = p + args.length;
1108: do {
1109: dest[destStart++] = extraData[p++];
1110: } while (p < limit);
1111: }
1112: return args.length;
1113: }
1114: } else {
1115: return 0;
1116: }
1117: }
1118:
1119: public static int decompose(char[] src, int srcStart, int srcLimit,
1120: char[] dest, int destStart, int destLimit, boolean compat,
1121: int[] outTrailCC, UnicodeSet nx) {
1122:
1123: char[] buffer = new char[3];
1124: int prevSrc;
1125: long norm32;
1126: int ccOrQCMask, qcMask;
1127: int reorderStartIndex, length;
1128: char c, c2, minNoMaybe;
1129: int/*unsigned byte*/cc, prevCC, trailCC;
1130: char[] p;
1131: int pStart;
1132: int destIndex = destStart;
1133: int srcIndex = srcStart;
1134: if (!compat) {
1135: minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
1136: qcMask = QC_NFD;
1137: } else {
1138: minNoMaybe = (char) indexes[INDEX_MIN_NFKD_NO_MAYBE];
1139: qcMask = QC_NFKD;
1140: }
1141:
1142: /* initialize */
1143: ccOrQCMask = CC_MASK | qcMask;
1144: reorderStartIndex = 0;
1145: prevCC = 0;
1146: norm32 = 0;
1147: c = 0;
1148: pStart = 0;
1149:
1150: cc = trailCC = -1;//initialize to bogus value
1151:
1152: for (;;) {
1153: /* count code units below the minimum or with irrelevant data for
1154: * the quick check
1155: */
1156: prevSrc = srcIndex;
1157:
1158: while (srcIndex != srcLimit
1159: && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
1160: prevCC = 0;
1161: ++srcIndex;
1162: }
1163:
1164: /* copy these code units all at once */
1165: if (srcIndex != prevSrc) {
1166: length = (int) (srcIndex - prevSrc);
1167: if ((destIndex + length) <= destLimit) {
1168: System.arraycopy(src, prevSrc, dest, destIndex,
1169: length);
1170: }
1171:
1172: destIndex += length;
1173: reorderStartIndex = destIndex;
1174: }
1175:
1176: /* end of source reached? */
1177: if (srcIndex == srcLimit) {
1178: break;
1179: }
1180:
1181: /* c already contains *src and norm32 is set for it, increment src*/
1182: ++srcIndex;
1183:
1184: /* check one above-minimum, relevant code unit */
1185: /*
1186: * generally, set p and length to the decomposition string
1187: * in simple cases, p==NULL and (c, c2) will hold the length code
1188: * units to append in all cases, set cc to the lead and trailCC to
1189: * the trail combining class
1190: *
1191: * the following merge-sort of the current character into the
1192: * preceding, canonically ordered result text will use the
1193: * optimized insertOrdered()
1194: * if there is only one single code point to process;
1195: * this is indicated with p==NULL, and (c, c2) is the character to
1196: * insert
1197: * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1198: * for a supplementary character)
1199: * otherwise, p[length] is merged in with _mergeOrdered()
1200: */
1201: if (isNorm32HangulOrJamo(norm32)) {
1202: if (nx_contains(nx, c)) {
1203: c2 = 0;
1204: p = null;
1205: length = 1;
1206: } else {
1207: // Hangul syllable: decompose algorithmically
1208: p = buffer;
1209: pStart = 0;
1210: cc = trailCC = 0;
1211:
1212: c -= HANGUL_BASE;
1213:
1214: c2 = (char) (c % JAMO_T_COUNT);
1215: c /= JAMO_T_COUNT;
1216: if (c2 > 0) {
1217: buffer[2] = (char) (JAMO_T_BASE + c2);
1218: length = 3;
1219: } else {
1220: length = 2;
1221: }
1222:
1223: buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1224: buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1225: }
1226: } else {
1227: if (isNorm32Regular(norm32)) {
1228: c2 = 0;
1229: length = 1;
1230: } else {
1231: // c is a lead surrogate, get the real norm32
1232: if (srcIndex != srcLimit
1233: && UTF16
1234: .isTrailSurrogate(c2 = src[srcIndex])) {
1235: ++srcIndex;
1236: length = 2;
1237: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1238: } else {
1239: c2 = 0;
1240: length = 1;
1241: norm32 = 0;
1242: }
1243: }
1244:
1245: /* get the decomposition and the lead and trail cc's */
1246: if (nx_contains(nx, c, c2)) {
1247: /* excluded: norm32==0 */
1248: cc = trailCC = 0;
1249: p = null;
1250: } else if ((norm32 & qcMask) == 0) {
1251: /* c does not decompose */
1252: cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1253: p = null;
1254: pStart = -1;
1255: } else {
1256: DecomposeArgs arg = new DecomposeArgs();
1257: /* c decomposes, get everything from the variable-length
1258: * extra data
1259: */
1260: pStart = decompose(norm32, qcMask, arg);
1261: p = extraData;
1262: length = arg.length;
1263: cc = arg.cc;
1264: trailCC = arg.trailCC;
1265: if (length == 1) {
1266: /* fastpath a single code unit from decomposition */
1267: c = p[pStart];
1268: c2 = 0;
1269: p = null;
1270: pStart = -1;
1271: }
1272: }
1273: }
1274:
1275: /* append the decomposition to the destination buffer, assume
1276: * length>0
1277: */
1278: if ((destIndex + length) <= destLimit) {
1279: int reorderSplit = destIndex;
1280: if (p == null) {
1281: /* fastpath: single code point */
1282: if (cc != 0 && cc < prevCC) {
1283: /* (c, c2) is out of order with respect to the preceding
1284: * text
1285: */
1286: destIndex += length;
1287: trailCC = insertOrdered(dest,
1288: reorderStartIndex, reorderSplit,
1289: destIndex, c, c2, cc);
1290: } else {
1291: /* just append (c, c2) */
1292: dest[destIndex++] = c;
1293: if (c2 != 0) {
1294: dest[destIndex++] = c2;
1295: }
1296: }
1297: } else {
1298: /* general: multiple code points (ordered by themselves)
1299: * from decomposition
1300: */
1301: if (cc != 0 && cc < prevCC) {
1302: /* the decomposition is out of order with respect to the
1303: * preceding text
1304: */
1305: destIndex += length;
1306: trailCC = mergeOrdered(dest, reorderStartIndex,
1307: reorderSplit, p, pStart, pStart
1308: + length);
1309: } else {
1310: /* just append the decomposition */
1311: do {
1312: dest[destIndex++] = p[pStart++];
1313: } while (--length > 0);
1314: }
1315: }
1316: } else {
1317: /* buffer overflow */
1318: /* keep incrementing the destIndex for preflighting */
1319: destIndex += length;
1320: }
1321:
1322: prevCC = trailCC;
1323: if (prevCC == 0) {
1324: reorderStartIndex = destIndex;
1325: }
1326: }
1327:
1328: outTrailCC[0] = prevCC;
1329:
1330: return destIndex - destStart;
1331: }
1332:
1333: /* make NFC & NFKC ------------------------------------------------------ */
1334: private static final class NextCombiningArgs {
1335: char[] source;
1336: int start;
1337: //int limit;
1338: char c;
1339: char c2;
1340: int/*unsigned*/combiningIndex;
1341: char /*unsigned byte*/cc;
1342: }
1343:
1344: /* get the composition properties of the next character */
1345: private static int /*unsigned*/getNextCombining(
1346: NextCombiningArgs args, int limit, UnicodeSet nx) {
1347: long/*unsigned*/norm32;
1348: int combineFlags;
1349: /* get properties */
1350: args.c = args.source[args.start++];
1351: norm32 = getNorm32(args.c);
1352:
1353: /* preset output values for most characters */
1354: args.c2 = 0;
1355: args.combiningIndex = 0;
1356: args.cc = 0;
1357:
1358: if ((norm32 & (CC_MASK | COMBINES_ANY)) == 0) {
1359: return 0;
1360: } else {
1361: if (isNorm32Regular(norm32)) {
1362: /* set cc etc. below */
1363: } else if (isNorm32HangulOrJamo(norm32)) {
1364: /* a compatibility decomposition contained Jamos */
1365: args.combiningIndex = (int) ((UNSIGNED_INT_MASK) & (0xfff0 | (norm32 >> EXTRA_SHIFT)));
1366: return (int) (norm32 & COMBINES_ANY);
1367: } else {
1368: /* c is a lead surrogate, get the real norm32 */
1369: if (args.start != limit
1370: && UTF16
1371: .isTrailSurrogate(args.c2 = args.source[args.start])) {
1372: ++args.start;
1373: norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
1374: } else {
1375: args.c2 = 0;
1376: return 0;
1377: }
1378: }
1379:
1380: if (nx_contains(nx, args.c, args.c2)) {
1381: return 0; /* excluded: norm32==0 */
1382: }
1383:
1384: args.cc = (char) ((norm32 >> CC_SHIFT) & 0xff);
1385:
1386: combineFlags = (int) (norm32 & COMBINES_ANY);
1387: if (combineFlags != 0) {
1388: int index = getExtraDataIndex(norm32);
1389: args.combiningIndex = index > 0 ? extraData[(index - 1)]
1390: : 0;
1391: }
1392:
1393: return combineFlags;
1394: }
1395: }
1396:
1397: /*
1398: * given a composition-result starter (c, c2) - which means its cc==0,
1399: * it combines forward, it has extra data, its norm32!=0,
1400: * it is not a Hangul or Jamo,
1401: * get just its combineFwdIndex
1402: *
1403: * norm32(c) is special if and only if c2!=0
1404: */
1405: private static int/*unsigned*/getCombiningIndexFromStarter(char c,
1406: char c2) {
1407: long/*unsigned*/norm32;
1408:
1409: norm32 = getNorm32(c);
1410: if (c2 != 0) {
1411: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1412: }
1413: return extraData[(getExtraDataIndex(norm32) - 1)];
1414: }
1415:
1416: /*
1417: * Find the recomposition result for
1418: * a forward-combining character
1419: * (specified with a pointer to its part of the combiningTable[])
1420: * and a backward-combining character
1421: * (specified with its combineBackIndex).
1422: *
1423: * If these two characters combine, then set (value, value2)
1424: * with the code unit(s) of the composition character.
1425: *
1426: * Return value:
1427: * 0 do not combine
1428: * 1 combine
1429: * >1 combine, and the composition is a forward-combining starter
1430: *
1431: * See unormimp.h for a description of the composition table format.
1432: */
1433: private static int/*unsigned*/combine(char[] table,
1434: int tableStart, int/*unsinged*/combineBackIndex,
1435: int[] outValues) {
1436: int/*unsigned*/key;
1437: int value, value2;
1438:
1439: if (outValues.length < 2) {
1440: throw new IllegalArgumentException();
1441: }
1442:
1443: /* search in the starter's composition table */
1444: for (;;) {
1445: key = table[tableStart++];
1446: if (key >= combineBackIndex) {
1447: break;
1448: }
1449: tableStart += ((table[tableStart] & 0x8000) != 0) ? 2 : 1;
1450: }
1451:
1452: /* mask off bit 15, the last-entry-in-the-list flag */
1453: if ((key & 0x7fff) == combineBackIndex) {
1454: /* found! combine! */
1455: value = table[tableStart];
1456:
1457: /* is the composition a starter that combines forward? */
1458: key = (int) ((UNSIGNED_INT_MASK) & ((value & 0x2000) + 1));
1459:
1460: /* get the composition result code point from the variable-length
1461: * result value
1462: */
1463: if ((value & 0x8000) != 0) {
1464: if ((value & 0x4000) != 0) {
1465: /* surrogate pair composition result */
1466: value = (int) ((UNSIGNED_INT_MASK) & ((value & 0x3ff) | 0xd800));
1467: value2 = table[tableStart + 1];
1468: } else {
1469: /* BMP composition result U+2000..U+ffff */
1470: value = table[tableStart + 1];
1471: value2 = 0;
1472: }
1473: } else {
1474: /* BMP composition result U+0000..U+1fff */
1475: value &= 0x1fff;
1476: value2 = 0;
1477: }
1478: outValues[0] = value;
1479: outValues[1] = value2;
1480: return key;
1481: } else {
1482: /* not found */
1483: return 0;
1484: }
1485: }
1486:
1487: private static final class RecomposeArgs {
1488: char[] source;
1489: int start;
1490: int limit;
1491: }
1492:
1493: /*
1494: * recompose the characters in [p..limit[
1495: * (which is in NFD - decomposed and canonically ordered),
1496: * adjust limit, and return the trailing cc
1497: *
1498: * since for NFKC we may get Jamos in decompositions, we need to
1499: * recompose those too
1500: *
1501: * note that recomposition never lengthens the text:
1502: * any character consists of either one or two code units;
1503: * a composition may contain at most one more code unit than the original
1504: * starter, while the combining mark that is removed has at least one code
1505: * unit
1506: */
1507: private static char/*unsigned byte*/recompose(RecomposeArgs args,
1508: int options, UnicodeSet nx) {
1509: int remove, q, r;
1510: int /*unsigned*/combineFlags;
1511: int /*unsigned*/combineFwdIndex, combineBackIndex;
1512: int /*unsigned*/result, value = 0, value2 = 0;
1513: int /*unsigned byte*/prevCC;
1514: boolean starterIsSupplementary;
1515: int starter;
1516: int[] outValues = new int[2];
1517: starter = -1; /* no starter */
1518: combineFwdIndex = 0; /* will not be used until starter!=NULL */
1519: starterIsSupplementary = false; /* will not be used until starter!=NULL */
1520: prevCC = 0;
1521:
1522: NextCombiningArgs ncArg = new NextCombiningArgs();
1523: ncArg.source = args.source;
1524:
1525: ncArg.cc = 0;
1526: ncArg.c2 = 0;
1527:
1528: for (;;) {
1529: ncArg.start = args.start;
1530: combineFlags = getNextCombining(ncArg, args.limit, nx);
1531: combineBackIndex = ncArg.combiningIndex;
1532: args.start = ncArg.start;
1533:
1534: if (((combineFlags & COMBINES_BACK) != 0) && starter != -1) {
1535: if ((combineBackIndex & 0x8000) != 0) {
1536: /* c is a Jamo V/T, see if we can compose it with the
1537: * previous character
1538: */
1539: /* for the PRI #29 fix, check that there is no intervening combining mark */
1540: if ((options & BEFORE_PRI_29) != 0 || prevCC == 0) {
1541: remove = -1; /* NULL while no Hangul composition */
1542: combineFlags = 0;
1543: ncArg.c2 = args.source[starter];
1544: if (combineBackIndex == 0xfff2) {
1545: /* Jamo V, compose with previous Jamo L and following
1546: * Jamo T
1547: */
1548: ncArg.c2 = (char) (ncArg.c2 - JAMO_L_BASE);
1549: if (ncArg.c2 < JAMO_L_COUNT) {
1550: remove = args.start - 1;
1551: ncArg.c = (char) (HANGUL_BASE + (ncArg.c2
1552: * JAMO_V_COUNT + (ncArg.c - JAMO_V_BASE))
1553: * JAMO_T_COUNT);
1554: if (args.start != args.limit
1555: && (ncArg.c2 = (char) (args.source[args.start] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1556: ++args.start;
1557: ncArg.c += ncArg.c2;
1558: } else {
1559: /* the result is an LV syllable, which is a starter (unlike LVT) */
1560: combineFlags = COMBINES_FWD;
1561: }
1562: if (!nx_contains(nx, ncArg.c)) {
1563: args.source[starter] = ncArg.c;
1564: } else {
1565: /* excluded */
1566: if (!isHangulWithoutJamoT(ncArg.c)) {
1567: --args.start; /* undo the ++args.start from reading the Jamo T */
1568: }
1569: /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1570: remove = args.start;
1571: }
1572: }
1573:
1574: /*
1575: * Normally, the following can not occur:
1576: * Since the input is in NFD, there are no Hangul LV syllables that
1577: * a Jamo T could combine with.
1578: * All Jamo Ts are combined above when handling Jamo Vs.
1579: *
1580: * However, before the PRI #29 fix, this can occur due to
1581: * an intervening combining mark between the Hangul LV and the Jamo T.
1582: */
1583: } else {
1584: /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1585: if (isHangulWithoutJamoT(ncArg.c2)) {
1586: ncArg.c2 += ncArg.c - JAMO_T_BASE;
1587: if (!nx_contains(nx, ncArg.c2)) {
1588: remove = args.start - 1;
1589: args.source[starter] = ncArg.c2;
1590: }
1591: }
1592: }
1593:
1594: if (remove != -1) {
1595: /* remove the Jamo(s) */
1596: q = remove;
1597: r = args.start;
1598: while (r < args.limit) {
1599: args.source[q++] = args.source[r++];
1600: }
1601: args.start = remove;
1602: args.limit = q;
1603: }
1604:
1605: ncArg.c2 = 0; /* c2 held *starter temporarily */
1606:
1607: if (combineFlags != 0) {
1608: /*
1609: * not starter=NULL because the composition is a Hangul LV syllable
1610: * and might combine once more (but only before the PRI #29 fix)
1611: */
1612:
1613: /* done? */
1614: if (args.start == args.limit) {
1615: return (char) prevCC;
1616: }
1617:
1618: /* the composition is a Hangul LV syllable which is a starter that combines forward */
1619: combineFwdIndex = 0xfff0;
1620:
1621: /* we combined; continue with looking for compositions */
1622: continue;
1623: }
1624: }
1625:
1626: /*
1627: * now: cc==0 and the combining index does not include
1628: * "forward" -> the rest of the loop body will reset starter
1629: * to NULL; technically, a composed Hangul syllable is a
1630: * starter, but it does not combine forward now that we have
1631: * consumed all eligible Jamos; for Jamo V/T, combineFlags
1632: * does not contain _NORM_COMBINES_FWD
1633: */
1634:
1635: } else if (
1636: /* the starter is not a Hangul LV or Jamo V/T and */
1637: !((combineFwdIndex & 0x8000) != 0)
1638: &&
1639: /* the combining mark is not blocked and */
1640: ((options & BEFORE_PRI_29) != 0 ? (prevCC != ncArg.cc || prevCC == 0)
1641: : (prevCC < ncArg.cc || prevCC == 0))
1642: &&
1643: /* the starter and the combining mark (c, c2) do combine */
1644: 0 != (result = combine(combiningTable,
1645: combineFwdIndex, combineBackIndex,
1646: outValues)) &&
1647: /* the composition result is not excluded */
1648: !nx_contains(nx, (char) value, (char) value2)) {
1649: value = outValues[0];
1650: value2 = outValues[1];
1651: /* replace the starter with the composition, remove the
1652: * combining mark
1653: */
1654: remove = ncArg.c2 == 0 ? args.start - 1
1655: : args.start - 2; /* index to the combining mark */
1656:
1657: /* replace the starter with the composition */
1658: args.source[starter] = (char) value;
1659: if (starterIsSupplementary) {
1660: if (value2 != 0) {
1661: /* both are supplementary */
1662: args.source[starter + 1] = (char) value2;
1663: } else {
1664: /* the composition is shorter than the starter,
1665: * move the intermediate characters forward one */
1666: starterIsSupplementary = false;
1667: q = starter + 1;
1668: r = q + 1;
1669: while (r < remove) {
1670: args.source[q++] = args.source[r++];
1671: }
1672: --remove;
1673: }
1674: } else if (value2 != 0) {
1675: /* the composition is longer than the starter,
1676: * move the intermediate characters back one */
1677: starterIsSupplementary = true;
1678: /* temporarily increment for the loop boundary */
1679: ++starter;
1680: q = remove;
1681: r = ++remove;
1682: while (starter < q) {
1683: args.source[--r] = args.source[--q];
1684: }
1685: args.source[starter] = (char) value2;
1686: --starter; /* undo the temporary increment */
1687: /* } else { both are on the BMP, nothing more to do */
1688: }
1689:
1690: /* remove the combining mark by moving the following text
1691: * over it */
1692: if (remove < args.start) {
1693: q = remove;
1694: r = args.start;
1695: while (r < args.limit) {
1696: args.source[q++] = args.source[r++];
1697: }
1698: args.start = remove;
1699: args.limit = q;
1700: }
1701:
1702: /* keep prevCC because we removed the combining mark */
1703:
1704: /* done? */
1705: if (args.start == args.limit) {
1706: return (char) prevCC;
1707: }
1708:
1709: /* is the composition a starter that combines forward? */
1710: if (result > 1) {
1711: combineFwdIndex = getCombiningIndexFromStarter(
1712: (char) value, (char) value2);
1713: } else {
1714: starter = -1;
1715: }
1716:
1717: /* we combined; continue with looking for compositions */
1718: continue;
1719: }
1720: }
1721:
1722: /* no combination this time */
1723: prevCC = ncArg.cc;
1724: if (args.start == args.limit) {
1725: return (char) prevCC;
1726: }
1727:
1728: /* if (c, c2) did not combine, then check if it is a starter */
1729: if (ncArg.cc == 0) {
1730: /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1731: if ((combineFlags & COMBINES_FWD) != 0) {
1732: /* it may combine with something, prepare for it */
1733: if (ncArg.c2 == 0) {
1734: starterIsSupplementary = false;
1735: starter = args.start - 1;
1736: } else {
1737: starterIsSupplementary = false;
1738: starter = args.start - 2;
1739: }
1740: combineFwdIndex = combineBackIndex;
1741: } else {
1742: /* it will not combine with anything */
1743: starter = -1;
1744: }
1745: } else if ((options & OPTIONS_COMPOSE_CONTIGUOUS) != 0) {
1746: /* FCC: no discontiguous compositions; any intervening character blocks */
1747: starter = -1;
1748: }
1749: }
1750: }
1751:
1752: // find the last true starter between src[start]....src[current] going
1753: // backwards and return its index
1754: private static int findPreviousStarter(char[] src, int srcStart,
1755: int current, int/*unsigned*/ccOrQCMask,
1756: int/*unsigned*/decompQCMask, char minNoMaybe) {
1757: long norm32;
1758: PrevArgs args = new PrevArgs();
1759: args.src = src;
1760: args.start = srcStart;
1761: args.current = current;
1762:
1763: while (args.start < args.current) {
1764: norm32 = getPrevNorm32(args, minNoMaybe, ccOrQCMask
1765: | decompQCMask);
1766: if (isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1767: break;
1768: }
1769: }
1770: return args.current;
1771: }
1772:
1773: /* find the first true starter in [src..limit[ and return the
1774: * pointer to it
1775: */
1776: private static int/*index*/findNextStarter(char[] src, int start,
1777: int limit, int/*unsigned*/qcMask,
1778: int/*unsigned*/decompQCMask, char minNoMaybe) {
1779: int p;
1780: long/*unsigned*/norm32;
1781: int ccOrQCMask;
1782: char c, c2;
1783:
1784: ccOrQCMask = CC_MASK | qcMask;
1785:
1786: DecomposeArgs decompArgs = new DecomposeArgs();
1787:
1788: for (;;) {
1789: if (start == limit) {
1790: break; /* end of string */
1791: }
1792: c = src[start];
1793: if (c < minNoMaybe) {
1794: break; /* catches NUL terminater, too */
1795: }
1796:
1797: norm32 = getNorm32(c);
1798: if ((norm32 & ccOrQCMask) == 0) {
1799: break; /* true starter */
1800: }
1801:
1802: if (isNorm32LeadSurrogate(norm32)) {
1803: /* c is a lead surrogate, get the real norm32 */
1804: if ((start + 1) == limit
1805: || !UTF16
1806: .isTrailSurrogate(c2 = (src[start + 1]))) {
1807: /* unmatched first surrogate: counts as a true starter */
1808: break;
1809: }
1810: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1811:
1812: if ((norm32 & ccOrQCMask) == 0) {
1813: break; /* true starter */
1814: }
1815: } else {
1816: c2 = 0;
1817: }
1818:
1819: /* (c, c2) is not a true starter but its decomposition may be */
1820: if ((norm32 & decompQCMask) != 0) {
1821: /* (c, c2) decomposes, get everything from the variable-length
1822: * extra data */
1823: p = decompose(norm32, decompQCMask, decompArgs);
1824:
1825: /* get the first character's norm32 to check if it is a true
1826: * starter */
1827: if (decompArgs.cc == 0
1828: && (getNorm32(extraData, p, qcMask) & qcMask) == 0) {
1829: break; /* true starter */
1830: }
1831: }
1832:
1833: start += c2 == 0 ? 1 : 2; /* not a true starter, continue */
1834: }
1835:
1836: return start;
1837: }
1838:
1839: private static final class ComposePartArgs {
1840: int prevCC;
1841: int length; /* length of decomposed part */
1842: }
1843:
1844: /* decompose and recompose [prevStarter..src[ */
1845: private static char[] composePart(ComposePartArgs args,
1846: int prevStarter, char[] src, int start, int limit,
1847: int options, UnicodeSet nx) {
1848: int recomposeLimit;
1849: boolean compat = ((options & OPTIONS_COMPAT) != 0);
1850:
1851: /* decompose [prevStarter..src[ */
1852: int[] outTrailCC = new int[1];
1853: char[] buffer = new char[(limit - prevStarter)
1854: * MAX_BUFFER_SIZE];
1855:
1856: for (;;) {
1857: args.length = decompose(src, prevStarter, (start), buffer,
1858: 0, buffer.length, compat, outTrailCC, nx);
1859: if (args.length <= buffer.length) {
1860: break;
1861: } else {
1862: buffer = new char[args.length];
1863: }
1864: }
1865:
1866: /* recompose the decomposition */
1867: recomposeLimit = args.length;
1868:
1869: if (args.length >= 2) {
1870: RecomposeArgs rcArgs = new RecomposeArgs();
1871: rcArgs.source = buffer;
1872: rcArgs.start = 0;
1873: rcArgs.limit = recomposeLimit;
1874: args.prevCC = recompose(rcArgs, options, nx);
1875: recomposeLimit = rcArgs.limit;
1876: }
1877:
1878: /* return with a pointer to the recomposition and its length */
1879: args.length = recomposeLimit;
1880: return buffer;
1881: }
1882:
1883: private static boolean composeHangul(char prev, char c,
1884: long/*unsigned*/norm32, char[] src, int[] srcIndex,
1885: int limit, boolean compat, char[] dest, int destIndex,
1886: UnicodeSet nx) {
1887: int start = srcIndex[0];
1888: if (isJamoVTNorm32JamoV(norm32)) {
1889: /* c is a Jamo V, compose with previous Jamo L and
1890: * following Jamo T */
1891: prev = (char) (prev - JAMO_L_BASE);
1892: if (prev < JAMO_L_COUNT) {
1893: c = (char) (HANGUL_BASE + (prev * JAMO_V_COUNT + (c - JAMO_V_BASE))
1894: * JAMO_T_COUNT);
1895:
1896: /* check if the next character is a Jamo T (normal or
1897: * compatibility) */
1898: if (start != limit) {
1899: char next, t;
1900:
1901: next = src[start];
1902: if ((t = (char) (next - JAMO_T_BASE)) < JAMO_T_COUNT) {
1903: /* normal Jamo T */
1904: ++start;
1905: c += t;
1906: } else if (compat) {
1907: /* if NFKC, then check for compatibility Jamo T
1908: * (BMP only) */
1909: norm32 = getNorm32(next);
1910: if (isNorm32Regular(norm32)
1911: && ((norm32 & QC_NFKD) != 0)) {
1912: int p /*index into extra data array*/;
1913: DecomposeArgs dcArgs = new DecomposeArgs();
1914: p = decompose(norm32, QC_NFKD, dcArgs);
1915: if (dcArgs.length == 1
1916: && (t = (char) (extraData[p] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1917: /* compatibility Jamo T */
1918: ++start;
1919: c += t;
1920: }
1921: }
1922: }
1923: }
1924: if (nx_contains(nx, c)) {
1925: if (!isHangulWithoutJamoT(c)) {
1926: --start; /* undo ++start from reading the Jamo T */
1927: }
1928: return false;
1929: }
1930: dest[destIndex] = c;
1931: srcIndex[0] = start;
1932: return true;
1933: }
1934: } else if (isHangulWithoutJamoT(prev)) {
1935: /* c is a Jamo T, compose with previous Hangul LV that does not
1936: * contain a Jamo T */
1937: c = (char) (prev + (c - JAMO_T_BASE));
1938: if (nx_contains(nx, c)) {
1939: return false;
1940: }
1941: dest[destIndex] = c;
1942: srcIndex[0] = start;
1943: return true;
1944: }
1945: return false;
1946: }
1947:
1948: /*
1949: public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1950: return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1951: }
1952: */
1953:
1954: public static int compose(char[] src, int srcStart, int srcLimit,
1955: char[] dest, int destStart, int destLimit, int options,
1956: UnicodeSet nx) {
1957:
1958: int prevSrc, prevStarter;
1959: long/*unsigned*/norm32;
1960: int ccOrQCMask, qcMask;
1961: int reorderStartIndex, length;
1962: char c, c2, minNoMaybe;
1963: int/*unsigned byte*/cc, prevCC;
1964: int[] ioIndex = new int[1];
1965: int destIndex = destStart;
1966: int srcIndex = srcStart;
1967:
1968: if ((options & OPTIONS_COMPAT) != 0) {
1969: minNoMaybe = (char) indexes[INDEX_MIN_NFKC_NO_MAYBE];
1970: qcMask = QC_NFKC;
1971: } else {
1972: minNoMaybe = (char) indexes[INDEX_MIN_NFC_NO_MAYBE];
1973: qcMask = QC_NFC;
1974: }
1975:
1976: /*
1977: * prevStarter points to the last character before the current one
1978: * that is a "true" starter with cc==0 and quick check "yes".
1979: *
1980: * prevStarter will be used instead of looking for a true starter
1981: * while incrementally decomposing [prevStarter..prevSrc[
1982: * in _composePart(). Having a good prevStarter allows to just decompose
1983: * the entire [prevStarter..prevSrc[.
1984: *
1985: * When _composePart() backs out from prevSrc back to prevStarter,
1986: * then it also backs out destIndex by the same amount.
1987: * Therefore, at all times, the (prevSrc-prevStarter) source units
1988: * must correspond 1:1 to destination units counted with destIndex,
1989: * except for reordering.
1990: * This is true for the qc "yes" characters copied in the fast loop,
1991: * and for pure reordering.
1992: * prevStarter must be set forward to src when this is not true:
1993: * In _composePart() and after composing a Hangul syllable.
1994: *
1995: * This mechanism relies on the assumption that the decomposition of a
1996: * true starter also begins with a true starter. gennorm/store.c checks
1997: * for this.
1998: */
1999: prevStarter = srcIndex;
2000:
2001: ccOrQCMask = CC_MASK | qcMask;
2002: /*destIndex=*/reorderStartIndex = 0;/* ####TODO#### check this **/
2003: prevCC = 0;
2004:
2005: /* avoid compiler warnings */
2006: norm32 = 0;
2007: c = 0;
2008:
2009: for (;;) {
2010: /* count code units below the minimum or with irrelevant data for
2011: * the quick check */
2012: prevSrc = srcIndex;
2013:
2014: while (srcIndex != srcLimit
2015: && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
2016: prevCC = 0;
2017: ++srcIndex;
2018: }
2019:
2020: /* copy these code units all at once */
2021: if (srcIndex != prevSrc) {
2022: length = (int) (srcIndex - prevSrc);
2023: if ((destIndex + length) <= destLimit) {
2024: System.arraycopy(src, prevSrc, dest, destIndex,
2025: length);
2026: }
2027: destIndex += length;
2028: reorderStartIndex = destIndex;
2029:
2030: /* set prevStarter to the last character in the quick check
2031: * loop */
2032: prevStarter = srcIndex - 1;
2033: if (UTF16.isTrailSurrogate(src[prevStarter])
2034: && prevSrc < prevStarter
2035: && UTF16
2036: .isLeadSurrogate(src[(prevStarter - 1)])) {
2037: --prevStarter;
2038: }
2039:
2040: prevSrc = srcIndex;
2041: }
2042:
2043: /* end of source reached? */
2044: if (srcIndex == srcLimit) {
2045: break;
2046: }
2047:
2048: /* c already contains *src and norm32 is set for it, increment src*/
2049: ++srcIndex;
2050:
2051: /*
2052: * source buffer pointers:
2053: *
2054: * all done quick check current char not yet
2055: * "yes" but (c, c2) processed
2056: * may combine
2057: * forward
2058: * [-------------[-------------[-------------[-------------[
2059: * | | | | |
2060: * start prevStarter prevSrc src limit
2061: *
2062: *
2063: * destination buffer pointers and indexes:
2064: *
2065: * all done might take not filled yet
2066: * characters for
2067: * reordering
2068: * [-------------[-------------[-------------[
2069: * | | | |
2070: * dest reorderStartIndex destIndex destCapacity
2071: */
2072:
2073: /* check one above-minimum, relevant code unit */
2074: /*
2075: * norm32 is for c=*(src-1), and the quick check flag is "no" or
2076: * "maybe", and/or cc!=0
2077: * check for Jamo V/T, then for surrogates and regular characters
2078: * c is not a Hangul syllable or Jamo L because
2079: * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
2080: */
2081: if (isNorm32HangulOrJamo(norm32)) {
2082: /*
2083: * c is a Jamo V/T:
2084: * try to compose with the previous character, Jamo V also with
2085: * a following Jamo T, and set values here right now in case we
2086: * just continue with the main loop
2087: */
2088: prevCC = cc = 0;
2089: reorderStartIndex = destIndex;
2090: ioIndex[0] = srcIndex;
2091: if (destIndex > 0
2092: && composeHangul(src[(prevSrc - 1)], c, norm32,
2093: src, ioIndex, srcLimit,
2094: (options & OPTIONS_COMPAT) != 0, dest,
2095: destIndex <= destLimit ? destIndex - 1
2096: : 0, nx)) {
2097: srcIndex = ioIndex[0];
2098: prevStarter = srcIndex;
2099: continue;
2100: }
2101:
2102: srcIndex = ioIndex[0];
2103:
2104: /* the Jamo V/T did not compose into a Hangul syllable, just
2105: * append to dest */
2106: c2 = 0;
2107: length = 1;
2108: prevStarter = prevSrc;
2109: } else {
2110: if (isNorm32Regular(norm32)) {
2111: c2 = 0;
2112: length = 1;
2113: } else {
2114: /* c is a lead surrogate, get the real norm32 */
2115: if (srcIndex != srcLimit
2116: && UTF16
2117: .isTrailSurrogate(c2 = src[srcIndex])) {
2118: ++srcIndex;
2119: length = 2;
2120: norm32 = getNorm32FromSurrogatePair(norm32, c2);
2121: } else {
2122: /* c is an unpaired lead surrogate, nothing to do */
2123: c2 = 0;
2124: length = 1;
2125: norm32 = 0;
2126: }
2127: }
2128: ComposePartArgs args = new ComposePartArgs();
2129:
2130: /* we are looking at the character (c, c2) at [prevSrc..src[ */
2131: if (nx_contains(nx, c, c2)) {
2132: /* excluded: norm32==0 */
2133: cc = 0;
2134: } else if ((norm32 & qcMask) == 0) {
2135: cc = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2136: } else {
2137: char[] p;
2138:
2139: /*
2140: * find appropriate boundaries around this character,
2141: * decompose the source text from between the boundaries,
2142: * and recompose it
2143: *
2144: * this puts the intermediate text into the side buffer because
2145: * it might be longer than the recomposition end result,
2146: * or the destination buffer may be too short or missing
2147: *
2148: * note that destIndex may be adjusted backwards to account
2149: * for source text that passed the quick check but needed to
2150: * take part in the recomposition
2151: */
2152: int decompQCMask = (qcMask << 2) & 0xf; /* decomposition quick check mask */
2153: /*
2154: * find the last true starter in [prevStarter..src[
2155: * it is either the decomposition of the current character (at prevSrc),
2156: * or prevStarter
2157: */
2158: if (isTrueStarter(norm32, CC_MASK | qcMask,
2159: decompQCMask)) {
2160: prevStarter = prevSrc;
2161: } else {
2162: /* adjust destIndex: back out what had been copied with qc "yes" */
2163: destIndex -= prevSrc - prevStarter;
2164: }
2165:
2166: /* find the next true starter in [src..limit[ */
2167: srcIndex = findNextStarter(src, srcIndex, srcLimit,
2168: qcMask, decompQCMask, minNoMaybe);
2169: //args.prevStarter = prevStarter;
2170: args.prevCC = prevCC;
2171: //args.destIndex = destIndex;
2172: args.length = length;
2173: p = composePart(args, prevStarter, src, srcIndex,
2174: srcLimit, options, nx);
2175:
2176: if (p == null) {
2177: /* an error occurred (out of memory) */
2178: break;
2179: }
2180:
2181: prevCC = args.prevCC;
2182: length = args.length;
2183:
2184: /* append the recomposed buffer contents to the destination
2185: * buffer */
2186: if ((destIndex + args.length) <= destLimit) {
2187: int i = 0;
2188: while (i < args.length) {
2189: dest[destIndex++] = p[i++];
2190: --length;
2191: }
2192: } else {
2193: /* buffer overflow */
2194: /* keep incrementing the destIndex for preflighting */
2195: destIndex += length;
2196: }
2197:
2198: prevStarter = srcIndex;
2199: continue;
2200: }
2201: }
2202:
2203: /* append the single code point (c, c2) to the destination buffer */
2204: if ((destIndex + length) <= destLimit) {
2205: if (cc != 0 && cc < prevCC) {
2206: /* (c, c2) is out of order with respect to the preceding
2207: * text */
2208: int reorderSplit = destIndex;
2209: destIndex += length;
2210: prevCC = insertOrdered(dest, reorderStartIndex,
2211: reorderSplit, destIndex, c, c2, cc);
2212: } else {
2213: /* just append (c, c2) */
2214: dest[destIndex++] = c;
2215: if (c2 != 0) {
2216: dest[destIndex++] = c2;
2217: }
2218: prevCC = cc;
2219: }
2220: } else {
2221: /* buffer overflow */
2222: /* keep incrementing the destIndex for preflighting */
2223: destIndex += length;
2224: prevCC = cc;
2225: }
2226: }
2227:
2228: return destIndex - destStart;
2229: }
2230:
2231: /* make FCD --------------------------------------------------------------*/
2232:
2233: private static int/*index*/findSafeFCD(char[] src, int start,
2234: int limit, char fcd16) {
2235: char c, c2;
2236:
2237: /*
2238: * find the first position in [src..limit[ after some cc==0 according
2239: * to FCD data
2240: *
2241: * at the beginning of the loop, we have fcd16 from before src
2242: *
2243: * stop at positions:
2244: * - after trail cc==0
2245: * - at the end of the source
2246: * - before lead cc==0
2247: */
2248: for (;;) {
2249: /* stop if trail cc==0 for the previous character */
2250: if ((fcd16 & 0xff) == 0) {
2251: break;
2252: }
2253:
2254: /* get c=*src - stop at end of string */
2255: if (start == limit) {
2256: break;
2257: }
2258: c = src[start];
2259:
2260: /* stop if lead cc==0 for this character */
2261: if (c < MIN_WITH_LEAD_CC || (fcd16 = getFCD16(c)) == 0) {
2262: break; /* catches terminating NUL, too */
2263: }
2264:
2265: if (!UTF16.isLeadSurrogate(c)) {
2266: if (fcd16 <= 0xff) {
2267: break;
2268: }
2269: ++start;
2270: } else if (start + 1 != limit
2271: && (UTF16.isTrailSurrogate(c2 = src[start + 1]))) {
2272: /* c is a lead surrogate, get the real fcd16 */
2273: fcd16 = getFCD16FromSurrogatePair(fcd16, c2);
2274: if (fcd16 <= 0xff) {
2275: break;
2276: }
2277: start += 2;
2278: } else {
2279: /* c is an unpaired first surrogate, lead cc==0 */
2280: break;
2281: }
2282: }
2283:
2284: return start;
2285: }
2286:
2287: private static int/*unsigned byte*/decomposeFCD(char[] src,
2288: int start, int decompLimit, char[] dest,
2289: int[] destIndexArr, UnicodeSet nx) {
2290: char[] p = null;
2291: int pStart = -1;
2292:
2293: long /*unsigned int*/norm32;
2294: int reorderStartIndex;
2295: char c, c2;
2296: int/*unsigned byte*/prevCC;
2297: DecomposeArgs args = new DecomposeArgs();
2298: int destIndex = destIndexArr[0];
2299: /*
2300: * canonically decompose [src..decompLimit[
2301: *
2302: * all characters in this range have some non-zero cc,
2303: * directly or in decomposition,
2304: * so that we do not need to check in the following for quick-check
2305: * limits etc.
2306: *
2307: * there _are_ _no_ Hangul syllables or Jamos in here because they are
2308: * FCD-safe (cc==0)!
2309: *
2310: * we also do not need to check for c==0 because we have an established
2311: * decompLimit
2312: */
2313: reorderStartIndex = destIndex;
2314: prevCC = 0;
2315:
2316: while (start < decompLimit) {
2317: c = src[start++];
2318: norm32 = getNorm32(c);
2319: if (isNorm32Regular(norm32)) {
2320: c2 = 0;
2321: args.length = 1;
2322: } else {
2323: /*
2324: * reminder: this function is called with [src..decompLimit[
2325: * not containing any Hangul/Jamo characters,
2326: * therefore the only specials are lead surrogates
2327: */
2328: /* c is a lead surrogate, get the real norm32 */
2329: if (start != decompLimit
2330: && UTF16.isTrailSurrogate(c2 = src[start])) {
2331: ++start;
2332: args.length = 2;
2333: norm32 = getNorm32FromSurrogatePair(norm32, c2);
2334: } else {
2335: c2 = 0;
2336: args.length = 1;
2337: norm32 = 0;
2338: }
2339: }
2340:
2341: /* get the decomposition and the lead and trail cc's */
2342: if (nx_contains(nx, c, c2)) {
2343: /* excluded: norm32==0 */
2344: args.cc = args.trailCC = 0;
2345: p = null;
2346: } else if ((norm32 & QC_NFD) == 0) {
2347: /* c does not decompose */
2348: args.cc = args.trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2349: p = null;
2350: } else {
2351: /* c decomposes, get everything from the variable-length extra
2352: * data */
2353: pStart = decompose(norm32, args);
2354: p = extraData;
2355: if (args.length == 1) {
2356: /* fastpath a single code unit from decomposition */
2357: c = p[pStart];
2358: c2 = 0;
2359: p = null;
2360: }
2361: }
2362:
2363: /* append the decomposition to the destination buffer, assume
2364: * length>0 */
2365: if ((destIndex + args.length) <= dest.length) {
2366: int reorderSplit = destIndex;
2367: if (p == null) {
2368: /* fastpath: single code point */
2369: if (args.cc != 0 && args.cc < prevCC) {
2370: /* (c, c2) is out of order with respect to the preceding
2371: * text */
2372: destIndex += args.length;
2373: args.trailCC = insertOrdered(dest,
2374: reorderStartIndex, reorderSplit,
2375: destIndex, c, c2, args.cc);
2376: } else {
2377: /* just append (c, c2) */
2378: dest[destIndex++] = c;
2379: if (c2 != 0) {
2380: dest[destIndex++] = c2;
2381: }
2382: }
2383: } else {
2384: /* general: multiple code points (ordered by themselves)
2385: * from decomposition */
2386: if (args.cc != 0 && args.cc < prevCC) {
2387: /* the decomposition is out of order with respect to
2388: * the preceding text */
2389: destIndex += args.length;
2390: args.trailCC = mergeOrdered(dest,
2391: reorderStartIndex, reorderSplit, p,
2392: pStart, pStart + args.length);
2393: } else {
2394: /* just append the decomposition */
2395: do {
2396: dest[destIndex++] = p[pStart++];
2397: } while (--args.length > 0);
2398: }
2399: }
2400: } else {
2401: /* buffer overflow */
2402: /* keep incrementing the destIndex for preflighting */
2403: destIndex += args.length;
2404: }
2405:
2406: prevCC = args.trailCC;
2407: if (prevCC == 0) {
2408: reorderStartIndex = destIndex;
2409: }
2410: }
2411: destIndexArr[0] = destIndex;
2412: return prevCC;
2413: }
2414:
2415: public static int makeFCD(char[] src, int srcStart, int srcLimit,
2416: char[] dest, int destStart, int destLimit, UnicodeSet nx) {
2417:
2418: int prevSrc, decompStart;
2419: int destIndex, length;
2420: char c, c2;
2421: int /* unsigned int*/fcd16;
2422: int prevCC, cc;
2423:
2424: /* initialize */
2425: decompStart = srcStart;
2426: destIndex = destStart;
2427: prevCC = 0;
2428: c = 0;
2429: fcd16 = 0;
2430: int[] destIndexArr = new int[1];
2431: destIndexArr[0] = destIndex;
2432:
2433: for (;;) {
2434: /* skip a run of code units below the minimum or with irrelevant
2435: * data for the FCD check */
2436: prevSrc = srcStart;
2437:
2438: for (;;) {
2439: if (srcStart == srcLimit) {
2440: break;
2441: } else if ((c = src[srcStart]) < MIN_WITH_LEAD_CC) {
2442: prevCC = (int) -c;
2443: } else if ((fcd16 = getFCD16(c)) == 0) {
2444: prevCC = 0;
2445: } else {
2446: break;
2447: }
2448: ++srcStart;
2449: }
2450:
2451: /*
2452: * prevCC has values from the following ranges:
2453: * 0..0xff - the previous trail combining class
2454: * <0 - the negative value of the previous code unit;
2455: * that code unit was <_NORM_MIN_WITH_LEAD_CC and its
2456: * getFCD16()
2457: * was deferred so that average text is checked faster
2458: */
2459:
2460: /* copy these code units all at once */
2461: if (srcStart != prevSrc) {
2462: length = (int) (srcStart - prevSrc);
2463: if ((destIndex + length) <= destLimit) {
2464: System.arraycopy(src, prevSrc, dest, destIndex,
2465: length);
2466: }
2467: destIndex += length;
2468: prevSrc = srcStart;
2469:
2470: /* prevCC<0 is only possible from the above loop, i.e., only if
2471: * prevSrc<src */
2472: if (prevCC < 0) {
2473: /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we
2474: * need to get its trail cc */
2475: if (!nx_contains(nx, (int) -prevCC)) {
2476: prevCC = (int) (getFCD16((int) -prevCC) & 0xff);
2477: } else {
2478: prevCC = 0; /* excluded: fcd16==0 */
2479: }
2480: /*
2481: * set a pointer to this below-U+0300 character;
2482: * if prevCC==0 then it will moved to after this character
2483: * below
2484: */
2485: decompStart = prevSrc - 1;
2486: }
2487: }
2488: /*
2489: * now:
2490: * prevSrc==src - used later to adjust destIndex before
2491: * decomposition
2492: * prevCC>=0
2493: */
2494:
2495: /* end of source reached? */
2496: if (srcStart == srcLimit) {
2497: break;
2498: }
2499:
2500: /* set a pointer to after the last source position where prevCC==0*/
2501: if (prevCC == 0) {
2502: decompStart = prevSrc;
2503: }
2504:
2505: /* c already contains *src and fcd16 is set for it, increment src */
2506: ++srcStart;
2507:
2508: /* check one above-minimum, relevant code unit */
2509: if (UTF16.isLeadSurrogate(c)) {
2510: /* c is a lead surrogate, get the real fcd16 */
2511: if (srcStart != srcLimit
2512: && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
2513: ++srcStart;
2514: fcd16 = getFCD16FromSurrogatePair((char) fcd16, c2);
2515: } else {
2516: c2 = 0;
2517: fcd16 = 0;
2518: }
2519: } else {
2520: c2 = 0;
2521: }
2522:
2523: /* we are looking at the character (c, c2) at [prevSrc..src[ */
2524: if (nx_contains(nx, c, c2)) {
2525: fcd16 = 0; /* excluded: fcd16==0 */
2526: }
2527: /* check the combining order, get the lead cc */
2528: cc = (int) (fcd16 >> 8);
2529: if (cc == 0 || cc >= prevCC) {
2530: /* the order is ok */
2531: if (cc == 0) {
2532: decompStart = prevSrc;
2533: }
2534: prevCC = (int) (fcd16 & 0xff);
2535:
2536: /* just append (c, c2) */
2537: length = c2 == 0 ? 1 : 2;
2538: if ((destIndex + length) <= destLimit) {
2539: dest[destIndex++] = c;
2540: if (c2 != 0) {
2541: dest[destIndex++] = c2;
2542: }
2543: } else {
2544: destIndex += length;
2545: }
2546: } else {
2547: /*
2548: * back out the part of the source that we copied already but
2549: * is now going to be decomposed;
2550: * prevSrc is set to after what was copied
2551: */
2552: destIndex -= (int) (prevSrc - decompStart);
2553:
2554: /*
2555: * find the part of the source that needs to be decomposed;
2556: * to be safe and simple, decompose to before the next character
2557: * with lead cc==0
2558: */
2559: srcStart = findSafeFCD(src, srcStart, srcLimit,
2560: (char) fcd16);
2561:
2562: /*
2563: * the source text does not fulfill the conditions for FCD;
2564: * decompose and reorder a limited piece of the text
2565: */
2566: destIndexArr[0] = destIndex;
2567: prevCC = decomposeFCD(src, decompStart, srcStart, dest,
2568: destIndexArr, nx);
2569: decompStart = srcStart;
2570: destIndex = destIndexArr[0];
2571: }
2572: }
2573:
2574: return destIndex - destStart;
2575:
2576: }
2577:
2578: public static int getCombiningClass(int c) {
2579: long norm32;
2580: norm32 = getNorm32(c);
2581: return (char) ((norm32 >> CC_SHIFT) & 0xFF);
2582: }
2583:
2584: public static boolean isFullCompositionExclusion(int c) {
2585: if (isFormatVersion_2_1) {
2586: int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2587: return (boolean) ((aux & AUX_COMP_EX_MASK) != 0);
2588: } else {
2589: return false;
2590: }
2591: }
2592:
2593: public static boolean isCanonSafeStart(int c) {
2594: if (isFormatVersion_2_1) {
2595: int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2596: return (boolean) ((aux & AUX_UNSAFE_MASK) == 0);
2597: } else {
2598: return false;
2599: }
2600: }
2601:
2602: public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
2603:
2604: if (fillSet != null && canonStartSets != null) {
2605: /*
2606: * binary search for c
2607: *
2608: * There are two search tables,
2609: * one for BMP code points and one for supplementary ones.
2610: * See unormimp.h for details.
2611: */
2612: char[] table;
2613: int i = 0, start, limit;
2614:
2615: int[] indexes = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
2616: char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
2617:
2618: if (c <= 0xffff) {
2619: table = (char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
2620: start = 0;
2621: limit = table.length;
2622:
2623: /* each entry is a pair { c, result } */
2624: while (start < limit - 2) {
2625: i = (char) (((start + limit) / 4) * 2);
2626: if (c < table[i]) {
2627: limit = i;
2628: } else {
2629: start = i;
2630: }
2631: }
2632: //System.out.println(i);
2633: /* found? */
2634: if (c == table[start]) {
2635: i = table[start + 1];
2636: if ((i & CANON_SET_BMP_MASK) == CANON_SET_BMP_IS_INDEX) {
2637: /* result 01xxxxxx xxxxxx contains index x to a
2638: * USerializedSet */
2639: i &= (CANON_SET_MAX_CANON_SETS - 1);
2640: return fillSet.getSet(startSets,
2641: (i - indexes.length));
2642: } else {
2643: /* other result values are BMP code points for
2644: * single-code point sets */
2645: fillSet.setToOne(i);
2646: return true;
2647: }
2648: }
2649: } else {
2650: char high, low, h, j = 0;
2651:
2652: table = (char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
2653: start = 0;
2654: limit = table.length;
2655:
2656: high = (char) (c >> 16);
2657: low = (char) c;
2658:
2659: /* each entry is a triplet { high(c), low(c), result } */
2660: while (start < limit - 3) {
2661: /* (start+limit)/2 and address triplets */
2662: i = (char) (((start + limit) / 6) * 3);
2663: j = (char) (table[i] & 0x1f); /* high word */
2664: int tableVal = table[i + 1];
2665: int lowInt = low;
2666: if (high < j
2667: || ((tableVal > lowInt) && (high == j))) {
2668: limit = i;
2669: } else {
2670: start = i;
2671: }
2672:
2673: //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
2674:
2675: // KLUDGE: IBM JIT in 1.4.0 is sooo broken
2676: // The below lines make TestExhaustive pass
2677: if (ICUDebug.enabled()) {
2678: System.err.println("\t\t j = "
2679: + Utility.hex(j, 4) + "\t i = "
2680: + Utility.hex(i, 4) + "\t high = "
2681: + Utility.hex(high) + "\t low = "
2682: + Utility.hex(lowInt, 4)
2683: + "\t table[i+1]: "
2684: + Utility.hex(tableVal, 4));
2685: }
2686:
2687: }
2688:
2689: /* found? */
2690: h = table[start];
2691:
2692: //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
2693: int tableVal1 = table[start + 1];
2694: int lowInt = low;
2695:
2696: if (high == (h & 0x1f) && lowInt == tableVal1) {
2697: int tableVal2 = table[start + 2];
2698: i = tableVal2;
2699: if ((h & 0x8000) == 0) {
2700: /* the result is an index to a USerializedSet */
2701: return fillSet.getSet(startSets,
2702: (i - indexes.length));
2703: } else {
2704: /*
2705: * single-code point set {x} in
2706: * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
2707: */
2708: //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
2709: int temp = ((int) h & 0x1f00) << 8;
2710: i |= temp; /* add high bits from high(c) */
2711: fillSet.setToOne((int) i);
2712: return true;
2713: }
2714: }
2715: }
2716: }
2717:
2718: return false; /* not found */
2719: }
2720:
2721: public static int getFC_NFKC_Closure(int c, char[] dest) {
2722:
2723: int destCapacity;
2724:
2725: if (dest == null) {
2726: destCapacity = 0;
2727: } else {
2728: destCapacity = dest.length;
2729: }
2730:
2731: int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2732:
2733: aux &= AUX_FNC_MASK;
2734: if (aux != 0) {
2735: int s;
2736: int index = aux;
2737: int length;
2738:
2739: s = extraData[index];
2740: if (s < 0xff00) {
2741: /* s points to the single-unit string */
2742: length = 1;
2743: } else {
2744: length = s & 0xff;
2745: ++index;
2746: }
2747: if (0 < length && length <= destCapacity) {
2748: System.arraycopy(extraData, index, dest, 0, length);
2749: }
2750: return length;
2751: } else {
2752: return 0;
2753: }
2754: }
2755:
2756: /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2757: public static boolean isNFSkippable(int c, Normalizer.Mode mode,
2758: long mask) {
2759: long /*unsigned int*/norm32;
2760: mask = mask & UNSIGNED_INT_MASK;
2761: char aux;
2762:
2763: /* check conditions (a)..(e), see unormimp.h */
2764: norm32 = getNorm32(c);
2765:
2766: if ((norm32 & mask) != 0) {
2767: return false; /* fails (a)..(e), not skippable */
2768: }
2769:
2770: if (mode == Normalizer.NFD || mode == Normalizer.NFKD
2771: || mode == Normalizer.NONE) {
2772: return true; /* NF*D, passed (a)..(c), is skippable */
2773: }
2774: /* check conditions (a)..(e), see unormimp.h */
2775:
2776: /* NF*C/FCC, passed (a)..(e) */
2777: if ((norm32 & QC_NFD) == 0) {
2778: return true; /* no canonical decomposition, is skippable */
2779: }
2780:
2781: /* check Hangul syllables algorithmically */
2782: if (isNorm32HangulOrJamo(norm32)) {
2783: /* Jamo passed (a)..(e) above, must be Hangul */
2784: return !isHangulWithoutJamoT((char) c); /* LVT are skippable, LV are not */
2785: }
2786:
2787: /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2788: /* NF*C, test (f) flag */
2789: if (!isFormatVersion_2_2) {
2790: return false; /* no (f) data, say not skippable to be safe */
2791: }
2792:
2793: aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2794: return (aux & AUX_NFC_SKIP_F_MASK) == 0; /* TRUE=skippable if the (f) flag is not set */
2795:
2796: /* } else { FCC, test fcd<=1 instead of the above } */
2797: }
2798:
2799: /*
2800: private static final boolean
2801: _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
2802: // add the start code point to the USet
2803: uset_add((USet *)context, start);
2804: return TRUE;
2805: }
2806: */
2807:
2808: public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2809: int c;
2810:
2811: /* add the start code point of each same-value range of each trie */
2812: //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2813: TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2814: RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2815:
2816: while (normIter.next(normResult)) {
2817: set.add(normResult.start);
2818: }
2819:
2820: //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2821: TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2822: RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2823:
2824: while (fcdIter.next(fcdResult)) {
2825: set.add(fcdResult.start);
2826: }
2827:
2828: if (isFormatVersion_2_1) {
2829: //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2830: TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2831: RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2832: while (auxIter.next(auxResult)) {
2833: set.add(auxResult.start);
2834: }
2835: }
2836: /* add Hangul LV syllables and LV+1 because of skippables */
2837: for (c = HANGUL_BASE; c < HANGUL_BASE + HANGUL_COUNT; c += JAMO_T_COUNT) {
2838: set.add(c);
2839: set.add(c + 1);
2840: }
2841: set.add(HANGUL_BASE + HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2842: return set; // for chaining
2843: }
2844:
2845: /**
2846: * Internal API, used in UCharacter.getIntPropertyValue().
2847: * @internal
2848: * @param c code point
2849: * @param modeValue numeric value compatible with Mode
2850: * @return numeric value compatible with QuickCheck
2851: */
2852: public static final int quickCheck(int c, int modeValue) {
2853: final int qcMask[/*UNORM_MODE_COUNT*/] = { 0, 0, QC_NFD,
2854: QC_NFKD, QC_NFC, QC_NFKC };
2855:
2856: int norm32 = (int) getNorm32(c) & qcMask[modeValue];
2857:
2858: if (norm32 == 0) {
2859: return 1; // YES
2860: } else if ((norm32 & QC_ANY_NO) != 0) {
2861: return 0; // NO
2862: } else /* _NORM_QC_ANY_MAYBE */{
2863: return 2; // MAYBE;
2864: }
2865: }
2866:
2867: /**
2868: * Internal API, used by collation code.
2869: * Get access to the internal FCD trie table to be able to perform
2870: * incremental, per-code unit, FCD checks in collation.
2871: * One pointer is sufficient because the trie index values are offset
2872: * by the index size, so that the same pointer is used to access the trie
2873: * data.
2874: * @internal
2875: */
2876: ///CLOVER:OFF
2877: public CharTrie getFCDTrie() {
2878: return FCDTrieImpl.fcdTrie;
2879: }
2880:
2881: ///CLOVER:ON
2882:
2883: /* compare canonically equivalent ---------------------------------------- */
2884:
2885: /*
2886: * Compare two strings for canonical equivalence.
2887: * Further options include case-insensitive comparison and
2888: * code point order (as opposed to code unit order).
2889: *
2890: * In this function, canonical equivalence is optional as well.
2891: * If canonical equivalence is tested, then both strings must fulfill
2892: * the FCD check.
2893: *
2894: * Semantically, this is equivalent to
2895: * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
2896: * where code point order, NFD and foldCase are all optional.
2897: *
2898: * String comparisons almost always yield results before processing both
2899: * strings completely.
2900: * They are generally more efficient working incrementally instead of
2901: * performing the sub-processing (strlen, normalization, case-folding)
2902: * on the entire strings first.
2903: *
 * It is also unnecessary to normalize identical characters.
2905: *
2906: * This function works in principle as follows:
2907: *
2908: * loop {
2909: * get one code unit c1 from s1 (-1 if end of source)
2910: * get one code unit c2 from s2 (-1 if end of source)
2911: *
2912: * if(either string finished) {
2913: * return result;
2914: * }
2915: * if(c1==c2) {
2916: * continue;
2917: * }
2918: *
2919: * // c1!=c2
2920: * try to decompose/case-fold c1/c2, and continue if one does;
2921: *
2922: * // still c1!=c2 and neither decomposes/case-folds, return result
2923: * return c1-c2;
2924: * }
2925: *
2926: * When a character decomposes, then the pointer for that source changes to
2927: * the decomposition, pushing the previous pointer onto a stack.
2928: * When the end of the decomposition is reached, then the code unit reader
2929: * pops the previous source from the stack.
2930: * (Same for case-folding.)
2931: *
2932: * This is complicated further by operating on variable-width UTF-16.
2933: * The top part of the loop works on code units, while lookups for decomposition
2934: * and case-folding need code points.
2935: * Code points are assembled after the equality/end-of-source part.
2936: * The source pointer is only advanced beyond all code units when the code point
2937: * actually decomposes/case-folds.
2938: *
2939: * If we were on a trail surrogate unit when assembling a code point,
2940: * and the code point decomposes/case-folds, then the decomposition/folding
2941: * result must be compared with the part of the other string that corresponds to
2942: * this string's lead surrogate.
2943: * Since we only assemble a code point when hitting a trail unit when the
2944: * preceding lead units were identical, we back up the other string by one unit
2945: * in such a case.
2946: *
2947: * The optional code point order comparison at the end works with
2948: * the same fix-up as the other code point order comparison functions.
2949: * See ustring.c and the comment near the end of this function.
2950: *
2951: * Assumption: A decomposition or case-folding result string never contains
2952: * a single surrogate. This is a safe assumption in the Unicode Standard.
2953: * Therefore, we do not need to check for surrogate pairs across
2954: * decomposition/case-folding boundaries.
2955: * Further assumptions (see verifications tstnorm.cpp):
2956: * The API function checks for FCD first, while the core function
2957: * first case-folds and then decomposes. This requires that case-folding does not
2958: * un-FCD any strings.
2959: *
2960: * The API function may also NFD the input and turn off decomposition.
2961: * This requires that case-folding does not un-NFD strings either.
2962: *
2963: * TODO If any of the above two assumptions is violated,
2964: * then this entire code must be re-thought.
2965: * If this happens, then a simple solution is to case-fold both strings up front
2966: * and to turn off UNORM_INPUT_IS_FCD.
2967: * We already do this when not both strings are in FCD because makeFCD
2968: * would be a partial NFD before the case folding, which does not work.
2969: * Note that all of this is only a problem when case-folding _and_
2970: * canonical equivalence come together.
2971: *
2972: * This function could be moved to a different source file, at increased cost
2973: * for calling the decomposition access function.
2974: */
2975:
2976: // stack element for previous-level source/decomposition pointers
2977: private static class CmpEquivLevel {
2978: char[] source;
2979: int start;
2980: int s;
2981: int limit;
2982: }
2983:
2984: /**
2985: * Get the canonical decomposition for one code point.
2986: * @param c code point
2987: * @param buffer out-only buffer for algorithmic decompositions of Hangul
2988: * @param length out-only, takes the length of the decomposition, if any
2989: * @return index into the extraData array, or 0 if none
2990: * @internal
2991: */
2992: private static int decompose(int c, char[] buffer) {
2993:
2994: long norm32;
2995: int length = 0;
2996: norm32 = (long) ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie
2997: .getCodePointValue(c));
2998: if ((norm32 & QC_NFD) != 0) {
2999: if (isNorm32HangulOrJamo(norm32)) {
3000: /* Hangul syllable: decompose algorithmically */
3001: char c2;
3002:
3003: c -= HANGUL_BASE;
3004:
3005: c2 = (char) (c % JAMO_T_COUNT);
3006: c /= JAMO_T_COUNT;
3007: if (c2 > 0) {
3008: buffer[2] = (char) (JAMO_T_BASE + c2);
3009: length = 3;
3010: } else {
3011: length = 2;
3012: }
3013: buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
3014: buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
3015: return length;
3016: } else {
3017: /* normal decomposition */
3018: DecomposeArgs args = new DecomposeArgs();
3019: int index = decompose(norm32, args);
3020: System.arraycopy(extraData, index, buffer, 0,
3021: args.length);
3022: return args.length;
3023: }
3024: } else {
3025: return 0;
3026: }
3027: }
3028:
3029: private static int foldCase(int c, char[] dest, int destStart,
3030: int destLimit, int options) {
3031: String src = UTF16.valueOf(c);
3032: String foldedStr = UCharacter.foldCase(src, options);
3033: char[] foldedC = foldedStr.toCharArray();
3034: for (int i = 0; i < foldedC.length; i++) {
3035: if (destStart < destLimit) {
3036: dest[destStart] = foldedC[i];
3037: }
3038: // always increment destStart so that we can return
3039: // the required length
3040: destStart++;
3041: }
3042: return (c == UTF16.charAt(foldedStr, 0)) ? -destStart
3043: : destStart;
3044: }
3045:
3046: /*
3047: private static int foldCase(char[] src,int srcStart,int srcLimit,
3048: char[] dest, int destStart, int destLimit,
3049: int options){
3050: String source =new String(src,srcStart,(srcLimit-srcStart));
3051: String foldedStr = UCharacter.foldCase(source,options);
3052: char[] foldedC = foldedStr.toCharArray();
3053: for(int i=0;i<foldedC.length;i++){
3054: if(destStart<destLimit){
3055: dest[destStart]=foldedC[i];
3056: }
3057: // always increment destStart so that we can return
3058: // the required length
3059: destStart++;
3060:
3061: }
3062: return destStart;
3063: }
3064: */
3065: public static int cmpEquivFold(String s1, String s2, int options) {
3066: return cmpEquivFold(s1.toCharArray(), 0, s1.length(), s2
3067: .toCharArray(), 0, s2.length(), options);
3068: }
3069:
/**
 * Internal function: compares two UTF-16 ranges code unit by code unit,
 * and, where the code units differ, replaces the current code point of
 * either string by its case folding and/or canonical decomposition
 * (depending on {@code options}) and continues comparing inside that
 * replacement text.
 * <p>
 * Each string is read through up to three nested levels: the caller's
 * array, a case-folding buffer and a decomposition buffer. When a
 * replacement buffer is entered, the position in the enclosing level is
 * saved in a two-slot {@code CmpEquivLevel} stack; when the buffer is
 * exhausted the saved position is restored.
 *
 * @param s1 array holding string 1
 * @param s1Start index of the first code unit of string 1; also used as
 *                the running read index
 * @param s1Limit index just past the last code unit of string 1
 * @param s2 array holding string 2
 * @param s2Start index of the first code unit of string 2; also used as
 *                the running read index
 * @param s2Limit index just past the last code unit of string 2
 * @param options bit set; this method tests
 *                {@code Normalizer.COMPARE_IGNORE_CASE},
 *                {@code COMPARE_EQUIV} and
 *                {@code Normalizer.COMPARE_CODE_POINT_ORDER}, and passes
 *                the whole value on to {@code foldCase}
 * @return 0 if both strings end together without a difference, -1 if
 *         string 1 ends first, 1 if string 2 ends first, otherwise the
 *         difference {@code c1 - c2} of the first differing code units
 *         (after the code point order fix-up, if requested)
 */
public static int cmpEquivFold(char[] s1, int s1Start, int s1Limit,
        char[] s2, int s2Start, int s2Limit, int options) {
    // current-level start/limit - s1/s2 as current
    int start1, start2, limit1, limit2;
    char[] cSource1, cSource2;

    // arrays currently being read; replaced by fold/decomp buffers when a
    // level is pushed
    cSource1 = s1;
    cSource2 = s2;
    // decomposition variables
    int length;

    // stacks of previous-level start/current/limit
    CmpEquivLevel[] stack1 = new CmpEquivLevel[] {
            new CmpEquivLevel(), new CmpEquivLevel() };
    CmpEquivLevel[] stack2 = new CmpEquivLevel[] {
            new CmpEquivLevel(), new CmpEquivLevel() };

    // decomposition buffers for Hangul
    // (decompose() also copies non-Hangul decompositions into them)
    char[] decomp1 = new char[8];
    char[] decomp2 = new char[8];

    // case folding buffers, only use current-level start/limit
    char[] fold1 = new char[32];
    char[] fold2 = new char[32];

    // track which is the current level per string
    int level1, level2;

    // current code units, and code points for lookups
    int c1, c2;
    int cp1, cp2;

    // no argument error checking because this itself is not an API

    // assume that at least one of the options COMPARE_EQUIV and
    // COMPARE_IGNORE_CASE is set
    // otherwise this function must behave exactly as uprv_strCompare()
    // not checking for that here makes testing this function easier

    // initialize
    start1 = s1Start;
    limit1 = s1Limit;

    start2 = s2Start;
    limit2 = s2Limit;

    level1 = level2 = 0;
    c1 = c2 = -1;
    cp1 = cp2 = -1;
    // comparison loop
    for (;;) {
        // here a code unit value of -1 means "get another code unit"
        // below it will mean "this source is finished"

        if (c1 < 0) {
            // get next code unit from string 1, post-increment
            for (;;) {
                if (s1Start >= limit1) {
                    // current level exhausted; at level 0 the whole
                    // string is finished, otherwise fall through and pop
                    if (level1 == 0) {
                        c1 = -1;
                        break;
                    }
                } else {
                    c1 = cSource1[s1Start];
                    ++s1Start;
                    break;
                }

                // reached end of level buffer, pop one level
                // (levels whose saved start is -1 are empty placeholders
                // and are skipped)
                do {
                    --level1;
                    start1 = stack1[level1].start;
                } while (start1 == -1); //###### check this
                s1Start = stack1[level1].s;
                limit1 = stack1[level1].limit;
                cSource1 = stack1[level1].source;
            }
        }

        if (c2 < 0) {
            // get next code unit from string 2, post-increment
            for (;;) {
                if (s2Start >= limit2) {
                    // current level exhausted; at level 0 the whole
                    // string is finished, otherwise fall through and pop
                    if (level2 == 0) {
                        c2 = -1;
                        break;
                    }
                } else {
                    c2 = cSource2[s2Start];
                    ++s2Start;
                    break;
                }

                // reached end of level buffer, pop one level
                // (levels whose saved start is -1 are empty placeholders
                // and are skipped)
                do {
                    --level2;
                    start2 = stack2[level2].start;
                } while (start2 == -1);
                s2Start = stack2[level2].s;
                limit2 = stack2[level2].limit;
                cSource2 = stack2[level2].source;
            }
        }

        // compare c1 and c2
        // either variable c1, c2 is -1 only if the corresponding string
        // is finished
        if (c1 == c2) {
            if (c1 < 0) {
                return 0; // c1==c2==-1 indicating end of strings
            }
            c1 = c2 = -1; // make us fetch new code units
            continue;
        } else if (c1 < 0) {
            return -1; // string 1 ends before string 2
        } else if (c2 < 0) {
            return 1; // string 2 ends before string 1
        }
        // c1!=c2 && c1>=0 && c2>=0

        // get complete code points for c1, c2 for lookups if either is a
        // surrogate
        cp1 = c1;
        if (UTF16.isSurrogate((char) c1)) {
            char c;

            if (UTF16.isLeadSurrogate((char) c1)) {
                // s1Start already points at the code unit after c1
                if (s1Start != limit1
                        && UTF16
                                .isTrailSurrogate(c = cSource1[s1Start])) {
                    // advance ++s1; only below if cp1 decomposes/case-folds
                    cp1 = UCharacterProperty.getRawSupplementary(
                            (char) c1, c);
                }
            } else /* isTrail(c1) */{
                // s1Start - 2 is the code unit before c1, if it is still
                // inside the current level
                if (start1 <= (s1Start - 2)
                        && UTF16
                                .isLeadSurrogate(c = cSource1[(s1Start - 2)])) {
                    cp1 = UCharacterProperty.getRawSupplementary(c,
                            (char) c1);
                }
            }
        }
        cp2 = c2;
        if (UTF16.isSurrogate((char) c2)) {
            char c;

            if (UTF16.isLeadSurrogate((char) c2)) {
                // s2Start already points at the code unit after c2
                if (s2Start != limit2
                        && UTF16
                                .isTrailSurrogate(c = cSource2[s2Start])) {
                    // advance ++s2; only below if cp2 decomposes/case-folds
                    cp2 = UCharacterProperty.getRawSupplementary(
                            (char) c2, c);
                }
            } else /* isTrail(c2) */{
                // s2Start - 2 is the code unit before c2, if it is still
                // inside the current level
                if (start2 <= (s2Start - 2)
                        && UTF16
                                .isLeadSurrogate(c = cSource2[s2Start - 2])) {
                    cp2 = UCharacterProperty.getRawSupplementary(c,
                            (char) c2);
                }
            }
        }

        // go down one level for each string
        // continue with the main loop as soon as there is a real change
        // (foldCase() returns a negative value when cp1 does not change,
        // which makes the >= 0 test fail)
        // NOTE(review): the guard admits level1 == 1 although the push
        // below always writes stack slot 0 and reuses fold1; presumably
        // safe because text read from fold1 is already case-folded and
        // foldCase() then reports "unchanged" - confirm. The same applies
        // to the string 2 branch.
        if (level1 < 2
                && ((options & Normalizer.COMPARE_IGNORE_CASE) != 0)
                && (length = foldCase(cp1, fold1, 0, 32, options)) >= 0) {
            // cp1 case-folds to fold1[length]
            if (UTF16.isSurrogate((char) c1)) {
                if (UTF16.isLeadSurrogate((char) c1)) {
                    // advance beyond source surrogate pair if it
                    // case-folds
                    ++s1Start;
                } else /* isTrail(c1) */{
                    // we got a supplementary code point when hitting its
                    // trail surrogate, therefore the lead surrogate must
                    // have been the same as in the other string;
                    // compare this decomposition with the lead surrogate
                    // in the other string
                    --s2Start;
                    c2 = cSource2[(s2Start - 1)];
                }
            }

            // push current level pointers
            stack1[0].start = start1;
            stack1[0].s = s1Start;
            stack1[0].limit = limit1;
            stack1[0].source = cSource1;
            ++level1;

            // continue reading string 1 from the case-folding buffer
            cSource1 = fold1;
            start1 = s1Start = 0;
            limit1 = length;

            // get ready to read from decomposition, continue with loop
            c1 = -1;
            continue;
        }

        if (level2 < 2
                && ((options & Normalizer.COMPARE_IGNORE_CASE) != 0)
                && (length = foldCase(cp2, fold2, 0, 32, options)) >= 0) {
            // cp2 case-folds to fold2[length]
            if (UTF16.isSurrogate((char) c2)) {
                if (UTF16.isLeadSurrogate((char) c2)) {
                    // advance beyond source surrogate pair if it
                    // case-folds
                    ++s2Start;
                } else /* isTrail(c2) */{
                    // we got a supplementary code point when hitting its
                    // trail surrogate, therefore the lead surrogate must
                    // have been the same as in the other string;
                    // compare this decomposition with the lead surrogate
                    // in the other string
                    --s1Start;
                    c1 = cSource1[(s1Start - 1)];
                }
            }

            // push current level pointers
            stack2[0].start = start2;
            stack2[0].s = s2Start;
            stack2[0].limit = limit2;
            stack2[0].source = cSource2;
            ++level2;

            // continue reading string 2 from the case-folding buffer
            cSource2 = fold2;
            start2 = s2Start = 0;
            limit2 = length;

            // get ready to read from decomposition, continue with loop
            c2 = -1;
            continue;
        }

        // decompose(cp, buffer) returns the decomposition length, 0 if none
        if (level1 < 2 && ((options & COMPARE_EQUIV) != 0)
                && 0 != (length = decompose(cp1, decomp1))) {
            // cp1 decomposes into p[length]
            if (UTF16.isSurrogate((char) c1)) {
                if (UTF16.isLeadSurrogate((char) c1)) {
                    // advance beyond source surrogate pair if it
                    //decomposes
                    ++s1Start;
                } else /* isTrail(c1) */{
                    // we got a supplementary code point when hitting
                    // its trail surrogate, therefore the lead surrogate
                    // must have been the same as in the other string;
                    // compare this decomposition with the lead surrogate
                    // in the other string
                    --s2Start;
                    c2 = cSource2[(s2Start - 1)];
                }
            }

            // push current level pointers
            stack1[level1].start = start1;
            stack1[level1].s = s1Start;
            stack1[level1].limit = limit1;
            stack1[level1].source = cSource1;
            ++level1;

            // set next level pointers to decomposition
            cSource1 = decomp1;
            start1 = s1Start = 0;
            limit1 = length;

            // set empty intermediate level if skipped
            // (start == -1 is the marker the pop loop above skips over;
            // afterwards level1 is 2, so no further push can happen)
            if (level1 < 2) {
                stack1[level1++].start = -1;
            }
            // get ready to read from decomposition, continue with loop
            c1 = -1;
            continue;
        }

        if (level2 < 2 && ((options & COMPARE_EQUIV) != 0)
                && 0 != (length = decompose(cp2, decomp2))) {
            // cp2 decomposes into p[length]
            if (UTF16.isSurrogate((char) c2)) {
                if (UTF16.isLeadSurrogate((char) c2)) {
                    // advance beyond source surrogate pair if it
                    // decomposes
                    ++s2Start;
                } else /* isTrail(c2) */{
                    // we got a supplementary code point when hitting its
                    // trail surrogate, therefore the lead surrogate must
                    // have been the same as in the other string;
                    // compare this decomposition with the lead surrogate
                    // in the other string
                    --s1Start;
                    c1 = cSource1[(s1Start - 1)];
                }
            }

            // push current level pointers
            stack2[level2].start = start2;
            stack2[level2].s = s2Start;
            stack2[level2].limit = limit2;
            stack2[level2].source = cSource2;
            ++level2;

            // set next level pointers to decomposition
            cSource2 = decomp2;
            start2 = s2Start = 0;
            limit2 = length;

            // set empty intermediate level if skipped
            // (start == -1 is the marker the pop loop above skips over;
            // afterwards level2 is 2, so no further push can happen)
            if (level2 < 2) {
                stack2[level2++].start = -1;
            }

            // get ready to read from decomposition, continue with loop
            c2 = -1;
            continue;
        }

        // no decomposition/case folding, max level for both sides:
        // return difference result

        // code point order comparison must not just return cp1-cp2
        // because when single surrogates are present then the surrogate
        // pairs that formed cp1 and cp2 may be from different string
        // indexes

        // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at
        // second code units
        // c1=d800 cp1=10001 c2=dc00 cp2=10000
        // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32
        // it is { d800 10001 } < { 10000 }
        // therefore fix-up

        if (c1 >= 0xd800
                && c2 >= 0xd800
                && ((options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0)) {
            /* subtract 0x2800 from BMP code points to make them smaller
             * than supplementary ones */
            // c1 is half of a pair if it is a lead followed by a trail, or
            // a trail preceded (inside the current level) by a lead
            if ((c1 <= 0xdbff && s1Start != limit1 && UTF16
                    .isTrailSurrogate(cSource1[s1Start]))
                    || (UTF16.isTrailSurrogate((char) c1)
                            && start1 != (s1Start - 1) && UTF16
                            .isLeadSurrogate(cSource1[(s1Start - 2)]))) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point -
                 * make <d800 */
                c1 -= 0x2800;
            }

            // same pair test for c2
            if ((c2 <= 0xdbff && s2Start != limit2 && UTF16
                    .isTrailSurrogate(cSource2[s2Start]))
                    || (UTF16.isTrailSurrogate((char) c2)
                            && start2 != (s2Start - 1) && UTF16
                            .isLeadSurrogate(cSource2[(s2Start - 2)]))) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point -
                 * make <d800 */
                c2 -= 0x2800;
            }
        }

        return c1 - c2;
    }
}
3439:
3440: private static int strCompare(char[] s1, int s1Start, int s1Limit,
3441: char[] s2, int s2Start, int s2Limit, boolean codePointOrder) {
3442:
3443: int start1, start2, limit1, limit2;
3444:
3445: char c1, c2;
3446:
3447: /* setup for fix-up */
3448: start1 = s1Start;
3449: start2 = s2Start;
3450:
3451: int length1, length2;
3452:
3453: length1 = s1Limit - s1Start;
3454: length2 = s2Limit - s2Start;
3455:
3456: int lengthResult;
3457:
3458: if (length1 < length2) {
3459: lengthResult = -1;
3460: limit1 = start1 + length1;
3461: } else if (length1 == length2) {
3462: lengthResult = 0;
3463: limit1 = start1 + length1;
3464: } else /* length1>length2 */{
3465: lengthResult = 1;
3466: limit1 = start1 + length2;
3467: }
3468:
3469: if (s1 == s2) {
3470: return lengthResult;
3471: }
3472:
3473: for (;;) {
3474: /* check pseudo-limit */
3475: if (s1Start == limit1) {
3476: return lengthResult;
3477: }
3478:
3479: c1 = s1[s1Start];
3480: c2 = s2[s2Start];
3481: if (c1 != c2) {
3482: break;
3483: }
3484: ++s1Start;
3485: ++s2Start;
3486: }
3487:
3488: /* setup for fix-up */
3489: limit1 = start1 + length1;
3490: limit2 = start2 + length2;
3491:
3492: /* if both values are in or above the surrogate range, fix them up */
3493: if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
3494: /* subtract 0x2800 from BMP code points to make them smaller than
3495: * supplementary ones */
3496: if ((c1 <= 0xdbff && (s1Start + 1) != limit1 && UTF16
3497: .isTrailSurrogate(s1[(s1Start + 1)]))
3498: || (UTF16.isTrailSurrogate(c1) && start1 != s1Start && UTF16
3499: .isLeadSurrogate(s1[(s1Start - 1)]))) {
3500: /* part of a surrogate pair, leave >=d800 */
3501: } else {
3502: /* BMP code point - may be surrogate code point - make <d800 */
3503: c1 -= 0x2800;
3504: }
3505:
3506: if ((c2 <= 0xdbff && (s2Start + 1) != limit2 && UTF16
3507: .isTrailSurrogate(s2[(s2Start + 1)]))
3508: || (UTF16.isTrailSurrogate(c2) && start2 != s2Start && UTF16
3509: .isLeadSurrogate(s2[(s2Start - 1)]))) {
3510: /* part of a surrogate pair, leave >=d800 */
3511: } else {
3512: /* BMP code point - may be surrogate code point - make <d800 */
3513: c2 -= 0x2800;
3514: }
3515: }
3516:
3517: /* now c1 and c2 are in UTF-32-compatible order */
3518: return (int) c1 - (int) c2;
3519: }
3520:
3521: /*
3522: * Status of tailored normalization
3523: *
3524: * This was done initially for investigation on Unicode public review issue 7
3525: * (http://www.unicode.org/review/). See Jitterbug 2481.
3526: * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
3527: * a permanent feature in ICU 2.6 in support of IDNA which requires true
3528: * Unicode 3.2 normalization.
3529: * (NormalizationCorrections are rolled into IDNA mapping tables.)
3530: *
* Tailored normalization as implemented here makes it possible to
* "normalize less" than full Unicode normalization would.
3533: * Based internally on a UnicodeSet of code points that are
3534: * "excluded from normalization", the normalization functions leave those
3535: * code points alone ("inert"). This means that tailored normalization
3536: * still transforms text into a canonically equivalent form.
3537: * It does not add decompositions to code points that do not have any or
3538: * change decomposition results.
3539: *
3540: * Any function that searches for a safe boundary has not been touched,
3541: * which means that these functions will be over-pessimistic when
3542: * exclusions are applied.
3543: * This should not matter because subsequent checks and normalizations
3544: * do apply the exclusions; only a little more of the text may be processed
3545: * than necessary under exclusions.
3546: *
3547: * Normalization exclusions have the following effect on excluded code points c:
3548: * - c is not decomposed
3549: * - c is not a composition target
3550: * - c does not combine forward or backward for composition
3551: * except that this is not implemented for Jamo
3552: * - c is treated as having a combining class of 0
3553: */
3554:
3555: /*
3556: * Constants for the bit fields in the options bit set parameter.
3557: * These need not be public.
3558: * A user only needs to know the currently assigned values.
3559: * The number and positions of reserved bits per field can remain private.
3560: */
// Bits 0..4 of the options word: the individual exclusion-set flags
// (NX_HANGUL and NX_CJK_COMPAT lie in this field).
private static final int OPTIONS_NX_MASK = 0x1f;
// Bits 5..7 of the options word: the Unicode version field; the masked
// value is compared against Normalizer.UNICODE_3_2 in internalGetNXUnicode().
private static final int OPTIONS_UNICODE_MASK = 0xe0;
// All option bits that select exclusion sets (bits 0..7); getNX() and
// internalGetNX() mask their argument with this value.
public static final int OPTIONS_SETS_MASK = 0xff;
// Position of the lowest bit of OPTIONS_UNICODE_MASK.
// NOTE(review): not used in the visible part of the file - confirm usage.
private static final int OPTIONS_UNICODE_SHIFT = 5;
// Cache of exclusion sets, indexed by the masked options value;
// a null entry means the set has not been built yet. It is read and
// written only by the synchronized internalGetNX*() methods shown here.
private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK + 1];
3566:
3567: /* Constants for options flags for normalization.*/
3568:
3569: /**
3570: * Options bit 0, do not decompose Hangul syllables.
3571: * @draft ICU 2.6
3572: */
3573: private static final int NX_HANGUL = 1;
3574: /**
3575: * Options bit 1, do not decompose CJK compatibility characters.
3576: * @draft ICU 2.6
3577: */
3578: private static final int NX_CJK_COMPAT = 2;
3579: /**
3580: * Options bit 8, use buggy recomposition described in
3581: * Unicode Public Review Issue #29
3582: * at http://www.unicode.org/review/resolved-pri.html#pri29
3583: *
3584: * Used in IDNA implementation according to strict interpretation
3585: * of IDNA definition based on Unicode 3.2 which predates PRI #29.
3586: *
3587: * See ICU4C unormimp.h
3588: *
3589: * @draft ICU 3.2
3590: */
3591: public static final int BEFORE_PRI_29 = 0x100;
3592:
3593: /*
3594: * The following options are used only in some composition functions.
3595: * They use bits 12 and up to preserve lower bits for the available options
3596: * space in unorm_compare() -
3597: * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
3598: */
3599:
/**
 * Options bit 12, for compatibility vs. canonical decomposition:
 * when set, compatibility decomposition is requested.
 */
public static final int OPTIONS_COMPAT = 0x1000;
/**
 * Options bit 13, no discontiguous composition (FCC vs. NFC):
 * when set, only contiguous composition is requested.
 */
public static final int OPTIONS_COMPOSE_CONTIGUOUS = 0x2000;
3604:
3605: /* normalization exclusion sets --------------------------------------------- */
3606:
3607: /*
3608: * Normalization exclusion UnicodeSets are used for tailored normalization;
3609: * see the comment near the beginning of this file.
3610: *
3611: * By specifying one or several sets of code points,
3612: * those code points become inert for normalization.
3613: */
3614: private static final synchronized UnicodeSet internalGetNXHangul() {
3615: /* internal function, does not check for incoming U_FAILURE */
3616:
3617: if (nxCache[NX_HANGUL] == null) {
3618: nxCache[NX_HANGUL] = new UnicodeSet(0xac00, 0xd7a3);
3619: }
3620: return nxCache[NX_HANGUL];
3621: }
3622:
3623: private static final synchronized UnicodeSet internalGetNXCJKCompat() {
3624: /* internal function, does not check for incoming U_FAILURE */
3625:
3626: if (nxCache[NX_CJK_COMPAT] == null) {
3627:
3628: /* build a set from [CJK Ideographs]&[has canonical decomposition] */
3629: UnicodeSet set, hasDecomp;
3630:
3631: set = new UnicodeSet("[:Ideographic:]");
3632:
3633: /* start with an empty set for [has canonical decomposition] */
3634: hasDecomp = new UnicodeSet();
3635:
3636: /* iterate over all ideographs and remember which canonically decompose */
3637: UnicodeSetIterator it = new UnicodeSetIterator(set);
3638: int start, end;
3639: long norm32;
3640:
3641: while (it.nextRange()
3642: && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
3643: start = it.codepoint;
3644: end = it.codepointEnd;
3645: while (start <= end) {
3646: norm32 = getNorm32(start);
3647: if ((norm32 & QC_NFD) > 0) {
3648: hasDecomp.add(start);
3649: }
3650: ++start;
3651: }
3652: }
3653:
3654: /* hasDecomp now contains all ideographs that decompose canonically */
3655: nxCache[NX_CJK_COMPAT] = hasDecomp;
3656:
3657: }
3658:
3659: return nxCache[NX_CJK_COMPAT];
3660: }
3661:
3662: private static final synchronized UnicodeSet internalGetNXUnicode(
3663: int options) {
3664: options &= OPTIONS_UNICODE_MASK;
3665: if (options == 0) {
3666: return null;
3667: }
3668:
3669: if (nxCache[options] == null) {
3670: /* build a set with all code points that were not designated by the specified Unicode version */
3671: UnicodeSet set = new UnicodeSet();
3672:
3673: switch (options) {
3674: case Normalizer.UNICODE_3_2:
3675: set.applyPattern("[:^Age=3.2:]");
3676: break;
3677: default:
3678: return null;
3679: }
3680:
3681: nxCache[options] = set;
3682: }
3683:
3684: return nxCache[options];
3685: }
3686:
3687: /* Get a decomposition exclusion set. The data must be loaded. */
3688: private static final synchronized UnicodeSet internalGetNX(
3689: int options) {
3690: options &= OPTIONS_SETS_MASK;
3691:
3692: if (nxCache[options] == null) {
3693: /* return basic sets */
3694: if (options == NX_HANGUL) {
3695: return internalGetNXHangul();
3696: }
3697: if (options == NX_CJK_COMPAT) {
3698: return internalGetNXCJKCompat();
3699: }
3700: if ((options & OPTIONS_UNICODE_MASK) != 0
3701: && (options & OPTIONS_NX_MASK) == 0) {
3702: return internalGetNXUnicode(options);
3703: }
3704:
3705: /* build a set from multiple subsets */
3706: UnicodeSet set;
3707: UnicodeSet other;
3708:
3709: set = new UnicodeSet();
3710:
3711: if ((options & NX_HANGUL) != 0
3712: && null != (other = internalGetNXHangul())) {
3713: set.addAll(other);
3714: }
3715: if ((options & NX_CJK_COMPAT) != 0
3716: && null != (other = internalGetNXCJKCompat())) {
3717: set.addAll(other);
3718: }
3719: if ((options & OPTIONS_UNICODE_MASK) != 0
3720: && null != (other = internalGetNXUnicode(options))) {
3721: set.addAll(other);
3722: }
3723:
3724: nxCache[options] = set;
3725: }
3726: return nxCache[options];
3727: }
3728:
3729: public static final UnicodeSet getNX(int options) {
3730: if ((options &= OPTIONS_SETS_MASK) == 0) {
3731: /* incoming failure, or no decomposition exclusions requested */
3732: return null;
3733: } else {
3734: return internalGetNX(options);
3735: }
3736: }
3737:
3738: private static final boolean nx_contains(UnicodeSet nx, int c) {
3739: return nx != null && nx.contains(c);
3740: }
3741:
3742: private static final boolean nx_contains(UnicodeSet nx, char c,
3743: char c2) {
3744: return nx != null
3745: && nx.contains(c2 == 0 ? c : UCharacterProperty
3746: .getRawSupplementary(c, c2));
3747: }
3748:
3749: }
|