0001: /*
0002: * Portions Copyright 2003-2006 Sun Microsystems, Inc. All Rights Reserved.
0003: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0004: *
0005: * This code is free software; you can redistribute it and/or modify it
0006: * under the terms of the GNU General Public License version 2 only, as
0007: * published by the Free Software Foundation. Sun designates this
0008: * particular file as subject to the "Classpath" exception as provided
0009: * by Sun in the LICENSE file that accompanied this code.
0010: *
0011: * This code is distributed in the hope that it will be useful, but WITHOUT
0012: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0014: * version 2 for more details (a copy is included in the LICENSE file that
0015: * accompanied this code).
0016: *
0017: * You should have received a copy of the GNU General Public License version
0018: * 2 along with this work; if not, write to the Free Software Foundation,
0019: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0020: *
0021: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
0022: * CA 95054 USA or visit www.sun.com if you need additional information or
0023: * have any questions.
0024: */
0025:
0026: /*
0027: *******************************************************************************
0028: * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
0029: * *
0030: * The original version of this source code and documentation is copyrighted *
0031: * and owned by IBM, These materials are provided under terms of a License *
0032: * Agreement between IBM and Sun. This technology is protected by multiple *
0033: * US and International patents. This notice and attribution to IBM may not *
0034: * to removed. *
0035: *******************************************************************************
0036: */
0037:
0038: package sun.text.normalizer;
0039:
0040: import java.io.BufferedInputStream;
0041: import java.io.ByteArrayInputStream;
0042: import java.io.IOException;
0043: import java.io.BufferedInputStream;
0044: import java.io.InputStream;
0045:
0046: /**
0047: * @version 1.0
0048: * @author Ram Viswanadha
0049: */
0050: public final class NormalizerImpl {
// Singleton instance, created when the class is initialized. Its
// constructor loads the normalization data into the static tables.
static final NormalizerImpl IMPL;

static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // Keep the original exception as the cause so that the real
        // failure (and its stack trace) is not lost to callers.
        throw new RuntimeException(e.getMessage(), e);
    }
}
0061:
0062: static final int UNSIGNED_BYTE_MASK = 0xFF;
0063: static final long UNSIGNED_INT_MASK = 0xffffffffL;
0064: /*
0065: * This new implementation of the normalization code loads its data from
0066: * unorm.icu, which is generated with the gennorm tool.
0067: * The format of that file is described at the end of this file.
0068: */
0069: private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
0070:
0071: // norm32 value constants
0072:
0073: // quick check flags 0..3 set mean "no" for their forms
0074: public static final int QC_NFC = 0x11; /* no|maybe */
0075: public static final int QC_NFKC = 0x22; /* no|maybe */
0076: public static final int QC_NFD = 4; /* no */
0077: public static final int QC_NFKD = 8; /* no */
0078:
0079: public static final int QC_ANY_NO = 0xf;
0080:
0081: /* quick check flags 4..5 mean "maybe" for their forms;
0082: * test flags>=QC_MAYBE
0083: */
0084: public static final int QC_MAYBE = 0x10;
0085: public static final int QC_ANY_MAYBE = 0x30;
0086:
0087: public static final int QC_MASK = 0x3f;
0088:
0089: private static final int COMBINES_FWD = 0x40;
0090: private static final int COMBINES_BACK = 0x80;
0091: public static final int COMBINES_ANY = 0xc0;
0092: // UnicodeData.txt combining class in bits 15.
0093: private static final int CC_SHIFT = 8;
0094: public static final int CC_MASK = 0xff00;
0095: // 16 bits for the index to UChars and other extra data
0096: private static final int EXTRA_SHIFT = 16;
0097:
0098: /* norm32 value constants using >16 bits */
0099: private static final long MIN_SPECIAL = (long) (0xfc000000 & UNSIGNED_INT_MASK);
0100: private static final long SURROGATES_TOP = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0101: private static final long MIN_HANGUL = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0102: private static final long MIN_JAMO_V = (long) (0xfff20000 & UNSIGNED_INT_MASK);
0103: private static final long JAMO_V_TOP = (long) (0xfff30000 & UNSIGNED_INT_MASK);
0104:
0105: /* indexes[] value names */
0106: /* number of bytes in normalization trie */
0107: static final int INDEX_TRIE_SIZE = 0;
0108: /* number of chars in extra data */
0109: static final int INDEX_CHAR_COUNT = 1;
0110: /* number of uint16_t words for combining data */
0111: static final int INDEX_COMBINE_DATA_COUNT = 2;
0112: /* first code point with quick check NFC NO/MAYBE */
0113: public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
0114: /* first code point with quick check NFKC NO/MAYBE */
0115: public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
0116: /* first code point with quick check NFD NO/MAYBE */
0117: public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
0118: /* first code point with quick check NFKD NO/MAYBE */
0119: public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
0120: /* number of bytes in FCD trie */
0121: static final int INDEX_FCD_TRIE_SIZE = 10;
0122: /* number of bytes in the auxiliary trie */
0123: static final int INDEX_AUX_TRIE_SIZE = 11;
0124: /* changing this requires a new formatVersion */
0125: static final int INDEX_TOP = 32;
0126:
0127: /* AUX constants */
0128: /* value constants for auxTrie */
0129: private static final int AUX_UNSAFE_SHIFT = 11;
0130: private static final int AUX_COMP_EX_SHIFT = 10;
0131: private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
0132:
0133: private static final int AUX_MAX_FNC = ((int) 1 << AUX_COMP_EX_SHIFT);
0134: private static final int AUX_UNSAFE_MASK = (int) ((1 << AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
0135: private static final int AUX_FNC_MASK = (int) ((AUX_MAX_FNC - 1) & UNSIGNED_INT_MASK);
0136: private static final int AUX_COMP_EX_MASK = (int) ((1 << AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
0137: private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK & 1) << AUX_NFC_SKIPPABLE_F_SHIFT);
0138:
0139: private static final int MAX_BUFFER_SIZE = 20;
0140:
0141: /*******************************/
0142:
0143: /* Wrappers for Trie implementations */
0144: static final class NormTrieImpl implements Trie.DataManipulate {
0145: static IntTrie normTrie = null;
0146:
0147: /**
0148: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0149: * data the index array offset of the indexes for that lead surrogate.
0150: * @param property data value for a surrogate from the trie, including
0151: * the folding offset
0152: * @return data offset or 0 if there is no data for the lead surrogate
0153: */
0154: /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
0155: public int getFoldingOffset(int value) {
0156: return BMP_INDEX_LENGTH
0157: + ((value >> (EXTRA_SHIFT - SURROGATE_BLOCK_BITS)) & (0x3ff << SURROGATE_BLOCK_BITS));
0158: }
0159:
0160: }
0161:
0162: static final class FCDTrieImpl implements Trie.DataManipulate {
0163: static CharTrie fcdTrie = null;
0164:
0165: /**
0166: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0167: * data the index array offset of the indexes for that lead surrogate.
0168: * @param property data value for a surrogate from the trie, including
0169: * the folding offset
0170: * @return data offset or 0 if there is no data for the lead surrogate
0171: */
0172: /* fcdTrie: the folding offset is the lead FCD value itself */
0173: public int getFoldingOffset(int value) {
0174: return value;
0175: }
0176: }
0177:
0178: static final class AuxTrieImpl implements Trie.DataManipulate {
0179: static CharTrie auxTrie = null;
0180:
0181: /**
0182: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0183: * data the index array offset of the indexes for that lead surrogate.
0184: * @param property data value for a surrogate from the trie, including
0185: * the folding offset
0186: * @return data offset or 0 if there is no data for the lead surrogate
0187: */
0188: /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
0189: public int getFoldingOffset(int value) {
0190: return (int) (value & AUX_FNC_MASK) << SURROGATE_BLOCK_BITS;
0191: }
0192: }
0193:
0194: /****************************************************/
0195:
0196: private static FCDTrieImpl fcdTrieImpl;
0197: private static NormTrieImpl normTrieImpl;
0198: private static AuxTrieImpl auxTrieImpl;
0199: private static int[] indexes;
0200: private static char[] combiningTable;
0201: private static char[] extraData;
0202:
0203: private static boolean isDataLoaded;
0204: private static boolean isFormatVersion_2_1;
0205: private static boolean isFormatVersion_2_2;
0206: private static byte[] unicodeVersion;
0207:
0208: /**
0209: * Default buffer size of datafile
0210: */
0211: private static final int DATA_BUFFER_SIZE = 25000;
0212:
0213: /**
0214: * FCD check: everything below this code point is known to have a 0
0215: * lead combining class
0216: */
0217: public static final int MIN_WITH_LEAD_CC = 0x300;
0218:
0219: /**
0220: * Bit 7 of the length byte for a decomposition string in extra data is
0221: * a flag indicating whether the decomposition string is
0222: * preceded by a 16-bit word with the leading and trailing cc
0223: * of the decomposition (like for A-umlaut);
0224: * if not, then both cc's are zero (like for compatibility ideographs).
0225: */
0226: private static final int DECOMP_FLAG_LENGTH_HAS_CC = 0x80;
0227: /**
0228: * Bits 6..0 of the length byte contain the actual length.
0229: */
0230: private static final int DECOMP_LENGTH_MASK = 0x7f;
0231:
0232: /** Length of the BMP portion of the index (stage 1) array. */
0233: private static final int BMP_INDEX_LENGTH = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
0234: /** Number of bits of a trail surrogate that are used in index table
0235: * lookups.
0236: */
0237: private static final int SURROGATE_BLOCK_BITS = 10 - Trie.INDEX_STAGE_1_SHIFT_;
0238:
0239: // public utility
0240: public static int getFromIndexesArr(int index) {
0241: return indexes[index];
0242: }
0243:
0244: // protected constructor ---------------------------------------------
0245:
0246: /**
0247: * Constructor
0248: * @exception thrown when data reading fails or data corrupted
0249: */
0250: private NormalizerImpl() throws IOException {
0251: //data should be loaded only once
0252: if (!isDataLoaded) {
0253:
0254: // jar access
0255: InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
0256: BufferedInputStream b = new BufferedInputStream(i,
0257: DATA_BUFFER_SIZE);
0258: NormalizerDataReader reader = new NormalizerDataReader(b);
0259:
0260: // read the indexes
0261: indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
0262:
0263: byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
0264:
0265: int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
0266: combiningTable = new char[combiningTableTop];
0267:
0268: int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
0269: extraData = new char[extraDataTop];
0270:
0271: byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
0272: byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
0273:
0274: fcdTrieImpl = new FCDTrieImpl();
0275: normTrieImpl = new NormTrieImpl();
0276: auxTrieImpl = new AuxTrieImpl();
0277:
0278: // load the rest of the data data and initialize the data members
0279: reader.read(normBytes, fcdBytes, auxBytes, extraData,
0280: combiningTable);
0281:
0282: NormTrieImpl.normTrie = new IntTrie(
0283: new ByteArrayInputStream(normBytes), normTrieImpl);
0284: FCDTrieImpl.fcdTrie = new CharTrie(
0285: new ByteArrayInputStream(fcdBytes), fcdTrieImpl);
0286: AuxTrieImpl.auxTrie = new CharTrie(
0287: new ByteArrayInputStream(auxBytes), auxTrieImpl);
0288:
0289: // we reached here without any exceptions so the data is fully
0290: // loaded set the variable to true
0291: isDataLoaded = true;
0292:
0293: // get the data format version
0294: byte[] formatVersion = reader.getDataFormatVersion();
0295:
0296: isFormatVersion_2_1 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 1));
0297: isFormatVersion_2_2 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 2));
0298: unicodeVersion = reader.getUnicodeVersion();
0299: b.close();
0300: }
0301: }
0302:
0303: /* ---------------------------------------------------------------------- */
0304:
0305: /* Korean Hangul and Jamo constants */
0306:
0307: public static final int JAMO_L_BASE = 0x1100; /* "lead" jamo */
0308: public static final int JAMO_V_BASE = 0x1161; /* "vowel" jamo */
0309: public static final int JAMO_T_BASE = 0x11a7; /* "trail" jamo */
0310:
0311: public static final int HANGUL_BASE = 0xac00;
0312:
0313: public static final int JAMO_L_COUNT = 19;
0314: public static final int JAMO_V_COUNT = 21;
0315: public static final int JAMO_T_COUNT = 28;
0316: public static final int HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT
0317: * JAMO_T_COUNT;
0318:
0319: private static boolean isHangulWithoutJamoT(char c) {
0320: c -= HANGUL_BASE;
0321: return c < HANGUL_COUNT && c % JAMO_T_COUNT == 0;
0322: }
0323:
0324: /* norm32 helpers */
0325:
0326: /* is this a norm32 with a regular index? */
0327: private static boolean isNorm32Regular(long norm32) {
0328: return norm32 < MIN_SPECIAL;
0329: }
0330:
0331: /* is this a norm32 with a special index for a lead surrogate? */
0332: private static boolean isNorm32LeadSurrogate(long norm32) {
0333: return MIN_SPECIAL <= norm32 && norm32 < SURROGATES_TOP;
0334: }
0335:
0336: /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
0337: private static boolean isNorm32HangulOrJamo(long norm32) {
0338: return norm32 >= MIN_HANGUL;
0339: }
0340:
0341: /*
0342: * Given norm32 for Jamo V or T,
0343: * is this a Jamo V?
0344: */
0345: private static boolean isJamoVTNorm32JamoV(long norm32) {
0346: return norm32 < JAMO_V_TOP;
0347: }
0348:
0349: /* data access primitives ----------------------------------------------- */
0350:
0351: public static long/*unsigned*/getNorm32(char c) {
0352: return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie
0353: .getLeadValue(c)));
0354: }
0355:
0356: public static long/*unsigned*/getNorm32FromSurrogatePair(
0357: long norm32, char c2) {
0358: /*
0359: * the surrogate index in norm32 stores only the number of the surrogate
0360: * index block see gennorm/store.c/getFoldedNormValue()
0361: */
0362: return ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie
0363: .getTrailValue((int) norm32, c2));
0364: }
0365:
0366: ///CLOVER:OFF
0367: private static long getNorm32(int c) {
0368: return (UNSIGNED_INT_MASK & (NormTrieImpl.normTrie
0369: .getCodePointValue(c)));
0370: }
0371:
0372: /*
0373: * get a norm32 from text with complete code points
0374: * (like from decompositions)
0375: */
0376: private static long/*unsigned*/getNorm32(char[] p, int start,
0377: int/*unsigned*/mask) {
0378: long/*unsigned*/norm32 = getNorm32(p[start]);
0379: if (((norm32 & mask) > 0) && isNorm32LeadSurrogate(norm32)) {
0380: /* *p is a lead surrogate, get the real norm32 */
0381: norm32 = getNorm32FromSurrogatePair(norm32, p[start + 1]);
0382: }
0383: return norm32;
0384: }
0385:
0386: //// for StringPrep
0387: public static VersionInfo getUnicodeVersion() {
0388: return VersionInfo
0389: .getInstance(unicodeVersion[0], unicodeVersion[1],
0390: unicodeVersion[2], unicodeVersion[3]);
0391: }
0392:
0393: public static char getFCD16(char c) {
0394: return FCDTrieImpl.fcdTrie.getLeadValue(c);
0395: }
0396:
0397: public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
0398: /* the surrogate index in fcd16 is an absolute offset over the
0399: * start of stage 1
0400: * */
0401: return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
0402: }
0403:
0404: public static int getFCD16(int c) {
0405: return FCDTrieImpl.fcdTrie.getCodePointValue(c);
0406: }
0407:
0408: private static int getExtraDataIndex(long norm32) {
0409: return (int) (norm32 >> EXTRA_SHIFT);
0410: }
0411:
0412: private static final class DecomposeArgs {
0413: int /*unsigned byte*/cc;
0414: int /*unsigned byte*/trailCC;
0415: int length;
0416: }
0417:
0418: /**
0419: *
0420: * get the canonical or compatibility decomposition for one character
0421: *
0422: * @return index into the extraData array
0423: */
0424: private static int/*index*/decompose(long/*unsigned*/norm32,
0425: int/*unsigned*/qcMask, DecomposeArgs args) {
0426: int p = getExtraDataIndex(norm32);
0427: args.length = extraData[p++];
0428:
0429: if ((norm32 & qcMask & QC_NFKD) != 0 && args.length >= 0x100) {
0430: /* use compatibility decomposition, skip canonical data */
0431: p += ((args.length >> 7) & 1)
0432: + (args.length & DECOMP_LENGTH_MASK);
0433: args.length >>= 8;
0434: }
0435:
0436: if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0437: /* get the lead and trail cc's */
0438: char bothCCs = extraData[p++];
0439: args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0440: args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0441: } else {
0442: /* lead and trail cc's are both 0 */
0443: args.cc = args.trailCC = 0;
0444: }
0445:
0446: args.length &= DECOMP_LENGTH_MASK;
0447: return p;
0448: }
0449:
0450: /**
0451: * get the canonical decomposition for one character
0452: * @return index into the extraData array
0453: */
0454: private static int decompose(long/*unsigned*/norm32,
0455: DecomposeArgs args) {
0456:
0457: int p = getExtraDataIndex(norm32);
0458: args.length = extraData[p++];
0459:
0460: if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0461: /* get the lead and trail cc's */
0462: char bothCCs = extraData[p++];
0463: args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0464: args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0465: } else {
0466: /* lead and trail cc's are both 0 */
0467: args.cc = args.trailCC = 0;
0468: }
0469:
0470: args.length &= DECOMP_LENGTH_MASK;
0471: return p;
0472: }
0473:
0474: private static final class NextCCArgs {
0475: char[] source;
0476: int next;
0477: int limit;
0478: char c;
0479: char c2;
0480: }
0481:
0482: /*
0483: * get the combining class of (c, c2)= args.source[args.next++]
0484: * before: args.next<args.limit after: args.next<=args.limit
0485: * if only one code unit is used, then c2==0
0486: */
0487: private static int /*unsigned byte*/getNextCC(NextCCArgs args) {
0488: long /*unsigned*/norm32;
0489:
0490: args.c = args.source[args.next++];
0491:
0492: norm32 = getNorm32(args.c);
0493: if ((norm32 & CC_MASK) == 0) {
0494: args.c2 = 0;
0495: return 0;
0496: } else {
0497: if (!isNorm32LeadSurrogate(norm32)) {
0498: args.c2 = 0;
0499: } else {
0500: /* c is a lead surrogate, get the real norm32 */
0501: if (args.next != args.limit
0502: && UTF16
0503: .isTrailSurrogate(args.c2 = args.source[args.next])) {
0504: ++args.next;
0505: norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
0506: } else {
0507: args.c2 = 0;
0508: return 0;
0509: }
0510: }
0511:
0512: return (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
0513: }
0514: }
0515:
0516: private static final class PrevArgs {
0517: char[] src;
0518: int start;
0519: int current;
0520: char c;
0521: char c2;
0522: }
0523:
0524: /*
0525: * read backwards and get norm32
0526: * return 0 if the character is <minC
0527: * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
0528: * surrogate but read second!)
0529: */
0530: private static long /*unsigned*/getPrevNorm32(PrevArgs args,
0531: int/*unsigned*/minC, int/*unsigned*/mask) {
0532: long/*unsigned*/norm32;
0533:
0534: args.c = args.src[--args.current];
0535: args.c2 = 0;
0536:
0537: /* check for a surrogate before getting norm32 to see if we need to
0538: * predecrement further
0539: */
0540: if (args.c < minC) {
0541: return 0;
0542: } else if (!UTF16.isSurrogate(args.c)) {
0543: return getNorm32(args.c);
0544: } else if (UTF16.isLeadSurrogate(args.c)) {
0545: /* unpaired first surrogate */
0546: return 0;
0547: } else if (args.current != args.start
0548: && UTF16
0549: .isLeadSurrogate(args.c2 = args.src[args.current - 1])) {
0550: --args.current;
0551: norm32 = getNorm32(args.c2);
0552:
0553: if ((norm32 & mask) == 0) {
0554: /* all surrogate pairs with this lead surrogate have
0555: * only irrelevant data
0556: */
0557: return 0;
0558: } else {
0559: /* norm32 must be a surrogate special */
0560: return getNorm32FromSurrogatePair(norm32, args.c);
0561: }
0562: } else {
0563: /* unpaired second surrogate */
0564: args.c2 = 0;
0565: return 0;
0566: }
0567: }
0568:
0569: /*
0570: * get the combining class of (c, c2)=*--p
0571: * before: start<p after: start<=p
0572: */
0573: private static int /*unsigned byte*/getPrevCC(PrevArgs args) {
0574:
0575: return (int) ((UNSIGNED_BYTE_MASK) & (getPrevNorm32(args,
0576: MIN_WITH_LEAD_CC, CC_MASK) >> CC_SHIFT));
0577: }
0578:
0579: /*
0580: * is this a safe boundary character for NF*D?
0581: * (lead cc==0)
0582: */
0583: public static boolean isNFDSafe(long/*unsigned*/norm32,
0584: int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0585: if ((norm32 & ccOrQCMask) == 0) {
0586: return true; /* cc==0 and no decomposition: this is NF*D safe */
0587: }
0588:
0589: /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
0590: if (isNorm32Regular(norm32) && (norm32 & decompQCMask) != 0) {
0591: DecomposeArgs args = new DecomposeArgs();
0592: /* decomposes, get everything from the variable-length extra data */
0593: decompose(norm32, decompQCMask, args);
0594: return args.cc == 0;
0595: } else {
0596: /* no decomposition (or Hangul), test the cc directly */
0597: return (norm32 & CC_MASK) == 0;
0598: }
0599: }
0600:
0601: /*
0602: * is this (or does its decomposition begin with) a "true starter"?
0603: * (cc==0 and NF*C_YES)
0604: */
0605: public static boolean isTrueStarter(long/*unsigned*/norm32,
0606: int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0607: if ((norm32 & ccOrQCMask) == 0) {
0608: return true; /* this is a true starter (could be Hangul or Jamo L)*/
0609: }
0610:
0611: /* inspect its decomposition - not a Hangul or a surrogate here */
0612: if ((norm32 & decompQCMask) != 0) {
0613: int p; /* index into extra data array */
0614: DecomposeArgs args = new DecomposeArgs();
0615: /* decomposes, get everything from the variable-length extra data */
0616: p = decompose(norm32, decompQCMask, args);
0617:
0618: if (args.cc == 0) {
0619: int/*unsigned*/qcMask = ccOrQCMask & QC_MASK;
0620:
0621: /* does it begin with NFC_YES? */
0622: if ((getNorm32(extraData, p, qcMask) & qcMask) == 0) {
0623: /* yes, the decomposition begins with a true starter */
0624: return true;
0625: }
0626: }
0627: }
0628: return false;
0629: }
0630:
0631: /* reorder UTF-16 in-place ---------------------------------------------- */
0632:
0633: /**
0634: * simpler, single-character version of mergeOrdered() -
0635: * bubble-insert one single code point into the preceding string
0636: * which is already canonically ordered
0637: * (c, c2) may or may not yet have been inserted at src[current]..src[p]
0638: *
0639: * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
0640: *
0641: * before: src[start]..src[current] is already ordered, and
0642: * src[current]..src[p] may or may not hold (c, c2) but
0643: * must be exactly the same length as (c, c2)
0644: * after: src[start]..src[p] is ordered
0645: *
0646: * @return the trailing combining class
0647: */
0648: private static int/*unsigned byte*/insertOrdered(char[] source,
0649: int start, int current, int p, char c, char c2,
0650: int/*unsigned byte*/cc) {
0651: int back, preBack;
0652: int r;
0653: int prevCC, trailCC = cc;
0654:
0655: if (start < current && cc != 0) {
0656: // search for the insertion point where cc>=prevCC
0657: preBack = back = current;
0658: PrevArgs prevArgs = new PrevArgs();
0659: prevArgs.current = current;
0660: prevArgs.start = start;
0661: prevArgs.src = source;
0662: // get the prevCC
0663: prevCC = getPrevCC(prevArgs);
0664: preBack = prevArgs.current;
0665:
0666: if (cc < prevCC) {
0667: // this will be the last code point, so keep its cc
0668: trailCC = prevCC;
0669: back = preBack;
0670: while (start < preBack) {
0671: prevCC = getPrevCC(prevArgs);
0672: preBack = prevArgs.current;
0673: if (cc >= prevCC) {
0674: break;
0675: }
0676: back = preBack;
0677: }
0678:
0679: // this is where we are right now with all these indicies:
0680: // [start]..[pPreBack] 0..? code points that we can ignore
0681: // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
0682: // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
0683: // [current]..[p] 1 code point (c, c2) with cc
0684:
0685: // move the code units in between up
0686: r = p;
0687: do {
0688: source[--r] = source[--current];
0689: } while (back != current);
0690: }
0691: }
0692:
0693: // insert (c, c2)
0694: source[current] = c;
0695: if (c2 != 0) {
0696: source[(current + 1)] = c2;
0697: }
0698:
0699: // we know the cc of the last code point
0700: return trailCC;
0701: }
0702:
0703: /**
0704: * merge two UTF-16 string parts together
0705: * to canonically order (order by combining classes) their concatenation
0706: *
0707: * the two strings may already be adjacent, so that the merging is done
0708: * in-place if the two strings are not adjacent, then the buffer holding the
0709: * first one must be large enough
0710: * the second string may or may not be ordered in itself
0711: *
0712: * before: [start]..[current] is already ordered, and
0713: * [next]..[limit] may be ordered in itself, but
0714: * is not in relation to [start..current[
0715: * after: [start..current+(limit-next)[ is ordered
0716: *
0717: * the algorithm is a simple bubble-sort that takes the characters from
0718: * src[next++] and inserts them in correct combining class order into the
0719: * preceding part of the string
0720: *
0721: * since this function is called much less often than the single-code point
0722: * insertOrdered(), it just uses that for easier maintenance
0723: *
0724: * @return the trailing combining class
0725: */
0726: private static int /*unsigned byte*/mergeOrdered(char[] source,
0727: int start, int current, char[] data, int next, int limit,
0728: boolean isOrdered) {
0729: int r;
0730: int /*unsigned byte*/cc, trailCC = 0;
0731: boolean adjacent;
0732:
0733: adjacent = current == next;
0734: NextCCArgs ncArgs = new NextCCArgs();
0735: ncArgs.source = data;
0736: ncArgs.next = next;
0737: ncArgs.limit = limit;
0738:
0739: if (start != current || !isOrdered) {
0740:
0741: while (ncArgs.next < ncArgs.limit) {
0742: cc = getNextCC(ncArgs);
0743: if (cc == 0) {
0744: // does not bubble back
0745: trailCC = 0;
0746: if (adjacent) {
0747: current = ncArgs.next;
0748: } else {
0749: data[current++] = ncArgs.c;
0750: if (ncArgs.c2 != 0) {
0751: data[current++] = ncArgs.c2;
0752: }
0753: }
0754: if (isOrdered) {
0755: break;
0756: } else {
0757: start = current;
0758: }
0759: } else {
0760: r = current + (ncArgs.c2 == 0 ? 1 : 2);
0761: trailCC = insertOrdered(source, start, current, r,
0762: ncArgs.c, ncArgs.c2, cc);
0763: current = r;
0764: }
0765: }
0766: }
0767:
0768: if (ncArgs.next == ncArgs.limit) {
0769: // we know the cc of the last code point
0770: return trailCC;
0771: } else {
0772: if (!adjacent) {
0773: // copy the second string part
0774: do {
0775: source[current++] = data[ncArgs.next++];
0776: } while (ncArgs.next != ncArgs.limit);
0777: ncArgs.limit = current;
0778: }
0779: PrevArgs prevArgs = new PrevArgs();
0780: prevArgs.src = data;
0781: prevArgs.start = start;
0782: prevArgs.current = ncArgs.limit;
0783: return getPrevCC(prevArgs);
0784: }
0785:
0786: }
0787:
0788: private static int /*unsigned byte*/mergeOrdered(char[] source,
0789: int start, int current, char[] data, final int next,
0790: final int limit) {
0791: return mergeOrdered(source, start, current, data, next, limit,
0792: true);
0793: }
0794:
0795: public static NormalizerBase.QuickCheckResult quickCheck(
0796: char[] src, int srcStart, int srcLimit, int minNoMaybe,
0797: int qcMask, int options, boolean allowMaybe, UnicodeSet nx) {
0798:
0799: int ccOrQCMask;
0800: long norm32;
0801: char c, c2;
0802: char cc, prevCC;
0803: long qcNorm32;
0804: NormalizerBase.QuickCheckResult result;
0805: ComposePartArgs args = new ComposePartArgs();
0806: char[] buffer;
0807: int start = srcStart;
0808:
0809: if (!isDataLoaded) {
0810: return NormalizerBase.MAYBE;
0811: }
0812: // initialize
0813: ccOrQCMask = CC_MASK | qcMask;
0814: result = NormalizerBase.YES;
0815: prevCC = 0;
0816:
0817: for (;;) {
0818: for (;;) {
0819: if (srcStart == srcLimit) {
0820: return result;
0821: } else if ((c = src[srcStart++]) >= minNoMaybe
0822: && ((norm32 = getNorm32(c)) & ccOrQCMask) != 0) {
0823: break;
0824: }
0825: prevCC = 0;
0826: }
0827:
0828: // check one above-minimum, relevant code unit
0829: if (isNorm32LeadSurrogate(norm32)) {
0830: // c is a lead surrogate, get the real norm32
0831: if (srcStart != srcLimit
0832: && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
0833: ++srcStart;
0834: norm32 = getNorm32FromSurrogatePair(norm32, c2);
0835: } else {
0836: norm32 = 0;
0837: c2 = 0;
0838: }
0839: } else {
0840: c2 = 0;
0841: }
0842: if (nx_contains(nx, c, c2)) {
0843: /* excluded: norm32==0 */
0844: norm32 = 0;
0845: }
0846:
0847: // check the combining order
0848: cc = (char) ((norm32 >> CC_SHIFT) & 0xFF);
0849: if (cc != 0 && cc < prevCC) {
0850: return NormalizerBase.NO;
0851: }
0852: prevCC = cc;
0853:
0854: // check for "no" or "maybe" quick check flags
0855: qcNorm32 = norm32 & qcMask;
0856: if ((qcNorm32 & QC_ANY_NO) >= 1) {
0857: result = NormalizerBase.NO;
0858: break;
0859: } else if (qcNorm32 != 0) {
0860: // "maybe" can only occur for NFC and NFKC
0861: if (allowMaybe) {
0862: result = NormalizerBase.MAYBE;
0863: } else {
0864: // normalize a section around here to see if it is really
0865: // normalized or not
0866: int prevStarter;
0867: int/*unsigned*/decompQCMask;
0868:
0869: decompQCMask = (qcMask << 2) & 0xf; // decomposition quick check mask
0870:
0871: // find the previous starter
0872:
0873: // set prevStarter to the beginning of the current character
0874: prevStarter = srcStart - 1;
0875: if (UTF16.isTrailSurrogate(src[prevStarter])) {
0876: // safe because unpaired surrogates do not result
0877: // in "maybe"
0878: --prevStarter;
0879: }
0880: prevStarter = findPreviousStarter(src, start,
0881: prevStarter, ccOrQCMask, decompQCMask,
0882: (char) minNoMaybe);
0883:
0884: // find the next true starter in [src..limit[ - modifies
0885: // src to point to the next starter
0886: srcStart = findNextStarter(src, srcStart, srcLimit,
0887: qcMask, decompQCMask, (char) minNoMaybe);
0888:
0889: //set the args for compose part
0890: args.prevCC = prevCC;
0891:
0892: // decompose and recompose [prevStarter..src[
0893: buffer = composePart(args, prevStarter, src,
0894: srcStart, srcLimit, options, nx);
0895:
0896: // compare the normalized version with the original
0897: if (0 != strCompare(buffer, 0, args.length, src,
0898: prevStarter, (srcStart - prevStarter),
0899: false)) {
0900: result = NormalizerBase.NO; // normalization differs
0901: break;
0902: }
0903:
0904: // continue after the next starter
0905: }
0906: }
0907: }
0908: return result;
0909: }
0910:
0911: //------------------------------------------------------
0912: // make NFD & NFKD
0913: //------------------------------------------------------
0914:
0915: public static int decompose(char[] src, int srcStart, int srcLimit,
0916: char[] dest, int destStart, int destLimit, boolean compat,
0917: int[] outTrailCC, UnicodeSet nx) {
0918:
0919: char[] buffer = new char[3];
0920: int prevSrc;
0921: long norm32;
0922: int ccOrQCMask, qcMask;
0923: int reorderStartIndex, length;
0924: char c, c2, minNoMaybe;
0925: int/*unsigned byte*/cc, prevCC, trailCC;
0926: char[] p;
0927: int pStart;
0928: int destIndex = destStart;
0929: int srcIndex = srcStart;
0930: if (!compat) {
0931: minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
0932: qcMask = QC_NFD;
0933: } else {
0934: minNoMaybe = (char) indexes[INDEX_MIN_NFKD_NO_MAYBE];
0935: qcMask = QC_NFKD;
0936: }
0937:
0938: /* initialize */
0939: ccOrQCMask = CC_MASK | qcMask;
0940: reorderStartIndex = 0;
0941: prevCC = 0;
0942: norm32 = 0;
0943: c = 0;
0944: pStart = 0;
0945:
0946: cc = trailCC = -1;//initialize to bogus value
0947:
0948: for (;;) {
0949: /* count code units below the minimum or with irrelevant data for
0950: * the quick check
0951: */
0952: prevSrc = srcIndex;
0953:
0954: while (srcIndex != srcLimit
0955: && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
0956: prevCC = 0;
0957: ++srcIndex;
0958: }
0959:
0960: /* copy these code units all at once */
0961: if (srcIndex != prevSrc) {
0962: length = (int) (srcIndex - prevSrc);
0963: if ((destIndex + length) <= destLimit) {
0964: System.arraycopy(src, prevSrc, dest, destIndex,
0965: length);
0966: }
0967:
0968: destIndex += length;
0969: reorderStartIndex = destIndex;
0970: }
0971:
0972: /* end of source reached? */
0973: if (srcIndex == srcLimit) {
0974: break;
0975: }
0976:
0977: /* c already contains *src and norm32 is set for it, increment src*/
0978: ++srcIndex;
0979:
0980: /* check one above-minimum, relevant code unit */
0981: /*
0982: * generally, set p and length to the decomposition string
0983: * in simple cases, p==NULL and (c, c2) will hold the length code
0984: * units to append in all cases, set cc to the lead and trailCC to
0985: * the trail combining class
0986: *
0987: * the following merge-sort of the current character into the
0988: * preceding, canonically ordered result text will use the
0989: * optimized insertOrdered()
0990: * if there is only one single code point to process;
0991: * this is indicated with p==NULL, and (c, c2) is the character to
0992: * insert
0993: * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
0994: * for a supplementary character)
0995: * otherwise, p[length] is merged in with _mergeOrdered()
0996: */
0997: if (isNorm32HangulOrJamo(norm32)) {
0998: if (nx_contains(nx, c)) {
0999: c2 = 0;
1000: p = null;
1001: length = 1;
1002: } else {
1003: // Hangul syllable: decompose algorithmically
1004: p = buffer;
1005: pStart = 0;
1006: cc = trailCC = 0;
1007:
1008: c -= HANGUL_BASE;
1009:
1010: c2 = (char) (c % JAMO_T_COUNT);
1011: c /= JAMO_T_COUNT;
1012: if (c2 > 0) {
1013: buffer[2] = (char) (JAMO_T_BASE + c2);
1014: length = 3;
1015: } else {
1016: length = 2;
1017: }
1018:
1019: buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1020: buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1021: }
1022: } else {
1023: if (isNorm32Regular(norm32)) {
1024: c2 = 0;
1025: length = 1;
1026: } else {
1027: // c is a lead surrogate, get the real norm32
1028: if (srcIndex != srcLimit
1029: && UTF16
1030: .isTrailSurrogate(c2 = src[srcIndex])) {
1031: ++srcIndex;
1032: length = 2;
1033: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1034: } else {
1035: c2 = 0;
1036: length = 1;
1037: norm32 = 0;
1038: }
1039: }
1040:
1041: /* get the decomposition and the lead and trail cc's */
1042: if (nx_contains(nx, c, c2)) {
1043: /* excluded: norm32==0 */
1044: cc = trailCC = 0;
1045: p = null;
1046: } else if ((norm32 & qcMask) == 0) {
1047: /* c does not decompose */
1048: cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1049: p = null;
1050: pStart = -1;
1051: } else {
1052: DecomposeArgs arg = new DecomposeArgs();
1053: /* c decomposes, get everything from the variable-length
1054: * extra data
1055: */
1056: pStart = decompose(norm32, qcMask, arg);
1057: p = extraData;
1058: length = arg.length;
1059: cc = arg.cc;
1060: trailCC = arg.trailCC;
1061: if (length == 1) {
1062: /* fastpath a single code unit from decomposition */
1063: c = p[pStart];
1064: c2 = 0;
1065: p = null;
1066: pStart = -1;
1067: }
1068: }
1069: }
1070:
1071: /* append the decomposition to the destination buffer, assume
1072: * length>0
1073: */
1074: if ((destIndex + length) <= destLimit) {
1075: int reorderSplit = destIndex;
1076: if (p == null) {
1077: /* fastpath: single code point */
1078: if (cc != 0 && cc < prevCC) {
1079: /* (c, c2) is out of order with respect to the preceding
1080: * text
1081: */
1082: destIndex += length;
1083: trailCC = insertOrdered(dest,
1084: reorderStartIndex, reorderSplit,
1085: destIndex, c, c2, cc);
1086: } else {
1087: /* just append (c, c2) */
1088: dest[destIndex++] = c;
1089: if (c2 != 0) {
1090: dest[destIndex++] = c2;
1091: }
1092: }
1093: } else {
1094: /* general: multiple code points (ordered by themselves)
1095: * from decomposition
1096: */
1097: if (cc != 0 && cc < prevCC) {
1098: /* the decomposition is out of order with respect to the
1099: * preceding text
1100: */
1101: destIndex += length;
1102: trailCC = mergeOrdered(dest, reorderStartIndex,
1103: reorderSplit, p, pStart, pStart
1104: + length);
1105: } else {
1106: /* just append the decomposition */
1107: do {
1108: dest[destIndex++] = p[pStart++];
1109: } while (--length > 0);
1110: }
1111: }
1112: } else {
1113: /* buffer overflow */
1114: /* keep incrementing the destIndex for preflighting */
1115: destIndex += length;
1116: }
1117:
1118: prevCC = trailCC;
1119: if (prevCC == 0) {
1120: reorderStartIndex = destIndex;
1121: }
1122: }
1123:
1124: outTrailCC[0] = prevCC;
1125:
1126: return destIndex - destStart;
1127: }
1128:
1129: /* make NFC & NFKC ------------------------------------------------------ */
1130: private static final class NextCombiningArgs {
1131: char[] source;
1132: int start;
1133: //int limit;
1134: char c;
1135: char c2;
1136: int/*unsigned*/combiningIndex;
1137: char /*unsigned byte*/cc;
1138: }
1139:
1140: /* get the composition properties of the next character */
1141: private static int /*unsigned*/getNextCombining(
1142: NextCombiningArgs args, int limit, UnicodeSet nx) {
1143: long/*unsigned*/norm32;
1144: int combineFlags;
1145: /* get properties */
1146: args.c = args.source[args.start++];
1147: norm32 = getNorm32(args.c);
1148:
1149: /* preset output values for most characters */
1150: args.c2 = 0;
1151: args.combiningIndex = 0;
1152: args.cc = 0;
1153:
1154: if ((norm32 & (CC_MASK | COMBINES_ANY)) == 0) {
1155: return 0;
1156: } else {
1157: if (isNorm32Regular(norm32)) {
1158: /* set cc etc. below */
1159: } else if (isNorm32HangulOrJamo(norm32)) {
1160: /* a compatibility decomposition contained Jamos */
1161: args.combiningIndex = (int) ((UNSIGNED_INT_MASK) & (0xfff0 | (norm32 >> EXTRA_SHIFT)));
1162: return (int) (norm32 & COMBINES_ANY);
1163: } else {
1164: /* c is a lead surrogate, get the real norm32 */
1165: if (args.start != limit
1166: && UTF16
1167: .isTrailSurrogate(args.c2 = args.source[args.start])) {
1168: ++args.start;
1169: norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
1170: } else {
1171: args.c2 = 0;
1172: return 0;
1173: }
1174: }
1175:
1176: if (nx_contains(nx, args.c, args.c2)) {
1177: return 0; /* excluded: norm32==0 */
1178: }
1179:
1180: args.cc = (char) ((norm32 >> CC_SHIFT) & 0xff);
1181:
1182: combineFlags = (int) (norm32 & COMBINES_ANY);
1183: if (combineFlags != 0) {
1184: int index = getExtraDataIndex(norm32);
1185: args.combiningIndex = index > 0 ? extraData[(index - 1)]
1186: : 0;
1187: }
1188:
1189: return combineFlags;
1190: }
1191: }
1192:
1193: /*
1194: * given a composition-result starter (c, c2) - which means its cc==0,
1195: * it combines forward, it has extra data, its norm32!=0,
1196: * it is not a Hangul or Jamo,
1197: * get just its combineFwdIndex
1198: *
1199: * norm32(c) is special if and only if c2!=0
1200: */
1201: private static int/*unsigned*/getCombiningIndexFromStarter(char c,
1202: char c2) {
1203: long/*unsigned*/norm32;
1204:
1205: norm32 = getNorm32(c);
1206: if (c2 != 0) {
1207: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1208: }
1209: return extraData[(getExtraDataIndex(norm32) - 1)];
1210: }
1211:
1212: /*
1213: * Find the recomposition result for
1214: * a forward-combining character
1215: * (specified with a pointer to its part of the combiningTable[])
1216: * and a backward-combining character
1217: * (specified with its combineBackIndex).
1218: *
1219: * If these two characters combine, then set (value, value2)
1220: * with the code unit(s) of the composition character.
1221: *
1222: * Return value:
1223: * 0 do not combine
1224: * 1 combine
1225: * >1 combine, and the composition is a forward-combining starter
1226: *
1227: * See unormimp.h for a description of the composition table format.
1228: */
1229: private static int/*unsigned*/combine(char[] table,
1230: int tableStart, int/*unsinged*/combineBackIndex,
1231: int[] outValues) {
1232: int/*unsigned*/key;
1233: int value, value2;
1234:
1235: if (outValues.length < 2) {
1236: throw new IllegalArgumentException();
1237: }
1238:
1239: /* search in the starter's composition table */
1240: for (;;) {
1241: key = table[tableStart++];
1242: if (key >= combineBackIndex) {
1243: break;
1244: }
1245: tableStart += ((table[tableStart] & 0x8000) != 0) ? 2 : 1;
1246: }
1247:
1248: /* mask off bit 15, the last-entry-in-the-list flag */
1249: if ((key & 0x7fff) == combineBackIndex) {
1250: /* found! combine! */
1251: value = table[tableStart];
1252:
1253: /* is the composition a starter that combines forward? */
1254: key = (int) ((UNSIGNED_INT_MASK) & ((value & 0x2000) + 1));
1255:
1256: /* get the composition result code point from the variable-length
1257: * result value
1258: */
1259: if ((value & 0x8000) != 0) {
1260: if ((value & 0x4000) != 0) {
1261: /* surrogate pair composition result */
1262: value = (int) ((UNSIGNED_INT_MASK) & ((value & 0x3ff) | 0xd800));
1263: value2 = table[tableStart + 1];
1264: } else {
1265: /* BMP composition result U+2000..U+ffff */
1266: value = table[tableStart + 1];
1267: value2 = 0;
1268: }
1269: } else {
1270: /* BMP composition result U+0000..U+1fff */
1271: value &= 0x1fff;
1272: value2 = 0;
1273: }
1274: outValues[0] = value;
1275: outValues[1] = value2;
1276: return key;
1277: } else {
1278: /* not found */
1279: return 0;
1280: }
1281: }
1282:
1283: private static final class RecomposeArgs {
1284: char[] source;
1285: int start;
1286: int limit;
1287: }
1288:
1289: /*
1290: * recompose the characters in [p..limit[
1291: * (which is in NFD - decomposed and canonically ordered),
1292: * adjust limit, and return the trailing cc
1293: *
1294: * since for NFKC we may get Jamos in decompositions, we need to
1295: * recompose those too
1296: *
1297: * note that recomposition never lengthens the text:
1298: * any character consists of either one or two code units;
1299: * a composition may contain at most one more code unit than the original
1300: * starter, while the combining mark that is removed has at least one code
1301: * unit
1302: */
1303: private static char/*unsigned byte*/recompose(RecomposeArgs args,
1304: int options, UnicodeSet nx) {
1305: int remove, q, r;
1306: int /*unsigned*/combineFlags;
1307: int /*unsigned*/combineFwdIndex, combineBackIndex;
1308: int /*unsigned*/result, value = 0, value2 = 0;
1309: int /*unsigned byte*/prevCC;
1310: boolean starterIsSupplementary;
1311: int starter;
1312: int[] outValues = new int[2];
1313: starter = -1; /* no starter */
1314: combineFwdIndex = 0; /* will not be used until starter!=NULL */
1315: starterIsSupplementary = false; /* will not be used until starter!=NULL */
1316: prevCC = 0;
1317:
1318: NextCombiningArgs ncArg = new NextCombiningArgs();
1319: ncArg.source = args.source;
1320:
1321: ncArg.cc = 0;
1322: ncArg.c2 = 0;
1323:
1324: for (;;) {
1325: ncArg.start = args.start;
1326: combineFlags = getNextCombining(ncArg, args.limit, nx);
1327: combineBackIndex = ncArg.combiningIndex;
1328: args.start = ncArg.start;
1329:
1330: if (((combineFlags & COMBINES_BACK) != 0) && starter != -1) {
1331: if ((combineBackIndex & 0x8000) != 0) {
1332: /* c is a Jamo V/T, see if we can compose it with the
1333: * previous character
1334: */
1335: /* for the PRI #29 fix, check that there is no intervening combining mark */
1336: if ((options & BEFORE_PRI_29) != 0 || prevCC == 0) {
1337: remove = -1; /* NULL while no Hangul composition */
1338: combineFlags = 0;
1339: ncArg.c2 = args.source[starter];
1340: if (combineBackIndex == 0xfff2) {
1341: /* Jamo V, compose with previous Jamo L and following
1342: * Jamo T
1343: */
1344: ncArg.c2 = (char) (ncArg.c2 - JAMO_L_BASE);
1345: if (ncArg.c2 < JAMO_L_COUNT) {
1346: remove = args.start - 1;
1347: ncArg.c = (char) (HANGUL_BASE + (ncArg.c2
1348: * JAMO_V_COUNT + (ncArg.c - JAMO_V_BASE))
1349: * JAMO_T_COUNT);
1350: if (args.start != args.limit
1351: && (ncArg.c2 = (char) (args.source[args.start] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1352: ++args.start;
1353: ncArg.c += ncArg.c2;
1354: } else {
1355: /* the result is an LV syllable, which is a starter (unlike LVT) */
1356: combineFlags = COMBINES_FWD;
1357: }
1358: if (!nx_contains(nx, ncArg.c)) {
1359: args.source[starter] = ncArg.c;
1360: } else {
1361: /* excluded */
1362: if (!isHangulWithoutJamoT(ncArg.c)) {
1363: --args.start; /* undo the ++args.start from reading the Jamo T */
1364: }
1365: /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1366: remove = args.start;
1367: }
1368: }
1369:
1370: /*
1371: * Normally, the following can not occur:
1372: * Since the input is in NFD, there are no Hangul LV syllables that
1373: * a Jamo T could combine with.
1374: * All Jamo Ts are combined above when handling Jamo Vs.
1375: *
1376: * However, before the PRI #29 fix, this can occur due to
1377: * an intervening combining mark between the Hangul LV and the Jamo T.
1378: */
1379: } else {
1380: /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1381: if (isHangulWithoutJamoT(ncArg.c2)) {
1382: ncArg.c2 += ncArg.c - JAMO_T_BASE;
1383: if (!nx_contains(nx, ncArg.c2)) {
1384: remove = args.start - 1;
1385: args.source[starter] = ncArg.c2;
1386: }
1387: }
1388: }
1389:
1390: if (remove != -1) {
1391: /* remove the Jamo(s) */
1392: q = remove;
1393: r = args.start;
1394: while (r < args.limit) {
1395: args.source[q++] = args.source[r++];
1396: }
1397: args.start = remove;
1398: args.limit = q;
1399: }
1400:
1401: ncArg.c2 = 0; /* c2 held *starter temporarily */
1402:
1403: if (combineFlags != 0) {
1404: /*
1405: * not starter=NULL because the composition is a Hangul LV syllable
1406: * and might combine once more (but only before the PRI #29 fix)
1407: */
1408:
1409: /* done? */
1410: if (args.start == args.limit) {
1411: return (char) prevCC;
1412: }
1413:
1414: /* the composition is a Hangul LV syllable which is a starter that combines forward */
1415: combineFwdIndex = 0xfff0;
1416:
1417: /* we combined; continue with looking for compositions */
1418: continue;
1419: }
1420: }
1421:
1422: /*
1423: * now: cc==0 and the combining index does not include
1424: * "forward" -> the rest of the loop body will reset starter
1425: * to NULL; technically, a composed Hangul syllable is a
1426: * starter, but it does not combine forward now that we have
1427: * consumed all eligible Jamos; for Jamo V/T, combineFlags
1428: * does not contain _NORM_COMBINES_FWD
1429: */
1430:
1431: } else if (
1432: /* the starter is not a Hangul LV or Jamo V/T and */
1433: !((combineFwdIndex & 0x8000) != 0)
1434: &&
1435: /* the combining mark is not blocked and */
1436: ((options & BEFORE_PRI_29) != 0 ? (prevCC != ncArg.cc || prevCC == 0)
1437: : (prevCC < ncArg.cc || prevCC == 0))
1438: &&
1439: /* the starter and the combining mark (c, c2) do combine */
1440: 0 != (result = combine(combiningTable,
1441: combineFwdIndex, combineBackIndex,
1442: outValues)) &&
1443: /* the composition result is not excluded */
1444: !nx_contains(nx, (char) value, (char) value2)) {
1445: value = outValues[0];
1446: value2 = outValues[1];
1447: /* replace the starter with the composition, remove the
1448: * combining mark
1449: */
1450: remove = ncArg.c2 == 0 ? args.start - 1
1451: : args.start - 2; /* index to the combining mark */
1452:
1453: /* replace the starter with the composition */
1454: args.source[starter] = (char) value;
1455: if (starterIsSupplementary) {
1456: if (value2 != 0) {
1457: /* both are supplementary */
1458: args.source[starter + 1] = (char) value2;
1459: } else {
1460: /* the composition is shorter than the starter,
1461: * move the intermediate characters forward one */
1462: starterIsSupplementary = false;
1463: q = starter + 1;
1464: r = q + 1;
1465: while (r < remove) {
1466: args.source[q++] = args.source[r++];
1467: }
1468: --remove;
1469: }
1470: } else if (value2 != 0) {
1471: /* the composition is longer than the starter,
1472: * move the intermediate characters back one */
1473: starterIsSupplementary = true;
1474: /* temporarily increment for the loop boundary */
1475: ++starter;
1476: q = remove;
1477: r = ++remove;
1478: while (starter < q) {
1479: args.source[--r] = args.source[--q];
1480: }
1481: args.source[starter] = (char) value2;
1482: --starter; /* undo the temporary increment */
1483: /* } else { both are on the BMP, nothing more to do */
1484: }
1485:
1486: /* remove the combining mark by moving the following text
1487: * over it */
1488: if (remove < args.start) {
1489: q = remove;
1490: r = args.start;
1491: while (r < args.limit) {
1492: args.source[q++] = args.source[r++];
1493: }
1494: args.start = remove;
1495: args.limit = q;
1496: }
1497:
1498: /* keep prevCC because we removed the combining mark */
1499:
1500: /* done? */
1501: if (args.start == args.limit) {
1502: return (char) prevCC;
1503: }
1504:
1505: /* is the composition a starter that combines forward? */
1506: if (result > 1) {
1507: combineFwdIndex = getCombiningIndexFromStarter(
1508: (char) value, (char) value2);
1509: } else {
1510: starter = -1;
1511: }
1512:
1513: /* we combined; continue with looking for compositions */
1514: continue;
1515: }
1516: }
1517:
1518: /* no combination this time */
1519: prevCC = ncArg.cc;
1520: if (args.start == args.limit) {
1521: return (char) prevCC;
1522: }
1523:
1524: /* if (c, c2) did not combine, then check if it is a starter */
1525: if (ncArg.cc == 0) {
1526: /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1527: if ((combineFlags & COMBINES_FWD) != 0) {
1528: /* it may combine with something, prepare for it */
1529: if (ncArg.c2 == 0) {
1530: starterIsSupplementary = false;
1531: starter = args.start - 1;
1532: } else {
1533: starterIsSupplementary = false;
1534: starter = args.start - 2;
1535: }
1536: combineFwdIndex = combineBackIndex;
1537: } else {
1538: /* it will not combine with anything */
1539: starter = -1;
1540: }
1541: } else if ((options & OPTIONS_COMPOSE_CONTIGUOUS) != 0) {
1542: /* FCC: no discontiguous compositions; any intervening character blocks */
1543: starter = -1;
1544: }
1545: }
1546: }
1547:
1548: // find the last true starter between src[start]....src[current] going
1549: // backwards and return its index
1550: private static int findPreviousStarter(char[] src, int srcStart,
1551: int current, int/*unsigned*/ccOrQCMask,
1552: int/*unsigned*/decompQCMask, char minNoMaybe) {
1553: long norm32;
1554: PrevArgs args = new PrevArgs();
1555: args.src = src;
1556: args.start = srcStart;
1557: args.current = current;
1558:
1559: while (args.start < args.current) {
1560: norm32 = getPrevNorm32(args, minNoMaybe, ccOrQCMask
1561: | decompQCMask);
1562: if (isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1563: break;
1564: }
1565: }
1566: return args.current;
1567: }
1568:
1569: /* find the first true starter in [src..limit[ and return the
1570: * pointer to it
1571: */
1572: private static int/*index*/findNextStarter(char[] src, int start,
1573: int limit, int/*unsigned*/qcMask,
1574: int/*unsigned*/decompQCMask, char minNoMaybe) {
1575: int p;
1576: long/*unsigned*/norm32;
1577: int ccOrQCMask;
1578: char c, c2;
1579:
1580: ccOrQCMask = CC_MASK | qcMask;
1581:
1582: DecomposeArgs decompArgs = new DecomposeArgs();
1583:
1584: for (;;) {
1585: if (start == limit) {
1586: break; /* end of string */
1587: }
1588: c = src[start];
1589: if (c < minNoMaybe) {
1590: break; /* catches NUL terminater, too */
1591: }
1592:
1593: norm32 = getNorm32(c);
1594: if ((norm32 & ccOrQCMask) == 0) {
1595: break; /* true starter */
1596: }
1597:
1598: if (isNorm32LeadSurrogate(norm32)) {
1599: /* c is a lead surrogate, get the real norm32 */
1600: if ((start + 1) == limit
1601: || !UTF16
1602: .isTrailSurrogate(c2 = (src[start + 1]))) {
1603: /* unmatched first surrogate: counts as a true starter */
1604: break;
1605: }
1606: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1607:
1608: if ((norm32 & ccOrQCMask) == 0) {
1609: break; /* true starter */
1610: }
1611: } else {
1612: c2 = 0;
1613: }
1614:
1615: /* (c, c2) is not a true starter but its decomposition may be */
1616: if ((norm32 & decompQCMask) != 0) {
1617: /* (c, c2) decomposes, get everything from the variable-length
1618: * extra data */
1619: p = decompose(norm32, decompQCMask, decompArgs);
1620:
1621: /* get the first character's norm32 to check if it is a true
1622: * starter */
1623: if (decompArgs.cc == 0
1624: && (getNorm32(extraData, p, qcMask) & qcMask) == 0) {
1625: break; /* true starter */
1626: }
1627: }
1628:
1629: start += c2 == 0 ? 1 : 2; /* not a true starter, continue */
1630: }
1631:
1632: return start;
1633: }
1634:
1635: private static final class ComposePartArgs {
1636: int prevCC;
1637: int length; /* length of decomposed part */
1638: }
1639:
1640: /* decompose and recompose [prevStarter..src[ */
1641: private static char[] composePart(ComposePartArgs args,
1642: int prevStarter, char[] src, int start, int limit,
1643: int options, UnicodeSet nx) {
1644: int recomposeLimit;
1645: boolean compat = ((options & OPTIONS_COMPAT) != 0);
1646:
1647: /* decompose [prevStarter..src[ */
1648: int[] outTrailCC = new int[1];
1649: char[] buffer = new char[(limit - prevStarter)
1650: * MAX_BUFFER_SIZE];
1651:
1652: for (;;) {
1653: args.length = decompose(src, prevStarter, (start), buffer,
1654: 0, buffer.length, compat, outTrailCC, nx);
1655: if (args.length <= buffer.length) {
1656: break;
1657: } else {
1658: buffer = new char[args.length];
1659: }
1660: }
1661:
1662: /* recompose the decomposition */
1663: recomposeLimit = args.length;
1664:
1665: if (args.length >= 2) {
1666: RecomposeArgs rcArgs = new RecomposeArgs();
1667: rcArgs.source = buffer;
1668: rcArgs.start = 0;
1669: rcArgs.limit = recomposeLimit;
1670: args.prevCC = recompose(rcArgs, options, nx);
1671: recomposeLimit = rcArgs.limit;
1672: }
1673:
1674: /* return with a pointer to the recomposition and its length */
1675: args.length = recomposeLimit;
1676: return buffer;
1677: }
1678:
1679: private static boolean composeHangul(char prev, char c,
1680: long/*unsigned*/norm32, char[] src, int[] srcIndex,
1681: int limit, boolean compat, char[] dest, int destIndex,
1682: UnicodeSet nx) {
1683: int start = srcIndex[0];
1684: if (isJamoVTNorm32JamoV(norm32)) {
1685: /* c is a Jamo V, compose with previous Jamo L and
1686: * following Jamo T */
1687: prev = (char) (prev - JAMO_L_BASE);
1688: if (prev < JAMO_L_COUNT) {
1689: c = (char) (HANGUL_BASE + (prev * JAMO_V_COUNT + (c - JAMO_V_BASE))
1690: * JAMO_T_COUNT);
1691:
1692: /* check if the next character is a Jamo T (normal or
1693: * compatibility) */
1694: if (start != limit) {
1695: char next, t;
1696:
1697: next = src[start];
1698: if ((t = (char) (next - JAMO_T_BASE)) < JAMO_T_COUNT) {
1699: /* normal Jamo T */
1700: ++start;
1701: c += t;
1702: } else if (compat) {
1703: /* if NFKC, then check for compatibility Jamo T
1704: * (BMP only) */
1705: norm32 = getNorm32(next);
1706: if (isNorm32Regular(norm32)
1707: && ((norm32 & QC_NFKD) != 0)) {
1708: int p /*index into extra data array*/;
1709: DecomposeArgs dcArgs = new DecomposeArgs();
1710: p = decompose(norm32, QC_NFKD, dcArgs);
1711: if (dcArgs.length == 1
1712: && (t = (char) (extraData[p] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1713: /* compatibility Jamo T */
1714: ++start;
1715: c += t;
1716: }
1717: }
1718: }
1719: }
1720: if (nx_contains(nx, c)) {
1721: if (!isHangulWithoutJamoT(c)) {
1722: --start; /* undo ++start from reading the Jamo T */
1723: }
1724: return false;
1725: }
1726: dest[destIndex] = c;
1727: srcIndex[0] = start;
1728: return true;
1729: }
1730: } else if (isHangulWithoutJamoT(prev)) {
1731: /* c is a Jamo T, compose with previous Hangul LV that does not
1732: * contain a Jamo T */
1733: c = (char) (prev + (c - JAMO_T_BASE));
1734: if (nx_contains(nx, c)) {
1735: return false;
1736: }
1737: dest[destIndex] = c;
1738: srcIndex[0] = start;
1739: return true;
1740: }
1741: return false;
1742: }
1743:
1744: /*
1745: public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1746: return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1747: }
1748: */
1749:
1750: public static int compose(char[] src, int srcStart, int srcLimit,
1751: char[] dest, int destStart, int destLimit, int options,
1752: UnicodeSet nx) {
1753:
1754: int prevSrc, prevStarter;
1755: long/*unsigned*/norm32;
1756: int ccOrQCMask, qcMask;
1757: int reorderStartIndex, length;
1758: char c, c2, minNoMaybe;
1759: int/*unsigned byte*/cc, prevCC;
1760: int[] ioIndex = new int[1];
1761: int destIndex = destStart;
1762: int srcIndex = srcStart;
1763:
1764: if ((options & OPTIONS_COMPAT) != 0) {
1765: minNoMaybe = (char) indexes[INDEX_MIN_NFKC_NO_MAYBE];
1766: qcMask = QC_NFKC;
1767: } else {
1768: minNoMaybe = (char) indexes[INDEX_MIN_NFC_NO_MAYBE];
1769: qcMask = QC_NFC;
1770: }
1771:
1772: /*
1773: * prevStarter points to the last character before the current one
1774: * that is a "true" starter with cc==0 and quick check "yes".
1775: *
1776: * prevStarter will be used instead of looking for a true starter
1777: * while incrementally decomposing [prevStarter..prevSrc[
1778: * in _composePart(). Having a good prevStarter allows to just decompose
1779: * the entire [prevStarter..prevSrc[.
1780: *
1781: * When _composePart() backs out from prevSrc back to prevStarter,
1782: * then it also backs out destIndex by the same amount.
1783: * Therefore, at all times, the (prevSrc-prevStarter) source units
1784: * must correspond 1:1 to destination units counted with destIndex,
1785: * except for reordering.
1786: * This is true for the qc "yes" characters copied in the fast loop,
1787: * and for pure reordering.
1788: * prevStarter must be set forward to src when this is not true:
1789: * In _composePart() and after composing a Hangul syllable.
1790: *
1791: * This mechanism relies on the assumption that the decomposition of a
1792: * true starter also begins with a true starter. gennorm/store.c checks
1793: * for this.
1794: */
1795: prevStarter = srcIndex;
1796:
1797: ccOrQCMask = CC_MASK | qcMask;
1798: /*destIndex=*/reorderStartIndex = 0;/* ####TODO#### check this **/
1799: prevCC = 0;
1800:
1801: /* avoid compiler warnings */
1802: norm32 = 0;
1803: c = 0;
1804:
1805: for (;;) {
1806: /* count code units below the minimum or with irrelevant data for
1807: * the quick check */
1808: prevSrc = srcIndex;
1809:
1810: while (srcIndex != srcLimit
1811: && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
1812: prevCC = 0;
1813: ++srcIndex;
1814: }
1815:
1816: /* copy these code units all at once */
1817: if (srcIndex != prevSrc) {
1818: length = (int) (srcIndex - prevSrc);
1819: if ((destIndex + length) <= destLimit) {
1820: System.arraycopy(src, prevSrc, dest, destIndex,
1821: length);
1822: }
1823: destIndex += length;
1824: reorderStartIndex = destIndex;
1825:
1826: /* set prevStarter to the last character in the quick check
1827: * loop */
1828: prevStarter = srcIndex - 1;
1829: if (UTF16.isTrailSurrogate(src[prevStarter])
1830: && prevSrc < prevStarter
1831: && UTF16
1832: .isLeadSurrogate(src[(prevStarter - 1)])) {
1833: --prevStarter;
1834: }
1835:
1836: prevSrc = srcIndex;
1837: }
1838:
1839: /* end of source reached? */
1840: if (srcIndex == srcLimit) {
1841: break;
1842: }
1843:
1844: /* c already contains *src and norm32 is set for it, increment src*/
1845: ++srcIndex;
1846:
1847: /*
1848: * source buffer pointers:
1849: *
1850: * all done quick check current char not yet
1851: * "yes" but (c, c2) processed
1852: * may combine
1853: * forward
1854: * [-------------[-------------[-------------[-------------[
1855: * | | | | |
1856: * start prevStarter prevSrc src limit
1857: *
1858: *
1859: * destination buffer pointers and indexes:
1860: *
1861: * all done might take not filled yet
1862: * characters for
1863: * reordering
1864: * [-------------[-------------[-------------[
1865: * | | | |
1866: * dest reorderStartIndex destIndex destCapacity
1867: */
1868:
1869: /* check one above-minimum, relevant code unit */
1870: /*
1871: * norm32 is for c=*(src-1), and the quick check flag is "no" or
1872: * "maybe", and/or cc!=0
1873: * check for Jamo V/T, then for surrogates and regular characters
1874: * c is not a Hangul syllable or Jamo L because
1875: * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
1876: */
1877: if (isNorm32HangulOrJamo(norm32)) {
1878: /*
1879: * c is a Jamo V/T:
1880: * try to compose with the previous character, Jamo V also with
1881: * a following Jamo T, and set values here right now in case we
1882: * just continue with the main loop
1883: */
1884: prevCC = cc = 0;
1885: reorderStartIndex = destIndex;
1886: ioIndex[0] = srcIndex;
1887: if (destIndex > 0
1888: && composeHangul(src[(prevSrc - 1)], c, norm32,
1889: src, ioIndex, srcLimit,
1890: (options & OPTIONS_COMPAT) != 0, dest,
1891: destIndex <= destLimit ? destIndex - 1
1892: : 0, nx)) {
1893: srcIndex = ioIndex[0];
1894: prevStarter = srcIndex;
1895: continue;
1896: }
1897:
1898: srcIndex = ioIndex[0];
1899:
1900: /* the Jamo V/T did not compose into a Hangul syllable, just
1901: * append to dest */
1902: c2 = 0;
1903: length = 1;
1904: prevStarter = prevSrc;
1905: } else {
1906: if (isNorm32Regular(norm32)) {
1907: c2 = 0;
1908: length = 1;
1909: } else {
1910: /* c is a lead surrogate, get the real norm32 */
1911: if (srcIndex != srcLimit
1912: && UTF16
1913: .isTrailSurrogate(c2 = src[srcIndex])) {
1914: ++srcIndex;
1915: length = 2;
1916: norm32 = getNorm32FromSurrogatePair(norm32, c2);
1917: } else {
1918: /* c is an unpaired lead surrogate, nothing to do */
1919: c2 = 0;
1920: length = 1;
1921: norm32 = 0;
1922: }
1923: }
1924: ComposePartArgs args = new ComposePartArgs();
1925:
1926: /* we are looking at the character (c, c2) at [prevSrc..src[ */
1927: if (nx_contains(nx, c, c2)) {
1928: /* excluded: norm32==0 */
1929: cc = 0;
1930: } else if ((norm32 & qcMask) == 0) {
1931: cc = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1932: } else {
1933: char[] p;
1934:
1935: /*
1936: * find appropriate boundaries around this character,
1937: * decompose the source text from between the boundaries,
1938: * and recompose it
1939: *
1940: * this puts the intermediate text into the side buffer because
1941: * it might be longer than the recomposition end result,
1942: * or the destination buffer may be too short or missing
1943: *
1944: * note that destIndex may be adjusted backwards to account
1945: * for source text that passed the quick check but needed to
1946: * take part in the recomposition
1947: */
1948: int decompQCMask = (qcMask << 2) & 0xf; /* decomposition quick check mask */
1949: /*
1950: * find the last true starter in [prevStarter..src[
1951: * it is either the decomposition of the current character (at prevSrc),
1952: * or prevStarter
1953: */
1954: if (isTrueStarter(norm32, CC_MASK | qcMask,
1955: decompQCMask)) {
1956: prevStarter = prevSrc;
1957: } else {
1958: /* adjust destIndex: back out what had been copied with qc "yes" */
1959: destIndex -= prevSrc - prevStarter;
1960: }
1961:
1962: /* find the next true starter in [src..limit[ */
1963: srcIndex = findNextStarter(src, srcIndex, srcLimit,
1964: qcMask, decompQCMask, minNoMaybe);
1965: //args.prevStarter = prevStarter;
1966: args.prevCC = prevCC;
1967: //args.destIndex = destIndex;
1968: args.length = length;
1969: p = composePart(args, prevStarter, src, srcIndex,
1970: srcLimit, options, nx);
1971:
1972: if (p == null) {
1973: /* an error occurred (out of memory) */
1974: break;
1975: }
1976:
1977: prevCC = args.prevCC;
1978: length = args.length;
1979:
1980: /* append the recomposed buffer contents to the destination
1981: * buffer */
1982: if ((destIndex + args.length) <= destLimit) {
1983: int i = 0;
1984: while (i < args.length) {
1985: dest[destIndex++] = p[i++];
1986: --length;
1987: }
1988: } else {
1989: /* buffer overflow */
1990: /* keep incrementing the destIndex for preflighting */
1991: destIndex += length;
1992: }
1993:
1994: prevStarter = srcIndex;
1995: continue;
1996: }
1997: }
1998:
1999: /* append the single code point (c, c2) to the destination buffer */
2000: if ((destIndex + length) <= destLimit) {
2001: if (cc != 0 && cc < prevCC) {
2002: /* (c, c2) is out of order with respect to the preceding
2003: * text */
2004: int reorderSplit = destIndex;
2005: destIndex += length;
2006: prevCC = insertOrdered(dest, reorderStartIndex,
2007: reorderSplit, destIndex, c, c2, cc);
2008: } else {
2009: /* just append (c, c2) */
2010: dest[destIndex++] = c;
2011: if (c2 != 0) {
2012: dest[destIndex++] = c2;
2013: }
2014: prevCC = cc;
2015: }
2016: } else {
2017: /* buffer overflow */
2018: /* keep incrementing the destIndex for preflighting */
2019: destIndex += length;
2020: prevCC = cc;
2021: }
2022: }
2023:
2024: return destIndex - destStart;
2025: }
2026:
2027: public static int getCombiningClass(int c) {
2028: long norm32;
2029: norm32 = getNorm32(c);
2030: return (char) ((norm32 >> CC_SHIFT) & 0xFF);
2031: }
2032:
2033: public static boolean isFullCompositionExclusion(int c) {
2034: if (isFormatVersion_2_1) {
2035: int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2036: return (boolean) ((aux & AUX_COMP_EX_MASK) != 0);
2037: } else {
2038: return false;
2039: }
2040: }
2041:
2042: public static boolean isCanonSafeStart(int c) {
2043: if (isFormatVersion_2_1) {
2044: int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2045: return (boolean) ((aux & AUX_UNSAFE_MASK) == 0);
2046: } else {
2047: return false;
2048: }
2049: }
2050:
2051: /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2052: public static boolean isNFSkippable(int c,
2053: NormalizerBase.Mode mode, long mask) {
2054: long /*unsigned int*/norm32;
2055: mask = mask & UNSIGNED_INT_MASK;
2056: char aux;
2057:
2058: /* check conditions (a)..(e), see unormimp.h */
2059: norm32 = getNorm32(c);
2060:
2061: if ((norm32 & mask) != 0) {
2062: return false; /* fails (a)..(e), not skippable */
2063: }
2064:
2065: if (mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD
2066: || mode == NormalizerBase.NONE) {
2067: return true; /* NF*D, passed (a)..(c), is skippable */
2068: }
2069: /* check conditions (a)..(e), see unormimp.h */
2070:
2071: /* NF*C/FCC, passed (a)..(e) */
2072: if ((norm32 & QC_NFD) == 0) {
2073: return true; /* no canonical decomposition, is skippable */
2074: }
2075:
2076: /* check Hangul syllables algorithmically */
2077: if (isNorm32HangulOrJamo(norm32)) {
2078: /* Jamo passed (a)..(e) above, must be Hangul */
2079: return !isHangulWithoutJamoT((char) c); /* LVT are skippable, LV are not */
2080: }
2081:
2082: /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2083: /* NF*C, test (f) flag */
2084: if (!isFormatVersion_2_2) {
2085: return false; /* no (f) data, say not skippable to be safe */
2086: }
2087:
2088: aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2089: return (aux & AUX_NFC_SKIP_F_MASK) == 0; /* TRUE=skippable if the (f) flag is not set */
2090:
2091: /* } else { FCC, test fcd<=1 instead of the above } */
2092: }
2093:
2094: public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2095: int c;
2096:
2097: /* add the start code point of each same-value range of each trie */
2098: //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2099: TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2100: RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2101:
2102: while (normIter.next(normResult)) {
2103: set.add(normResult.start);
2104: }
2105:
2106: //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2107: TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2108: RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2109:
2110: while (fcdIter.next(fcdResult)) {
2111: set.add(fcdResult.start);
2112: }
2113:
2114: if (isFormatVersion_2_1) {
2115: //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2116: TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2117: RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2118: while (auxIter.next(auxResult)) {
2119: set.add(auxResult.start);
2120: }
2121: }
2122: /* add Hangul LV syllables and LV+1 because of skippables */
2123: for (c = HANGUL_BASE; c < HANGUL_BASE + HANGUL_COUNT; c += JAMO_T_COUNT) {
2124: set.add(c);
2125: set.add(c + 1);
2126: }
2127: set.add(HANGUL_BASE + HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2128: return set; // for chaining
2129: }
2130:
2131: /**
2132: * Internal API, used in UCharacter.getIntPropertyValue().
2133: * @internal
2134: * @param c code point
2135: * @param modeValue numeric value compatible with Mode
2136: * @return numeric value compatible with QuickCheck
2137: */
2138: public static final int quickCheck(int c, int modeValue) {
2139: final int qcMask[/*UNORM_MODE_COUNT*/] = { 0, 0, QC_NFD,
2140: QC_NFKD, QC_NFC, QC_NFKC };
2141:
2142: int norm32 = (int) getNorm32(c) & qcMask[modeValue];
2143:
2144: if (norm32 == 0) {
2145: return 1; // YES
2146: } else if ((norm32 & QC_ANY_NO) != 0) {
2147: return 0; // NO
2148: } else /* _NORM_QC_ANY_MAYBE */{
2149: return 2; // MAYBE;
2150: }
2151: }
2152:
2153: private static int strCompare(char[] s1, int s1Start, int s1Limit,
2154: char[] s2, int s2Start, int s2Limit, boolean codePointOrder) {
2155:
2156: int start1, start2, limit1, limit2;
2157:
2158: char c1, c2;
2159:
2160: /* setup for fix-up */
2161: start1 = s1Start;
2162: start2 = s2Start;
2163:
2164: int length1, length2;
2165:
2166: length1 = s1Limit - s1Start;
2167: length2 = s2Limit - s2Start;
2168:
2169: int lengthResult;
2170:
2171: if (length1 < length2) {
2172: lengthResult = -1;
2173: limit1 = start1 + length1;
2174: } else if (length1 == length2) {
2175: lengthResult = 0;
2176: limit1 = start1 + length1;
2177: } else /* length1>length2 */{
2178: lengthResult = 1;
2179: limit1 = start1 + length2;
2180: }
2181:
2182: if (s1 == s2) {
2183: return lengthResult;
2184: }
2185:
2186: for (;;) {
2187: /* check pseudo-limit */
2188: if (s1Start == limit1) {
2189: return lengthResult;
2190: }
2191:
2192: c1 = s1[s1Start];
2193: c2 = s2[s2Start];
2194: if (c1 != c2) {
2195: break;
2196: }
2197: ++s1Start;
2198: ++s2Start;
2199: }
2200:
2201: /* setup for fix-up */
2202: limit1 = start1 + length1;
2203: limit2 = start2 + length2;
2204:
2205: /* if both values are in or above the surrogate range, fix them up */
2206: if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
2207: /* subtract 0x2800 from BMP code points to make them smaller than
2208: * supplementary ones */
2209: if ((c1 <= 0xdbff && (s1Start + 1) != limit1 && UTF16
2210: .isTrailSurrogate(s1[(s1Start + 1)]))
2211: || (UTF16.isTrailSurrogate(c1) && start1 != s1Start && UTF16
2212: .isLeadSurrogate(s1[(s1Start - 1)]))) {
2213: /* part of a surrogate pair, leave >=d800 */
2214: } else {
2215: /* BMP code point - may be surrogate code point - make <d800 */
2216: c1 -= 0x2800;
2217: }
2218:
2219: if ((c2 <= 0xdbff && (s2Start + 1) != limit2 && UTF16
2220: .isTrailSurrogate(s2[(s2Start + 1)]))
2221: || (UTF16.isTrailSurrogate(c2) && start2 != s2Start && UTF16
2222: .isLeadSurrogate(s2[(s2Start - 1)]))) {
2223: /* part of a surrogate pair, leave >=d800 */
2224: } else {
2225: /* BMP code point - may be surrogate code point - make <d800 */
2226: c2 -= 0x2800;
2227: }
2228: }
2229:
2230: /* now c1 and c2 are in UTF-32-compatible order */
2231: return (int) c1 - (int) c2;
2232: }
2233:
2234: /*
2235: * Status of tailored normalization
2236: *
2237: * This was done initially for investigation on Unicode public review issue 7
2238: * (http://www.unicode.org/review/). See Jitterbug 2481.
2239: * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
2240: * a permanent feature in ICU 2.6 in support of IDNA which requires true
2241: * Unicode 3.2 normalization.
2242: * (NormalizationCorrections are rolled into IDNA mapping tables.)
2243: *
* Tailored normalization as implemented here makes it possible to
* "normalize less" than full Unicode normalization would.
2246: * Based internally on a UnicodeSet of code points that are
2247: * "excluded from normalization", the normalization functions leave those
2248: * code points alone ("inert"). This means that tailored normalization
2249: * still transforms text into a canonically equivalent form.
2250: * It does not add decompositions to code points that do not have any or
2251: * change decomposition results.
2252: *
2253: * Any function that searches for a safe boundary has not been touched,
2254: * which means that these functions will be over-pessimistic when
2255: * exclusions are applied.
2256: * This should not matter because subsequent checks and normalizations
2257: * do apply the exclusions; only a little more of the text may be processed
2258: * than necessary under exclusions.
2259: *
2260: * Normalization exclusions have the following effect on excluded code points c:
2261: * - c is not decomposed
2262: * - c is not a composition target
2263: * - c does not combine forward or backward for composition
2264: * except that this is not implemented for Jamo
2265: * - c is treated as having a combining class of 0
2266: */
2267:
/*
 * Constants for the bit fields in the options bit set parameter.
 * These need not be public.
 * A user only needs to know the currently assigned values.
 * The number and positions of reserved bits per field can remain private.
 */
/* bits 0..4: the NX_* exclusion flags; internalGetNX() tests these bits
 * to see whether any of them is requested */
private static final int OPTIONS_NX_MASK = 0x1f;
/* bits 5..7: the Unicode version selector; internalGetNXUnicode() keeps
 * only these bits of its argument */
private static final int OPTIONS_UNICODE_MASK = 0xe0;
/* bits 0..7: both fields together; getNX() and internalGetNX() reduce
 * their argument to these bits */
public static final int OPTIONS_SETS_MASK = 0xff;
/* position of the lowest bit of OPTIONS_UNICODE_MASK (0xe0 == 0x7 << 5) */
private static final int OPTIONS_UNICODE_SHIFT = 5;
/* one slot per value of (options & OPTIONS_SETS_MASK); the slots are
 * filled on first use by the synchronized internalGetNX*() methods */
private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK + 1];
2279:
/* Constants for options flags for normalization. */

/**
 * Options bit 0, do not decompose Hangul syllables.
 * The matching exclusion set is built by internalGetNXHangul() as the
 * range 0xac00..0xd7a3.
 * @draft ICU 2.6
 */
private static final int NX_HANGUL = 1;
/**
 * Options bit 1, do not decompose CJK compatibility characters.
 * The matching exclusion set is built by internalGetNXCJKCompat() from
 * the [:Ideographic:] code points that have the QC_NFD bit set.
 * @draft ICU 2.6
 */
private static final int NX_CJK_COMPAT = 2;
/**
 * Options bit 8, use buggy recomposition described in
 * Unicode Public Review Issue #29
 * at http://www.unicode.org/review/resolved-pri.html#pri29
 *
 * Used in IDNA implementation according to strict interpretation
 * of IDNA definition based on Unicode 3.2 which predates PRI #29.
 *
 * See ICU4C unormimp.h
 *
 * @draft ICU 3.2
 */
public static final int BEFORE_PRI_29 = 0x100;

/*
 * The following options are used only in some composition functions.
 * They use bits 12 and up to preserve lower bits for the available options
 * space in unorm_compare() -
 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
 */

/** Options bit 12 (0x1000), for compatibility vs. canonical decomposition. */
public static final int OPTIONS_COMPAT = 0x1000;
/** Options bit 13 (0x2000), no discontiguous composition (FCC vs. NFC). */
public static final int OPTIONS_COMPOSE_CONTIGUOUS = 0x2000;
2317:
2318: /* normalization exclusion sets --------------------------------------------- */
2319:
2320: /*
2321: * Normalization exclusion UnicodeSets are used for tailored normalization;
2322: * see the comment near the beginning of this file.
2323: *
2324: * By specifying one or several sets of code points,
2325: * those code points become inert for normalization.
2326: */
2327: private static final synchronized UnicodeSet internalGetNXHangul() {
2328: /* internal function, does not check for incoming U_FAILURE */
2329:
2330: if (nxCache[NX_HANGUL] == null) {
2331: nxCache[NX_HANGUL] = new UnicodeSet(0xac00, 0xd7a3);
2332: }
2333: return nxCache[NX_HANGUL];
2334: }
2335:
2336: private static final synchronized UnicodeSet internalGetNXCJKCompat() {
2337: /* internal function, does not check for incoming U_FAILURE */
2338:
2339: if (nxCache[NX_CJK_COMPAT] == null) {
2340:
2341: /* build a set from [CJK Ideographs]&[has canonical decomposition] */
2342: UnicodeSet set, hasDecomp;
2343:
2344: set = new UnicodeSet("[:Ideographic:]");
2345:
2346: /* start with an empty set for [has canonical decomposition] */
2347: hasDecomp = new UnicodeSet();
2348:
2349: /* iterate over all ideographs and remember which canonically decompose */
2350: UnicodeSetIterator it = new UnicodeSetIterator(set);
2351: int start, end;
2352: long norm32;
2353:
2354: while (it.nextRange()
2355: && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
2356: start = it.codepoint;
2357: end = it.codepointEnd;
2358: while (start <= end) {
2359: norm32 = getNorm32(start);
2360: if ((norm32 & QC_NFD) > 0) {
2361: hasDecomp.add(start);
2362: }
2363: ++start;
2364: }
2365: }
2366:
2367: /* hasDecomp now contains all ideographs that decompose canonically */
2368: nxCache[NX_CJK_COMPAT] = hasDecomp;
2369:
2370: }
2371:
2372: return nxCache[NX_CJK_COMPAT];
2373: }
2374:
2375: private static final synchronized UnicodeSet internalGetNXUnicode(
2376: int options) {
2377: options &= OPTIONS_UNICODE_MASK;
2378: if (options == 0) {
2379: return null;
2380: }
2381:
2382: if (nxCache[options] == null) {
2383: /* build a set with all code points that were not designated by the specified Unicode version */
2384: UnicodeSet set = new UnicodeSet();
2385:
2386: switch (options) {
2387: case NormalizerBase.UNICODE_3_2:
2388: set.applyPattern("[:^Age=3.2:]");
2389: break;
2390: default:
2391: return null;
2392: }
2393:
2394: nxCache[options] = set;
2395: }
2396:
2397: return nxCache[options];
2398: }
2399:
2400: /* Get a decomposition exclusion set. The data must be loaded. */
2401: private static final synchronized UnicodeSet internalGetNX(
2402: int options) {
2403: options &= OPTIONS_SETS_MASK;
2404:
2405: if (nxCache[options] == null) {
2406: /* return basic sets */
2407: if (options == NX_HANGUL) {
2408: return internalGetNXHangul();
2409: }
2410: if (options == NX_CJK_COMPAT) {
2411: return internalGetNXCJKCompat();
2412: }
2413: if ((options & OPTIONS_UNICODE_MASK) != 0
2414: && (options & OPTIONS_NX_MASK) == 0) {
2415: return internalGetNXUnicode(options);
2416: }
2417:
2418: /* build a set from multiple subsets */
2419: UnicodeSet set;
2420: UnicodeSet other;
2421:
2422: set = new UnicodeSet();
2423:
2424: if ((options & NX_HANGUL) != 0
2425: && null != (other = internalGetNXHangul())) {
2426: set.addAll(other);
2427: }
2428: if ((options & NX_CJK_COMPAT) != 0
2429: && null != (other = internalGetNXCJKCompat())) {
2430: set.addAll(other);
2431: }
2432: if ((options & OPTIONS_UNICODE_MASK) != 0
2433: && null != (other = internalGetNXUnicode(options))) {
2434: set.addAll(other);
2435: }
2436:
2437: nxCache[options] = set;
2438: }
2439: return nxCache[options];
2440: }
2441:
2442: public static final UnicodeSet getNX(int options) {
2443: if ((options &= OPTIONS_SETS_MASK) == 0) {
2444: /* incoming failure, or no decomposition exclusions requested */
2445: return null;
2446: } else {
2447: return internalGetNX(options);
2448: }
2449: }
2450:
2451: private static final boolean nx_contains(UnicodeSet nx, int c) {
2452: return nx != null && nx.contains(c);
2453: }
2454:
2455: private static final boolean nx_contains(UnicodeSet nx, char c,
2456: char c2) {
2457: return nx != null
2458: && nx.contains(c2 == 0 ? c : UCharacterProperty
2459: .getRawSupplementary(c, c2));
2460: }
2461:
2462: /*****************************************************************************/
2463:
2464: /**
2465: * Get the canonical decomposition
2466: * sherman for ComposedCharIter
2467: */
2468:
2469: public static int getDecompose(int chars[], String decomps[]) {
2470: DecomposeArgs args = new DecomposeArgs();
2471: int length = 0;
2472: long norm32 = 0;
2473: int ch = -1;
2474: int index = 0;
2475: int i = 0;
2476:
2477: while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
2478: //TBD !!!! the hack code heres save us about 50ms for startup
2479: //need a better solution/lookup
2480: if (ch == 0x30ff)
2481: ch = 0xf900;
2482: else if (ch == 0x10000)
2483: ch = 0x1d15e;
2484: else if (ch == 0x1d1c1)
2485: ch = 0x2f800;
2486:
2487: norm32 = NormalizerImpl.getNorm32(ch);
2488: if ((norm32 & QC_NFD) != 0 && i < chars.length) {
2489: chars[i] = ch;
2490: index = decompose(norm32, args);
2491: decomps[i++] = new String(extraData, index, args.length);
2492: }
2493: }
2494: return i;
2495: }
2496:
2497: //------------------------------------------------------
2498: // special method for Collation
2499: //------------------------------------------------------
2500: private static boolean needSingleQuotation(char c) {
2501: return (c >= 0x0009 && c <= 0x000D)
2502: || (c >= 0x0020 && c <= 0x002F)
2503: || (c >= 0x003A && c <= 0x0040)
2504: || (c >= 0x005B && c <= 0x0060)
2505: || (c >= 0x007B && c <= 0x007E);
2506: }
2507:
2508: public static String canonicalDecomposeWithSingleQuotation(
2509: String string) {
2510: char[] src = string.toCharArray();
2511: int srcIndex = 0;
2512: int srcLimit = src.length;
2513: char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
2514: int destIndex = 0;
2515: int destLimit = dest.length;
2516:
2517: char[] buffer = new char[3];
2518: int prevSrc;
2519: long norm32;
2520: int ccOrQCMask;
2521: int qcMask = QC_NFD;
2522: int reorderStartIndex, length;
2523: char c, c2;
2524: char minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
2525: int cc, prevCC, trailCC;
2526: char[] p;
2527: int pStart;
2528:
2529: // initialize
2530: ccOrQCMask = CC_MASK | qcMask;
2531: reorderStartIndex = 0;
2532: prevCC = 0;
2533: norm32 = 0;
2534: c = 0;
2535: pStart = 0;
2536:
2537: cc = trailCC = -1; // initialize to bogus value
2538: for (;;) {
2539: prevSrc = srcIndex;
2540: //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
2541: while (srcIndex != srcLimit
2542: && ((c = src[srcIndex]) < minNoMaybe
2543: || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || (c >= '\uac00' && c <= '\ud7a3'))) {
2544:
2545: prevCC = 0;
2546: ++srcIndex;
2547: }
2548:
2549: // copy these code units all at once
2550: if (srcIndex != prevSrc) {
2551: length = (int) (srcIndex - prevSrc);
2552: if ((destIndex + length) <= destLimit) {
2553: System.arraycopy(src, prevSrc, dest, destIndex,
2554: length);
2555: }
2556:
2557: destIndex += length;
2558: reorderStartIndex = destIndex;
2559: }
2560:
2561: // end of source reached?
2562: if (srcIndex == srcLimit) {
2563: break;
2564: }
2565: // c already contains *src and norm32 is set for it, increment src
2566: ++srcIndex;
2567:
2568: if (isNorm32Regular(norm32)) {
2569: c2 = 0;
2570: length = 1;
2571: } else {
2572: // c is a lead surrogate, get the real norm32
2573: if (srcIndex != srcLimit
2574: && Character.isLowSurrogate(c2 = src[srcIndex])) {
2575: ++srcIndex;
2576: length = 2;
2577: norm32 = getNorm32FromSurrogatePair(norm32, c2);
2578: } else {
2579: c2 = 0;
2580: length = 1;
2581: norm32 = 0;
2582: }
2583: }
2584:
2585: // get the decomposition and the lead and trail cc's
2586: if ((norm32 & qcMask) == 0) {
2587: // c does not decompose
2588: cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2589: p = null;
2590: pStart = -1;
2591: } else {
2592: DecomposeArgs arg = new DecomposeArgs();
2593: // c decomposes, get everything from the variable-length
2594: // extra data
2595: pStart = decompose(norm32, qcMask, arg);
2596: p = extraData;
2597: length = arg.length;
2598: cc = arg.cc;
2599: trailCC = arg.trailCC;
2600: if (length == 1) {
2601: // fastpath a single code unit from decomposition
2602: c = p[pStart];
2603: c2 = 0;
2604: p = null;
2605: pStart = -1;
2606: }
2607: }
2608:
2609: if ((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations
2610: // buffer overflow
2611: char[] tmpBuf = new char[destLimit * 2];
2612: System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
2613: dest = tmpBuf;
2614: destLimit = dest.length;
2615: }
2616: // append the decomposition to the destination buffer, assume length>0
2617: {
2618: int reorderSplit = destIndex;
2619: if (p == null) {
2620: // fastpath: single code point
2621: if (needSingleQuotation(c)) {
2622: //if we need single quotation, no need to consider "prevCC"
2623: //and it must NOT be a supplementary pair
2624: dest[destIndex++] = '\'';
2625: dest[destIndex++] = c;
2626: dest[destIndex++] = '\'';
2627: trailCC = 0;
2628: } else if (cc != 0 && cc < prevCC) {
2629: // (c, c2) is out of order with respect to the preceding
2630: // text
2631: destIndex += length;
2632: trailCC = insertOrdered(dest,
2633: reorderStartIndex, reorderSplit,
2634: destIndex, c, c2, cc);
2635: } else {
2636: // just append (c, c2)
2637: dest[destIndex++] = c;
2638: if (c2 != 0) {
2639: dest[destIndex++] = c2;
2640: }
2641: }
2642: } else {
2643: // general: multiple code points (ordered by themselves)
2644: // from decomposition
2645: if (needSingleQuotation(p[pStart])) {
2646: dest[destIndex++] = '\'';
2647: dest[destIndex++] = p[pStart++];
2648: dest[destIndex++] = '\'';
2649: length--;
2650: do {
2651: dest[destIndex++] = p[pStart++];
2652: } while (--length > 0);
2653: } else if (cc != 0 && cc < prevCC) {
2654: destIndex += length;
2655: trailCC = mergeOrdered(dest, reorderStartIndex,
2656: reorderSplit, p, pStart, pStart
2657: + length);
2658: } else {
2659: // just append the decomposition
2660: do {
2661: dest[destIndex++] = p[pStart++];
2662: } while (--length > 0);
2663: }
2664: }
2665: }
2666: prevCC = trailCC;
2667: if (prevCC == 0) {
2668: reorderStartIndex = destIndex;
2669: }
2670: }
2671: return new String(dest, 0, destIndex);
2672: }
2673:
2674: //------------------------------------------------------
2675: // mapping method for IDNA/StringPrep
2676: //------------------------------------------------------
2677:
2678: /*
2679: * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
2680: * 3.2 normalization with Corrigendum 4 corrections. However, normalization
2681: * without the corrections is necessary for IDNA/StringPrep support.
2682: * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
2683: * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
2684: * characters in Corrigendum 4 before normalization in order to avoid
2685: * incorrect normalization.
2686: * For the Corrigendum 4 issue, refer
2687: * http://www.unicode.org/versions/corrigendum4.html
2688: */
2689:
/*
 * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
 * The value is the single bit 18 (0x40000), which lies above the
 * other option bits defined in this class (0xff, 0x100, 0x1000, 0x2000).
 */
public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS = 0x40000;
2694:
2695: private static final char[][] corrigendum4MappingTable = {
2696: { '\uD844', '\uDF6A' }, // 0x2F868
2697: { '\u5F33' }, // 0x2F874
2698: { '\u43AB' }, // 0x2F91F
2699: { '\u7AAE' }, // 0x2F95F
2700: { '\u4D57' } }; // 0x2F9BF
2701:
2702: /*
2703: * Removing Corrigendum 4 fix
2704: * @return normalized text
2705: */
2706: public static String convert(String str) {
2707: if (str == null) {
2708: return null;
2709: }
2710:
2711: int ch = UCharacterIterator.DONE;
2712: StringBuffer dest = new StringBuffer();
2713: UCharacterIterator iter = UCharacterIterator.getInstance(str);
2714:
2715: while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
2716: switch (ch) {
2717: case 0x2F868:
2718: dest.append(corrigendum4MappingTable[0]);
2719: break;
2720: case 0x2F874:
2721: dest.append(corrigendum4MappingTable[1]);
2722: break;
2723: case 0x2F91F:
2724: dest.append(corrigendum4MappingTable[2]);
2725: break;
2726: case 0x2F95F:
2727: dest.append(corrigendum4MappingTable[3]);
2728: break;
2729: case 0x2F9BF:
2730: dest.append(corrigendum4MappingTable[4]);
2731: break;
2732: default:
2733: UTF16.append(dest, ch);
2734: break;
2735: }
2736: }
2737:
2738: return dest.toString();
2739: }
2740: }
|