Source Code Cross Referenced for NormalizerImpl.java in » 6.0-JDK-Modules-sun » text » sun » text » normalizer » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » 6.0 JDK Modules sun » text » sun.text.normalizer
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        /*
0002:         * Portions Copyright 2003-2006 Sun Microsystems, Inc.  All Rights Reserved.
0003:         * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0004:         *
0005:         * This code is free software; you can redistribute it and/or modify it
0006:         * under the terms of the GNU General Public License version 2 only, as
0007:         * published by the Free Software Foundation.  Sun designates this
0008:         * particular file as subject to the "Classpath" exception as provided
0009:         * by Sun in the LICENSE file that accompanied this code.
0010:         *
0011:         * This code is distributed in the hope that it will be useful, but WITHOUT
0012:         * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013:         * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
0014:         * version 2 for more details (a copy is included in the LICENSE file that
0015:         * accompanied this code).
0016:         *
0017:         * You should have received a copy of the GNU General Public License version
0018:         * 2 along with this work; if not, write to the Free Software Foundation,
0019:         * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0020:         *
0021:         * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
0022:         * CA 95054 USA or visit www.sun.com if you need additional information or
0023:         * have any questions.
0024:         */
0025:
0026:        /*
0027:         *******************************************************************************
0028:         * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
0029:         *                                                                             *
0030:         * The original version of this source code and documentation is copyrighted   *
0031:         * and owned by IBM, These materials are provided under terms of a License     *
0032:         * Agreement between IBM and Sun. This technology is protected by multiple     *
0033:         * US and International patents. This notice and attribution to IBM may not    *
0034:         * to removed.                                                                 *
0035:         *******************************************************************************
0036:         */
0037:
0038:        package sun.text.normalizer;
0039:
0040:        import java.io.BufferedInputStream;
0041:        import java.io.ByteArrayInputStream;
0042:        import java.io.IOException;
0043:        import java.io.BufferedInputStream;
0044:        import java.io.InputStream;
0045:
0046:        /**
0047:         * @version     1.0
0048:         * @author  Ram Viswanadha
0049:         */
0050:        public final class NormalizerImpl {
0051:            // Static block for the class to initialize its own self 
0052:            static final NormalizerImpl IMPL;
0053:
0054:            static {
0055:                try {
0056:                    IMPL = new NormalizerImpl();
0057:                } catch (Exception e) {
0058:                    throw new RuntimeException(e.getMessage());
0059:                }
0060:            }
0061:
            /* Keeps the low 8 bits of an int, i.e. reads a value as an unsigned byte. */
            static final int UNSIGNED_BYTE_MASK = 0xFF;
            /* Keeps the low 32 bits of a long, i.e. reads an int as an unsigned value. */
            static final long UNSIGNED_INT_MASK = 0xffffffffL;
            /*
             * This new implementation of the normalization code loads its data from
             * unorm.icu, which is generated with the gennorm tool.
             * The format of that file is described at the end of this file.
             */
            private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";

            // norm32 value constants

            // Quick check flags: bits 0..3, when set, mean "no" for their forms.
            // QC_NFC and QC_NFKC additionally carry their "maybe" bit (bit 4 / bit 5).
            public static final int QC_NFC = 0x11; /* no|maybe */
            public static final int QC_NFKC = 0x22; /* no|maybe */
            public static final int QC_NFD = 4; /* no */
            public static final int QC_NFKD = 8; /* no */

            /* all four "no" bits (bits 0..3) */
            public static final int QC_ANY_NO = 0xf;

            /* quick check flags 4..5 mean "maybe" for their forms; 
             * test flags>=QC_MAYBE 
             */
            public static final int QC_MAYBE = 0x10;
            public static final int QC_ANY_MAYBE = 0x30;

            /* all quick check bits: QC_ANY_NO | QC_ANY_MAYBE */
            public static final int QC_MASK = 0x3f;

            /* bits 6 and 7: "combines forward" / "combines back"; COMBINES_ANY is both */
            private static final int COMBINES_FWD = 0x40;
            private static final int COMBINES_BACK = 0x80;
            public static final int COMBINES_ANY = 0xc0;
            // UnicodeData.txt combining class in bits 15..8.
            private static final int CC_SHIFT = 8;
            public static final int CC_MASK = 0xff00;
            // 16 bits for the index to UChars and other extra data
            private static final int EXTRA_SHIFT = 16;
0097:
0098:            /* norm32 value constants using >16 bits */
0099:            private static final long MIN_SPECIAL = (long) (0xfc000000 & UNSIGNED_INT_MASK);
0100:            private static final long SURROGATES_TOP = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0101:            private static final long MIN_HANGUL = (long) (0xfff00000 & UNSIGNED_INT_MASK);
0102:            private static final long MIN_JAMO_V = (long) (0xfff20000 & UNSIGNED_INT_MASK);
0103:            private static final long JAMO_V_TOP = (long) (0xfff30000 & UNSIGNED_INT_MASK);
0104:
            /* indexes[] value names: positions in the indexes array read from the data file */
            /* number of bytes in normalization trie */
            static final int INDEX_TRIE_SIZE = 0;
            /* number of chars in extra data */
            static final int INDEX_CHAR_COUNT = 1;
            /* number of uint16_t words for combining data */
            static final int INDEX_COMBINE_DATA_COUNT = 2;
            /* first code point with quick check NFC NO/MAYBE */
            public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
            /* first code point with quick check NFKC NO/MAYBE */
            public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
            /* first code point with quick check NFD NO/MAYBE */
            public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
            /* first code point with quick check NFKD NO/MAYBE */
            public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
            /* number of bytes in FCD trie */
            static final int INDEX_FCD_TRIE_SIZE = 10;
            /* number of bytes in the auxiliary trie */
            static final int INDEX_AUX_TRIE_SIZE = 11;
            /* number of entries in indexes[]; changing this requires a new formatVersion */
            static final int INDEX_TOP = 32;
0126:
0127:            /* AUX constants */
0128:            /* value constants for auxTrie */
0129:            private static final int AUX_UNSAFE_SHIFT = 11;
0130:            private static final int AUX_COMP_EX_SHIFT = 10;
0131:            private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
0132:
0133:            private static final int AUX_MAX_FNC = ((int) 1 << AUX_COMP_EX_SHIFT);
0134:            private static final int AUX_UNSAFE_MASK = (int) ((1 << AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
0135:            private static final int AUX_FNC_MASK = (int) ((AUX_MAX_FNC - 1) & UNSIGNED_INT_MASK);
0136:            private static final int AUX_COMP_EX_MASK = (int) ((1 << AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
0137:            private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK & 1) << AUX_NFC_SKIPPABLE_F_SHIFT);
0138:
0139:            private static final int MAX_BUFFER_SIZE = 20;
0140:
0141:            /*******************************/
0142:
0143:            /* Wrappers for Trie implementations */
0144:            static final class NormTrieImpl implements  Trie.DataManipulate {
0145:                static IntTrie normTrie = null;
0146:
0147:                /**
0148:                 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 
0149:                 * data the index array offset of the indexes for that lead surrogate.
0150:                 * @param property data value for a surrogate from the trie, including 
0151:                 *         the folding offset
0152:                 * @return data offset or 0 if there is no data for the lead surrogate
0153:                 */
0154:                /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
0155:                public int getFoldingOffset(int value) {
0156:                    return BMP_INDEX_LENGTH
0157:                            + ((value >> (EXTRA_SHIFT - SURROGATE_BLOCK_BITS)) & (0x3ff << SURROGATE_BLOCK_BITS));
0158:                }
0159:
0160:            }
0161:
0162:            static final class FCDTrieImpl implements  Trie.DataManipulate {
0163:                static CharTrie fcdTrie = null;
0164:
0165:                /**
0166:                 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 
0167:                 * data the index array offset of the indexes for that lead surrogate.
0168:                 * @param property data value for a surrogate from the trie, including
0169:                 *         the folding offset
0170:                 * @return data offset or 0 if there is no data for the lead surrogate
0171:                 */
0172:                /* fcdTrie: the folding offset is the lead FCD value itself */
0173:                public int getFoldingOffset(int value) {
0174:                    return value;
0175:                }
0176:            }
0177:
0178:            static final class AuxTrieImpl implements  Trie.DataManipulate {
0179:                static CharTrie auxTrie = null;
0180:
0181:                /**
0182:                 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 
0183:                 * data the index array offset of the indexes for that lead surrogate.
0184:                 * @param property data value for a surrogate from the trie, including 
0185:                 *        the folding offset
0186:                 * @return data offset or 0 if there is no data for the lead surrogate
0187:                 */
0188:                /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
0189:                public int getFoldingOffset(int value) {
0190:                    return (int) (value & AUX_FNC_MASK) << SURROGATE_BLOCK_BITS;
0191:                }
0192:            }
0193:
0194:            /****************************************************/
0195:
            /* Trie callback objects; created by the constructor and handed to the tries. */
            private static FCDTrieImpl fcdTrieImpl;
            private static NormTrieImpl normTrieImpl;
            private static AuxTrieImpl auxTrieImpl;
            /* Header values read from the data file; addressed with the INDEX_* constants. */
            private static int[] indexes;
            /* Combining data; sized by indexes[INDEX_COMBINE_DATA_COUNT]. */
            private static char[] combiningTable;
            /* Variable-length extra data (decompositions); sized by indexes[INDEX_CHAR_COUNT]. */
            private static char[] extraData;

            /* Set by the constructor once the tries have been built. */
            private static boolean isDataLoaded;
            /* True when the data format version is at least 2.1 / 2.2 respectively. */
            private static boolean isFormatVersion_2_1;
            private static boolean isFormatVersion_2_2;
            /* Unicode version bytes reported by the data reader; elements 0..3 are used. */
            private static byte[] unicodeVersion;

            /**
             * Default buffer size, in bytes, of the BufferedInputStream used to
             * read the data file.
             */
            private static final int DATA_BUFFER_SIZE = 25000;

            /**
             * FCD check: everything below this code point is known to have a 0 
             * lead combining class 
             */
            public static final int MIN_WITH_LEAD_CC = 0x300;

            /**
             * Bit 7 of the length byte for a decomposition string in extra data is
             * a flag indicating whether the decomposition string is
             * preceded by a 16-bit word with the leading and trailing cc
             * of the decomposition (like for A-umlaut);
             * if not, then both cc's are zero (like for compatibility ideographs).
             */
            private static final int DECOMP_FLAG_LENGTH_HAS_CC = 0x80;
            /**
             * Bits 6..0 of the length byte contain the actual length.
             */
            private static final int DECOMP_LENGTH_MASK = 0x7f;

            /** Length of the BMP portion of the index (stage 1) array. */
            private static final int BMP_INDEX_LENGTH = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
            /** Number of bits of a trail surrogate that are used in index table 
             * lookups. 
             */
            private static final int SURROGATE_BLOCK_BITS = 10 - Trie.INDEX_STAGE_1_SHIFT_;
0238:
0239:            // public utility
0240:            public static int getFromIndexesArr(int index) {
0241:                return indexes[index];
0242:            }
0243:
0244:            // protected constructor ---------------------------------------------
0245:
0246:            /**
0247:             * Constructor
0248:             * @exception thrown when data reading fails or data corrupted
0249:             */
0250:            private NormalizerImpl() throws IOException {
0251:                //data should be loaded only once
0252:                if (!isDataLoaded) {
0253:
0254:                    // jar access
0255:                    InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
0256:                    BufferedInputStream b = new BufferedInputStream(i,
0257:                            DATA_BUFFER_SIZE);
0258:                    NormalizerDataReader reader = new NormalizerDataReader(b);
0259:
0260:                    // read the indexes            
0261:                    indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
0262:
0263:                    byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
0264:
0265:                    int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
0266:                    combiningTable = new char[combiningTableTop];
0267:
0268:                    int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
0269:                    extraData = new char[extraDataTop];
0270:
0271:                    byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
0272:                    byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
0273:
0274:                    fcdTrieImpl = new FCDTrieImpl();
0275:                    normTrieImpl = new NormTrieImpl();
0276:                    auxTrieImpl = new AuxTrieImpl();
0277:
0278:                    // load the rest of the data data and initialize the data members
0279:                    reader.read(normBytes, fcdBytes, auxBytes, extraData,
0280:                            combiningTable);
0281:
0282:                    NormTrieImpl.normTrie = new IntTrie(
0283:                            new ByteArrayInputStream(normBytes), normTrieImpl);
0284:                    FCDTrieImpl.fcdTrie = new CharTrie(
0285:                            new ByteArrayInputStream(fcdBytes), fcdTrieImpl);
0286:                    AuxTrieImpl.auxTrie = new CharTrie(
0287:                            new ByteArrayInputStream(auxBytes), auxTrieImpl);
0288:
0289:                    // we reached here without any exceptions so the data is fully 
0290:                    // loaded set the variable to true
0291:                    isDataLoaded = true;
0292:
0293:                    // get the data format version                           
0294:                    byte[] formatVersion = reader.getDataFormatVersion();
0295:
0296:                    isFormatVersion_2_1 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 1));
0297:                    isFormatVersion_2_2 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 2));
0298:                    unicodeVersion = reader.getUnicodeVersion();
0299:                    b.close();
0300:                }
0301:            }
0302:
0303:            /* ---------------------------------------------------------------------- */
0304:
            /* Korean Hangul and Jamo constants */

            /* First code point of each conjoining Jamo range. */
            public static final int JAMO_L_BASE = 0x1100; /* "lead" jamo */
            public static final int JAMO_V_BASE = 0x1161; /* "vowel" jamo */
            public static final int JAMO_T_BASE = 0x11a7; /* "trail" jamo */

            /* Code point that Hangul syllable offsets are measured from. */
            public static final int HANGUL_BASE = 0xac00;

            /* Number of lead / vowel / trail values. isHangulWithoutJamoT treats
             * trail index 0 as "no trail jamo". */
            public static final int JAMO_L_COUNT = 19;
            public static final int JAMO_V_COUNT = 21;
            public static final int JAMO_T_COUNT = 28;
            /* Total number of Hangul syllables: 19 * 21 * 28 = 11172. */
            public static final int HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT
                    * JAMO_T_COUNT;
0318:
0319:            private static boolean isHangulWithoutJamoT(char c) {
0320:                c -= HANGUL_BASE;
0321:                return c < HANGUL_COUNT && c % JAMO_T_COUNT == 0;
0322:            }
0323:
0324:            /* norm32 helpers */
0325:
0326:            /* is this a norm32 with a regular index? */
0327:            private static boolean isNorm32Regular(long norm32) {
0328:                return norm32 < MIN_SPECIAL;
0329:            }
0330:
0331:            /* is this a norm32 with a special index for a lead surrogate? */
0332:            private static boolean isNorm32LeadSurrogate(long norm32) {
0333:                return MIN_SPECIAL <= norm32 && norm32 < SURROGATES_TOP;
0334:            }
0335:
0336:            /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
0337:            private static boolean isNorm32HangulOrJamo(long norm32) {
0338:                return norm32 >= MIN_HANGUL;
0339:            }
0340:
0341:            /*
0342:             * Given norm32 for Jamo V or T,
0343:             * is this a Jamo V?
0344:             */
0345:            private static boolean isJamoVTNorm32JamoV(long norm32) {
0346:                return norm32 < JAMO_V_TOP;
0347:            }
0348:
0349:            /* data access primitives ----------------------------------------------- */
0350:
0351:            public static long/*unsigned*/getNorm32(char c) {
0352:                return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie
0353:                        .getLeadValue(c)));
0354:            }
0355:
0356:            public static long/*unsigned*/getNorm32FromSurrogatePair(
0357:                    long norm32, char c2) {
0358:                /*
0359:                 * the surrogate index in norm32 stores only the number of the surrogate
0360:                 * index block see gennorm/store.c/getFoldedNormValue()
0361:                 */
0362:                return ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie
0363:                        .getTrailValue((int) norm32, c2));
0364:            }
0365:
0366:            ///CLOVER:OFF
0367:            private static long getNorm32(int c) {
0368:                return (UNSIGNED_INT_MASK & (NormTrieImpl.normTrie
0369:                        .getCodePointValue(c)));
0370:            }
0371:
0372:            /*
0373:             * get a norm32 from text with complete code points
0374:             * (like from decompositions)
0375:             */
0376:            private static long/*unsigned*/getNorm32(char[] p, int start,
0377:                    int/*unsigned*/mask) {
0378:                long/*unsigned*/norm32 = getNorm32(p[start]);
0379:                if (((norm32 & mask) > 0) && isNorm32LeadSurrogate(norm32)) {
0380:                    /* *p is a lead surrogate, get the real norm32 */
0381:                    norm32 = getNorm32FromSurrogatePair(norm32, p[start + 1]);
0382:                }
0383:                return norm32;
0384:            }
0385:
0386:            //// for StringPrep
0387:            public static VersionInfo getUnicodeVersion() {
0388:                return VersionInfo
0389:                        .getInstance(unicodeVersion[0], unicodeVersion[1],
0390:                                unicodeVersion[2], unicodeVersion[3]);
0391:            }
0392:
0393:            public static char getFCD16(char c) {
0394:                return FCDTrieImpl.fcdTrie.getLeadValue(c);
0395:            }
0396:
0397:            public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
0398:                /* the surrogate index in fcd16 is an absolute offset over the 
0399:                 * start of stage 1 
0400:                 * */
0401:                return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
0402:            }
0403:
0404:            public static int getFCD16(int c) {
0405:                return FCDTrieImpl.fcdTrie.getCodePointValue(c);
0406:            }
0407:
0408:            private static int getExtraDataIndex(long norm32) {
0409:                return (int) (norm32 >> EXTRA_SHIFT);
0410:            }
0411:
            /* Output holder filled in by the decompose() methods. */
            private static final class DecomposeArgs {
                // lead combining class of the decomposition (0 when none is stored)
                int /*unsigned byte*/cc;
                // trail combining class of the decomposition (0 when none is stored)
                int /*unsigned byte*/trailCC;
                // length of the decomposition, after masking with DECOMP_LENGTH_MASK
                int length;
            }
0417:
0418:            /**
0419:             * 
0420:             * get the canonical or compatibility decomposition for one character 
0421:             * 
0422:             * @return index into the extraData array
0423:             */
0424:            private static int/*index*/decompose(long/*unsigned*/norm32,
0425:                    int/*unsigned*/qcMask, DecomposeArgs args) {
0426:                int p = getExtraDataIndex(norm32);
0427:                args.length = extraData[p++];
0428:
0429:                if ((norm32 & qcMask & QC_NFKD) != 0 && args.length >= 0x100) {
0430:                    /* use compatibility decomposition, skip canonical data */
0431:                    p += ((args.length >> 7) & 1)
0432:                            + (args.length & DECOMP_LENGTH_MASK);
0433:                    args.length >>= 8;
0434:                }
0435:
0436:                if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0437:                    /* get the lead and trail cc's */
0438:                    char bothCCs = extraData[p++];
0439:                    args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0440:                    args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0441:                } else {
0442:                    /* lead and trail cc's are both 0 */
0443:                    args.cc = args.trailCC = 0;
0444:                }
0445:
0446:                args.length &= DECOMP_LENGTH_MASK;
0447:                return p;
0448:            }
0449:
0450:            /**
0451:             * get the canonical decomposition for one character 
0452:             * @return index into the extraData array
0453:             */
0454:            private static int decompose(long/*unsigned*/norm32,
0455:                    DecomposeArgs args) {
0456:
0457:                int p = getExtraDataIndex(norm32);
0458:                args.length = extraData[p++];
0459:
0460:                if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0461:                    /* get the lead and trail cc's */
0462:                    char bothCCs = extraData[p++];
0463:                    args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0464:                    args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0465:                } else {
0466:                    /* lead and trail cc's are both 0 */
0467:                    args.cc = args.trailCC = 0;
0468:                }
0469:
0470:                args.length &= DECOMP_LENGTH_MASK;
0471:                return p;
0472:            }
0473:
            /* In/out state for getNextCC(): a forward cursor over a char array. */
            private static final class NextCCArgs {
                // text being read
                char[] source;
                // index of the next code unit to read; advanced by getNextCC()
                int next;
                // index at which reading must stop
                int limit;
                // code unit just read
                char c;
                // trail surrogate paired with c, or 0 if only one code unit was used
                char c2;
            }
0481:
0482:            /*
0483:             * get the combining class of (c, c2)= args.source[args.next++]
0484:             * before: args.next<args.limit  after: args.next<=args.limit
0485:             * if only one code unit is used, then c2==0
0486:             */
0487:            private static int /*unsigned byte*/getNextCC(NextCCArgs args) {
0488:                long /*unsigned*/norm32;
0489:
0490:                args.c = args.source[args.next++];
0491:
0492:                norm32 = getNorm32(args.c);
0493:                if ((norm32 & CC_MASK) == 0) {
0494:                    args.c2 = 0;
0495:                    return 0;
0496:                } else {
0497:                    if (!isNorm32LeadSurrogate(norm32)) {
0498:                        args.c2 = 0;
0499:                    } else {
0500:                        /* c is a lead surrogate, get the real norm32 */
0501:                        if (args.next != args.limit
0502:                                && UTF16
0503:                                        .isTrailSurrogate(args.c2 = args.source[args.next])) {
0504:                            ++args.next;
0505:                            norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
0506:                        } else {
0507:                            args.c2 = 0;
0508:                            return 0;
0509:                        }
0510:                    }
0511:
0512:                    return (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
0513:                }
0514:            }
0515:
            /* In/out state for getPrevNorm32()/getPrevCC(): a backward cursor over a char array. */
            private static final class PrevArgs {
                // text being read
                char[] src;
                // lowest index that may be read
                int start;
                // current position; pre-decremented by getPrevNorm32()
                int current;
                // code unit just read
                char c;
                // lead surrogate paired with c, or 0 if c stands alone
                char c2;
            }
0523:
0524:            /*
0525:             * read backwards and get norm32
0526:             * return 0 if the character is <minC
0527:             * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 
0528:             * surrogate but read second!)
0529:             */
0530:            private static long /*unsigned*/getPrevNorm32(PrevArgs args,
0531:                    int/*unsigned*/minC, int/*unsigned*/mask) {
0532:                long/*unsigned*/norm32;
0533:
0534:                args.c = args.src[--args.current];
0535:                args.c2 = 0;
0536:
0537:                /* check for a surrogate before getting norm32 to see if we need to 
0538:                 * predecrement further 
0539:                 */
0540:                if (args.c < minC) {
0541:                    return 0;
0542:                } else if (!UTF16.isSurrogate(args.c)) {
0543:                    return getNorm32(args.c);
0544:                } else if (UTF16.isLeadSurrogate(args.c)) {
0545:                    /* unpaired first surrogate */
0546:                    return 0;
0547:                } else if (args.current != args.start
0548:                        && UTF16
0549:                                .isLeadSurrogate(args.c2 = args.src[args.current - 1])) {
0550:                    --args.current;
0551:                    norm32 = getNorm32(args.c2);
0552:
0553:                    if ((norm32 & mask) == 0) {
0554:                        /* all surrogate pairs with this lead surrogate have 
0555:                         * only irrelevant data 
0556:                         */
0557:                        return 0;
0558:                    } else {
0559:                        /* norm32 must be a surrogate special */
0560:                        return getNorm32FromSurrogatePair(norm32, args.c);
0561:                    }
0562:                } else {
0563:                    /* unpaired second surrogate */
0564:                    args.c2 = 0;
0565:                    return 0;
0566:                }
0567:            }
0568:
0569:            /*
0570:             * get the combining class of (c, c2)=*--p
0571:             * before: start<p  after: start<=p
0572:             */
0573:            private static int /*unsigned byte*/getPrevCC(PrevArgs args) {
0574:
0575:                return (int) ((UNSIGNED_BYTE_MASK) & (getPrevNorm32(args,
0576:                        MIN_WITH_LEAD_CC, CC_MASK) >> CC_SHIFT));
0577:            }
0578:
0579:            /*
0580:             * is this a safe boundary character for NF*D?
0581:             * (lead cc==0)
0582:             */
0583:            public static boolean isNFDSafe(long/*unsigned*/norm32,
0584:                    int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0585:                if ((norm32 & ccOrQCMask) == 0) {
0586:                    return true; /* cc==0 and no decomposition: this is NF*D safe */
0587:                }
0588:
0589:                /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
0590:                if (isNorm32Regular(norm32) && (norm32 & decompQCMask) != 0) {
0591:                    DecomposeArgs args = new DecomposeArgs();
0592:                    /* decomposes, get everything from the variable-length extra data */
0593:                    decompose(norm32, decompQCMask, args);
0594:                    return args.cc == 0;
0595:                } else {
0596:                    /* no decomposition (or Hangul), test the cc directly */
0597:                    return (norm32 & CC_MASK) == 0;
0598:                }
0599:            }
0600:
0601:            /*
0602:             * is this (or does its decomposition begin with) a "true starter"?
0603:             * (cc==0 and NF*C_YES)
0604:             */
0605:            public static boolean isTrueStarter(long/*unsigned*/norm32,
0606:                    int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0607:                if ((norm32 & ccOrQCMask) == 0) {
0608:                    return true; /* this is a true starter (could be Hangul or Jamo L)*/
0609:                }
0610:
0611:                /* inspect its decomposition - not a Hangul or a surrogate here */
0612:                if ((norm32 & decompQCMask) != 0) {
0613:                    int p; /* index into extra data array */
0614:                    DecomposeArgs args = new DecomposeArgs();
0615:                    /* decomposes, get everything from the variable-length extra data */
0616:                    p = decompose(norm32, decompQCMask, args);
0617:
0618:                    if (args.cc == 0) {
0619:                        int/*unsigned*/qcMask = ccOrQCMask & QC_MASK;
0620:
0621:                        /* does it begin with NFC_YES? */
0622:                        if ((getNorm32(extraData, p, qcMask) & qcMask) == 0) {
0623:                            /* yes, the decomposition begins with a true starter */
0624:                            return true;
0625:                        }
0626:                    }
0627:                }
0628:                return false;
0629:            }
0630:
0631:            /* reorder UTF-16 in-place ---------------------------------------------- */
0632:
0633:            /**
0634:             * simpler, single-character version of mergeOrdered() -
0635:             * bubble-insert one single code point into the preceding string
0636:             * which is already canonically ordered
0637:             * (c, c2) may or may not yet have been inserted at src[current]..src[p]
0638:             *
0639:             * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
0640:             *
0641:             * before: src[start]..src[current] is already ordered, and
0642:             *         src[current]..src[p]     may or may not hold (c, c2) but
0643:             *                          must be exactly the same length as (c, c2)
0644:             * after: src[start]..src[p] is ordered
0645:             *
0646:             * @return the trailing combining class
0647:             */
0648:            private static int/*unsigned byte*/insertOrdered(char[] source,
0649:                    int start, int current, int p, char c, char c2,
0650:                    int/*unsigned byte*/cc) {
0651:                int back, preBack;
0652:                int r;
0653:                int prevCC, trailCC = cc;
0654:
0655:                if (start < current && cc != 0) {
0656:                    // search for the insertion point where cc>=prevCC 
0657:                    preBack = back = current;
0658:                    PrevArgs prevArgs = new PrevArgs();
0659:                    prevArgs.current = current;
0660:                    prevArgs.start = start;
0661:                    prevArgs.src = source;
0662:                    // get the prevCC 
0663:                    prevCC = getPrevCC(prevArgs);
0664:                    preBack = prevArgs.current;
0665:
0666:                    if (cc < prevCC) {
0667:                        // this will be the last code point, so keep its cc 
0668:                        trailCC = prevCC;
0669:                        back = preBack;
0670:                        while (start < preBack) {
0671:                            prevCC = getPrevCC(prevArgs);
0672:                            preBack = prevArgs.current;
0673:                            if (cc >= prevCC) {
0674:                                break;
0675:                            }
0676:                            back = preBack;
0677:                        }
0678:
0679:                        // this is where we are right now with all these indicies:
0680:                        // [start]..[pPreBack] 0..? code points that we can ignore
0681:                        // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
0682:                        // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
0683:                        // [current]..[p]         1 code point (c, c2) with cc
0684:
0685:                        // move the code units in between up 
0686:                        r = p;
0687:                        do {
0688:                            source[--r] = source[--current];
0689:                        } while (back != current);
0690:                    }
0691:                }
0692:
0693:                // insert (c, c2) 
0694:                source[current] = c;
0695:                if (c2 != 0) {
0696:                    source[(current + 1)] = c2;
0697:                }
0698:
0699:                // we know the cc of the last code point 
0700:                return trailCC;
0701:            }
0702:
0703:            /**
0704:             * merge two UTF-16 string parts together
0705:             * to canonically order (order by combining classes) their concatenation
0706:             *
0707:             * the two strings may already be adjacent, so that the merging is done 
0708:             * in-place if the two strings are not adjacent, then the buffer holding the
0709:             * first one must be large enough
0710:             * the second string may or may not be ordered in itself
0711:             *
0712:             * before: [start]..[current] is already ordered, and
0713:             *         [next]..[limit]    may be ordered in itself, but
0714:             *                          is not in relation to [start..current[
0715:             * after: [start..current+(limit-next)[ is ordered
0716:             *
0717:             * the algorithm is a simple bubble-sort that takes the characters from 
0718:             * src[next++] and inserts them in correct combining class order into the 
0719:             * preceding part of the string
0720:             *
0721:             * since this function is called much less often than the single-code point
0722:             * insertOrdered(), it just uses that for easier maintenance
0723:             *
0724:             * @return the trailing combining class
0725:             */
0726:            private static int /*unsigned byte*/mergeOrdered(char[] source,
0727:                    int start, int current, char[] data, int next, int limit,
0728:                    boolean isOrdered) {
0729:                int r;
0730:                int /*unsigned byte*/cc, trailCC = 0;
0731:                boolean adjacent;
0732:
0733:                adjacent = current == next;
0734:                NextCCArgs ncArgs = new NextCCArgs();
0735:                ncArgs.source = data;
0736:                ncArgs.next = next;
0737:                ncArgs.limit = limit;
0738:
0739:                if (start != current || !isOrdered) {
0740:
0741:                    while (ncArgs.next < ncArgs.limit) {
0742:                        cc = getNextCC(ncArgs);
0743:                        if (cc == 0) {
0744:                            // does not bubble back 
0745:                            trailCC = 0;
0746:                            if (adjacent) {
0747:                                current = ncArgs.next;
0748:                            } else {
0749:                                data[current++] = ncArgs.c;
0750:                                if (ncArgs.c2 != 0) {
0751:                                    data[current++] = ncArgs.c2;
0752:                                }
0753:                            }
0754:                            if (isOrdered) {
0755:                                break;
0756:                            } else {
0757:                                start = current;
0758:                            }
0759:                        } else {
0760:                            r = current + (ncArgs.c2 == 0 ? 1 : 2);
0761:                            trailCC = insertOrdered(source, start, current, r,
0762:                                    ncArgs.c, ncArgs.c2, cc);
0763:                            current = r;
0764:                        }
0765:                    }
0766:                }
0767:
0768:                if (ncArgs.next == ncArgs.limit) {
0769:                    // we know the cc of the last code point 
0770:                    return trailCC;
0771:                } else {
0772:                    if (!adjacent) {
0773:                        // copy the second string part 
0774:                        do {
0775:                            source[current++] = data[ncArgs.next++];
0776:                        } while (ncArgs.next != ncArgs.limit);
0777:                        ncArgs.limit = current;
0778:                    }
0779:                    PrevArgs prevArgs = new PrevArgs();
0780:                    prevArgs.src = data;
0781:                    prevArgs.start = start;
0782:                    prevArgs.current = ncArgs.limit;
0783:                    return getPrevCC(prevArgs);
0784:                }
0785:
0786:            }
0787:
0788:            private static int /*unsigned byte*/mergeOrdered(char[] source,
0789:                    int start, int current, char[] data, final int next,
0790:                    final int limit) {
0791:                return mergeOrdered(source, start, current, data, next, limit,
0792:                        true);
0793:            }
0794:
0795:            public static NormalizerBase.QuickCheckResult quickCheck(
0796:                    char[] src, int srcStart, int srcLimit, int minNoMaybe,
0797:                    int qcMask, int options, boolean allowMaybe, UnicodeSet nx) {
0798:
0799:                int ccOrQCMask;
0800:                long norm32;
0801:                char c, c2;
0802:                char cc, prevCC;
0803:                long qcNorm32;
0804:                NormalizerBase.QuickCheckResult result;
0805:                ComposePartArgs args = new ComposePartArgs();
0806:                char[] buffer;
0807:                int start = srcStart;
0808:
0809:                if (!isDataLoaded) {
0810:                    return NormalizerBase.MAYBE;
0811:                }
0812:                // initialize 
0813:                ccOrQCMask = CC_MASK | qcMask;
0814:                result = NormalizerBase.YES;
0815:                prevCC = 0;
0816:
0817:                for (;;) {
0818:                    for (;;) {
0819:                        if (srcStart == srcLimit) {
0820:                            return result;
0821:                        } else if ((c = src[srcStart++]) >= minNoMaybe
0822:                                && ((norm32 = getNorm32(c)) & ccOrQCMask) != 0) {
0823:                            break;
0824:                        }
0825:                        prevCC = 0;
0826:                    }
0827:
0828:                    // check one above-minimum, relevant code unit 
0829:                    if (isNorm32LeadSurrogate(norm32)) {
0830:                        // c is a lead surrogate, get the real norm32 
0831:                        if (srcStart != srcLimit
0832:                                && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
0833:                            ++srcStart;
0834:                            norm32 = getNorm32FromSurrogatePair(norm32, c2);
0835:                        } else {
0836:                            norm32 = 0;
0837:                            c2 = 0;
0838:                        }
0839:                    } else {
0840:                        c2 = 0;
0841:                    }
0842:                    if (nx_contains(nx, c, c2)) {
0843:                        /* excluded: norm32==0 */
0844:                        norm32 = 0;
0845:                    }
0846:
0847:                    // check the combining order 
0848:                    cc = (char) ((norm32 >> CC_SHIFT) & 0xFF);
0849:                    if (cc != 0 && cc < prevCC) {
0850:                        return NormalizerBase.NO;
0851:                    }
0852:                    prevCC = cc;
0853:
0854:                    // check for "no" or "maybe" quick check flags 
0855:                    qcNorm32 = norm32 & qcMask;
0856:                    if ((qcNorm32 & QC_ANY_NO) >= 1) {
0857:                        result = NormalizerBase.NO;
0858:                        break;
0859:                    } else if (qcNorm32 != 0) {
0860:                        // "maybe" can only occur for NFC and NFKC 
0861:                        if (allowMaybe) {
0862:                            result = NormalizerBase.MAYBE;
0863:                        } else {
0864:                            // normalize a section around here to see if it is really 
0865:                            // normalized or not 
0866:                            int prevStarter;
0867:                            int/*unsigned*/decompQCMask;
0868:
0869:                            decompQCMask = (qcMask << 2) & 0xf; // decomposition quick check mask 
0870:
0871:                            // find the previous starter 
0872:
0873:                            // set prevStarter to the beginning of the current character 
0874:                            prevStarter = srcStart - 1;
0875:                            if (UTF16.isTrailSurrogate(src[prevStarter])) {
0876:                                // safe because unpaired surrogates do not result 
0877:                                // in "maybe"
0878:                                --prevStarter;
0879:                            }
0880:                            prevStarter = findPreviousStarter(src, start,
0881:                                    prevStarter, ccOrQCMask, decompQCMask,
0882:                                    (char) minNoMaybe);
0883:
0884:                            // find the next true starter in [src..limit[ - modifies 
0885:                            // src to point to the next starter 
0886:                            srcStart = findNextStarter(src, srcStart, srcLimit,
0887:                                    qcMask, decompQCMask, (char) minNoMaybe);
0888:
0889:                            //set the args for compose part
0890:                            args.prevCC = prevCC;
0891:
0892:                            // decompose and recompose [prevStarter..src[ 
0893:                            buffer = composePart(args, prevStarter, src,
0894:                                    srcStart, srcLimit, options, nx);
0895:
0896:                            // compare the normalized version with the original 
0897:                            if (0 != strCompare(buffer, 0, args.length, src,
0898:                                    prevStarter, (srcStart - prevStarter),
0899:                                    false)) {
0900:                                result = NormalizerBase.NO; // normalization differs 
0901:                                break;
0902:                            }
0903:
0904:                            // continue after the next starter 
0905:                        }
0906:                    }
0907:                }
0908:                return result;
0909:            }
0910:
0911:            //------------------------------------------------------ 
0912:            // make NFD & NFKD 
0913:            //------------------------------------------------------
0914:
0915:            public static int decompose(char[] src, int srcStart, int srcLimit,
0916:                    char[] dest, int destStart, int destLimit, boolean compat,
0917:                    int[] outTrailCC, UnicodeSet nx) {
0918:
0919:                char[] buffer = new char[3];
0920:                int prevSrc;
0921:                long norm32;
0922:                int ccOrQCMask, qcMask;
0923:                int reorderStartIndex, length;
0924:                char c, c2, minNoMaybe;
0925:                int/*unsigned byte*/cc, prevCC, trailCC;
0926:                char[] p;
0927:                int pStart;
0928:                int destIndex = destStart;
0929:                int srcIndex = srcStart;
0930:                if (!compat) {
0931:                    minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
0932:                    qcMask = QC_NFD;
0933:                } else {
0934:                    minNoMaybe = (char) indexes[INDEX_MIN_NFKD_NO_MAYBE];
0935:                    qcMask = QC_NFKD;
0936:                }
0937:
0938:                /* initialize */
0939:                ccOrQCMask = CC_MASK | qcMask;
0940:                reorderStartIndex = 0;
0941:                prevCC = 0;
0942:                norm32 = 0;
0943:                c = 0;
0944:                pStart = 0;
0945:
0946:                cc = trailCC = -1;//initialize to bogus value
0947:
0948:                for (;;) {
0949:                    /* count code units below the minimum or with irrelevant data for 
0950:                     * the quick check 
0951:                     */
0952:                    prevSrc = srcIndex;
0953:
0954:                    while (srcIndex != srcLimit
0955:                            && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
0956:                        prevCC = 0;
0957:                        ++srcIndex;
0958:                    }
0959:
0960:                    /* copy these code units all at once */
0961:                    if (srcIndex != prevSrc) {
0962:                        length = (int) (srcIndex - prevSrc);
0963:                        if ((destIndex + length) <= destLimit) {
0964:                            System.arraycopy(src, prevSrc, dest, destIndex,
0965:                                    length);
0966:                        }
0967:
0968:                        destIndex += length;
0969:                        reorderStartIndex = destIndex;
0970:                    }
0971:
0972:                    /* end of source reached? */
0973:                    if (srcIndex == srcLimit) {
0974:                        break;
0975:                    }
0976:
0977:                    /* c already contains *src and norm32 is set for it, increment src*/
0978:                    ++srcIndex;
0979:
0980:                    /* check one above-minimum, relevant code unit */
0981:                    /*
0982:                     * generally, set p and length to the decomposition string
0983:                     * in simple cases, p==NULL and (c, c2) will hold the length code 
0984:                     * units to append in all cases, set cc to the lead and trailCC to 
0985:                     * the trail combining class
0986:                     *
0987:                     * the following merge-sort of the current character into the 
0988:                     * preceding, canonically ordered result text will use the 
0989:                     * optimized insertOrdered()
0990:                     * if there is only one single code point to process;
0991:                     * this is indicated with p==NULL, and (c, c2) is the character to 
0992:                     * insert
0993:                     * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
0994:                     * for a supplementary character)
0995:                     * otherwise, p[length] is merged in with _mergeOrdered()
0996:                     */
0997:                    if (isNorm32HangulOrJamo(norm32)) {
0998:                        if (nx_contains(nx, c)) {
0999:                            c2 = 0;
1000:                            p = null;
1001:                            length = 1;
1002:                        } else {
1003:                            // Hangul syllable: decompose algorithmically 
1004:                            p = buffer;
1005:                            pStart = 0;
1006:                            cc = trailCC = 0;
1007:
1008:                            c -= HANGUL_BASE;
1009:
1010:                            c2 = (char) (c % JAMO_T_COUNT);
1011:                            c /= JAMO_T_COUNT;
1012:                            if (c2 > 0) {
1013:                                buffer[2] = (char) (JAMO_T_BASE + c2);
1014:                                length = 3;
1015:                            } else {
1016:                                length = 2;
1017:                            }
1018:
1019:                            buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1020:                            buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1021:                        }
1022:                    } else {
1023:                        if (isNorm32Regular(norm32)) {
1024:                            c2 = 0;
1025:                            length = 1;
1026:                        } else {
1027:                            // c is a lead surrogate, get the real norm32 
1028:                            if (srcIndex != srcLimit
1029:                                    && UTF16
1030:                                            .isTrailSurrogate(c2 = src[srcIndex])) {
1031:                                ++srcIndex;
1032:                                length = 2;
1033:                                norm32 = getNorm32FromSurrogatePair(norm32, c2);
1034:                            } else {
1035:                                c2 = 0;
1036:                                length = 1;
1037:                                norm32 = 0;
1038:                            }
1039:                        }
1040:
1041:                        /* get the decomposition and the lead and trail cc's */
1042:                        if (nx_contains(nx, c, c2)) {
1043:                            /* excluded: norm32==0 */
1044:                            cc = trailCC = 0;
1045:                            p = null;
1046:                        } else if ((norm32 & qcMask) == 0) {
1047:                            /* c does not decompose */
1048:                            cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1049:                            p = null;
1050:                            pStart = -1;
1051:                        } else {
1052:                            DecomposeArgs arg = new DecomposeArgs();
1053:                            /* c decomposes, get everything from the variable-length 
1054:                             * extra data 
1055:                             */
1056:                            pStart = decompose(norm32, qcMask, arg);
1057:                            p = extraData;
1058:                            length = arg.length;
1059:                            cc = arg.cc;
1060:                            trailCC = arg.trailCC;
1061:                            if (length == 1) {
1062:                                /* fastpath a single code unit from decomposition */
1063:                                c = p[pStart];
1064:                                c2 = 0;
1065:                                p = null;
1066:                                pStart = -1;
1067:                            }
1068:                        }
1069:                    }
1070:
1071:                    /* append the decomposition to the destination buffer, assume 
1072:                     * length>0 
1073:                     */
1074:                    if ((destIndex + length) <= destLimit) {
1075:                        int reorderSplit = destIndex;
1076:                        if (p == null) {
1077:                            /* fastpath: single code point */
1078:                            if (cc != 0 && cc < prevCC) {
1079:                                /* (c, c2) is out of order with respect to the preceding
1080:                                 *  text 
1081:                                 */
1082:                                destIndex += length;
1083:                                trailCC = insertOrdered(dest,
1084:                                        reorderStartIndex, reorderSplit,
1085:                                        destIndex, c, c2, cc);
1086:                            } else {
1087:                                /* just append (c, c2) */
1088:                                dest[destIndex++] = c;
1089:                                if (c2 != 0) {
1090:                                    dest[destIndex++] = c2;
1091:                                }
1092:                            }
1093:                        } else {
1094:                            /* general: multiple code points (ordered by themselves) 
1095:                             * from decomposition 
1096:                             */
1097:                            if (cc != 0 && cc < prevCC) {
1098:                                /* the decomposition is out of order with respect to the
1099:                                 *  preceding text 
1100:                                 */
1101:                                destIndex += length;
1102:                                trailCC = mergeOrdered(dest, reorderStartIndex,
1103:                                        reorderSplit, p, pStart, pStart
1104:                                                + length);
1105:                            } else {
1106:                                /* just append the decomposition */
1107:                                do {
1108:                                    dest[destIndex++] = p[pStart++];
1109:                                } while (--length > 0);
1110:                            }
1111:                        }
1112:                    } else {
1113:                        /* buffer overflow */
1114:                        /* keep incrementing the destIndex for preflighting */
1115:                        destIndex += length;
1116:                    }
1117:
1118:                    prevCC = trailCC;
1119:                    if (prevCC == 0) {
1120:                        reorderStartIndex = destIndex;
1121:                    }
1122:                }
1123:
1124:                outTrailCC[0] = prevCC;
1125:
1126:                return destIndex - destStart;
1127:            }
1128:
1129:            /* make NFC & NFKC ------------------------------------------------------ */
1130:            private static final class NextCombiningArgs {
1131:                char[] source;
1132:                int start;
1133:                //int limit;
1134:                char c;
1135:                char c2;
1136:                int/*unsigned*/combiningIndex;
1137:                char /*unsigned byte*/cc;
1138:            }
1139:
1140:            /* get the composition properties of the next character */
1141:            private static int /*unsigned*/getNextCombining(
1142:                    NextCombiningArgs args, int limit, UnicodeSet nx) {
1143:                long/*unsigned*/norm32;
1144:                int combineFlags;
1145:                /* get properties */
1146:                args.c = args.source[args.start++];
1147:                norm32 = getNorm32(args.c);
1148:
1149:                /* preset output values for most characters */
1150:                args.c2 = 0;
1151:                args.combiningIndex = 0;
1152:                args.cc = 0;
1153:
1154:                if ((norm32 & (CC_MASK | COMBINES_ANY)) == 0) {
1155:                    return 0;
1156:                } else {
1157:                    if (isNorm32Regular(norm32)) {
1158:                        /* set cc etc. below */
1159:                    } else if (isNorm32HangulOrJamo(norm32)) {
1160:                        /* a compatibility decomposition contained Jamos */
1161:                        args.combiningIndex = (int) ((UNSIGNED_INT_MASK) & (0xfff0 | (norm32 >> EXTRA_SHIFT)));
1162:                        return (int) (norm32 & COMBINES_ANY);
1163:                    } else {
1164:                        /* c is a lead surrogate, get the real norm32 */
1165:                        if (args.start != limit
1166:                                && UTF16
1167:                                        .isTrailSurrogate(args.c2 = args.source[args.start])) {
1168:                            ++args.start;
1169:                            norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
1170:                        } else {
1171:                            args.c2 = 0;
1172:                            return 0;
1173:                        }
1174:                    }
1175:
1176:                    if (nx_contains(nx, args.c, args.c2)) {
1177:                        return 0; /* excluded: norm32==0 */
1178:                    }
1179:
1180:                    args.cc = (char) ((norm32 >> CC_SHIFT) & 0xff);
1181:
1182:                    combineFlags = (int) (norm32 & COMBINES_ANY);
1183:                    if (combineFlags != 0) {
1184:                        int index = getExtraDataIndex(norm32);
1185:                        args.combiningIndex = index > 0 ? extraData[(index - 1)]
1186:                                : 0;
1187:                    }
1188:
1189:                    return combineFlags;
1190:                }
1191:            }
1192:
1193:            /*
1194:             * given a composition-result starter (c, c2) - which means its cc==0,
1195:             * it combines forward, it has extra data, its norm32!=0,
1196:             * it is not a Hangul or Jamo,
1197:             * get just its combineFwdIndex
1198:             *
1199:             * norm32(c) is special if and only if c2!=0
1200:             */
1201:            private static int/*unsigned*/getCombiningIndexFromStarter(char c,
1202:                    char c2) {
1203:                long/*unsigned*/norm32;
1204:
1205:                norm32 = getNorm32(c);
1206:                if (c2 != 0) {
1207:                    norm32 = getNorm32FromSurrogatePair(norm32, c2);
1208:                }
1209:                return extraData[(getExtraDataIndex(norm32) - 1)];
1210:            }
1211:
1212:            /*
1213:             * Find the recomposition result for
1214:             * a forward-combining character
1215:             * (specified with a pointer to its part of the combiningTable[])
1216:             * and a backward-combining character
1217:             * (specified with its combineBackIndex).
1218:             *
1219:             * If these two characters combine, then set (value, value2)
1220:             * with the code unit(s) of the composition character.
1221:             *
1222:             * Return value:
1223:             * 0    do not combine
1224:             * 1    combine
1225:             * >1   combine, and the composition is a forward-combining starter
1226:             *
1227:             * See unormimp.h for a description of the composition table format.
1228:             */
1229:            private static int/*unsigned*/combine(char[] table,
1230:                    int tableStart, int/*unsinged*/combineBackIndex,
1231:                    int[] outValues) {
1232:                int/*unsigned*/key;
1233:                int value, value2;
1234:
1235:                if (outValues.length < 2) {
1236:                    throw new IllegalArgumentException();
1237:                }
1238:
1239:                /* search in the starter's composition table */
1240:                for (;;) {
1241:                    key = table[tableStart++];
1242:                    if (key >= combineBackIndex) {
1243:                        break;
1244:                    }
1245:                    tableStart += ((table[tableStart] & 0x8000) != 0) ? 2 : 1;
1246:                }
1247:
1248:                /* mask off bit 15, the last-entry-in-the-list flag */
1249:                if ((key & 0x7fff) == combineBackIndex) {
1250:                    /* found! combine! */
1251:                    value = table[tableStart];
1252:
1253:                    /* is the composition a starter that combines forward? */
1254:                    key = (int) ((UNSIGNED_INT_MASK) & ((value & 0x2000) + 1));
1255:
1256:                    /* get the composition result code point from the variable-length 
1257:                     * result value 
1258:                     */
1259:                    if ((value & 0x8000) != 0) {
1260:                        if ((value & 0x4000) != 0) {
1261:                            /* surrogate pair composition result */
1262:                            value = (int) ((UNSIGNED_INT_MASK) & ((value & 0x3ff) | 0xd800));
1263:                            value2 = table[tableStart + 1];
1264:                        } else {
1265:                            /* BMP composition result U+2000..U+ffff */
1266:                            value = table[tableStart + 1];
1267:                            value2 = 0;
1268:                        }
1269:                    } else {
1270:                        /* BMP composition result U+0000..U+1fff */
1271:                        value &= 0x1fff;
1272:                        value2 = 0;
1273:                    }
1274:                    outValues[0] = value;
1275:                    outValues[1] = value2;
1276:                    return key;
1277:                } else {
1278:                    /* not found */
1279:                    return 0;
1280:                }
1281:            }
1282:
1283:            private static final class RecomposeArgs {
1284:                char[] source;
1285:                int start;
1286:                int limit;
1287:            }
1288:
1289:            /*
1290:             * recompose the characters in [p..limit[
1291:             * (which is in NFD - decomposed and canonically ordered),
1292:             * adjust limit, and return the trailing cc
1293:             *
1294:             * since for NFKC we may get Jamos in decompositions, we need to
1295:             * recompose those too
1296:             *
1297:             * note that recomposition never lengthens the text:
1298:             * any character consists of either one or two code units;
1299:             * a composition may contain at most one more code unit than the original 
1300:             * starter, while the combining mark that is removed has at least one code 
1301:             * unit
1302:             */
1303:            private static char/*unsigned byte*/recompose(RecomposeArgs args,
1304:                    int options, UnicodeSet nx) {
1305:                int remove, q, r;
1306:                int /*unsigned*/combineFlags;
1307:                int /*unsigned*/combineFwdIndex, combineBackIndex;
1308:                int /*unsigned*/result, value = 0, value2 = 0;
1309:                int /*unsigned byte*/prevCC;
1310:                boolean starterIsSupplementary;
1311:                int starter;
1312:                int[] outValues = new int[2];
1313:                starter = -1; /* no starter */
1314:                combineFwdIndex = 0; /* will not be used until starter!=NULL */
1315:                starterIsSupplementary = false; /* will not be used until starter!=NULL */
1316:                prevCC = 0;
1317:
1318:                NextCombiningArgs ncArg = new NextCombiningArgs();
1319:                ncArg.source = args.source;
1320:
1321:                ncArg.cc = 0;
1322:                ncArg.c2 = 0;
1323:
1324:                for (;;) {
1325:                    ncArg.start = args.start;
1326:                    combineFlags = getNextCombining(ncArg, args.limit, nx);
1327:                    combineBackIndex = ncArg.combiningIndex;
1328:                    args.start = ncArg.start;
1329:
1330:                    if (((combineFlags & COMBINES_BACK) != 0) && starter != -1) {
1331:                        if ((combineBackIndex & 0x8000) != 0) {
1332:                            /* c is a Jamo V/T, see if we can compose it with the 
1333:                             * previous character 
1334:                             */
1335:                            /* for the PRI #29 fix, check that there is no intervening combining mark */
1336:                            if ((options & BEFORE_PRI_29) != 0 || prevCC == 0) {
1337:                                remove = -1; /* NULL while no Hangul composition */
1338:                                combineFlags = 0;
1339:                                ncArg.c2 = args.source[starter];
1340:                                if (combineBackIndex == 0xfff2) {
1341:                                    /* Jamo V, compose with previous Jamo L and following 
1342:                                     * Jamo T 
1343:                                     */
1344:                                    ncArg.c2 = (char) (ncArg.c2 - JAMO_L_BASE);
1345:                                    if (ncArg.c2 < JAMO_L_COUNT) {
1346:                                        remove = args.start - 1;
1347:                                        ncArg.c = (char) (HANGUL_BASE + (ncArg.c2
1348:                                                * JAMO_V_COUNT + (ncArg.c - JAMO_V_BASE))
1349:                                                * JAMO_T_COUNT);
1350:                                        if (args.start != args.limit
1351:                                                && (ncArg.c2 = (char) (args.source[args.start] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1352:                                            ++args.start;
1353:                                            ncArg.c += ncArg.c2;
1354:                                        } else {
1355:                                            /* the result is an LV syllable, which is a starter (unlike LVT) */
1356:                                            combineFlags = COMBINES_FWD;
1357:                                        }
1358:                                        if (!nx_contains(nx, ncArg.c)) {
1359:                                            args.source[starter] = ncArg.c;
1360:                                        } else {
1361:                                            /* excluded */
1362:                                            if (!isHangulWithoutJamoT(ncArg.c)) {
1363:                                                --args.start; /* undo the ++args.start from reading the Jamo T */
1364:                                            }
1365:                                            /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1366:                                            remove = args.start;
1367:                                        }
1368:                                    }
1369:
1370:                                    /*
1371:                                     * Normally, the following can not occur:
1372:                                     * Since the input is in NFD, there are no Hangul LV syllables that
1373:                                     * a Jamo T could combine with.
1374:                                     * All Jamo Ts are combined above when handling Jamo Vs.
1375:                                     *
1376:                                     * However, before the PRI #29 fix, this can occur due to
1377:                                     * an intervening combining mark between the Hangul LV and the Jamo T.
1378:                                     */
1379:                                } else {
1380:                                    /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1381:                                    if (isHangulWithoutJamoT(ncArg.c2)) {
1382:                                        ncArg.c2 += ncArg.c - JAMO_T_BASE;
1383:                                        if (!nx_contains(nx, ncArg.c2)) {
1384:                                            remove = args.start - 1;
1385:                                            args.source[starter] = ncArg.c2;
1386:                                        }
1387:                                    }
1388:                                }
1389:
1390:                                if (remove != -1) {
1391:                                    /* remove the Jamo(s) */
1392:                                    q = remove;
1393:                                    r = args.start;
1394:                                    while (r < args.limit) {
1395:                                        args.source[q++] = args.source[r++];
1396:                                    }
1397:                                    args.start = remove;
1398:                                    args.limit = q;
1399:                                }
1400:
1401:                                ncArg.c2 = 0; /* c2 held *starter temporarily */
1402:
1403:                                if (combineFlags != 0) {
1404:                                    /*
1405:                                     * not starter=NULL because the composition is a Hangul LV syllable
1406:                                     * and might combine once more (but only before the PRI #29 fix)
1407:                                     */
1408:
1409:                                    /* done? */
1410:                                    if (args.start == args.limit) {
1411:                                        return (char) prevCC;
1412:                                    }
1413:
1414:                                    /* the composition is a Hangul LV syllable which is a starter that combines forward */
1415:                                    combineFwdIndex = 0xfff0;
1416:
1417:                                    /* we combined; continue with looking for compositions */
1418:                                    continue;
1419:                                }
1420:                            }
1421:
1422:                            /*
1423:                             * now: cc==0 and the combining index does not include 
1424:                             * "forward" -> the rest of the loop body will reset starter
1425:                             * to NULL; technically, a composed Hangul syllable is a 
1426:                             * starter, but it does not combine forward now that we have
1427:                             * consumed all eligible Jamos; for Jamo V/T, combineFlags 
1428:                             * does not contain _NORM_COMBINES_FWD
1429:                             */
1430:
1431:                        } else if (
1432:                        /* the starter is not a Hangul LV or Jamo V/T and */
1433:                        !((combineFwdIndex & 0x8000) != 0)
1434:                                &&
1435:                                /* the combining mark is not blocked and */
1436:                                ((options & BEFORE_PRI_29) != 0 ? (prevCC != ncArg.cc || prevCC == 0)
1437:                                        : (prevCC < ncArg.cc || prevCC == 0))
1438:                                &&
1439:                                /* the starter and the combining mark (c, c2) do combine */
1440:                                0 != (result = combine(combiningTable,
1441:                                        combineFwdIndex, combineBackIndex,
1442:                                        outValues)) &&
1443:                                /* the composition result is not excluded */
1444:                                !nx_contains(nx, (char) value, (char) value2)) {
1445:                            value = outValues[0];
1446:                            value2 = outValues[1];
1447:                            /* replace the starter with the composition, remove the 
1448:                             * combining mark 
1449:                             */
1450:                            remove = ncArg.c2 == 0 ? args.start - 1
1451:                                    : args.start - 2; /* index to the combining mark */
1452:
1453:                            /* replace the starter with the composition */
1454:                            args.source[starter] = (char) value;
1455:                            if (starterIsSupplementary) {
1456:                                if (value2 != 0) {
1457:                                    /* both are supplementary */
1458:                                    args.source[starter + 1] = (char) value2;
1459:                                } else {
1460:                                    /* the composition is shorter than the starter, 
1461:                                     * move the intermediate characters forward one */
1462:                                    starterIsSupplementary = false;
1463:                                    q = starter + 1;
1464:                                    r = q + 1;
1465:                                    while (r < remove) {
1466:                                        args.source[q++] = args.source[r++];
1467:                                    }
1468:                                    --remove;
1469:                                }
1470:                            } else if (value2 != 0) {
1471:                                /* the composition is longer than the starter, 
1472:                                 * move the intermediate characters back one */
1473:                                starterIsSupplementary = true;
1474:                                /* temporarily increment for the loop boundary */
1475:                                ++starter;
1476:                                q = remove;
1477:                                r = ++remove;
1478:                                while (starter < q) {
1479:                                    args.source[--r] = args.source[--q];
1480:                                }
1481:                                args.source[starter] = (char) value2;
1482:                                --starter; /* undo the temporary increment */
1483:                                /* } else { both are on the BMP, nothing more to do */
1484:                            }
1485:
1486:                            /* remove the combining mark by moving the following text 
1487:                             * over it */
1488:                            if (remove < args.start) {
1489:                                q = remove;
1490:                                r = args.start;
1491:                                while (r < args.limit) {
1492:                                    args.source[q++] = args.source[r++];
1493:                                }
1494:                                args.start = remove;
1495:                                args.limit = q;
1496:                            }
1497:
1498:                            /* keep prevCC because we removed the combining mark */
1499:
1500:                            /* done? */
1501:                            if (args.start == args.limit) {
1502:                                return (char) prevCC;
1503:                            }
1504:
1505:                            /* is the composition a starter that combines forward? */
1506:                            if (result > 1) {
1507:                                combineFwdIndex = getCombiningIndexFromStarter(
1508:                                        (char) value, (char) value2);
1509:                            } else {
1510:                                starter = -1;
1511:                            }
1512:
1513:                            /* we combined; continue with looking for compositions */
1514:                            continue;
1515:                        }
1516:                    }
1517:
1518:                    /* no combination this time */
1519:                    prevCC = ncArg.cc;
1520:                    if (args.start == args.limit) {
1521:                        return (char) prevCC;
1522:                    }
1523:
1524:                    /* if (c, c2) did not combine, then check if it is a starter */
1525:                    if (ncArg.cc == 0) {
1526:                        /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1527:                        if ((combineFlags & COMBINES_FWD) != 0) {
1528:                            /* it may combine with something, prepare for it */
1529:                            if (ncArg.c2 == 0) {
1530:                                starterIsSupplementary = false;
1531:                                starter = args.start - 1;
1532:                            } else {
1533:                                starterIsSupplementary = false;
1534:                                starter = args.start - 2;
1535:                            }
1536:                            combineFwdIndex = combineBackIndex;
1537:                        } else {
1538:                            /* it will not combine with anything */
1539:                            starter = -1;
1540:                        }
1541:                    } else if ((options & OPTIONS_COMPOSE_CONTIGUOUS) != 0) {
1542:                        /* FCC: no discontiguous compositions; any intervening character blocks */
1543:                        starter = -1;
1544:                    }
1545:                }
1546:            }
1547:
1548:            // find the last true starter between src[start]....src[current] going 
1549:            // backwards and return its index
1550:            private static int findPreviousStarter(char[] src, int srcStart,
1551:                    int current, int/*unsigned*/ccOrQCMask,
1552:                    int/*unsigned*/decompQCMask, char minNoMaybe) {
1553:                long norm32;
1554:                PrevArgs args = new PrevArgs();
1555:                args.src = src;
1556:                args.start = srcStart;
1557:                args.current = current;
1558:
1559:                while (args.start < args.current) {
1560:                    norm32 = getPrevNorm32(args, minNoMaybe, ccOrQCMask
1561:                            | decompQCMask);
1562:                    if (isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1563:                        break;
1564:                    }
1565:                }
1566:                return args.current;
1567:            }
1568:
1569:            /* find the first true starter in [src..limit[ and return the 
1570:             * pointer to it 
1571:             */
1572:            private static int/*index*/findNextStarter(char[] src, int start,
1573:                    int limit, int/*unsigned*/qcMask,
1574:                    int/*unsigned*/decompQCMask, char minNoMaybe) {
1575:                int p;
1576:                long/*unsigned*/norm32;
1577:                int ccOrQCMask;
1578:                char c, c2;
1579:
1580:                ccOrQCMask = CC_MASK | qcMask;
1581:
1582:                DecomposeArgs decompArgs = new DecomposeArgs();
1583:
1584:                for (;;) {
1585:                    if (start == limit) {
1586:                        break; /* end of string */
1587:                    }
1588:                    c = src[start];
1589:                    if (c < minNoMaybe) {
1590:                        break; /* catches NUL terminater, too */
1591:                    }
1592:
1593:                    norm32 = getNorm32(c);
1594:                    if ((norm32 & ccOrQCMask) == 0) {
1595:                        break; /* true starter */
1596:                    }
1597:
1598:                    if (isNorm32LeadSurrogate(norm32)) {
1599:                        /* c is a lead surrogate, get the real norm32 */
1600:                        if ((start + 1) == limit
1601:                                || !UTF16
1602:                                        .isTrailSurrogate(c2 = (src[start + 1]))) {
1603:                            /* unmatched first surrogate: counts as a true starter */
1604:                            break;
1605:                        }
1606:                        norm32 = getNorm32FromSurrogatePair(norm32, c2);
1607:
1608:                        if ((norm32 & ccOrQCMask) == 0) {
1609:                            break; /* true starter */
1610:                        }
1611:                    } else {
1612:                        c2 = 0;
1613:                    }
1614:
1615:                    /* (c, c2) is not a true starter but its decomposition may be */
1616:                    if ((norm32 & decompQCMask) != 0) {
1617:                        /* (c, c2) decomposes, get everything from the variable-length
1618:                         *  extra data */
1619:                        p = decompose(norm32, decompQCMask, decompArgs);
1620:
1621:                        /* get the first character's norm32 to check if it is a true 
1622:                         * starter */
1623:                        if (decompArgs.cc == 0
1624:                                && (getNorm32(extraData, p, qcMask) & qcMask) == 0) {
1625:                            break; /* true starter */
1626:                        }
1627:                    }
1628:
1629:                    start += c2 == 0 ? 1 : 2; /* not a true starter, continue */
1630:                }
1631:
1632:                return start;
1633:            }
1634:
1635:            private static final class ComposePartArgs {
1636:                int prevCC;
1637:                int length; /* length of decomposed part */
1638:            }
1639:
1640:            /* decompose and recompose [prevStarter..src[ */
1641:            private static char[] composePart(ComposePartArgs args,
1642:                    int prevStarter, char[] src, int start, int limit,
1643:                    int options, UnicodeSet nx) {
1644:                int recomposeLimit;
1645:                boolean compat = ((options & OPTIONS_COMPAT) != 0);
1646:
1647:                /* decompose [prevStarter..src[ */
1648:                int[] outTrailCC = new int[1];
1649:                char[] buffer = new char[(limit - prevStarter)
1650:                        * MAX_BUFFER_SIZE];
1651:
1652:                for (;;) {
1653:                    args.length = decompose(src, prevStarter, (start), buffer,
1654:                            0, buffer.length, compat, outTrailCC, nx);
1655:                    if (args.length <= buffer.length) {
1656:                        break;
1657:                    } else {
1658:                        buffer = new char[args.length];
1659:                    }
1660:                }
1661:
1662:                /* recompose the decomposition */
1663:                recomposeLimit = args.length;
1664:
1665:                if (args.length >= 2) {
1666:                    RecomposeArgs rcArgs = new RecomposeArgs();
1667:                    rcArgs.source = buffer;
1668:                    rcArgs.start = 0;
1669:                    rcArgs.limit = recomposeLimit;
1670:                    args.prevCC = recompose(rcArgs, options, nx);
1671:                    recomposeLimit = rcArgs.limit;
1672:                }
1673:
1674:                /* return with a pointer to the recomposition and its length */
1675:                args.length = recomposeLimit;
1676:                return buffer;
1677:            }
1678:
1679:            private static boolean composeHangul(char prev, char c,
1680:                    long/*unsigned*/norm32, char[] src, int[] srcIndex,
1681:                    int limit, boolean compat, char[] dest, int destIndex,
1682:                    UnicodeSet nx) {
1683:                int start = srcIndex[0];
1684:                if (isJamoVTNorm32JamoV(norm32)) {
1685:                    /* c is a Jamo V, compose with previous Jamo L and 
1686:                     * following Jamo T */
1687:                    prev = (char) (prev - JAMO_L_BASE);
1688:                    if (prev < JAMO_L_COUNT) {
1689:                        c = (char) (HANGUL_BASE + (prev * JAMO_V_COUNT + (c - JAMO_V_BASE))
1690:                                * JAMO_T_COUNT);
1691:
1692:                        /* check if the next character is a Jamo T (normal or 
1693:                         * compatibility) */
1694:                        if (start != limit) {
1695:                            char next, t;
1696:
1697:                            next = src[start];
1698:                            if ((t = (char) (next - JAMO_T_BASE)) < JAMO_T_COUNT) {
1699:                                /* normal Jamo T */
1700:                                ++start;
1701:                                c += t;
1702:                            } else if (compat) {
1703:                                /* if NFKC, then check for compatibility Jamo T 
1704:                                 * (BMP only) */
1705:                                norm32 = getNorm32(next);
1706:                                if (isNorm32Regular(norm32)
1707:                                        && ((norm32 & QC_NFKD) != 0)) {
1708:                                    int p /*index into extra data array*/;
1709:                                    DecomposeArgs dcArgs = new DecomposeArgs();
1710:                                    p = decompose(norm32, QC_NFKD, dcArgs);
1711:                                    if (dcArgs.length == 1
1712:                                            && (t = (char) (extraData[p] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1713:                                        /* compatibility Jamo T */
1714:                                        ++start;
1715:                                        c += t;
1716:                                    }
1717:                                }
1718:                            }
1719:                        }
1720:                        if (nx_contains(nx, c)) {
1721:                            if (!isHangulWithoutJamoT(c)) {
1722:                                --start; /* undo ++start from reading the Jamo T */
1723:                            }
1724:                            return false;
1725:                        }
1726:                        dest[destIndex] = c;
1727:                        srcIndex[0] = start;
1728:                        return true;
1729:                    }
1730:                } else if (isHangulWithoutJamoT(prev)) {
1731:                    /* c is a Jamo T, compose with previous Hangul LV that does not 
1732:                     * contain a Jamo T */
1733:                    c = (char) (prev + (c - JAMO_T_BASE));
1734:                    if (nx_contains(nx, c)) {
1735:                        return false;
1736:                    }
1737:                    dest[destIndex] = c;
1738:                    srcIndex[0] = start;
1739:                    return true;
1740:                }
1741:                return false;
1742:            }
1743:
1744:            /*
1745:            public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1746:                return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1747:            }
1748:             */
1749:
1750:            public static int compose(char[] src, int srcStart, int srcLimit,
1751:                    char[] dest, int destStart, int destLimit, int options,
1752:                    UnicodeSet nx) {
1753:
1754:                int prevSrc, prevStarter;
1755:                long/*unsigned*/norm32;
1756:                int ccOrQCMask, qcMask;
1757:                int reorderStartIndex, length;
1758:                char c, c2, minNoMaybe;
1759:                int/*unsigned byte*/cc, prevCC;
1760:                int[] ioIndex = new int[1];
1761:                int destIndex = destStart;
1762:                int srcIndex = srcStart;
1763:
1764:                if ((options & OPTIONS_COMPAT) != 0) {
1765:                    minNoMaybe = (char) indexes[INDEX_MIN_NFKC_NO_MAYBE];
1766:                    qcMask = QC_NFKC;
1767:                } else {
1768:                    minNoMaybe = (char) indexes[INDEX_MIN_NFC_NO_MAYBE];
1769:                    qcMask = QC_NFC;
1770:                }
1771:
1772:                /*
1773:                 * prevStarter points to the last character before the current one
1774:                 * that is a "true" starter with cc==0 and quick check "yes".
1775:                 *
1776:                 * prevStarter will be used instead of looking for a true starter
1777:                 * while incrementally decomposing [prevStarter..prevSrc[
1778:                 * in _composePart(). Having a good prevStarter allows to just decompose
1779:                 * the entire [prevStarter..prevSrc[.
1780:                 *
1781:                 * When _composePart() backs out from prevSrc back to prevStarter,
1782:                 * then it also backs out destIndex by the same amount.
1783:                 * Therefore, at all times, the (prevSrc-prevStarter) source units
1784:                 * must correspond 1:1 to destination units counted with destIndex,
1785:                 * except for reordering.
1786:                 * This is true for the qc "yes" characters copied in the fast loop,
1787:                 * and for pure reordering.
1788:                 * prevStarter must be set forward to src when this is not true:
1789:                 * In _composePart() and after composing a Hangul syllable.
1790:                 *
1791:                 * This mechanism relies on the assumption that the decomposition of a 
1792:                 * true starter also begins with a true starter. gennorm/store.c checks 
1793:                 * for this.
1794:                 */
1795:                prevStarter = srcIndex;
1796:
1797:                ccOrQCMask = CC_MASK | qcMask;
1798:                /*destIndex=*/reorderStartIndex = 0;/* ####TODO#### check this **/
1799:                prevCC = 0;
1800:
1801:                /* avoid compiler warnings */
1802:                norm32 = 0;
1803:                c = 0;
1804:
1805:                for (;;) {
1806:                    /* count code units below the minimum or with irrelevant data for 
1807:                     * the quick check */
1808:                    prevSrc = srcIndex;
1809:
1810:                    while (srcIndex != srcLimit
1811:                            && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
1812:                        prevCC = 0;
1813:                        ++srcIndex;
1814:                    }
1815:
1816:                    /* copy these code units all at once */
1817:                    if (srcIndex != prevSrc) {
1818:                        length = (int) (srcIndex - prevSrc);
1819:                        if ((destIndex + length) <= destLimit) {
1820:                            System.arraycopy(src, prevSrc, dest, destIndex,
1821:                                    length);
1822:                        }
1823:                        destIndex += length;
1824:                        reorderStartIndex = destIndex;
1825:
1826:                        /* set prevStarter to the last character in the quick check 
1827:                         * loop */
1828:                        prevStarter = srcIndex - 1;
1829:                        if (UTF16.isTrailSurrogate(src[prevStarter])
1830:                                && prevSrc < prevStarter
1831:                                && UTF16
1832:                                        .isLeadSurrogate(src[(prevStarter - 1)])) {
1833:                            --prevStarter;
1834:                        }
1835:
1836:                        prevSrc = srcIndex;
1837:                    }
1838:
1839:                    /* end of source reached? */
1840:                    if (srcIndex == srcLimit) {
1841:                        break;
1842:                    }
1843:
1844:                    /* c already contains *src and norm32 is set for it, increment src*/
1845:                    ++srcIndex;
1846:
1847:                    /*
1848:                     * source buffer pointers:
1849:                     *
1850:                     *  all done      quick check   current char  not yet
1851:                     *                "yes" but     (c, c2)       processed
1852:                     *                may combine
1853:                     *                forward
1854:                     * [-------------[-------------[-------------[-------------[
1855:                     * |             |             |             |             |
1856:                     * start         prevStarter   prevSrc       src           limit
1857:                     *
1858:                     *
1859:                     * destination buffer pointers and indexes:
1860:                     *
1861:                     *  all done      might take    not filled yet
1862:                     *                characters for
1863:                     *                reordering
1864:                     * [-------------[-------------[-------------[
1865:                     * |             |             |             |
1866:                     * dest      reorderStartIndex destIndex     destCapacity
1867:                     */
1868:
1869:                    /* check one above-minimum, relevant code unit */
1870:                    /*
1871:                     * norm32 is for c=*(src-1), and the quick check flag is "no" or 
1872:                     * "maybe", and/or cc!=0
1873:                     * check for Jamo V/T, then for surrogates and regular characters
1874:                     * c is not a Hangul syllable or Jamo L because
1875:                     * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
1876:                     */
1877:                    if (isNorm32HangulOrJamo(norm32)) {
1878:                        /*
1879:                         * c is a Jamo V/T:
1880:                         * try to compose with the previous character, Jamo V also with 
1881:                         * a following Jamo T, and set values here right now in case we 
1882:                         * just continue with the main loop
1883:                         */
1884:                        prevCC = cc = 0;
1885:                        reorderStartIndex = destIndex;
1886:                        ioIndex[0] = srcIndex;
1887:                        if (destIndex > 0
1888:                                && composeHangul(src[(prevSrc - 1)], c, norm32,
1889:                                        src, ioIndex, srcLimit,
1890:                                        (options & OPTIONS_COMPAT) != 0, dest,
1891:                                        destIndex <= destLimit ? destIndex - 1
1892:                                                : 0, nx)) {
1893:                            srcIndex = ioIndex[0];
1894:                            prevStarter = srcIndex;
1895:                            continue;
1896:                        }
1897:
1898:                        srcIndex = ioIndex[0];
1899:
1900:                        /* the Jamo V/T did not compose into a Hangul syllable, just 
1901:                         * append to dest */
1902:                        c2 = 0;
1903:                        length = 1;
1904:                        prevStarter = prevSrc;
1905:                    } else {
1906:                        if (isNorm32Regular(norm32)) {
1907:                            c2 = 0;
1908:                            length = 1;
1909:                        } else {
1910:                            /* c is a lead surrogate, get the real norm32 */
1911:                            if (srcIndex != srcLimit
1912:                                    && UTF16
1913:                                            .isTrailSurrogate(c2 = src[srcIndex])) {
1914:                                ++srcIndex;
1915:                                length = 2;
1916:                                norm32 = getNorm32FromSurrogatePair(norm32, c2);
1917:                            } else {
1918:                                /* c is an unpaired lead surrogate, nothing to do */
1919:                                c2 = 0;
1920:                                length = 1;
1921:                                norm32 = 0;
1922:                            }
1923:                        }
1924:                        ComposePartArgs args = new ComposePartArgs();
1925:
1926:                        /* we are looking at the character (c, c2) at [prevSrc..src[ */
1927:                        if (nx_contains(nx, c, c2)) {
1928:                            /* excluded: norm32==0 */
1929:                            cc = 0;
1930:                        } else if ((norm32 & qcMask) == 0) {
1931:                            cc = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1932:                        } else {
1933:                            char[] p;
1934:
1935:                            /*
1936:                             * find appropriate boundaries around this character,
1937:                             * decompose the source text from between the boundaries,
1938:                             * and recompose it
1939:                             *
1940:                             * this puts the intermediate text into the side buffer because
1941:                             * it might be longer than the recomposition end result,
1942:                             * or the destination buffer may be too short or missing
1943:                             *
1944:                             * note that destIndex may be adjusted backwards to account
1945:                             * for source text that passed the quick check but needed to
1946:                             * take part in the recomposition
1947:                             */
1948:                            int decompQCMask = (qcMask << 2) & 0xf; /* decomposition quick check mask */
1949:                            /*
1950:                             * find the last true starter in [prevStarter..src[
1951:                             * it is either the decomposition of the current character (at prevSrc),
1952:                             * or prevStarter
1953:                             */
1954:                            if (isTrueStarter(norm32, CC_MASK | qcMask,
1955:                                    decompQCMask)) {
1956:                                prevStarter = prevSrc;
1957:                            } else {
1958:                                /* adjust destIndex: back out what had been copied with qc "yes" */
1959:                                destIndex -= prevSrc - prevStarter;
1960:                            }
1961:
1962:                            /* find the next true starter in [src..limit[ */
1963:                            srcIndex = findNextStarter(src, srcIndex, srcLimit,
1964:                                    qcMask, decompQCMask, minNoMaybe);
1965:                            //args.prevStarter = prevStarter;
1966:                            args.prevCC = prevCC;
1967:                            //args.destIndex = destIndex;
1968:                            args.length = length;
1969:                            p = composePart(args, prevStarter, src, srcIndex,
1970:                                    srcLimit, options, nx);
1971:
1972:                            if (p == null) {
1973:                                /* an error occurred (out of memory) */
1974:                                break;
1975:                            }
1976:
1977:                            prevCC = args.prevCC;
1978:                            length = args.length;
1979:
1980:                            /* append the recomposed buffer contents to the destination 
1981:                             * buffer */
1982:                            if ((destIndex + args.length) <= destLimit) {
1983:                                int i = 0;
1984:                                while (i < args.length) {
1985:                                    dest[destIndex++] = p[i++];
1986:                                    --length;
1987:                                }
1988:                            } else {
1989:                                /* buffer overflow */
1990:                                /* keep incrementing the destIndex for preflighting */
1991:                                destIndex += length;
1992:                            }
1993:
1994:                            prevStarter = srcIndex;
1995:                            continue;
1996:                        }
1997:                    }
1998:
1999:                    /* append the single code point (c, c2) to the destination buffer */
2000:                    if ((destIndex + length) <= destLimit) {
2001:                        if (cc != 0 && cc < prevCC) {
2002:                            /* (c, c2) is out of order with respect to the preceding 
2003:                             * text */
2004:                            int reorderSplit = destIndex;
2005:                            destIndex += length;
2006:                            prevCC = insertOrdered(dest, reorderStartIndex,
2007:                                    reorderSplit, destIndex, c, c2, cc);
2008:                        } else {
2009:                            /* just append (c, c2) */
2010:                            dest[destIndex++] = c;
2011:                            if (c2 != 0) {
2012:                                dest[destIndex++] = c2;
2013:                            }
2014:                            prevCC = cc;
2015:                        }
2016:                    } else {
2017:                        /* buffer overflow */
2018:                        /* keep incrementing the destIndex for preflighting */
2019:                        destIndex += length;
2020:                        prevCC = cc;
2021:                    }
2022:                }
2023:
2024:                return destIndex - destStart;
2025:            }
2026:
2027:            public static int getCombiningClass(int c) {
2028:                long norm32;
2029:                norm32 = getNorm32(c);
2030:                return (char) ((norm32 >> CC_SHIFT) & 0xFF);
2031:            }
2032:
2033:            public static boolean isFullCompositionExclusion(int c) {
2034:                if (isFormatVersion_2_1) {
2035:                    int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2036:                    return (boolean) ((aux & AUX_COMP_EX_MASK) != 0);
2037:                } else {
2038:                    return false;
2039:                }
2040:            }
2041:
2042:            public static boolean isCanonSafeStart(int c) {
2043:                if (isFormatVersion_2_1) {
2044:                    int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2045:                    return (boolean) ((aux & AUX_UNSAFE_MASK) == 0);
2046:                } else {
2047:                    return false;
2048:                }
2049:            }
2050:
2051:            /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2052:            public static boolean isNFSkippable(int c,
2053:                    NormalizerBase.Mode mode, long mask) {
2054:                long /*unsigned int*/norm32;
2055:                mask = mask & UNSIGNED_INT_MASK;
2056:                char aux;
2057:
2058:                /* check conditions (a)..(e), see unormimp.h */
2059:                norm32 = getNorm32(c);
2060:
2061:                if ((norm32 & mask) != 0) {
2062:                    return false; /* fails (a)..(e), not skippable */
2063:                }
2064:
2065:                if (mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD
2066:                        || mode == NormalizerBase.NONE) {
2067:                    return true; /* NF*D, passed (a)..(c), is skippable */
2068:                }
2069:                /* check conditions (a)..(e), see unormimp.h */
2070:
2071:                /* NF*C/FCC, passed (a)..(e) */
2072:                if ((norm32 & QC_NFD) == 0) {
2073:                    return true; /* no canonical decomposition, is skippable */
2074:                }
2075:
2076:                /* check Hangul syllables algorithmically */
2077:                if (isNorm32HangulOrJamo(norm32)) {
2078:                    /* Jamo passed (a)..(e) above, must be Hangul */
2079:                    return !isHangulWithoutJamoT((char) c); /* LVT are skippable, LV are not */
2080:                }
2081:
2082:                /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2083:                /* NF*C, test (f) flag */
2084:                if (!isFormatVersion_2_2) {
2085:                    return false; /* no (f) data, say not skippable to be safe */
2086:                }
2087:
2088:                aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2089:                return (aux & AUX_NFC_SKIP_F_MASK) == 0; /* TRUE=skippable if the (f) flag is not set */
2090:
2091:                /* } else { FCC, test fcd<=1 instead of the above } */
2092:            }
2093:
2094:            public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2095:                int c;
2096:
2097:                /* add the start code point of each same-value range of each trie */
2098:                //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2099:                TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2100:                RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2101:
2102:                while (normIter.next(normResult)) {
2103:                    set.add(normResult.start);
2104:                }
2105:
2106:                //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2107:                TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2108:                RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2109:
2110:                while (fcdIter.next(fcdResult)) {
2111:                    set.add(fcdResult.start);
2112:                }
2113:
2114:                if (isFormatVersion_2_1) {
2115:                    //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2116:                    TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2117:                    RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2118:                    while (auxIter.next(auxResult)) {
2119:                        set.add(auxResult.start);
2120:                    }
2121:                }
2122:                /* add Hangul LV syllables and LV+1 because of skippables */
2123:                for (c = HANGUL_BASE; c < HANGUL_BASE + HANGUL_COUNT; c += JAMO_T_COUNT) {
2124:                    set.add(c);
2125:                    set.add(c + 1);
2126:                }
2127:                set.add(HANGUL_BASE + HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2128:                return set; // for chaining
2129:            }
2130:
2131:            /**
2132:             * Internal API, used in UCharacter.getIntPropertyValue().
2133:             * @internal
2134:             * @param c code point
2135:             * @param modeValue numeric value compatible with Mode
2136:             * @return numeric value compatible with QuickCheck
2137:             */
2138:            public static final int quickCheck(int c, int modeValue) {
2139:                final int qcMask[/*UNORM_MODE_COUNT*/] = { 0, 0, QC_NFD,
2140:                        QC_NFKD, QC_NFC, QC_NFKC };
2141:
2142:                int norm32 = (int) getNorm32(c) & qcMask[modeValue];
2143:
2144:                if (norm32 == 0) {
2145:                    return 1; // YES
2146:                } else if ((norm32 & QC_ANY_NO) != 0) {
2147:                    return 0; // NO
2148:                } else /* _NORM_QC_ANY_MAYBE */{
2149:                    return 2; // MAYBE;
2150:                }
2151:            }
2152:
2153:            private static int strCompare(char[] s1, int s1Start, int s1Limit,
2154:                    char[] s2, int s2Start, int s2Limit, boolean codePointOrder) {
2155:
2156:                int start1, start2, limit1, limit2;
2157:
2158:                char c1, c2;
2159:
2160:                /* setup for fix-up */
2161:                start1 = s1Start;
2162:                start2 = s2Start;
2163:
2164:                int length1, length2;
2165:
2166:                length1 = s1Limit - s1Start;
2167:                length2 = s2Limit - s2Start;
2168:
2169:                int lengthResult;
2170:
2171:                if (length1 < length2) {
2172:                    lengthResult = -1;
2173:                    limit1 = start1 + length1;
2174:                } else if (length1 == length2) {
2175:                    lengthResult = 0;
2176:                    limit1 = start1 + length1;
2177:                } else /* length1>length2 */{
2178:                    lengthResult = 1;
2179:                    limit1 = start1 + length2;
2180:                }
2181:
2182:                if (s1 == s2) {
2183:                    return lengthResult;
2184:                }
2185:
2186:                for (;;) {
2187:                    /* check pseudo-limit */
2188:                    if (s1Start == limit1) {
2189:                        return lengthResult;
2190:                    }
2191:
2192:                    c1 = s1[s1Start];
2193:                    c2 = s2[s2Start];
2194:                    if (c1 != c2) {
2195:                        break;
2196:                    }
2197:                    ++s1Start;
2198:                    ++s2Start;
2199:                }
2200:
2201:                /* setup for fix-up */
2202:                limit1 = start1 + length1;
2203:                limit2 = start2 + length2;
2204:
2205:                /* if both values are in or above the surrogate range, fix them up */
2206:                if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
2207:                    /* subtract 0x2800 from BMP code points to make them smaller than
2208:                     *  supplementary ones */
2209:                    if ((c1 <= 0xdbff && (s1Start + 1) != limit1 && UTF16
2210:                            .isTrailSurrogate(s1[(s1Start + 1)]))
2211:                            || (UTF16.isTrailSurrogate(c1) && start1 != s1Start && UTF16
2212:                                    .isLeadSurrogate(s1[(s1Start - 1)]))) {
2213:                        /* part of a surrogate pair, leave >=d800 */
2214:                    } else {
2215:                        /* BMP code point - may be surrogate code point - make <d800 */
2216:                        c1 -= 0x2800;
2217:                    }
2218:
2219:                    if ((c2 <= 0xdbff && (s2Start + 1) != limit2 && UTF16
2220:                            .isTrailSurrogate(s2[(s2Start + 1)]))
2221:                            || (UTF16.isTrailSurrogate(c2) && start2 != s2Start && UTF16
2222:                                    .isLeadSurrogate(s2[(s2Start - 1)]))) {
2223:                        /* part of a surrogate pair, leave >=d800 */
2224:                    } else {
2225:                        /* BMP code point - may be surrogate code point - make <d800 */
2226:                        c2 -= 0x2800;
2227:                    }
2228:                }
2229:
2230:                /* now c1 and c2 are in UTF-32-compatible order */
2231:                return (int) c1 - (int) c2;
2232:            }
2233:
2234:            /*
2235:             * Status of tailored normalization
2236:             *
2237:             * This was done initially for investigation on Unicode public review issue 7
2238:             * (http://www.unicode.org/review/). See Jitterbug 2481.
2239:             * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
2240:             * a permanent feature in ICU 2.6 in support of IDNA which requires true
2241:             * Unicode 3.2 normalization.
2242:             * (NormalizationCorrections are rolled into IDNA mapping tables.)
2243:             *
2244:             * Tailored normalization as implemented here makes it possible to
2245:             * "normalize less" than full Unicode normalization would.
2246:             * Based internally on a UnicodeSet of code points that are
2247:             * "excluded from normalization", the normalization functions leave those
2248:             * code points alone ("inert"). This means that tailored normalization
2249:             * still transforms text into a canonically equivalent form.
2250:             * It does not add decompositions to code points that do not have any or
2251:             * change decomposition results.
2252:             *
2253:             * Any function that searches for a safe boundary has not been touched,
2254:             * which means that these functions will be over-pessimistic when
2255:             * exclusions are applied.
2256:             * This should not matter because subsequent checks and normalizations
2257:             * do apply the exclusions; only a little more of the text may be processed
2258:             * than necessary under exclusions.
2259:             *
2260:             * Normalization exclusions have the following effect on excluded code points c:
2261:             * - c is not decomposed
2262:             * - c is not a composition target
2263:             * - c does not combine forward or backward for composition
2264:             *   except that this is not implemented for Jamo
2265:             * - c is treated as having a combining class of 0
2266:             */
2267:
            /* 
             * Constants for the bit fields in the options bit set parameter. 
             * These need not be public. 
             * A user only needs to know the currently assigned values. 
             * The number and positions of reserved bits per field can remain private. 
             */
            /* Low five bits (0x1f) of the options word; NX_HANGUL and NX_CJK_COMPAT lie inside this field. */
            private static final int OPTIONS_NX_MASK = 0x1f;
            /* Bits 5..7 (0xe0); internalGetNXUnicode() masks the options with this to pick a Unicode-version set. */
            private static final int OPTIONS_UNICODE_MASK = 0xe0;
            /* Union of the two fields above (0x1f | 0xe0); getNX() and internalGetNX() mask the options with it. */
            public static final int OPTIONS_SETS_MASK = 0xff;
            /* Bit position at which the OPTIONS_UNICODE_MASK field starts (0xe0 == 0x7 << 5). */
            private static final int OPTIONS_UNICODE_SHIFT = 5;
            /* Cache of exclusion sets, filled on demand and indexed by the options value masked with OPTIONS_SETS_MASK. */
            private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK + 1];

            /* Constants for options flags for normalization.*/

            /** 
             * Options bit 0, do not decompose Hangul syllables. 
             * @draft ICU 2.6 
             */
            private static final int NX_HANGUL = 1;
            /** 
             * Options bit 1, do not decompose CJK compatibility characters.
             * @draft ICU 2.6 
             */
            private static final int NX_CJK_COMPAT = 2;
            /**
             * Options bit 8, use buggy recomposition described in
             * Unicode Public Review Issue #29
             * at http://www.unicode.org/review/resolved-pri.html#pri29
             *
             * Used in IDNA implementation according to strict interpretation
             * of IDNA definition based on Unicode 3.2 which predates PRI #29.
             *
             * See ICU4C unormimp.h
             * 
             * @draft ICU 3.2
             */
            public static final int BEFORE_PRI_29 = 0x100;

            /*
             * The following options are used only in some composition functions.
             * They use bits 12 and up to preserve lower bits for the available options
             * space in unorm_compare() -
             * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
             */

            /** Options bit 12, for compatibility vs. canonical decomposition. */
            public static final int OPTIONS_COMPAT = 0x1000;
            /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
            public static final int OPTIONS_COMPOSE_CONTIGUOUS = 0x2000;
2317:
2318:            /* normalization exclusion sets --------------------------------------------- */
2319:
2320:            /*
2321:             * Normalization exclusion UnicodeSets are used for tailored normalization;
2322:             * see the "Status of tailored normalization" comment earlier in this file.
2323:             *
2324:             * By specifying one or several sets of code points,
2325:             * those code points become inert for normalization.
2326:             */
2327:            private static final synchronized UnicodeSet internalGetNXHangul() {
2328:                /* internal function, does not check for incoming U_FAILURE */
2329:
2330:                if (nxCache[NX_HANGUL] == null) {
2331:                    nxCache[NX_HANGUL] = new UnicodeSet(0xac00, 0xd7a3);
2332:                }
2333:                return nxCache[NX_HANGUL];
2334:            }
2335:
2336:            private static final synchronized UnicodeSet internalGetNXCJKCompat() {
2337:                /* internal function, does not check for incoming U_FAILURE */
2338:
2339:                if (nxCache[NX_CJK_COMPAT] == null) {
2340:
2341:                    /* build a set from [CJK Ideographs]&[has canonical decomposition] */
2342:                    UnicodeSet set, hasDecomp;
2343:
2344:                    set = new UnicodeSet("[:Ideographic:]");
2345:
2346:                    /* start with an empty set for [has canonical decomposition] */
2347:                    hasDecomp = new UnicodeSet();
2348:
2349:                    /* iterate over all ideographs and remember which canonically decompose */
2350:                    UnicodeSetIterator it = new UnicodeSetIterator(set);
2351:                    int start, end;
2352:                    long norm32;
2353:
2354:                    while (it.nextRange()
2355:                            && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
2356:                        start = it.codepoint;
2357:                        end = it.codepointEnd;
2358:                        while (start <= end) {
2359:                            norm32 = getNorm32(start);
2360:                            if ((norm32 & QC_NFD) > 0) {
2361:                                hasDecomp.add(start);
2362:                            }
2363:                            ++start;
2364:                        }
2365:                    }
2366:
2367:                    /* hasDecomp now contains all ideographs that decompose canonically */
2368:                    nxCache[NX_CJK_COMPAT] = hasDecomp;
2369:
2370:                }
2371:
2372:                return nxCache[NX_CJK_COMPAT];
2373:            }
2374:
2375:            private static final synchronized UnicodeSet internalGetNXUnicode(
2376:                    int options) {
2377:                options &= OPTIONS_UNICODE_MASK;
2378:                if (options == 0) {
2379:                    return null;
2380:                }
2381:
2382:                if (nxCache[options] == null) {
2383:                    /* build a set with all code points that were not designated by the specified Unicode version */
2384:                    UnicodeSet set = new UnicodeSet();
2385:
2386:                    switch (options) {
2387:                    case NormalizerBase.UNICODE_3_2:
2388:                        set.applyPattern("[:^Age=3.2:]");
2389:                        break;
2390:                    default:
2391:                        return null;
2392:                    }
2393:
2394:                    nxCache[options] = set;
2395:                }
2396:
2397:                return nxCache[options];
2398:            }
2399:
2400:            /* Get a decomposition exclusion set. The data must be loaded. */
2401:            private static final synchronized UnicodeSet internalGetNX(
2402:                    int options) {
2403:                options &= OPTIONS_SETS_MASK;
2404:
2405:                if (nxCache[options] == null) {
2406:                    /* return basic sets */
2407:                    if (options == NX_HANGUL) {
2408:                        return internalGetNXHangul();
2409:                    }
2410:                    if (options == NX_CJK_COMPAT) {
2411:                        return internalGetNXCJKCompat();
2412:                    }
2413:                    if ((options & OPTIONS_UNICODE_MASK) != 0
2414:                            && (options & OPTIONS_NX_MASK) == 0) {
2415:                        return internalGetNXUnicode(options);
2416:                    }
2417:
2418:                    /* build a set from multiple subsets */
2419:                    UnicodeSet set;
2420:                    UnicodeSet other;
2421:
2422:                    set = new UnicodeSet();
2423:
2424:                    if ((options & NX_HANGUL) != 0
2425:                            && null != (other = internalGetNXHangul())) {
2426:                        set.addAll(other);
2427:                    }
2428:                    if ((options & NX_CJK_COMPAT) != 0
2429:                            && null != (other = internalGetNXCJKCompat())) {
2430:                        set.addAll(other);
2431:                    }
2432:                    if ((options & OPTIONS_UNICODE_MASK) != 0
2433:                            && null != (other = internalGetNXUnicode(options))) {
2434:                        set.addAll(other);
2435:                    }
2436:
2437:                    nxCache[options] = set;
2438:                }
2439:                return nxCache[options];
2440:            }
2441:
2442:            public static final UnicodeSet getNX(int options) {
2443:                if ((options &= OPTIONS_SETS_MASK) == 0) {
2444:                    /* incoming failure, or no decomposition exclusions requested */
2445:                    return null;
2446:                } else {
2447:                    return internalGetNX(options);
2448:                }
2449:            }
2450:
2451:            private static final boolean nx_contains(UnicodeSet nx, int c) {
2452:                return nx != null && nx.contains(c);
2453:            }
2454:
2455:            private static final boolean nx_contains(UnicodeSet nx, char c,
2456:                    char c2) {
2457:                return nx != null
2458:                        && nx.contains(c2 == 0 ? c : UCharacterProperty
2459:                                .getRawSupplementary(c, c2));
2460:            }
2461:
2462:            /*****************************************************************************/
2463:
2464:            /**
2465:             * Get the canonical decomposition 
2466:             * sherman  for ComposedCharIter
2467:             */
2468:
2469:            public static int getDecompose(int chars[], String decomps[]) {
2470:                DecomposeArgs args = new DecomposeArgs();
2471:                int length = 0;
2472:                long norm32 = 0;
2473:                int ch = -1;
2474:                int index = 0;
2475:                int i = 0;
2476:
2477:                while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
2478:                    //TBD !!!! the hack code heres save us about 50ms for startup
2479:                    //need a better solution/lookup
2480:                    if (ch == 0x30ff)
2481:                        ch = 0xf900;
2482:                    else if (ch == 0x10000)
2483:                        ch = 0x1d15e;
2484:                    else if (ch == 0x1d1c1)
2485:                        ch = 0x2f800;
2486:
2487:                    norm32 = NormalizerImpl.getNorm32(ch);
2488:                    if ((norm32 & QC_NFD) != 0 && i < chars.length) {
2489:                        chars[i] = ch;
2490:                        index = decompose(norm32, args);
2491:                        decomps[i++] = new String(extraData, index, args.length);
2492:                    }
2493:                }
2494:                return i;
2495:            }
2496:
2497:            //------------------------------------------------------ 
2498:            // special method for Collation 
2499:            //------------------------------------------------------
2500:            private static boolean needSingleQuotation(char c) {
2501:                return (c >= 0x0009 && c <= 0x000D)
2502:                        || (c >= 0x0020 && c <= 0x002F)
2503:                        || (c >= 0x003A && c <= 0x0040)
2504:                        || (c >= 0x005B && c <= 0x0060)
2505:                        || (c >= 0x007B && c <= 0x007E);
2506:            }
2507:
2508:            public static String canonicalDecomposeWithSingleQuotation(
2509:                    String string) {
2510:                char[] src = string.toCharArray();
2511:                int srcIndex = 0;
2512:                int srcLimit = src.length;
2513:                char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
2514:                int destIndex = 0;
2515:                int destLimit = dest.length;
2516:
2517:                char[] buffer = new char[3];
2518:                int prevSrc;
2519:                long norm32;
2520:                int ccOrQCMask;
2521:                int qcMask = QC_NFD;
2522:                int reorderStartIndex, length;
2523:                char c, c2;
2524:                char minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
2525:                int cc, prevCC, trailCC;
2526:                char[] p;
2527:                int pStart;
2528:
2529:                // initialize
2530:                ccOrQCMask = CC_MASK | qcMask;
2531:                reorderStartIndex = 0;
2532:                prevCC = 0;
2533:                norm32 = 0;
2534:                c = 0;
2535:                pStart = 0;
2536:
2537:                cc = trailCC = -1; // initialize to bogus value
2538:                for (;;) {
2539:                    prevSrc = srcIndex;
2540:                    //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
2541:                    while (srcIndex != srcLimit
2542:                            && ((c = src[srcIndex]) < minNoMaybe
2543:                                    || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || (c >= '\uac00' && c <= '\ud7a3'))) {
2544:
2545:                        prevCC = 0;
2546:                        ++srcIndex;
2547:                    }
2548:
2549:                    // copy these code units all at once 
2550:                    if (srcIndex != prevSrc) {
2551:                        length = (int) (srcIndex - prevSrc);
2552:                        if ((destIndex + length) <= destLimit) {
2553:                            System.arraycopy(src, prevSrc, dest, destIndex,
2554:                                    length);
2555:                        }
2556:
2557:                        destIndex += length;
2558:                        reorderStartIndex = destIndex;
2559:                    }
2560:
2561:                    // end of source reached? 
2562:                    if (srcIndex == srcLimit) {
2563:                        break;
2564:                    }
2565:                    // c already contains *src and norm32 is set for it, increment src
2566:                    ++srcIndex;
2567:
2568:                    if (isNorm32Regular(norm32)) {
2569:                        c2 = 0;
2570:                        length = 1;
2571:                    } else {
2572:                        // c is a lead surrogate, get the real norm32 
2573:                        if (srcIndex != srcLimit
2574:                                && Character.isLowSurrogate(c2 = src[srcIndex])) {
2575:                            ++srcIndex;
2576:                            length = 2;
2577:                            norm32 = getNorm32FromSurrogatePair(norm32, c2);
2578:                        } else {
2579:                            c2 = 0;
2580:                            length = 1;
2581:                            norm32 = 0;
2582:                        }
2583:                    }
2584:
2585:                    // get the decomposition and the lead and trail cc's 
2586:                    if ((norm32 & qcMask) == 0) {
2587:                        // c does not decompose
2588:                        cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2589:                        p = null;
2590:                        pStart = -1;
2591:                    } else {
2592:                        DecomposeArgs arg = new DecomposeArgs();
2593:                        // c decomposes, get everything from the variable-length 
2594:                        // extra data
2595:                        pStart = decompose(norm32, qcMask, arg);
2596:                        p = extraData;
2597:                        length = arg.length;
2598:                        cc = arg.cc;
2599:                        trailCC = arg.trailCC;
2600:                        if (length == 1) {
2601:                            // fastpath a single code unit from decomposition 
2602:                            c = p[pStart];
2603:                            c2 = 0;
2604:                            p = null;
2605:                            pStart = -1;
2606:                        }
2607:                    }
2608:
2609:                    if ((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 
2610:                        // buffer overflow 
2611:                        char[] tmpBuf = new char[destLimit * 2];
2612:                        System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
2613:                        dest = tmpBuf;
2614:                        destLimit = dest.length;
2615:                    }
2616:                    // append the decomposition to the destination buffer, assume length>0
2617:                    {
2618:                        int reorderSplit = destIndex;
2619:                        if (p == null) {
2620:                            // fastpath: single code point
2621:                            if (needSingleQuotation(c)) {
2622:                                //if we need single quotation, no need to consider "prevCC"
2623:                                //and it must NOT be a supplementary pair
2624:                                dest[destIndex++] = '\'';
2625:                                dest[destIndex++] = c;
2626:                                dest[destIndex++] = '\'';
2627:                                trailCC = 0;
2628:                            } else if (cc != 0 && cc < prevCC) {
2629:                                // (c, c2) is out of order with respect to the preceding
2630:                                //  text 
2631:                                destIndex += length;
2632:                                trailCC = insertOrdered(dest,
2633:                                        reorderStartIndex, reorderSplit,
2634:                                        destIndex, c, c2, cc);
2635:                            } else {
2636:                                // just append (c, c2)
2637:                                dest[destIndex++] = c;
2638:                                if (c2 != 0) {
2639:                                    dest[destIndex++] = c2;
2640:                                }
2641:                            }
2642:                        } else {
2643:                            // general: multiple code points (ordered by themselves) 
2644:                            // from decomposition 
2645:                            if (needSingleQuotation(p[pStart])) {
2646:                                dest[destIndex++] = '\'';
2647:                                dest[destIndex++] = p[pStart++];
2648:                                dest[destIndex++] = '\'';
2649:                                length--;
2650:                                do {
2651:                                    dest[destIndex++] = p[pStart++];
2652:                                } while (--length > 0);
2653:                            } else if (cc != 0 && cc < prevCC) {
2654:                                destIndex += length;
2655:                                trailCC = mergeOrdered(dest, reorderStartIndex,
2656:                                        reorderSplit, p, pStart, pStart
2657:                                                + length);
2658:                            } else {
2659:                                // just append the decomposition 
2660:                                do {
2661:                                    dest[destIndex++] = p[pStart++];
2662:                                } while (--length > 0);
2663:                            }
2664:                        }
2665:                    }
2666:                    prevCC = trailCC;
2667:                    if (prevCC == 0) {
2668:                        reorderStartIndex = destIndex;
2669:                    }
2670:                }
2671:                return new String(dest, 0, destIndex);
2672:            }
2673:
2674:            //------------------------------------------------------ 
2675:            // mapping method for IDNA/StringPrep 
2676:            //------------------------------------------------------
2677:
2678:            /*
2679:             * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
2680:             * 3.2 normalization with Corrigendum 4 corrections. However, normalization
2681:             * without the corrections is necessary for IDNA/StringPrep support.
2682:             * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
2683:             * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
2684:             * characters in Corrigendum 4 before normalization in order to avoid
2685:             * incorrect normalization.
2686:             * For the Corrigendum 4 issue, refer to
2687:             *   http://www.unicode.org/versions/corrigendum4.html
2688:             */
2689:
2690:            /*
2691:             * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
2692:             */
2693:            public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS = 0x40000;
2694:
2695:            private static final char[][] corrigendum4MappingTable = {
2696:                    { '\uD844', '\uDF6A' }, // 0x2F868
2697:                    { '\u5F33' }, // 0x2F874
2698:                    { '\u43AB' }, // 0x2F91F
2699:                    { '\u7AAE' }, // 0x2F95F
2700:                    { '\u4D57' } }; // 0x2F9BF
2701:
2702:            /*
2703:             * Removing Corrigendum 4 fix
2704:             * @return normalized text
2705:             */
2706:            public static String convert(String str) {
2707:                if (str == null) {
2708:                    return null;
2709:                }
2710:
2711:                int ch = UCharacterIterator.DONE;
2712:                StringBuffer dest = new StringBuffer();
2713:                UCharacterIterator iter = UCharacterIterator.getInstance(str);
2714:
2715:                while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
2716:                    switch (ch) {
2717:                    case 0x2F868:
2718:                        dest.append(corrigendum4MappingTable[0]);
2719:                        break;
2720:                    case 0x2F874:
2721:                        dest.append(corrigendum4MappingTable[1]);
2722:                        break;
2723:                    case 0x2F91F:
2724:                        dest.append(corrigendum4MappingTable[2]);
2725:                        break;
2726:                    case 0x2F95F:
2727:                        dest.append(corrigendum4MappingTable[3]);
2728:                        break;
2729:                    case 0x2F9BF:
2730:                        dest.append(corrigendum4MappingTable[4]);
2731:                        break;
2732:                    default:
2733:                        UTF16.append(dest, ch);
2734:                        break;
2735:                    }
2736:                }
2737:
2738:                return dest.toString();
2739:            }
2740:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.