Source Code Cross Referenced for NormalizerImpl.java in  » Internationalization-Localization » icu4j » com » ibm » icu » impl » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.impl 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         *******************************************************************************
0003:         * Copyright (C) 1996-2005, International Business Machines Corporation and    *
0004:         * others. All Rights Reserved.                                                *
0005:         *******************************************************************************
0006:         */
0007:
0008:        package com.ibm.icu.impl;
0009:
0010:        import java.io.ByteArrayInputStream;
0011:        import java.io.IOException;
0012:        import java.io.BufferedInputStream;
0013:        import java.io.InputStream;
0014:        import java.util.MissingResourceException;
0015:
0016:        import com.ibm.icu.text.Normalizer;
0017:        import com.ibm.icu.text.UTF16;
0018:        import com.ibm.icu.text.UnicodeSet;
0019:        import com.ibm.icu.text.UnicodeSetIterator;
0020:        import com.ibm.icu.util.RangeValueIterator;
0021:        import com.ibm.icu.util.VersionInfo;
0022:        import com.ibm.icu.lang.UCharacter;
0023:
0024:        /**
0025:         * @version     1.0
0026:         * @author  Ram Viswanadha
0027:         */
0028:        public final class NormalizerImpl {
0029:            // Singleton whose construction loads the normalization data once, at class initialization.
0030:            static final NormalizerImpl IMPL;
0031:            // A loading failure is rethrown with the original exception chained as its cause.
0032:            static {
0033:                try {
0034:                    IMPL = new NormalizerImpl();
0035:                } catch (Exception e) {
0036:                    throw (MissingResourceException) new MissingResourceException(e.getMessage(), "", "").initCause(e);
0037:                }
0038:            }
0039:
0040:            static final int UNSIGNED_BYTE_MASK = 0xFF; // keeps the low 8 bits: reads a byte as an unsigned value
0041:            static final long UNSIGNED_INT_MASK = 0xffffffffL; // keeps the low 32 bits: reads an int as an unsigned value in a long
0042:            /*
0043:             * Resource path of the binary normalization data. This implementation
0044:             * loads its data from unorm.icu, which is generated with the gennorm tool.
0045:             * The format of that file is described at the end of this file.
0046:             */
0047:            private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE
0048:                    + "/unorm.icu";
0049:
0050:            // norm32 value constants: bit layout of the 32-bit values stored in the normalization trie
0051:
0052:            // quick check flags: bits 0..3, when set, mean "no" for their forms
0053:            public static final int QC_NFC = 0x11; /* no|maybe: bit 0 and bit 4 */
0054:            public static final int QC_NFKC = 0x22; /* no|maybe: bit 1 and bit 5 */
0055:            public static final int QC_NFD = 4; /* no: bit 2 */
0056:            public static final int QC_NFKD = 8; /* no: bit 3 */
0057:
0058:            public static final int QC_ANY_NO = 0xf; /* all four "no" bits */
0059:
0060:            /* quick check flags 4..5 mean "maybe" for their forms; 
0061:             * test flags>=QC_MAYBE 
0062:             */
0063:            public static final int QC_MAYBE = 0x10;
0064:            public static final int QC_ANY_MAYBE = 0x30;
0065:
0066:            public static final int QC_MASK = 0x3f; /* all quick check bits: QC_ANY_NO | QC_ANY_MAYBE */
0067:
0068:            private static final int COMBINES_FWD = 0x40; /* bit 6 */
0069:            private static final int COMBINES_BACK = 0x80; /* bit 7 */
0070:            public static final int COMBINES_ANY = 0xc0; /* COMBINES_FWD | COMBINES_BACK */
0071:            // UnicodeData.txt combining class in bits 15..8
0072:            private static final int CC_SHIFT = 8;
0073:            public static final int CC_MASK = 0xff00;
0074:            // 16 bits (31..16) for the index to UChars and other extra data
0075:            private static final int EXTRA_SHIFT = 16;
0076:            // start of surrogate specials after shifting right by EXTRA_SHIFT
0077:            private static final int EXTRA_INDEX_TOP = 0xfc00;
0078:
0079:            private static final int EXTRA_SURROGATE_MASK = 0x3ff;
0080:            private static final int EXTRA_SURROGATE_TOP = 0x3f0; /* hangul etc. */
0081:
0082:            private static final int EXTRA_HANGUL = EXTRA_SURROGATE_TOP;
0083:            private static final int EXTRA_JAMO_L = EXTRA_SURROGATE_TOP + 1;/* ### not used */
0084:            private static final int EXTRA_JAMO_V = EXTRA_SURROGATE_TOP + 2;
0085:            private static final int EXTRA_JAMO_T = EXTRA_SURROGATE_TOP + 3;
0086:
0087:            /* norm32 value constants using >16 bits; each is an extra-index value shifted left by EXTRA_SHIFT */
0088:            private static final long MIN_SPECIAL = (long) (0xfc000000 & UNSIGNED_INT_MASK); /* EXTRA_INDEX_TOP << 16 */
0089:            private static final long SURROGATES_TOP = (long) (0xfff00000 & UNSIGNED_INT_MASK); /* (EXTRA_INDEX_TOP + EXTRA_SURROGATE_TOP) << 16 */
0090:            private static final long MIN_HANGUL = (long) (0xfff00000 & UNSIGNED_INT_MASK); /* same value as SURROGATES_TOP */
0091:            private static final long MIN_JAMO_V = (long) (0xfff20000 & UNSIGNED_INT_MASK); /* (EXTRA_INDEX_TOP + EXTRA_JAMO_V) << 16 */
0092:            private static final long JAMO_V_TOP = (long) (0xfff30000 & UNSIGNED_INT_MASK); /* (EXTRA_INDEX_TOP + EXTRA_JAMO_T) << 16 */
0093:
0094:            /* indexes[] value names: positions in the int array that the constructor reads first from the data file */
0095:            /* number of bytes in normalization trie (sizes the normBytes buffer in the constructor) */
0096:            static final int INDEX_TRIE_SIZE = 0;
0097:            /* number of chars in extra data (length of extraData) */
0098:            static final int INDEX_CHAR_COUNT = 1;
0099:            /* number of uint16_t words for combining data (length of combiningTable) */
0100:            static final int INDEX_COMBINE_DATA_COUNT = 2;
0101:            /* number of code points that combine forward */
0102:            static final int INDEX_COMBINE_FWD_COUNT = 3;
0103:            /* number of code points that combine forward and backward */
0104:            static final int INDEX_COMBINE_BOTH_COUNT = 4;
0105:            /* number of code points that combine backward */
0106:            static final int INDEX_COMBINE_BACK_COUNT = 5;
0107:            /* first code point with quick check NFC NO/MAYBE */
0108:            public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
0109:            /* first code point with quick check NFKC NO/MAYBE */
0110:            public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
0111:            /* first code point with quick check NFD NO/MAYBE */
0112:            public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
0113:            /* first code point with quick check NFKD NO/MAYBE */
0114:            public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
0115:            /* number of bytes in FCD trie (sizes the fcdBytes buffer in the constructor) */
0116:            static final int INDEX_FCD_TRIE_SIZE = 10;
0117:            /* number of bytes in the auxiliary trie (sizes the auxBytes buffer in the constructor) */
0118:            static final int INDEX_AUX_TRIE_SIZE = 11;
0119:            /* number of uint16_t in the array of serialized USet */
0120:            static final int INDEX_CANON_SET_COUNT = 12;
0121:            /* value passed to readIndexes() by the constructor; changing this requires a new formatVersion */
0122:            static final int INDEX_TOP = 32;
0123:
0124:            /* AUX constants */
0125:            /* value constants for auxTrie: bit positions and masks inside one auxTrie value */
0126:            private static final int AUX_UNSAFE_SHIFT = 11;
0127:            private static final int AUX_COMP_EX_SHIFT = 10;
0128:            private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
0129:
0130:            private static final int AUX_MAX_FNC = ((int) 1 << AUX_COMP_EX_SHIFT); /* 0x400 */
0131:            private static final int AUX_UNSAFE_MASK = (int) ((1 << AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK); /* bit 11: 0x800 */
0132:            private static final int AUX_FNC_MASK = (int) ((AUX_MAX_FNC - 1) & UNSIGNED_INT_MASK); /* bits 9..0: 0x3ff */
0133:            private static final int AUX_COMP_EX_MASK = (int) ((1 << AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK); /* bit 10: 0x400 */
0134:            private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK & 1) << AUX_NFC_SKIPPABLE_F_SHIFT); /* bit 12: 0x1000 */
0135:
0136:            /* canonStartSets[0..31] contains indexes for what is in the array */
0137:            /* number of uint16_t in canonical starter sets */
0138:            static final int SET_INDEX_CANON_SETS_LENGTH = 0;
0139:            /* number of uint16_t in the BMP search table (contains pairs) */
0140:            static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
0141:            /* number of uint16_t in the supplementary search table (contains triplets) */
0142:            static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
0143:            /* changing this requires a new formatVersion */
0144:            static final int SET_INDEX_TOP = 32;
0145:            /* NOTE(review): the four *_INDEX values below look like slots of the canonStartSets Object[] - confirm against NormalizerDataReader */
0146:            static final int CANON_SET_INDICIES_INDEX = 0;
0147:            static final int CANON_SET_START_SETS_INDEX = 1;
0148:            static final int CANON_SET_BMP_TABLE_INDEX = 2;
0149:            static final int CANON_SET_SUPP_TABLE_INDEX = 3;
0150:            /* 14 bit indexes to canonical USerializedSets; also the length of the canonStartSets array allocated by the constructor */
0151:            static final int CANON_SET_MAX_CANON_SETS = 0x4000;
0152:            /* single-code point BMP sets are encoded directly in the search table 
0153:             * except if result=0x4000..0x7fff 
0154:             */
0155:            static final int CANON_SET_BMP_MASK = 0xc000;
0156:            static final int CANON_SET_BMP_IS_INDEX = 0x4000;
0157:
0158:            private static final int MAX_BUFFER_SIZE = 20; // NOTE(review): not referenced in the visible part of this file; unit and users to be confirmed
0159:
0160:            /**
0161:             * Internal option bit for cmpEquivFold() that requests decomposing.
0162:             * If not set, just do strcasecmp().
0163:             * @internal
0164:             */
0165:            public static final int COMPARE_EQUIV = 0x80000;
0166:
0167:            /*******************************/
0168:
0169:            /* Wrappers for Trie implementations */
0170:            static final class NormTrieImpl implements Trie.DataManipulate {
0171:                static IntTrie normTrie = null;
0172:
0173:                /**
0174:                 * Callback used by Trie (same package) to locate the index block of a
0175:                 * lead surrogate from that surrogate's own 32-bit trie value.
0176:                 * normTrie: the value may contain a special extraData index that
0177:                 * carries the folding offset.
0178:                 * @param value trie value stored for the lead surrogate
0179:                 * @return BMP_INDEX_LENGTH plus the block offset extracted from value
0180:                 */
0181:                public int getFoldingOffset(int value) {
0182:                    int blockMask = 0x3ff << SURROGATE_BLOCK_BITS;
0183:                    return BMP_INDEX_LENGTH + ((value >> (EXTRA_SHIFT - SURROGATE_BLOCK_BITS)) & blockMask);
0184:                }
0185:
0186:            }
0187:
0188:            static final class FCDTrieImpl implements  Trie.DataManipulate {
0189:                static CharTrie fcdTrie = null;
0190:
0191:                /**
0192:                 * Callback used by Trie (same package) to extract from a lead surrogate's
0193:                 * data the index array offset of the indexes for that lead surrogate.
0194:                 * @param value data value for a lead surrogate from the trie; for the
0195:                 *         FCD trie this value is the folding offset itself
0196:                 * @return value, unchanged
0197:                 */
0198:                /* fcdTrie: the folding offset is the lead FCD value itself */
0199:                public int getFoldingOffset(int value) {
0200:                    return value;
0201:                }
0202:            }
0203:
0204:            static final class AuxTrieImpl implements Trie.DataManipulate {
0205:                static CharTrie auxTrie = null;
0206:
0207:                /**
0208:                 * Callback used by Trie (same package) to locate the index block of a
0209:                 * lead surrogate from that surrogate's own trie value.
0210:                 * @param value trie value stored for the lead surrogate; only the
0211:                 *        bits selected by AUX_FNC_MASK are used
0212:                 * @return the selected bits shifted left by SURROGATE_BLOCK_BITS
0213:                 */
0214:                public int getFoldingOffset(int value) {
0215:                    int foldingBits = value & AUX_FNC_MASK; /* auxTrie: bits 9..0 of the 16-bit trie result */
0216:                    return foldingBits << SURROGATE_BLOCK_BITS;
0217:                }
0218:            }
0219:
0220:            /****************************************************/
0221:
0222:            private static FCDTrieImpl fcdTrieImpl; // folding-offset callback handed to the FCD CharTrie
0223:            private static NormTrieImpl normTrieImpl; // folding-offset callback handed to the norm32 IntTrie
0224:            private static AuxTrieImpl auxTrieImpl; // folding-offset callback handed to the auxiliary CharTrie
0225:            private static int[] indexes; // header values of the data file, addressed with the INDEX_* constants
0226:            private static char[] combiningTable; // length is indexes[INDEX_COMBINE_DATA_COUNT]
0227:            private static char[] extraData; // length is indexes[INDEX_CHAR_COUNT]; holds the decomposition data read by decompose()
0228:            private static Object[] canonStartSets; // allocated with CANON_SET_MAX_CANON_SETS slots, filled by the data reader
0229:
0230:            private static boolean isDataLoaded; // set by the constructor so that the data is loaded only once
0231:            private static boolean isFormatVersion_2_1; // data format version is 2.1 or later
0232:            private static boolean isFormatVersion_2_2; // data format version is 2.2 or later
0233:            private static byte[] unicodeVersion; // four version bytes as returned by the data reader
0234:
0235:            /**
0236:             * Buffer size passed to the BufferedInputStream that reads the data file
0237:             */
0238:            private static final int DATA_BUFFER_SIZE = 25000;
0239:
0240:            /**
0241:             * FCD check: everything below this code point is known to have a 0 
0242:             * lead combining class; getPrevCC() passes it as the minimum code unit
0243:             */
0244:            public static final int MIN_WITH_LEAD_CC = 0x300;
0245:
0246:            /**
0247:             * Bit 7 of the length byte for a decomposition string in extra data is
0248:             * a flag indicating whether the decomposition string is
0249:             * preceded by a 16-bit word with the leading and trailing cc
0250:             * of the decomposition (like for A-umlaut);
0251:             * if not, then both cc's are zero (like for compatibility ideographs).
0252:             */
0253:            private static final int DECOMP_FLAG_LENGTH_HAS_CC = 0x80;
0254:            /**
0255:             * Bits 6..0 of the length byte contain the actual length.
0256:             */
0257:            private static final int DECOMP_LENGTH_MASK = 0x7f;
0258:
0259:            /** Length of the BMP portion of the index (stage 1) array. */
0260:            private static final int BMP_INDEX_LENGTH = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
0261:            /** Number of bits of a trail surrogate that are used in index table 
0262:             * lookups: the 10 payload bits of a surrogate minus the stage 1 shift.
0263:             */
0264:            private static final int SURROGATE_BLOCK_BITS = 10 - Trie.INDEX_STAGE_1_SHIFT_;
0265:
0266:            // public utility: returns indexes[index] without a range check; index is meant to be one of the INDEX_* constants
0267:            public static int getFromIndexesArr(int index) {
0268:                return indexes[index];
0269:            }
0270:
0271:            // private constructor -----------------------------------------------
0272:
0273:            /**
0274:             * Loads the normalization data into the static members; the stream is closed even when loading fails.
0275:             * @exception IOException thrown when data reading fails or data corrupted
0276:             */
0277:            private NormalizerImpl() throws IOException {
0278:                //data should be loaded only once
0279:                if (!isDataLoaded) {
0280:
0281:                    // jar access
0282:                    InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
0283:                    BufferedInputStream b = new BufferedInputStream(i,
0284:                            DATA_BUFFER_SIZE);
0285:                    try {
0286:                        NormalizerDataReader reader = new NormalizerDataReader(b);
0287:                        // read the indexes
0288:                        indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
0289:
0290:                        byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
0291:
0292:                        int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
0293:                        combiningTable = new char[combiningTableTop];
0294:
0295:                        int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
0296:                        extraData = new char[extraDataTop];
0297:
0298:                        byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
0299:                        byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
0300:                        canonStartSets = new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
0301:
0302:                        fcdTrieImpl = new FCDTrieImpl();
0303:                        normTrieImpl = new NormTrieImpl();
0304:                        auxTrieImpl = new AuxTrieImpl();
0305:
0306:                        // load the rest of the data and initialize the data members
0307:                        reader.read(normBytes, fcdBytes, auxBytes, extraData,
0308:                                combiningTable, canonStartSets);
0309:
0310:                        NormTrieImpl.normTrie = new IntTrie(
0311:                                new ByteArrayInputStream(normBytes), normTrieImpl);
0312:                        FCDTrieImpl.fcdTrie = new CharTrie(
0313:                                new ByteArrayInputStream(fcdBytes), fcdTrieImpl);
0314:                        AuxTrieImpl.auxTrie = new CharTrie(
0315:                                new ByteArrayInputStream(auxBytes), auxTrieImpl);
0316:
0317:                        // get the data format version
0318:                        byte[] formatVersion = reader.getDataFormatVersion();
0319:                        isFormatVersion_2_1 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 1));
0320:                        isFormatVersion_2_2 = (formatVersion[0] > 2 || (formatVersion[0] == 2 && formatVersion[1] >= 2));
0321:                        unicodeVersion = reader.getUnicodeVersion();
0322:                        // set only after every member above was filled without an exception
0323:                        isDataLoaded = true;
0324:                    } finally {
0325:                        // release the stream on the failure paths too
0326:                        b.close();
0327:                    }
0328:                }
0329:            }
0330:
0331:            /* ---------------------------------------------------------------------- */
0332:
0333:            /* Korean Hangul and Jamo constants */
0334:
0335:            public static final int JAMO_L_BASE = 0x1100; /* "lead" jamo */
0336:            public static final int JAMO_V_BASE = 0x1161; /* "vowel" jamo */
0337:            public static final int JAMO_T_BASE = 0x11a7; /* "trail" jamo */
0338:
0339:            public static final int HANGUL_BASE = 0xac00; /* subtracted from a code unit to get its syllable index */
0340:
0341:            public static final int JAMO_L_COUNT = 19;
0342:            public static final int JAMO_V_COUNT = 21;
0343:            public static final int JAMO_T_COUNT = 28; /* syllable index % JAMO_T_COUNT == 0 means "no trail jamo" */
0344:            public static final int HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT
0345:                    * JAMO_T_COUNT; /* 19 * 21 * 28 = 11172 syllables */
0346:
0347:            private static boolean isHangulWithoutJamoT(char c) { /* Hangul syllable whose trail-jamo index is 0? */
0348:                int syllableIndex = c - HANGUL_BASE;
0349:                return 0 <= syllableIndex && syllableIndex < HANGUL_COUNT && syllableIndex % JAMO_T_COUNT == 0;
0350:            }
0351:
0352:            /* norm32 helpers */
0353:
0354:            /* is this a norm32 with a regular index, i.e. a value below MIN_SPECIAL? */
0355:            private static boolean isNorm32Regular(long norm32) {
0356:                return norm32 < MIN_SPECIAL;
0357:            }
0358:
0359:            /* lead surrogate special: not a regular index, but still below SURROGATES_TOP */
0360:            private static boolean isNorm32LeadSurrogate(long norm32) {
0361:                return !isNorm32Regular(norm32) && norm32 < SURROGATES_TOP;
0362:            }
0363:
0364:            /* is this a norm32 with a special index for a Hangul syllable or a Jamo, i.e. at or above MIN_HANGUL? */
0365:            private static boolean isNorm32HangulOrJamo(long norm32) {
0366:                return norm32 >= MIN_HANGUL;
0367:            }
0368:
0369:            /*
0370:             * Given isNorm32HangulOrJamo(),
0371:             * is this a Hangul syllable or a Jamo L (a value below MIN_JAMO_V)?
0372:             */
0373:            ///CLOVER:OFF
0374:            private static boolean isHangulJamoNorm32HangulOrJamoL(long norm32) {
0375:                return norm32 < MIN_JAMO_V;
0376:            }
0377:
0378:            ///CLOVER:ON
0379:
0380:            /*
0381:             * Given norm32 for Jamo V or T,
0382:             * is this a Jamo V (a value below JAMO_V_TOP)?
0383:             */
0384:            private static boolean isJamoVTNorm32JamoV(long norm32) {
0385:                return norm32 < JAMO_V_TOP;
0386:            }
0387:
0388:            /* data access primitives ----------------------------------------------- */
0389:
0390:            public static long/*unsigned*/getNorm32(char c) { /* norm32 of one code unit, widened as an unsigned 32-bit value */
0391:                int leadValue = NormTrieImpl.normTrie.getLeadValue(c);
0392:                return leadValue & UNSIGNED_INT_MASK;
0393:            }
0394:
0395:            public static long/*unsigned*/getNorm32FromSurrogatePair(
0396:                    long norm32, char c2) {
0397:                /*
0398:                 * norm32 is the value of the lead surrogate; its surrogate index stores only the
0399:                 * number of the surrogate index block, see gennorm/store.c/getFoldedNormValue()
0400:                 */
0401:                int trailValue = NormTrieImpl.normTrie.getTrailValue((int) norm32, c2);
0402:                return trailValue & UNSIGNED_INT_MASK;
0403:            }
0404:
0405:            ///CLOVER:OFF
0406:            private static long getNorm32(int c) { /* norm32 of a whole code point, widened as an unsigned 32-bit value */
0407:                int codePointValue = NormTrieImpl.normTrie.getCodePointValue(c);
0408:                return codePointValue & UNSIGNED_INT_MASK;
0409:            }
0410:
0411:            private static long getNorm32(int c, int mask) {
0412:                long/*unsigned*/leadNorm32 = getNorm32(UTF16.getLeadSurrogate(c));
0413:                /* the trail lookup is needed only for a lead surrogate special with data under mask */
0414:                boolean needsTrail = (leadNorm32 & mask) > 0 && isNorm32LeadSurrogate(leadNorm32);
0415:                if (needsTrail) {
0416:                    return getNorm32FromSurrogatePair(leadNorm32, UTF16.getTrailSurrogate(c));
0417:                }
0418:                return leadNorm32;
0419:            }
0420:
0421:            ///CLOVER:ON
0422:
0423:            /*
0424:             * Returns the norm32 of the code point that starts at p[start]; the text
0425:             * must hold complete code points (like decompositions do).
0426:             */
0427:            private static long/*unsigned*/getNorm32(char[] p, int start,
0428:                    int/*unsigned*/mask) {
0429:                long/*unsigned*/first = getNorm32(p[start]);
0430:                boolean isRelevantLead = (first & mask) > 0 && isNorm32LeadSurrogate(first);
0431:                if (isRelevantLead) { /* p[start] is a lead surrogate: get the real norm32 with the trail unit */
0432:                    return getNorm32FromSurrogatePair(first, p[start + 1]);
0433:                }
0434:                return first;
0435:            }
0436:
0437:            public static VersionInfo getUnicodeVersion() {
0438:                // the four bytes read from the data file, passed on in their stored order
0439:                byte[] v = unicodeVersion;
0440:                return VersionInfo.getInstance(v[0], v[1], v[2], v[3]);
0441:            }
0442:
0443:            public static char getFCD16(char c) { /* FCD trie value stored for the single code unit c */
0444:                return FCDTrieImpl.fcdTrie.getLeadValue(c);
0445:            }
0446:
0447:            public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
0448:                /* fcd16 is the value of the lead surrogate, c2 the trail surrogate;
0449:                 * the surrogate index in fcd16 is an absolute offset over the start of stage 1 
0450:                 * */
0451:                return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
0452:            }
0453:
0454:            public static int getFCD16(int c) { /* FCD trie value stored for the whole code point c */
0455:                return FCDTrieImpl.fcdTrie.getCodePointValue(c);
0456:            }
0457:
0458:            private static int getExtraDataIndex(long norm32) { /* bits above EXTRA_SHIFT: used as an index into extraData */
0459:                return (int) (norm32 >> EXTRA_SHIFT);
0460:            }
0461:
0462:            private static final class DecomposeArgs { /* output values of decompose() */
0463:                int /*unsigned byte*/cc; /* lead combining class of the decomposition */
0464:                int /*unsigned byte*/trailCC; /* trail combining class of the decomposition */
0465:                int length; /* decomposition length, already masked with DECOMP_LENGTH_MASK on return */
0466:            }
0467:
0468:            /**
0469:             * get the canonical or compatibility decomposition for one character 
0470:             * @param norm32 trie value of the character; @param qcMask quick check mask, selects compatibility data when it includes QC_NFKD
0471:             * @param args receives the lead cc, trail cc and length of the decomposition
0472:             * @return index into the extraData array
0473:             */
0474:            private static int/*index*/decompose(long/*unsigned*/norm32,
0475:                    int/*unsigned*/qcMask, DecomposeArgs args) {
0476:                int p = getExtraDataIndex(norm32);
0477:                args.length = extraData[p++]; /* length word: canonical length byte in bits 7..0, compatibility length byte above it */
0478:
0479:                if ((norm32 & qcMask & QC_NFKD) != 0 && args.length >= 0x100) {
0480:                    /* use compatibility decomposition, skip canonical data:
0481:                     * its optional cc word (bit 7) plus its code units (bits 6..0) */
0482:                    p += ((args.length >> 7) & 1)
0483:                            + (args.length & DECOMP_LENGTH_MASK);
0484:                    args.length >>= 8;
0485:                }
0486:
0487:                if ((args.length & DECOMP_FLAG_LENGTH_HAS_CC) > 0) {
0488:                    /* get the lead and trail cc's: high byte is the lead cc, low byte the trail cc */
0489:                    char bothCCs = extraData[p++];
0490:                    args.cc = (UNSIGNED_BYTE_MASK) & (bothCCs >> 8);
0491:                    args.trailCC = (UNSIGNED_BYTE_MASK) & bothCCs;
0492:                } else {
0493:                    /* lead and trail cc's are both 0 */
0494:                    args.cc = args.trailCC = 0;
0495:                }
0496:                /* strip the flag bit so that only the length remains */
0497:                args.length &= DECOMP_LENGTH_MASK;
0498:                return p;
0499:            }
0499:
0500:            /**
0501:             * Looks up the canonical decomposition of one character and reports its lead cc, trail cc and length in args.
0502:             * @return index into the extraData array
0503:             */
0504:            private static int decompose(long/*unsigned*/norm32,
0505:                    DecomposeArgs args) {
0506:                int index = getExtraDataIndex(norm32);
0507:                args.length = extraData[index++];
0508:
0509:                boolean hasCCWord = (args.length & DECOMP_FLAG_LENGTH_HAS_CC) != 0;
0510:                if (!hasCCWord) {
0511:                    /* no cc word stored: lead and trail cc's are both 0 */
0512:                    args.cc = 0;
0513:                    args.trailCC = 0;
0514:                } else {
0515:                    /* one word holds the lead cc (high byte) and the trail cc (low byte) */
0516:                    char ccPair = extraData[index++];
0517:                    args.cc = (ccPair >> 8) & UNSIGNED_BYTE_MASK;
0518:                    args.trailCC = ccPair & UNSIGNED_BYTE_MASK;
0519:                }
0520:                args.length &= DECOMP_LENGTH_MASK;
0521:                return index;
0522:            }
0523:
0524:            private static final class NextCCArgs { /* in/out state of getNextCC() */
0525:                char[] source; /* text that is read */
0526:                int next; /* index of the next code unit to read; advanced by getNextCC() */
0527:                int limit; /* end index (exclusive) of the readable text */
0528:                char c; /* out: first code unit that was read */
0529:                char c2; /* out: trail surrogate when a pair was read, otherwise 0 */
0530:            }
0531:
0532:            /*
0533:             * get the combining class of (c, c2)= args.source[args.next++]; returns 0 for an unpaired lead surrogate
0534:             * before: args.next<args.limit  after: args.next<=args.limit
0535:             * if only one code unit is used, then c2==0
0536:             */
0537:            private static int /*unsigned byte*/getNextCC(NextCCArgs args) {
0538:                long /*unsigned*/norm32;
0539:
0540:                args.c = args.source[args.next++];
0541:
0542:                norm32 = getNorm32(args.c);
0543:                if ((norm32 & CC_MASK) == 0) {
0544:                    /* no combining class bits set: nothing more to look up */
0545:                    args.c2 = 0;
0546:                    return 0;
0547:                } else {
0548:                    if (!isNorm32LeadSurrogate(norm32)) {
0549:                        args.c2 = 0;
0550:                    } else {
0551:                        /* c is a lead surrogate, get the real norm32; c2 is assigned inside the condition */
0552:                        if (args.next != args.limit
0553:                                && UTF16
0554:                                        .isTrailSurrogate(args.c2 = args.source[args.next])) {
0555:                            ++args.next;
0556:                            norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
0557:                        } else {
0558:                            args.c2 = 0; /* unpaired lead surrogate: undo the tentative c2 */
0559:                            return 0;
0560:                        }
0561:                    }
0562:                    /* the combining class is the byte at CC_SHIFT */
0563:                    return (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
0564:                }
0565:            }
0565:
0566:            private static final class PrevArgs { /* in/out state of getPrevNorm32() and getPrevCC() */
0567:                char[] src; /* text that is read backwards */
0568:                int start; /* lowest index that may be read */
0569:                int current; /* index just after the code unit to read next; decremented while reading */
0570:                char c; /* out: code unit that was read first (the trail surrogate of a pair) */
0571:                char c2; /* out: lead surrogate when a pair was read, otherwise 0 */
0572:            }
0573:
0574:            /*
0575:             * read backwards from args.current and get the norm32 of the code point before it
0576:             * return 0 if the character is <minC, is an unpaired surrogate, or has no data under mask
0577:             * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 
0578:             * surrogate but read second!)
0579:             */
0580:            private static long /*unsigned*/getPrevNorm32(PrevArgs args,
0581:                    int/*unsigned*/minC, int/*unsigned*/mask) {
0582:                long/*unsigned*/norm32;
0583:
0584:                args.c = args.src[--args.current];
0585:                args.c2 = 0;
0586:
0587:                /* check for a surrogate before getting norm32 to see if we need to 
0588:                 * predecrement further 
0589:                 */
0590:                if (args.c < minC) {
0591:                    return 0;
0592:                } else if (!UTF16.isSurrogate(args.c)) {
0593:                    return getNorm32(args.c);
0594:                } else if (UTF16.isLeadSurrogate(args.c)) {
0595:                    /* unpaired first surrogate */
0596:                    return 0;
0597:                } else if (args.current != args.start
0598:                        && UTF16
0599:                                .isLeadSurrogate(args.c2 = args.src[args.current - 1])) {
0600:                    --args.current; /* also consume the lead surrogate, now held in c2 */
0601:                    norm32 = getNorm32(args.c2);
0602:
0603:                    if ((norm32 & mask) == 0) {
0604:                        /* all surrogate pairs with this lead surrogate have 
0605:                         * only irrelevant data 
0606:                         */
0607:                        return 0;
0608:                    } else {
0609:                        /* norm32 must be a surrogate special */
0610:                        return getNorm32FromSurrogatePair(norm32, args.c);
0611:                    }
0612:                } else {
0613:                    /* unpaired second surrogate: undo the tentative c2 assigned in the condition */
0614:                    args.c2 = 0;
0615:                    return 0;
0616:                }
0617:            }
0618:
0619:            /*
0620:             * get the combining class of (c, c2)=*--p
0621:             * before: start<p  after: start<=p
0622:             */
0623:            private static int /*unsigned byte*/getPrevCC(PrevArgs args) {
0624:
0625:                return (int) ((UNSIGNED_BYTE_MASK) & (getPrevNorm32(args,
0626:                        MIN_WITH_LEAD_CC, CC_MASK) >> CC_SHIFT));
0627:            }
0628:
0629:            /*
0630:             * is this a safe boundary character for NF*D?
0631:             * (lead cc==0)
0632:             */
0633:            public static boolean isNFDSafe(long/*unsigned*/norm32,
0634:                    int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0635:                if ((norm32 & ccOrQCMask) == 0) {
0636:                    return true; /* cc==0 and no decomposition: this is NF*D safe */
0637:                }
0638:
0639:                /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
0640:                if (isNorm32Regular(norm32) && (norm32 & decompQCMask) != 0) {
0641:                    DecomposeArgs args = new DecomposeArgs();
0642:                    /* decomposes, get everything from the variable-length extra data */
0643:                    decompose(norm32, decompQCMask, args);
0644:                    return args.cc == 0;
0645:                } else {
0646:                    /* no decomposition (or Hangul), test the cc directly */
0647:                    return (norm32 & CC_MASK) == 0;
0648:                }
0649:            }
0650:
0651:            /*
0652:             * is this (or does its decomposition begin with) a "true starter"?
0653:             * (cc==0 and NF*C_YES)
0654:             */
0655:            public static boolean isTrueStarter(long/*unsigned*/norm32,
0656:                    int/*unsigned*/ccOrQCMask, int/*unsigned*/decompQCMask) {
0657:                if ((norm32 & ccOrQCMask) == 0) {
0658:                    return true; /* this is a true starter (could be Hangul or Jamo L)*/
0659:                }
0660:
0661:                /* inspect its decomposition - not a Hangul or a surrogate here */
0662:                if ((norm32 & decompQCMask) != 0) {
0663:                    int p; /* index into extra data array */
0664:                    DecomposeArgs args = new DecomposeArgs();
0665:                    /* decomposes, get everything from the variable-length extra data */
0666:                    p = decompose(norm32, decompQCMask, args);
0667:
0668:                    if (args.cc == 0) {
0669:                        int/*unsigned*/qcMask = ccOrQCMask & QC_MASK;
0670:
0671:                        /* does it begin with NFC_YES? */
0672:                        if ((getNorm32(extraData, p, qcMask) & qcMask) == 0) {
0673:                            /* yes, the decomposition begins with a true starter */
0674:                            return true;
0675:                        }
0676:                    }
0677:                }
0678:                return false;
0679:            }
0680:
0681:            /* reorder UTF-16 in-place ---------------------------------------------- */
0682:
0683:            /**
0684:             * simpler, single-character version of mergeOrdered() -
0685:             * bubble-insert one single code point into the preceding string
0686:             * which is already canonically ordered
0687:             * (c, c2) may or may not yet have been inserted at src[current]..src[p]
0688:             *
0689:             * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
0690:             *
0691:             * before: src[start]..src[current] is already ordered, and
0692:             *         src[current]..src[p]     may or may not hold (c, c2) but
0693:             *                          must be exactly the same length as (c, c2)
0694:             * after: src[start]..src[p] is ordered
0695:             *
0696:             * @return the trailing combining class
0697:             */
0698:            private static int/*unsigned byte*/insertOrdered(char[] source,
0699:                    int start, int current, int p, char c, char c2,
0700:                    int/*unsigned byte*/cc) {
0701:                int back, preBack;
0702:                int r;
0703:                int prevCC, trailCC = cc;
0704:
0705:                if (start < current && cc != 0) {
0706:                    // search for the insertion point where cc>=prevCC 
0707:                    preBack = back = current;
0708:                    PrevArgs prevArgs = new PrevArgs();
0709:                    prevArgs.current = current;
0710:                    prevArgs.start = start;
0711:                    prevArgs.src = source;
0712:                    // get the prevCC 
0713:                    prevCC = getPrevCC(prevArgs);
0714:                    preBack = prevArgs.current;
0715:
0716:                    if (cc < prevCC) {
0717:                        // this will be the last code point, so keep its cc 
0718:                        trailCC = prevCC;
0719:                        back = preBack;
0720:                        while (start < preBack) {
0721:                            prevCC = getPrevCC(prevArgs);
0722:                            preBack = prevArgs.current;
0723:                            if (cc >= prevCC) {
0724:                                break;
0725:                            }
0726:                            back = preBack;
0727:                        }
0728:
0729:                        // this is where we are right now with all these indicies:
0730:                        // [start]..[pPreBack] 0..? code points that we can ignore
0731:                        // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
0732:                        // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
0733:                        // [current]..[p]         1 code point (c, c2) with cc
0734:
0735:                        // move the code units in between up 
0736:                        r = p;
0737:                        do {
0738:                            source[--r] = source[--current];
0739:                        } while (back != current);
0740:                    }
0741:                }
0742:
0743:                // insert (c, c2) 
0744:                source[current] = c;
0745:                if (c2 != 0) {
0746:                    source[(current + 1)] = c2;
0747:                }
0748:
0749:                // we know the cc of the last code point 
0750:                return trailCC;
0751:            }
0752:
0753:            /**
0754:             * merge two UTF-16 string parts together
0755:             * to canonically order (order by combining classes) their concatenation
0756:             *
0757:             * the two strings may already be adjacent, so that the merging is done 
0758:             * in-place if the two strings are not adjacent, then the buffer holding the
0759:             * first one must be large enough
0760:             * the second string may or may not be ordered in itself
0761:             *
0762:             * before: [start]..[current] is already ordered, and
0763:             *         [next]..[limit]    may be ordered in itself, but
0764:             *                          is not in relation to [start..current[
0765:             * after: [start..current+(limit-next)[ is ordered
0766:             *
0767:             * the algorithm is a simple bubble-sort that takes the characters from 
0768:             * src[next++] and inserts them in correct combining class order into the 
0769:             * preceding part of the string
0770:             *
0771:             * since this function is called much less often than the single-code point
0772:             * insertOrdered(), it just uses that for easier maintenance
0773:             *
0774:             * @return the trailing combining class
0775:             */
0776:            private static int /*unsigned byte*/mergeOrdered(char[] source,
0777:                    int start, int current, char[] data, int next, int limit,
0778:                    boolean isOrdered) {
0779:                int r;
0780:                int /*unsigned byte*/cc, trailCC = 0;
0781:                boolean adjacent;
0782:
0783:                adjacent = current == next;
0784:                NextCCArgs ncArgs = new NextCCArgs();
0785:                ncArgs.source = data;
0786:                ncArgs.next = next;
0787:                ncArgs.limit = limit;
0788:
0789:                if (start != current || !isOrdered) {
0790:
0791:                    while (ncArgs.next < ncArgs.limit) {
0792:                        cc = getNextCC(ncArgs);
0793:                        if (cc == 0) {
0794:                            // does not bubble back 
0795:                            trailCC = 0;
0796:                            if (adjacent) {
0797:                                current = ncArgs.next;
0798:                            } else {
0799:                                data[current++] = ncArgs.c;
0800:                                if (ncArgs.c2 != 0) {
0801:                                    data[current++] = ncArgs.c2;
0802:                                }
0803:                            }
0804:                            if (isOrdered) {
0805:                                break;
0806:                            } else {
0807:                                start = current;
0808:                            }
0809:                        } else {
0810:                            r = current + (ncArgs.c2 == 0 ? 1 : 2);
0811:                            trailCC = insertOrdered(source, start, current, r,
0812:                                    ncArgs.c, ncArgs.c2, cc);
0813:                            current = r;
0814:                        }
0815:                    }
0816:                }
0817:
0818:                if (ncArgs.next == ncArgs.limit) {
0819:                    // we know the cc of the last code point 
0820:                    return trailCC;
0821:                } else {
0822:                    if (!adjacent) {
0823:                        // copy the second string part 
0824:                        do {
0825:                            source[current++] = data[ncArgs.next++];
0826:                        } while (ncArgs.next != ncArgs.limit);
0827:                        ncArgs.limit = current;
0828:                    }
0829:                    PrevArgs prevArgs = new PrevArgs();
0830:                    prevArgs.src = data;
0831:                    prevArgs.start = start;
0832:                    prevArgs.current = ncArgs.limit;
0833:                    return getPrevCC(prevArgs);
0834:                }
0835:
0836:            }
0837:
0838:            private static int /*unsigned byte*/mergeOrdered(char[] source,
0839:                    int start, int current, char[] data, final int next,
0840:                    final int limit) {
0841:                return mergeOrdered(source, start, current, data, next, limit,
0842:                        true);
0843:            }
0844:
0845:            public static boolean checkFCD(char[] src, int srcStart,
0846:                    int srcLimit, UnicodeSet nx) {
0847:
0848:                char fcd16, c, c2;
0849:                int prevCC = 0, cc;
0850:                int i = srcStart, length = srcLimit;
0851:
0852:                for (;;) {
0853:                    for (;;) {
0854:                        if (i == length) {
0855:                            return true;
0856:                        } else if ((c = src[i++]) < MIN_WITH_LEAD_CC) {
0857:                            prevCC = (int) -c;
0858:                        } else if ((fcd16 = getFCD16(c)) == 0) {
0859:                            prevCC = 0;
0860:                        } else {
0861:                            break;
0862:                        }
0863:                    }
0864:
0865:                    // check one above-minimum, relevant code unit 
0866:                    if (UTF16.isLeadSurrogate(c)) {
0867:                        // c is a lead surrogate, get the real fcd16 
0868:                        if (i != length && UTF16.isTrailSurrogate(c2 = src[i])) {
0869:                            ++i;
0870:                            fcd16 = getFCD16FromSurrogatePair(fcd16, c2);
0871:                        } else {
0872:                            c2 = 0;
0873:                            fcd16 = 0;
0874:                        }
0875:                    } else {
0876:                        c2 = 0;
0877:                    }
0878:
0879:                    if (nx_contains(nx, c, c2)) {
0880:                        prevCC = 0; /* excluded: fcd16==0 */
0881:                        continue;
0882:                    }
0883:
0884:                    // prevCC has values from the following ranges:
0885:                    // 0..0xff -the previous trail combining class
0886:                    // <0      -the negative value of the previous code unit;
0887:                    //          that code unit was <MIN_WITH_LEAD_CC and its getFCD16()
0888:                    //          was deferred so that average text is checked faster
0889:                    //
0890:
0891:                    // check the combining order 
0892:                    cc = (int) (fcd16 >> 8);
0893:                    if (cc != 0) {
0894:                        if (prevCC < 0) {
0895:                            // the previous character was <_NORM_MIN_WITH_LEAD_CC, 
0896:                            // we need to get its trail cc 
0897:                            //
0898:                            if (!nx_contains(nx, (int) -prevCC)) {
0899:                                prevCC = (int) (FCDTrieImpl.fcdTrie
0900:                                        .getBMPValue((char) -prevCC) & 0xff);
0901:                            } else {
0902:                                prevCC = 0; /* excluded: fcd16==0 */
0903:                            }
0904:
0905:                        }
0906:
0907:                        if (cc < prevCC) {
0908:                            return false;
0909:                        }
0910:                    }
0911:                    prevCC = (int) (fcd16 & 0xff);
0912:                }
0913:            }
0914:
0915:            public static Normalizer.QuickCheckResult quickCheck(char[] src,
0916:                    int srcStart, int srcLimit, int minNoMaybe, int qcMask,
0917:                    int options, boolean allowMaybe, UnicodeSet nx) {
0918:
0919:                int ccOrQCMask;
0920:                long norm32;
0921:                char c, c2;
0922:                char cc, prevCC;
0923:                long qcNorm32;
0924:                Normalizer.QuickCheckResult result;
0925:                ComposePartArgs args = new ComposePartArgs();
0926:                char[] buffer;
0927:                int start = srcStart;
0928:
0929:                if (!isDataLoaded) {
0930:                    return Normalizer.MAYBE;
0931:                }
0932:                // initialize 
0933:                ccOrQCMask = CC_MASK | qcMask;
0934:                result = Normalizer.YES;
0935:                prevCC = 0;
0936:
0937:                for (;;) {
0938:                    for (;;) {
0939:                        if (srcStart == srcLimit) {
0940:                            return result;
0941:                        } else if ((c = src[srcStart++]) >= minNoMaybe
0942:                                && ((norm32 = getNorm32(c)) & ccOrQCMask) != 0) {
0943:                            break;
0944:                        }
0945:                        prevCC = 0;
0946:                    }
0947:
0948:                    // check one above-minimum, relevant code unit 
0949:                    if (isNorm32LeadSurrogate(norm32)) {
0950:                        // c is a lead surrogate, get the real norm32 
0951:                        if (srcStart != srcLimit
0952:                                && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
0953:                            ++srcStart;
0954:                            norm32 = getNorm32FromSurrogatePair(norm32, c2);
0955:                        } else {
0956:                            norm32 = 0;
0957:                            c2 = 0;
0958:                        }
0959:                    } else {
0960:                        c2 = 0;
0961:                    }
0962:                    if (nx_contains(nx, c, c2)) {
0963:                        /* excluded: norm32==0 */
0964:                        norm32 = 0;
0965:                    }
0966:
0967:                    // check the combining order 
0968:                    cc = (char) ((norm32 >> CC_SHIFT) & 0xFF);
0969:                    if (cc != 0 && cc < prevCC) {
0970:                        return Normalizer.NO;
0971:                    }
0972:                    prevCC = cc;
0973:
0974:                    // check for "no" or "maybe" quick check flags 
0975:                    qcNorm32 = norm32 & qcMask;
0976:                    if ((qcNorm32 & QC_ANY_NO) >= 1) {
0977:                        result = Normalizer.NO;
0978:                        break;
0979:                    } else if (qcNorm32 != 0) {
0980:                        // "maybe" can only occur for NFC and NFKC 
0981:                        if (allowMaybe) {
0982:                            result = Normalizer.MAYBE;
0983:                        } else {
0984:                            // normalize a section around here to see if it is really 
0985:                            // normalized or not 
0986:                            int prevStarter;
0987:                            int/*unsigned*/decompQCMask;
0988:
0989:                            decompQCMask = (qcMask << 2) & 0xf; // decomposition quick check mask 
0990:
0991:                            // find the previous starter 
0992:
0993:                            // set prevStarter to the beginning of the current character 
0994:                            prevStarter = srcStart - 1;
0995:                            if (UTF16.isTrailSurrogate(src[prevStarter])) {
0996:                                // safe because unpaired surrogates do not result 
0997:                                // in "maybe"
0998:                                --prevStarter;
0999:                            }
1000:
1001:                            prevStarter = findPreviousStarter(src, start,
1002:                                    prevStarter, ccOrQCMask, decompQCMask,
1003:                                    (char) minNoMaybe);
1004:
1005:                            // find the next true starter in [src..limit[ - modifies 
1006:                            // src to point to the next starter 
1007:                            srcStart = findNextStarter(src, srcStart, srcLimit,
1008:                                    qcMask, decompQCMask, (char) minNoMaybe);
1009:
1010:                            //set the args for compose part
1011:                            args.prevCC = prevCC;
1012:
1013:                            // decompose and recompose [prevStarter..src[ 
1014:                            buffer = composePart(args, prevStarter, src,
1015:                                    srcStart, srcLimit, options, nx);
1016:
1017:                            // compare the normalized version with the original 
1018:                            if (0 != strCompare(buffer, 0, args.length, src,
1019:                                    prevStarter, srcStart, false)) {
1020:                                result = Normalizer.NO; // normalization differs 
1021:                                break;
1022:                            }
1023:
1024:                            // continue after the next starter 
1025:                        }
1026:                    }
1027:                }
1028:                return result;
1029:            }
1030:
1031:            //------------------------------------------------------ 
1032:            // make NFD & NFKD 
1033:            //------------------------------------------------------
1034:            public static int getDecomposition(int c /*UTF-32*/,
1035:                    boolean compat, char[] dest, int destStart, int destCapacity) {
1036:
1037:                if ((UNSIGNED_INT_MASK & c) <= 0x10ffff) {
1038:                    long /*unsigned*/norm32;
1039:                    int qcMask;
1040:                    int minNoMaybe;
1041:                    int length;
1042:
1043:                    // initialize 
1044:                    if (!compat) {
1045:                        minNoMaybe = (int) indexes[INDEX_MIN_NFD_NO_MAYBE];
1046:                        qcMask = QC_NFD;
1047:                    } else {
1048:                        minNoMaybe = (int) indexes[INDEX_MIN_NFKD_NO_MAYBE];
1049:                        qcMask = QC_NFKD;
1050:                    }
1051:
1052:                    if (c < minNoMaybe) {
1053:                        // trivial case 
1054:                        if (destCapacity > 0) {
1055:                            dest[0] = (char) c;
1056:                        }
1057:                        return -1;
1058:                    }
1059:
1060:                    /* data lookup */
1061:                    norm32 = getNorm32(c);
1062:                    if ((norm32 & qcMask) == 0) {
1063:                        /* simple case: no decomposition */
1064:                        if (c <= 0xffff) {
1065:                            if (destCapacity > 0) {
1066:                                dest[0] = (char) c;
1067:                            }
1068:                            return -1;
1069:                        } else {
1070:                            if (destCapacity >= 2) {
1071:                                dest[0] = UTF16.getLeadSurrogate(c);
1072:                                dest[1] = UTF16.getTrailSurrogate(c);
1073:                            }
1074:                            return -2;
1075:                        }
1076:                    } else if (isNorm32HangulOrJamo(norm32)) {
1077:                        /* Hangul syllable: decompose algorithmically */
1078:                        char c2;
1079:
1080:                        c -= HANGUL_BASE;
1081:
1082:                        c2 = (char) (c % JAMO_T_COUNT);
1083:                        c /= JAMO_T_COUNT;
1084:                        if (c2 > 0) {
1085:                            if (destCapacity >= 3) {
1086:                                dest[2] = (char) (JAMO_T_BASE + c2);
1087:                            }
1088:                            length = 3;
1089:                        } else {
1090:                            length = 2;
1091:                        }
1092:
1093:                        if (destCapacity >= 2) {
1094:                            dest[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1095:                            dest[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1096:                        }
1097:                        return length;
1098:                    } else {
1099:                        /* c decomposes, get everything from the variable-length extra 
1100:                         * data 
1101:                         */
1102:                        int p, limit;
1103:                        DecomposeArgs args = new DecomposeArgs();
1104:                        /* the index into extra data array*/
1105:                        p = decompose(norm32, qcMask, args);
1106:                        if (args.length <= destCapacity) {
1107:                            limit = p + args.length;
1108:                            do {
1109:                                dest[destStart++] = extraData[p++];
1110:                            } while (p < limit);
1111:                        }
1112:                        return args.length;
1113:                    }
1114:                } else {
1115:                    return 0;
1116:                }
1117:            }
1118:
1119:            public static int decompose(char[] src, int srcStart, int srcLimit,
1120:                    char[] dest, int destStart, int destLimit, boolean compat,
1121:                    int[] outTrailCC, UnicodeSet nx) {
1122:
1123:                char[] buffer = new char[3];
1124:                int prevSrc;
1125:                long norm32;
1126:                int ccOrQCMask, qcMask;
1127:                int reorderStartIndex, length;
1128:                char c, c2, minNoMaybe;
1129:                int/*unsigned byte*/cc, prevCC, trailCC;
1130:                char[] p;
1131:                int pStart;
1132:                int destIndex = destStart;
1133:                int srcIndex = srcStart;
1134:                if (!compat) {
1135:                    minNoMaybe = (char) indexes[INDEX_MIN_NFD_NO_MAYBE];
1136:                    qcMask = QC_NFD;
1137:                } else {
1138:                    minNoMaybe = (char) indexes[INDEX_MIN_NFKD_NO_MAYBE];
1139:                    qcMask = QC_NFKD;
1140:                }
1141:
1142:                /* initialize */
1143:                ccOrQCMask = CC_MASK | qcMask;
1144:                reorderStartIndex = 0;
1145:                prevCC = 0;
1146:                norm32 = 0;
1147:                c = 0;
1148:                pStart = 0;
1149:
1150:                cc = trailCC = -1;//initialize to bogus value
1151:
1152:                for (;;) {
1153:                    /* count code units below the minimum or with irrelevant data for 
1154:                     * the quick check 
1155:                     */
1156:                    prevSrc = srcIndex;
1157:
1158:                    while (srcIndex != srcLimit
1159:                            && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
1160:                        prevCC = 0;
1161:                        ++srcIndex;
1162:                    }
1163:
1164:                    /* copy these code units all at once */
1165:                    if (srcIndex != prevSrc) {
1166:                        length = (int) (srcIndex - prevSrc);
1167:                        if ((destIndex + length) <= destLimit) {
1168:                            System.arraycopy(src, prevSrc, dest, destIndex,
1169:                                    length);
1170:                        }
1171:
1172:                        destIndex += length;
1173:                        reorderStartIndex = destIndex;
1174:                    }
1175:
1176:                    /* end of source reached? */
1177:                    if (srcIndex == srcLimit) {
1178:                        break;
1179:                    }
1180:
1181:                    /* c already contains *src and norm32 is set for it, increment src*/
1182:                    ++srcIndex;
1183:
1184:                    /* check one above-minimum, relevant code unit */
1185:                    /*
1186:                     * generally, set p and length to the decomposition string
1187:                     * in simple cases, p==NULL and (c, c2) will hold the length code 
1188:                     * units to append in all cases, set cc to the lead and trailCC to 
1189:                     * the trail combining class
1190:                     *
1191:                     * the following merge-sort of the current character into the 
1192:                     * preceding, canonically ordered result text will use the 
1193:                     * optimized insertOrdered()
1194:                     * if there is only one single code point to process;
1195:                     * this is indicated with p==NULL, and (c, c2) is the character to 
1196:                     * insert
1197:                     * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1198:                     * for a supplementary character)
1199:                     * otherwise, p[length] is merged in with _mergeOrdered()
1200:                     */
1201:                    if (isNorm32HangulOrJamo(norm32)) {
1202:                        if (nx_contains(nx, c)) {
1203:                            c2 = 0;
1204:                            p = null;
1205:                            length = 1;
1206:                        } else {
1207:                            // Hangul syllable: decompose algorithmically 
1208:                            p = buffer;
1209:                            pStart = 0;
1210:                            cc = trailCC = 0;
1211:
1212:                            c -= HANGUL_BASE;
1213:
1214:                            c2 = (char) (c % JAMO_T_COUNT);
1215:                            c /= JAMO_T_COUNT;
1216:                            if (c2 > 0) {
1217:                                buffer[2] = (char) (JAMO_T_BASE + c2);
1218:                                length = 3;
1219:                            } else {
1220:                                length = 2;
1221:                            }
1222:
1223:                            buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
1224:                            buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
1225:                        }
1226:                    } else {
1227:                        if (isNorm32Regular(norm32)) {
1228:                            c2 = 0;
1229:                            length = 1;
1230:                        } else {
1231:                            // c is a lead surrogate, get the real norm32 
1232:                            if (srcIndex != srcLimit
1233:                                    && UTF16
1234:                                            .isTrailSurrogate(c2 = src[srcIndex])) {
1235:                                ++srcIndex;
1236:                                length = 2;
1237:                                norm32 = getNorm32FromSurrogatePair(norm32, c2);
1238:                            } else {
1239:                                c2 = 0;
1240:                                length = 1;
1241:                                norm32 = 0;
1242:                            }
1243:                        }
1244:
1245:                        /* get the decomposition and the lead and trail cc's */
1246:                        if (nx_contains(nx, c, c2)) {
1247:                            /* excluded: norm32==0 */
1248:                            cc = trailCC = 0;
1249:                            p = null;
1250:                        } else if ((norm32 & qcMask) == 0) {
1251:                            /* c does not decompose */
1252:                            cc = trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
1253:                            p = null;
1254:                            pStart = -1;
1255:                        } else {
1256:                            DecomposeArgs arg = new DecomposeArgs();
1257:                            /* c decomposes, get everything from the variable-length 
1258:                             * extra data 
1259:                             */
1260:                            pStart = decompose(norm32, qcMask, arg);
1261:                            p = extraData;
1262:                            length = arg.length;
1263:                            cc = arg.cc;
1264:                            trailCC = arg.trailCC;
1265:                            if (length == 1) {
1266:                                /* fastpath a single code unit from decomposition */
1267:                                c = p[pStart];
1268:                                c2 = 0;
1269:                                p = null;
1270:                                pStart = -1;
1271:                            }
1272:                        }
1273:                    }
1274:
1275:                    /* append the decomposition to the destination buffer, assume 
1276:                     * length>0 
1277:                     */
1278:                    if ((destIndex + length) <= destLimit) {
1279:                        int reorderSplit = destIndex;
1280:                        if (p == null) {
1281:                            /* fastpath: single code point */
1282:                            if (cc != 0 && cc < prevCC) {
1283:                                /* (c, c2) is out of order with respect to the preceding
1284:                                 *  text 
1285:                                 */
1286:                                destIndex += length;
1287:                                trailCC = insertOrdered(dest,
1288:                                        reorderStartIndex, reorderSplit,
1289:                                        destIndex, c, c2, cc);
1290:                            } else {
1291:                                /* just append (c, c2) */
1292:                                dest[destIndex++] = c;
1293:                                if (c2 != 0) {
1294:                                    dest[destIndex++] = c2;
1295:                                }
1296:                            }
1297:                        } else {
1298:                            /* general: multiple code points (ordered by themselves) 
1299:                             * from decomposition 
1300:                             */
1301:                            if (cc != 0 && cc < prevCC) {
1302:                                /* the decomposition is out of order with respect to the
1303:                                 *  preceding text 
1304:                                 */
1305:                                destIndex += length;
1306:                                trailCC = mergeOrdered(dest, reorderStartIndex,
1307:                                        reorderSplit, p, pStart, pStart
1308:                                                + length);
1309:                            } else {
1310:                                /* just append the decomposition */
1311:                                do {
1312:                                    dest[destIndex++] = p[pStart++];
1313:                                } while (--length > 0);
1314:                            }
1315:                        }
1316:                    } else {
1317:                        /* buffer overflow */
1318:                        /* keep incrementing the destIndex for preflighting */
1319:                        destIndex += length;
1320:                    }
1321:
1322:                    prevCC = trailCC;
1323:                    if (prevCC == 0) {
1324:                        reorderStartIndex = destIndex;
1325:                    }
1326:                }
1327:
1328:                outTrailCC[0] = prevCC;
1329:
1330:                return destIndex - destStart;
1331:            }
1332:
1333:            /* make NFC & NFKC ------------------------------------------------------ */
1334:            private static final class NextCombiningArgs {
1335:                char[] source;
1336:                int start;
1337:                //int limit;
1338:                char c;
1339:                char c2;
1340:                int/*unsigned*/combiningIndex;
1341:                char /*unsigned byte*/cc;
1342:            }
1343:
1344:            /* get the composition properties of the next character */
1345:            private static int /*unsigned*/getNextCombining(
1346:                    NextCombiningArgs args, int limit, UnicodeSet nx) {
1347:                long/*unsigned*/norm32;
1348:                int combineFlags;
1349:                /* get properties */
1350:                args.c = args.source[args.start++];
1351:                norm32 = getNorm32(args.c);
1352:
1353:                /* preset output values for most characters */
1354:                args.c2 = 0;
1355:                args.combiningIndex = 0;
1356:                args.cc = 0;
1357:
1358:                if ((norm32 & (CC_MASK | COMBINES_ANY)) == 0) {
1359:                    return 0;
1360:                } else {
1361:                    if (isNorm32Regular(norm32)) {
1362:                        /* set cc etc. below */
1363:                    } else if (isNorm32HangulOrJamo(norm32)) {
1364:                        /* a compatibility decomposition contained Jamos */
1365:                        args.combiningIndex = (int) ((UNSIGNED_INT_MASK) & (0xfff0 | (norm32 >> EXTRA_SHIFT)));
1366:                        return (int) (norm32 & COMBINES_ANY);
1367:                    } else {
1368:                        /* c is a lead surrogate, get the real norm32 */
1369:                        if (args.start != limit
1370:                                && UTF16
1371:                                        .isTrailSurrogate(args.c2 = args.source[args.start])) {
1372:                            ++args.start;
1373:                            norm32 = getNorm32FromSurrogatePair(norm32, args.c2);
1374:                        } else {
1375:                            args.c2 = 0;
1376:                            return 0;
1377:                        }
1378:                    }
1379:
1380:                    if (nx_contains(nx, args.c, args.c2)) {
1381:                        return 0; /* excluded: norm32==0 */
1382:                    }
1383:
1384:                    args.cc = (char) ((norm32 >> CC_SHIFT) & 0xff);
1385:
1386:                    combineFlags = (int) (norm32 & COMBINES_ANY);
1387:                    if (combineFlags != 0) {
1388:                        int index = getExtraDataIndex(norm32);
1389:                        args.combiningIndex = index > 0 ? extraData[(index - 1)]
1390:                                : 0;
1391:                    }
1392:
1393:                    return combineFlags;
1394:                }
1395:            }
1396:
1397:            /*
1398:             * given a composition-result starter (c, c2) - which means its cc==0,
1399:             * it combines forward, it has extra data, its norm32!=0,
1400:             * it is not a Hangul or Jamo,
1401:             * get just its combineFwdIndex
1402:             *
1403:             * norm32(c) is special if and only if c2!=0
1404:             */
1405:            private static int/*unsigned*/getCombiningIndexFromStarter(char c,
1406:                    char c2) {
1407:                long/*unsigned*/norm32;
1408:
1409:                norm32 = getNorm32(c);
1410:                if (c2 != 0) {
1411:                    norm32 = getNorm32FromSurrogatePair(norm32, c2);
1412:                }
1413:                return extraData[(getExtraDataIndex(norm32) - 1)];
1414:            }
1415:
1416:            /*
1417:             * Find the recomposition result for
1418:             * a forward-combining character
1419:             * (specified with a pointer to its part of the combiningTable[])
1420:             * and a backward-combining character
1421:             * (specified with its combineBackIndex).
1422:             *
1423:             * If these two characters combine, then set (value, value2)
1424:             * with the code unit(s) of the composition character.
1425:             *
1426:             * Return value:
1427:             * 0    do not combine
1428:             * 1    combine
1429:             * >1   combine, and the composition is a forward-combining starter
1430:             *
1431:             * See unormimp.h for a description of the composition table format.
1432:             */
1433:            private static int/*unsigned*/combine(char[] table,
1434:                    int tableStart, int/*unsinged*/combineBackIndex,
1435:                    int[] outValues) {
1436:                int/*unsigned*/key;
1437:                int value, value2;
1438:
1439:                if (outValues.length < 2) {
1440:                    throw new IllegalArgumentException();
1441:                }
1442:
1443:                /* search in the starter's composition table */
1444:                for (;;) {
1445:                    key = table[tableStart++];
1446:                    if (key >= combineBackIndex) {
1447:                        break;
1448:                    }
1449:                    tableStart += ((table[tableStart] & 0x8000) != 0) ? 2 : 1;
1450:                }
1451:
1452:                /* mask off bit 15, the last-entry-in-the-list flag */
1453:                if ((key & 0x7fff) == combineBackIndex) {
1454:                    /* found! combine! */
1455:                    value = table[tableStart];
1456:
1457:                    /* is the composition a starter that combines forward? */
1458:                    key = (int) ((UNSIGNED_INT_MASK) & ((value & 0x2000) + 1));
1459:
1460:                    /* get the composition result code point from the variable-length 
1461:                     * result value 
1462:                     */
1463:                    if ((value & 0x8000) != 0) {
1464:                        if ((value & 0x4000) != 0) {
1465:                            /* surrogate pair composition result */
1466:                            value = (int) ((UNSIGNED_INT_MASK) & ((value & 0x3ff) | 0xd800));
1467:                            value2 = table[tableStart + 1];
1468:                        } else {
1469:                            /* BMP composition result U+2000..U+ffff */
1470:                            value = table[tableStart + 1];
1471:                            value2 = 0;
1472:                        }
1473:                    } else {
1474:                        /* BMP composition result U+0000..U+1fff */
1475:                        value &= 0x1fff;
1476:                        value2 = 0;
1477:                    }
1478:                    outValues[0] = value;
1479:                    outValues[1] = value2;
1480:                    return key;
1481:                } else {
1482:                    /* not found */
1483:                    return 0;
1484:                }
1485:            }
1486:
1487:            private static final class RecomposeArgs {
1488:                char[] source;
1489:                int start;
1490:                int limit;
1491:            }
1492:
1493:            /*
1494:             * recompose the characters in [p..limit[
1495:             * (which is in NFD - decomposed and canonically ordered),
1496:             * adjust limit, and return the trailing cc
1497:             *
1498:             * since for NFKC we may get Jamos in decompositions, we need to
1499:             * recompose those too
1500:             *
1501:             * note that recomposition never lengthens the text:
1502:             * any character consists of either one or two code units;
1503:             * a composition may contain at most one more code unit than the original 
1504:             * starter, while the combining mark that is removed has at least one code 
1505:             * unit
1506:             */
1507:            private static char/*unsigned byte*/recompose(RecomposeArgs args,
1508:                    int options, UnicodeSet nx) {
1509:                int remove, q, r;
1510:                int /*unsigned*/combineFlags;
1511:                int /*unsigned*/combineFwdIndex, combineBackIndex;
1512:                int /*unsigned*/result, value = 0, value2 = 0;
1513:                int /*unsigned byte*/prevCC;
1514:                boolean starterIsSupplementary;
1515:                int starter;
1516:                int[] outValues = new int[2];
1517:                starter = -1; /* no starter */
1518:                combineFwdIndex = 0; /* will not be used until starter!=NULL */
1519:                starterIsSupplementary = false; /* will not be used until starter!=NULL */
1520:                prevCC = 0;
1521:
1522:                NextCombiningArgs ncArg = new NextCombiningArgs();
1523:                ncArg.source = args.source;
1524:
1525:                ncArg.cc = 0;
1526:                ncArg.c2 = 0;
1527:
1528:                for (;;) {
1529:                    ncArg.start = args.start;
1530:                    combineFlags = getNextCombining(ncArg, args.limit, nx);
1531:                    combineBackIndex = ncArg.combiningIndex;
1532:                    args.start = ncArg.start;
1533:
1534:                    if (((combineFlags & COMBINES_BACK) != 0) && starter != -1) {
1535:                        if ((combineBackIndex & 0x8000) != 0) {
1536:                            /* c is a Jamo V/T, see if we can compose it with the 
1537:                             * previous character 
1538:                             */
1539:                            /* for the PRI #29 fix, check that there is no intervening combining mark */
1540:                            if ((options & BEFORE_PRI_29) != 0 || prevCC == 0) {
1541:                                remove = -1; /* NULL while no Hangul composition */
1542:                                combineFlags = 0;
1543:                                ncArg.c2 = args.source[starter];
1544:                                if (combineBackIndex == 0xfff2) {
1545:                                    /* Jamo V, compose with previous Jamo L and following 
1546:                                     * Jamo T 
1547:                                     */
1548:                                    ncArg.c2 = (char) (ncArg.c2 - JAMO_L_BASE);
1549:                                    if (ncArg.c2 < JAMO_L_COUNT) {
1550:                                        remove = args.start - 1;
1551:                                        ncArg.c = (char) (HANGUL_BASE + (ncArg.c2
1552:                                                * JAMO_V_COUNT + (ncArg.c - JAMO_V_BASE))
1553:                                                * JAMO_T_COUNT);
1554:                                        if (args.start != args.limit
1555:                                                && (ncArg.c2 = (char) (args.source[args.start] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1556:                                            ++args.start;
1557:                                            ncArg.c += ncArg.c2;
1558:                                        } else {
1559:                                            /* the result is an LV syllable, which is a starter (unlike LVT) */
1560:                                            combineFlags = COMBINES_FWD;
1561:                                        }
1562:                                        if (!nx_contains(nx, ncArg.c)) {
1563:                                            args.source[starter] = ncArg.c;
1564:                                        } else {
1565:                                            /* excluded */
1566:                                            if (!isHangulWithoutJamoT(ncArg.c)) {
1567:                                                --args.start; /* undo the ++args.start from reading the Jamo T */
1568:                                            }
1569:                                            /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1570:                                            remove = args.start;
1571:                                        }
1572:                                    }
1573:
1574:                                    /*
1575:                                     * Normally, the following can not occur:
1576:                                     * Since the input is in NFD, there are no Hangul LV syllables that
1577:                                     * a Jamo T could combine with.
1578:                                     * All Jamo Ts are combined above when handling Jamo Vs.
1579:                                     *
1580:                                     * However, before the PRI #29 fix, this can occur due to
1581:                                     * an intervening combining mark between the Hangul LV and the Jamo T.
1582:                                     */
1583:                                } else {
1584:                                    /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1585:                                    if (isHangulWithoutJamoT(ncArg.c2)) {
1586:                                        ncArg.c2 += ncArg.c - JAMO_T_BASE;
1587:                                        if (!nx_contains(nx, ncArg.c2)) {
1588:                                            remove = args.start - 1;
1589:                                            args.source[starter] = ncArg.c2;
1590:                                        }
1591:                                    }
1592:                                }
1593:
1594:                                if (remove != -1) {
1595:                                    /* remove the Jamo(s) */
1596:                                    q = remove;
1597:                                    r = args.start;
1598:                                    while (r < args.limit) {
1599:                                        args.source[q++] = args.source[r++];
1600:                                    }
1601:                                    args.start = remove;
1602:                                    args.limit = q;
1603:                                }
1604:
1605:                                ncArg.c2 = 0; /* c2 held *starter temporarily */
1606:
1607:                                if (combineFlags != 0) {
1608:                                    /*
1609:                                     * not starter=NULL because the composition is a Hangul LV syllable
1610:                                     * and might combine once more (but only before the PRI #29 fix)
1611:                                     */
1612:
1613:                                    /* done? */
1614:                                    if (args.start == args.limit) {
1615:                                        return (char) prevCC;
1616:                                    }
1617:
1618:                                    /* the composition is a Hangul LV syllable which is a starter that combines forward */
1619:                                    combineFwdIndex = 0xfff0;
1620:
1621:                                    /* we combined; continue with looking for compositions */
1622:                                    continue;
1623:                                }
1624:                            }
1625:
1626:                            /*
1627:                             * now: cc==0 and the combining index does not include 
1628:                             * "forward" -> the rest of the loop body will reset starter
1629:                             * to NULL; technically, a composed Hangul syllable is a 
1630:                             * starter, but it does not combine forward now that we have
1631:                             * consumed all eligible Jamos; for Jamo V/T, combineFlags 
1632:                             * does not contain _NORM_COMBINES_FWD
1633:                             */
1634:
1635:                        } else if (
1636:                        /* the starter is not a Hangul LV or Jamo V/T and */
1637:                        !((combineFwdIndex & 0x8000) != 0)
1638:                                &&
1639:                                /* the combining mark is not blocked and */
1640:                                ((options & BEFORE_PRI_29) != 0 ? (prevCC != ncArg.cc || prevCC == 0)
1641:                                        : (prevCC < ncArg.cc || prevCC == 0))
1642:                                &&
1643:                                /* the starter and the combining mark (c, c2) do combine */
1644:                                0 != (result = combine(combiningTable,
1645:                                        combineFwdIndex, combineBackIndex,
1646:                                        outValues)) &&
1647:                                /* the composition result is not excluded */
1648:                                !nx_contains(nx, (char) value, (char) value2)) {
1649:                            value = outValues[0];
1650:                            value2 = outValues[1];
1651:                            /* replace the starter with the composition, remove the 
1652:                             * combining mark 
1653:                             */
1654:                            remove = ncArg.c2 == 0 ? args.start - 1
1655:                                    : args.start - 2; /* index to the combining mark */
1656:
1657:                            /* replace the starter with the composition */
1658:                            args.source[starter] = (char) value;
1659:                            if (starterIsSupplementary) {
1660:                                if (value2 != 0) {
1661:                                    /* both are supplementary */
1662:                                    args.source[starter + 1] = (char) value2;
1663:                                } else {
1664:                                    /* the composition is shorter than the starter, 
1665:                                     * move the intermediate characters forward one */
1666:                                    starterIsSupplementary = false;
1667:                                    q = starter + 1;
1668:                                    r = q + 1;
1669:                                    while (r < remove) {
1670:                                        args.source[q++] = args.source[r++];
1671:                                    }
1672:                                    --remove;
1673:                                }
1674:                            } else if (value2 != 0) {
1675:                                /* the composition is longer than the starter, 
1676:                                 * move the intermediate characters back one */
1677:                                starterIsSupplementary = true;
1678:                                /* temporarily increment for the loop boundary */
1679:                                ++starter;
1680:                                q = remove;
1681:                                r = ++remove;
1682:                                while (starter < q) {
1683:                                    args.source[--r] = args.source[--q];
1684:                                }
1685:                                args.source[starter] = (char) value2;
1686:                                --starter; /* undo the temporary increment */
1687:                                /* } else { both are on the BMP, nothing more to do */
1688:                            }
1689:
1690:                            /* remove the combining mark by moving the following text 
1691:                             * over it */
1692:                            if (remove < args.start) {
1693:                                q = remove;
1694:                                r = args.start;
1695:                                while (r < args.limit) {
1696:                                    args.source[q++] = args.source[r++];
1697:                                }
1698:                                args.start = remove;
1699:                                args.limit = q;
1700:                            }
1701:
1702:                            /* keep prevCC because we removed the combining mark */
1703:
1704:                            /* done? */
1705:                            if (args.start == args.limit) {
1706:                                return (char) prevCC;
1707:                            }
1708:
1709:                            /* is the composition a starter that combines forward? */
1710:                            if (result > 1) {
1711:                                combineFwdIndex = getCombiningIndexFromStarter(
1712:                                        (char) value, (char) value2);
1713:                            } else {
1714:                                starter = -1;
1715:                            }
1716:
1717:                            /* we combined; continue with looking for compositions */
1718:                            continue;
1719:                        }
1720:                    }
1721:
1722:                    /* no combination this time */
1723:                    prevCC = ncArg.cc;
1724:                    if (args.start == args.limit) {
1725:                        return (char) prevCC;
1726:                    }
1727:
1728:                    /* if (c, c2) did not combine, then check if it is a starter */
1729:                    if (ncArg.cc == 0) {
1730:                        /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1731:                        if ((combineFlags & COMBINES_FWD) != 0) {
1732:                            /* it may combine with something, prepare for it */
1733:                            if (ncArg.c2 == 0) {
1734:                                starterIsSupplementary = false;
1735:                                starter = args.start - 1;
1736:                            } else {
1737:                                starterIsSupplementary = false;
1738:                                starter = args.start - 2;
1739:                            }
1740:                            combineFwdIndex = combineBackIndex;
1741:                        } else {
1742:                            /* it will not combine with anything */
1743:                            starter = -1;
1744:                        }
1745:                    } else if ((options & OPTIONS_COMPOSE_CONTIGUOUS) != 0) {
1746:                        /* FCC: no discontiguous compositions; any intervening character blocks */
1747:                        starter = -1;
1748:                    }
1749:                }
1750:            }
1751:
1752:            // find the last true starter between src[start]....src[current] going 
1753:            // backwards and return its index
1754:            private static int findPreviousStarter(char[] src, int srcStart,
1755:                    int current, int/*unsigned*/ccOrQCMask,
1756:                    int/*unsigned*/decompQCMask, char minNoMaybe) {
1757:                long norm32;
1758:                PrevArgs args = new PrevArgs();
1759:                args.src = src;
1760:                args.start = srcStart;
1761:                args.current = current;
1762:
1763:                while (args.start < args.current) {
1764:                    norm32 = getPrevNorm32(args, minNoMaybe, ccOrQCMask
1765:                            | decompQCMask);
1766:                    if (isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1767:                        break;
1768:                    }
1769:                }
1770:                return args.current;
1771:            }
1772:
1773:            /* find the first true starter in [src..limit[ and return the 
1774:             * pointer to it 
1775:             */
1776:            private static int/*index*/findNextStarter(char[] src, int start,
1777:                    int limit, int/*unsigned*/qcMask,
1778:                    int/*unsigned*/decompQCMask, char minNoMaybe) {
1779:                int p;
1780:                long/*unsigned*/norm32;
1781:                int ccOrQCMask;
1782:                char c, c2;
1783:
1784:                ccOrQCMask = CC_MASK | qcMask;
1785:
1786:                DecomposeArgs decompArgs = new DecomposeArgs();
1787:
1788:                for (;;) {
1789:                    if (start == limit) {
1790:                        break; /* end of string */
1791:                    }
1792:                    c = src[start];
1793:                    if (c < minNoMaybe) {
1794:                        break; /* catches NUL terminater, too */
1795:                    }
1796:
1797:                    norm32 = getNorm32(c);
1798:                    if ((norm32 & ccOrQCMask) == 0) {
1799:                        break; /* true starter */
1800:                    }
1801:
1802:                    if (isNorm32LeadSurrogate(norm32)) {
1803:                        /* c is a lead surrogate, get the real norm32 */
1804:                        if ((start + 1) == limit
1805:                                || !UTF16
1806:                                        .isTrailSurrogate(c2 = (src[start + 1]))) {
1807:                            /* unmatched first surrogate: counts as a true starter */
1808:                            break;
1809:                        }
1810:                        norm32 = getNorm32FromSurrogatePair(norm32, c2);
1811:
1812:                        if ((norm32 & ccOrQCMask) == 0) {
1813:                            break; /* true starter */
1814:                        }
1815:                    } else {
1816:                        c2 = 0;
1817:                    }
1818:
1819:                    /* (c, c2) is not a true starter but its decomposition may be */
1820:                    if ((norm32 & decompQCMask) != 0) {
1821:                        /* (c, c2) decomposes, get everything from the variable-length
1822:                         *  extra data */
1823:                        p = decompose(norm32, decompQCMask, decompArgs);
1824:
1825:                        /* get the first character's norm32 to check if it is a true 
1826:                         * starter */
1827:                        if (decompArgs.cc == 0
1828:                                && (getNorm32(extraData, p, qcMask) & qcMask) == 0) {
1829:                            break; /* true starter */
1830:                        }
1831:                    }
1832:
1833:                    start += c2 == 0 ? 1 : 2; /* not a true starter, continue */
1834:                }
1835:
1836:                return start;
1837:            }
1838:
1839:            private static final class ComposePartArgs {
1840:                int prevCC;
1841:                int length; /* length of decomposed part */
1842:            }
1843:
1844:            /* decompose and recompose [prevStarter..src[ */
1845:            private static char[] composePart(ComposePartArgs args,
1846:                    int prevStarter, char[] src, int start, int limit,
1847:                    int options, UnicodeSet nx) {
1848:                int recomposeLimit;
1849:                boolean compat = ((options & OPTIONS_COMPAT) != 0);
1850:
1851:                /* decompose [prevStarter..src[ */
1852:                int[] outTrailCC = new int[1];
1853:                char[] buffer = new char[(limit - prevStarter)
1854:                        * MAX_BUFFER_SIZE];
1855:
1856:                for (;;) {
1857:                    args.length = decompose(src, prevStarter, (start), buffer,
1858:                            0, buffer.length, compat, outTrailCC, nx);
1859:                    if (args.length <= buffer.length) {
1860:                        break;
1861:                    } else {
1862:                        buffer = new char[args.length];
1863:                    }
1864:                }
1865:
1866:                /* recompose the decomposition */
1867:                recomposeLimit = args.length;
1868:
1869:                if (args.length >= 2) {
1870:                    RecomposeArgs rcArgs = new RecomposeArgs();
1871:                    rcArgs.source = buffer;
1872:                    rcArgs.start = 0;
1873:                    rcArgs.limit = recomposeLimit;
1874:                    args.prevCC = recompose(rcArgs, options, nx);
1875:                    recomposeLimit = rcArgs.limit;
1876:                }
1877:
1878:                /* return with a pointer to the recomposition and its length */
1879:                args.length = recomposeLimit;
1880:                return buffer;
1881:            }
1882:
1883:            private static boolean composeHangul(char prev, char c,
1884:                    long/*unsigned*/norm32, char[] src, int[] srcIndex,
1885:                    int limit, boolean compat, char[] dest, int destIndex,
1886:                    UnicodeSet nx) {
1887:                int start = srcIndex[0];
1888:                if (isJamoVTNorm32JamoV(norm32)) {
1889:                    /* c is a Jamo V, compose with previous Jamo L and 
1890:                     * following Jamo T */
1891:                    prev = (char) (prev - JAMO_L_BASE);
1892:                    if (prev < JAMO_L_COUNT) {
1893:                        c = (char) (HANGUL_BASE + (prev * JAMO_V_COUNT + (c - JAMO_V_BASE))
1894:                                * JAMO_T_COUNT);
1895:
1896:                        /* check if the next character is a Jamo T (normal or 
1897:                         * compatibility) */
1898:                        if (start != limit) {
1899:                            char next, t;
1900:
1901:                            next = src[start];
1902:                            if ((t = (char) (next - JAMO_T_BASE)) < JAMO_T_COUNT) {
1903:                                /* normal Jamo T */
1904:                                ++start;
1905:                                c += t;
1906:                            } else if (compat) {
1907:                                /* if NFKC, then check for compatibility Jamo T 
1908:                                 * (BMP only) */
1909:                                norm32 = getNorm32(next);
1910:                                if (isNorm32Regular(norm32)
1911:                                        && ((norm32 & QC_NFKD) != 0)) {
1912:                                    int p /*index into extra data array*/;
1913:                                    DecomposeArgs dcArgs = new DecomposeArgs();
1914:                                    p = decompose(norm32, QC_NFKD, dcArgs);
1915:                                    if (dcArgs.length == 1
1916:                                            && (t = (char) (extraData[p] - JAMO_T_BASE)) < JAMO_T_COUNT) {
1917:                                        /* compatibility Jamo T */
1918:                                        ++start;
1919:                                        c += t;
1920:                                    }
1921:                                }
1922:                            }
1923:                        }
1924:                        if (nx_contains(nx, c)) {
1925:                            if (!isHangulWithoutJamoT(c)) {
1926:                                --start; /* undo ++start from reading the Jamo T */
1927:                            }
1928:                            return false;
1929:                        }
1930:                        dest[destIndex] = c;
1931:                        srcIndex[0] = start;
1932:                        return true;
1933:                    }
1934:                } else if (isHangulWithoutJamoT(prev)) {
1935:                    /* c is a Jamo T, compose with previous Hangul LV that does not 
1936:                     * contain a Jamo T */
1937:                    c = (char) (prev + (c - JAMO_T_BASE));
1938:                    if (nx_contains(nx, c)) {
1939:                        return false;
1940:                    }
1941:                    dest[destIndex] = c;
1942:                    srcIndex[0] = start;
1943:                    return true;
1944:                }
1945:                return false;
1946:            }
1947:
1948:            /*
1949:            public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1950:                return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1951:            }
1952:             */
1953:
1954:            public static int compose(char[] src, int srcStart, int srcLimit,
1955:                    char[] dest, int destStart, int destLimit, int options,
1956:                    UnicodeSet nx) {
1957:
1958:                int prevSrc, prevStarter;
1959:                long/*unsigned*/norm32;
1960:                int ccOrQCMask, qcMask;
1961:                int reorderStartIndex, length;
1962:                char c, c2, minNoMaybe;
1963:                int/*unsigned byte*/cc, prevCC;
1964:                int[] ioIndex = new int[1];
1965:                int destIndex = destStart;
1966:                int srcIndex = srcStart;
1967:
1968:                if ((options & OPTIONS_COMPAT) != 0) {
1969:                    minNoMaybe = (char) indexes[INDEX_MIN_NFKC_NO_MAYBE];
1970:                    qcMask = QC_NFKC;
1971:                } else {
1972:                    minNoMaybe = (char) indexes[INDEX_MIN_NFC_NO_MAYBE];
1973:                    qcMask = QC_NFC;
1974:                }
1975:
1976:                /*
1977:                 * prevStarter points to the last character before the current one
1978:                 * that is a "true" starter with cc==0 and quick check "yes".
1979:                 *
1980:                 * prevStarter will be used instead of looking for a true starter
1981:                 * while incrementally decomposing [prevStarter..prevSrc[
1982:                 * in _composePart(). Having a good prevStarter allows to just decompose
1983:                 * the entire [prevStarter..prevSrc[.
1984:                 *
1985:                 * When _composePart() backs out from prevSrc back to prevStarter,
1986:                 * then it also backs out destIndex by the same amount.
1987:                 * Therefore, at all times, the (prevSrc-prevStarter) source units
1988:                 * must correspond 1:1 to destination units counted with destIndex,
1989:                 * except for reordering.
1990:                 * This is true for the qc "yes" characters copied in the fast loop,
1991:                 * and for pure reordering.
1992:                 * prevStarter must be set forward to src when this is not true:
1993:                 * In _composePart() and after composing a Hangul syllable.
1994:                 *
1995:                 * This mechanism relies on the assumption that the decomposition of a 
1996:                 * true starter also begins with a true starter. gennorm/store.c checks 
1997:                 * for this.
1998:                 */
1999:                prevStarter = srcIndex;
2000:
2001:                ccOrQCMask = CC_MASK | qcMask;
2002:                /*destIndex=*/reorderStartIndex = 0;/* ####TODO#### check this **/
2003:                prevCC = 0;
2004:
2005:                /* avoid compiler warnings */
2006:                norm32 = 0;
2007:                c = 0;
2008:
2009:                for (;;) {
2010:                    /* count code units below the minimum or with irrelevant data for 
2011:                     * the quick check */
2012:                    prevSrc = srcIndex;
2013:
2014:                    while (srcIndex != srcLimit
2015:                            && ((c = src[srcIndex]) < minNoMaybe || ((norm32 = getNorm32(c)) & ccOrQCMask) == 0)) {
2016:                        prevCC = 0;
2017:                        ++srcIndex;
2018:                    }
2019:
2020:                    /* copy these code units all at once */
2021:                    if (srcIndex != prevSrc) {
2022:                        length = (int) (srcIndex - prevSrc);
2023:                        if ((destIndex + length) <= destLimit) {
2024:                            System.arraycopy(src, prevSrc, dest, destIndex,
2025:                                    length);
2026:                        }
2027:                        destIndex += length;
2028:                        reorderStartIndex = destIndex;
2029:
2030:                        /* set prevStarter to the last character in the quick check 
2031:                         * loop */
2032:                        prevStarter = srcIndex - 1;
2033:                        if (UTF16.isTrailSurrogate(src[prevStarter])
2034:                                && prevSrc < prevStarter
2035:                                && UTF16
2036:                                        .isLeadSurrogate(src[(prevStarter - 1)])) {
2037:                            --prevStarter;
2038:                        }
2039:
2040:                        prevSrc = srcIndex;
2041:                    }
2042:
2043:                    /* end of source reached? */
2044:                    if (srcIndex == srcLimit) {
2045:                        break;
2046:                    }
2047:
2048:                    /* c already contains *src and norm32 is set for it, increment src*/
2049:                    ++srcIndex;
2050:
2051:                    /*
2052:                     * source buffer pointers:
2053:                     *
2054:                     *  all done      quick check   current char  not yet
2055:                     *                "yes" but     (c, c2)       processed
2056:                     *                may combine
2057:                     *                forward
2058:                     * [-------------[-------------[-------------[-------------[
2059:                     * |             |             |             |             |
2060:                     * start         prevStarter   prevSrc       src           limit
2061:                     *
2062:                     *
2063:                     * destination buffer pointers and indexes:
2064:                     *
2065:                     *  all done      might take    not filled yet
2066:                     *                characters for
2067:                     *                reordering
2068:                     * [-------------[-------------[-------------[
2069:                     * |             |             |             |
2070:                     * dest      reorderStartIndex destIndex     destCapacity
2071:                     */
2072:
2073:                    /* check one above-minimum, relevant code unit */
2074:                    /*
2075:                     * norm32 is for c=*(src-1), and the quick check flag is "no" or 
2076:                     * "maybe", and/or cc!=0
2077:                     * check for Jamo V/T, then for surrogates and regular characters
2078:                     * c is not a Hangul syllable or Jamo L because
2079:                     * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
2080:                     */
2081:                    if (isNorm32HangulOrJamo(norm32)) {
2082:                        /*
2083:                         * c is a Jamo V/T:
2084:                         * try to compose with the previous character, Jamo V also with 
2085:                         * a following Jamo T, and set values here right now in case we 
2086:                         * just continue with the main loop
2087:                         */
2088:                        prevCC = cc = 0;
2089:                        reorderStartIndex = destIndex;
2090:                        ioIndex[0] = srcIndex;
2091:                        if (destIndex > 0
2092:                                && composeHangul(src[(prevSrc - 1)], c, norm32,
2093:                                        src, ioIndex, srcLimit,
2094:                                        (options & OPTIONS_COMPAT) != 0, dest,
2095:                                        destIndex <= destLimit ? destIndex - 1
2096:                                                : 0, nx)) {
2097:                            srcIndex = ioIndex[0];
2098:                            prevStarter = srcIndex;
2099:                            continue;
2100:                        }
2101:
2102:                        srcIndex = ioIndex[0];
2103:
2104:                        /* the Jamo V/T did not compose into a Hangul syllable, just 
2105:                         * append to dest */
2106:                        c2 = 0;
2107:                        length = 1;
2108:                        prevStarter = prevSrc;
2109:                    } else {
2110:                        if (isNorm32Regular(norm32)) {
2111:                            c2 = 0;
2112:                            length = 1;
2113:                        } else {
2114:                            /* c is a lead surrogate, get the real norm32 */
2115:                            if (srcIndex != srcLimit
2116:                                    && UTF16
2117:                                            .isTrailSurrogate(c2 = src[srcIndex])) {
2118:                                ++srcIndex;
2119:                                length = 2;
2120:                                norm32 = getNorm32FromSurrogatePair(norm32, c2);
2121:                            } else {
2122:                                /* c is an unpaired lead surrogate, nothing to do */
2123:                                c2 = 0;
2124:                                length = 1;
2125:                                norm32 = 0;
2126:                            }
2127:                        }
2128:                        ComposePartArgs args = new ComposePartArgs();
2129:
2130:                        /* we are looking at the character (c, c2) at [prevSrc..src[ */
2131:                        if (nx_contains(nx, c, c2)) {
2132:                            /* excluded: norm32==0 */
2133:                            cc = 0;
2134:                        } else if ((norm32 & qcMask) == 0) {
2135:                            cc = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2136:                        } else {
2137:                            char[] p;
2138:
2139:                            /*
2140:                             * find appropriate boundaries around this character,
2141:                             * decompose the source text from between the boundaries,
2142:                             * and recompose it
2143:                             *
2144:                             * this puts the intermediate text into the side buffer because
2145:                             * it might be longer than the recomposition end result,
2146:                             * or the destination buffer may be too short or missing
2147:                             *
2148:                             * note that destIndex may be adjusted backwards to account
2149:                             * for source text that passed the quick check but needed to
2150:                             * take part in the recomposition
2151:                             */
2152:                            int decompQCMask = (qcMask << 2) & 0xf; /* decomposition quick check mask */
2153:                            /*
2154:                             * find the last true starter in [prevStarter..src[
2155:                             * it is either the decomposition of the current character (at prevSrc),
2156:                             * or prevStarter
2157:                             */
2158:                            if (isTrueStarter(norm32, CC_MASK | qcMask,
2159:                                    decompQCMask)) {
2160:                                prevStarter = prevSrc;
2161:                            } else {
2162:                                /* adjust destIndex: back out what had been copied with qc "yes" */
2163:                                destIndex -= prevSrc - prevStarter;
2164:                            }
2165:
2166:                            /* find the next true starter in [src..limit[ */
2167:                            srcIndex = findNextStarter(src, srcIndex, srcLimit,
2168:                                    qcMask, decompQCMask, minNoMaybe);
2169:                            //args.prevStarter = prevStarter;
2170:                            args.prevCC = prevCC;
2171:                            //args.destIndex = destIndex;
2172:                            args.length = length;
2173:                            p = composePart(args, prevStarter, src, srcIndex,
2174:                                    srcLimit, options, nx);
2175:
2176:                            if (p == null) {
2177:                                /* an error occurred (out of memory) */
2178:                                break;
2179:                            }
2180:
2181:                            prevCC = args.prevCC;
2182:                            length = args.length;
2183:
2184:                            /* append the recomposed buffer contents to the destination 
2185:                             * buffer */
2186:                            if ((destIndex + args.length) <= destLimit) {
2187:                                int i = 0;
2188:                                while (i < args.length) {
2189:                                    dest[destIndex++] = p[i++];
2190:                                    --length;
2191:                                }
2192:                            } else {
2193:                                /* buffer overflow */
2194:                                /* keep incrementing the destIndex for preflighting */
2195:                                destIndex += length;
2196:                            }
2197:
2198:                            prevStarter = srcIndex;
2199:                            continue;
2200:                        }
2201:                    }
2202:
2203:                    /* append the single code point (c, c2) to the destination buffer */
2204:                    if ((destIndex + length) <= destLimit) {
2205:                        if (cc != 0 && cc < prevCC) {
2206:                            /* (c, c2) is out of order with respect to the preceding 
2207:                             * text */
2208:                            int reorderSplit = destIndex;
2209:                            destIndex += length;
2210:                            prevCC = insertOrdered(dest, reorderStartIndex,
2211:                                    reorderSplit, destIndex, c, c2, cc);
2212:                        } else {
2213:                            /* just append (c, c2) */
2214:                            dest[destIndex++] = c;
2215:                            if (c2 != 0) {
2216:                                dest[destIndex++] = c2;
2217:                            }
2218:                            prevCC = cc;
2219:                        }
2220:                    } else {
2221:                        /* buffer overflow */
2222:                        /* keep incrementing the destIndex for preflighting */
2223:                        destIndex += length;
2224:                        prevCC = cc;
2225:                    }
2226:                }
2227:
2228:                return destIndex - destStart;
2229:            }
2230:
2231:            /* make FCD --------------------------------------------------------------*/
2232:
2233:            private static int/*index*/findSafeFCD(char[] src, int start,
2234:                    int limit, char fcd16) {
2235:                char c, c2;
2236:
2237:                /*
2238:                 * find the first position in [src..limit[ after some cc==0 according 
2239:                 * to FCD data
2240:                 *
2241:                 * at the beginning of the loop, we have fcd16 from before src
2242:                 *
2243:                 * stop at positions:
2244:                 * - after trail cc==0
2245:                 * - at the end of the source
2246:                 * - before lead cc==0
2247:                 */
2248:                for (;;) {
2249:                    /* stop if trail cc==0 for the previous character */
2250:                    if ((fcd16 & 0xff) == 0) {
2251:                        break;
2252:                    }
2253:
2254:                    /* get c=*src - stop at end of string */
2255:                    if (start == limit) {
2256:                        break;
2257:                    }
2258:                    c = src[start];
2259:
2260:                    /* stop if lead cc==0 for this character */
2261:                    if (c < MIN_WITH_LEAD_CC || (fcd16 = getFCD16(c)) == 0) {
2262:                        break; /* catches terminating NUL, too */
2263:                    }
2264:
2265:                    if (!UTF16.isLeadSurrogate(c)) {
2266:                        if (fcd16 <= 0xff) {
2267:                            break;
2268:                        }
2269:                        ++start;
2270:                    } else if (start + 1 != limit
2271:                            && (UTF16.isTrailSurrogate(c2 = src[start + 1]))) {
2272:                        /* c is a lead surrogate, get the real fcd16 */
2273:                        fcd16 = getFCD16FromSurrogatePair(fcd16, c2);
2274:                        if (fcd16 <= 0xff) {
2275:                            break;
2276:                        }
2277:                        start += 2;
2278:                    } else {
2279:                        /* c is an unpaired first surrogate, lead cc==0 */
2280:                        break;
2281:                    }
2282:                }
2283:
2284:                return start;
2285:            }
2286:
2287:            private static int/*unsigned byte*/decomposeFCD(char[] src,
2288:                    int start, int decompLimit, char[] dest,
2289:                    int[] destIndexArr, UnicodeSet nx) {
2290:                char[] p = null;
2291:                int pStart = -1;
2292:
2293:                long /*unsigned int*/norm32;
2294:                int reorderStartIndex;
2295:                char c, c2;
2296:                int/*unsigned byte*/prevCC;
2297:                DecomposeArgs args = new DecomposeArgs();
2298:                int destIndex = destIndexArr[0];
2299:                /*
2300:                 * canonically decompose [src..decompLimit[
2301:                 *
2302:                 * all characters in this range have some non-zero cc,
2303:                 * directly or in decomposition,
2304:                 * so that we do not need to check in the following for quick-check 
2305:                 * limits etc.
2306:                 *
2307:                 * there _are_ _no_ Hangul syllables or Jamos in here because they are 
2308:                 * FCD-safe (cc==0)!
2309:                 *
2310:                 * we also do not need to check for c==0 because we have an established 
2311:                 * decompLimit
2312:                 */
2313:                reorderStartIndex = destIndex;
2314:                prevCC = 0;
2315:
2316:                while (start < decompLimit) {
2317:                    c = src[start++];
2318:                    norm32 = getNorm32(c);
2319:                    if (isNorm32Regular(norm32)) {
2320:                        c2 = 0;
2321:                        args.length = 1;
2322:                    } else {
2323:                        /*
2324:                         * reminder: this function is called with [src..decompLimit[
2325:                         * not containing any Hangul/Jamo characters,
2326:                         * therefore the only specials are lead surrogates
2327:                         */
2328:                        /* c is a lead surrogate, get the real norm32 */
2329:                        if (start != decompLimit
2330:                                && UTF16.isTrailSurrogate(c2 = src[start])) {
2331:                            ++start;
2332:                            args.length = 2;
2333:                            norm32 = getNorm32FromSurrogatePair(norm32, c2);
2334:                        } else {
2335:                            c2 = 0;
2336:                            args.length = 1;
2337:                            norm32 = 0;
2338:                        }
2339:                    }
2340:
2341:                    /* get the decomposition and the lead and trail cc's */
2342:                    if (nx_contains(nx, c, c2)) {
2343:                        /* excluded: norm32==0 */
2344:                        args.cc = args.trailCC = 0;
2345:                        p = null;
2346:                    } else if ((norm32 & QC_NFD) == 0) {
2347:                        /* c does not decompose */
2348:                        args.cc = args.trailCC = (int) ((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2349:                        p = null;
2350:                    } else {
2351:                        /* c decomposes, get everything from the variable-length extra 
2352:                         * data */
2353:                        pStart = decompose(norm32, args);
2354:                        p = extraData;
2355:                        if (args.length == 1) {
2356:                            /* fastpath a single code unit from decomposition */
2357:                            c = p[pStart];
2358:                            c2 = 0;
2359:                            p = null;
2360:                        }
2361:                    }
2362:
2363:                    /* append the decomposition to the destination buffer, assume 
2364:                     * length>0 */
2365:                    if ((destIndex + args.length) <= dest.length) {
2366:                        int reorderSplit = destIndex;
2367:                        if (p == null) {
2368:                            /* fastpath: single code point */
2369:                            if (args.cc != 0 && args.cc < prevCC) {
2370:                                /* (c, c2) is out of order with respect to the preceding
2371:                                 *  text */
2372:                                destIndex += args.length;
2373:                                args.trailCC = insertOrdered(dest,
2374:                                        reorderStartIndex, reorderSplit,
2375:                                        destIndex, c, c2, args.cc);
2376:                            } else {
2377:                                /* just append (c, c2) */
2378:                                dest[destIndex++] = c;
2379:                                if (c2 != 0) {
2380:                                    dest[destIndex++] = c2;
2381:                                }
2382:                            }
2383:                        } else {
2384:                            /* general: multiple code points (ordered by themselves) 
2385:                             * from decomposition */
2386:                            if (args.cc != 0 && args.cc < prevCC) {
2387:                                /* the decomposition is out of order with respect to 
2388:                                 * the preceding text */
2389:                                destIndex += args.length;
2390:                                args.trailCC = mergeOrdered(dest,
2391:                                        reorderStartIndex, reorderSplit, p,
2392:                                        pStart, pStart + args.length);
2393:                            } else {
2394:                                /* just append the decomposition */
2395:                                do {
2396:                                    dest[destIndex++] = p[pStart++];
2397:                                } while (--args.length > 0);
2398:                            }
2399:                        }
2400:                    } else {
2401:                        /* buffer overflow */
2402:                        /* keep incrementing the destIndex for preflighting */
2403:                        destIndex += args.length;
2404:                    }
2405:
2406:                    prevCC = args.trailCC;
2407:                    if (prevCC == 0) {
2408:                        reorderStartIndex = destIndex;
2409:                    }
2410:                }
2411:                destIndexArr[0] = destIndex;
2412:                return prevCC;
2413:            }
2414:
2415:            public static int makeFCD(char[] src, int srcStart, int srcLimit,
2416:                    char[] dest, int destStart, int destLimit, UnicodeSet nx) {
2417:
2418:                int prevSrc, decompStart;
2419:                int destIndex, length;
2420:                char c, c2;
2421:                int /* unsigned int*/fcd16;
2422:                int prevCC, cc;
2423:
2424:                /* initialize */
2425:                decompStart = srcStart;
2426:                destIndex = destStart;
2427:                prevCC = 0;
2428:                c = 0;
2429:                fcd16 = 0;
2430:                int[] destIndexArr = new int[1];
2431:                destIndexArr[0] = destIndex;
2432:
2433:                for (;;) {
2434:                    /* skip a run of code units below the minimum or with irrelevant 
2435:                     * data for the FCD check */
2436:                    prevSrc = srcStart;
2437:
2438:                    for (;;) {
2439:                        if (srcStart == srcLimit) {
2440:                            break;
2441:                        } else if ((c = src[srcStart]) < MIN_WITH_LEAD_CC) {
2442:                            prevCC = (int) -c;
2443:                        } else if ((fcd16 = getFCD16(c)) == 0) {
2444:                            prevCC = 0;
2445:                        } else {
2446:                            break;
2447:                        }
2448:                        ++srcStart;
2449:                    }
2450:
2451:                    /*
2452:                     * prevCC has values from the following ranges:
2453:                     * 0..0xff - the previous trail combining class
2454:                     * <0      - the negative value of the previous code unit;
2455:                     *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its 
2456:                     *           getFCD16()
2457:                     *           was deferred so that average text is checked faster
2458:                     */
2459:
2460:                    /* copy these code units all at once */
2461:                    if (srcStart != prevSrc) {
2462:                        length = (int) (srcStart - prevSrc);
2463:                        if ((destIndex + length) <= destLimit) {
2464:                            System.arraycopy(src, prevSrc, dest, destIndex,
2465:                                    length);
2466:                        }
2467:                        destIndex += length;
2468:                        prevSrc = srcStart;
2469:
2470:                        /* prevCC<0 is only possible from the above loop, i.e., only if
2471:                         *  prevSrc<src */
2472:                        if (prevCC < 0) {
2473:                            /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we 
2474:                             * need to get its trail cc */
2475:                            if (!nx_contains(nx, (int) -prevCC)) {
2476:                                prevCC = (int) (getFCD16((int) -prevCC) & 0xff);
2477:                            } else {
2478:                                prevCC = 0; /* excluded: fcd16==0 */
2479:                            }
2480:                            /*
2481:                             * set a pointer to this below-U+0300 character;
2482:                             * if prevCC==0 then it will moved to after this character 
2483:                             * below
2484:                             */
2485:                            decompStart = prevSrc - 1;
2486:                        }
2487:                    }
2488:                    /*
2489:                     * now:
2490:                     * prevSrc==src - used later to adjust destIndex before 
2491:                     *          decomposition
2492:                     * prevCC>=0
2493:                     */
2494:
2495:                    /* end of source reached? */
2496:                    if (srcStart == srcLimit) {
2497:                        break;
2498:                    }
2499:
2500:                    /* set a pointer to after the last source position where prevCC==0*/
2501:                    if (prevCC == 0) {
2502:                        decompStart = prevSrc;
2503:                    }
2504:
2505:                    /* c already contains *src and fcd16 is set for it, increment src */
2506:                    ++srcStart;
2507:
2508:                    /* check one above-minimum, relevant code unit */
2509:                    if (UTF16.isLeadSurrogate(c)) {
2510:                        /* c is a lead surrogate, get the real fcd16 */
2511:                        if (srcStart != srcLimit
2512:                                && UTF16.isTrailSurrogate(c2 = src[srcStart])) {
2513:                            ++srcStart;
2514:                            fcd16 = getFCD16FromSurrogatePair((char) fcd16, c2);
2515:                        } else {
2516:                            c2 = 0;
2517:                            fcd16 = 0;
2518:                        }
2519:                    } else {
2520:                        c2 = 0;
2521:                    }
2522:
2523:                    /* we are looking at the character (c, c2) at [prevSrc..src[ */
2524:                    if (nx_contains(nx, c, c2)) {
2525:                        fcd16 = 0; /* excluded: fcd16==0 */
2526:                    }
2527:                    /* check the combining order, get the lead cc */
2528:                    cc = (int) (fcd16 >> 8);
2529:                    if (cc == 0 || cc >= prevCC) {
2530:                        /* the order is ok */
2531:                        if (cc == 0) {
2532:                            decompStart = prevSrc;
2533:                        }
2534:                        prevCC = (int) (fcd16 & 0xff);
2535:
2536:                        /* just append (c, c2) */
2537:                        length = c2 == 0 ? 1 : 2;
2538:                        if ((destIndex + length) <= destLimit) {
2539:                            dest[destIndex++] = c;
2540:                            if (c2 != 0) {
2541:                                dest[destIndex++] = c2;
2542:                            }
2543:                        } else {
2544:                            destIndex += length;
2545:                        }
2546:                    } else {
2547:                        /*
2548:                         * back out the part of the source that we copied already but
2549:                         * is now going to be decomposed;
2550:                         * prevSrc is set to after what was copied
2551:                         */
2552:                        destIndex -= (int) (prevSrc - decompStart);
2553:
2554:                        /*
2555:                         * find the part of the source that needs to be decomposed;
2556:                         * to be safe and simple, decompose to before the next character
2557:                         * with lead cc==0
2558:                         */
2559:                        srcStart = findSafeFCD(src, srcStart, srcLimit,
2560:                                (char) fcd16);
2561:
2562:                        /*
2563:                         * the source text does not fulfill the conditions for FCD;
2564:                         * decompose and reorder a limited piece of the text
2565:                         */
2566:                        destIndexArr[0] = destIndex;
2567:                        prevCC = decomposeFCD(src, decompStart, srcStart, dest,
2568:                                destIndexArr, nx);
2569:                        decompStart = srcStart;
2570:                        destIndex = destIndexArr[0];
2571:                    }
2572:                }
2573:
2574:                return destIndex - destStart;
2575:
2576:            }
2577:
2578:            public static int getCombiningClass(int c) {
2579:                long norm32;
2580:                norm32 = getNorm32(c);
2581:                return (char) ((norm32 >> CC_SHIFT) & 0xFF);
2582:            }
2583:
2584:            public static boolean isFullCompositionExclusion(int c) {
2585:                if (isFormatVersion_2_1) {
2586:                    int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2587:                    return (boolean) ((aux & AUX_COMP_EX_MASK) != 0);
2588:                } else {
2589:                    return false;
2590:                }
2591:            }
2592:
2593:            public static boolean isCanonSafeStart(int c) {
2594:                if (isFormatVersion_2_1) {
2595:                    int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2596:                    return (boolean) ((aux & AUX_UNSAFE_MASK) == 0);
2597:                } else {
2598:                    return false;
2599:                }
2600:            }
2601:
2602:            public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
2603:
2604:                if (fillSet != null && canonStartSets != null) {
2605:                    /*
2606:                     * binary search for c
2607:                     *
2608:                     * There are two search tables,
2609:                     * one for BMP code points and one for supplementary ones.
2610:                     * See unormimp.h for details.
2611:                     */
2612:                    char[] table;
2613:                    int i = 0, start, limit;
2614:
2615:                    int[] indexes = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
2616:                    char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
2617:
2618:                    if (c <= 0xffff) {
2619:                        table = (char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
2620:                        start = 0;
2621:                        limit = table.length;
2622:
2623:                        /* each entry is a pair { c, result } */
2624:                        while (start < limit - 2) {
2625:                            i = (char) (((start + limit) / 4) * 2);
2626:                            if (c < table[i]) {
2627:                                limit = i;
2628:                            } else {
2629:                                start = i;
2630:                            }
2631:                        }
2632:                        //System.out.println(i);
2633:                        /* found? */
2634:                        if (c == table[start]) {
2635:                            i = table[start + 1];
2636:                            if ((i & CANON_SET_BMP_MASK) == CANON_SET_BMP_IS_INDEX) {
2637:                                /* result 01xxxxxx xxxxxx contains index x to a 
2638:                                 * USerializedSet */
2639:                                i &= (CANON_SET_MAX_CANON_SETS - 1);
2640:                                return fillSet.getSet(startSets,
2641:                                        (i - indexes.length));
2642:                            } else {
2643:                                /* other result values are BMP code points for 
2644:                                 * single-code point sets */
2645:                                fillSet.setToOne(i);
2646:                                return true;
2647:                            }
2648:                        }
2649:                    } else {
2650:                        char high, low, h, j = 0;
2651:
2652:                        table = (char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
2653:                        start = 0;
2654:                        limit = table.length;
2655:
2656:                        high = (char) (c >> 16);
2657:                        low = (char) c;
2658:
2659:                        /* each entry is a triplet { high(c), low(c), result } */
2660:                        while (start < limit - 3) {
2661:                            /* (start+limit)/2 and address triplets */
2662:                            i = (char) (((start + limit) / 6) * 3);
2663:                            j = (char) (table[i] & 0x1f); /* high word */
2664:                            int tableVal = table[i + 1];
2665:                            int lowInt = low;
2666:                            if (high < j
2667:                                    || ((tableVal > lowInt) && (high == j))) {
2668:                                limit = i;
2669:                            } else {
2670:                                start = i;
2671:                            }
2672:
2673:                            //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
2674:
2675:                            // KLUDGE: IBM JIT in 1.4.0 is sooo broken
2676:                            // The below lines make TestExhaustive pass
2677:                            if (ICUDebug.enabled()) {
2678:                                System.err.println("\t\t j = "
2679:                                        + Utility.hex(j, 4) + "\t i = "
2680:                                        + Utility.hex(i, 4) + "\t high = "
2681:                                        + Utility.hex(high) + "\t low = "
2682:                                        + Utility.hex(lowInt, 4)
2683:                                        + "\t table[i+1]: "
2684:                                        + Utility.hex(tableVal, 4));
2685:                            }
2686:
2687:                        }
2688:
2689:                        /* found? */
2690:                        h = table[start];
2691:
2692:                        //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
2693:                        int tableVal1 = table[start + 1];
2694:                        int lowInt = low;
2695:
2696:                        if (high == (h & 0x1f) && lowInt == tableVal1) {
2697:                            int tableVal2 = table[start + 2];
2698:                            i = tableVal2;
2699:                            if ((h & 0x8000) == 0) {
2700:                                /* the result is an index to a USerializedSet */
2701:                                return fillSet.getSet(startSets,
2702:                                        (i - indexes.length));
2703:                            } else {
2704:                                /*
2705:                                 * single-code point set {x} in
2706:                                 * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
2707:                                 */
2708:                                //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
2709:                                int temp = ((int) h & 0x1f00) << 8;
2710:                                i |= temp; /* add high bits from high(c) */
2711:                                fillSet.setToOne((int) i);
2712:                                return true;
2713:                            }
2714:                        }
2715:                    }
2716:                }
2717:
2718:                return false; /* not found */
2719:            }
2720:
2721:            public static int getFC_NFKC_Closure(int c, char[] dest) {
2722:
2723:                int destCapacity;
2724:
2725:                if (dest == null) {
2726:                    destCapacity = 0;
2727:                } else {
2728:                    destCapacity = dest.length;
2729:                }
2730:
2731:                int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2732:
2733:                aux &= AUX_FNC_MASK;
2734:                if (aux != 0) {
2735:                    int s;
2736:                    int index = aux;
2737:                    int length;
2738:
2739:                    s = extraData[index];
2740:                    if (s < 0xff00) {
2741:                        /* s points to the single-unit string */
2742:                        length = 1;
2743:                    } else {
2744:                        length = s & 0xff;
2745:                        ++index;
2746:                    }
2747:                    if (0 < length && length <= destCapacity) {
2748:                        System.arraycopy(extraData, index, dest, 0, length);
2749:                    }
2750:                    return length;
2751:                } else {
2752:                    return 0;
2753:                }
2754:            }
2755:
2756:            /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2757:            public static boolean isNFSkippable(int c, Normalizer.Mode mode,
2758:                    long mask) {
2759:                long /*unsigned int*/norm32;
2760:                mask = mask & UNSIGNED_INT_MASK;
2761:                char aux;
2762:
2763:                /* check conditions (a)..(e), see unormimp.h */
2764:                norm32 = getNorm32(c);
2765:
2766:                if ((norm32 & mask) != 0) {
2767:                    return false; /* fails (a)..(e), not skippable */
2768:                }
2769:
2770:                if (mode == Normalizer.NFD || mode == Normalizer.NFKD
2771:                        || mode == Normalizer.NONE) {
2772:                    return true; /* NF*D, passed (a)..(c), is skippable */
2773:                }
2774:                /* check conditions (a)..(e), see unormimp.h */
2775:
2776:                /* NF*C/FCC, passed (a)..(e) */
2777:                if ((norm32 & QC_NFD) == 0) {
2778:                    return true; /* no canonical decomposition, is skippable */
2779:                }
2780:
2781:                /* check Hangul syllables algorithmically */
2782:                if (isNorm32HangulOrJamo(norm32)) {
2783:                    /* Jamo passed (a)..(e) above, must be Hangul */
2784:                    return !isHangulWithoutJamoT((char) c); /* LVT are skippable, LV are not */
2785:                }
2786:
2787:                /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2788:                /* NF*C, test (f) flag */
2789:                if (!isFormatVersion_2_2) {
2790:                    return false; /* no (f) data, say not skippable to be safe */
2791:                }
2792:
2793:                aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2794:                return (aux & AUX_NFC_SKIP_F_MASK) == 0; /* TRUE=skippable if the (f) flag is not set */
2795:
2796:                /* } else { FCC, test fcd<=1 instead of the above } */
2797:            }
2798:
2799:            /*
2800:                private static final boolean
2801:            _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
2802:                // add the start code point to the USet 
2803:                uset_add((USet *)context, start);
2804:                return TRUE;
2805:            }
2806:             */
2807:
2808:            public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2809:                int c;
2810:
2811:                /* add the start code point of each same-value range of each trie */
2812:                //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2813:                TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2814:                RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2815:
2816:                while (normIter.next(normResult)) {
2817:                    set.add(normResult.start);
2818:                }
2819:
2820:                //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2821:                TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2822:                RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2823:
2824:                while (fcdIter.next(fcdResult)) {
2825:                    set.add(fcdResult.start);
2826:                }
2827:
2828:                if (isFormatVersion_2_1) {
2829:                    //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2830:                    TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2831:                    RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2832:                    while (auxIter.next(auxResult)) {
2833:                        set.add(auxResult.start);
2834:                    }
2835:                }
2836:                /* add Hangul LV syllables and LV+1 because of skippables */
2837:                for (c = HANGUL_BASE; c < HANGUL_BASE + HANGUL_COUNT; c += JAMO_T_COUNT) {
2838:                    set.add(c);
2839:                    set.add(c + 1);
2840:                }
2841:                set.add(HANGUL_BASE + HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2842:                return set; // for chaining
2843:            }
2844:
2845:            /**
2846:             * Internal API, used in UCharacter.getIntPropertyValue().
2847:             * @internal
2848:             * @param c code point
2849:             * @param modeValue numeric value compatible with Mode
2850:             * @return numeric value compatible with QuickCheck
2851:             */
2852:            public static final int quickCheck(int c, int modeValue) {
2853:                final int qcMask[/*UNORM_MODE_COUNT*/] = { 0, 0, QC_NFD,
2854:                        QC_NFKD, QC_NFC, QC_NFKC };
2855:
2856:                int norm32 = (int) getNorm32(c) & qcMask[modeValue];
2857:
2858:                if (norm32 == 0) {
2859:                    return 1; // YES
2860:                } else if ((norm32 & QC_ANY_NO) != 0) {
2861:                    return 0; // NO
2862:                } else /* _NORM_QC_ANY_MAYBE */{
2863:                    return 2; // MAYBE;
2864:                }
2865:            }
2866:
2867:            /**
2868:             * Internal API, used by collation code.
2869:             * Get access to the internal FCD trie table to be able to perform
2870:             * incremental, per-code unit, FCD checks in collation.
2871:             * One pointer is sufficient because the trie index values are offset
2872:             * by the index size, so that the same pointer is used to access the trie 
2873:             * data.
2874:             * @internal
2875:             */
2876:            ///CLOVER:OFF
2877:            public CharTrie getFCDTrie() {
2878:                return FCDTrieImpl.fcdTrie;
2879:            }
2880:
2881:            ///CLOVER:ON
2882:
2883:            /* compare canonically equivalent ---------------------------------------- */
2884:
2885:            /*
2886:             * Compare two strings for canonical equivalence.
2887:             * Further options include case-insensitive comparison and
2888:             * code point order (as opposed to code unit order).
2889:             *
2890:             * In this function, canonical equivalence is optional as well.
2891:             * If canonical equivalence is tested, then both strings must fulfill
2892:             * the FCD check.
2893:             *
2894:             * Semantically, this is equivalent to
2895:             *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
2896:             * where code point order, NFD and foldCase are all optional.
2897:             *
2898:             * String comparisons almost always yield results before processing both 
2899:             * strings completely.
2900:             * They are generally more efficient working incrementally instead of
2901:             * performing the sub-processing (strlen, normalization, case-folding)
2902:             * on the entire strings first.
2903:             *
2904:             * It is also unnecessary to normalize identical characters.
2905:             *
2906:             * This function works in principle as follows:
2907:             *
2908:             * loop {
2909:             *   get one code unit c1 from s1 (-1 if end of source)
2910:             *   get one code unit c2 from s2 (-1 if end of source)
2911:             *
2912:             *   if(either string finished) {
2913:             *     return result;
2914:             *   }
2915:             *   if(c1==c2) {
2916:             *     continue;
2917:             *   }
2918:             *
2919:             *   // c1!=c2
2920:             *   try to decompose/case-fold c1/c2, and continue if one does;
2921:             *
2922:             *   // still c1!=c2 and neither decomposes/case-folds, return result
2923:             *   return c1-c2;
2924:             * }
2925:             *
2926:             * When a character decomposes, then the pointer for that source changes to
2927:             * the decomposition, pushing the previous pointer onto a stack.
2928:             * When the end of the decomposition is reached, then the code unit reader
2929:             * pops the previous source from the stack.
2930:             * (Same for case-folding.)
2931:             *
2932:             * This is complicated further by operating on variable-width UTF-16.
2933:             * The top part of the loop works on code units, while lookups for decomposition
2934:             * and case-folding need code points.
2935:             * Code points are assembled after the equality/end-of-source part.
2936:             * The source pointer is only advanced beyond all code units when the code point
2937:             * actually decomposes/case-folds.
2938:             *
2939:             * If we were on a trail surrogate unit when assembling a code point,
2940:             * and the code point decomposes/case-folds, then the decomposition/folding
2941:             * result must be compared with the part of the other string that corresponds to
2942:             * this string's lead surrogate.
2943:             * Since we only assemble a code point when hitting a trail unit when the
2944:             * preceding lead units were identical, we back up the other string by one unit
2945:             * in such a case.
2946:             *
2947:             * The optional code point order comparison at the end works with
2948:             * the same fix-up as the other code point order comparison functions.
2949:             * See ustring.c and the comment near the end of this function.
2950:             *
2951:             * Assumption: A decomposition or case-folding result string never contains
2952:             * a single surrogate. This is a safe assumption in the Unicode Standard.
2953:             * Therefore, we do not need to check for surrogate pairs across
2954:             * decomposition/case-folding boundaries.
2955:             * Further assumptions (see the verifications in tstnorm.cpp):
2956:             * The API function checks for FCD first, while the core function
2957:             * first case-folds and then decomposes. This requires that case-folding does not
2958:             * un-FCD any strings.
2959:             *
2960:             * The API function may also NFD the input and turn off decomposition.
2961:             * This requires that case-folding does not un-NFD strings either.
2962:             *
2963:             * TODO If any of the above two assumptions is violated,
2964:             * then this entire code must be re-thought.
2965:             * If this happens, then a simple solution is to case-fold both strings up front
2966:             * and to turn off UNORM_INPUT_IS_FCD.
2967:             * We already do this when not both strings are in FCD because makeFCD
2968:             * would be a partial NFD before the case folding, which does not work.
2969:             * Note that all of this is only a problem when case-folding _and_
2970:             * canonical equivalence come together.
2971:             * 
2972:             * This function could be moved to a different source file, at increased cost
2973:             * for calling the decomposition access function.
2974:             */
2975:
2976:            // stack element for previous-level source/decomposition pointers
2977:            private static class CmpEquivLevel {
2978:                char[] source;
2979:                int start;
2980:                int s;
2981:                int limit;
2982:            }
2983:
2984:            /**
2985:             * Get the canonical decomposition for one code point.
2986:             * @param c code point
2987:             * @param buffer out-only buffer for algorithmic decompositions of Hangul
2988:             * @param length out-only, takes the length of the decomposition, if any
2989:             * @return index into the extraData array, or 0 if none
2990:             * @internal
2991:             */
2992:            private static int decompose(int c, char[] buffer) {
2993:
2994:                long norm32;
2995:                int length = 0;
2996:                norm32 = (long) ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie
2997:                        .getCodePointValue(c));
2998:                if ((norm32 & QC_NFD) != 0) {
2999:                    if (isNorm32HangulOrJamo(norm32)) {
3000:                        /* Hangul syllable: decompose algorithmically */
3001:                        char c2;
3002:
3003:                        c -= HANGUL_BASE;
3004:
3005:                        c2 = (char) (c % JAMO_T_COUNT);
3006:                        c /= JAMO_T_COUNT;
3007:                        if (c2 > 0) {
3008:                            buffer[2] = (char) (JAMO_T_BASE + c2);
3009:                            length = 3;
3010:                        } else {
3011:                            length = 2;
3012:                        }
3013:                        buffer[1] = (char) (JAMO_V_BASE + c % JAMO_V_COUNT);
3014:                        buffer[0] = (char) (JAMO_L_BASE + c / JAMO_V_COUNT);
3015:                        return length;
3016:                    } else {
3017:                        /* normal decomposition */
3018:                        DecomposeArgs args = new DecomposeArgs();
3019:                        int index = decompose(norm32, args);
3020:                        System.arraycopy(extraData, index, buffer, 0,
3021:                                args.length);
3022:                        return args.length;
3023:                    }
3024:                } else {
3025:                    return 0;
3026:                }
3027:            }
3028:
3029:            private static int foldCase(int c, char[] dest, int destStart,
3030:                    int destLimit, int options) {
3031:                String src = UTF16.valueOf(c);
3032:                String foldedStr = UCharacter.foldCase(src, options);
3033:                char[] foldedC = foldedStr.toCharArray();
3034:                for (int i = 0; i < foldedC.length; i++) {
3035:                    if (destStart < destLimit) {
3036:                        dest[destStart] = foldedC[i];
3037:                    }
3038:                    // always increment destStart so that we can return 
3039:                    // the required length
3040:                    destStart++;
3041:                }
3042:                return (c == UTF16.charAt(foldedStr, 0)) ? -destStart
3043:                        : destStart;
3044:            }
3045:
3046:            /*
3047:             private static int foldCase(char[] src,int srcStart,int srcLimit,
3048:                                        char[] dest, int destStart, int destLimit,
3049:                                        int options){
3050:                String source =new String(src,srcStart,(srcLimit-srcStart));
3051:                String foldedStr = UCharacter.foldCase(source,options);
3052:                char[] foldedC = foldedStr.toCharArray();
3053:                for(int i=0;i<foldedC.length;i++){
3054:                    if(destStart<destLimit){
3055:                        dest[destStart]=foldedC[i];
3056:                    }
3057:                    // always increment destStart so that we can return 
3058:                    // the required length
3059:                    destStart++;
3060:                    
3061:                }
3062:                return destStart;
3063:            }
3064:             */
3065:            public static int cmpEquivFold(String s1, String s2, int options) {
3066:                return cmpEquivFold(s1.toCharArray(), 0, s1.length(), s2
3067:                        .toCharArray(), 0, s2.length(), options);
3068:            }
3069:
3070:            // internal function
3071:            public static int cmpEquivFold(char[] s1, int s1Start, int s1Limit,
3072:                    char[] s2, int s2Start, int s2Limit, int options) {
3073:                // current-level start/limit - s1/s2 as current
3074:                int start1, start2, limit1, limit2;
3075:                char[] cSource1, cSource2;
3076:
3077:                cSource1 = s1;
3078:                cSource2 = s2;
3079:                // decomposition variables
3080:                int length;
3081:
3082:                // stacks of previous-level start/current/limit
3083:                CmpEquivLevel[] stack1 = new CmpEquivLevel[] {
3084:                        new CmpEquivLevel(), new CmpEquivLevel() };
3085:                CmpEquivLevel[] stack2 = new CmpEquivLevel[] {
3086:                        new CmpEquivLevel(), new CmpEquivLevel() };
3087:
3088:                // decomposition buffers for Hangul
3089:                char[] decomp1 = new char[8];
3090:                char[] decomp2 = new char[8];
3091:
3092:                // case folding buffers, only use current-level start/limit
3093:                char[] fold1 = new char[32];
3094:                char[] fold2 = new char[32];
3095:
3096:                // track which is the current level per string
3097:                int level1, level2;
3098:
3099:                // current code units, and code points for lookups
3100:                int c1, c2;
3101:                int cp1, cp2;
3102:
3103:                // no argument error checking because this itself is not an API
3104:
3105:                // assume that at least one of the options COMPARE_EQUIV and 
3106:                // COMPARE_IGNORE_CASE is set
3107:                // otherwise this function must behave exactly as uprv_strCompare()
3108:                // not checking for that here makes testing this function easier
3109:
3110:                // initialize
3111:                start1 = s1Start;
3112:                limit1 = s1Limit;
3113:
3114:                start2 = s2Start;
3115:                limit2 = s2Limit;
3116:
3117:                level1 = level2 = 0;
3118:                c1 = c2 = -1;
3119:                cp1 = cp2 = -1;
3120:                // comparison loop
3121:                for (;;) {
3122:                    // here a code unit value of -1 means "get another code unit"
3123:                    // below it will mean "this source is finished"
3124:
3125:                    if (c1 < 0) {
3126:                        // get next code unit from string 1, post-increment
3127:                        for (;;) {
3128:                            if (s1Start >= limit1) {
3129:                                if (level1 == 0) {
3130:                                    c1 = -1;
3131:                                    break;
3132:                                }
3133:                            } else {
3134:                                c1 = cSource1[s1Start];
3135:                                ++s1Start;
3136:                                break;
3137:                            }
3138:
3139:                            // reached end of level buffer, pop one level
3140:                            do {
3141:                                --level1;
3142:                                start1 = stack1[level1].start;
3143:                            } while (start1 == -1); //###### check this
3144:                            s1Start = stack1[level1].s;
3145:                            limit1 = stack1[level1].limit;
3146:                            cSource1 = stack1[level1].source;
3147:                        }
3148:                    }
3149:
3150:                    if (c2 < 0) {
3151:                        // get next code unit from string 2, post-increment
3152:                        for (;;) {
3153:                            if (s2Start >= limit2) {
3154:                                if (level2 == 0) {
3155:                                    c2 = -1;
3156:                                    break;
3157:                                }
3158:                            } else {
3159:                                c2 = cSource2[s2Start];
3160:                                ++s2Start;
3161:                                break;
3162:                            }
3163:
3164:                            // reached end of level buffer, pop one level
3165:                            do {
3166:                                --level2;
3167:                                start2 = stack2[level2].start;
3168:                            } while (start2 == -1);
3169:                            s2Start = stack2[level2].s;
3170:                            limit2 = stack2[level2].limit;
3171:                            cSource2 = stack2[level2].source;
3172:                        }
3173:                    }
3174:
3175:                    // compare c1 and c2
3176:                    // either variable c1, c2 is -1 only if the corresponding string 
3177:                    // is finished
3178:                    if (c1 == c2) {
3179:                        if (c1 < 0) {
3180:                            return 0; // c1==c2==-1 indicating end of strings
3181:                        }
3182:                        c1 = c2 = -1; // make us fetch new code units
3183:                        continue;
3184:                    } else if (c1 < 0) {
3185:                        return -1; // string 1 ends before string 2
3186:                    } else if (c2 < 0) {
3187:                        return 1; // string 2 ends before string 1
3188:                    }
3189:                    // c1!=c2 && c1>=0 && c2>=0
3190:
3191:                    // get complete code points for c1, c2 for lookups if either is a 
3192:                    // surrogate
3193:                    cp1 = c1;
3194:                    if (UTF16.isSurrogate((char) c1)) {
3195:                        char c;
3196:
3197:                        if (UTF16.isLeadSurrogate((char) c1)) {
3198:                            if (s1Start != limit1
3199:                                    && UTF16
3200:                                            .isTrailSurrogate(c = cSource1[s1Start])) {
3201:                                // advance ++s1; only below if cp1 decomposes/case-folds
3202:                                cp1 = UCharacterProperty.getRawSupplementary(
3203:                                        (char) c1, c);
3204:                            }
3205:                        } else /* isTrail(c1) */{
3206:                            if (start1 <= (s1Start - 2)
3207:                                    && UTF16
3208:                                            .isLeadSurrogate(c = cSource1[(s1Start - 2)])) {
3209:                                cp1 = UCharacterProperty.getRawSupplementary(c,
3210:                                        (char) c1);
3211:                            }
3212:                        }
3213:                    }
3214:                    cp2 = c2;
3215:                    if (UTF16.isSurrogate((char) c2)) {
3216:                        char c;
3217:
3218:                        if (UTF16.isLeadSurrogate((char) c2)) {
3219:                            if (s2Start != limit2
3220:                                    && UTF16
3221:                                            .isTrailSurrogate(c = cSource2[s2Start])) {
3222:                                // advance ++s2; only below if cp2 decomposes/case-folds
3223:                                cp2 = UCharacterProperty.getRawSupplementary(
3224:                                        (char) c2, c);
3225:                            }
3226:                        } else /* isTrail(c2) */{
3227:                            if (start2 <= (s2Start - 2)
3228:                                    && UTF16
3229:                                            .isLeadSurrogate(c = cSource2[s2Start - 2])) {
3230:                                cp2 = UCharacterProperty.getRawSupplementary(c,
3231:                                        (char) c2);
3232:                            }
3233:                        }
3234:                    }
3235:
3236:                    // go down one level for each string
3237:                    // continue with the main loop as soon as there is a real change
3238:                    if (level1 < 2
3239:                            && ((options & Normalizer.COMPARE_IGNORE_CASE) != 0)
3240:                            && (length = foldCase(cp1, fold1, 0, 32, options)) >= 0) {
3241:                        // cp1 case-folds to fold1[length]
3242:                        if (UTF16.isSurrogate((char) c1)) {
3243:                            if (UTF16.isLeadSurrogate((char) c1)) {
3244:                                // advance beyond source surrogate pair if it 
3245:                                // case-folds
3246:                                ++s1Start;
3247:                            } else /* isTrail(c1) */{
3248:                                // we got a supplementary code point when hitting its 
3249:                                // trail surrogate, therefore the lead surrogate must 
3250:                                // have been the same as in the other string;
3251:                                // compare this decomposition with the lead surrogate
3252:                                // in the other string
3253:                                --s2Start;
3254:                                c2 = cSource2[(s2Start - 1)];
3255:                            }
3256:                        }
3257:
3258:                        // push current level pointers
3259:                        stack1[0].start = start1;
3260:                        stack1[0].s = s1Start;
3261:                        stack1[0].limit = limit1;
3262:                        stack1[0].source = cSource1;
3263:                        ++level1;
3264:
3265:                        cSource1 = fold1;
3266:                        start1 = s1Start = 0;
3267:                        limit1 = length;
3268:
3269:                        // get ready to read from decomposition, continue with loop
3270:                        c1 = -1;
3271:                        continue;
3272:                    }
3273:
3274:                    if (level2 < 2
3275:                            && ((options & Normalizer.COMPARE_IGNORE_CASE) != 0)
3276:                            && (length = foldCase(cp2, fold2, 0, 32, options)) >= 0) {
3277:                        // cp2 case-folds to fold2[length]
3278:                        if (UTF16.isSurrogate((char) c2)) {
3279:                            if (UTF16.isLeadSurrogate((char) c2)) {
3280:                                // advance beyond source surrogate pair if it 
3281:                                // case-folds
3282:                                ++s2Start;
3283:                            } else /* isTrail(c2) */{
3284:                                // we got a supplementary code point when hitting its 
3285:                                // trail surrogate, therefore the lead surrogate must 
3286:                                // have been the same as in the other string;
3287:                                // compare this decomposition with the lead surrogate 
3288:                                // in the other string
3289:                                --s1Start;
3290:                                c1 = cSource1[(s1Start - 1)];
3291:                            }
3292:                        }
3293:
3294:                        // push current level pointers
3295:                        stack2[0].start = start2;
3296:                        stack2[0].s = s2Start;
3297:                        stack2[0].limit = limit2;
3298:                        stack2[0].source = cSource2;
3299:                        ++level2;
3300:
3301:                        cSource2 = fold2;
3302:                        start2 = s2Start = 0;
3303:                        limit2 = length;
3304:
3305:                        // get ready to read from decomposition, continue with loop
3306:                        c2 = -1;
3307:                        continue;
3308:                    }
3309:
3310:                    if (level1 < 2 && ((options & COMPARE_EQUIV) != 0)
3311:                            && 0 != (length = decompose(cp1, decomp1))) {
3312:                        // cp1 decomposes into p[length]
3313:                        if (UTF16.isSurrogate((char) c1)) {
3314:                            if (UTF16.isLeadSurrogate((char) c1)) {
3315:                                // advance beyond source surrogate pair if it 
3316:                                //decomposes
3317:                                ++s1Start;
3318:                            } else /* isTrail(c1) */{
3319:                                // we got a supplementary code point when hitting 
3320:                                // its trail surrogate, therefore the lead surrogate 
3321:                                // must have been the same as in the other string;
3322:                                // compare this decomposition with the lead surrogate 
3323:                                // in the other string
3324:                                --s2Start;
3325:                                c2 = cSource2[(s2Start - 1)];
3326:                            }
3327:                        }
3328:
3329:                        // push current level pointers
3330:                        stack1[level1].start = start1;
3331:                        stack1[level1].s = s1Start;
3332:                        stack1[level1].limit = limit1;
3333:                        stack1[level1].source = cSource1;
3334:                        ++level1;
3335:
3336:                        // set next level pointers to decomposition
3337:                        cSource1 = decomp1;
3338:                        start1 = s1Start = 0;
3339:                        limit1 = length;
3340:
3341:                        // set empty intermediate level if skipped
3342:                        if (level1 < 2) {
3343:                            stack1[level1++].start = -1;
3344:                        }
3345:                        // get ready to read from decomposition, continue with loop
3346:                        c1 = -1;
3347:                        continue;
3348:                    }
3349:
3350:                    if (level2 < 2 && ((options & COMPARE_EQUIV) != 0)
3351:                            && 0 != (length = decompose(cp2, decomp2))) {
3352:                        // cp2 decomposes into p[length]
3353:                        if (UTF16.isSurrogate((char) c2)) {
3354:                            if (UTF16.isLeadSurrogate((char) c2)) {
3355:                                // advance beyond source surrogate pair if it 
3356:                                // decomposes
3357:                                ++s2Start;
3358:                            } else /* isTrail(c2) */{
3359:                                // we got a supplementary code point when hitting its 
3360:                                // trail surrogate, therefore the lead surrogate must 
3361:                                // have been the same as in the other string;
3362:                                // compare this decomposition with the lead surrogate 
3363:                                // in the other string
3364:                                --s1Start;
3365:                                c1 = cSource1[(s1Start - 1)];
3366:                            }
3367:                        }
3368:
3369:                        // push current level pointers
3370:                        stack2[level2].start = start2;
3371:                        stack2[level2].s = s2Start;
3372:                        stack2[level2].limit = limit2;
3373:                        stack2[level2].source = cSource2;
3374:                        ++level2;
3375:
3376:                        // set next level pointers to decomposition
3377:                        cSource2 = decomp2;
3378:                        start2 = s2Start = 0;
3379:                        limit2 = length;
3380:
3381:                        // set empty intermediate level if skipped
3382:                        if (level2 < 2) {
3383:                            stack2[level2++].start = -1;
3384:                        }
3385:
3386:                        // get ready to read from decomposition, continue with loop
3387:                        c2 = -1;
3388:                        continue;
3389:                    }
3390:
3391:                    // no decomposition/case folding, max level for both sides:
3392:                    // return difference result
3393:
3394:                    // code point order comparison must not just return cp1-cp2
3395:                    // because when single surrogates are present then the surrogate 
3396:                    // pairs that formed cp1 and cp2 may be from different string 
3397:                    // indexes
3398:
3399:                    // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at 
3400:                    // second code units
3401:                    // c1=d800 cp1=10001 c2=dc00 cp2=10000
3402:                    // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 
3403:                    // it is { d800 10001 } < { 10000 }
3404:                    // therefore fix-up 
3405:
3406:                    if (c1 >= 0xd800
3407:                            && c2 >= 0xd800
3408:                            && ((options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0)) {
3409:                        /* subtract 0x2800 from BMP code points to make them smaller 
3410:                         * than supplementary ones */
3411:                        if ((c1 <= 0xdbff && s1Start != limit1 && UTF16
3412:                                .isTrailSurrogate(cSource1[s1Start]))
3413:                                || (UTF16.isTrailSurrogate((char) c1)
3414:                                        && start1 != (s1Start - 1) && UTF16
3415:                                        .isLeadSurrogate(cSource1[(s1Start - 2)]))) {
3416:                            /* part of a surrogate pair, leave >=d800 */
3417:                        } else {
3418:                            /* BMP code point - may be surrogate code point - 
3419:                             * make <d800 */
3420:                            c1 -= 0x2800;
3421:                        }
3422:
3423:                        if ((c2 <= 0xdbff && s2Start != limit2 && UTF16
3424:                                .isTrailSurrogate(cSource2[s2Start]))
3425:                                || (UTF16.isTrailSurrogate((char) c2)
3426:                                        && start2 != (s2Start - 1) && UTF16
3427:                                        .isLeadSurrogate(cSource2[(s2Start - 2)]))) {
3428:                            /* part of a surrogate pair, leave >=d800 */
3429:                        } else {
3430:                            /* BMP code point - may be surrogate code point - 
3431:                             * make <d800 */
3432:                            c2 -= 0x2800;
3433:                        }
3434:                    }
3435:
3436:                    return c1 - c2;
3437:                }
3438:            }
3439:
3440:            private static int strCompare(char[] s1, int s1Start, int s1Limit,
3441:                    char[] s2, int s2Start, int s2Limit, boolean codePointOrder) {
3442:
3443:                int start1, start2, limit1, limit2;
3444:
3445:                char c1, c2;
3446:
3447:                /* setup for fix-up */
3448:                start1 = s1Start;
3449:                start2 = s2Start;
3450:
3451:                int length1, length2;
3452:
3453:                length1 = s1Limit - s1Start;
3454:                length2 = s2Limit - s2Start;
3455:
3456:                int lengthResult;
3457:
3458:                if (length1 < length2) {
3459:                    lengthResult = -1;
3460:                    limit1 = start1 + length1;
3461:                } else if (length1 == length2) {
3462:                    lengthResult = 0;
3463:                    limit1 = start1 + length1;
3464:                } else /* length1>length2 */{
3465:                    lengthResult = 1;
3466:                    limit1 = start1 + length2;
3467:                }
3468:
3469:                if (s1 == s2) {
3470:                    return lengthResult;
3471:                }
3472:
3473:                for (;;) {
3474:                    /* check pseudo-limit */
3475:                    if (s1Start == limit1) {
3476:                        return lengthResult;
3477:                    }
3478:
3479:                    c1 = s1[s1Start];
3480:                    c2 = s2[s2Start];
3481:                    if (c1 != c2) {
3482:                        break;
3483:                    }
3484:                    ++s1Start;
3485:                    ++s2Start;
3486:                }
3487:
3488:                /* setup for fix-up */
3489:                limit1 = start1 + length1;
3490:                limit2 = start2 + length2;
3491:
3492:                /* if both values are in or above the surrogate range, fix them up */
3493:                if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
3494:                    /* subtract 0x2800 from BMP code points to make them smaller than
3495:                     *  supplementary ones */
3496:                    if ((c1 <= 0xdbff && (s1Start + 1) != limit1 && UTF16
3497:                            .isTrailSurrogate(s1[(s1Start + 1)]))
3498:                            || (UTF16.isTrailSurrogate(c1) && start1 != s1Start && UTF16
3499:                                    .isLeadSurrogate(s1[(s1Start - 1)]))) {
3500:                        /* part of a surrogate pair, leave >=d800 */
3501:                    } else {
3502:                        /* BMP code point - may be surrogate code point - make <d800 */
3503:                        c1 -= 0x2800;
3504:                    }
3505:
3506:                    if ((c2 <= 0xdbff && (s2Start + 1) != limit2 && UTF16
3507:                            .isTrailSurrogate(s2[(s2Start + 1)]))
3508:                            || (UTF16.isTrailSurrogate(c2) && start2 != s2Start && UTF16
3509:                                    .isLeadSurrogate(s2[(s2Start - 1)]))) {
3510:                        /* part of a surrogate pair, leave >=d800 */
3511:                    } else {
3512:                        /* BMP code point - may be surrogate code point - make <d800 */
3513:                        c2 -= 0x2800;
3514:                    }
3515:                }
3516:
3517:                /* now c1 and c2 are in UTF-32-compatible order */
3518:                return (int) c1 - (int) c2;
3519:            }
3520:
3521:            /*
3522:             * Status of tailored normalization
3523:             *
3524:             * This was done initially for investigation on Unicode public review issue 7
3525:             * (http://www.unicode.org/review/). See Jitterbug 2481.
3526:             * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
3527:             * a permanent feature in ICU 2.6 in support of IDNA which requires true
3528:             * Unicode 3.2 normalization.
3529:             * (NormalizationCorrections are rolled into IDNA mapping tables.)
3530:             *
3531:             * Tailored normalization as implemented here allows to "normalize less"
3532:             * than full Unicode normalization would.
3533:             * Based internally on a UnicodeSet of code points that are
3534:             * "excluded from normalization", the normalization functions leave those
3535:             * code points alone ("inert"). This means that tailored normalization
3536:             * still transforms text into a canonically equivalent form.
3537:             * It does not add decompositions to code points that do not have any or
3538:             * change decomposition results.
3539:             *
3540:             * Any function that searches for a safe boundary has not been touched,
3541:             * which means that these functions will be over-pessimistic when
3542:             * exclusions are applied.
3543:             * This should not matter because subsequent checks and normalizations
3544:             * do apply the exclusions; only a little more of the text may be processed
3545:             * than necessary under exclusions.
3546:             *
3547:             * Normalization exclusions have the following effect on excluded code points c:
3548:             * - c is not decomposed
3549:             * - c is not a composition target
3550:             * - c does not combine forward or backward for composition
3551:             *   except that this is not implemented for Jamo
3552:             * - c is treated as having a combining class of 0
3553:             */
3554:
3555:            /* 
3556:             * Constants for the bit fields in the options bit set parameter. 
3557:             * These need not be public. 
3558:             * A user only needs to know the currently assigned values. 
3559:             * The number and positions of reserved bits per field can remain private. 
3560:             */
3561:            private static final int OPTIONS_NX_MASK = 0x1f;
3562:            private static final int OPTIONS_UNICODE_MASK = 0xe0;
3563:            public static final int OPTIONS_SETS_MASK = 0xff;
3564:            private static final int OPTIONS_UNICODE_SHIFT = 5;
3565:            private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK + 1];
3566:
3567:            /* Constants for options flags for normalization.*/
3568:
3569:            /** 
3570:             * Options bit 0, do not decompose Hangul syllables. 
3571:             * @draft ICU 2.6 
3572:             */
3573:            private static final int NX_HANGUL = 1;
3574:            /** 
3575:             * Options bit 1, do not decompose CJK compatibility characters.
3576:             * @draft ICU 2.6 
3577:             */
3578:            private static final int NX_CJK_COMPAT = 2;
3579:            /**
3580:             * Options bit 8, use buggy recomposition described in
3581:             * Unicode Public Review Issue #29
3582:             * at http://www.unicode.org/review/resolved-pri.html#pri29
3583:             *
3584:             * Used in IDNA implementation according to strict interpretation
3585:             * of IDNA definition based on Unicode 3.2 which predates PRI #29.
3586:             *
3587:             * See ICU4C unormimp.h
3588:             * 
3589:             * @draft ICU 3.2
3590:             */
3591:            public static final int BEFORE_PRI_29 = 0x100;
3592:
3593:            /*
3594:             * The following options are used only in some composition functions.
3595:             * They use bits 12 and up to preserve lower bits for the available options
3596:             * space in unorm_compare() -
3597:             * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
3598:             */
3599:
3600:            /** Options bit 12, for compatibility vs. canonical decomposition. */
3601:            public static final int OPTIONS_COMPAT = 0x1000;
3602:            /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
3603:            public static final int OPTIONS_COMPOSE_CONTIGUOUS = 0x2000;
3604:
3605:            /* normalization exclusion sets --------------------------------------------- */
3606:
3607:            /*
3608:             * Normalization exclusion UnicodeSets are used for tailored normalization;
3609:             * see the comment near the beginning of this file.
3610:             *
3611:             * By specifying one or several sets of code points,
3612:             * those code points become inert for normalization.
3613:             */
3614:            private static final synchronized UnicodeSet internalGetNXHangul() {
3615:                /* internal function, does not check for incoming U_FAILURE */
3616:
3617:                if (nxCache[NX_HANGUL] == null) {
3618:                    nxCache[NX_HANGUL] = new UnicodeSet(0xac00, 0xd7a3);
3619:                }
3620:                return nxCache[NX_HANGUL];
3621:            }
3622:
3623:            private static final synchronized UnicodeSet internalGetNXCJKCompat() {
3624:                /* internal function, does not check for incoming U_FAILURE */
3625:
3626:                if (nxCache[NX_CJK_COMPAT] == null) {
3627:
3628:                    /* build a set from [CJK Ideographs]&[has canonical decomposition] */
3629:                    UnicodeSet set, hasDecomp;
3630:
3631:                    set = new UnicodeSet("[:Ideographic:]");
3632:
3633:                    /* start with an empty set for [has canonical decomposition] */
3634:                    hasDecomp = new UnicodeSet();
3635:
3636:                    /* iterate over all ideographs and remember which canonically decompose */
3637:                    UnicodeSetIterator it = new UnicodeSetIterator(set);
3638:                    int start, end;
3639:                    long norm32;
3640:
3641:                    while (it.nextRange()
3642:                            && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
3643:                        start = it.codepoint;
3644:                        end = it.codepointEnd;
3645:                        while (start <= end) {
3646:                            norm32 = getNorm32(start);
3647:                            if ((norm32 & QC_NFD) > 0) {
3648:                                hasDecomp.add(start);
3649:                            }
3650:                            ++start;
3651:                        }
3652:                    }
3653:
3654:                    /* hasDecomp now contains all ideographs that decompose canonically */
3655:                    nxCache[NX_CJK_COMPAT] = hasDecomp;
3656:
3657:                }
3658:
3659:                return nxCache[NX_CJK_COMPAT];
3660:            }
3661:
3662:            private static final synchronized UnicodeSet internalGetNXUnicode(
3663:                    int options) {
3664:                options &= OPTIONS_UNICODE_MASK;
3665:                if (options == 0) {
3666:                    return null;
3667:                }
3668:
3669:                if (nxCache[options] == null) {
3670:                    /* build a set with all code points that were not designated by the specified Unicode version */
3671:                    UnicodeSet set = new UnicodeSet();
3672:
3673:                    switch (options) {
3674:                    case Normalizer.UNICODE_3_2:
3675:                        set.applyPattern("[:^Age=3.2:]");
3676:                        break;
3677:                    default:
3678:                        return null;
3679:                    }
3680:
3681:                    nxCache[options] = set;
3682:                }
3683:
3684:                return nxCache[options];
3685:            }
3686:
3687:            /* Get a decomposition exclusion set. The data must be loaded. */
3688:            private static final synchronized UnicodeSet internalGetNX(
3689:                    int options) {
3690:                options &= OPTIONS_SETS_MASK;
3691:
3692:                if (nxCache[options] == null) {
3693:                    /* return basic sets */
3694:                    if (options == NX_HANGUL) {
3695:                        return internalGetNXHangul();
3696:                    }
3697:                    if (options == NX_CJK_COMPAT) {
3698:                        return internalGetNXCJKCompat();
3699:                    }
3700:                    if ((options & OPTIONS_UNICODE_MASK) != 0
3701:                            && (options & OPTIONS_NX_MASK) == 0) {
3702:                        return internalGetNXUnicode(options);
3703:                    }
3704:
3705:                    /* build a set from multiple subsets */
3706:                    UnicodeSet set;
3707:                    UnicodeSet other;
3708:
3709:                    set = new UnicodeSet();
3710:
3711:                    if ((options & NX_HANGUL) != 0
3712:                            && null != (other = internalGetNXHangul())) {
3713:                        set.addAll(other);
3714:                    }
3715:                    if ((options & NX_CJK_COMPAT) != 0
3716:                            && null != (other = internalGetNXCJKCompat())) {
3717:                        set.addAll(other);
3718:                    }
3719:                    if ((options & OPTIONS_UNICODE_MASK) != 0
3720:                            && null != (other = internalGetNXUnicode(options))) {
3721:                        set.addAll(other);
3722:                    }
3723:
3724:                    nxCache[options] = set;
3725:                }
3726:                return nxCache[options];
3727:            }
3728:
3729:            public static final UnicodeSet getNX(int options) {
3730:                if ((options &= OPTIONS_SETS_MASK) == 0) {
3731:                    /* incoming failure, or no decomposition exclusions requested */
3732:                    return null;
3733:                } else {
3734:                    return internalGetNX(options);
3735:                }
3736:            }
3737:
3738:            private static final boolean nx_contains(UnicodeSet nx, int c) {
3739:                return nx != null && nx.contains(c);
3740:            }
3741:
3742:            private static final boolean nx_contains(UnicodeSet nx, char c,
3743:                    char c2) {
3744:                return nx != null
3745:                        && nx.contains(c2 == 0 ? c : UCharacterProperty
3746:                                .getRawSupplementary(c, c2));
3747:            }
3748:
3749:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.