Source Code Cross Referenced for UCaseProps.java in » Internationalization-Localization » icu4j » com » ibm » icu » impl » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.impl
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        /*
0002:         *******************************************************************************
0003:         *
0004:         *   Copyright (C) 2004-2006, International Business Machines
0005:         *   Corporation and others.  All Rights Reserved.
0006:         *
0007:         *******************************************************************************
0008:         *   file name:  UCaseProps.java
0009:         *   encoding:   US-ASCII
0010:         *   tab size:   8 (not used)
0011:         *   indentation:4
0012:         *
0013:         *   created on: 2005jan29
0014:         *   created by: Markus W. Scherer
0015:         *
0016:         *   Low-level Unicode character/string case mapping code.
0017:         *   Java port of ucase.h/.c.
0018:         */
0019:
0020:        package com.ibm.icu.impl;
0021:
0022:        import java.io.InputStream;
0023:        import java.io.DataInputStream;
0024:        import java.io.BufferedInputStream;
0025:        import java.io.IOException;
0026:
0027:        import com.ibm.icu.util.RangeValueIterator;
0028:        import com.ibm.icu.util.ULocale;
0029:
0030:        import com.ibm.icu.text.UTF16;
0031:        import com.ibm.icu.text.UnicodeSet;
0032:
0033:        import com.ibm.icu.lang.UCharacter;
0034:
0035:        public final class UCaseProps {
0036:            // constructors etc. --------------------------------------------------- ***
0037:
0038:            // port of ucase_openProps()
0039:            public UCaseProps() throws IOException {
0040:                InputStream is = ICUData
0041:                        .getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/"
0042:                                + DATA_FILE_NAME);
0043:                BufferedInputStream b = new BufferedInputStream(is, 4096 /* data buffer size */);
0044:                readData(b);
0045:                b.close();
0046:                is.close();
0047:            }
0048:
0049:            private final void readData(InputStream is) throws IOException {
0050:                DataInputStream inputStream = new DataInputStream(is);
0051:
0052:                // read the header
0053:                unicodeVersion = ICUBinary.readHeader(inputStream, FMT,
0054:                        new IsAcceptable());
0055:
0056:                // read indexes[]
0057:                int i, count;
0058:                count = inputStream.readInt();
0059:                if (count < IX_INDEX_TOP) {
0060:                    throw new IOException("indexes[0] too small in "
0061:                            + DATA_FILE_NAME);
0062:                }
0063:                indexes = new int[count];
0064:
0065:                indexes[0] = count;
0066:                for (i = 1; i < count; ++i) {
0067:                    indexes[i] = inputStream.readInt();
0068:                }
0069:
0070:                // read the trie
0071:                trie = new CharTrie(inputStream, null);
0072:
0073:                // read exceptions[]
0074:                count = indexes[IX_EXC_LENGTH];
0075:                if (count > 0) {
0076:                    exceptions = new char[count];
0077:                    for (i = 0; i < count; ++i) {
0078:                        exceptions[i] = inputStream.readChar();
0079:                    }
0080:                }
0081:
0082:                // read unfold[]
0083:                count = indexes[IX_UNFOLD_LENGTH];
0084:                if (count > 0) {
0085:                    unfold = new char[count];
0086:                    for (i = 0; i < count; ++i) {
0087:                        unfold[i] = inputStream.readChar();
0088:                    }
0089:                }
0090:            }
0091:
0092:            // implement ICUBinary.Authenticate
0093:            private final class IsAcceptable implements  ICUBinary.Authenticate {
0094:                public boolean isDataVersionAcceptable(byte version[]) {
0095:                    formatVersion = version;
0096:                    return version[0] == 1
0097:                            && version[2] == Trie.INDEX_STAGE_1_SHIFT_
0098:                            && version[3] == Trie.INDEX_STAGE_2_SHIFT_;
0099:                }
0100:            }
0101:
0102:            // UCaseProps singleton
0103:            private static UCaseProps gCsp = null;
0104:
0105:            // port of ucase_getSingleton()
0106:            public static final synchronized UCaseProps getSingleton()
0107:                    throws IOException {
0108:                if (gCsp == null) {
0109:                    gCsp = new UCaseProps();
0110:                }
0111:                return gCsp;
0112:            }
0113:
0114:            // UCaseProps dummy singleton
0115:            private static UCaseProps gCspDummy = null;
0116:
0117:            private UCaseProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
0118:                formatVersion = new byte[] { 1, 0, Trie.INDEX_STAGE_1_SHIFT_,
0119:                        Trie.INDEX_STAGE_2_SHIFT_ };
0120:                unicodeVersion = new byte[] { 2, 0, 0, 0 };
0121:                indexes = new int[IX_TOP];
0122:                indexes[0] = IX_TOP;
0123:                trie = new CharTrie(0, 0, null); // dummy trie, always returns 0
0124:            }
0125:
0126:            /**
0127:             * Get a singleton dummy object, one that works with no real data.
0128:             * This can be used when the real data is not available.
0129:             * Using the dummy can reduce checks for available data after an initial failure.
0130:             * Port of ucase_getDummy().
0131:             */
0132:            public static final synchronized UCaseProps getDummy() {
0133:                if (gCspDummy == null) {
0134:                    gCspDummy = new UCaseProps(true);
0135:                }
0136:                return gCspDummy;
0137:            }
0138:
0139:            // set of property starts for UnicodeSet ------------------------------- ***
0140:
0141:            public final void addPropertyStarts(UnicodeSet set) {
0142:                /* add the start code point of each same-value range of the trie */
0143:                TrieIterator iter = new TrieIterator(trie);
0144:                RangeValueIterator.Element element = new RangeValueIterator.Element();
0145:
0146:                while (iter.next(element)) {
0147:                    set.add(element.start);
0148:                }
0149:
0150:                /* add code points with hardcoded properties, plus the ones following them */
0151:
0152:                /* (none right now, see comment below) */
0153:
0154:                /*
0155:                 * Omit code points with hardcoded specialcasing properties
0156:                 * because we do not build property UnicodeSets for them right now.
0157:                 */
0158:            }
0159:
0160:            // data access primitives ---------------------------------------------- ***
0161:            private static final int getExceptionsOffset(int props) {
0162:                return props >> EXC_SHIFT;
0163:            }
0164:
0165:            private static final boolean propsHasException(int props) {
0166:                return (props & EXCEPTION) != 0;
0167:            }
0168:
0169:            /* number of bits in an 8-bit integer value */
0170:            private static final byte flagsOffset[/*256*/] = { 0, 1, 1, 2, 1,
0171:                    2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2,
0172:                    3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3,
0173:                    4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1,
0174:                    2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3,
0175:                    4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3,
0176:                    4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5,
0177:                    6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
0178:                    3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3,
0179:                    4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
0180:                    5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4,
0181:                    5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3,
0182:                    4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5,
0183:                    6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 };
0184:
0185:            private static final boolean hasSlot(int flags, int index) {
0186:                return (flags & (1 << index)) != 0;
0187:            }
0188:
0189:            private static final byte slotOffset(int flags, int index) {
0190:                return flagsOffset[flags & ((1 << index) - 1)];
0191:            }
0192:
0193:            /*
0194:             * Get the value of an optional-value slot where hasSlot(excWord, index).
0195:             *
0196:             * @param excWord (in) initial exceptions word
0197:             * @param index (in) desired slot index
0198:             * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
0199:             * @return bits 31..0: slot value
0200:             *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot 
0201:             */
0202:            private final long getSlotValueAndOffset(int excWord, int index,
0203:                    int excOffset) {
0204:                long value;
0205:                if ((excWord & EXC_DOUBLE_SLOTS) == 0) {
0206:                    excOffset += slotOffset(excWord, index);
0207:                    value = exceptions[excOffset];
0208:                } else {
0209:                    excOffset += 2 * slotOffset(excWord, index);
0210:                    value = exceptions[excOffset++];
0211:                    value = (value << 16) | exceptions[excOffset];
0212:                }
0213:                return (long) value | ((long) excOffset << 32);
0214:            }
0215:
0216:            /* same as getSlotValueAndOffset() but does not return the slot offset */
0217:            private final int getSlotValue(int excWord, int index, int excOffset) {
0218:                int value;
0219:                if ((excWord & EXC_DOUBLE_SLOTS) == 0) {
0220:                    excOffset += slotOffset(excWord, index);
0221:                    value = exceptions[excOffset];
0222:                } else {
0223:                    excOffset += 2 * slotOffset(excWord, index);
0224:                    value = exceptions[excOffset++];
0225:                    value = (value << 16) | exceptions[excOffset];
0226:                }
0227:                return value;
0228:            }
0229:
0230:            // simple case mappings ------------------------------------------------ ***
0231:
0232:            public final int tolower(int c) {
0233:                int props = trie.getCodePointValue(c);
0234:                if (!propsHasException(props)) {
0235:                    if (getTypeFromProps(props) >= UPPER) {
0236:                        c += getDelta(props);
0237:                    }
0238:                } else {
0239:                    int excOffset = getExceptionsOffset(props);
0240:                    int excWord = exceptions[excOffset++];
0241:                    if (hasSlot(excWord, EXC_LOWER)) {
0242:                        c = getSlotValue(excWord, EXC_LOWER, excOffset);
0243:                    }
0244:                }
0245:                return c;
0246:            }
0247:
0248:            public final int toupper(int c) {
0249:                int props = trie.getCodePointValue(c);
0250:                if (!propsHasException(props)) {
0251:                    if (getTypeFromProps(props) == LOWER) {
0252:                        c += getDelta(props);
0253:                    }
0254:                } else {
0255:                    int excOffset = getExceptionsOffset(props);
0256:                    int excWord = exceptions[excOffset++];
0257:                    if (hasSlot(excWord, EXC_UPPER)) {
0258:                        c = getSlotValue(excWord, EXC_UPPER, excOffset);
0259:                    }
0260:                }
0261:                return c;
0262:            }
0263:
0264:            public final int totitle(int c) {
0265:                int props = trie.getCodePointValue(c);
0266:                if (!propsHasException(props)) {
0267:                    if (getTypeFromProps(props) == LOWER) {
0268:                        c += getDelta(props);
0269:                    }
0270:                } else {
0271:                    int excOffset = getExceptionsOffset(props);
0272:                    int excWord = exceptions[excOffset++];
0273:                    int index;
0274:                    if (hasSlot(excWord, EXC_TITLE)) {
0275:                        index = EXC_TITLE;
0276:                    } else if (hasSlot(excWord, EXC_UPPER)) {
0277:                        index = EXC_UPPER;
0278:                    } else {
0279:                        return c;
0280:                    }
0281:                    c = getSlotValue(excWord, index, excOffset);
0282:                }
0283:                return c;
0284:            }
0285:
0286:            /**
0287:             * Adds all simple case mappings and the full case folding for c to sa,
0288:             * and also adds special case closure mappings.
0289:             * c itself is not added.
0290:             * For example, the mappings
0291:             * - for s include long s
0292:             * - for sharp s include ss
0293:             * - for k include the Kelvin sign
0294:             */
0295:            public final void addCaseClosure(int c, UnicodeSet set) {
0296:                /*
0297:                 * Hardcode the case closure of i and its relatives and ignore the
0298:                 * data file data for these characters.
0299:                 * The Turkic dotless i and dotted I with their case mapping conditions
0300:                 * and case folding option make the related characters behave specially.
0301:                 * This code matches their closure behavior to their case folding behavior.
0302:                 */
0303:
0304:                switch (c) {
0305:                case 0x49:
0306:                    /* regular i and I are in one equivalence class */
0307:                    set.add(0x69);
0308:                    return;
0309:                case 0x69:
0310:                    set.add(0x49);
0311:                    return;
0312:                case 0x130:
0313:                    /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
0314:                    set.add(iDot);
0315:                    return;
0316:                case 0x131:
0317:                    /* dotless i is in a class by itself */
0318:                    return;
0319:                default:
0320:                    /* otherwise use the data file data */
0321:                    break;
0322:                }
0323:
0324:                int props = trie.getCodePointValue(c);
0325:                if (!propsHasException(props)) {
0326:                    if (getTypeFromProps(props) != NONE) {
0327:                        /* add the one simple case mapping, no matter what type it is */
0328:                        int delta = getDelta(props);
0329:                        if (delta != 0) {
0330:                            set.add(c + delta);
0331:                        }
0332:                    }
0333:                } else {
0334:                    /*
0335:                     * c has exceptions, so there may be multiple simple and/or
0336:                     * full case mappings. Add them all.
0337:                     */
0338:                    int excOffset0, excOffset = getExceptionsOffset(props);
0339:                    int closureOffset;
0340:                    int excWord = exceptions[excOffset++];
0341:                    int index, closureLength, fullLength, length;
0342:
0343:                    excOffset0 = excOffset;
0344:
0345:                    /* add all simple case mappings */
0346:                    for (index = EXC_LOWER; index <= EXC_TITLE; ++index) {
0347:                        if (hasSlot(excWord, index)) {
0348:                            excOffset = excOffset0;
0349:                            c = getSlotValue(excWord, index, excOffset);
0350:                            set.add(c);
0351:                        }
0352:                    }
0353:
0354:                    /* get the closure string pointer & length */
0355:                    if (hasSlot(excWord, EXC_CLOSURE)) {
0356:                        excOffset = excOffset0;
0357:                        long value = getSlotValueAndOffset(excWord,
0358:                                EXC_CLOSURE, excOffset);
0359:                        closureLength = (int) value & CLOSURE_MAX_LENGTH; /* higher bits are reserved */
0360:                        closureOffset = (int) (value >> 32) + 1; /* behind this slot, unless there are full case mappings */
0361:                    } else {
0362:                        closureLength = 0;
0363:                        closureOffset = 0;
0364:                    }
0365:
0366:                    /* add the full case folding */
0367:                    if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
0368:                        excOffset = excOffset0;
0369:                        long value = getSlotValueAndOffset(excWord,
0370:                                EXC_FULL_MAPPINGS, excOffset);
0371:                        fullLength = (int) value;
0372:
0373:                        /* start of full case mapping strings */
0374:                        excOffset = (int) (value >> 32) + 1;
0375:
0376:                        fullLength &= 0xffff; /* bits 16 and higher are reserved */
0377:
0378:                        /* skip the lowercase result string */
0379:                        excOffset += fullLength & FULL_LOWER;
0380:                        fullLength >>= 4;
0381:
0382:                        /* add the full case folding string */
0383:                        length = fullLength & 0xf;
0384:                        if (length != 0) {
0385:                            set.add(new String(exceptions, excOffset, length));
0386:                            excOffset += length;
0387:                        }
0388:
0389:                        /* skip the uppercase and titlecase strings */
0390:                        fullLength >>= 4;
0391:                        excOffset += fullLength & 0xf;
0392:                        fullLength >>= 4;
0393:                        excOffset += fullLength;
0394:
0395:                        closureOffset = excOffset; /* behind full case mappings */
0396:                    }
0397:
0398:                    /* add each code point in the closure string */
0399:                    for (index = 0; index < closureLength; index += UTF16
0400:                            .getCharCount(c)) {
0401:                        c = UTF16.charAt(exceptions, closureOffset,
0402:                                exceptions.length, index);
0403:                        set.add(c);
0404:                    }
0405:                }
0406:            }
0407:
0408:            /*
0409:             * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
0410:             * must be s.length()>0 and max>0 and s.length()<=max
0411:             */
0412:            private final int strcmpMax(String s, int unfoldOffset, int max) {
0413:                int i1, length, c1, c2;
0414:
0415:                length = s.length();
0416:                max -= length; /* we require length<=max, so no need to decrement max in the loop */
0417:                i1 = 0;
0418:                do {
0419:                    c1 = s.charAt(i1++);
0420:                    c2 = unfold[unfoldOffset++];
0421:                    if (c2 == 0) {
0422:                        return 1; /* reached the end of t but not of s */
0423:                    }
0424:                    c1 -= c2;
0425:                    if (c1 != 0) {
0426:                        return c1; /* return difference result */
0427:                    }
0428:                } while (--length > 0);
0429:                /* ends with length==0 */
0430:
0431:                if (max == 0 || unfold[unfoldOffset] == 0) {
0432:                    return 0; /* equal to length of both strings */
0433:                } else {
0434:                    return -max; /* return lengh difference */
0435:                }
0436:            }
0437:
0438:            /**
0439:             * Maps the string to single code points and adds the associated case closure
0440:             * mappings.
0441:             * The string is mapped to code points if it is their full case folding string.
0442:             * In other words, this performs a reverse full case folding and then
0443:             * adds the case closure items of the resulting code points.
0444:             * If the string is found and its closure applied, then
0445:             * the string itself is added as well as part of its code points' closure.
0446:             *
0447:             * @return true if the string was found
0448:             */
0449:            public final boolean addStringCaseClosure(String s, UnicodeSet set) {
0450:                int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
0451:
0452:                if (unfold == null || s == null) {
0453:                    return false; /* no reverse case folding data, or no string */
0454:                }
0455:                length = s.length();
0456:                if (length <= 1) {
0457:                    /* the string is too short to find any match */
0458:                    /*
0459:                     * more precise would be:
0460:                     * if(!u_strHasMoreChar32Than(s, length, 1))
0461:                     * but this does not make much practical difference because
0462:                     * a single supplementary code point would just not be found
0463:                     */
0464:                    return false;
0465:                }
0466:
0467:                unfoldRows = unfold[UNFOLD_ROWS];
0468:                unfoldRowWidth = unfold[UNFOLD_ROW_WIDTH];
0469:                unfoldStringWidth = unfold[UNFOLD_STRING_WIDTH];
0470:                //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
0471:
0472:                if (length > unfoldStringWidth) {
0473:                    /* the string is too long to find any match */
0474:                    return false;
0475:                }
0476:
0477:                /* do a binary search for the string */
0478:                start = 0;
0479:                limit = unfoldRows;
0480:                while (start < limit) {
0481:                    i = (start + limit) / 2;
0482:                    unfoldOffset = ((i + 1) * unfoldRowWidth); // +1 to skip the header values above
0483:                    result = strcmpMax(s, unfoldOffset, unfoldStringWidth);
0484:
0485:                    if (result == 0) {
0486:                        /* found the string: add each code point, and its case closure */
0487:                        int c;
0488:
0489:                        for (i = unfoldStringWidth; i < unfoldRowWidth
0490:                                && unfold[unfoldOffset + i] != 0; i += UTF16
0491:                                .getCharCount(c)) {
0492:                            c = UTF16.charAt(unfold, unfoldOffset,
0493:                                    unfold.length, i);
0494:                            set.add(c);
0495:                            addCaseClosure(c, set);
0496:                        }
0497:                        return true;
0498:                    } else if (result < 0) {
0499:                        limit = i;
0500:                    } else /* result>0 */{
0501:                        start = i + 1;
0502:                    }
0503:                }
0504:
0505:                return false; /* string not found */
0506:            }
0507:
0508:            /** @return NONE, LOWER, UPPER, TITLE */
0509:            public final int getType(int c) {
0510:                return getTypeFromProps(trie.getCodePointValue(c));
0511:            }
0512:
0513:            /** @return same as getType(), or <0 if c is case-ignorable */
0514:            public final int getTypeOrIgnorable(int c) {
0515:                int props = trie.getCodePointValue(c);
0516:                int type = getTypeFromProps(props);
0517:                if (type != NONE) {
0518:                    return type;
0519:                } else if (c == 0x307
0520:                        || (props & (EXCEPTION | CASE_IGNORABLE)) == CASE_IGNORABLE) {
0521:                    return -1; /* case-ignorable */
0522:                } else {
0523:                    return 0; /* c is neither cased nor case-ignorable */
0524:                }
0525:            }
0526:
0527:            /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
0528:            public final int getDotType(int c) {
0529:                int props = trie.getCodePointValue(c);
0530:                if (!propsHasException(props)) {
0531:                    return props & DOT_MASK;
0532:                } else {
0533:                    return (exceptions[getExceptionsOffset(props)] >> EXC_DOT_SHIFT)
0534:                            & DOT_MASK;
0535:                }
0536:            }
0537:
0538:            public final boolean isSoftDotted(int c) {
0539:                return getDotType(c) == SOFT_DOTTED;
0540:            }
0541:
0542:            public final boolean isCaseSensitive(int c) {
0543:                return (trie.getCodePointValue(c) & SENSITIVE) != 0;
0544:            }
0545:
0546:            // string casing ------------------------------------------------------- ***
0547:
0548:            /*
0549:             * These internal functions form the core of string case mappings.
0550:             * They map single code points to result code points or strings and take
0551:             * all necessary conditions (context, locale ID, options) into account.
0552:             *
0553:             * They do not iterate over the source or write to the destination
0554:             * so that the same functions are useful for non-standard string storage,
0555:             * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
0556:             * For the same reason, the "surrounding text" context is passed in as a
0557:             * ContextIterator which does not make any assumptions about
0558:             * the underlying storage.
0559:             *
0560:             * This section contains helper functions that check for conditions
0561:             * in the input text surrounding the current code point
0562:             * according to SpecialCasing.txt.
0563:             *
0564:             * Each helper function gets the index
0565:             * - after the current code point if it looks at following text
0566:             * - before the current code point if it looks at preceding text
0567:             *
0568:             * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
0569:             *
0570:             * Final_Sigma
0571:             *   C is preceded by a sequence consisting of
0572:             *     a cased letter and a case-ignorable sequence,
0573:             *   and C is not followed by a sequence consisting of
0574:             *     an ignorable sequence and then a cased letter.
0575:             *
0576:             * More_Above
0577:             *   C is followed by one or more characters of combining class 230 (ABOVE)
0578:             *   in the combining character sequence.
0579:             *
0580:             * After_Soft_Dotted
0581:             *   The last preceding character with combining class of zero before C
0582:             *   was Soft_Dotted,
0583:             *   and there is no intervening combining character class 230 (ABOVE).
0584:             *
0585:             * Before_Dot
0586:             *   C is followed by combining dot above (U+0307).
0587:             *   Any sequence of characters with a combining class that is neither 0 nor 230
0588:             *   may intervene between the current character and the combining dot above.
0589:             *
0590:             * The erratum from 2002-10-31 adds the condition
0591:             *
0592:             * After_I
0593:             *   The last preceding base character was an uppercase I, and there is no
0594:             *   intervening combining character class 230 (ABOVE).
0595:             *
0596:             *   (See Jitterbug 2344 and the comments on After_I below.)
0597:             *
0598:             * Helper definitions in Unicode 3.2 UAX 21:
0599:             *
0600:             * D1. A character C is defined to be cased
0601:             *     if it meets any of the following criteria:
0602:             *
0603:             *   - The general category of C is Titlecase Letter (Lt)
0604:             *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
0605:             *   - Given D = NFD(C), then it is not the case that:
0606:             *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
0607:             *     (This third criterium does not add any characters to the list
0608:             *      for Unicode 3.2. Ignored.)
0609:             *
0610:             * D2. A character C is defined to be case-ignorable
0611:             *     if it meets either of the following criteria:
0612:             *
0613:             *   - The general category of C is
0614:             *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
0615:             *     Letter Modifier (Lm), or Symbol Modifier (Sk)
0616:             *   - C is one of the following characters 
0617:             *     U+0027 APOSTROPHE
0618:             *     U+00AD SOFT HYPHEN (SHY)
0619:             *     U+2019 RIGHT SINGLE QUOTATION MARK
0620:             *            (the preferred character for apostrophe)
0621:             *
0622:             * D3. A case-ignorable sequence is a sequence of
0623:             *     zero or more case-ignorable characters.
0624:             */
0625:
0626:            /**
0627:             * Iterator for string case mappings, which need to look at the
0628:             * context (surrounding text) of a given character for conditional mappings.
0629:             *
0630:             * The iterator only needs to go backward or forward away from the
0631:             * character in question. It does not use any indexes on this interface.
0632:             * It does not support random access or an arbitrary change of
0633:             * iteration direction.
0634:             *
0635:             * The code point being case-mapped itself is never returned by
0636:             * this iterator.
0637:             */
0638:            public interface ContextIterator {
0639:                /**
0640:                 * Reset the iterator for forward or backward iteration.
0641:                 * @param dir >0: Begin iterating forward from the first code point
0642:                 * after the one that is being case-mapped.
0643:                 *            <0: Begin iterating backward from the first code point
0644:                 * before the one that is being case-mapped.   
0645:                 */
0646:                public void reset(int dir);
0647:
0648:                /**
0649:                 * Iterate and return the next code point, moving in the direction
0650:                 * determined by the reset() call.
0651:                 * @return Next code point, or <0 when the iteration is done. 
0652:                 */
0653:                public int next();
0654:            }
0655:
0656:            /**
0657:             * For string case mappings, a single character (a code point) is mapped
0658:             * either to itself (in which case in-place mapping functions do nothing),
0659:             * or to another single code point, or to a string.
0660:             * Aside from the string contents, these are indicated with a single int
0661:             * value as follows:
0662:             *
0663:             * Mapping to self: Negative values (~self instead of -self to support U+0000)
0664:             *
0665:             * Mapping to another code point: Positive values >MAX_STRING_LENGTH
0666:             *
0667:             * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
0668:             * returned. Note that the string result may indeed have zero length.
0669:             */
0670:            public static final int MAX_STRING_LENGTH = 0x1f;
0671:
0672:            private static final int LOC_UNKNOWN = 0;
0673:            private static final int LOC_ROOT = 1;
0674:            private static final int LOC_TURKISH = 2;
0675:            private static final int LOC_LITHUANIAN = 3;
0676:
0677:            /*
0678:             * Checks and caches the type of locale ID as it is relevant for case mapping.
0679:             * If the locCache is not null, then it must be initialized with locCache[0]=0 .
0680:             */
0681:            private static final int getCaseLocale(ULocale locale,
0682:                    int[] locCache) {
0683:                int result;
0684:
0685:                if (locCache != null && (result = locCache[0]) != LOC_UNKNOWN) {
0686:                    return result;
0687:                }
0688:
0689:                result = LOC_ROOT;
0690:
0691:                String language = locale.getLanguage();
0692:                if (language.equals("tr") || language.equals("tur")
0693:                        || language.equals("az") || language.equals("aze")) {
0694:                    result = LOC_TURKISH;
0695:                } else if (language.equals("lt") || language.equals("lit")) {
0696:                    result = LOC_LITHUANIAN;
0697:                }
0698:
0699:                if (locCache != null) {
0700:                    locCache[0] = result;
0701:                }
0702:                return result;
0703:            }
0704:
0705:            /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
0706:            private final boolean isFollowedByCasedLetter(ContextIterator iter,
0707:                    int dir) {
0708:                int c;
0709:                int props;
0710:
0711:                if (iter == null) {
0712:                    return false;
0713:                }
0714:
0715:                for (iter.reset(dir); (c = iter.next()) >= 0;) {
0716:                    props = trie.getCodePointValue(c);
0717:                    if (getTypeFromProps(props) != NONE) {
0718:                        return true; /* followed by cased letter */
0719:                    } else if (c == 0x307
0720:                            || (props & (EXCEPTION | CASE_IGNORABLE)) == CASE_IGNORABLE) {
0721:                        /* case-ignorable, continue with the loop */
0722:                    } else {
0723:                        return false; /* not ignorable */
0724:                    }
0725:                }
0726:
0727:                return false; /* not followed by cased letter */
0728:            }
0729:
0730:            /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
0731:            private final boolean isPrecededBySoftDotted(ContextIterator iter) {
0732:                int c;
0733:                int dotType;
0734:
0735:                if (iter == null) {
0736:                    return false;
0737:                }
0738:
0739:                for (iter.reset(-1); (c = iter.next()) >= 0;) {
0740:                    dotType = getDotType(c);
0741:                    if (dotType == SOFT_DOTTED) {
0742:                        return true; /* preceded by TYPE_i */
0743:                    } else if (dotType != OTHER_ACCENT) {
0744:                        return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
0745:                    }
0746:                }
0747:
0748:                return false; /* not preceded by TYPE_i */
0749:            }
0750:
0751:            /*
0752:             * See Jitterbug 2344:
0753:             * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
0754:             * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
0755:             * we made those releases compatible with Unicode 3.2 which had not fixed
0756:             * a related bug in SpecialCasing.txt.
0757:             *
0758:             * From the Jitterbug 2344 text:
0759:             * ... this bug is listed as a Unicode erratum
0760:             * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
0761:             * <quote>
0762:             * There are two errors in SpecialCasing.txt.
0763:             * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
0764:             * 2. An incorrect context definition. Correct as follows:
0765:             * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
0766:             * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
0767:             * ---
0768:             * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0769:             * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
0770:             * where the context After_I is defined as:
0771:             * The last preceding base character was an uppercase I, and there is no
0772:             * intervening combining character class 230 (ABOVE).
0773:             * </quote>
0774:             *
0775:             * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
0776:             *
0777:             * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
0778:             * # This matches the behavior of the canonically equivalent I-dot_above
0779:             *
0780:             * See also the description in this place in older versions of uchar.c (revision 1.100).
0781:             *
0782:             * Markus W. Scherer 2003-feb-15
0783:             */
0784:
0785:            /* Is preceded by base character 'I' with no intervening cc=230 ? */
0786:            private final boolean isPrecededBy_I(ContextIterator iter) {
0787:                int c;
0788:                int dotType;
0789:
0790:                if (iter == null) {
0791:                    return false;
0792:                }
0793:
0794:                for (iter.reset(-1); (c = iter.next()) >= 0;) {
0795:                    if (c == 0x49) {
0796:                        return true; /* preceded by I */
0797:                    }
0798:                    dotType = getDotType(c);
0799:                    if (dotType != OTHER_ACCENT) {
0800:                        return false; /* preceded by different base character (not I), or intervening cc==230 */
0801:                    }
0802:                }
0803:
0804:                return false; /* not preceded by I */
0805:            }
0806:
0807:            /* Is followed by one or more cc==230 ? */
0808:            private final boolean isFollowedByMoreAbove(ContextIterator iter) {
0809:                int c;
0810:                int dotType;
0811:
0812:                if (iter == null) {
0813:                    return false;
0814:                }
0815:
0816:                for (iter.reset(1); (c = iter.next()) >= 0;) {
0817:                    dotType = getDotType(c);
0818:                    if (dotType == ABOVE) {
0819:                        return true; /* at least one cc==230 following */
0820:                    } else if (dotType != OTHER_ACCENT) {
0821:                        return false; /* next base character, no more cc==230 following */
0822:                    }
0823:                }
0824:
0825:                return false; /* no more cc==230 following */
0826:            }
0827:
0828:            /* Is followed by a dot above (without cc==230 in between) ? */
0829:            private final boolean isFollowedByDotAbove(ContextIterator iter) {
0830:                int c;
0831:                int dotType;
0832:
0833:                if (iter == null) {
0834:                    return false;
0835:                }
0836:
0837:                for (iter.reset(1); (c = iter.next()) >= 0;) {
0838:                    if (c == 0x307) {
0839:                        return true;
0840:                    }
0841:                    dotType = getDotType(c);
0842:                    if (dotType != OTHER_ACCENT) {
0843:                        return false; /* next base character or cc==230 in between */
0844:                    }
0845:                }
0846:
0847:                return false; /* no dot above following */
0848:            }
0849:
0850:            private static final String iDot = "i\u0307", jDot = "j\u0307",
0851:                    iOgonekDot = "\u012f\u0307", iDotGrave = "i\u0307\u0300",
0852:                    iDotAcute = "i\u0307\u0301", iDotTilde = "i\u0307\u0303";
0853:
0854:            /**
0855:             * Get the full lowercase mapping for c.
0856:             *
0857:             * @param c Character to be mapped.
0858:             * @param iter Character iterator, used for context-sensitive mappings.
0859:             *             See ContextIterator for details.
0860:             *             If iter==null then a context-independent result is returned.
0861:             * @param out If the mapping result is a string, then it is appended to out.
0862:             * @param locale Locale ID for locale-dependent mappings.
0863:             * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
0864:             *                 the locale ID for subsequent calls.
0865:             *                 Can be null.
0866:             * @return Output code point or string length, see MAX_STRING_LENGTH.
0867:             *
0868:             * @see ContextIterator
0869:             * @see #MAX_STRING_LENGTH
0870:             * @internal
0871:             */
0872:            public final int toFullLower(int c, ContextIterator iter,
0873:                    StringBuffer out, ULocale locale, int[] locCache) {
0874:                int result, props;
0875:
0876:                result = c;
0877:                props = trie.getCodePointValue(c);
0878:                if (!propsHasException(props)) {
0879:                    if (getTypeFromProps(props) >= UPPER) {
0880:                        result = c + getDelta(props);
0881:                    }
0882:                } else {
0883:                    int excOffset = getExceptionsOffset(props), excOffset2;
0884:                    int excWord = exceptions[excOffset++];
0885:                    int full;
0886:
0887:                    excOffset2 = excOffset;
0888:
0889:                    if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0) {
0890:                        /* use hardcoded conditions and mappings */
0891:                        int loc = getCaseLocale(locale, locCache);
0892:
0893:                        /*
0894:                         * Test for conditional mappings first
0895:                         *   (otherwise the unconditional default mappings are always taken),
0896:                         * then test for characters that have unconditional mappings in SpecialCasing.txt,
0897:                         * then get the UnicodeData.txt mappings.
0898:                         */
0899:                        if (loc == LOC_LITHUANIAN
0900:                                &&
0901:                                /* base characters, find accents above */
0902:                                (((c == 0x49 || c == 0x4a || c == 0x12e) && isFollowedByMoreAbove(iter)) ||
0903:                                /* precomposed with accent above, no need to find one */
0904:                                (c == 0xcc || c == 0xcd || c == 0x128))) {
0905:                            /*
0906:                                # Lithuanian
0907:
0908:                                # Lithuanian retains the dot in a lowercase i when followed by accents.
0909:
0910:                                # Introduce an explicit dot above when lowercasing capital I's and J's
0911:                                # whenever there are more accents above.
0912:                                # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0913:
0914:                                0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
0915:                                004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
0916:                                012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
0917:                                00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
0918:                                00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0919:                                0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
0920:                             */
0921:                            switch (c) {
0922:                            case 0x49: /* LATIN CAPITAL LETTER I */
0923:                                out.append(iDot);
0924:                                return 2;
0925:                            case 0x4a: /* LATIN CAPITAL LETTER J */
0926:                                out.append(jDot);
0927:                                return 2;
0928:                            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
0929:                                out.append(iOgonekDot);
0930:                                return 2;
0931:                            case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
0932:                                out.append(iDotGrave);
0933:                                return 3;
0934:                            case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
0935:                                out.append(iDotAcute);
0936:                                return 3;
0937:                            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
0938:                                out.append(iDotTilde);
0939:                                return 3;
0940:                            default:
0941:                                return 0; /* will not occur */
0942:                            }
0943:                            /* # Turkish and Azeri */
0944:                        } else if (loc == LOC_TURKISH && c == 0x130) {
0945:                            /*
0946:                                # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
0947:                                # The following rules handle those cases.
0948:
0949:                                0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
0950:                                0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
0951:                             */
0952:                            return 0x69;
0953:                        } else if (loc == LOC_TURKISH && c == 0x307
0954:                                && isPrecededBy_I(iter)) {
0955:                            /*
0956:                                # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
0957:                                # This matches the behavior of the canonically equivalent I-dot_above
0958:
0959:                                0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0960:                                0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
0961:                             */
0962:                            return 0; /* remove the dot (continue without output) */
0963:                        } else if (loc == LOC_TURKISH && c == 0x49
0964:                                && !isFollowedByDotAbove(iter)) {
0965:                            /*
0966:                                # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0967:
0968:                                0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
0969:                                0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
0970:                             */
0971:                            return 0x131;
0972:                        } else if (c == 0x130) {
0973:                            /*
0974:                                # Preserve canonical equivalence for I with dot. Turkic is handled below.
0975:
0976:                                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0977:                             */
0978:                            out.append(iDot);
0979:                            return 2;
0980:                        } else if (c == 0x3a3
0981:                                && !isFollowedByCasedLetter(iter, 1)
0982:                                && isFollowedByCasedLetter(iter, -1) /* -1=preceded */
0983:                        ) {
0984:                            /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
0985:                            /*
0986:                                # Special case for final form of sigma
0987:
0988:                                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
0989:                             */
0990:                            return 0x3c2; /* greek small final sigma */
0991:                        } else {
0992:                            /* no known conditional special case mapping, use a normal mapping */
0993:                        }
0994:                    } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
0995:                        long value = getSlotValueAndOffset(excWord,
0996:                                EXC_FULL_MAPPINGS, excOffset);
0997:                        full = (int) value & FULL_LOWER;
0998:                        if (full != 0) {
0999:                            /* start of full case mapping strings */
1000:                            excOffset = (int) (value >> 32) + 1;
1001:
1002:                            /* set the output pointer to the lowercase mapping */
1003:                            out.append(new String(exceptions, excOffset, full));
1004:
1005:                            /* return the string length */
1006:                            return full;
1007:                        }
1008:                    }
1009:
1010:                    if (hasSlot(excWord, EXC_LOWER)) {
1011:                        result = getSlotValue(excWord, EXC_LOWER, excOffset2);
1012:                    }
1013:                }
1014:
1015:                return (result == c) ? ~result : result;
1016:            }
1017:
1018:            /* internal */
1019:            private final int toUpperOrTitle(int c, ContextIterator iter,
1020:                    StringBuffer out, ULocale locale, int[] locCache,
1021:                    boolean upperNotTitle) {
1022:                int result;
1023:                int props;
1024:
1025:                result = c;
1026:                props = trie.getCodePointValue(c);
1027:                if (!propsHasException(props)) {
1028:                    if (getTypeFromProps(props) == LOWER) {
1029:                        result = c + getDelta(props);
1030:                    }
1031:                } else {
1032:                    int excOffset = getExceptionsOffset(props), excOffset2;
1033:                    int excWord = exceptions[excOffset++];
1034:                    int full, index;
1035:
1036:                    excOffset2 = excOffset;
1037:
1038:                    if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0) {
1039:                        /* use hardcoded conditions and mappings */
1040:                        int loc = getCaseLocale(locale, locCache);
1041:
1042:                        if (loc == LOC_TURKISH && c == 0x69) {
1043:                            /*
1044:                                # Turkish and Azeri
1045:
1046:                                # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1047:                                # The following rules handle those cases.
1048:
1049:                                # When uppercasing, i turns into a dotted capital I
1050:
1051:                                0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1052:                                0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1053:                             */
1054:                            return 0x130;
1055:                        } else if (loc == LOC_LITHUANIAN && c == 0x307
1056:                                && isPrecededBySoftDotted(iter)) {
1057:                            /*
1058:                                # Lithuanian
1059:
1060:                                # Lithuanian retains the dot in a lowercase i when followed by accents.
1061:
1062:                                # Remove DOT ABOVE after "i" with upper or titlecase
1063:
1064:                                0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1065:                             */
1066:                            return 0; /* remove the dot (continue without output) */
1067:                        } else {
1068:                            /* no known conditional special case mapping, use a normal mapping */
1069:                        }
1070:                    } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1071:                        long value = getSlotValueAndOffset(excWord,
1072:                                EXC_FULL_MAPPINGS, excOffset);
1073:                        full = (int) value & 0xffff;
1074:
1075:                        /* start of full case mapping strings */
1076:                        excOffset = (int) (value >> 32) + 1;
1077:
1078:                        /* skip the lowercase and case-folding result strings */
1079:                        excOffset += full & FULL_LOWER;
1080:                        full >>= 4;
1081:                        excOffset += full & 0xf;
1082:                        full >>= 4;
1083:
1084:                        if (upperNotTitle) {
1085:                            full &= 0xf;
1086:                        } else {
1087:                            /* skip the uppercase result string */
1088:                            excOffset += full & 0xf;
1089:                            full = (full >> 4) & 0xf;
1090:                        }
1091:
1092:                        if (full != 0) {
1093:                            /* set the output pointer to the result string */
1094:                            out.append(new String(exceptions, excOffset, full));
1095:
1096:                            /* return the string length */
1097:                            return full;
1098:                        }
1099:                    }
1100:
1101:                    if (!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1102:                        index = EXC_TITLE;
1103:                    } else if (hasSlot(excWord, EXC_UPPER)) {
1104:                        /* here, titlecase is same as uppercase */
1105:                        index = EXC_UPPER;
1106:                    } else {
1107:                        return ~c;
1108:                    }
1109:                    result = getSlotValue(excWord, index, excOffset2);
1110:                }
1111:
1112:                return (result == c) ? ~result : result;
1113:            }
1114:
1115:            public final int toFullUpper(int c, ContextIterator iter,
1116:                    StringBuffer out, ULocale locale, int[] locCache) {
1117:                return toUpperOrTitle(c, iter, out, locale, locCache, true);
1118:            }
1119:
1120:            public final int toFullTitle(int c, ContextIterator iter,
1121:                    StringBuffer out, ULocale locale, int[] locCache) {
1122:                return toUpperOrTitle(c, iter, out, locale, locCache, false);
1123:            }
1124:
1125:            /* case folding ------------------------------------------------------------- */
1126:
1127:            /*
1128:             * Case folding is similar to lowercasing.
1129:             * The result may be a simple mapping, i.e., a single code point, or
1130:             * a full mapping, i.e., a string.
1131:             * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1132:             * then only the lowercase mapping is stored.
1133:             *
1134:             * Some special cases are hardcoded because their conditions cannot be
1135:             * parsed and processed from CaseFolding.txt.
1136:             *
1137:             * Unicode 3.2 CaseFolding.txt specifies for its status field:
1138:
1139:            # C: common case folding, common mappings shared by both simple and full mappings.
1140:            # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1141:            # S: simple case folding, mappings to single characters where different from F.
1142:            # T: special case for uppercase I and dotted uppercase I
1143:            #    - For non-Turkic languages, this mapping is normally not used.
1144:            #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1145:            #
1146:            # Usage:
1147:            #  A. To do a simple case folding, use the mappings with status C + S.
1148:            #  B. To do a full case folding, use the mappings with status C + F.
1149:            #
1150:            #    The mappings with status T can be used or omitted depending on the desired case-folding
1151:            #    behavior. (The default option is to exclude them.)
1152:
1153:             * Unicode 3.2 has 'T' mappings as follows:
1154:
1155:            0049; T; 0131; # LATIN CAPITAL LETTER I
1156:            0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1157:
1158:             * while the default mappings for these code points are:
1159:
1160:            0049; C; 0069; # LATIN CAPITAL LETTER I
1161:            0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1162:
1163:             * U+0130 has no simple case folding (simple-case-folds to itself).
1164:             */
1165:
1166:            /**
1167:             * Bit mask for getting just the options from a string compare options word
1168:             * that are relevant for case folding (of a single string or code point).
1169:             * @internal
1170:             */
1171:            private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1172:
1173:            /* return the simple case folding mapping for c */
1174:            public final int fold(int c, int options) {
1175:                int props = trie.getCodePointValue(c);
1176:                if (!propsHasException(props)) {
1177:                    if (getTypeFromProps(props) >= UPPER) {
1178:                        c += getDelta(props);
1179:                    }
1180:                } else {
1181:                    int excOffset = getExceptionsOffset(props);
1182:                    int excWord = exceptions[excOffset++];
1183:                    int index;
1184:                    if ((excWord & EXC_CONDITIONAL_FOLD) != 0) {
1185:                        /* special case folding mappings, hardcoded */
1186:                        if ((options & FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT) {
1187:                            /* default mappings */
1188:                            if (c == 0x49) {
1189:                                /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1190:                                return 0x69;
1191:                            } else if (c == 0x130) {
1192:                                /* no simple case folding for U+0130 */
1193:                                return c;
1194:                            }
1195:                        } else {
1196:                            /* Turkic mappings */
1197:                            if (c == 0x49) {
1198:                                /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1199:                                return 0x131;
1200:                            } else if (c == 0x130) {
1201:                                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1202:                                return 0x69;
1203:                            }
1204:                        }
1205:                    }
1206:                    if (hasSlot(excWord, EXC_FOLD)) {
1207:                        index = EXC_FOLD;
1208:                    } else if (hasSlot(excWord, EXC_LOWER)) {
1209:                        index = EXC_LOWER;
1210:                    } else {
1211:                        return c;
1212:                    }
1213:                    c = getSlotValue(excWord, index, excOffset);
1214:                }
1215:                return c;
1216:            }
1217:
1218:            /*
1219:             * Issue for canonical caseless match (UAX #21):
1220:             * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1221:             * canonical equivalence, unlike default-option casefolding.
1222:             * For example, I-grave and I + grave fold to strings that are not canonically
1223:             * equivalent.
1224:             * For more details, see the comment in unorm_compare() in unorm.cpp
1225:             * and the intermediate prototype changes for Jitterbug 2021.
1226:             * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1227:             *
1228:             * This did not get fixed because it appears that it is not possible to fix
1229:             * it for uppercase and lowercase characters (I-grave vs. i-grave)
1230:             * together in a way that they still fold to common result strings.
1231:             */
1232:
1233:            public final int toFullFolding(int c, StringBuffer out, int options) {
1234:                int result;
1235:                int props;
1236:
1237:                result = c;
1238:                props = trie.getCodePointValue(c);
1239:                if (!propsHasException(props)) {
1240:                    if (getTypeFromProps(props) >= UPPER) {
1241:                        result = c + getDelta(props);
1242:                    }
1243:                } else {
1244:                    int excOffset = getExceptionsOffset(props), excOffset2;
1245:                    int excWord = exceptions[excOffset++];
1246:                    int full, index;
1247:
1248:                    excOffset2 = excOffset;
1249:
1250:                    if ((excWord & EXC_CONDITIONAL_FOLD) != 0) {
1251:                        /* use hardcoded conditions and mappings */
1252:                        if ((options & FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT) {
1253:                            /* default mappings */
1254:                            if (c == 0x49) {
1255:                                /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1256:                                return 0x69;
1257:                            } else if (c == 0x130) {
1258:                                /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1259:                                out.append(iDot);
1260:                                return 2;
1261:                            }
1262:                        } else {
1263:                            /* Turkic mappings */
1264:                            if (c == 0x49) {
1265:                                /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1266:                                return 0x131;
1267:                            } else if (c == 0x130) {
1268:                                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1269:                                return 0x69;
1270:                            }
1271:                        }
1272:                    } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1273:                        long value = getSlotValueAndOffset(excWord,
1274:                                EXC_FULL_MAPPINGS, excOffset);
1275:                        full = (int) value & 0xffff;
1276:
1277:                        /* start of full case mapping strings */
1278:                        excOffset = (int) (value >> 32) + 1;
1279:
1280:                        /* skip the lowercase result string */
1281:                        excOffset += full & FULL_LOWER;
1282:                        full = (full >> 4) & 0xf;
1283:
1284:                        if (full != 0) {
1285:                            /* set the output pointer to the result string */
1286:                            out.append(new String(exceptions, excOffset, full));
1287:
1288:                            /* return the string length */
1289:                            return full;
1290:                        }
1291:                    }
1292:
1293:                    if (hasSlot(excWord, EXC_FOLD)) {
1294:                        index = EXC_FOLD;
1295:                    } else if (hasSlot(excWord, EXC_LOWER)) {
1296:                        index = EXC_LOWER;
1297:                    } else {
1298:                        return ~c;
1299:                    }
1300:                    result = getSlotValue(excWord, index, excOffset2);
1301:                }
1302:
1303:                return (result == c) ? ~result : result;
1304:            }
1305:
1306:            // data members -------------------------------------------------------- ***
1307:            private int indexes[];
1308:            private char exceptions[];
1309:            private char unfold[];
1310:
1311:            private CharTrie trie;
1312:            private byte formatVersion[];
1313:            private byte unicodeVersion[];
1314:
1315:            // data format constants ----------------------------------------------- ***
1316:            private static final String DATA_NAME = "ucase";
1317:            private static final String DATA_TYPE = "icu";
1318:            private static final String DATA_FILE_NAME = DATA_NAME + "."
1319:                    + DATA_TYPE;
1320:
1321:            /* format "cAsE" */
1322:            private static final byte FMT[] = { 0x63, 0x41, 0x53, 0x45 };
1323:
1324:            /* indexes into indexes[] */
1325:            private static final int IX_INDEX_TOP = 0;
1326:            private static final int IX_LENGTH = 1;
1327:            private static final int IX_TRIE_SIZE = 2;
1328:            private static final int IX_EXC_LENGTH = 3;
1329:            private static final int IX_UNFOLD_LENGTH = 4;
1330:
1331:            private static final int IX_MAX_FULL_LENGTH = 15;
1332:            private static final int IX_TOP = 16;
1333:
1334:            // definitions for 16-bit case properties word ------------------------- ***
1335:
1336:            /* 2-bit constants for types of cased characters */
1337:            public static final int TYPE_MASK = 3;
1338:            public static final int NONE = 0;
1339:            public static final int LOWER = 1;
1340:            public static final int UPPER = 2;
1341:            public static final int TITLE = 3;
1342:
1343:            private static final int getTypeFromProps(int props) {
1344:                return props & TYPE_MASK;
1345:            }
1346:
1347:            private static final int SENSITIVE = 4;
1348:            private static final int EXCEPTION = 8;
1349:
1350:            private static final int DOT_MASK = 0x30;
1351:            private static final int NO_DOT = 0; /* normal characters with cc=0 */
1352:            private static final int SOFT_DOTTED = 0x10; /* soft-dotted characters with cc=0 */
1353:            private static final int ABOVE = 0x20; /* "above" accents with cc=230 */
1354:            private static final int OTHER_ACCENT = 0x30; /* other accent character (0<cc!=230) */
1355:
1356:            /* no exception: bits 15..6 are a 10-bit signed case mapping delta */
1357:            private static final int DELTA_SHIFT = 6;
1358:            private static final int DELTA_MASK = 0xffc0;
1359:            private static final int MAX_DELTA = 0x1ff;
1360:            private static final int MIN_DELTA = (-MAX_DELTA - 1);
1361:
1362:            private static final int getDelta(int props) {
1363:                return (short) props >> DELTA_SHIFT;
1364:            }
1365:
1366:            /* case-ignorable uses one of the delta bits, see gencase/store.c */
1367:            private static final int CASE_IGNORABLE = 0x40;
1368:
1369:            /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
1370:            private static final int EXC_SHIFT = 4;
1371:            private static final int EXC_MASK = 0xfff0;
1372:            private static final int MAX_EXCEPTIONS = 0x1000;
1373:
1374:            /* definitions for 16-bit main exceptions word ------------------------------ */
1375:
1376:            /* first 8 bits indicate values in optional slots */
1377:            private static final int EXC_LOWER = 0;
1378:            private static final int EXC_FOLD = 1;
1379:            private static final int EXC_UPPER = 2;
1380:            private static final int EXC_TITLE = 3;
1381:            private static final int EXC_4 = 4; /* reserved */
1382:            private static final int EXC_5 = 5; /* reserved */
1383:            private static final int EXC_CLOSURE = 6;
1384:            private static final int EXC_FULL_MAPPINGS = 7;
1385:            private static final int EXC_ALL_SLOTS = 8; /* one past the last slot */
1386:
1387:            /* each slot is 2 uint16_t instead of 1 */
1388:            private static final int EXC_DOUBLE_SLOTS = 0x100;
1389:
1390:            /* reserved: exception bits 11..9 */
1391:
1392:            /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
1393:            private static final int EXC_DOT_SHIFT = 8;
1394:
1395:            /* normally stored in the main word, but pushed out for larger exception indexes */
1396:            private static final int EXC_DOT_MASK = 0x3000;
1397:            private static final int EXC_NO_DOT = 0;
1398:            private static final int EXC_SOFT_DOTTED = 0x1000;
1399:            private static final int EXC_ABOVE = 0x2000; /* "above" accents with cc=230 */
1400:            private static final int EXC_OTHER_ACCENT = 0x3000; /* other character (0<cc!=230) */
1401:
1402:            /* complex/conditional mappings */
1403:            private static final int EXC_CONDITIONAL_SPECIAL = 0x4000;
1404:            private static final int EXC_CONDITIONAL_FOLD = 0x8000;
1405:
1406:            /* definitions for lengths word for full case mappings */
1407:            private static final int FULL_LOWER = 0xf;
1408:            private static final int FULL_FOLDING = 0xf0;
1409:            private static final int FULL_UPPER = 0xf00;
1410:            private static final int FULL_TITLE = 0xf000;
1411:
1412:            /* maximum lengths */
1413:            private static final int FULL_MAPPINGS_MAX_LENGTH = 4 * 0xf;
1414:            private static final int CLOSURE_MAX_LENGTH = 0xf;
1415:
1416:            /* constants for reverse case folding ("unfold") data */
1417:            private static final int UNFOLD_ROWS = 0;
1418:            private static final int UNFOLD_ROW_WIDTH = 1;
1419:            private static final int UNFOLD_STRING_WIDTH = 2;
1420:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.