0001: /*
0002: *******************************************************************************
0003: *
0004: * Copyright (C) 2004-2006, International Business Machines
0005: * Corporation and others. All Rights Reserved.
0006: *
0007: *******************************************************************************
0008: * file name: UCaseProps.java
0009: * encoding: US-ASCII
0010: * tab size: 8 (not used)
0011: * indentation:4
0012: *
0013: * created on: 2005jan29
0014: * created by: Markus W. Scherer
0015: *
0016: * Low-level Unicode character/string case mapping code.
0017: * Java port of ucase.h/.c.
0018: */
0019:
0020: package com.ibm.icu.impl;
0021:
0022: import java.io.InputStream;
0023: import java.io.DataInputStream;
0024: import java.io.BufferedInputStream;
0025: import java.io.IOException;
0026:
0027: import com.ibm.icu.util.RangeValueIterator;
0028: import com.ibm.icu.util.ULocale;
0029:
0030: import com.ibm.icu.text.UTF16;
0031: import com.ibm.icu.text.UnicodeSet;
0032:
0033: import com.ibm.icu.lang.UCharacter;
0034:
0035: public final class UCaseProps {
0036: // constructors etc. --------------------------------------------------- ***
0037:
0038: // port of ucase_openProps()
0039: public UCaseProps() throws IOException {
0040: InputStream is = ICUData
0041: .getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/"
0042: + DATA_FILE_NAME);
0043: BufferedInputStream b = new BufferedInputStream(is, 4096 /* data buffer size */);
0044: readData(b);
0045: b.close();
0046: is.close();
0047: }
0048:
0049: private final void readData(InputStream is) throws IOException {
0050: DataInputStream inputStream = new DataInputStream(is);
0051:
0052: // read the header
0053: unicodeVersion = ICUBinary.readHeader(inputStream, FMT,
0054: new IsAcceptable());
0055:
0056: // read indexes[]
0057: int i, count;
0058: count = inputStream.readInt();
0059: if (count < IX_INDEX_TOP) {
0060: throw new IOException("indexes[0] too small in "
0061: + DATA_FILE_NAME);
0062: }
0063: indexes = new int[count];
0064:
0065: indexes[0] = count;
0066: for (i = 1; i < count; ++i) {
0067: indexes[i] = inputStream.readInt();
0068: }
0069:
0070: // read the trie
0071: trie = new CharTrie(inputStream, null);
0072:
0073: // read exceptions[]
0074: count = indexes[IX_EXC_LENGTH];
0075: if (count > 0) {
0076: exceptions = new char[count];
0077: for (i = 0; i < count; ++i) {
0078: exceptions[i] = inputStream.readChar();
0079: }
0080: }
0081:
0082: // read unfold[]
0083: count = indexes[IX_UNFOLD_LENGTH];
0084: if (count > 0) {
0085: unfold = new char[count];
0086: for (i = 0; i < count; ++i) {
0087: unfold[i] = inputStream.readChar();
0088: }
0089: }
0090: }
0091:
0092: // implement ICUBinary.Authenticate
0093: private final class IsAcceptable implements ICUBinary.Authenticate {
0094: public boolean isDataVersionAcceptable(byte version[]) {
0095: formatVersion = version;
0096: return version[0] == 1
0097: && version[2] == Trie.INDEX_STAGE_1_SHIFT_
0098: && version[3] == Trie.INDEX_STAGE_2_SHIFT_;
0099: }
0100: }
0101:
0102: // UCaseProps singleton
0103: private static UCaseProps gCsp = null;
0104:
0105: // port of ucase_getSingleton()
0106: public static final synchronized UCaseProps getSingleton()
0107: throws IOException {
0108: if (gCsp == null) {
0109: gCsp = new UCaseProps();
0110: }
0111: return gCsp;
0112: }
0113:
// lazily created shared dummy instance, see getDummy()
private static UCaseProps gCspDummy = null;

/*
 * Builds an object without reading any data file.
 * The boolean parameter is never read; it only distinguishes this
 * constructor's signature from the data-loading one.
 */
private UCaseProps(boolean makeDummy) {
    formatVersion = new byte[] {
        1, 0, Trie.INDEX_STAGE_1_SHIFT_, Trie.INDEX_STAGE_2_SHIFT_
    };
    unicodeVersion = new byte[] { 2, 0, 0, 0 };

    final int[] dummyIndexes = new int[IX_TOP];
    dummyIndexes[0] = IX_TOP;
    indexes = dummyIndexes;

    trie = new CharTrie(0, 0, null); // dummy trie, always returns 0
}

/**
 * Returns the shared dummy instance, which works without any real data.
 * Callers can fall back to it after loading the real data has failed,
 * which avoids repeated checks for data availability.
 * Port of ucase_getDummy().
 */
public static final synchronized UCaseProps getDummy() {
    UCaseProps dummy = gCspDummy;
    if (dummy == null) {
        dummy = new UCaseProps(true);
        gCspDummy = dummy;
    }
    return dummy;
}
0138:
0139: // set of property starts for UnicodeSet ------------------------------- ***
0140:
0141: public final void addPropertyStarts(UnicodeSet set) {
0142: /* add the start code point of each same-value range of the trie */
0143: TrieIterator iter = new TrieIterator(trie);
0144: RangeValueIterator.Element element = new RangeValueIterator.Element();
0145:
0146: while (iter.next(element)) {
0147: set.add(element.start);
0148: }
0149:
0150: /* add code points with hardcoded properties, plus the ones following them */
0151:
0152: /* (none right now, see comment below) */
0153:
0154: /*
0155: * Omit code points with hardcoded specialcasing properties
0156: * because we do not build property UnicodeSets for them right now.
0157: */
0158: }
0159:
0160: // data access primitives ---------------------------------------------- ***
0161: private static final int getExceptionsOffset(int props) {
0162: return props >> EXC_SHIFT;
0163: }
0164:
0165: private static final boolean propsHasException(int props) {
0166: return (props & EXCEPTION) != 0;
0167: }
0168:
/* flagsOffset[b] is the number of set bits in the 8-bit integer value b */
private static final byte flagsOffset[/*256*/] = new byte[256];
static {
    /*
     * bit count of i == bit count of (i >> 1) plus the lowest bit of i;
     * entry 0 keeps its default value 0
     */
    for (int i = 1; i < flagsOffset.length; ++i) {
        flagsOffset[i] = (byte) (flagsOffset[i >> 1] + (i & 1));
    }
}

/* Is the bit for slot number index set in flags? */
private static final boolean hasSlot(int flags, int index) {
    final int slotBit = 1 << index;
    return (flags & slotBit) == slotBit;
}

/* Counts the slots that are present in flags below slot number index. */
private static final byte slotOffset(int flags, int index) {
    final int lowerSlots = flags & ((1 << index) - 1);
    return flagsOffset[lowerSlots];
}
0192:
0193: /*
0194: * Get the value of an optional-value slot where hasSlot(excWord, index).
0195: *
0196: * @param excWord (in) initial exceptions word
0197: * @param index (in) desired slot index
0198: * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
0199: * @return bits 31..0: slot value
0200: * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
0201: */
0202: private final long getSlotValueAndOffset(int excWord, int index,
0203: int excOffset) {
0204: long value;
0205: if ((excWord & EXC_DOUBLE_SLOTS) == 0) {
0206: excOffset += slotOffset(excWord, index);
0207: value = exceptions[excOffset];
0208: } else {
0209: excOffset += 2 * slotOffset(excWord, index);
0210: value = exceptions[excOffset++];
0211: value = (value << 16) | exceptions[excOffset];
0212: }
0213: return (long) value | ((long) excOffset << 32);
0214: }
0215:
0216: /* same as getSlotValueAndOffset() but does not return the slot offset */
0217: private final int getSlotValue(int excWord, int index, int excOffset) {
0218: int value;
0219: if ((excWord & EXC_DOUBLE_SLOTS) == 0) {
0220: excOffset += slotOffset(excWord, index);
0221: value = exceptions[excOffset];
0222: } else {
0223: excOffset += 2 * slotOffset(excWord, index);
0224: value = exceptions[excOffset++];
0225: value = (value << 16) | exceptions[excOffset];
0226: }
0227: return value;
0228: }
0229:
0230: // simple case mappings ------------------------------------------------ ***
0231:
0232: public final int tolower(int c) {
0233: int props = trie.getCodePointValue(c);
0234: if (!propsHasException(props)) {
0235: if (getTypeFromProps(props) >= UPPER) {
0236: c += getDelta(props);
0237: }
0238: } else {
0239: int excOffset = getExceptionsOffset(props);
0240: int excWord = exceptions[excOffset++];
0241: if (hasSlot(excWord, EXC_LOWER)) {
0242: c = getSlotValue(excWord, EXC_LOWER, excOffset);
0243: }
0244: }
0245: return c;
0246: }
0247:
0248: public final int toupper(int c) {
0249: int props = trie.getCodePointValue(c);
0250: if (!propsHasException(props)) {
0251: if (getTypeFromProps(props) == LOWER) {
0252: c += getDelta(props);
0253: }
0254: } else {
0255: int excOffset = getExceptionsOffset(props);
0256: int excWord = exceptions[excOffset++];
0257: if (hasSlot(excWord, EXC_UPPER)) {
0258: c = getSlotValue(excWord, EXC_UPPER, excOffset);
0259: }
0260: }
0261: return c;
0262: }
0263:
0264: public final int totitle(int c) {
0265: int props = trie.getCodePointValue(c);
0266: if (!propsHasException(props)) {
0267: if (getTypeFromProps(props) == LOWER) {
0268: c += getDelta(props);
0269: }
0270: } else {
0271: int excOffset = getExceptionsOffset(props);
0272: int excWord = exceptions[excOffset++];
0273: int index;
0274: if (hasSlot(excWord, EXC_TITLE)) {
0275: index = EXC_TITLE;
0276: } else if (hasSlot(excWord, EXC_UPPER)) {
0277: index = EXC_UPPER;
0278: } else {
0279: return c;
0280: }
0281: c = getSlotValue(excWord, index, excOffset);
0282: }
0283: return c;
0284: }
0285:
0286: /**
0287: * Adds all simple case mappings and the full case folding for c to sa,
0288: * and also adds special case closure mappings.
0289: * c itself is not added.
0290: * For example, the mappings
0291: * - for s include long s
0292: * - for sharp s include ss
0293: * - for k include the Kelvin sign
0294: */
0295: public final void addCaseClosure(int c, UnicodeSet set) {
0296: /*
0297: * Hardcode the case closure of i and its relatives and ignore the
0298: * data file data for these characters.
0299: * The Turkic dotless i and dotted I with their case mapping conditions
0300: * and case folding option make the related characters behave specially.
0301: * This code matches their closure behavior to their case folding behavior.
0302: */
0303:
0304: switch (c) {
0305: case 0x49:
0306: /* regular i and I are in one equivalence class */
0307: set.add(0x69);
0308: return;
0309: case 0x69:
0310: set.add(0x49);
0311: return;
0312: case 0x130:
0313: /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
0314: set.add(iDot);
0315: return;
0316: case 0x131:
0317: /* dotless i is in a class by itself */
0318: return;
0319: default:
0320: /* otherwise use the data file data */
0321: break;
0322: }
0323:
0324: int props = trie.getCodePointValue(c);
0325: if (!propsHasException(props)) {
0326: if (getTypeFromProps(props) != NONE) {
0327: /* add the one simple case mapping, no matter what type it is */
0328: int delta = getDelta(props);
0329: if (delta != 0) {
0330: set.add(c + delta);
0331: }
0332: }
0333: } else {
0334: /*
0335: * c has exceptions, so there may be multiple simple and/or
0336: * full case mappings. Add them all.
0337: */
0338: int excOffset0, excOffset = getExceptionsOffset(props);
0339: int closureOffset;
0340: int excWord = exceptions[excOffset++];
0341: int index, closureLength, fullLength, length;
0342:
0343: excOffset0 = excOffset;
0344:
0345: /* add all simple case mappings */
0346: for (index = EXC_LOWER; index <= EXC_TITLE; ++index) {
0347: if (hasSlot(excWord, index)) {
0348: excOffset = excOffset0;
0349: c = getSlotValue(excWord, index, excOffset);
0350: set.add(c);
0351: }
0352: }
0353:
0354: /* get the closure string pointer & length */
0355: if (hasSlot(excWord, EXC_CLOSURE)) {
0356: excOffset = excOffset0;
0357: long value = getSlotValueAndOffset(excWord,
0358: EXC_CLOSURE, excOffset);
0359: closureLength = (int) value & CLOSURE_MAX_LENGTH; /* higher bits are reserved */
0360: closureOffset = (int) (value >> 32) + 1; /* behind this slot, unless there are full case mappings */
0361: } else {
0362: closureLength = 0;
0363: closureOffset = 0;
0364: }
0365:
0366: /* add the full case folding */
0367: if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
0368: excOffset = excOffset0;
0369: long value = getSlotValueAndOffset(excWord,
0370: EXC_FULL_MAPPINGS, excOffset);
0371: fullLength = (int) value;
0372:
0373: /* start of full case mapping strings */
0374: excOffset = (int) (value >> 32) + 1;
0375:
0376: fullLength &= 0xffff; /* bits 16 and higher are reserved */
0377:
0378: /* skip the lowercase result string */
0379: excOffset += fullLength & FULL_LOWER;
0380: fullLength >>= 4;
0381:
0382: /* add the full case folding string */
0383: length = fullLength & 0xf;
0384: if (length != 0) {
0385: set.add(new String(exceptions, excOffset, length));
0386: excOffset += length;
0387: }
0388:
0389: /* skip the uppercase and titlecase strings */
0390: fullLength >>= 4;
0391: excOffset += fullLength & 0xf;
0392: fullLength >>= 4;
0393: excOffset += fullLength;
0394:
0395: closureOffset = excOffset; /* behind full case mappings */
0396: }
0397:
0398: /* add each code point in the closure string */
0399: for (index = 0; index < closureLength; index += UTF16
0400: .getCharCount(c)) {
0401: c = UTF16.charAt(exceptions, closureOffset,
0402: exceptions.length, index);
0403: set.add(c);
0404: }
0405: }
0406: }
0407:
0408: /*
0409: * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
0410: * must be s.length()>0 and max>0 and s.length()<=max
0411: */
0412: private final int strcmpMax(String s, int unfoldOffset, int max) {
0413: int i1, length, c1, c2;
0414:
0415: length = s.length();
0416: max -= length; /* we require length<=max, so no need to decrement max in the loop */
0417: i1 = 0;
0418: do {
0419: c1 = s.charAt(i1++);
0420: c2 = unfold[unfoldOffset++];
0421: if (c2 == 0) {
0422: return 1; /* reached the end of t but not of s */
0423: }
0424: c1 -= c2;
0425: if (c1 != 0) {
0426: return c1; /* return difference result */
0427: }
0428: } while (--length > 0);
0429: /* ends with length==0 */
0430:
0431: if (max == 0 || unfold[unfoldOffset] == 0) {
0432: return 0; /* equal to length of both strings */
0433: } else {
0434: return -max; /* return lengh difference */
0435: }
0436: }
0437:
0438: /**
0439: * Maps the string to single code points and adds the associated case closure
0440: * mappings.
0441: * The string is mapped to code points if it is their full case folding string.
0442: * In other words, this performs a reverse full case folding and then
0443: * adds the case closure items of the resulting code points.
0444: * If the string is found and its closure applied, then
0445: * the string itself is added as well as part of its code points' closure.
0446: *
0447: * @return true if the string was found
0448: */
0449: public final boolean addStringCaseClosure(String s, UnicodeSet set) {
0450: int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
0451:
0452: if (unfold == null || s == null) {
0453: return false; /* no reverse case folding data, or no string */
0454: }
0455: length = s.length();
0456: if (length <= 1) {
0457: /* the string is too short to find any match */
0458: /*
0459: * more precise would be:
0460: * if(!u_strHasMoreChar32Than(s, length, 1))
0461: * but this does not make much practical difference because
0462: * a single supplementary code point would just not be found
0463: */
0464: return false;
0465: }
0466:
0467: unfoldRows = unfold[UNFOLD_ROWS];
0468: unfoldRowWidth = unfold[UNFOLD_ROW_WIDTH];
0469: unfoldStringWidth = unfold[UNFOLD_STRING_WIDTH];
0470: //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
0471:
0472: if (length > unfoldStringWidth) {
0473: /* the string is too long to find any match */
0474: return false;
0475: }
0476:
0477: /* do a binary search for the string */
0478: start = 0;
0479: limit = unfoldRows;
0480: while (start < limit) {
0481: i = (start + limit) / 2;
0482: unfoldOffset = ((i + 1) * unfoldRowWidth); // +1 to skip the header values above
0483: result = strcmpMax(s, unfoldOffset, unfoldStringWidth);
0484:
0485: if (result == 0) {
0486: /* found the string: add each code point, and its case closure */
0487: int c;
0488:
0489: for (i = unfoldStringWidth; i < unfoldRowWidth
0490: && unfold[unfoldOffset + i] != 0; i += UTF16
0491: .getCharCount(c)) {
0492: c = UTF16.charAt(unfold, unfoldOffset,
0493: unfold.length, i);
0494: set.add(c);
0495: addCaseClosure(c, set);
0496: }
0497: return true;
0498: } else if (result < 0) {
0499: limit = i;
0500: } else /* result>0 */{
0501: start = i + 1;
0502: }
0503: }
0504:
0505: return false; /* string not found */
0506: }
0507:
0508: /** @return NONE, LOWER, UPPER, TITLE */
0509: public final int getType(int c) {
0510: return getTypeFromProps(trie.getCodePointValue(c));
0511: }
0512:
0513: /** @return same as getType(), or <0 if c is case-ignorable */
0514: public final int getTypeOrIgnorable(int c) {
0515: int props = trie.getCodePointValue(c);
0516: int type = getTypeFromProps(props);
0517: if (type != NONE) {
0518: return type;
0519: } else if (c == 0x307
0520: || (props & (EXCEPTION | CASE_IGNORABLE)) == CASE_IGNORABLE) {
0521: return -1; /* case-ignorable */
0522: } else {
0523: return 0; /* c is neither cased nor case-ignorable */
0524: }
0525: }
0526:
0527: /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
0528: public final int getDotType(int c) {
0529: int props = trie.getCodePointValue(c);
0530: if (!propsHasException(props)) {
0531: return props & DOT_MASK;
0532: } else {
0533: return (exceptions[getExceptionsOffset(props)] >> EXC_DOT_SHIFT)
0534: & DOT_MASK;
0535: }
0536: }
0537:
0538: public final boolean isSoftDotted(int c) {
0539: return getDotType(c) == SOFT_DOTTED;
0540: }
0541:
0542: public final boolean isCaseSensitive(int c) {
0543: return (trie.getCodePointValue(c) & SENSITIVE) != 0;
0544: }
0545:
0546: // string casing ------------------------------------------------------- ***
0547:
0548: /*
0549: * These internal functions form the core of string case mappings.
0550: * They map single code points to result code points or strings and take
0551: * all necessary conditions (context, locale ID, options) into account.
0552: *
0553: * They do not iterate over the source or write to the destination
0554: * so that the same functions are useful for non-standard string storage,
0555: * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
0556: * For the same reason, the "surrounding text" context is passed in as a
0557: * ContextIterator which does not make any assumptions about
0558: * the underlying storage.
0559: *
0560: * This section contains helper functions that check for conditions
0561: * in the input text surrounding the current code point
0562: * according to SpecialCasing.txt.
0563: *
0564: * Each helper function gets the index
0565: * - after the current code point if it looks at following text
0566: * - before the current code point if it looks at preceding text
0567: *
0568: * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
0569: *
0570: * Final_Sigma
0571: * C is preceded by a sequence consisting of
0572: * a cased letter and a case-ignorable sequence,
0573: * and C is not followed by a sequence consisting of
0574: * an ignorable sequence and then a cased letter.
0575: *
0576: * More_Above
0577: * C is followed by one or more characters of combining class 230 (ABOVE)
0578: * in the combining character sequence.
0579: *
0580: * After_Soft_Dotted
0581: * The last preceding character with combining class of zero before C
0582: * was Soft_Dotted,
0583: * and there is no intervening combining character class 230 (ABOVE).
0584: *
0585: * Before_Dot
0586: * C is followed by combining dot above (U+0307).
0587: * Any sequence of characters with a combining class that is neither 0 nor 230
0588: * may intervene between the current character and the combining dot above.
0589: *
0590: * The erratum from 2002-10-31 adds the condition
0591: *
0592: * After_I
0593: * The last preceding base character was an uppercase I, and there is no
0594: * intervening combining character class 230 (ABOVE).
0595: *
0596: * (See Jitterbug 2344 and the comments on After_I below.)
0597: *
0598: * Helper definitions in Unicode 3.2 UAX 21:
0599: *
0600: * D1. A character C is defined to be cased
0601: * if it meets any of the following criteria:
0602: *
0603: * - The general category of C is Titlecase Letter (Lt)
0604: * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
0605: * - Given D = NFD(C), then it is not the case that:
0606: * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
0607: * (This third criterium does not add any characters to the list
0608: * for Unicode 3.2. Ignored.)
0609: *
0610: * D2. A character C is defined to be case-ignorable
0611: * if it meets either of the following criteria:
0612: *
0613: * - The general category of C is
0614: * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
0615: * Letter Modifier (Lm), or Symbol Modifier (Sk)
0616: * - C is one of the following characters
0617: * U+0027 APOSTROPHE
0618: * U+00AD SOFT HYPHEN (SHY)
0619: * U+2019 RIGHT SINGLE QUOTATION MARK
0620: * (the preferred character for apostrophe)
0621: *
0622: * D3. A case-ignorable sequence is a sequence of
0623: * zero or more case-ignorable characters.
0624: */
0625:
/**
 * Iterator over the text surrounding a character that is being case-mapped,
 * for conditional mappings that depend on that context.
 *
 * The iterator only ever moves away from the character in question,
 * either backward or forward. It exposes no indexes, offers no random
 * access and does not support an arbitrary change of iteration direction.
 *
 * The code point being case-mapped itself is never returned by
 * this iterator.
 */
public interface ContextIterator {
    /**
     * Restarts the iteration in the given direction.
     * @param dir >0: Begin iterating forward from the first code point
     *                after the one that is being case-mapped.
     *            <0: Begin iterating backward from the first code point
     *                before the one that is being case-mapped.
     */
    void reset(int dir);

    /**
     * Advances in the direction selected by the last reset() call
     * and returns the code point found there.
     * @return Next code point, or <0 when the iteration is done.
     */
    int next();
}
0655:
0656: /**
0657: * For string case mappings, a single character (a code point) is mapped
0658: * either to itself (in which case in-place mapping functions do nothing),
0659: * or to another single code point, or to a string.
0660: * Aside from the string contents, these are indicated with a single int
0661: * value as follows:
0662: *
0663: * Mapping to self: Negative values (~self instead of -self to support U+0000)
0664: *
0665: * Mapping to another code point: Positive values >MAX_STRING_LENGTH
0666: *
0667: * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
0668: * returned. Note that the string result may indeed have zero length.
0669: */
0670: public static final int MAX_STRING_LENGTH = 0x1f;
0671:
0672: private static final int LOC_UNKNOWN = 0;
0673: private static final int LOC_ROOT = 1;
0674: private static final int LOC_TURKISH = 2;
0675: private static final int LOC_LITHUANIAN = 3;
0676:
0677: /*
0678: * Checks and caches the type of locale ID as it is relevant for case mapping.
0679: * If the locCache is not null, then it must be initialized with locCache[0]=0 .
0680: */
0681: private static final int getCaseLocale(ULocale locale,
0682: int[] locCache) {
0683: int result;
0684:
0685: if (locCache != null && (result = locCache[0]) != LOC_UNKNOWN) {
0686: return result;
0687: }
0688:
0689: result = LOC_ROOT;
0690:
0691: String language = locale.getLanguage();
0692: if (language.equals("tr") || language.equals("tur")
0693: || language.equals("az") || language.equals("aze")) {
0694: result = LOC_TURKISH;
0695: } else if (language.equals("lt") || language.equals("lit")) {
0696: result = LOC_LITHUANIAN;
0697: }
0698:
0699: if (locCache != null) {
0700: locCache[0] = result;
0701: }
0702: return result;
0703: }
0704:
0705: /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
0706: private final boolean isFollowedByCasedLetter(ContextIterator iter,
0707: int dir) {
0708: int c;
0709: int props;
0710:
0711: if (iter == null) {
0712: return false;
0713: }
0714:
0715: for (iter.reset(dir); (c = iter.next()) >= 0;) {
0716: props = trie.getCodePointValue(c);
0717: if (getTypeFromProps(props) != NONE) {
0718: return true; /* followed by cased letter */
0719: } else if (c == 0x307
0720: || (props & (EXCEPTION | CASE_IGNORABLE)) == CASE_IGNORABLE) {
0721: /* case-ignorable, continue with the loop */
0722: } else {
0723: return false; /* not ignorable */
0724: }
0725: }
0726:
0727: return false; /* not followed by cased letter */
0728: }
0729:
0730: /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
0731: private final boolean isPrecededBySoftDotted(ContextIterator iter) {
0732: int c;
0733: int dotType;
0734:
0735: if (iter == null) {
0736: return false;
0737: }
0738:
0739: for (iter.reset(-1); (c = iter.next()) >= 0;) {
0740: dotType = getDotType(c);
0741: if (dotType == SOFT_DOTTED) {
0742: return true; /* preceded by TYPE_i */
0743: } else if (dotType != OTHER_ACCENT) {
0744: return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
0745: }
0746: }
0747:
0748: return false; /* not preceded by TYPE_i */
0749: }
0750:
0751: /*
0752: * See Jitterbug 2344:
0753: * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
0754: * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
0755: * we made those releases compatible with Unicode 3.2 which had not fixed
0756: * a related bug in SpecialCasing.txt.
0757: *
0758: * From the Jitterbug 2344 text:
0759: * ... this bug is listed as a Unicode erratum
0760: * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
0761: * <quote>
0762: * There are two errors in SpecialCasing.txt.
0763: * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
0764: * 2. An incorrect context definition. Correct as follows:
0765: * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
0766: * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
0767: * ---
0768: * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0769: * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
0770: * where the context After_I is defined as:
0771: * The last preceding base character was an uppercase I, and there is no
0772: * intervening combining character class 230 (ABOVE).
0773: * </quote>
0774: *
0775: * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
0776: *
0777: * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
0778: * # This matches the behavior of the canonically equivalent I-dot_above
0779: *
0780: * See also the description in this place in older versions of uchar.c (revision 1.100).
0781: *
0782: * Markus W. Scherer 2003-feb-15
0783: */
0784:
0785: /* Is preceded by base character 'I' with no intervening cc=230 ? */
0786: private final boolean isPrecededBy_I(ContextIterator iter) {
0787: int c;
0788: int dotType;
0789:
0790: if (iter == null) {
0791: return false;
0792: }
0793:
0794: for (iter.reset(-1); (c = iter.next()) >= 0;) {
0795: if (c == 0x49) {
0796: return true; /* preceded by I */
0797: }
0798: dotType = getDotType(c);
0799: if (dotType != OTHER_ACCENT) {
0800: return false; /* preceded by different base character (not I), or intervening cc==230 */
0801: }
0802: }
0803:
0804: return false; /* not preceded by I */
0805: }
0806:
0807: /* Is followed by one or more cc==230 ? */
0808: private final boolean isFollowedByMoreAbove(ContextIterator iter) {
0809: int c;
0810: int dotType;
0811:
0812: if (iter == null) {
0813: return false;
0814: }
0815:
0816: for (iter.reset(1); (c = iter.next()) >= 0;) {
0817: dotType = getDotType(c);
0818: if (dotType == ABOVE) {
0819: return true; /* at least one cc==230 following */
0820: } else if (dotType != OTHER_ACCENT) {
0821: return false; /* next base character, no more cc==230 following */
0822: }
0823: }
0824:
0825: return false; /* no more cc==230 following */
0826: }
0827:
0828: /* Is followed by a dot above (without cc==230 in between) ? */
0829: private final boolean isFollowedByDotAbove(ContextIterator iter) {
0830: int c;
0831: int dotType;
0832:
0833: if (iter == null) {
0834: return false;
0835: }
0836:
0837: for (iter.reset(1); (c = iter.next()) >= 0;) {
0838: if (c == 0x307) {
0839: return true;
0840: }
0841: dotType = getDotType(c);
0842: if (dotType != OTHER_ACCENT) {
0843: return false; /* next base character or cc==230 in between */
0844: }
0845: }
0846:
0847: return false; /* no dot above following */
0848: }
0849:
0850: private static final String iDot = "i\u0307", jDot = "j\u0307",
0851: iOgonekDot = "\u012f\u0307", iDotGrave = "i\u0307\u0300",
0852: iDotAcute = "i\u0307\u0301", iDotTilde = "i\u0307\u0303";
0853:
0854: /**
0855: * Get the full lowercase mapping for c.
0856: *
0857: * @param c Character to be mapped.
0858: * @param iter Character iterator, used for context-sensitive mappings.
0859: * See ContextIterator for details.
0860: * If iter==null then a context-independent result is returned.
0861: * @param out If the mapping result is a string, then it is appended to out.
0862: * @param locale Locale ID for locale-dependent mappings.
0863: * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
0864: * the locale ID for subsequent calls.
0865: * Can be null.
0866: * @return Output code point or string length, see MAX_STRING_LENGTH.
0867: *
0868: * @see ContextIterator
0869: * @see #MAX_STRING_LENGTH
0870: * @internal
0871: */
0872: public final int toFullLower(int c, ContextIterator iter,
0873: StringBuffer out, ULocale locale, int[] locCache) {
0874: int result, props;
0875:
0876: result = c;
0877: props = trie.getCodePointValue(c);
0878: if (!propsHasException(props)) {
0879: if (getTypeFromProps(props) >= UPPER) {
0880: result = c + getDelta(props);
0881: }
0882: } else {
0883: int excOffset = getExceptionsOffset(props), excOffset2;
0884: int excWord = exceptions[excOffset++];
0885: int full;
0886:
0887: excOffset2 = excOffset;
0888:
0889: if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0) {
0890: /* use hardcoded conditions and mappings */
0891: int loc = getCaseLocale(locale, locCache);
0892:
0893: /*
0894: * Test for conditional mappings first
0895: * (otherwise the unconditional default mappings are always taken),
0896: * then test for characters that have unconditional mappings in SpecialCasing.txt,
0897: * then get the UnicodeData.txt mappings.
0898: */
0899: if (loc == LOC_LITHUANIAN
0900: &&
0901: /* base characters, find accents above */
0902: (((c == 0x49 || c == 0x4a || c == 0x12e) && isFollowedByMoreAbove(iter)) ||
0903: /* precomposed with accent above, no need to find one */
0904: (c == 0xcc || c == 0xcd || c == 0x128))) {
0905: /*
0906: # Lithuanian
0907:
0908: # Lithuanian retains the dot in a lowercase i when followed by accents.
0909:
0910: # Introduce an explicit dot above when lowercasing capital I's and J's
0911: # whenever there are more accents above.
0912: # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0913:
0914: 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
0915: 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
0916: 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
0917: 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
0918: 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0919: 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
0920: */
0921: switch (c) {
0922: case 0x49: /* LATIN CAPITAL LETTER I */
0923: out.append(iDot);
0924: return 2;
0925: case 0x4a: /* LATIN CAPITAL LETTER J */
0926: out.append(jDot);
0927: return 2;
0928: case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
0929: out.append(iOgonekDot);
0930: return 2;
0931: case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
0932: out.append(iDotGrave);
0933: return 3;
0934: case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
0935: out.append(iDotAcute);
0936: return 3;
0937: case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
0938: out.append(iDotTilde);
0939: return 3;
0940: default:
0941: return 0; /* will not occur */
0942: }
0943: /* # Turkish and Azeri */
0944: } else if (loc == LOC_TURKISH && c == 0x130) {
0945: /*
0946: # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
0947: # The following rules handle those cases.
0948:
0949: 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
0950: 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
0951: */
0952: return 0x69;
0953: } else if (loc == LOC_TURKISH && c == 0x307
0954: && isPrecededBy_I(iter)) {
0955: /*
0956: # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
0957: # This matches the behavior of the canonically equivalent I-dot_above
0958:
0959: 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0960: 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
0961: */
0962: return 0; /* remove the dot (continue without output) */
0963: } else if (loc == LOC_TURKISH && c == 0x49
0964: && !isFollowedByDotAbove(iter)) {
0965: /*
0966: # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0967:
0968: 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
0969: 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
0970: */
0971: return 0x131;
0972: } else if (c == 0x130) {
0973: /*
0974: # Preserve canonical equivalence for I with dot. Turkic is handled below.
0975:
0976: 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0977: */
0978: out.append(iDot);
0979: return 2;
0980: } else if (c == 0x3a3
0981: && !isFollowedByCasedLetter(iter, 1)
0982: && isFollowedByCasedLetter(iter, -1) /* -1=preceded */
0983: ) {
0984: /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
0985: /*
0986: # Special case for final form of sigma
0987:
0988: 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
0989: */
0990: return 0x3c2; /* greek small final sigma */
0991: } else {
0992: /* no known conditional special case mapping, use a normal mapping */
0993: }
0994: } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
0995: long value = getSlotValueAndOffset(excWord,
0996: EXC_FULL_MAPPINGS, excOffset);
0997: full = (int) value & FULL_LOWER;
0998: if (full != 0) {
0999: /* start of full case mapping strings */
1000: excOffset = (int) (value >> 32) + 1;
1001:
1002: /* set the output pointer to the lowercase mapping */
1003: out.append(new String(exceptions, excOffset, full));
1004:
1005: /* return the string length */
1006: return full;
1007: }
1008: }
1009:
1010: if (hasSlot(excWord, EXC_LOWER)) {
1011: result = getSlotValue(excWord, EXC_LOWER, excOffset2);
1012: }
1013: }
1014:
1015: return (result == c) ? ~result : result;
1016: }
1017:
1018: /* internal */
1019: private final int toUpperOrTitle(int c, ContextIterator iter,
1020: StringBuffer out, ULocale locale, int[] locCache,
1021: boolean upperNotTitle) {
1022: int result;
1023: int props;
1024:
1025: result = c;
1026: props = trie.getCodePointValue(c);
1027: if (!propsHasException(props)) {
1028: if (getTypeFromProps(props) == LOWER) {
1029: result = c + getDelta(props);
1030: }
1031: } else {
1032: int excOffset = getExceptionsOffset(props), excOffset2;
1033: int excWord = exceptions[excOffset++];
1034: int full, index;
1035:
1036: excOffset2 = excOffset;
1037:
1038: if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0) {
1039: /* use hardcoded conditions and mappings */
1040: int loc = getCaseLocale(locale, locCache);
1041:
1042: if (loc == LOC_TURKISH && c == 0x69) {
1043: /*
1044: # Turkish and Azeri
1045:
1046: # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1047: # The following rules handle those cases.
1048:
1049: # When uppercasing, i turns into a dotted capital I
1050:
1051: 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1052: 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1053: */
1054: return 0x130;
1055: } else if (loc == LOC_LITHUANIAN && c == 0x307
1056: && isPrecededBySoftDotted(iter)) {
1057: /*
1058: # Lithuanian
1059:
1060: # Lithuanian retains the dot in a lowercase i when followed by accents.
1061:
1062: # Remove DOT ABOVE after "i" with upper or titlecase
1063:
1064: 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1065: */
1066: return 0; /* remove the dot (continue without output) */
1067: } else {
1068: /* no known conditional special case mapping, use a normal mapping */
1069: }
1070: } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1071: long value = getSlotValueAndOffset(excWord,
1072: EXC_FULL_MAPPINGS, excOffset);
1073: full = (int) value & 0xffff;
1074:
1075: /* start of full case mapping strings */
1076: excOffset = (int) (value >> 32) + 1;
1077:
1078: /* skip the lowercase and case-folding result strings */
1079: excOffset += full & FULL_LOWER;
1080: full >>= 4;
1081: excOffset += full & 0xf;
1082: full >>= 4;
1083:
1084: if (upperNotTitle) {
1085: full &= 0xf;
1086: } else {
1087: /* skip the uppercase result string */
1088: excOffset += full & 0xf;
1089: full = (full >> 4) & 0xf;
1090: }
1091:
1092: if (full != 0) {
1093: /* set the output pointer to the result string */
1094: out.append(new String(exceptions, excOffset, full));
1095:
1096: /* return the string length */
1097: return full;
1098: }
1099: }
1100:
1101: if (!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1102: index = EXC_TITLE;
1103: } else if (hasSlot(excWord, EXC_UPPER)) {
1104: /* here, titlecase is same as uppercase */
1105: index = EXC_UPPER;
1106: } else {
1107: return ~c;
1108: }
1109: result = getSlotValue(excWord, index, excOffset2);
1110: }
1111:
1112: return (result == c) ? ~result : result;
1113: }
1114:
1115: public final int toFullUpper(int c, ContextIterator iter,
1116: StringBuffer out, ULocale locale, int[] locCache) {
1117: return toUpperOrTitle(c, iter, out, locale, locCache, true);
1118: }
1119:
1120: public final int toFullTitle(int c, ContextIterator iter,
1121: StringBuffer out, ULocale locale, int[] locCache) {
1122: return toUpperOrTitle(c, iter, out, locale, locCache, false);
1123: }
1124:
1125: /* case folding ------------------------------------------------------------- */
1126:
1127: /*
1128: * Case folding is similar to lowercasing.
1129: * The result may be a simple mapping, i.e., a single code point, or
1130: * a full mapping, i.e., a string.
1131: * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1132: * then only the lowercase mapping is stored.
1133: *
1134: * Some special cases are hardcoded because their conditions cannot be
1135: * parsed and processed from CaseFolding.txt.
1136: *
1137: * Unicode 3.2 CaseFolding.txt specifies for its status field:
1138:
1139: # C: common case folding, common mappings shared by both simple and full mappings.
1140: # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1141: # S: simple case folding, mappings to single characters where different from F.
1142: # T: special case for uppercase I and dotted uppercase I
1143: # - For non-Turkic languages, this mapping is normally not used.
1144: # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1145: #
1146: # Usage:
1147: # A. To do a simple case folding, use the mappings with status C + S.
1148: # B. To do a full case folding, use the mappings with status C + F.
1149: #
1150: # The mappings with status T can be used or omitted depending on the desired case-folding
1151: # behavior. (The default option is to exclude them.)
1152:
1153: * Unicode 3.2 has 'T' mappings as follows:
1154:
1155: 0049; T; 0131; # LATIN CAPITAL LETTER I
1156: 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1157:
1158: * while the default mappings for these code points are:
1159:
1160: 0049; C; 0069; # LATIN CAPITAL LETTER I
1161: 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1162:
1163: * U+0130 has no simple case folding (simple-case-folds to itself).
1164: */
1165:
1166: /**
1167: * Bit mask for getting just the options from a string compare options word
1168: * that are relevant for case folding (of a single string or code point).
1169: * @internal
1170: */
1171: private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1172:
1173: /* return the simple case folding mapping for c */
1174: public final int fold(int c, int options) {
1175: int props = trie.getCodePointValue(c);
1176: if (!propsHasException(props)) {
1177: if (getTypeFromProps(props) >= UPPER) {
1178: c += getDelta(props);
1179: }
1180: } else {
1181: int excOffset = getExceptionsOffset(props);
1182: int excWord = exceptions[excOffset++];
1183: int index;
1184: if ((excWord & EXC_CONDITIONAL_FOLD) != 0) {
1185: /* special case folding mappings, hardcoded */
1186: if ((options & FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT) {
1187: /* default mappings */
1188: if (c == 0x49) {
1189: /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1190: return 0x69;
1191: } else if (c == 0x130) {
1192: /* no simple case folding for U+0130 */
1193: return c;
1194: }
1195: } else {
1196: /* Turkic mappings */
1197: if (c == 0x49) {
1198: /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1199: return 0x131;
1200: } else if (c == 0x130) {
1201: /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1202: return 0x69;
1203: }
1204: }
1205: }
1206: if (hasSlot(excWord, EXC_FOLD)) {
1207: index = EXC_FOLD;
1208: } else if (hasSlot(excWord, EXC_LOWER)) {
1209: index = EXC_LOWER;
1210: } else {
1211: return c;
1212: }
1213: c = getSlotValue(excWord, index, excOffset);
1214: }
1215: return c;
1216: }
1217:
1218: /*
1219: * Issue for canonical caseless match (UAX #21):
1220: * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1221: * canonical equivalence, unlike default-option casefolding.
1222: * For example, I-grave and I + grave fold to strings that are not canonically
1223: * equivalent.
1224: * For more details, see the comment in unorm_compare() in unorm.cpp
1225: * and the intermediate prototype changes for Jitterbug 2021.
1226: * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1227: *
1228: * This did not get fixed because it appears that it is not possible to fix
1229: * it for uppercase and lowercase characters (I-grave vs. i-grave)
1230: * together in a way that they still fold to common result strings.
1231: */
1232:
1233: public final int toFullFolding(int c, StringBuffer out, int options) {
1234: int result;
1235: int props;
1236:
1237: result = c;
1238: props = trie.getCodePointValue(c);
1239: if (!propsHasException(props)) {
1240: if (getTypeFromProps(props) >= UPPER) {
1241: result = c + getDelta(props);
1242: }
1243: } else {
1244: int excOffset = getExceptionsOffset(props), excOffset2;
1245: int excWord = exceptions[excOffset++];
1246: int full, index;
1247:
1248: excOffset2 = excOffset;
1249:
1250: if ((excWord & EXC_CONDITIONAL_FOLD) != 0) {
1251: /* use hardcoded conditions and mappings */
1252: if ((options & FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT) {
1253: /* default mappings */
1254: if (c == 0x49) {
1255: /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1256: return 0x69;
1257: } else if (c == 0x130) {
1258: /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1259: out.append(iDot);
1260: return 2;
1261: }
1262: } else {
1263: /* Turkic mappings */
1264: if (c == 0x49) {
1265: /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1266: return 0x131;
1267: } else if (c == 0x130) {
1268: /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1269: return 0x69;
1270: }
1271: }
1272: } else if (hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1273: long value = getSlotValueAndOffset(excWord,
1274: EXC_FULL_MAPPINGS, excOffset);
1275: full = (int) value & 0xffff;
1276:
1277: /* start of full case mapping strings */
1278: excOffset = (int) (value >> 32) + 1;
1279:
1280: /* skip the lowercase result string */
1281: excOffset += full & FULL_LOWER;
1282: full = (full >> 4) & 0xf;
1283:
1284: if (full != 0) {
1285: /* set the output pointer to the result string */
1286: out.append(new String(exceptions, excOffset, full));
1287:
1288: /* return the string length */
1289: return full;
1290: }
1291: }
1292:
1293: if (hasSlot(excWord, EXC_FOLD)) {
1294: index = EXC_FOLD;
1295: } else if (hasSlot(excWord, EXC_LOWER)) {
1296: index = EXC_LOWER;
1297: } else {
1298: return ~c;
1299: }
1300: result = getSlotValue(excWord, index, excOffset2);
1301: }
1302:
1303: return (result == c) ? ~result : result;
1304: }
1305:
1306: // data members -------------------------------------------------------- ***
1307: private int indexes[];
1308: private char exceptions[];
1309: private char unfold[];
1310:
1311: private CharTrie trie;
1312: private byte formatVersion[];
1313: private byte unicodeVersion[];
1314:
1315: // data format constants ----------------------------------------------- ***
1316: private static final String DATA_NAME = "ucase";
1317: private static final String DATA_TYPE = "icu";
1318: private static final String DATA_FILE_NAME = DATA_NAME + "."
1319: + DATA_TYPE;
1320:
1321: /* format "cAsE" */
1322: private static final byte FMT[] = { 0x63, 0x41, 0x53, 0x45 };
1323:
1324: /* indexes into indexes[] */
1325: private static final int IX_INDEX_TOP = 0;
1326: private static final int IX_LENGTH = 1;
1327: private static final int IX_TRIE_SIZE = 2;
1328: private static final int IX_EXC_LENGTH = 3;
1329: private static final int IX_UNFOLD_LENGTH = 4;
1330:
1331: private static final int IX_MAX_FULL_LENGTH = 15;
1332: private static final int IX_TOP = 16;
1333:
1334: // definitions for 16-bit case properties word ------------------------- ***
1335:
1336: /* 2-bit constants for types of cased characters */
1337: public static final int TYPE_MASK = 3;
1338: public static final int NONE = 0;
1339: public static final int LOWER = 1;
1340: public static final int UPPER = 2;
1341: public static final int TITLE = 3;
1342:
1343: private static final int getTypeFromProps(int props) {
1344: return props & TYPE_MASK;
1345: }
1346:
1347: private static final int SENSITIVE = 4;
1348: private static final int EXCEPTION = 8;
1349:
1350: private static final int DOT_MASK = 0x30;
1351: private static final int NO_DOT = 0; /* normal characters with cc=0 */
1352: private static final int SOFT_DOTTED = 0x10; /* soft-dotted characters with cc=0 */
1353: private static final int ABOVE = 0x20; /* "above" accents with cc=230 */
1354: private static final int OTHER_ACCENT = 0x30; /* other accent character (0<cc!=230) */
1355:
1356: /* no exception: bits 15..6 are a 10-bit signed case mapping delta */
1357: private static final int DELTA_SHIFT = 6;
1358: private static final int DELTA_MASK = 0xffc0;
1359: private static final int MAX_DELTA = 0x1ff;
1360: private static final int MIN_DELTA = (-MAX_DELTA - 1);
1361:
1362: private static final int getDelta(int props) {
1363: return (short) props >> DELTA_SHIFT;
1364: }
1365:
1366: /* case-ignorable uses one of the delta bits, see gencase/store.c */
1367: private static final int CASE_IGNORABLE = 0x40;
1368:
1369: /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
1370: private static final int EXC_SHIFT = 4;
1371: private static final int EXC_MASK = 0xfff0;
1372: private static final int MAX_EXCEPTIONS = 0x1000;
1373:
1374: /* definitions for 16-bit main exceptions word ------------------------------ */
1375:
1376: /* first 8 bits indicate values in optional slots */
1377: private static final int EXC_LOWER = 0;
1378: private static final int EXC_FOLD = 1;
1379: private static final int EXC_UPPER = 2;
1380: private static final int EXC_TITLE = 3;
1381: private static final int EXC_4 = 4; /* reserved */
1382: private static final int EXC_5 = 5; /* reserved */
1383: private static final int EXC_CLOSURE = 6;
1384: private static final int EXC_FULL_MAPPINGS = 7;
1385: private static final int EXC_ALL_SLOTS = 8; /* one past the last slot */
1386:
1387: /* each slot is 2 uint16_t instead of 1 */
1388: private static final int EXC_DOUBLE_SLOTS = 0x100;
1389:
1390: /* reserved: exception bits 11..9 */
1391:
1392: /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
1393: private static final int EXC_DOT_SHIFT = 8;
1394:
1395: /* normally stored in the main word, but pushed out for larger exception indexes */
1396: private static final int EXC_DOT_MASK = 0x3000;
1397: private static final int EXC_NO_DOT = 0;
1398: private static final int EXC_SOFT_DOTTED = 0x1000;
1399: private static final int EXC_ABOVE = 0x2000; /* "above" accents with cc=230 */
1400: private static final int EXC_OTHER_ACCENT = 0x3000; /* other character (0<cc!=230) */
1401:
1402: /* complex/conditional mappings */
1403: private static final int EXC_CONDITIONAL_SPECIAL = 0x4000;
1404: private static final int EXC_CONDITIONAL_FOLD = 0x8000;
1405:
1406: /* definitions for lengths word for full case mappings */
1407: private static final int FULL_LOWER = 0xf;
1408: private static final int FULL_FOLDING = 0xf0;
1409: private static final int FULL_UPPER = 0xf00;
1410: private static final int FULL_TITLE = 0xf000;
1411:
1412: /* maximum lengths */
1413: private static final int FULL_MAPPINGS_MAX_LENGTH = 4 * 0xf;
1414: private static final int CLOSURE_MAX_LENGTH = 0xf;
1415:
1416: /* constants for reverse case folding ("unfold") data */
1417: private static final int UNFOLD_ROWS = 0;
1418: private static final int UNFOLD_ROW_WIDTH = 1;
1419: private static final int UNFOLD_STRING_WIDTH = 2;
1420: }
|