001: package com.ibm.icu.dev.test.normalizer;
002:
003: import java.util.BitSet;
004:
005: import com.ibm.icu.dev.test.UTF16Util;
006:
007: /**
008: * Accesses the Normalization Data used for Forms C and D.<br>
009: * Copyright (C) 1998-2004 International Business Machines Corporation and
010: * Unicode, Inc. All Rights Reserved.<br>
011: * The Unicode Consortium makes no expressed or implied warranty of any
012: * kind, and assumes no liability for errors or omissions.
013: * No liability is assumed for incidental and consequential damages
014: * in connection with or arising out of the use of the information here.
015: * @author Mark Davis
016: * Updates for supplementary code points:
017: * Vladimir Weinstein & Markus Scherer
018: */
019: public class NormalizerData {
020: static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
021:
022: /**
023: * Constant for use in getPairwiseComposition
024: */
025: public static final int NOT_COMPOSITE = '\uFFFF';
026:
027: /**
028: * Gets the combining class of a character from the
029: * Unicode Character Database.
030: * @param ch the source character
031: * @return value from 0 to 255
032: */
033: public int getCanonicalClass(int ch) {
034: return canonicalClass.get(ch);
035: }
036:
037: /**
038: * Returns the composite of the two characters. If the two
039: * characters don't combine, returns NOT_COMPOSITE.
040: * @param first first character (e.g. 'c')
041: * @param second second character (e.g. \u0327 cedilla)
042: * @return composite (e.g. \u00C7 c cedilla)
043: */
044: public int getPairwiseComposition(int first, int second) {
045: return compose.get(((long) first << 32) | second);
046: }
047:
048: /**
049: * Gets recursive decomposition of a character from the
050: * Unicode Character Database.
051: * @param canonical If true
052: * bit is on in this byte, then selects the recursive
053: * canonical decomposition, otherwise selects
054: * the recursive compatibility and canonical decomposition.
055: * @param ch the source character
056: * @param buffer buffer to be filled with the decomposition
057: */
058: public void getRecursiveDecomposition(boolean canonical, int ch,
059: StringBuffer buffer) {
060: String decomp = decompose.get(ch);
061: if (decomp != null && !(canonical && isCompatibility.get(ch))) {
062: for (int i = 0; i < decomp.length(); i += UTF16Util
063: .codePointLength(ch)) {
064: ch = UTF16Util.nextCodePoint(decomp, i);
065: getRecursiveDecomposition(canonical, ch, buffer);
066: }
067: } else { // if no decomp, append
068: UTF16Util.appendCodePoint(buffer, ch);
069: }
070: }
071:
072: // =================================================
073: // PRIVATES
074: // =================================================
075:
076: /**
077: * Only accessed by NormalizerBuilder.
078: */
079: NormalizerData(IntHashtable canonicalClass,
080: IntStringHashtable decompose, LongHashtable compose,
081: BitSet isCompatibility, BitSet isExcluded) {
082: this .canonicalClass = canonicalClass;
083: this .decompose = decompose;
084: this .compose = compose;
085: this .isCompatibility = isCompatibility;
086: this .isExcluded = isExcluded;
087: }
088:
089: /**
090: * Just accessible for testing.
091: */
092: boolean getExcluded(char ch) {
093: return isExcluded.get(ch);
094: }
095:
096: /**
097: * Just accessible for testing.
098: */
099: String getRawDecompositionMapping(char ch) {
100: return decompose.get(ch);
101: }
102:
103: /**
104: * For now, just use IntHashtable
105: * Two-stage tables would be used in an optimized implementation.
106: */
107: private IntHashtable canonicalClass;
108:
109: /**
110: * The main data table maps chars to a 32-bit int.
111: * It holds either a pair: top = first, bottom = second
112: * or singleton: top = 0, bottom = single.
113: * If there is no decomposition, the value is 0.
114: * Two-stage tables would be used in an optimized implementation.
115: * An optimization could also map chars to a small index, then use that
116: * index in a small array of ints.
117: */
118: private IntStringHashtable decompose;
119:
120: /**
121: * Maps from pairs of characters to single.
122: * If there is no decomposition, the value is NOT_COMPOSITE.
123: */
124: private LongHashtable compose;
125:
126: /**
127: * Tells whether decomposition is canonical or not.
128: */
129: private BitSet isCompatibility = new BitSet();
130:
131: /**
132: * Tells whether character is script-excluded or not.
133: * Used only while building, and for testing.
134: */
135:
136: private BitSet isExcluded = new BitSet();
137: }
|