001: package net.sf.saxon.codenorm;
002:
003: import net.sf.saxon.om.XMLChar;
004: import net.sf.saxon.sort.IntHashMap;
005: import net.sf.saxon.sort.IntToIntHashMap;
006:
007: import java.util.BitSet;
008:
009: /**
010: * Accesses the Normalization Data used for Forms C and D.<br>
011: * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
012: * The Unicode Consortium makes no expressed or implied warranty of any
013: * kind, and assumes no liability for errors or omissions.
014: * No liability is assumed for incidental and consequential damages
015: * in connection with or arising out of the use of the information here.
016: * @author Mark Davis
017: */
018: public class NormalizerData {
019: static final String copyright = "Copyright © 1998-1999 Unicode, Inc.";
020:
021: /**
022: * Constant for use in getPairwiseComposition
023: */
024: public static final int NOT_COMPOSITE = '\uFFFF';
025:
026: /**
027: * Gets the combining class of a character from the
028: * Unicode Character Database.
029: * @param ch the source character
030: * @return value from 0 to 255
031: */
032: public int getCanonicalClass(int ch) {
033: return canonicalClass.get(ch);
034: }
035:
036: /**
037: * Returns the composite of the two characters. If the two
038: * characters don't combine, returns NOT_COMPOSITE.
039: * Only has to worry about BMP characters, since those are the only ones that can ever compose.
040: * @param first first character (e.g. 'c')
041: * @param first second character (e.g. '¸' cedilla)
042: * @return composite (e.g. 'ç')
043: */
044: public char getPairwiseComposition(int first, int second) {
045: if (first < 0 || first > 0x10FFFF || second < 0
046: || second > 0x10FFFF)
047: return NOT_COMPOSITE;
048: return (char) compose.get((first << 16) | second);
049: }
050:
051: /**
052: * Gets recursive decomposition of a character from the
053: * Unicode Character Database.
054: * @param canonical If true
055: * bit is on in this byte, then selects the recursive
056: * canonical decomposition, otherwise selects
057: * the recursive compatibility and canonical decomposition.
058: * @param ch the source character
059: * @param buffer buffer to be filled with the decomposition
060: */
061: public void getRecursiveDecomposition(boolean canonical, int ch,
062: StringBuffer buffer) {
063: String decomp = (String) decompose.get(ch);
064: if (decomp != null && !(canonical && isCompatibility.get(ch))) {
065: for (int i = 0; i < decomp.length(); ++i) {
066: getRecursiveDecomposition(canonical, decomp.charAt(i),
067: buffer);
068: }
069: } else { // if no decomp, append
070: //UTF16.append(buffer, ch);
071: if (ch < 65536) {
072: buffer.append((char) ch);
073: } else { // output a surrogate pair
074: buffer.append(XMLChar.highSurrogate(ch));
075: buffer.append(XMLChar.lowSurrogate(ch));
076: }
077: }
078: }
079:
080: // =================================================
081: // PRIVATES
082: // =================================================
083:
084: /**
085: * Only accessed by NormalizerBuilder.
086: */
087: NormalizerData(IntToIntHashMap canonicalClass,
088: IntHashMap decompose, IntToIntHashMap compose,
089: BitSet isCompatibility, BitSet isExcluded) {
090: this .canonicalClass = canonicalClass;
091: this .decompose = decompose;
092: this .compose = compose;
093: this .isCompatibility = isCompatibility;
094: this .isExcluded = isExcluded;
095: }
096:
097: /**
098: * Just accessible for testing.
099: */
100: boolean getExcluded(char ch) {
101: return isExcluded.get(ch);
102: }
103:
104: /**
105: * Just accessible for testing.
106: */
107: String getRawDecompositionMapping(char ch) {
108: return (String) decompose.get(ch);
109: }
110:
111: /**
112: * For now, just use IntHashtable
113: * Two-stage tables would be used in an optimized implementation.
114: */
115: private IntToIntHashMap canonicalClass;
116:
117: /**
118: * The main data table maps chars to a 32-bit int.
119: * It holds either a pair: top = first, bottom = second
120: * or singleton: top = 0, bottom = single.
121: * If there is no decomposition, the value is 0.
122: * Two-stage tables would be used in an optimized implementation.
123: * An optimization could also map chars to a small index, then use that
124: * index in a small array of ints.
125: */
126: private IntHashMap decompose;
127:
128: /**
129: * Maps from pairs of characters to single.
130: * If there is no decomposition, the value is NOT_COMPOSITE.
131: */
132: private IntToIntHashMap compose;
133:
134: /**
135: * Tells whether decomposition is canonical or not.
136: */
137: private BitSet isCompatibility = new BitSet();
138:
139: /**
140: * Tells whether character is script-excluded or not.
141: * Used only while building, and for testing.
142: */
143:
144: private BitSet isExcluded = new BitSet();
145: }
|