001: /**
002: * Implements Unicode Normalization Forms C, D, KC, KD.<br>
003: * See UTR#15 for details.<br>
004: * Copyright (C) 1998-2003 International Business Machines Corporation and
005: * Unicode, Inc. All Rights Reserved.<br>
006: * The Unicode Consortium makes no expressed or implied warranty of any
007: * kind, and assumes no liability for errors or omissions.
008: * No liability is assumed for incidental and consequential damages
009: * in connection with or arising out of the use of the information here.
010: * @author Mark Davis
011: * Updates for supplementary code points:
012: * Vladimir Weinstein & Markus Scherer
013: */package com.ibm.icu.dev.test.normalizer;
014:
015: import com.ibm.icu.dev.test.UTF16Util;
016:
017: public class UnicodeNormalizer {
018: static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
019:
020: /**
021: * Create a normalizer for a given form.
022: */
023: public UnicodeNormalizer(byte form, boolean fullData) {
024: this .form = form;
025: if (data == null)
026: data = NormalizerBuilder.build(fullData); // load 1st time
027: }
028:
029: /**
030: * Masks for the form selector
031: */
032: static final byte COMPATIBILITY_MASK = 1, COMPOSITION_MASK = 2;
033:
034: /**
035: * Normalization Form Selector
036: */
037: public static final byte D = 0, C = COMPOSITION_MASK,
038: KD = COMPATIBILITY_MASK,
039: KC = (byte) (COMPATIBILITY_MASK + COMPOSITION_MASK);
040:
041: /**
042: * Normalizes text according to the chosen form,
043: * replacing contents of the target buffer.
044: * @param source the original text, unnormalized
045: * @param target the resulting normalized text
046: */
047: public StringBuffer normalize(String source, StringBuffer target) {
048:
049: // First decompose the source into target,
050: // then compose if the form requires.
051:
052: if (source.length() != 0) {
053: internalDecompose(source, target);
054: if ((form & COMPOSITION_MASK) != 0) {
055: internalCompose(target);
056: }
057: }
058: return target;
059: }
060:
061: /**
062: * Normalizes text according to the chosen form
063: * @param source the original text, unnormalized
064: * @return target the resulting normalized text
065: */
066: public String normalize(String source) {
067: return normalize(source, new StringBuffer()).toString();
068: }
069:
070: // ======================================
071: // PRIVATES
072: // ======================================
073:
074: /**
075: * The current form.
076: */
077: private byte form;
078:
079: /**
080: * Decomposes text, either canonical or compatibility,
081: * replacing contents of the target buffer.
082: * @param form the normalization form. If COMPATIBILITY_MASK
083: * bit is on in this byte, then selects the recursive
084: * compatibility decomposition, otherwise selects
085: * the recursive canonical decomposition.
086: * @param source the original text, unnormalized
087: * @param target the resulting normalized text
088: */
089: private void internalDecompose(String source, StringBuffer target) {
090: StringBuffer buffer = new StringBuffer();
091: boolean canonical = (form & COMPATIBILITY_MASK) == 0;
092: int ch;
093: for (int i = 0; i < source.length();) {
094: buffer.setLength(0);
095: ch = UTF16Util.nextCodePoint(source, i);
096: i += UTF16Util.codePointLength(ch);
097: data.getRecursiveDecomposition(canonical, ch, buffer);
098:
099: // add all of the characters in the decomposition.
100: // (may be just the original character, if there was
101: // no decomposition mapping)
102:
103: for (int j = 0; j < buffer.length();) {
104: ch = UTF16Util.nextCodePoint(buffer, j);
105: j += UTF16Util.codePointLength(ch);
106: int chClass = data.getCanonicalClass(ch);
107: int k = target.length(); // insertion point
108: if (chClass != 0) {
109:
110: // bubble-sort combining marks as necessary
111:
112: int ch2;
113: for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
114: ch2 = UTF16Util.prevCodePoint(target, k);
115: if (data.getCanonicalClass(ch2) <= chClass)
116: break;
117: }
118: }
119: UTF16Util.insertCodePoint(target, k, ch);
120: }
121: }
122: }
123:
124: /**
125: * Composes text in place. Target must already
126: * have been decomposed.
127: * @param target input: decomposed text.
128: * output: the resulting normalized text.
129: */
130: private void internalCompose(StringBuffer target) {
131:
132: int starterPos = 0;
133: int starterCh = UTF16Util.nextCodePoint(target, 0);
134: int compPos = UTF16Util.codePointLength(starterCh);
135: int lastClass = data.getCanonicalClass(starterCh);
136: if (lastClass != 0)
137: lastClass = 256; // fix for irregular combining sequence
138:
139: // Loop on the decomposed characters, combining where possible
140:
141: for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target
142: .length();) {
143: int ch = UTF16Util.nextCodePoint(target, decompPos);
144: decompPos += UTF16Util.codePointLength(ch);
145: int chClass = data.getCanonicalClass(ch);
146: int composite = data.getPairwiseComposition(starterCh, ch);
147: if (composite != NormalizerData.NOT_COMPOSITE
148: && (lastClass < chClass || lastClass == 0)) {
149: UTF16Util.setCodePointAt(target, starterPos, composite);
150: starterCh = composite;
151: } else {
152: if (chClass == 0) {
153: starterPos = compPos;
154: starterCh = ch;
155: }
156: lastClass = chClass;
157: decompPos += UTF16Util.setCodePointAt(target, compPos,
158: ch);
159: compPos += UTF16Util.codePointLength(ch);
160: }
161: }
162: target.setLength(compPos);
163: }
164:
165: /**
166: * Contains normalization data from the Unicode Character Database.
167: * use false for the minimal set, true for the real set.
168: */
169: private static NormalizerData data = null;
170:
171: /**
172: * Just accessible for testing.
173: */
174: boolean getExcluded(char ch) {
175: return data.getExcluded(ch);
176: }
177:
178: /**
179: * Just accessible for testing.
180: */
181: String getRawDecompositionMapping(char ch) {
182: return data.getRawDecompositionMapping(ch);
183: }
184: }
|