001: package net.sf.saxon.codenorm;
002:
003: import net.sf.saxon.om.XMLChar;
004:
005: /**
006: * Implements Unicode Normalization Forms C, D, KC, KD.<br>
007: * Copyright (c) 1991-2005 Unicode, Inc.
008: * For terms of use, see http://www.unicode.org/terms_of_use.html
009: * For documentation, see UAX#15.<br>
010: * The Unicode Consortium makes no expressed or implied warranty of any
011: * kind, and assumes no liability for errors or omissions.
012: * No liability is assumed for incidental and consequential damages
013: * in connection with or arising out of the use of the information here.
014: * @author Mark Davis
015: * Updates for supplementary code points: Vladimir Weinstein & Markus Scherer
016: * Modified to remove dependency on ICU code: Michael Kay
017: */
018:
019: public class Normalizer {
020:
021: /**
022: * Create a normalizer for a given form.
023: */
024: public Normalizer(byte form) {
025: this .form = form;
026: if (data == null) {
027: data = UnicodeDataParser.build(); // load 1st time
028: }
029: }
030:
031: /**
032: * Masks for the form selector
033: */
034: static final byte COMPATIBILITY_MASK = 1, COMPOSITION_MASK = 2;
035:
036: /**
037: * Normalization Form Selector
038: */
039: public static final byte D = 0, C = COMPOSITION_MASK,
040: KD = COMPATIBILITY_MASK,
041: KC = (byte) (COMPATIBILITY_MASK + COMPOSITION_MASK);
042:
043: /**
044: * Normalizes text according to the chosen form,
045: * replacing contents of the target buffer.
046: * @param source the original text, unnormalized
047: * @param target the resulting normalized text
048: */
049: public StringBuffer normalize(CharSequence source,
050: StringBuffer target) {
051:
052: // First decompose the source into target,
053: // then compose if the form requires.
054:
055: if (source.length() != 0) {
056: internalDecompose(source, target);
057: if ((form & COMPOSITION_MASK) != 0) {
058: internalCompose(target);
059: }
060: }
061: return target;
062: }
063:
064: /**
065: * Normalizes text according to the chosen form
066: * @param source the original text, unnormalized
067: * @return target the resulting normalized text
068: */
069: public CharSequence normalize(CharSequence source) {
070: return normalize(source, new StringBuffer(source.length() + 8));
071: }
072:
073: // ======================================
074: // PRIVATES
075: // ======================================
076:
077: /**
078: * The current form.
079: */
080: private byte form;
081:
082: /**
083: * Decomposes text, either canonical or compatibility,
084: * replacing contents of the target buffer.
085: // * @param form the normalization form. If COMPATIBILITY_MASK
086: // * bit is on in this byte, then selects the recursive
087: // * compatibility decomposition, otherwise selects
088: // * the recursive canonical decomposition.
089: * @param source the original text, unnormalized
090: * @param target the resulting normalized text
091: */
092: private void internalDecompose(CharSequence source,
093: StringBuffer target) {
094: StringBuffer buffer = new StringBuffer(8);
095: boolean canonical = (form & COMPATIBILITY_MASK) == 0;
096: int ch32;
097: //for (int i = 0; i < source.length(); i += (ch32<65536 ? 1 : 2)) {
098: for (int i = 0; i < source.length();) {
099: buffer.setLength(0);
100: //ch32 = UTF16.charAt(source, i);
101: ch32 = source.charAt(i++);
102: if (XMLChar.isHighSurrogate(ch32)) {
103: char low = source.charAt(i++);
104: ch32 = XMLChar.supplemental((char) ch32, low);
105: }
106: data.getRecursiveDecomposition(canonical, ch32, buffer);
107:
108: // add all of the characters in the decomposition.
109: // (may be just the original character, if there was
110: // no decomposition mapping)
111:
112: int ch;
113: //for (int j = 0; j < buffer.length(); j += (ch<65536 ? 1 : 2)) {
114: for (int j = 0; j < buffer.length();) {
115: //ch = UTF16.charAt(buffer, j);
116: ch = buffer.charAt(j++);
117: if (XMLChar.isHighSurrogate(ch32)) {
118: char low = buffer.charAt(j++);
119: ch = XMLChar.supplemental((char) ch, low);
120: }
121: int chClass = data.getCanonicalClass(ch);
122: int k = target.length(); // insertion point
123: if (chClass != 0) {
124:
125: // bubble-sort combining marks as necessary
126:
127: int ch2;
128: while (k > 0) {
129: ch2 = target.charAt(k - 1);
130: if (XMLChar.isSurrogate(ch2)) {
131: k--;
132: char high = buffer.charAt(k - 1);
133: ch2 = XMLChar
134: .supplemental(high, (char) ch2);
135: }
136: if (data.getCanonicalClass(ch2) <= chClass)
137: break;
138: k--;
139: }
140: // for (; k > 0; k -= (ch2<65536 ? 1 : 2)) {
141: // ch2 = UTF16.charAt(target, k-1);
142: // if (data.getCanonicalClass(ch2) <= chClass) break;
143: // }
144: }
145: if (ch < 65536) {
146: target.insert(k, (char) ch);
147: } else {
148: String s = "" + XMLChar.highSurrogate(ch)
149: + XMLChar.lowSurrogate(ch);
150: target.insert(k, s);
151: }
152: //target.insert(k, UTF16.valueOf(ch));
153: }
154: }
155: }
156:
157: /**
158: * Composes text in place. Target must already
159: * have been decomposed.
160: * @param target input: decomposed text.
161: * output: the resulting normalized text.
162: */
163: private void internalCompose(StringBuffer target) {
164:
165: int starterPos = 0;
166: //int starterCh = UTF16.charAt(target,0);
167: //int compPos = (starterCh<65536 ? 1 : 2); // length of last composition
168: int starterCh = target.charAt(0);
169: int compPos = 1;
170: if (XMLChar.isHighSurrogate(starterCh)) {
171: starterCh = XMLChar.supplemental((char) starterCh, target
172: .charAt(1));
173: compPos++;
174: }
175: int lastClass = data.getCanonicalClass(starterCh);
176: if (lastClass != 0)
177: lastClass = 256; // fix for strings staring with a combining mark
178: int oldLen = target.length();
179:
180: // Loop on the decomposed characters, combining where possible
181:
182: int ch;
183: //for (int decompPos = compPos; decompPos < target.length(); decompPos += (ch<65536 ? 1 : 2)) {
184: for (int decompPos = compPos; decompPos < target.length();) {
185: ch = target.charAt(decompPos++);
186: if (XMLChar.isHighSurrogate(ch)) {
187: ch = XMLChar.supplemental((char) ch, target
188: .charAt(decompPos++));
189: }
190: //ch = UTF16.charAt(target, decompPos);
191: int chClass = data.getCanonicalClass(ch);
192: int composite = data.getPairwiseComposition(starterCh, ch);
193: if (composite != NormalizerData.NOT_COMPOSITE
194: && (lastClass < chClass || lastClass == 0)) {
195: setCharAt(target, starterPos, composite);
196: // we know that we will only be replacing non-supplementaries by non-supplementaries
197: // so we don't have to adjust the decompPos
198: starterCh = composite;
199: } else {
200: if (chClass == 0) {
201: starterPos = compPos;
202: starterCh = ch;
203: }
204: lastClass = chClass;
205: setCharAt(target, compPos, ch);
206: if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
207: decompPos += target.length() - oldLen;
208: oldLen = target.length();
209: }
210: compPos += (ch < 65536 ? 1 : 2);
211: }
212: }
213: target.setLength(compPos);
214: }
215:
216: /**
217: * Set the 32-bit character at a particular 16-bit offset in a string buffer,
218: * replacing the previous character at that position, and taking account of the
219: * fact that either, both, or neither of the characters might be a surrogate pair.
220: */
221:
222: private static void setCharAt(StringBuffer target, int offset,
223: int ch32) {
224: if (ch32 < 65536) {
225: if (XMLChar.isHighSurrogate(target.charAt(offset))) {
226: target.setCharAt(offset, (char) ch32);
227: target.deleteCharAt(offset + 1);
228: } else {
229: target.setCharAt(offset, (char) ch32);
230: }
231: } else {
232: if (XMLChar.isHighSurrogate(target.charAt(offset))) {
233: target.setCharAt(offset, XMLChar.highSurrogate(ch32));
234: target
235: .setCharAt(offset + 1, XMLChar
236: .lowSurrogate(ch32));
237: } else {
238: target.setCharAt(offset, XMLChar.highSurrogate(ch32));
239: target.insert(offset + 1, XMLChar.lowSurrogate(ch32));
240: }
241: }
242: }
243:
244: /**
245: * Contains normalization data from the Unicode Character Database.
246: * use false for the minimal set, true for the real set.
247: */
248: private static NormalizerData data = null;
249:
250: /**
251: * Just accessible for testing.
252: */
253: boolean getExcluded(char ch) {
254: return data.getExcluded(ch);
255: }
256:
257: /**
258: * Just accessible for testing.
259: */
260: String getRawDecompositionMapping(char ch) {
261: return data.getRawDecompositionMapping(ch);
262: }
263: }
|