001: /*
002: *
003: *
004: * Copyright 1990-2007 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: */
026:
027: package com.sun.j2me.global;
028:
029: /**
030: * A string normalizer is responsible for decomposing strings into their
031: * canonically decomposed equivalents (Normalization Form D).
032: */
033: public final class StringNormalizer implements StringDecomposer {
034: /** The capacity increment value of the internal buffers. */
035: private static final int CAPACITY_INCREMENT = 64;
036:
037: /** Internal decomposition buffer. */
038: private int[] decomposition = new int[CAPACITY_INCREMENT];
039: /** Decomposition offset. */
040: private int decOffset;
041: /** Decomposition length. */
042: private int decLength;
043: /** String offset. */
044: private int strOffset;
045: /** String length. */
046: private int strLength;
047: /** String initial offset. */
048: private int strInitOffset;
049:
050: /** Max decomposition length. */
051: private int maxDecomposition;
052:
053: /**
054: * The string being decomposed.
055: */
056: private String source;
057: /**
058: * A lookup table which is used during the normalization.
059: */
060: private NormalizationTable table;
061:
062: /**
063: * Creates a new instance of <code>StringNormalizer</code>.
064: *
065: * @param table a lookup table for the normalization
066: */
067: public StringNormalizer(NormalizationTable table) {
068: this .table = table;
069: this .maxDecomposition = table.getMaxDecompositionLength();
070: }
071:
072: /**
073: * Creates a new instance of <code>StringNormalizer</code>.
074: *
075: * @param s a string for the normaliztion
076: * @param table a lookup table for the normalization
077: */
078: public StringNormalizer(String s, NormalizationTable table) {
079: this (table);
080: source = s;
081: strLength = s.length();
082: }
083:
084: /**
085: * Sets the string for the normalization.
086: *
087: * @param s the string
088: */
089: public final void setSource(String s) {
090: source = s;
091: strLength = s.length();
092: strInitOffset = 0;
093: reset();
094: }
095:
096: /**
097: * Sets the string for the normalization.
098: *
099: * @param s the string
100: * @param offset the offset to start the normalization from
101: */
102: public final void setSource(String s, int offset) {
103: source = s;
104: strLength = s.length();
105: strInitOffset = offset;
106: reset();
107: }
108:
109: /**
110: * Restarts the decomposition.
111: */
112: public final void reset() {
113: decOffset = 0;
114: decLength = 0;
115: strOffset = strInitOffset;
116: }
117:
118: /**
119: * Returns the next code point value from the source string. It expects
120: * the input string to be UTF-16 encoded.
121: *
122: * @return the next code point value
123: */
124: public final int nextUTF32() {
125: if (strOffset >= strLength) {
126: return EOF_ELEMENT;
127: }
128:
129: int cp = (int) source.charAt(strOffset++);
130: if (((cp & 0xfc00) == 0xd800) && (strOffset < strLength)) {
131: // is a high surrogate cp
132: int cp2 = (int) source.charAt(strOffset);
133: if ((cp2 & 0xfc00) == 0xdc00) {
134: // we have got suplementary low surrogate
135: // so construct the final code point
136: int wwww = (cp >> 6) & 0xf;
137: cp = ((wwww + 1) << 16) | ((cp & 0x3f) << 10)
138: | (cp2 & 0x3ff);
139:
140: ++strOffset;
141: }
142: }
143:
144: return cp;
145: }
146:
147: /**
148: * Returns the next encoded code point value from the normalized input
149: * string. The methods of the <code>NormalizationTable</code> class can be
150: * used to inspect the returned value. Returns <code>EOF_ELEMENT</code> if
151: * the end of string is reached.
152: *
153: * @return the next encoded code point value from the normalized input
154: * string or <code>EOF_ELEMENT</code> if the end of string is reached
155: * @see NormalizationTable
156: */
157: public int getNextElement() {
158: if (decOffset < decLength) {
159: return decomposition[decOffset++];
160: }
161:
162: int value = nextUTF32();
163: if (value == EOF_ELEMENT) {
164: return EOF_ELEMENT;
165: }
166:
167: value = table
168: .getCanonicalDecomposition(decomposition, 0, value);
169:
170: if (NormalizationTable.isSingleCodePoint(value)) {
171: if (NormalizationTable.isStable(value)) {
172: return value;
173: }
174: decomposition[0] = value;
175: decLength = 1;
176: } else {
177: decLength = value;
178: }
179:
180: decOffset = 0;
181:
182: // decompose till we get a stable code point
183: value = nextUTF32();
184: while (value != -1) {
185: if ((decLength + maxDecomposition) > decomposition.length) {
186: int[] newDecomposition = new int[decomposition.length
187: + CAPACITY_INCREMENT];
188: System.arraycopy(decomposition, 0, newDecomposition, 0,
189: decLength);
190: decomposition = newDecomposition;
191: }
192:
193: value = table.getCanonicalDecomposition(decomposition,
194: decLength, value);
195:
196: if (NormalizationTable.isSingleCodePoint(value)) {
197: decomposition[decLength++] = value;
198: if (NormalizationTable.isStable(value)) {
199: break;
200: }
201: } else {
202: decLength += value;
203: }
204:
205: value = nextUTF32();
206: }
207:
208: // order the code points according to their combining classes
209: boolean checkOrder;
210: do {
211: checkOrder = false;
212:
213: for (int i = 1; i < decLength; ++i) {
214: int cp1 = decomposition[i - 1];
215: int cp2 = decomposition[i];
216:
217: int cc1 = NormalizationTable.getCombiningClass(cp1);
218: int cc2 = NormalizationTable.getCombiningClass(cp2);
219:
220: if ((cc1 > cc2) && (cc2 != 0)) {
221: decomposition[i - 1] = cp2;
222: decomposition[i] = cp1;
223: checkOrder = true;
224: }
225: }
226: } while (checkOrder);
227:
228: if (decLength > 0) {
229: return decomposition[decOffset++];
230: }
231:
232: return EOF_ELEMENT;
233: }
234: }
|