001: /*
002: *
003: * @(#)ComposedCharIter.java 1.7 06/10/10
004: *
005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
006: * Reserved. Use is subject to license terms.
007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
008: *
009: * This program is free software; you can redistribute it and/or
010: * modify it under the terms of the GNU General Public License version
011: * 2 only, as published by the Free Software Foundation.
012: *
013: * This program is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * General Public License version 2 for more details (a copy is
017: * included at /legal/license.txt).
018: *
019: * You should have received a copy of the GNU General Public License
020: * version 2 along with this work; if not, write to the Free Software
021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022: * 02110-1301 USA
023: *
024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
025: * Clara, CA 95054 or visit www.sun.com if you need additional
026: * information or have any questions.
027: */
028:
029: /*
030: * (C) Copyright IBM Corp. 1996-2001 - All Rights Reserved
031: *
032: * The original version of this source code and documentation is
033: * copyrighted and owned by IBM. These materials are provided
034: * under terms of a License Agreement between IBM and Sun.
035: * This technology is protected by multiple US and International
036: * patents. This notice and attribution to IBM may not be removed.
037: */
038:
039: package sun.text;
040:
041: /**
042: * <tt>ComposedCharIter</tt> is an iterator class that returns all
043: * of the precomposed characters defined in the Unicode standard, along
044: * with their decomposed forms. This is often useful when building
045: * data tables (<i>e.g.</i> collation tables) which need to treat composed
046: * and decomposed characters equivalently.
047: * <p>
048: * For example, imagine that you have built a collation table with ordering
049: * rules for the {@link Normalizer#DECOMP canonically decomposed} forms of all
050: * characters used in a particular language. When you process input text using
051: * this table, the text must first be decomposed so that it matches the form
052: * used in the table. This can impose a performance penalty that may be
053: * unacceptable in some situations.
054: * <p>
055: * You can avoid this problem by ensuring that the collation table contains
056: * rules for both the decomposed <i>and</i> composed versions of each character.
057: * To do so, use a <tt>ComposedCharIter</tt> to iterate through all of the
058: * composed characters in Unicode. If the decomposition for that character
059: * consists solely of characters that are listed in your ruleset, you can
060: * add a new rule for the composed character that makes it equivalent to
061: * its decomposition sequence.
062: * <p>
063: * Note that <tt>ComposedCharIter</tt> iterates over a <em>static</em> table
064: * of the composed characters in Unicode. If you want to iterate over the
065: * composed characters in a particular string, use {@link Normalizer} instead.
066: * <p>
067: * When constructing a <tt>ComposedCharIter</tt> there is one
068: * optional feature that you can enable or disable:
069: * <ul>
070: * <li>{@link Normalizer#IGNORE_HANGUL} - Do not iterate over the Hangul
071: * characters and their corresponding Jamo decompositions.
072: * This option is off by default (<i>i.e.</i> Hangul processing is enabled)
073: * since the Unicode standard specifies that Hangul to Jamo
074: * is a canonical decomposition.
075: * </ul>
076: * <p>
077: * <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
078: * <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
079: * It will be updated as later versions of Unicode are released.
080: */
081: public final class ComposedCharIter {
082:
083: /**
084: * Constant that indicates the iteration has completed.
085: * {@link #next} returns this value when there are no more composed characters
086: * over which to iterate.
087: */
088: public static final char DONE = Normalizer.DONE;
089:
090: /**
091: * Construct a new <tt>ComposedCharIter</tt>. The iterator will return
092: * all Unicode characters with canonical decompositions, including Korean
093: * Hangul characters.
094: */
095: public ComposedCharIter() {
096: minDecomp = DecompData.MAX_COMPAT;
097: hangul = false;
098: }
099:
100: /**
101: * Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
102: * <p>
103: * @param compat <tt>false</tt> for canonical decompositions only;
104: * <tt>true</tt> for both canonical and compatibility
105: * decompositions.
106: *
107: * @param options Optional decomposition features. Currently, the only
108: * supported option is {@link Normalizer#IGNORE_HANGUL}, which
109: * causes this <tt>ComposedCharIter</tt> not to iterate
110: * over the Hangul characters and their corresponding
111: * Jamo decompositions.
112: */
113: public ComposedCharIter(boolean compat, int options) {
114: // Compatibility explosions have lower indices; skip them if necessary
115: minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
116:
117: hangul = (options & Normalizer.IGNORE_HANGUL) == 0;
118: }
119:
120: /**
121: * Determines whether there any precomposed Unicode characters not yet returned
122: * by {@link #next}.
123: */
124: public boolean hasNext() {
125: if (nextChar == DONE) {
126: findNextChar();
127: }
128: return nextChar != DONE;
129: }
130:
131: /**
132: * Returns the next precomposed Unicode character.
133: * Repeated calls to <tt>next</tt> return all of the precomposed characters defined
134: * by Unicode, in ascending order. After all precomposed characters have
135: * been returned, {@link #hasNext} will return <tt>false</tt> and further calls
136: * to <tt>next</tt> will return {@link #DONE}.
137: */
138: public char next() {
139: if (nextChar == DONE) {
140: findNextChar();
141: }
142: curChar = nextChar;
143: nextChar = DONE;
144: return curChar;
145: }
146:
147: /**
148: * Returns the Unicode decomposition of the current character.
149: * This method returns the decomposition of the precomposed character most
150: * recently returned by {@link #next}. The resulting decomposition is
151: * affected by the settings of the options passed to the constructor.
152: */
153: public String decomposition() {
154: StringBuffer result = new StringBuffer();
155:
156: int pos = (char) (DecompData.offsets.elementAt(curChar) & DecompData.DECOMP_MASK);
157:
158: if (pos > minDecomp) {
159: DecompData.doAppend(pos, result);
160: } else if (hangul && curChar >= HANGUL_BASE
161: && curChar < HANGUL_LIMIT) {
162: Normalizer.hangulToJamo(curChar, result, minDecomp);
163: } else {
164: result.append(curChar);
165: }
166: return result.toString();
167: }
168:
169: private void findNextChar() {
170: if (curChar != DONE) {
171: char ch = curChar;
172: while (++ch < 0xFFFF) {
173: int offset = DecompData.offsets.elementAt(ch)
174: & DecompData.DECOMP_MASK;
175: if (offset > minDecomp
176: || (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT)) {
177: nextChar = ch;
178: break;
179: }
180: }
181: }
182: }
183:
184: private final int minDecomp;
185: private final boolean hangul;
186:
187: private char curChar = 0;
188: private char nextChar = Normalizer.DONE;
189:
190: private static final char HANGUL_BASE = Normalizer.HANGUL_BASE;
191: private static final char HANGUL_LIMIT = Normalizer.HANGUL_LIMIT;
192: };
|