001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import com.ibm.icu.impl.NormalizerImpl;
010:
011: /**
012: * <tt>ComposedCharIter</tt> is an iterator class that returns all
013: * of the precomposed characters defined in the Unicode standard, along
014: * with their decomposed forms. This is often useful when building
015: * data tables (<i>e.g.</i> collation tables) which need to treat composed
016: * and decomposed characters equivalently.
017: * <p>
018: * For example, imagine that you have built a collation table with ordering
019: * rules for the {@link Normalizer#DECOMP canonically decomposed} forms of all
020: * characters used in a particular language. When you process input text using
021: * this table, the text must first be decomposed so that it matches the form
022: * used in the table. This can impose a performance penalty that may be
023: * unacceptable in some situations.
024: * <p>
025: * You can avoid this problem by ensuring that the collation table contains
026: * rules for both the decomposed <i>and</i> composed versions of each character.
027: * To do so, use a <tt>ComposedCharIter</tt> to iterate through all of the
028: * composed characters in Unicode. If the decomposition for that character
029: * consists solely of characters that are listed in your ruleset, you can
030: * add a new rule for the composed character that makes it equivalent to
031: * its decomposition sequence.
032: * <p>
033: * Note that <tt>ComposedCharIter</tt> iterates over a <em>static</em> table
034: * of the composed characters in Unicode. If you want to iterate over the
035: * composed characters in a particular string, use {@link Normalizer} instead.
036: * <p>
037: * When constructing a <tt>ComposedCharIter</tt> there is one
038: * optional feature that you can enable or disable:
039: * <ul>
040: * <li>{@link Normalizer#IGNORE_HANGUL} - Do not iterate over the Hangul
041: * characters and their corresponding Jamo decompositions.
042: * This option is off by default (<i>i.e.</i> Hangul processing is enabled)
043: * since the Unicode standard specifies that Hangul to Jamo
044: * is a canonical decomposition.
045: * </ul>
046: * <p>
047: * <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
048: * <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
049: * It will be updated as later versions of Unicode are released.
050: * @deprecated ICU 2.2
051: */
052: ///CLOVER:OFF
053: public final class ComposedCharIter {
054:
055: /**
056: * Constant that indicates the iteration has completed.
057: * {@link #next} returns this value when there are no more composed characters
058: * over which to iterate.
059: * @deprecated ICU 2.2
060: */
061: public static final char DONE = (char) Normalizer.DONE;
062:
063: /**
064: * Construct a new <tt>ComposedCharIter</tt>. The iterator will return
065: * all Unicode characters with canonical decompositions, including Korean
066: * Hangul characters.
067: * @deprecated ICU 2.2
068: */
069: public ComposedCharIter() {
070: compat = false;
071: options = 0;
072: }
073:
074: /**
075: * Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
076: * <p>
077: * @param compat <tt>false</tt> for canonical decompositions only;
078: * <tt>true</tt> for both canonical and compatibility
079: * decompositions.
080: *
081: * @param options Optional decomposition features. Currently, the only
082: * supported option is {@link Normalizer#IGNORE_HANGUL}, which
083: * causes this <tt>ComposedCharIter</tt> not to iterate
084: * over the Hangul characters and their corresponding
085: * Jamo decompositions.
086: * @deprecated ICU 2.2
087: */
088: public ComposedCharIter(boolean compat, int options) {
089: this .compat = compat;
090: this .options = options;
091: }
092:
093: /**
094: * Determines whether there any precomposed Unicode characters not yet returned
095: * by {@link #next}.
096: * @deprecated ICU 2.2
097: */
098: public boolean hasNext() {
099: if (nextChar == Normalizer.DONE) {
100: findNextChar();
101: }
102: return nextChar != Normalizer.DONE;
103: }
104:
105: /**
106: * Returns the next precomposed Unicode character.
107: * Repeated calls to <tt>next</tt> return all of the precomposed characters defined
108: * by Unicode, in ascending order. After all precomposed characters have
109: * been returned, {@link #hasNext} will return <tt>false</tt> and further calls
110: * to <tt>next</tt> will return {@link #DONE}.
111: * @deprecated ICU 2.2
112: */
113: public char next() {
114: if (nextChar == Normalizer.DONE) {
115: findNextChar();
116: }
117: curChar = nextChar;
118: nextChar = Normalizer.DONE;
119: return (char) curChar;
120: }
121:
122: /**
123: * Returns the Unicode decomposition of the current character.
124: * This method returns the decomposition of the precomposed character most
125: * recently returned by {@link #next}. The resulting decomposition is
126: * affected by the settings of the options passed to the constructor.
127: * @deprecated ICU 2.2
128: */
129: public String decomposition() {
130: // the decomposition buffer contains the decomposition of
131: // current char so just return it
132: return new String(decompBuf, 0, bufLen);
133: }
134:
135: private void findNextChar() {
136: int c = curChar + 1;
137: for (;;) {
138: if (c < 0xFFFF) {
139: bufLen = NormalizerImpl.getDecomposition(c, compat,
140: decompBuf, 0, decompBuf.length);
141: if (bufLen > 0) {
142: // the curChar can be decomposed... so it is a composed char
143: // cache the result
144: break;
145: }
146: c++;
147: } else {
148: c = Normalizer.DONE;
149: break;
150: }
151: }
152: nextChar = c;
153: }
154:
155: private int options;
156: private boolean compat;
157: private char[] decompBuf = new char[100];
158: private int bufLen = 0;
159: private int curChar = 0;
160: private int nextChar = Normalizer.DONE;
161:
162: };
|