0001: /*
0002: *
0003: * @(#)Normalizer.java 1.12 06/10/10
0004: *
0005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
0006: * Reserved. Use is subject to license terms.
0007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
0008: *
0009: * This program is free software; you can redistribute it and/or
0010: * modify it under the terms of the GNU General Public License version
0011: * 2 only, as published by the Free Software Foundation.
0012: *
0013: * This program is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of
0015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
0016: * General Public License version 2 for more details (a copy is
0017: * included at /legal/license.txt).
0018: *
0019: * You should have received a copy of the GNU General Public License
0020: * version 2 along with this work; if not, write to the Free Software
0021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
0022: * 02110-1301 USA
0023: *
0024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
0025: * Clara, CA 95054 or visit www.sun.com if you need additional
0026: * information or have any questions.
0027: */
0028:
0029: /*
0030: * (C) Copyright IBM Corp. 1996-2002 - All Rights Reserved
0031: *
0032: * The original version of this source code and documentation is
0033: * copyrighted and owned by IBM. These materials are provided
0034: * under terms of a License Agreement between IBM and Sun.
0035: * This technology is protected by multiple US and International
0036: * patents. This notice and attribution to IBM may not be removed.
0037: */
0038:
0039: package sun.text;
0040:
0041: import java.lang.Character;
0042: import java.text.CharacterIterator;
0043: import java.text.StringCharacterIterator;
0044:
0045: /**
0046: * <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or
0047: * decomposed form, allowing for easier sorting and searching of text.
0048: * <tt>Normalizer</tt> supports the standard normalization forms described in
0049: * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
0050: * Unicode Technical Report #15</a>.
0051: * <p>
0052: * Characters with accents or other adornments can be encoded in
0053: * several different ways in Unicode. For example, take the character "Á"
0054: * (A-acute). In Unicode, this can be encoded as a single character (the
0055: * "composed" form):
0056: * <pre>
0057: * 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
0058: * or as two separate characters (the "decomposed" form):
0059: * <pre>
0060: * 0041 LATIN CAPITAL LETTER A
0061: * 0301 COMBINING ACUTE ACCENT</pre>
0062: * <p>
0063: * To a user of your program, however, both of these sequences should be
0064: * treated as the same "user-level" character "Á". When you are searching or
0065: * comparing text, you must ensure that these two sequences are treated
0066: * equivalently. In addition, you must handle characters with more than one
0067: * accent. Sometimes the order of a character's combining accents is
0068: * significant, while in other cases accent sequences in different orders are
0069: * really equivalent.
0070: * <p>
0071: * Similarly, the string "ffi" can be encoded as three separate letters:
0072: * <pre>
0073: * 0066 LATIN SMALL LETTER F
0074: * 0066 LATIN SMALL LETTER F
0075: * 0069 LATIN SMALL LETTER I</pre>
0076: * or as the single character
0077: * <pre>
0078: * FB03 LATIN SMALL LIGATURE FFI</pre>
0079: * <p>
0080: * The ffi ligature is not a distinct semantic character, and strictly speaking
0081: * it shouldn't be in Unicode at all, but it was included for compatibility
0082: * with existing character sets that already provided it. The Unicode standard
0083: * identifies such characters by giving them "compatibility" decompositions
0084: * into the corresponding semantic characters. When sorting and searching, you
0085: * will often want to use these mappings.
0086: * <p>
0087: * <tt>Normalizer</tt> helps solve these problems by transforming text into the
0088: * canonical composed and decomposed forms as shown in the first example above.
0089: * In addition, you can have it perform compatibility decompositions so that
0090: * you can treat compatibility characters the same as their equivalents.
0091: * Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical
0092: * order, so that you do not have to worry about accent rearrangement on your
0093: * own.
0094: * <p>
0095: * <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL},
0096: * that differs from
0097: * the standard Unicode Normalization Forms. This option can be passed
0098: * to the {@link #Normalizer constructors} and to the static
0099: * {@link #compose compose} and {@link #decompose decompose} methods. This
0100: * option, and any that are added in the future, will be turned off by default.
0101: * <p>
0102: * There are three common usage models for <tt>Normalizer</tt>. In the first,
0103: * the static {@link #normalize normalize()} method is used to process an
0104: * entire input string at once. Second, you can create a <tt>Normalizer</tt>
0105: * object and use it to iterate through the normalized form of a string by
0106: * calling {@link #first} and {@link #next}. Finally, you can use the
0107: * {@link #setIndex setIndex()} and {@link #getIndex} methods to perform
0108: * random-access iteration, which is very useful for searching.
0109: * <p>
0110: * <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have
0111: * methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc.
0112: * You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer
0113: * to indices in the underlying <em>input</em> text being processed, the
0114: * <tt>next</tt> and <tt>previous</tt> methods iterate through characters
0115: * in the normalized <em>output</em>. This means that there is not
0116: * necessarily a one-to-one correspondence between characters returned
0117: * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
0118: * returned from <tt>setIndex</tt> and <tt>getIndex</tt>. It is for this
0119: * reason that <tt>Normalizer</tt> does not implement the
0120: * {@link CharacterIterator} interface.
0121: * <p>
0122: * <b>Note:</b> <tt>Normalizer</tt> is currently based on version 3.0
0123: * of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
0124: * It will be updated as later versions of Unicode are released. If you are
0125: * using this class on a JDK that supports an earlier version of Unicode, it
0126: * is possible that <tt>Normalizer</tt> may generate composed or decomposed
0127: * characters for which your JDK's {@link java.lang.Character} class does not
0128: * have any data.
0129: * <p>
0130: * @author Laura Werner, Mark Davis
0131: */
0132: public final class Normalizer implements Cloneable {
0133:
0134: /**
0135: * Constant indicating that the end of the iteration has been reached.
0136: * This is guaranteed to have the same value as {@link CharacterIterator#DONE}.
0137: */
0138: public static final char DONE = CharacterIterator.DONE;
0139:
0140: // This tells us what the bits in the "mode" object mean.
0141: private static final int COMPAT_BIT = 1;
0142: private static final int DECOMP_BIT = 2;
0143: private static final int COMPOSE_BIT = 4;
0144:
0145: /**
0146: * This class represents the mode of a {@link Normalizer}
0147: * object, <i>i.e.</i> the Unicode Normalization Form of the
0148: * text that the <tt>Normalizer</tt> produces. <tt>Mode</tt> objects
0149: * are used as arguments to the {@link Normalizer#Normalizer constructors}
0150: * and {@link Normalizer#setMode setMode} method of <tt>Normalizer</tt>.
0151: * <p>
0152: * Clients cannot create <tt>Mode</tt> objects directly.
0153: * Instead, use the predefined constants {@link Normalizer#NO_OP},
0154: * {@link Normalizer#COMPOSE}, {@link Normalizer#COMPOSE_COMPAT},
0155: * {@link Normalizer#DECOMP}, and {@link Normalizer#DECOMP_COMPAT}.
0156: * <p>
0157: * @see Normalizer
0158: */
0159: public static final class Mode {
0160: Mode(int m) {
0161: mode = m;
0162: }
0163:
0164: final boolean compat() {
0165: return (mode & COMPAT_BIT) != 0;
0166: }
0167:
0168: final boolean compose() {
0169: return (mode & COMPOSE_BIT) != 0;
0170: }
0171:
0172: final boolean decomp() {
0173: return (mode & DECOMP_BIT) != 0;
0174: }
0175:
0176: final int mode;
0177: };
0178:
0179: /**
0180: * Null operation for use with the {@link #Normalizer constructors}
0181: * and the static {@link #normalize normalize} method. This value tells
0182: * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
0183: * from the underlying String or CharacterIterator. If you have code which
0184: * requires raw text at some times and normalized text at others, you can
0185: * use <tt>NO_OP</tt> for the cases where you want raw text, rather
0186: * than having a separate code path that bypasses <tt>Normalizer</tt>
0187: * altogether.
0188: * <p>
0189: * @see #setMode
0190: */
0191: public static final Mode NO_OP = new Mode(0);
0192:
0193: /**
0194: * Canonical decomposition followed by canonical composition. Used with the
0195: * {@link #Normalizer constructors} and the static {@link #normalize normalize}
0196: * method to determine the operation to be performed.
0197: * <p>
0198: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0199: * off, this operation produces output that is in
0200: * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical Form</a>
0201: * <b>C</b>.
0202: * <p>
0203: * @see #setMode
0204: */
0205: public static final Mode COMPOSE = new Mode(COMPOSE_BIT);
0206:
0207: /**
0208: * Compatibility decomposition followed by canonical composition.
0209: * Used with the {@link #Normalizer constructors} and the static
0210: * {@link #normalize normalize} method to determine the operation to be performed.
0211: * <p>
0212: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0213: * off, this operation produces output that is in
0214: * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical Form</a>
0215: * <b>KC</b>.
0216: * <p>
0217: * @see #setMode
0218: */
0219: public static final Mode COMPOSE_COMPAT = new Mode(COMPOSE_BIT
0220: | COMPAT_BIT);
0221:
0222: /**
0223: * Canonical decomposition. This value is passed to the
0224: * {@link #Normalizer constructors} and the static {@link #normalize normalize}
0225: * method to determine the operation to be performed.
0226: * <p>
0227: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0228: * off, this operation produces output that is in
0229: * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical Form</a>
0230: * <b>D</b>.
0231: * <p>
0232: * @see #setMode
0233: */
0234: public static final Mode DECOMP = new Mode(DECOMP_BIT);
0235:
0236: /**
0237: * Compatibility decomposition. This value is passed to the
0238: * {@link #Normalizer constructors} and the static {@link #normalize normalize}
0239: * method to determine the operation to be performed.
0240: * <p>
0241: * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
0242: * off, this operation produces output that is in
0243: * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical Form</a>
0244: * <b>KD</b>.
0245: * <p>
0246: * @see #setMode
0247: */
0248: public static final Mode DECOMP_COMPAT = new Mode(DECOMP_BIT
0249: | COMPAT_BIT);
0250:
0251: /**
0252: * Option to disable Hangul/Jamo composition and decomposition.
0253: * This option applies to Korean text,
0254: * which can be represented either in the Jamo alphabet or in Hangul
0255: * characters, which are really just two or three Jamo combined
0256: * into one visual glyph. Since Jamo takes up more storage space than
0257: * Hangul, applications that process only Hangul text may wish to turn
0258: * this option on when decomposing text.
0259: * <p>
0260: * The Unicode standard treates Hangul to Jamo conversion as a
0261: * canonical decomposition, so this option must be turned <b>off</b> if you
0262: * wish to transform strings into one of the standard
0263: * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
0264: * Unicode Normalization Forms</a>.
0265: * <p>
0266: * @see #setOption
0267: */
0268: public static final int IGNORE_HANGUL = 0x0001;
0269:
0270: //-------------------------------------------------------------------------
0271: // Constructors
0272: //-------------------------------------------------------------------------
0273:
/**
 * Creates a <tt>Normalizer</tt> that iterates over the normalized form of
 * the given string, with no optional features enabled (options value 0).
 * <p>
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 */
public Normalizer(String str, Mode mode) {
    // The three-argument overload wraps the string in a
    // StringCharacterIterator, exactly as this constructor used to do itself.
    this(str, mode, 0);
}
0286:
/**
 * Creates a <tt>Normalizer</tt> that iterates over the normalized form of
 * the given string. The string is wrapped in a
 * {@link StringCharacterIterator} and handed to the
 * <tt>CharacterIterator</tt>-based constructor.
 * <p>
 * The <tt>opt</tt> parameter specifies which optional
 * <tt>Normalizer</tt> features are to be enabled for this object.
 * <p>
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #IGNORE_HANGUL}.
 *             Use 0 for the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms.
 */
public Normalizer(String str, Mode mode, int opt) {
    this(new StringCharacterIterator(str), mode, opt);
}
0307:
/**
 * Creates a <tt>Normalizer</tt> that iterates over the normalized form of
 * the given text, with no optional features enabled (options value 0).
 * <p>
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the text.
 *
 * @param mode The normalization mode.
 */
public Normalizer(CharacterIterator iter, Mode mode) {
    this(iter, mode, 0);
}
0321:
/**
 * Creates a <tt>Normalizer</tt> that iterates over the normalized form of
 * the given text. The iterator, mode and options are stored as given; the
 * iterator is not copied.
 * <p>
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the text.
 *
 * @param mode The normalization mode. It is dereferenced here, so
 *             <tt>null</tt> results in a <tt>NullPointerException</tt>.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #IGNORE_HANGUL}.
 *             Use 0 for the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms.
 */
public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    this.text = iter;
    this.mode = mode;
    this.options = opt;

    // Compatibility explosions have lower indices, so a compatibility mode
    // starts at 0 while a canonical mode skips everything below MAX_COMPAT.
    if (mode.compat()) {
        this.minDecomp = 0;
    } else {
        this.minDecomp = DecompData.MAX_COMPAT;
    }
}
0344:
0345: /**
0346: * Clones this <tt>Normalizer</tt> object. All properties of this
0347: * object are duplicated in the new object, including the cloning of any
0348: * {@link CharacterIterator} that was passed in to the constructor
0349: * or to {@link #setText(CharacterIterator) setText}.
0350: * However, the text storage underlying
0351: * the <tt>CharacterIterator</tt> is not duplicated unless the
0352: * iterator's <tt>clone</tt> method does so.
0353: */
0354: public Object clone() {
0355: try {
0356: Normalizer copy = (Normalizer) super .clone();
0357: copy.text = (CharacterIterator) text.clone();
0358: // old version did not clone the buffer this causes serious
0359: // problems in the collation element iterator
0360: if (buffer != null) {
0361: copy.buffer = new StringBuffer();
0362: if (buffer.length() > 0) {
0363: copy.buffer.append(buffer);
0364: }
0365: }
0366: return copy;
0367: } catch (CloneNotSupportedException e) {
0368: throw new InternalError(e.toString());
0369: }
0370: }
0371:
0372: //-------------------------------------------------------------------------
0373: // Static utility methods
0374: //-------------------------------------------------------------------------
0375:
0376: /**
0377: * Normalizes a <tt>String</tt> using the given normalization operation.
0378: * <p>
0379: * The <tt>options</tt> parameter specifies which optional
0380: * <tt>Normalizer</tt> features are to be enabled for this operation.
0381: * Currently the only available option is {@link #IGNORE_HANGUL}.
0382: * If you want the default behavior corresponding to one of the standard
0383: * Unicode Normalization Forms, use 0 for this argument.
0384: * <p>
0385: * @param str the input string to be normalized.
0386: *
0387: * @param aMode the normalization mode
0388: *
0389: * @param options the optional features to be enabled.
0390: */
0391: public static String normalize(String str, Mode mode, int options) {
0392: return normalize(str, mode, options, false);
0393: }
0394:
0395: public static String normalize(String str, Mode mode, int options,
0396: boolean addSingleQuotation) {
0397: if (mode.compose()) {
0398: // compose() handles decomposition and reordering;
0399: // don't call decompose() first.
0400: return compose(str, mode.compat(), options);
0401: }
0402: if (mode.decomp()) {
0403: return decompose(str, mode.compat(), options,
0404: addSingleQuotation);
0405: }
0406: return str;
0407: }
0408:
0409: //-------------------------------------------------------------------------
0410: // Compose methods
0411: //-------------------------------------------------------------------------
0412:
0413: /**
0414: * Composes a <tt>String</tt> by merging each base character with the
0415: * combining marks that follow it, expanding exploding characters first.
0416: * <p>
0417: * The <tt>options</tt> parameter specifies which optional
0418: * <tt>Normalizer</tt> features are to be enabled for this operation.
0419: * Currently the only available option is {@link #IGNORE_HANGUL}; use 0 for
0420: * the default behavior of Unicode Normalization Form <b>C</b> or <b>KC</b>.
0421: * NOTE(review): no statement in this method reads <tt>options</tt>, so the
0422: * option appears to have no effect here -- confirm before relying on it.
0423: * <p>
0424: * @param source the string to be composed; its length is read immediately,
0425: * so <tt>null</tt> results in a <tt>NullPointerException</tt>.
0426: *
0427: * @param compat Perform compatibility decomposition before composition.
0428: * If this argument is <tt>false</tt>, only canonical
0429: * decomposition will be performed.
0430: * @param options the optional features to be enabled.
0431: * @return the composed string.
0432: */
0433: public static String compose(String source, boolean compat,
0434: int options) {
0435: StringBuffer result = new StringBuffer(); // composed output built so far
0436: StringBuffer explodeBuf = new StringBuffer(); // chars queued for (re)processing ahead of source
0437:
0438: int explodePos = EMPTY; // Next read position in explodeBuf, or EMPTY when nothing is queued
0439: int basePos = 0; // Position of last base in output string (-1 when there is none)
0440: int baseIndex = 0; // Index of last base in "actions" array; first argument to composeAction()
0441: int classesSeenL = 0; // Combining classes seen since last base: bits for classes 0..31
0442: int classesSeenH = 0; // ... and bits for classes 32..63; together a 64-bit mask
0443: int action; // result of composeAction() for the current base/mark pair
0444:
0445: // Compatibility explosions have lower indices; skip them if necessary
0446: int minExplode = compat ? 0 : ComposeData.MAX_COMPAT;
0447: int minDecomp = compat ? 0 : DecompData.MAX_COMPAT; // only passed on to hangulToJamo() below
0448:
0449: if (DEBUG)
0450: System.out.println("minExplode = " + minExplode);
0451:
0452: int i = 0; // index of the next unread char in source
0453: while (i < source.length() || explodePos != EMPTY) { // until source and explodeBuf are both drained
0454: // Get the next char; anything queued in explodeBuf is read before the source
0455: char ch;
0456: if (explodePos == EMPTY) {
0457: ch = source.charAt(i++);
0458: } else {
0459: ch = explodeBuf.charAt(explodePos++);
0460: if (explodePos >= explodeBuf.length()) { // queue exhausted: mark it empty and clear it
0461: explodePos = EMPTY;
0462: explodeBuf.setLength(0);
0463: }
0464: }
0465:
0466: // Get the basic info for the character: composeLookup() packs a type and an index into one int
0467: int charInfo = composeLookup(ch);
0468: int type = charInfo & ComposeData.TYPE_MASK;
0469: int index = charInfo >>> ComposeData.INDEX_SHIFT;
0470:
0471: if (DEBUG)
0472: System.out.println("Got char " + Utility.hex(ch)
0473: + ", type=" + type + ", index=" + index);
0474:
0475: // Examples of NON_COMPOSING_COMBINING with an index < minExplode:
0476: // 00A8 017F 03D2 1FBF 1FFE
0477: if (type == ComposeData.BASE
0478: || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) {
0479:
0480: if (DEBUG)
0481: System.out.println("New base " + Utility.hex(ch)
0482: + ", type=" + type + ", index=" + index);
0483: classesSeenL = classesSeenH = 0; // a new base starts with no combining classes seen
0484: baseIndex = index;
0485: basePos = result.length(); // the base is about to be appended at this position
0486: result.append(ch);
0487: } else if (type == ComposeData.COMBINING) {
0488: // assert(index > 0);
0489: int cclass = ComposeData.typeBit[index];
0490: // typeBit is a bit value from 0..63, indicating the class.
0491: // We use a bit mask of 2 32-bit ints: classesSeenL for 0..31, classesSeenH for 32..63.
0492: boolean seen = 0 != ((cclass < 32) ? (classesSeenL & (1 << cclass))
0493: : (classesSeenH & (1 << (cclass & 31))));
0494:
0495: if (DEBUG)
0496: System.out.println("Class of " + Utility.hex(ch)
0497: + " = " + cclass + " seen:" + seen
0498: + " baseIndex:" + baseIndex + " action:"
0499: + composeAction(baseIndex, index));
0500:
0501: // We can only combine a character with the base if we haven't
0502: // already seen a combining character with the same canonical class.
0503: // We only combine characters with an index from
0504: // 1..COMBINING_COUNT-1. Indices >= COMBINING_COUNT are
0505: // also combining characters, but we know that they don't
0506: // compose with anything.
0507: if (index < ComposeData.COMBINING_COUNT
0508: && !seen
0509: && (action = composeAction(baseIndex, index)) > 0) {
0510: if (action > ComposeData.MAX_COMPOSED) {
0511: // Pairwise explosion. Actions above this value are really
0512: // indices into an array that in turn contains indices
0513: // into the exploding string table
0514: // TODO: What if there are unprocessed chars in the explode buffer?
0515: if (DEBUG)
0516: System.out.println("Pairwise exploding");
0517: char newBase = pairExplode(explodeBuf, action); // NOTE(review): defined elsewhere; presumably fills explodeBuf and returns the new base
0518: explodePos = 0; // read explodeBuf from its start on the next iteration
0519: result.setCharAt(basePos, newBase);
0520:
0521: baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT; // later marks combine against the new base
0522: if (DEBUG)
0523: System.out.println("New base "
0524: + Utility.hex(newBase));
0525: } else {
0526: // Normal pairwise combination. The action value is itself the composed char; replace the base char
0527: if (DEBUG)
0528: System.out.println("Pairwise combining");
0529: char newBase = (char) action;
0530: result.setCharAt(basePos, newBase);
0531:
0532: baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT; // later marks combine against the new base
0533: if (DEBUG)
0534: System.out.println("New base "
0535: + Utility.hex(newBase));
0536: }
0537: //
0538: // Since there are Unicode characters that cannot be combined in arbitrary
0539: // order, we have to re-process any combining marks that go with this
0540: // base character. There are only four characters in Unicode that have
0541: // this problem. If they are fixed in Unicode 3.0, this code can go away.
0542: //
0543: int len = result.length();
0544: if (len - basePos > 1) { // marks were already emitted after the base
0545: for (int j = basePos + 1; j < len; j++) {
0546: explodeBuf.append(result.charAt(j)); // queue them behind whatever explodeBuf already holds
0547: }
0548: result.setLength(basePos + 1); // drop them from the output; they will be read again
0549: classesSeenL = classesSeenH = 0;
0550: if (explodePos == EMPTY)
0551: explodePos = 0;
0552: }
0553: } else {
0554: // No combination with this character: keep it in the output and remember its class
0555: if (DEBUG)
0556: System.out.println("No action");
0557: bubbleAppend(result, ch, cclass); // NOTE(review): defined elsewhere; presumably inserts ch in combining-class order -- confirm
0558: if (cclass < 32) {
0559: classesSeenL |= 1 << cclass;
0560: } else {
0561: classesSeenH |= 1 << (cclass & 31);
0562: }
0563: }
0564: } else if (index > minExplode) {
0565: // Single exploding character: explode() is given explodeBuf, and reading restarts at its beginning
0566: explode(explodeBuf, index);
0567: explodePos = 0;
0568: if (DEBUG)
0569: System.out.println("explosion: " + Utility.hex(ch)
0570: + " --> " + Utility.hex(explodeBuf));
0571: } else if (type == ComposeData.HANGUL && minExplode == 0) {
0572: // If we're in compatibility mode we need to decompose Hangul to Jamo,
0573: // because some of the Jamo might have compatibility decompositions.
0574: hangulToJamo(ch, explodeBuf, minDecomp);
0575: if (DEBUG)
0576: System.out.println("decomposed hangul "
0577: + Utility.hex(ch) + " to jamo "
0578: + Utility.hex(explodeBuf));
0579: explodePos = 0;
0580: } else if (type == ComposeData.INITIAL_JAMO) {
0581: classesSeenL = classesSeenH = 0; // an initial jamo is handled like a new base
0582: baseIndex = ComposeData.INITIAL_JAMO_INDEX;
0583: basePos = result.length();
0584: result.append(ch);
0585: if (DEBUG)
0586: System.out.println("got initial jamo "
0587: + Utility.hex(ch));
0588: } else if (type == ComposeData.MEDIAL_JAMO
0589: && classesSeenL == 0 && classesSeenH == 0
0590: && baseIndex == ComposeData.INITIAL_JAMO_INDEX) {
0591: // If the last character was an initial jamo, we can combine it with this
0592: // one to create a Hangul character. No combining mark may have been seen in between.
0593: int l = result.charAt(basePos) - JAMO_LBASE;
0594: int v = ch - JAMO_VBASE;
0595: char newCh = (char) (HANGUL_BASE + (l * JAMO_VCOUNT + v)
0596: * JAMO_TCOUNT);
0597: result.setCharAt(basePos, newCh);
0598:
0599: if (DEBUG)
0600: System.out.println("got medial jamo "
0601: + Utility.hex(ch)
0602: + ", replacing with Hangul "
0603: + Utility.hex(newCh));
0604:
0605: baseIndex = ComposeData.MEDIAL_JAMO_INDEX; // lets a following final jamo be added
0606: } else if (type == ComposeData.FINAL_JAMO
0607: && classesSeenL == 0 && classesSeenH == 0
0608: && baseIndex == ComposeData.MEDIAL_JAMO_INDEX) {
0609: // If the last character was a medial jamo that we turned into Hangul,
0610: // we can add this character too: its offset from JAMO_TBASE is added to the Hangul char.
0611: char newCh = (char) (result.charAt(basePos) + (ch - JAMO_TBASE));
0612: result.setCharAt(basePos, newCh);
0613:
0614: if (DEBUG)
0615: System.out.println("got final jamo "
0616: + Utility.hex(ch)
0617: + ", replacing with Hangul "
0618: + Utility.hex(newCh));
0619:
0620: baseIndex = 0; // same "no base" state as the fallback branch below
0621: basePos = -1;
0622: classesSeenL = classesSeenH = 0;
0623: } else {
0624: if (DEBUG)
0625: System.out.println("No base as of "
0626: + Utility.hex(ch));
0627: baseIndex = 0; // no branch above applied: copy ch through and forget the current base
0628: basePos = -1;
0629: classesSeenL = classesSeenH = 0;
0630: result.append(ch);
0631: }
0632: }
0633: return result.toString();
0634: }
0635:
0636: /**
0637: * Compose starting with current input character and continuing
0638: * until just before the next base char.
0639: * <p>
0640: * <b>Input</b>:
0641: * <ul>
0642: * <li>underlying char iter points to first character to compose
0643: * </ul>
0644: * <p>
0645: * <b>Output:</b>
0646: * <ul>
0647: * <li>returns first char of composition or DONE if at end
0648: * <li>Underlying char iter is pointing at next base char or past end
0649: * </ul>
0650: */
0651: private char nextCompose() {
0652: if (DEBUG)
0653: System.out
0654: .println("--------------- top of nextCompose() ---------------");
0655:
0656: int explodePos = EMPTY; // Position in input buffer
0657: int basePos = 0; // Position of last base in output string
0658: int baseIndex = 0; // Index of last base in "actions" array
0659: int classesSeenL = 0; // Combining classes seen since last base
0660: int classesSeenH = 0; // 64-bit mask
0661: int action;
0662: char lastBase = 0;
0663: boolean chFromText = true;
0664:
0665: currentIndex = nextIndex;
0666: text.setIndex(currentIndex);
0667: // Compatibility explosions have lower indices; skip them if necessary
0668: int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT;
0669: int minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT;
0670:
0671: initBuffer();
0672: if (explodeBuf == null) {
0673: explodeBuf = new StringBuffer();
0674: } else {
0675: explodeBuf.setLength(0);
0676: }
0677:
0678: char ch = curForward();
0679:
0680: while (ch != DONE) {
0681: // Get the basic info for the character
0682: int charInfo = composeLookup(ch);
0683: int type = charInfo & ComposeData.TYPE_MASK;
0684: int index = charInfo >>> ComposeData.INDEX_SHIFT;
0685:
0686: if (type == ComposeData.BASE
0687: || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) {
0688:
0689: if (getBufferLength() > 0 && chFromText
0690: && explodePos == EMPTY) {
0691: // When we hit a base char in the source text, we can return the text
0692: // that's been composed so far. We'll re-process this char next time through.
0693: if (DEBUG)
0694: System.out
0695: .println("returning early because we hit a new base");
0696: break;
0697: }
0698: classesSeenL = classesSeenH = 0;
0699: baseIndex = index;
0700: basePos = getBufferLength();
0701: buffer.append(ch);
0702: if (DEBUG)
0703: System.out.println("got BASE char "
0704: + Utility.hex(ch) + ", type=" + type
0705: + ", index=" + index);
0706: lastBase = ch;
0707: } else if (type == ComposeData.COMBINING) {
0708: // assert(index > 0);
0709: int cclass = ComposeData.typeBit[index];
0710: boolean seen = 0 != ((cclass < 32) ? (classesSeenL & (1 << cclass))
0711: : (classesSeenH & (1 << (cclass & 31))));
0712:
0713: if (DEBUG)
0714: System.out.println("got COMBINING char "
0715: + Utility.hex(ch) + ", type=" + type
0716: + ", index=" + index + ", class=" + cclass);
0717:
0718: // We can only combine a character with the base if we haven't
0719: // already seen a combining character with the same canonical class.
0720: if (index < ComposeData.COMBINING_COUNT
0721: && !seen
0722: && (action = composeAction(baseIndex, index)) > 0) {
0723: if (action > ComposeData.MAX_COMPOSED) {
0724: // Pairwise explosion. Actions above this value are really
0725: // indices into an array that in turn contains indices
0726: // into the exploding string table
0727: // TODO: What if there are unprocessed chars in the explode buffer?
0728: char newBase = pairExplode(explodeBuf, action);
0729: explodePos = 0;
0730: buffer.setCharAt(basePos, newBase);
0731:
0732: baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
0733:
0734: if (DEBUG)
0735: System.out.println("Pairwise explosion: "
0736: + Utility.hex(lastBase) + ","
0737: + Utility.hex(ch) + " --> "
0738: + Utility.hex(newBase) + ","
0739: + Utility.hex(explodeBuf));
0740: lastBase = newBase;
0741: } else {
0742: // Normal pairwise combination. Replace the base char
0743: char newBase = (char) action;
0744: buffer.setCharAt(basePos, newBase);
0745:
0746: baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
0747:
0748: if (DEBUG)
0749: System.out.println("Pairwise combination: "
0750: + Utility.hex(lastBase) + ","
0751: + Utility.hex(ch) + " --> "
0752: + Utility.hex(newBase));
0753: lastBase = newBase;
0754: }
0755: //
0756: // Since there are Unicode characters that cannot be combined in arbitrary
0757: // order, we have to re-process any combining marks that go with this
0758: // base character. There are only four characters in Unicode that have
0759: // this problem. If they are fixed in Unicode 3.0, this code can go away.
0760: //
0761: int len = getBufferLength();
0762: if (len - basePos > 1) {
0763: if (DEBUG)
0764: System.out
0765: .println("Reprocessing combining marks");
0766: for (int j = basePos + 1; j < len; j++) {
0767: explodeBuf.append(buffer.charAt(j));
0768: }
0769: buffer.setLength(basePos + 1);
0770: classesSeenL = classesSeenH = 0;
0771: if (explodePos == EMPTY)
0772: explodePos = 0;
0773: }
0774: } else {
0775: if (DEBUG)
0776: System.out.println("char doesn't combine");
0777: // No combination with this character
0778: bubbleAppend(buffer, ch, cclass);
0779: if (cclass < 32) {
0780: classesSeenL |= 1 << cclass;
0781: } else {
0782: classesSeenH |= 1 << (cclass & 31);
0783: }
0784: }
0785: } else if (index > minExplode) {
0786: // Single exploding character
0787: explode(explodeBuf, index);
0788: explodePos = 0;
0789: if (DEBUG)
0790: System.out.println("explosion: " + Utility.hex(ch)
0791: + " --> " + Utility.hex(explodeBuf));
0792: } else if (type == ComposeData.HANGUL && minExplode == 0) {
0793: // If we're in compatibility mode we need to decompose Hangul to Jamo,
0794: // because some of the Jamo might have compatibility decompositions.
0795: hangulToJamo(ch, explodeBuf, minDecomp);
0796: if (DEBUG)
0797: System.out.println("decomposed hangul "
0798: + Utility.hex(ch) + " to jamo "
0799: + Utility.hex(explodeBuf));
0800: explodePos = 0;
0801: } else if (type == ComposeData.INITIAL_JAMO) {
0802: if (getBufferLength() > 0 && chFromText
0803: && explodePos == EMPTY) {
0804: // When we hit a base char in the source text, we can return the text
0805: // that's been composed so far. We'll re-process this char next time through.
0806: if (DEBUG)
0807: System.out
0808: .println("returning early because we hit a new base");
0809: break;
0810: }
0811: classesSeenL = classesSeenH = 0;
0812: baseIndex = ComposeData.INITIAL_JAMO_INDEX;
0813: basePos = getBufferLength();
0814: buffer.append(ch);
0815: if (DEBUG)
0816: System.out.println("got initial jamo "
0817: + Utility.hex(ch));
0818: } else if (type == ComposeData.MEDIAL_JAMO
0819: && classesSeenL == 0 && classesSeenH == 0
0820: && baseIndex == ComposeData.INITIAL_JAMO_INDEX) {
0821: // If the last character was an initial jamo, we can combine it with this
0822: // one to create a Hangul character.
0823: int l = buffer.charAt(basePos) - JAMO_LBASE;
0824: int v = ch - JAMO_VBASE;
0825: char newCh = (char) (HANGUL_BASE + (l * JAMO_VCOUNT + v)
0826: * JAMO_TCOUNT);
0827: buffer.setCharAt(basePos, newCh);
0828:
0829: if (DEBUG)
0830: System.out.println("got medial jamo "
0831: + Utility.hex(ch)
0832: + ", replacing with Hangul "
0833: + Utility.hex(newCh));
0834:
0835: baseIndex = ComposeData.MEDIAL_JAMO_INDEX;
0836: } else if (type == ComposeData.FINAL_JAMO
0837: && classesSeenL == 0 && classesSeenH == 0
0838: && baseIndex == ComposeData.MEDIAL_JAMO_INDEX) {
0839: // If the last character was a medial jamo that we turned into Hangul,
0840: // we can add this character too.
0841: char newCh = (char) (buffer.charAt(basePos) + (ch - JAMO_TBASE));
0842: buffer.setCharAt(basePos, newCh);
0843:
0844: if (DEBUG)
0845: System.out.println("got final jamo "
0846: + Utility.hex(ch)
0847: + ", replacing with Hangul "
0848: + Utility.hex(newCh));
0849:
0850: baseIndex = 0;
0851: basePos = -1;
0852: classesSeenL = classesSeenH = 0;
0853: } else {
0854: // TODO: deal with JAMO character types
0855: baseIndex = 0;
0856: basePos = -1;
0857: classesSeenL = classesSeenH = 0;
0858: buffer.append(ch);
0859: if (DEBUG)
0860: System.out.println("UNKNOWN char "
0861: + Utility.hex(ch));
0862: }
0863:
0864: if (explodePos == EMPTY) {
0865: ch = text.next();
0866: chFromText = true;
0867: } else {
0868: ch = explodeBuf.charAt(explodePos++);
0869: if (explodePos >= explodeBuf.length()) {
0870: explodePos = EMPTY;
0871: explodeBuf.setLength(0);
0872: }
0873: chFromText = false;
0874: }
0875: }
0876: if (getBufferLength() > 0) {
0877: ch = buffer.charAt(0);
0878: } else {
0879: ch = DONE;
0880: }
0881: nextIndex = text.getIndex();
0882: return ch;
0883: }
0884:
0885: /**
0886: * Compose starting with the input char just before the current position
0887: * and continuing backward until (and including) the previous base char.
0888: * <p>
0889: * <b>Input</b>:
0890: * <ul>
0891: * <li>underlying char iter points just after last char to decompose
0892: * </ul>
0893: * <p>
0894: * <b>Output:</b>
0895: * <ul>
0896: * <li>returns last char of resulting decomposition sequence
0897: * <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
0898: * </ul>
0899: */
0900: private char prevCompose() {
0901: if (DEBUG)
0902: System.out
0903: .println("--------------- top of prevCompose() ---------------");
0904:
0905: // Compatibility explosions have lower indices; skip them if necessary
0906: int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT;
0907:
0908: nextIndex = currentIndex;
0909:
0910: initBuffer();
0911: // Slurp up characters until we hit a base char or an initial Jamo
0912: char ch;
0913: while ((ch = curBackward()) != DONE) {
0914: buffer.insert(0, ch);
0915:
0916: // Get the basic info for the character
0917: int charInfo = composeLookup(ch);
0918: int type = charInfo & ComposeData.TYPE_MASK;
0919: int index = charInfo >>> ComposeData.INDEX_SHIFT;
0920:
0921: if (DEBUG)
0922: System.out.println("prevCompose got char "
0923: + Utility.hex(ch) + ", type=" + type
0924: + ", index=" + index + ", minExplode="
0925: + minExplode);
0926:
0927: if (type == ComposeData.BASE
0928: || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)
0929: || type == ComposeData.HANGUL
0930: || type == ComposeData.INITIAL_JAMO) {
0931: break;
0932: }
0933: }
0934: // If there's more than one character in the buffer, compose it all at once....
0935: if (getBufferLength() > 0) {
0936: // TODO: The performance of this is awful; add a way to compose
0937: // a StringBuffer in place.
0938: String composed = compose(buffer.toString(), mode.compat(),
0939: options);
0940: if (DEBUG)
0941: System.out.println("prevCompose called compose("
0942: + Utility.hex(buffer) + ")->"
0943: + Utility.hex(composed));
0944:
0945: buffer.setLength(0);
0946: buffer.append(composed);
0947:
0948: if (getBufferLength() > 1) {
0949: bufferPos = getBufferLength() - 1;
0950: ch = buffer.charAt(bufferPos);
0951: } else {
0952: ch = buffer.charAt(0);
0953: }
0954: } else {
0955: ch = DONE;
0956: }
0957: currentIndex = text.getIndex();
0958: if (DEBUG)
0959: System.out.println("prevCompose returning "
0960: + Utility.hex(ch));
0961: return ch;
0962: }
0963:
0964: private static void bubbleAppend(StringBuffer target, char ch,
0965: int cclass) {
0966: if (DEBUG)
0967: System.out.println(" bubbleAppend(" + Utility.hex(target)
0968: + ", " + Utility.hex(ch) + ", " + cclass + ")");
0969: if (DEBUG)
0970: System.out.println(" getComposeClass(" + Utility.hex(ch)
0971: + ")=" + getComposeClass(ch));
0972: if (DEBUG)
0973: System.out.println(" target before bubbling is : "
0974: + Utility.hex(target));
0975:
0976: int i = target.length() - 1;
0977: if (cclass != 1) { // 1 means combining class 0!!!
0978: for (; i >= 0; --i) {
0979: int iClass = getComposeClass(target.charAt(i));
0980: if (DEBUG)
0981: System.out.println(" getComposeClass("
0982: + Utility.hex(target.charAt(i)) + ")="
0983: + getComposeClass(target.charAt(i)));
0984: if (DEBUG)
0985: System.out.println(" bubbleAppend: target[" + i
0986: + "]=" + Utility.hex(target.charAt(i))
0987: + " is iClass=" + iClass);
0988: if (DEBUG)
0989: System.out.println(" bubbleAppend: for ch="
0990: + Utility.hex(ch) + " class=" + cclass);
0991: if (iClass <= cclass) {
0992: // We've hit something we can't bubble this character past, so insert here
0993: break;
0994: }
0995: }
0996: }
0997: // We need to insert just after character "i"
0998: if (DEBUG)
0999: System.out.println(" bubbleAppend inserting "
1000: + Utility.hex(ch) + " at index " + (i + 1));
1001:
1002: target.insert(i + 1, ch);
1003:
1004: if (DEBUG)
1005: System.out.println(" target is : " + Utility.hex(target));
1006: }
1007:
1008: private static int getComposeClass(char ch) {
1009: int cclass = 0;
1010: int charInfo = composeLookup(ch);
1011: int type = charInfo & ComposeData.TYPE_MASK;
1012: if (type == ComposeData.COMBINING) {
1013: cclass = ComposeData.typeBit[charInfo >>> ComposeData.INDEX_SHIFT];
1014: }
1015: return cclass;
1016: }
1017:
1018: static final int composeLookup(char ch) {
1019: return ComposeData.lookup.elementAt(ch);
1020: }
1021:
1022: static final int composeAction(int baseIndex, int comIndex) {
1023: return ComposeData.actions
1024: .elementAt((char) (baseIndex + ComposeData.MAX_BASES
1025: * comIndex));
1026: }
1027:
1028: static final void explode(StringBuffer target, int index) {
1029: char ch;
1030: while ((ch = ComposeData.replaceCharAt(index++)) != 0)
1031: target.append(ch);
1032: }
1033:
1034: static final char pairExplode(StringBuffer target, int action) {
1035: int index = ComposeData.actionIndex[action
1036: - ComposeData.MAX_COMPOSED];
1037: explode(target, index + 1);
1038: return ComposeData.replaceCharAt(index); // New base char
1039: }
1040:
1041: //-------------------------------------------------------------------------
1042: // Decompose methods
1043: //-------------------------------------------------------------------------
1044:
1045: /**
1046: * Static method to decompose a <tt>String</tt>.
1047: * <p>
1048: * The <tt>options</tt> parameter specifies which optional
1049: * <tt>Normalizer</tt> features are to be enabled for this operation.
1050: * Currently the only available option is {@link #IGNORE_HANGUL}.
1051: * The desired options should be OR'ed together to determine the value
1052: * of this argument. If you want the default behavior corresponding
1053: * to Unicode Normalization Form <b>D</b> or <b>KD</b>,
1054: * use 0 for this argument.
1055: * <p>
1056: * @param str the string to be decomposed.
1057: *
1058: * @param compat Perform compatibility decomposition.
1059: * If this argument is <tt>false</tt>, only canonical
1060: * decomposition will be performed.
1061: *
1062: *
1063: * @return the decomposed string.
1064: */
1065: public static String decompose(String source, boolean compat,
1066: int options) {
1067: return decompose(source, compat, options, false);
1068: }
1069:
1070: public static String decompose(String source, boolean compat,
1071: int options, boolean addSingleQuotation) {
1072: if (DEBUG)
1073: System.out
1074: .println("--------------- top of decompose() ---------------");
1075:
1076: boolean hangul = (options & IGNORE_HANGUL) == 0;
1077: int minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
1078:
1079: StringBuffer result = new StringBuffer();
1080: StringBuffer buffer = null;
1081: StringBuffer tmpBuf = null;
1082:
1083: int i = 0, bufPtr = -1;
1084:
1085: if (addSingleQuotation) {
1086: tmpBuf = new StringBuffer();
1087: }
1088:
1089: while (i < source.length() || bufPtr >= 0) {
1090: char ch;
1091:
1092: if (bufPtr >= 0) {
1093: ch = buffer.charAt(bufPtr++);
1094: if (bufPtr == buffer.length()) {
1095: bufPtr = -1;
1096: }
1097: } else {
1098: ch = source.charAt(i++);
1099: }
1100:
1101: int offset = DecompData.offsets.elementAt(ch);
1102: int index = offset & DecompData.DECOMP_MASK;
1103:
1104: if (DEBUG)
1105: System.out.println("decompose got " + Utility.hex(ch));
1106:
1107: if (index > minDecomp) {
1108: if ((offset & DecompData.DECOMP_RECURSE) != 0) {
1109: if (DEBUG)
1110: System.out
1111: .println(" "
1112: + Utility.hex(ch)
1113: + " has RECURSIVE decomposition, index="
1114: + index);
1115: if (buffer == null) {
1116: buffer = new StringBuffer();
1117: } else {
1118: buffer.setLength(0);
1119: }
1120: DecompData.doAppend(index, buffer);
1121: bufPtr = 0;
1122: } else {
1123: if (DEBUG)
1124: System.out.println(" " + Utility.hex(ch)
1125: + " has decomposition, index=" + index);
1126: if (!addSingleQuotation) {
1127: DecompData.doAppend(index, result);
1128: } else {
1129: tmpBuf.setLength(0);
1130: DecompData.doAppend(index, tmpBuf);
1131: if ((tmpBuf.length() > 1) || (ch == 0x037e) || // normalized to ';'
1132: (ch == 0x1fef)) { // normalized to '`'
1133: for (int j = 0; j < tmpBuf.length(); j++) {
1134: char c = tmpBuf.charAt(j);
1135: if ((c >= 0x0009 && c <= 0x000D)
1136: || (c >= 0x0020 && c <= 0x002F)
1137: || (c >= 0x003A && c <= 0x0040)
1138: || (c >= 0x005B && c <= 0x0060)
1139: || (c >= 0x007B && c <= 0x007E)) {
1140: result.append('\'');
1141: result.append(c);
1142: result.append('\'');
1143: } else {
1144: result.append(c);
1145: }
1146: }
1147: } else {
1148: result.append(tmpBuf);
1149: }
1150: }
1151: }
1152: } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
1153: hangulToJamo(ch, result, minDecomp);
1154: } else {
1155: result.append(ch);
1156: }
1157: }
1158: fixCanonical(result);
1159: return result.toString();
1160: }
1161:
1162: /**
1163: * Decompose starting with current input character and continuing
1164: * until just before the next base char.
1165: * <p>
1166: * <b>Input</b>:
1167: * <ul>
1168: * <li>underlying char iter points to first character to decompose
1169: * </ul>
1170: * <p>
1171: * <b>Output:</b>
1172: * <ul>
1173: * <li>returns first char of decomposition or DONE if at end
1174: * <li>Underlying char iter is pointing at next base char or past end
1175: * </ul>
1176: */
1177: private char nextDecomp() {
1178: if (DEBUG)
1179: System.out
1180: .println("--------------- top of nextDecomp() ---------------");
1181:
1182: boolean hangul = (options & IGNORE_HANGUL) == 0;
1183: currentIndex = nextIndex;
1184: char ch = curForward();
1185:
1186: int offset = DecompData.offsets.elementAt(ch);
1187: int index = offset & DecompData.DECOMP_MASK;
1188:
1189: initBuffer();
1190:
1191: if (index > minDecomp
1192: || DecompData.canonClass.elementAt(ch) != DecompData.BASE) {
1193: if (index > minDecomp) {
1194: if (DEBUG)
1195: System.out.println(" " + Utility.hex(ch)
1196: + " has decomposition, index=" + index);
1197: DecompData.doAppend(index, buffer);
1198:
1199: if ((offset & DecompData.DECOMP_RECURSE) != 0) {
1200: // Need to decompose the output of this decomposition recursively.
1201: for (int i = 0; i < getBufferLength(); i++) {
1202: ch = buffer.charAt(i);
1203: index = DecompData.offsets.elementAt(ch)
1204: & DecompData.DECOMP_MASK;
1205:
1206: if (index > minDecomp) {
1207: i += DecompData.doReplace(index, buffer, i);
1208: }
1209: }
1210: }
1211: } else {
1212: buffer.append(ch);
1213: }
1214: boolean needToReorder = false;
1215:
1216: // Any other combining chacters that immediately follow the decomposed
1217: // character must be included in the buffer too, because they're
1218: // conceptually part of the same logical character.
1219: while ((ch = text.next()) != DONE
1220: && DecompData.canonClass.elementAt(ch) != DecompData.BASE) {
1221: needToReorder = true;
1222: // Decompose any of these characters that need it - Liu
1223: index = DecompData.offsets.elementAt(ch)
1224: & DecompData.DECOMP_MASK;
1225: if (index > minDecomp) {
1226: DecompData.doAppend(index, buffer);
1227: } else {
1228: buffer.append(ch);
1229: }
1230: }
1231:
1232: if (getBufferLength() > 1 && needToReorder) {
1233: // If there is more than one combining character in the buffer,
1234: // put them into the canonical order.
1235: // But we don't need to sort if only characters are the ones that
1236: // resulted from decomosing the base character.
1237: fixCanonical(buffer);
1238: }
1239: ch = buffer.charAt(0);
1240: } else {
1241: // Just use this character, but first advance to the next one
1242: text.next();
1243: buffer.setLength(0);
1244: buffer.append(ch);
1245: // Do Hangul -> Jamo decomposition if necessary
1246: if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
1247: clearBuffer();
1248: hangulToJamo(ch, buffer, minDecomp);
1249: ch = buffer.charAt(0);
1250: }
1251: }
1252: nextIndex = text.getIndex();
1253:
1254: if (DEBUG)
1255: System.out.println("nextDecomp getBufferLength() "
1256: + getBufferLength() + " buffer : "
1257: + buffer.toString());
1258: if (DEBUG)
1259: System.out.println("nextDecomp returning "
1260: + Utility.hex(ch) + ", text index="
1261: + text.getIndex());
1262: return ch;
1263: }
1264:
1265: /**
1266: * Decompose starting with the input char just before the current position
1267: * and continuing backward until (and including) the previous base char.
1268: * <p>
1269: * <b>Input</b>:
1270: * <ul>
1271: * <li>underlying char iter points just after last char to decompose
1272: * </ul>
1273: * <p>
1274: * <b>Output:</b>
1275: * <ul>
1276: * <li>returns last char of resulting decomposition sequence
1277: * <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
1278: * </ul>
1279: */
1280: private char prevDecomp() {
1281: if (DEBUG)
1282: System.out
1283: .println("--------------- top of prevDecomp() ---------------");
1284:
1285: boolean hangul = (options & IGNORE_HANGUL) == 0;
1286:
1287: nextIndex = currentIndex;
1288:
1289: char ch = curBackward();
1290:
1291: int offset = DecompData.offsets.elementAt(ch);
1292: int index = offset & DecompData.DECOMP_MASK;
1293:
1294: if (DEBUG)
1295: System.out.println("prevDecomp got input char "
1296: + Utility.hex(ch));
1297:
1298: initBuffer();
1299:
1300: if (index > minDecomp
1301: || DecompData.canonClass.elementAt(ch) != DecompData.BASE) {
1302: // This method rewritten to pass conformance tests. - Liu
1303: // Collect all characters up to the previous base char
1304: while (ch != DONE) {
1305: buffer.insert(0, ch);
1306: if (DecompData.canonClass.elementAt(ch) == DecompData.BASE)
1307: break;
1308: ch = text.previous();
1309: }
1310:
1311: if (DEBUG)
1312: System.out.println("prevDecomp buffer: "
1313: + Utility.hex(buffer));
1314:
1315: // Decompose the buffer
1316: for (int i = 0; i < getBufferLength(); i++) {
1317: ch = buffer.charAt(i);
1318: offset = DecompData.offsets.elementAt(ch);
1319: index = offset & DecompData.DECOMP_MASK;
1320:
1321: if (index > minDecomp) {
1322: int j = DecompData.doReplace(index, buffer, i);
1323: if ((offset & DecompData.DECOMP_RECURSE) != 0) {
1324: // Need to decompose this recursively
1325: for (; i < j; ++i) {
1326: ch = buffer.charAt(i);
1327: index = DecompData.offsets.elementAt(ch)
1328: & DecompData.DECOMP_MASK;
1329: if (index > minDecomp) {
1330: i += DecompData.doReplace(index,
1331: buffer, i);
1332: }
1333: }
1334: }
1335: i = j;
1336: }
1337: }
1338:
1339: if (DEBUG)
1340: System.out.println("prevDecomp buffer after decomp: "
1341: + Utility.hex(buffer));
1342:
1343: if (getBufferLength() > 1) {
1344: // If there is more than one combining character in the buffer,
1345: // put them into the canonical order.
1346: fixCanonical(buffer);
1347: }
1348: bufferPos = getBufferLength() - 1;
1349: ch = buffer.charAt(bufferPos);
1350: } else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
1351: hangulToJamo(ch, buffer, minDecomp);
1352: getBufferLength();
1353: bufferPos = getBufferLength() - 1;
1354: ch = buffer.charAt(bufferPos);
1355: } else {
1356: buffer.append(ch);
1357: getBufferLength();
1358: bufferPos = getBufferLength() - 1;
1359: }
1360:
1361: currentIndex = text.getIndex();
1362:
1363: if (DEBUG)
1364: System.out.println(" prevDecomp getBufferLength() "
1365: + getBufferLength() + " buffer : "
1366: + buffer.toString());
1367: if (DEBUG)
1368: System.out.println(" prevDecomp returning '" + ch + "' "
1369: + Utility.hex(ch) + ", text index="
1370: + text.getIndex());
1371: return ch;
1372: }
1373:
1374: public static final int getClass(char ch) {
1375: int value = DecompData.canonClass.elementAt(ch);
1376: return (value >= 0) ? value : value + 256;
1377: }
1378:
1379: //-------------------------------------------------------------------------
1380: // CharacterIterator overrides
1381: //-------------------------------------------------------------------------
1382:
1383: /**
1384: * Return the current character in the normalized text.
1385: */
1386: public char current() {
1387: if (bufferPos >= getBufferLength() || getBufferLength() == 0) {
1388: bufferPos = 0; // Buffer is now out of date
1389: // i have a problem with this, we are moving one away from
1390: // the current position, which isn't right.
1391: // even when currentIndex is still at the present position
1392: // text.index would have shifted because of next*()
1393: // we have to reset the text to the former position
1394: // admittedly, this isn't the best solution
1395: if (mode.compose()) {
1396: currentChar = nextCompose();
1397: text.setIndex(currentIndex);
1398: } else if (mode.decomp()) {
1399: currentChar = nextDecomp();
1400: text.setIndex(currentIndex);
1401: } else {
1402: if (currentIndex == 0) {
1403: currentChar = text.current();
1404: } else {
1405: /* text.setIndex(currentIndex - 1);
1406: currentChar = text.next(); */
1407: currentChar = text.current();
1408: }
1409: }
1410: } else {
1411: currentChar = buffer.charAt(bufferPos);
1412: }
1413:
1414: return currentChar;
1415: }
1416:
1417: /**
1418: * Return the first character in the normalized text. This resets
1419: * the <tt>Normalizer's</tt> position to the beginning of the text.
1420: */
1421: public char first() {
1422: reset();
1423: return next();
1424: }
1425:
1426: /**
1427: * Return the last character in the normalized text. This resets
1428: * the <tt>Normalizer's</tt> position to be just before the
1429: * the input text corresponding to that normalized character.
1430: */
1431: public char last() {
1432: currentIndex = nextIndex = text.getEndIndex() - 1;
1433: text.setIndex(currentIndex); // Setting to getEndIndex() fails in 1.1
1434: atEnd = true; // so work around the bug
1435: currentChar = DONE; // The current char hasn't been processed
1436: clearBuffer(); // The buffer is empty too
1437: return previous();
1438: }
1439:
1440: /**
1441: * Return the current character in the normalized text and advance
1442: * the iteration position by one. If the end
1443: * of the text has already been reached, {@link #DONE} is returned.
1444: */
1445: public char next() {
1446: if (buffer != null && (++bufferPos) < buffer.length()) {
1447: currentChar = buffer.charAt(bufferPos);
1448: } else {
1449: bufferPos = 0; // Buffer is now out of date
1450: if (mode.compose()) {
1451: currentChar = nextCompose();
1452: } else if (mode.decomp()) {
1453: currentChar = nextDecomp();
1454: } else {
1455: // If we're not really doing decomposition, just return the current char
1456: currentChar = text.current();
1457: text.next();
1458:
1459: //Set the indicies for no op
1460: if (currentChar != CharacterIterator.DONE) {
1461: currentIndex = nextIndex += 1;
1462: }
1463: }
1464: }
1465: return currentChar;
1466: }
1467:
1468: /**
1469: * Return the previous character in the normalized text and decrement
1470: * the iteration position by one. If the beginning
1471: * of the text has already been reached, {@link #DONE} is returned.
1472: */
1473: public char previous() {
1474: if (bufferPos > 0) {
1475: // There are output characters left in the buffer
1476: currentChar = buffer.charAt(--bufferPos);
1477: } else {
1478: bufferPos = 0; // Buffer is now out of date
1479: if (mode.compose()) {
1480: currentChar = prevCompose();
1481: } else if (mode.decomp()) {
1482: currentChar = prevDecomp();
1483: } else {
1484: text.setIndex(currentIndex);
1485: currentChar = text.previous();
1486: if (currentIndex != 0) {
1487: currentIndex = nextIndex -= 1;
1488: }
1489: }
1490: }
1491: return currentChar;
1492: }
1493:
1494: private int getBufferLength() {
1495: if (buffer == null) {
1496: return 0;
1497: } else {
1498: return buffer.length();
1499: }
1500: }
1501:
1502: /**
1503: * Set the iteration position in the input text that is being normalized
1504: * and return the first normalized character at that position.
1505: * <p>
1506: * @param index the desired index in the input text.
1507: *
1508: * @return the first normalized character that is the result of iterating
1509: * forward starting at the given index.
1510: *
1511: * @throws IllegalArgumentException if the given index is less than
1512: * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1513: */
1514: public char setIndex(int index) {
1515: setIndexOnly(index);
1516: return current();
1517: }
1518:
1519: public void setIndexOnly(int index) {
1520: currentIndex = nextIndex = index;
1521: text.setIndex(index); // Checks range
1522: currentChar = DONE; // The current char hasn't been processed
1523: clearBuffer(); // The buffer is empty too
1524: }
1525:
1526: /**
1527: * Retrieve the current iteration position in the input text that is
1528: * being normalized. This method is useful in applications such as
1529: * searching, where you need to be able to determine the position in
1530: * the input text that corresponds to a given normalized output character.
1531: */
1532: public final int getIndex() {
1533: return text.getIndex();
1534: }
1535:
1536: /**
1537: * Retrieve the index of the start of the input text. This is the begin index
1538: * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
1539: * over which this <tt>Normalizer</tt> is iterating
1540: */
1541: public final int getBeginIndex() {
1542: return text.getBeginIndex();
1543: }
1544:
1545: /**
1546: * Retrieve the index of the end of the input text. This is the end index
1547: * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1548: * over which this <tt>Normalizer</tt> is iterating
1549: */
1550: public final int getEndIndex() {
1551: return text.getEndIndex();
1552: }
1553:
1554: //-------------------------------------------------------------------------
1555: // Property access methods
1556: //-------------------------------------------------------------------------
1557:
1558: /**
1559: * Set the normalization mode for this object.
1560: * <p>
1561: * <b>Note:</b>If the normalization mode is changed while iterating
1562: * over a string, calls to {@link #next} and {@link #previous} may
1563: * return previously buffers characters in the old normalization mode
1564: * until the iteration is able to re-sync at the next base character.
1565: * It is safest to call {@link #setText setText()}, {@link #first},
1566: * {@link #last}, etc. after calling <tt>setMode</tt>.
1567: * <p>
1568: * @param newMode the new mode for this <tt>Normalizer</tt>.
1569: * The supported modes are:
1570: * <ul>
1571: * <li>{@link #COMPOSE} - Unicode canonical decompositiion
1572: * followed by canonical composition.
1573: * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
1574: * follwed by canonical composition.
1575: * <li>{@link #DECOMP} - Unicode canonical decomposition
1576: * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
1577: * <li>{@link #NO_OP} - Do nothing but return characters
1578: * from the underlying input text.
1579: * </ul>
1580: *
1581: * @see #getMode
1582: */
1583: public void setMode(Mode newMode) {
1584: mode = newMode;
1585: minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT;
1586: }
1587:
1588: /**
1589: * Return the basic operation performed by this <tt>Normalizer</tt>
1590: *
1591: * @see #setMode
1592: */
1593: public Mode getMode() {
1594: return mode;
1595: }
1596:
1597: /**
1598: * Set options that affect this <tt>Normalizer</tt>'s operation.
1599: * Options do not change the basic composition or decomposition operation
1600: * that is being performed , but they control whether
1601: * certain optional portions of the operation are done.
1602: * Currently the only available option is:
1603: * <p>
1604: * <ul>
1605: * <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the Jamo alphabet
1606: * and vice-versa. This option is off by default (<i>i.e.</i> Hangul processing
1607: * is enabled) since the Unicode standard specifies that Hangul to Jamo
1608: * is a canonical decomposition. For any of the standard Unicode Normalization
1609: * Forms, you should leave this option off.
1610: * </ul>
1611: * <p>
1612: * @param option the option whose value is to be set.
1613: * @param value the new setting for the option. Use <tt>true</tt> to
1614: * turn the option on and <tt>false</tt> to turn it off.
1615: *
1616: * @see #getOption
1617: */
1618: public void setOption(int option, boolean value) {
1619: if (option != IGNORE_HANGUL) {
1620: throw new IllegalArgumentException("Illegal option");
1621: }
1622: if (value) {
1623: options |= option;
1624: } else {
1625: options &= (~option);
1626: }
1627: }
1628:
1629: /**
1630: * Determine whether an option is turned on or off.
1631: * <p>
1632: * @see #setOption
1633: */
1634: public boolean getOption(int option) {
1635: return (options & option) != 0;
1636: }
1637:
1638: /**
1639: * Set the input text over which this <tt>Normalizer</tt> will iterate.
1640: * The iteration position will be reset to the beginning.
1641: * <p>
1642: * @param newText The new string to be normalized.
1643: */
1644: public void setText(String newText) {
1645: text = new StringCharacterIterator(newText);
1646: reset();
1647: }
1648:
1649: /**
1650: * Set the input text over which this <tt>Normalizer</tt> will iterate.
1651: * The iteration position will be reset to the beginning.
1652: * <p>
1653: * @param newText The new text to be normalized.
1654: */
1655: public void setText(CharacterIterator newText) {
1656: text = newText;
1657: reset();
1658: }
1659:
1660: //-------------------------------------------------------------------------
1661: // Private utility methods
1662: //-------------------------------------------------------------------------
1663:
1664: private final char curForward() {
1665: char ch = text.current();
1666: if (DEBUG)
1667: System.out.println(" curForward returning "
1668: + Utility.hex(ch) + ", text index="
1669: + text.getIndex());
1670: return ch;
1671: }
1672:
1673: private final char curBackward() {
1674: char ch = atEnd ? text.current() : text.previous();
1675: atEnd = false;
1676: if (DEBUG)
1677: System.out.println(" curBackward returning "
1678: + Utility.hex(ch) + ", text index="
1679: + text.getIndex());
1680: return ch;
1681: }
1682:
1683: public void reset() {
1684: currentIndex = nextIndex = text.getBeginIndex();
1685: text.setIndex(currentIndex);
1686: atEnd = false;
1687: bufferPos = 0;
1688: clearBuffer();
1689: }
1690:
1691: private final void initBuffer() {
1692: if (buffer == null) {
1693: buffer = new StringBuffer(10);
1694: } else {
1695: buffer.setLength(0);
1696: }
1697: clearBuffer();
1698: }
1699:
1700: private final void clearBuffer() {
1701: bufferPos = 0;
1702: if (buffer != null) {
1703: buffer.setLength(0);
1704: }
1705: }
1706:
1707: /**
1708: * Fixes the sorting sequence of non-spacing characters according to
1709: * their combining class. The algorithm is listed on p.3-11 in the
1710: * Unicode Standard 2.0. The table of combining classes is on p.4-2
1711: * in the Unicode Standard 2.0.
1712: * @param result the string to fix.
1713: */
1714: private static void fixCanonical(StringBuffer result) {
1715: if (result.length() == 0)
1716: return; // don't bother with empty strings!
1717:
1718: int i = result.length() - 1;
1719: int currentType = getClass(result.charAt(i));
1720: int lastType;
1721:
1722: for (--i; i >= 0; --i) {
1723: lastType = currentType;
1724: currentType = getClass(result.charAt(i));
1725:
1726: //
1727: // a swap is presumed to be rare (and a double-swap very rare),
1728: // so don't worry about efficiency here.
1729: //
1730: if (currentType > lastType && lastType != DecompData.BASE) {
1731: // swap characters
1732: char temp = result.charAt(i);
1733: result.setCharAt(i, result.charAt(i + 1));
1734: result.setCharAt(i + 1, temp);
1735: // if not at end, backup (one further, to compensate for for-loop)
1736: if (i < result.length() - 2) {
1737: i += 2;
1738: }
1739: // reset type, since we swapped.
1740: currentType = getClass(result.charAt(i));
1741: }
1742: }
1743: }
1744:
1745: //-------------------------------------------------------------------------
1746: // Hangul / Jamo conversion utilities for internal use
1747: // See section 3.10 of The Unicode Standard, v 2.0.
1748: //
1749:
// Package-accessible for use by ComposedCharIter
// Range of chars treated as precomposed Hangul syllables:
// HANGUL_BASE inclusive, HANGUL_LIMIT exclusive.
static final char HANGUL_BASE = 0xac00;
static final char HANGUL_LIMIT = 0xd7a4;

// Base code points for the leading (L), vowel (V) and trailing (T) Jamo.
// A trailing value equal to JAMO_TBASE itself is treated as "no trailing
// consonant" by hangulToJamo().
private static final char JAMO_LBASE = 0x1100;
private static final char JAMO_VBASE = 0x1161;
private static final char JAMO_TBASE = 0x11a7;
// Number of leading, vowel and trailing values used by the syllable arithmetic.
private static final int JAMO_LCOUNT = 19;
private static final int JAMO_VCOUNT = 21;
private static final int JAMO_TCOUNT = 28;
// Number of syllables that share one leading consonant.
private static final int JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT;
1761:
1762: /**
1763: * Convert a single Hangul syllable into one or more Jamo characters.
1764: *
1765: * @param conjoin If true, decompose Jamo into conjoining Jamo.
1766: */
1767: static int hangulToJamo(char ch, StringBuffer result,
1768: int decompLimit) {
1769: char sIndex = (char) (ch - HANGUL_BASE);
1770: char leading = (char) (JAMO_LBASE + sIndex / JAMO_NCOUNT);
1771: char vowel = (char) (JAMO_VBASE + (sIndex % JAMO_NCOUNT)
1772: / JAMO_TCOUNT);
1773: char trailing = (char) (JAMO_TBASE + (sIndex % JAMO_TCOUNT));
1774:
1775: int length = 0;
1776:
1777: length += jamoAppend(leading, decompLimit, result);
1778: length += jamoAppend(vowel, decompLimit, result);
1779: if (trailing != JAMO_TBASE) {
1780: length += jamoAppend(trailing, decompLimit, result);
1781: }
1782: return length;
1783: }
1784:
1785: static final int jamoAppend(char ch, int limit, StringBuffer dest) {
1786: int offset = DecompData.offsets.elementAt(ch);
1787: if (offset > limit) {
1788: return DecompData.doAppend(offset, dest);
1789: } else {
1790: dest.append(ch);
1791: return 1;
1792: }
1793: }
1794:
1795: static private void jamoToHangul(StringBuffer buffer, int start) {
1796: int out = 0;
1797: int limit = buffer.length() - 1;
1798:
1799: int in, l, v, t;
1800:
1801: for (in = start; in < limit; in++) {
1802: char ch = buffer.charAt(in);
1803:
1804: if ((l = ch - JAMO_LBASE) >= 0 && l < JAMO_LCOUNT
1805: && (v = buffer.charAt(in + 1) - JAMO_VBASE) >= 0
1806: && v < JAMO_VCOUNT) {
1807: //
1808: // We've found a pair of Jamo characters to compose.
1809: // Snarf the Jamo vowel and see if there's also a trailing char
1810: //
1811: in++; // Snarf the Jamo vowel too.
1812:
1813: t = (in < limit) ? buffer.charAt(in + 1) : 0;
1814: t -= JAMO_TBASE;
1815:
1816: if (t >= 0 && t < JAMO_TCOUNT) {
1817: in++; // Snarf the trailing consonant too
1818: } else {
1819: t = 0; // No trailing consonant
1820: }
1821: buffer.setCharAt(out++, (char) ((l * JAMO_VCOUNT + v)
1822: * JAMO_TCOUNT + t + HANGUL_BASE));
1823: } else {
1824: buffer.setCharAt(out++, ch);
1825: }
1826: }
1827: while (in < buffer.length()) {
1828: buffer.setCharAt(out++, buffer.charAt(in++));
1829: }
1830:
1831: buffer.setLength(out);
1832: }
1833:
1834: //-------------------------------------------------------------------------
1835: // Private data
1836: //-------------------------------------------------------------------------
1837:
1838: private static final boolean DEBUG = false;
1839:
1840: private Mode mode = DECOMP;
1841: private int options = 0;
1842: private transient int minDecomp;
1843: private int currentIndex = 0;
1844: private int nextIndex = 0;
1845: // The input text and our position in it
1846: private CharacterIterator text;
1847: private boolean atEnd = false;
1848:
1849: // A buffer for holding intermediate results
1850: private StringBuffer buffer = null;
1851: private int bufferPos = 0;
1852:
1853: private char currentChar;
1854:
1855: // Another buffer for use during iterative composition
1856: private static final int EMPTY = -1;
1857: private StringBuffer explodeBuf = null;
1858:
1859: // These must agree with the constants used in NormalizerBuilder
1860: static final int STR_INDEX_SHIFT = 2;
1861: static final int STR_LENGTH_MASK = 0x0003;
1862: }
|