0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007: package com.ibm.icu.text;
0008:
0009: import com.ibm.icu.impl.ICUResourceBundle;
0010: import com.ibm.icu.impl.Utility;
0011: import com.ibm.icu.impl.UtilityExtensions;
0012: import com.ibm.icu.util.CaseInsensitiveString;
0013: import com.ibm.icu.util.ULocale;
0014: import com.ibm.icu.util.UResourceBundle;
0015: import com.ibm.icu.impl.UCharacterProperty;
0016:
0017: import java.text.MessageFormat;
0018: import java.util.Enumeration;
0019: import java.util.Hashtable;
0020: import java.util.Locale;
0021: import java.util.MissingResourceException;
0022: import java.util.Vector;
0023:
0024: /**
0025: * <code>Transliterator</code> is an abstract class that
0026: * transliterates text from one format to another. The most common
0027: * kind of transliterator is a script, or alphabet, transliterator.
0028: * For example, a Russian to Latin transliterator changes Russian text
0029: * written in Cyrillic characters to phonetically equivalent Latin
0030: * characters. It does not <em>translate</em> Russian to English!
0031: * Transliteration, unlike translation, operates on characters, without
0032: * reference to the meanings of words and sentences.
0033: *
0034: * <p>Although script conversion is its most common use, a
0035: * transliterator can actually perform a more general class of tasks.
0036: * In fact, <code>Transliterator</code> defines a very general API
0037: * which specifies only that a segment of the input text is replaced
0038: * by new text. The particulars of this conversion are determined
0039: * entirely by subclasses of <code>Transliterator</code>.
0040: *
0041: * <p><b>Transliterators are stateless</b>
0042: *
0043: * <p><code>Transliterator</code> objects are <em>stateless</em>; they
0044: * retain no information between calls to
0045: * <code>transliterate()</code>. As a result, threads may share
0046: * transliterators without synchronizing them. This might seem to
0047: * limit the complexity of the transliteration operation. In
0048: * practice, subclasses perform complex transliterations by delaying
0049: * the replacement of text until it is known that no other
0050: * replacements are possible. In other words, although the
0051: * <code>Transliterator</code> objects are stateless, the source text
0052: * itself embodies all the needed information, and delayed operation
0053: * allows arbitrary complexity.
0054: *
0055: * <p><b>Batch transliteration</b>
0056: *
0057: * <p>The simplest way to perform transliteration is all at once, on a
0058: * string of existing text. This is referred to as <em>batch</em>
0059: * transliteration. For example, given a string <code>input</code>
0060: * and a transliterator <code>t</code>, the call
0061: *
0062: * <blockquote><code>String result = t.transliterate(input);
0063: * </code></blockquote>
0064: *
0065: * will transliterate it and return the result. Other methods allow
0066: * the client to specify a substring to be transliterated and to use
0067: * {@link Replaceable} objects instead of strings, in order to
0068: * preserve out-of-band information (such as text styles).
0069: *
0070: * <p><b>Keyboard transliteration</b>
0071: *
0072: * <p>Somewhat more involved is <em>keyboard</em>, or incremental
0073: * transliteration. This is the transliteration of text that is
0074: * arriving from some source (typically the user's keyboard) one
0075: * character at a time, or in some other piecemeal fashion.
0076: *
0077: * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
0078: * stores the text. As text is inserted, as much as possible is
0079: * transliterated on the fly. This means a GUI that displays the
0080: * contents of the buffer may show text being modified as each new
0081: * character arrives.
0082: *
0083: * <p>Consider the simple <code>RuleBasedTransliterator</code>:
0084: *
0085: * <blockquote><code>
0086: * th>{theta}<br>
0087: * t>{tau}
0088: * </code></blockquote>
0089: *
0090: * When the user types 't', nothing will happen, since the
0091: * transliterator is waiting to see if the next character is 'h'. To
0092: * remedy this, we introduce the notion of a cursor, marked by a '|'
0093: * in the output string:
0094: *
0095: * <blockquote><code>
0096: * t>|{tau}<br>
0097: * {tau}h>{theta}
0098: * </code></blockquote>
0099: *
0100: * Now when the user types 't', tau appears, and if the next character
0101: * is 'h', the tau changes to a theta. This is accomplished by
0102: * maintaining a cursor position (independent of the insertion point,
0103: * and invisible in the GUI) across calls to
0104: * <code>transliterate()</code>. Typically, the cursor will
0105: * be coincident with the insertion point, but in a case like the one
0106: * above, it will precede the insertion point.
0107: *
0108: * <p>Keyboard transliteration methods maintain a set of three indices
0109: * that are updated with each call to
0110: * <code>transliterate()</code>, including the cursor, start,
0111: * and limit. These indices are changed by the method, and they are
0112: * passed in and out via a Position object. The <code>start</code> index
0113: * marks the beginning of the substring that the transliterator will
0114: * look at. It is advanced as text becomes committed (but it is not
0115: * the committed index; that's the <code>cursor</code>). The
0116: * <code>cursor</code> index, described above, marks the point at
0117: * which the transliterator last stopped, either because it reached
0118: * the end, or because it required more characters to disambiguate
0119: * between possible inputs. The <code>cursor</code> can also be
0120: * explicitly set by rules in a <code>RuleBasedTransliterator</code>.
0121: * Any characters before the <code>cursor</code> index are frozen;
0122: * future keyboard transliteration calls within this input sequence
0123: * will not change them. New text is inserted at the
0124: * <code>limit</code> index, which marks the end of the substring that
0125: * the transliterator looks at.
0126: *
0127: * <p>Because keyboard transliteration assumes that more characters
0128: * are to arrive, it is conservative in its operation. It only
0129: * transliterates when it can do so unambiguously. Otherwise it waits
0130: * for more characters to arrive. When the client code knows that no
0131: * more characters are forthcoming, perhaps because the user has
0132: * performed some input termination operation, then it should call
0133: * <code>finishTransliteration()</code> to complete any
0134: * pending transliterations.
0135: *
0136: * <p><b>Inverses</b>
0137: *
0138: * <p>Pairs of transliterators may be inverses of one another. For
0139: * example, if transliterator <b>A</b> transliterates characters by
0140: * incrementing their Unicode value (so "abc" -> "def"), and
0141: * transliterator <b>B</b> decrements character values, then <b>A</b>
0142: * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
0143: * with <b>B</b> in a compound transliterator, the result is the
0144: * identity transliterator, that is, a transliterator that does not
0145: * change its input text.
0146: *
0147: * The <code>Transliterator</code> method <code>getInverse()</code>
0148: * returns a transliterator's inverse, if one exists, or
0149: * <code>null</code> otherwise. However, the result of
0150: * <code>getInverse()</code> usually will <em>not</em> be a true
0151: * mathematical inverse. This is because true inverse transliterators
0152: * are difficult to formulate. For example, consider two
0153: * transliterators: <b>AB</b>, which transliterates the character 'A'
0154: * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
0155: * seem that these are exact inverses, since
0156: *
0157: * <blockquote>"A" x <b>AB</b> -> "B"<br>
0158: * "B" x <b>BA</b> -> "A"</blockquote>
0159: *
0160: * where 'x' represents transliteration. However,
0161: *
0162: * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
0163: * "BBCD" x <b>BA</b> -> "AACD"</blockquote>
0164: *
0165: * so <b>AB</b> composed with <b>BA</b> is not the
0166: * identity. Nonetheless, <b>BA</b> may be usefully considered to be
0167: * <b>AB</b>'s inverse, and it is on this basis that
0168: * <b>AB</b><code>.getInverse()</code> could legitimately return
0169: * <b>BA</b>.
0170: *
0171: * <p><b>IDs and display names</b>
0172: *
0173: * <p>A transliterator is designated by a short identifier string or
0174: * <em>ID</em>. IDs follow the format <em>source-destination</em>,
0175: * where <em>source</em> describes the entity being replaced, and
0176: * <em>destination</em> describes the entity replacing
0177: * <em>source</em>. The entities may be the names of scripts,
0178: * particular sequences of characters, or whatever else it is that the
0179: * transliterator converts to or from. For example, a transliterator
0180: * from Russian to Latin might be named "Russian-Latin". A
0181: * transliterator from keyboard escape sequences to Latin-1 characters
0182: * might be named "KeyboardEscape-Latin1". By convention, system
0183: * entity names are in English, with the initial letters of words
0184: * capitalized; user entity names may follow any format so long as
0185: * they do not contain dashes.
0186: *
0187: * <p>In addition to programmatic IDs, transliterator objects have
0188: * display names for presentation in user interfaces, returned by
0189: * {@link #getDisplayName}.
0190: *
0191: * <p><b>Factory methods and registration</b>
0192: *
0193: * <p>In general, client code should use the factory method
0194: * <code>getInstance()</code> to obtain an instance of a
0195: * transliterator given its ID. Valid IDs may be enumerated using
0196: * <code>getAvailableIDs()</code>. Since transliterators are
0197: * stateless, multiple calls to <code>getInstance()</code> with the
0198: * same ID will return the same object.
0199: *
0200: * <p>In addition to the system transliterators registered at startup,
0201: * user transliterators may be registered by calling
0202: * <code>registerInstance()</code> at run time. To register a
0203: * transliterator subclass without instantiating it (until it is
0204: * needed), users may call <code>registerClass()</code>.
0205: *
0206: * <p><b>Composed transliterators</b>
0207: *
0208: * <p>In addition to built-in system transliterators like
0209: * "Latin-Greek", there are also built-in <em>composed</em>
0210: * transliterators. These are implemented by composing two or more
0211: * component transliterators. For example, if we have scripts "A",
0212: * "B", "C", and "D", and we want to transliterate between all pairs
0213: * of them, then we need to write 12 transliterators: "A-B", "A-C",
0214: * "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
0215: * convert all scripts to an intermediate script "M", then instead of
0216: * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
0217: * "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
0218: * win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
0219: * <em>n</em>, so as <em>n</em> gets larger the gain becomes
0220: * significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
0221: * difference.) Note the use of "~" rather than "-" for the script
0222: * separator here; this indicates that the given transliterator is
0223: * intended to be composed with others, rather than be used as is.
0224: *
0225: * <p>Composed transliterators can be instantiated as usual. For
0226: * example, the system transliterator "Devanagari-Gujarati" is a
0227: * composed transliterator built internally as
0228: * "Devanagari~InterIndic;InterIndic~Gujarati". When this
0229: * transliterator is instantiated, it appears externally to be a
0230: * standard transliterator (e.g., getID() returns
0231: * "Devanagari-Gujarati").
0232: *
0233: * <p><b>Subclassing</b>
0234: *
0235: * <p>Subclasses must implement the abstract method
0236: * <code>handleTransliterate()</code>. <p>Subclasses should override
0237: * the <code>transliterate()</code> method taking a
0238: * <code>Replaceable</code> and the <code>transliterate()</code>
0239: * method taking a <code>String</code> and <code>StringBuffer</code>
0240: * if the performance of these methods can be improved over the
0241: * performance obtained by the default implementations in this class.
0242: *
0243: * <p>Copyright © IBM Corporation 1999. All rights reserved.
0244: *
0245: * @author Alan Liu
0246: * @stable ICU 2.0
0247: */
0248: public abstract class Transliterator {
0249: /**
0250: * Direction constant indicating the forward direction in a transliterator,
0251: * e.g., the forward rules of a RuleBasedTransliterator. An "A-B"
0252: * transliterator transliterates A to B when operating in the forward
0253: * direction, and B to A when operating in the reverse direction.
0254: * @stable ICU 2.0
0255: */
0256: public static final int FORWARD = 0;
0257:
0258: /**
0259: * Direction constant indicating the reverse direction in a transliterator,
0260: * e.g., the reverse rules of a RuleBasedTransliterator. An "A-B"
0261: * transliterator transliterates A to B when operating in the forward
0262: * direction, and B to A when operating in the reverse direction.
0263: * @stable ICU 2.0
0264: */
0265: public static final int REVERSE = 1;
0266:
0267: /**
0268: * Position structure for incremental transliteration. This data
0269: * structure defines two substrings of the text being
0270: * transliterated. The first region, [contextStart,
0271: * contextLimit), defines what characters the transliterator will
0272: * read as context. The second region, [start, limit), defines
0273: * what characters will actually be transliterated. The second
0274: * region should be a subset of the first.
0275: *
0276: * <p>After a transliteration operation, some of the indices in this
0277: * structure will be modified. See the field descriptions for
0278: * details.
0279: *
0280: * <p>contextStart <= start <= limit <= contextLimit
0281: *
0282: * <p>Note: All index values in this structure must be at code point
0283: * boundaries. That is, none of them may occur between two code units
0284: * of a surrogate pair. If any index does split a surrogate pair,
0285: * results are unspecified.
0286: * @stable ICU 2.0
0287: */
0288: public static class Position {
0289:
0290: /**
0291: * Beginning index, inclusive, of the context to be considered for
0292: * a transliteration operation. The transliterator will ignore
0293: * anything before this index. INPUT/OUTPUT parameter: This parameter
0294: * is updated by a transliteration operation to reflect the maximum
0295: * amount of antecontext needed by a transliterator.
0296: * @stable ICU 2.0
0297: */
0298: public int contextStart;
0299:
0300: /**
0301: * Ending index, exclusive, of the context to be considered for a
0302: * transliteration operation. The transliterator will ignore
0303: * anything at or after this index. INPUT/OUTPUT parameter: This
0304: * parameter is updated to reflect changes in the length of the
0305: * text, but points to the same logical position in the text.
0306: * @stable ICU 2.0
0307: */
0308: public int contextLimit;
0309:
0310: /**
0311: * Beginning index, inclusive, of the text to be transliteratd.
0312: * INPUT/OUTPUT parameter: This parameter is advanced past
0313: * characters that have already been transliterated by a
0314: * transliteration operation.
0315: * @stable ICU 2.0
0316: */
0317: public int start;
0318:
0319: /**
0320: * Ending index, exclusive, of the text to be transliteratd.
0321: * INPUT/OUTPUT parameter: This parameter is updated to reflect
0322: * changes in the length of the text, but points to the same
0323: * logical position in the text.
0324: * @stable ICU 2.0
0325: */
0326: public int limit;
0327:
0328: /**
0329: * Constructs a Position object with start, limit,
0330: * contextStart, and contextLimit all equal to zero.
0331: * @stable ICU 2.0
0332: */
0333: public Position() {
0334: this (0, 0, 0, 0);
0335: }
0336:
0337: /**
0338: * Constructs a Position object with the given start,
0339: * contextStart, and contextLimit. The limit is set to the
0340: * contextLimit.
0341: * @stable ICU 2.0
0342: */
0343: public Position(int contextStart, int contextLimit, int start) {
0344: this (contextStart, contextLimit, start, contextLimit);
0345: }
0346:
0347: /**
0348: * Constructs a Position object with the given start, limit,
0349: * contextStart, and contextLimit.
0350: * @stable ICU 2.0
0351: */
0352: public Position(int contextStart, int contextLimit, int start,
0353: int limit) {
0354: this .contextStart = contextStart;
0355: this .contextLimit = contextLimit;
0356: this .start = start;
0357: this .limit = limit;
0358: }
0359:
0360: /**
0361: * Constructs a Position object that is a copy of another.
0362: * @stable ICU 2.6
0363: */
0364: public Position(Position pos) {
0365: set(pos);
0366: }
0367:
0368: /**
0369: * Copies the indices of this position from another.
0370: * @stable ICU 2.6
0371: */
0372: public void set(Position pos) {
0373: contextStart = pos.contextStart;
0374: contextLimit = pos.contextLimit;
0375: start = pos.start;
0376: limit = pos.limit;
0377: }
0378:
0379: /**
0380: * Returns true if this Position is equal to the given object.
0381: * @stable ICU 2.6
0382: */
0383: public boolean equals(Object obj) {
0384: if (obj instanceof Position) {
0385: Position pos = (Position) obj;
0386: return contextStart == pos.contextStart
0387: && contextLimit == pos.contextLimit
0388: && start == pos.start && limit == pos.limit;
0389: }
0390: return false;
0391: }
0392:
0393: /**
0394: * Returns a string representation of this Position.
0395: * @stable ICU 2.6
0396: */
0397: public String toString() {
0398: return "[cs=" + contextStart + ", s=" + start + ", l="
0399: + limit + ", cl=" + contextLimit + "]";
0400: }
0401:
0402: /**
0403: * Check all bounds. If they are invalid, throw an exception.
0404: * @param length the length of the string this object applies to
0405: * @exception IllegalArgumentException if any indices are out
0406: * of bounds
0407: * @stable ICU 2.0
0408: */
0409: public final void validate(int length) {
0410: if (contextStart < 0 || start < contextStart
0411: || limit < start || contextLimit < limit
0412: || length < contextLimit) {
0413: throw new IllegalArgumentException(
0414: "Invalid Position {cs=" + contextStart + ", s="
0415: + start + ", l=" + limit + ", cl="
0416: + contextLimit + "}, len=" + length);
0417: }
0418: }
0419: }
0420:
0421: /**
0422: * Programmatic name, e.g., "Latin-Arabic".
0423: */
0424: private String ID;
0425:
0426: /**
0427: * This transliterator's filter. Any character for which
0428: * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
0429: * altered by this transliterator. If <tt>filter</tt> is
0430: * <tt>null</tt> then no filtering is applied.
0431: */
0432: private UnicodeFilter filter;
0433:
0434: private int maximumContextLength = 0;
0435:
0436: /**
0437: * System transliterator registry.
0438: */
0439: private static TransliteratorRegistry registry;
0440:
0441: private static Hashtable displayNameCache;
0442:
0443: /**
0444: * Prefix for resource bundle key for the display name for a
0445: * transliterator. The ID is appended to this to form the key.
0446: * The resource bundle value should be a String.
0447: */
0448: private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%";
0449:
0450: /**
0451: * Prefix for resource bundle key for the display name for a
0452: * transliterator SCRIPT. The ID is appended to this to form the key.
0453: * The resource bundle value should be a String.
0454: */
0455: private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";
0456:
0457: /**
0458: * Resource bundle key for display name pattern.
0459: * The resource bundle value should be a String forming a
0460: * MessageFormat pattern, e.g.:
0461: * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
0462: */
0463: private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
0464:
0465: /**
0466: * Delimiter between elements in a compound ID.
0467: * @internal
0468: */
0469: static final char ID_DELIM = ';';
0470:
0471: /**
0472: * Delimiter before target in an ID.
0473: * @internal
0474: */
0475: static final char ID_SEP = '-';
0476:
0477: /**
0478: * Delimiter before variant in an ID.
0479: * @internal
0480: */
0481: static final char VARIANT_SEP = '/';
0482:
0483: /**
0484: * To enable debugging output in the Transliterator component, set
0485: * DEBUG to true.
0486: *
0487: * N.B. Make sure to recompile all of the com.ibm.icu.text package
0488: * after changing this. Easiest way to do this is 'ant clean
0489: * core' ('ant' will NOT pick up the dependency automatically).
0490: *
0491: * <<This generates a lot of output.>>
0492: */
0493: static final boolean DEBUG = false;
0494:
0495: private static final String COPYRIGHT = "\u00A9 IBM Corporation 1999. All rights reserved.";
0496:
/**
 * Constructor for use by subclasses; records the ID and the filter of
 * the new transliterator.
 * @param ID the string identifier for this transliterator; must not
 * be <tt>null</tt>
 * @param filter the filter.  Any character for which
 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @exception NullPointerException if <tt>ID</tt> is <tt>null</tt>
 * @stable ICU 2.0
 */
protected Transliterator(String ID, UnicodeFilter filter) {
    // Reject a missing ID before touching any state.
    if (null == ID) {
        throw new NullPointerException();
    }
    this.filter = filter;
    this.ID = ID;
}
0513:
0514: /**
0515: * Transliterates a segment of a string, with optional filtering.
0516: *
0517: * @param text the string to be transliterated
0518: * @param start the beginning index, inclusive; <code>0 <= start
0519: * <= limit</code>.
0520: * @param limit the ending index, exclusive; <code>start <= limit
0521: * <= text.length()</code>.
0522: * @return The new limit index. The text previously occupying <code>[start,
0523: * limit)</code> has been transliterated, possibly to a string of a different
0524: * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
0525: * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
0526: * the returned value is -1 and the input string remains unchanged.
0527: * @stable ICU 2.0
0528: */
0529: public final int transliterate(Replaceable text, int start,
0530: int limit) {
0531: if (start < 0 || limit < start || text.length() < limit) {
0532: return -1;
0533: }
0534:
0535: Position pos = new Position(start, limit, start);
0536: filteredTransliterate(text, pos, false, true);
0537: return pos.limit;
0538: }
0539:
0540: /**
0541: * Transliterates an entire string in place. Convenience method.
0542: * @param text the string to be transliterated
0543: * @stable ICU 2.0
0544: */
0545: public final void transliterate(Replaceable text) {
0546: transliterate(text, 0, text.length());
0547: }
0548:
0549: /**
0550: * Transliterate an entire string and returns the result. Convenience method.
0551: *
0552: * @param text the string to be transliterated
0553: * @return The transliterated text
0554: * @stable ICU 2.0
0555: */
0556: public final String transliterate(String text) {
0557: ReplaceableString result = new ReplaceableString(text);
0558: transliterate(result);
0559: return result.toString();
0560: }
0561:
0562: /**
0563: * Transliterates the portion of the text buffer that can be
0564: * transliterated unambiguosly after new text has been inserted,
0565: * typically as a result of a keyboard event. The new text in
0566: * <code>insertion</code> will be inserted into <code>text</code>
0567: * at <code>index.contextLimit</code>, advancing
0568: * <code>index.contextLimit</code> by <code>insertion.length()</code>.
0569: * Then the transliterator will try to transliterate characters of
0570: * <code>text</code> between <code>index.start</code> and
0571: * <code>index.contextLimit</code>. Characters before
0572: * <code>index.start</code> will not be changed.
0573: *
0574: * <p>Upon return, values in <code>index</code> will be updated.
0575: * <code>index.contextStart</code> will be advanced to the first
0576: * character that future calls to this method will read.
0577: * <code>index.start</code> and <code>index.contextLimit</code> will
0578: * be adjusted to delimit the range of text that future calls to
0579: * this method may change.
0580: *
0581: * <p>Typical usage of this method begins with an initial call
0582: * with <code>index.contextStart</code> and <code>index.contextLimit</code>
0583: * set to indicate the portion of <code>text</code> to be
0584: * transliterated, and <code>index.start == index.contextStart</code>.
0585: * Thereafter, <code>index</code> can be used without
0586: * modification in future calls, provided that all changes to
0587: * <code>text</code> are made via this method.
0588: *
0589: * <p>This method assumes that future calls may be made that will
0590: * insert new text into the buffer. As a result, it only performs
0591: * unambiguous transliterations. After the last call to this
0592: * method, there may be untransliterated text that is waiting for
0593: * more input to resolve an ambiguity. In order to perform these
0594: * pending transliterations, clients should call {@link
0595: * #finishTransliteration} after the last call to this
0596: * method has been made.
0597: *
0598: * @param text the buffer holding transliterated and untransliterated text
0599: * @param index the start and limit of the text, the position
0600: * of the cursor, and the start and limit of transliteration.
0601: * @param insertion text to be inserted and possibly
0602: * transliterated into the translation buffer at
0603: * <code>index.contextLimit</code>. If <code>null</code> then no text
0604: * is inserted.
0605: * @see #handleTransliterate
0606: * @exception IllegalArgumentException if <code>index</code>
0607: * is invalid
0608: * @stable ICU 2.0
0609: */
0610: public final void transliterate(Replaceable text, Position index,
0611: String insertion) {
0612: index.validate(text.length());
0613:
0614: // int originalStart = index.contextStart;
0615: if (insertion != null) {
0616: text.replace(index.limit, index.limit, insertion);
0617: index.limit += insertion.length();
0618: index.contextLimit += insertion.length();
0619: }
0620:
0621: if (index.limit > 0
0622: && UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) {
0623: // Oops, there is a dangling lead surrogate in the buffer.
0624: // This will break most transliterators, since they will
0625: // assume it is part of a pair. Don't transliterate until
0626: // more text comes in.
0627: return;
0628: }
0629:
0630: filteredTransliterate(text, index, true, true);
0631:
0632: // TODO
0633: // This doesn't work once we add quantifier support. Need to rewrite
0634: // this code to support quantifiers and 'use maximum backup <n>;'.
0635: //
0636: // index.contextStart = Math.max(index.start - getMaximumContextLength(),
0637: // originalStart);
0638: }
0639:
0640: /**
0641: * Transliterates the portion of the text buffer that can be
0642: * transliterated unambiguosly after a new character has been
0643: * inserted, typically as a result of a keyboard event. This is a
0644: * convenience method; see {@link #transliterate(Replaceable,
0645: * Transliterator.Position, String)} for details.
0646: * @param text the buffer holding transliterated and
0647: * untransliterated text
0648: * @param index the start and limit of the text, the position
0649: * of the cursor, and the start and limit of transliteration.
0650: * @param insertion text to be inserted and possibly
0651: * transliterated into the translation buffer at
0652: * <code>index.contextLimit</code>.
0653: * @see #transliterate(Replaceable, Transliterator.Position, String)
0654: * @stable ICU 2.0
0655: */
0656: public final void transliterate(Replaceable text, Position index,
0657: int insertion) {
0658: transliterate(text, index, UTF16.valueOf(insertion));
0659: }
0660:
0661: /**
0662: * Transliterates the portion of the text buffer that can be
0663: * transliterated unambiguosly. This is a convenience method; see
0664: * {@link #transliterate(Replaceable, Transliterator.Position,
0665: * String)} for details.
0666: * @param text the buffer holding transliterated and
0667: * untransliterated text
0668: * @param index the start and limit of the text, the position
0669: * of the cursor, and the start and limit of transliteration.
0670: * @see #transliterate(Replaceable, Transliterator.Position, String)
0671: * @stable ICU 2.0
0672: */
0673: public final void transliterate(Replaceable text, Position index) {
0674: transliterate(text, index, null);
0675: }
0676:
0677: /**
0678: * Finishes any pending transliterations that were waiting for
0679: * more characters. Clients should call this method as the last
0680: * call after a sequence of one or more calls to
0681: * <code>transliterate()</code>.
0682: * @param text the buffer holding transliterated and
0683: * untransliterated text.
0684: * @param index the array of indices previously passed to {@link
0685: * #transliterate}
0686: * @stable ICU 2.0
0687: */
0688: public final void finishTransliteration(Replaceable text,
0689: Position index) {
0690: index.validate(text.length());
0691: filteredTransliterate(text, index, false, true);
0692: }
0693:
0694: /**
0695: * Abstract method that concrete subclasses define to implement
0696: * their transliteration algorithm. This method handles both
0697: * incremental and non-incremental transliteration. Let
0698: * <code>originalStart</code> refer to the value of
0699: * <code>pos.start</code> upon entry.
0700: *
0701: * <ul>
0702: * <li>If <code>incremental</code> is false, then this method
0703: * should transliterate all characters between
0704: * <code>pos.start</code> and <code>pos.limit</code>. Upon return
0705: * <code>pos.start</code> must == <code> pos.limit</code>.</li>
0706: *
0707: * <li>If <code>incremental</code> is true, then this method
0708: * should transliterate all characters between
0709: * <code>pos.start</code> and <code>pos.limit</code> that can be
0710: * unambiguously transliterated, regardless of future insertions
0711: * of text at <code>pos.limit</code>. Upon return,
0712: * <code>pos.start</code> should be in the range
0713: * [<code>originalStart</code>, <code>pos.limit</code>).
0714: * <code>pos.start</code> should be positioned such that
0715: * characters [<code>originalStart</code>, <code>
0716: * pos.start</code>) will not be changed in the future by this
0717: * transliterator and characters [<code>pos.start</code>,
0718: * <code>pos.limit</code>) are unchanged.</li>
0719: * </ul>
0720: *
0721: * <p>Implementations of this method should also obey the
0722: * following invariants:</p>
0723: *
0724: * <ul>
0725: * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
0726: * should be updated to reflect changes in length of the text
0727: * between <code>pos.start</code> and <code>pos.limit</code>. The
0728: * difference <code> pos.contextLimit - pos.limit</code> should
0729: * not change.</li>
0730: *
0731: * <li><code>pos.contextStart</code> should not change.</li>
0732: *
0733: * <li>Upon return, neither <code>pos.start</code> nor
0734: * <code>pos.limit</code> should be less than
0735: * <code>originalStart</code>.</li>
0736: *
0737: * <li>Text before <code>originalStart</code> and text after
0738: * <code>pos.limit</code> should not change.</li>
0739: *
0740: * <li>Text before <code>pos.contextStart</code> and text after
0741: * <code> pos.contextLimit</code> should be ignored.</li>
0742: * </ul>
0743: *
0744: * <p>Subclasses may safely assume that all characters in
0745: * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
0746: * In other words, the filter has already been applied by the time
0747: * this method is called. See
0748: * <code>filteredTransliterate()</code>.
0749: *
0750: * <p>This method is <b>not</b> for public consumption. Calling
0751: * this method directly will transliterate
0752: * [<code>pos.start</code>, <code>pos.limit</code>) without
0753: * applying the filter. End user code should call <code>
0754: * transliterate()</code> instead of this method. Subclass code
0755: * should call <code>filteredTransliterate()</code> instead of
0756: * this method.<p>
0757: *
0758: * @param text the buffer holding transliterated and
0759: * untransliterated text
0760: *
0761: * @param pos the indices indicating the start, limit, context
0762: * start, and context limit of the text.
0763: *
0764: * @param incremental if true, assume more text may be inserted at
0765: * <code>pos.limit</code> and act accordingly. Otherwise,
0766: * transliterate all text between <code>pos.start</code> and
0767: * <code>pos.limit</code> and move <code>pos.start</code> up to
0768: * <code>pos.limit</code>.
0769: *
0770: * @see #transliterate
0771: * @stable ICU 2.0
0772: */
protected abstract void handleTransliterate(Replaceable text, // buffer holding the text; edited in place
        Position pos, // start/limit/context indices; updated by the implementation
        boolean incremental); // true if more text may later be inserted at pos.limit
0775:
/**
 * Top-level transliteration method, handling filtering, incremental and
 * non-incremental transliteration, and rollback.  All transliteration
 * public API methods eventually call this method with a rollback argument
 * of TRUE.  Other entities may call this method but rollback should be
 * FALSE.
 *
 * <p>If this transliterator has a filter, break up the input text into runs
 * of unfiltered characters.  Pass each run to
 * &lt;subclass&gt;.handleTransliterate().
 *
 * <p>In incremental mode, if rollback is TRUE, perform a special
 * incremental procedure in which several passes are made over the input
 * text, adding one character at a time, and committing successful
 * transliterations as they occur.  Unsuccessful transliterations are rolled
 * back and retried with additional characters to give correct results.
 *
 * <p>On return, <code>index.limit</code> is restored to the overall limit
 * that was passed in, adjusted by the net change in text length.
 *
 * @param text the text to be transliterated; it is edited in place, and
 * in rollback mode a scratch copy of the current run is temporarily
 * appended at the end of the text and removed again before returning
 * @param index the position indices
 * @param incremental if TRUE, then assume more characters may be inserted
 * at index.limit, and postpone processing to accommodate future incoming
 * characters
 * @param rollback if TRUE and if incremental is TRUE, then perform special
 * incremental processing, as described above, and undo partial
 * transliterations where necessary.  If incremental is FALSE then this
 * parameter is ignored.
 * @throws RuntimeException if a non-incremental run is handed to
 * handleTransliterate() and it returns with start != limit
 */
private void filteredTransliterate(Replaceable text,
        Position index, boolean incremental, boolean rollback) {
    // Short circuit path for transliterators with no filter when no
    // rollback processing was requested: nothing to narrow and nothing
    // to undo, so hand the whole range straight to the subclass.
    if (filter == null && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == true).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------

    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int globalLimit = index.limit;

    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).

    // Scratch buffer for debug tracing only; stays null unless DEBUG is set,
    // and every use below is guarded by the same DEBUG test.
    StringBuffer log = null;
    if (DEBUG) {
        log = new StringBuffer();
    }

    // RUN LOOP -- one iteration per unfiltered run (a single iteration
    // when there is no filter; see the break at the bottom of the loop).
    for (;;) {

        if (filter != null) {
            // Narrow the range to be transliterated to the first run
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars, one code point at a time
            int c;
            while (index.start < globalLimit
                    && !filter.contains(c = text
                            .char32At(index.start))) {
                index.start += UTF16.getCharCount(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit
                    && filter.contains(c = text
                            .char32At(index.limit))) {
                index.limit += UTF16.getCharCount(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.start == index.limit) {
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of FALSE to force the subclass to
        // complete the transliteration for this run.
        boolean isIncrementalRun = (index.limit < globalLimit ? false
                : incremental);

        // Change in text length produced by the most recent
        // handleTransliterate() call.
        int delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "v" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliterate() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example describes a compound
        // transliterator containing NFD and a specific filter, it can
        // happen with any transliterator which does a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            if (DEBUG) {
                log.setLength(0);
                System.out.println("filteredTransliterate{"
                        + getID()
                        + "}i: IN="
                        + UtilityExtensions
                                .formatInput(text, index));
            }

            int runStart = index.start;
            int runLimit = index.limit;
            int runLength = runLimit - runStart;

            // Make a rollback copy at the end of the string.  The copy is
            // runLength code units long and is deleted again after the
            // pass loop, using the same runLength.
            int rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int passStart = runStart;
            int rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two.
                // NOTE(review): when passLimit has reached runLimit this
                // reads the character just past the run; the rollback copy
                // appended above appears to guarantee that position exists
                // -- confirm against the Replaceable contract.
                int charLength = UTF16.getCharCount(text
                        .char32At(passLimit));
                passLimit += charLength;
                if (passLimit > runLimit) {
                    // The whole run has been offered; stop making passes.
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                if (DEBUG) {
                    log.setLength(0);
                    log.append("filteredTransliterate{" + getID()
                            + "}i: ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, true);

                if (DEBUG) {
                    log.append(" => ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    // This must be computed before the delete below, since
                    // it depends on the pre-delete value of index.limit.
                    int rs = rollbackStart + delta
                            - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.replace(passStart, index.limit, "");

                    // Copy the rollback text back
                    text
                            .copy(rs, rs + uncommittedLength,
                                    passStart);

                    // Restore indices to their original values.  The
                    // subclass adjusted contextLimit by delta, so undo that.
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;

                    if (DEBUG) {
                        log.append(" (ROLLBACK)");
                    }
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }

                if (DEBUG) {
                    System.out.println(Utility.escape(log
                            .toString()));
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.replace(rollbackOrigin,
                    rollbackOrigin + runLength, "");

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            if (DEBUG) {
                log.setLength(0);
                log.append("filteredTransliterate{" + getID()
                        + "}: ");
                UtilityExtensions.formatInput(log, text, index);
            }

            // Remember the limit before the call so the length change can
            // be measured afterwards.
            int limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            if (DEBUG) {
                log.append(" => ");
                UtilityExtensions.formatInput(log, text, index);
            }

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!isIncrementalRun && index.start != index.limit) {
                throw new RuntimeException(
                        "ERROR: Incomplete non-incremental transliteration by "
                                + getID());
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;

            if (DEBUG) {
                System.out.println(Utility.escape(log.toString()));
            }
        }

        // Without a filter the whole range was a single run, and an
        // incremental run is by construction the last one (its limit
        // reached globalLimit), so in either case we are done.
        if (filter == null || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;

    if (DEBUG) {
        System.out.println("filteredTransliterate{" + getID()
                + "}: OUT="
                + UtilityExtensions.formatInput(text, index));
    }
}
1118:
1119: /**
1120: * Transliterate a substring of text, as specified by index, taking filters
1121: * into account. This method is for subclasses that need to delegate to
1122: * another transliterator, such as CompoundTransliterator.
1123: * @param text the text to be transliterated
1124: * @param index the position indices
1125: * @param incremental if TRUE, then assume more characters may be inserted
1126: * at index.limit, and postpone processing to accomodate future incoming
1127: * characters
1128: * @stable ICU 2.0
1129: */
1130: public void filteredTransliterate(Replaceable text, Position index,
1131: boolean incremental) {
1132: filteredTransliterate(text, index, incremental, false);
1133: }
1134:
1135: /**
1136: * Returns the length of the longest context required by this transliterator.
1137: * This is <em>preceding</em> context. The default value is zero, but
1138: * subclasses can change this by calling <code>setMaximumContextLength()</code>.
1139: * For example, if a transliterator translates "ddd" (where
1140: * d is any digit) to "555" when preceded by "(ddd)", then the preceding
1141: * context length is 5, the length of "(ddd)".
1142: *
1143: * @return The maximum number of preceding context characters this
1144: * transliterator needs to examine
1145: * @stable ICU 2.0
1146: */
1147: public final int getMaximumContextLength() {
1148: return maximumContextLength;
1149: }
1150:
1151: /**
1152: * Method for subclasses to use to set the maximum context length.
1153: * @see #getMaximumContextLength
1154: * @stable ICU 2.0
1155: */
1156: protected void setMaximumContextLength(int a) {
1157: if (a < 0) {
1158: throw new IllegalArgumentException(
1159: "Invalid context length " + a);
1160: }
1161: maximumContextLength = a;
1162: }
1163:
1164: /**
1165: * Returns a programmatic identifier for this transliterator.
1166: * If this identifier is passed to <code>getInstance()</code>, it
1167: * will return this object, if it has been registered.
1168: * @see #registerClass
1169: * @see #getAvailableIDs
1170: * @stable ICU 2.0
1171: */
1172: public final String getID() {
1173: return ID;
1174: }
1175:
1176: /**
1177: * Set the programmatic identifier for this transliterator. Only
1178: * for use by subclasses.
1179: * @stable ICU 2.0
1180: */
1181: protected final void setID(String id) {
1182: ID = id;
1183: }
1184:
1185: /**
1186: * Returns a name for this transliterator that is appropriate for
1187: * display to the user in the default locale. See {@link
1188: * #getDisplayName(String,Locale)} for details.
1189: * @stable ICU 2.0
1190: */
1191: public final static String getDisplayName(String ID) {
1192: return getDisplayName(ID, ULocale.getDefault());
1193: }
1194:
1195: /**
1196: * Returns a name for this transliterator that is appropriate for
1197: * display to the user in the given locale. This name is taken
1198: * from the locale resource data in the standard manner of the
1199: * <code>java.text</code> package.
1200: *
1201: * <p>If no localized names exist in the system resource bundles,
1202: * a name is synthesized using a localized
1203: * <code>MessageFormat</code> pattern from the resource data. The
1204: * arguments to this pattern are an integer followed by one or two
1205: * strings. The integer is the number of strings, either 1 or 2.
1206: * The strings are formed by splitting the ID for this
1207: * transliterator at the first '-'. If there is no '-', then the
1208: * entire ID forms the only string.
1209: * @param inLocale the Locale in which the display name should be
1210: * localized.
1211: * @see java.text.MessageFormat
1212: * @stable ICU 2.0
1213: */
1214: public static String getDisplayName(String id, Locale inLocale) {
1215: return getDisplayName(id, ULocale.forLocale(inLocale));
1216: }
1217:
1218: /**
1219: * Returns a name for this transliterator that is appropriate for
1220: * display to the user in the given locale. This name is taken
1221: * from the locale resource data in the standard manner of the
1222: * <code>java.text</code> package.
1223: *
1224: * <p>If no localized names exist in the system resource bundles,
1225: * a name is synthesized using a localized
1226: * <code>MessageFormat</code> pattern from the resource data. The
1227: * arguments to this pattern are an integer followed by one or two
1228: * strings. The integer is the number of strings, either 1 or 2.
1229: * The strings are formed by splitting the ID for this
1230: * transliterator at the first '-'. If there is no '-', then the
1231: * entire ID forms the only string.
1232: * @param inLocale the ULocale in which the display name should be
1233: * localized.
1234: * @see java.text.MessageFormat
1235: * @draft ICU 3.2
1236: * @provisional This API might change or be removed in a future release.
1237: */
1238: public static String getDisplayName(String id, ULocale inLocale) {
1239:
1240: // Resource bundle containing display name keys and the
1241: // RB_RULE_BASED_IDS array.
1242: //
1243: //If we ever integrate this with the Sun JDK, the resource bundle
1244: // root will change to sun.text.resources.LocaleElements
1245:
1246: ICUResourceBundle bundle = (ICUResourceBundle) UResourceBundle
1247: .getBundleInstance(
1248: ICUResourceBundle.ICU_TRANSLIT_BASE_NAME,
1249: inLocale);
1250:
1251: // Normalize the ID
1252: String stv[] = TransliteratorIDParser.IDtoSTV(id);
1253: if (stv == null) {
1254: // No target; malformed id
1255: return "";
1256: }
1257: String ID = stv[0] + '-' + stv[1];
1258: if (stv[2] != null && stv[2].length() > 0) {
1259: ID = ID + '/' + stv[2];
1260: }
1261:
1262: // Use the registered display name, if any
1263: String n = (String) displayNameCache
1264: .get(new CaseInsensitiveString(ID));
1265: if (n != null) {
1266: return n;
1267: }
1268:
1269: // Use display name for the entire transliterator, if it
1270: // exists.
1271: try {
1272: return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
1273: } catch (MissingResourceException e) {
1274: }
1275:
1276: try {
1277: // Construct the formatter first; if getString() fails
1278: // we'll exit the try block
1279: MessageFormat format = new MessageFormat(bundle
1280: .getString(RB_DISPLAY_NAME_PATTERN));
1281: // Construct the argument array
1282: Object[] args = new Object[] { new Integer(2), stv[0],
1283: stv[1] };
1284:
1285: // Use display names for the scripts, if they exist
1286: for (int j = 1; j <= 2; ++j) {
1287: try {
1288: args[j] = bundle
1289: .getString(RB_SCRIPT_DISPLAY_NAME_PREFIX
1290: + (String) args[j]);
1291: } catch (MissingResourceException e) {
1292: }
1293: }
1294:
1295: // Format it using the pattern in the resource
1296: return (stv[2].length() > 0) ? (format.format(args) + '/' + stv[2])
1297: : format.format(args);
1298: } catch (MissingResourceException e2) {
1299: }
1300:
1301: // We should not reach this point unless there is something
1302: // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
1303: // been deleted from the root RB_LOCALE_ELEMENTS resource.
1304: throw new RuntimeException();
1305: }
1306:
1307: /**
1308: * Returns the filter used by this transliterator, or <tt>null</tt>
1309: * if this transliterator uses no filter.
1310: * @stable ICU 2.0
1311: */
1312: public final UnicodeFilter getFilter() {
1313: return filter;
1314: }
1315:
1316: /**
1317: * Changes the filter used by this transliterator. If the filter
1318: * is set to <tt>null</tt> then no filtering will occur.
1319: *
1320: * <p>Callers must take care if a transliterator is in use by
1321: * multiple threads. The filter should not be changed by one
1322: * thread while another thread may be transliterating.
1323: * @stable ICU 2.0
1324: */
1325: public void setFilter(UnicodeFilter filter) {
1326: this .filter = filter;
1327: }
1328:
1329: /**
1330: * Returns a <code>Transliterator</code> object given its ID.
1331: * The ID must be either a system transliterator ID or a ID registered
1332: * using <code>registerClass()</code>.
1333: *
1334: * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1335: * @return A <code>Transliterator</code> object with the given ID
1336: * @exception IllegalArgumentException if the given ID is invalid.
1337: * @stable ICU 2.0
1338: */
1339: public static final Transliterator getInstance(String ID) {
1340: return getInstance(ID, FORWARD);
1341: }
1342:
1343: /**
1344: * Returns a <code>Transliterator</code> object given its ID.
1345: * The ID must be either a system transliterator ID or a ID registered
1346: * using <code>registerClass()</code>.
1347: *
1348: * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1349: * @param dir either FORWARD or REVERSE. If REVERSE then the
1350: * inverse of the given ID is instantiated.
1351: * @return A <code>Transliterator</code> object with the given ID
1352: * @exception IllegalArgumentException if the given ID is invalid.
1353: * @see #registerClass
1354: * @see #getAvailableIDs
1355: * @see #getID
1356: * @stable ICU 2.0
1357: */
1358: public static Transliterator getInstance(String ID, int dir) {
1359: StringBuffer canonID = new StringBuffer();
1360: Vector list = new Vector();
1361: UnicodeSet[] globalFilter = new UnicodeSet[1];
1362: if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID,
1363: list, globalFilter)) {
1364: throw new IllegalArgumentException("Invalid ID " + ID);
1365: }
1366:
1367: TransliteratorIDParser.instantiateList(list);
1368:
1369: // assert(list.size() > 0);
1370: Transliterator t = null;
1371: if (list.size() > 1 || Utility.indexOf(canonID, ";") >= 0) {
1372: // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
1373: // has one child transliterator. This is so that toRules() will return the right thing
1374: // (without any inactive ID), but our main ID still comes out correct. That is, if we
1375: // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
1376: // even though the ID is "(Lower);Latin-Greek;".
1377: t = new CompoundTransliterator(list);
1378: } else {
1379: t = (Transliterator) list.elementAt(0);
1380: }
1381:
1382: t.setID(canonID.toString());
1383: if (globalFilter[0] != null) {
1384: t.setFilter(globalFilter[0]);
1385: }
1386: return t;
1387: }
1388:
1389: /**
1390: * Create a transliterator from a basic ID. This is an ID
1391: * containing only the forward direction source, target, and
1392: * variant.
1393: * @param id a basic ID of the form S-T or S-T/V.
1394: * @param canonID canonical ID to apply to the result, or
1395: * null to leave the ID unchanged
1396: * @return a newly created Transliterator or null if the ID is
1397: * invalid.
1398: */
1399: static Transliterator getBasicInstance(String id, String canonID) {
1400: StringBuffer s = new StringBuffer();
1401: Transliterator t = registry.get(id, s);
1402: if (s.length() != 0) {
1403: // assert(t==0);
1404: // Instantiate an alias
1405: t = getInstance(s.toString(), FORWARD);
1406: }
1407: if (t != null && canonID != null) {
1408: t.setID(canonID);
1409: }
1410: return t;
1411: }
1412:
1413: /**
1414: * Returns a <code>Transliterator</code> object constructed from
1415: * the given rule string. This will be a RuleBasedTransliterator,
1416: * if the rule string contains only rules, or a
1417: * CompoundTransliterator, if it contains ID blocks, or a
1418: * NullTransliterator, if it contains ID blocks which parse as
1419: * empty for the given direction.
1420: * @stable ICU 2.0
1421: */
1422: public static final Transliterator createFromRules(String ID,
1423: String rules, int dir) {
1424: Transliterator t = null;
1425:
1426: TransliteratorParser parser = new TransliteratorParser();
1427: parser.parse(rules, dir);
1428:
1429: // NOTE: The logic here matches that in TransliteratorRegistry.
1430: if (parser.idBlockVector.size() == 0
1431: && parser.dataVector.size() == 0) {
1432: t = new NullTransliterator();
1433: } else if (parser.idBlockVector.size() == 0
1434: && parser.dataVector.size() == 1) {
1435: t = new RuleBasedTransliterator(ID,
1436: (RuleBasedTransliterator.Data) parser.dataVector
1437: .get(0), null);
1438: } else if (parser.idBlockVector.size() == 1
1439: && parser.dataVector.size() == 0) {
1440: // idBlock, no data -- this is an alias. The ID has
1441: // been munged from reverse into forward mode, if
1442: // necessary, so instantiate the ID in the forward
1443: // direction.
1444: if (parser.compoundFilter != null)
1445: t = getInstance(parser.compoundFilter.toPattern(false)
1446: + ";" + (String) parser.idBlockVector.get(0));
1447: else
1448: t = getInstance((String) parser.idBlockVector.get(0));
1449:
1450: if (t != null) {
1451: t.setID(ID);
1452: }
1453: } else {
1454: Vector transliterators = new Vector();
1455: int passNumber = 1;
1456:
1457: int limit = Math.max(parser.idBlockVector.size(),
1458: parser.dataVector.size());
1459: for (int i = 0; i < limit; i++) {
1460: if (i < parser.idBlockVector.size()) {
1461: String idBlock = (String) parser.idBlockVector
1462: .get(i);
1463: if (idBlock.length() > 0) {
1464: Transliterator temp = getInstance(idBlock);
1465: if (!(temp instanceof NullTransliterator))
1466: transliterators.add(getInstance(idBlock));
1467: }
1468: }
1469: if (i < parser.dataVector.size()) {
1470: RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data) parser.dataVector
1471: .get(i);
1472: transliterators.add(new RuleBasedTransliterator(
1473: "%Pass" + passNumber++, data, null));
1474: }
1475: }
1476:
1477: t = new CompoundTransliterator(transliterators,
1478: passNumber - 1);
1479: t.setID(ID);
1480: if (parser.compoundFilter != null) {
1481: t.setFilter(parser.compoundFilter);
1482: }
1483: }
1484:
1485: return t;
1486: }
1487:
1488: /**
1489: * Returns a rule string for this transliterator.
1490: * @param escapeUnprintable if true, then unprintable characters
1491: * will be converted to escape form backslash-'u' or
1492: * backslash-'U'.
1493: * @stable ICU 2.0
1494: */
1495: public String toRules(boolean escapeUnprintable) {
1496: return baseToRules(escapeUnprintable);
1497: }
1498:
1499: /**
1500: * Returns a rule string for this transliterator. This is
1501: * a non-overrideable base class implementation that subclasses
1502: * may call. It simply munges the ID into the correct format,
1503: * that is, "foo" => "::foo".
1504: * @param escapeUnprintable if true, then unprintable characters
1505: * will be converted to escape form backslash-'u' or
1506: * backslash-'U'.
1507: * @stable ICU 2.0
1508: */
1509: protected final String baseToRules(boolean escapeUnprintable) {
1510: // The base class implementation of toRules munges the ID into
1511: // the correct format. That is: foo => ::foo
1512: // KEEP in sync with rbt_pars
1513: if (escapeUnprintable) {
1514: StringBuffer rulesSource = new StringBuffer();
1515: String id = getID();
1516: for (int i = 0; i < id.length();) {
1517: int c = UTF16.charAt(id, i);
1518: if (!Utility.escapeUnprintable(rulesSource, c)) {
1519: UTF16.append(rulesSource, c);
1520: }
1521: i += UTF16.getCharCount(c);
1522: }
1523: rulesSource.insert(0, "::");
1524: rulesSource.append(ID_DELIM);
1525: return rulesSource.toString();
1526: }
1527: return "::" + getID() + ID_DELIM;
1528: }
1529:
1530: /**
1531: * Return the elements that make up this transliterator. For
1532: * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1533: * were created, the return value of this method would be an array
1534: * of the three transliterator objects that make up that
1535: * transliterator: [NFD, Jamo-Latin, Latin-Greek].
1536: *
1537: * <p>If this transliterator is not composed of other
1538: * transliterators, then this method will return an array of
1539: * length one containing a reference to this transliterator.
1540: * @return an array of one or more transliterators that make up
1541: * this transliterator
1542: * @stable ICU 3.0
1543: */
1544: public Transliterator[] getElements() {
1545: Transliterator result[];
1546: if (this instanceof CompoundTransliterator) {
1547: CompoundTransliterator cpd = (CompoundTransliterator) this ;
1548: result = new Transliterator[cpd.getCount()];
1549: for (int i = 0; i < result.length; ++i) {
1550: result[i] = cpd.getTransliterator(i);
1551: }
1552: } else {
1553: result = new Transliterator[] { this };
1554: }
1555: return result;
1556: }
1557:
1558: /**
1559: * Returns the set of all characters that may be modified in the
1560: * input text by this Transliterator. This incorporates this
1561: * object's current filter; if the filter is changed, the return
1562: * value of this function will change. The default implementation
1563: * returns an empty set. Some subclasses may override {@link
1564: * #handleGetSourceSet} to return a more precise result. The
1565: * return result is approximate in any case and is intended for
1566: * use by tests, tools, or utilities.
1567: * @see #getTargetSet
1568: * @see #handleGetSourceSet
1569: * @stable ICU 2.2
1570: */
1571: public final UnicodeSet getSourceSet() {
1572: UnicodeSet set = handleGetSourceSet();
1573: if (filter != null) {
1574: UnicodeSet filterSet;
1575: // Most, but not all filters will be UnicodeSets. Optimize for
1576: // the high-runner case.
1577: try {
1578: filterSet = (UnicodeSet) filter;
1579: } catch (ClassCastException e) {
1580: filterSet = new UnicodeSet();
1581: filter.addMatchSetTo(filterSet);
1582: }
1583: set.retainAll(filterSet);
1584: }
1585: return set;
1586: }
1587:
1588: /**
1589: * Framework method that returns the set of all characters that
1590: * may be modified in the input text by this Transliterator,
1591: * ignoring the effect of this object's filter. The base class
1592: * implementation returns the empty set. Subclasses that wish to
1593: * implement this should override this method.
1594: * @return the set of characters that this transliterator may
1595: * modify. The set may be modified, so subclasses should return a
1596: * newly-created object.
1597: * @see #getSourceSet
1598: * @see #getTargetSet
1599: * @stable ICU 2.2
1600: */
1601: protected UnicodeSet handleGetSourceSet() {
1602: return new UnicodeSet();
1603: }
1604:
1605: /**
1606: * Returns the set of all characters that may be generated as
1607: * replacement text by this transliterator. The default
1608: * implementation returns the empty set. Some subclasses may
1609: * override this method to return a more precise result. The
1610: * return result is approximate in any case and is intended for
1611: * use by tests, tools, or utilities requiring such
1612: * meta-information.
1613: * @see #getTargetSet
1614: * @stable ICU 2.2
1615: */
1616: public UnicodeSet getTargetSet() {
1617: return new UnicodeSet();
1618: }
1619:
1620: /**
1621: * Returns this transliterator's inverse. See the class
1622: * documentation for details. This implementation simply inverts
1623: * the two entities in the ID and attempts to retrieve the
1624: * resulting transliterator. That is, if <code>getID()</code>
1625: * returns "A-B", then this method will return the result of
1626: * <code>getInstance("B-A")</code>, or <code>null</code> if that
1627: * call fails.
1628: *
1629: * <p>Subclasses with knowledge of their inverse may wish to
1630: * override this method.
1631: *
1632: * @return a transliterator that is an inverse, not necessarily
1633: * exact, of this transliterator, or <code>null</code> if no such
1634: * transliterator is registered.
1635: * @see #registerClass
1636: * @stable ICU 2.0
1637: */
1638: public final Transliterator getInverse() {
1639: return getInstance(ID, REVERSE);
1640: }
1641:
1642: /**
1643: * Registers a subclass of <code>Transliterator</code> with the
1644: * system. This subclass must have a public constructor taking no
1645: * arguments. When that constructor is called, the resulting
1646: * object must return the <code>ID</code> passed to this method if
1647: * its <code>getID()</code> method is called.
1648: *
1649: * @param ID the result of <code>getID()</code> for this
1650: * transliterator
1651: * @param transClass a subclass of <code>Transliterator</code>
1652: * @see #unregister
1653: * @stable ICU 2.0
1654: */
1655: public static void registerClass(String ID, Class transClass,
1656: String displayName) {
1657: registry.put(ID, transClass, true);
1658: if (displayName != null) {
1659: displayNameCache.put(new CaseInsensitiveString(ID),
1660: displayName);
1661: }
1662: }
1663:
1664: /**
1665: * Register a factory object with the given ID. The factory
1666: * method should return a new instance of the given transliterator.
1667: * @param ID the ID of this transliterator
1668: * @param factory the factory object
1669: * @stable ICU 2.0
1670: */
1671: public static void registerFactory(String ID, Factory factory) {
1672: registry.put(ID, factory, true);
1673: }
1674:
1675: /**
1676: * Register a Transliterator object with the given ID.
1677: * @param trans the Transliterator object
1678: * @stable ICU 2.2
1679: */
1680: public static void registerInstance(Transliterator trans) {
1681: registry.put(trans.getID(), trans, true);
1682: }
1683:
1684: /**
1685: * Register a Transliterator object with the given ID.
1686: * @param ID the ID of this transliterator
1687: * @param trans the Transliterator object
1688: * @internal
1689: */
1690: static void registerInstance(Transliterator trans, boolean visible) {
1691: registry.put(trans.getID(), trans, visible);
1692: }
1693:
1694: /**
1695: * Register an ID as an alias of another ID. Instantiating
1696: * alias ID produces the same result as instantiating the original ID.
1697: * This is generally used to create short aliases of compound IDs.
1698: * @param aliasID The new ID being registered.
1699: * @param realID The existing ID that the new ID should be an alias of.
1700: * @draft ICU 3.4.1
1701: * @provisional This API might change or be removed in a future release.
1702: */
1703: public static void registerAlias(String aliasID, String realID) {
1704: registry.put(aliasID, realID, true);
1705: }
1706:
1707: /**
1708: * Register two targets as being inverses of one another. For
1709: * example, calling registerSpecialInverse("NFC", "NFD", true) causes
1710: * Transliterator to form the following inverse relationships:
1711: *
1712: * <pre>NFC => NFD
1713: * Any-NFC => Any-NFD
1714: * NFD => NFC
1715: * Any-NFD => Any-NFC</pre>
1716: *
1717: * (Without the special inverse registration, the inverse of NFC
1718: * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
1719: * that the presence or absence of "Any-" is preserved.
1720: *
1721: * <p>The relationship is symmetrical; registering (a, b) is
1722: * equivalent to registering (b, a).
1723: *
1724: * <p>The relevant IDs must still be registered separately as
1725: * factories or classes.
1726: *
1727: * <p>Only the targets are specified. Special inverses always
1728: * have the form Any-Target1 <=> Any-Target2. The target should
1729: * have canonical casing (the casing desired to be produced when
1730: * an inverse is formed) and should contain no whitespace or other
1731: * extraneous characters.
1732: *
1733: * @param target the target against which to register the inverse
1734: * @param inverseTarget the inverse of target, that is
1735: * Any-target.getInverse() => Any-inverseTarget
1736: * @param bidirectional if true, register the reverse relation
1737: * as well, that is, Any-inverseTarget.getInverse() => Any-target
1738: * @internal
1739: */
1740: static void registerSpecialInverse(String target,
1741: String inverseTarget, boolean bidirectional) {
1742: TransliteratorIDParser.registerSpecialInverse(target,
1743: inverseTarget, bidirectional);
1744: }
1745:
1746: /**
1747: * Unregisters a transliterator or class. This may be either
1748: * a system transliterator or a user transliterator or class.
1749: *
1750: * @param ID the ID of the transliterator or class
1751: * @see #registerClass
1752: * @stable ICU 2.0
1753: */
1754: public static void unregister(String ID) {
1755: displayNameCache.remove(new CaseInsensitiveString(ID));
1756: registry.remove(ID);
1757: }
1758:
1759: /**
1760: * Returns an enumeration over the programmatic names of registered
1761: * <code>Transliterator</code> objects. This includes both system
1762: * transliterators and user transliterators registered using
1763: * <code>registerClass()</code>. The enumerated names may be
1764: * passed to <code>getInstance()</code>.
1765: *
1766: * @return An <code>Enumeration</code> over <code>String</code> objects
1767: * @see #getInstance
1768: * @see #registerClass
1769: * @stable ICU 2.0
1770: */
1771: public static final Enumeration getAvailableIDs() {
1772: return registry.getAvailableIDs();
1773: }
1774:
1775: /**
1776: * Returns an enumeration over the source names of registered
1777: * transliterators. Source names may be passed to
1778: * getAvailableTargets() to obtain available targets for each
1779: * source.
1780: * @stable ICU 2.0
1781: */
1782: public static final Enumeration getAvailableSources() {
1783: return registry.getAvailableSources();
1784: }
1785:
1786: /**
1787: * Returns an enumeration over the target names of registered
1788: * transliterators having a given source name. Target names may
1789: * be passed to getAvailableVariants() to obtain available
1790: * variants for each source and target pair.
1791: * @stable ICU 2.0
1792: */
1793: public static final Enumeration getAvailableTargets(String source) {
1794: return registry.getAvailableTargets(source);
1795: }
1796:
1797: /**
1798: * Returns an enumeration over the variant names of registered
1799: * transliterators having a given source name and target name.
1800: * @stable ICU 2.0
1801: */
1802: public static final Enumeration getAvailableVariants(String source,
1803: String target) {
1804: return registry.getAvailableVariants(source, target);
1805: }
1806:
1807: private static final String INDEX = "index",
1808: RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs";
1809: static {
1810: registry = new TransliteratorRegistry();
1811:
1812: // The display name cache starts out empty
1813: displayNameCache = new Hashtable();
1814: /* The following code parses the index table located in
1815: * icu/data/translit/root.txt. The index is an n x 4 table
1816: * that follows this format:
1817: * <id>{
1818: * file{
1819: * resource{"<resource>"}
1820: * direction{"<direction>"}
1821: * }
1822: * }
1823: * <id>{
1824: * internal{
1825: * resource{"<resource>"}
1826: * direction{"<direction"}
1827: * }
1828: * }
1829: * <id>{
1830: * alias{"<getInstanceArg"}
1831: * }
1832: * <id> is the ID of the system transliterator being defined. These
1833: * are public IDs enumerated by Transliterator.getAvailableIDs(),
1834: * unless the second field is "internal".
1835: *
1836: * <resource> is a ResourceReader resource name. Currently these refer
1837: * to file names under com/ibm/text/resources. This string is passed
1838: * directly to ResourceReader, together with <encoding>.
1839: *
1840: * <direction> is either "FORWARD" or "REVERSE".
1841: *
1842: * <getInstanceArg> is a string to be passed directly to
1843: * Transliterator.getInstance(). The returned Transliterator object
1844: * then has its ID changed to <id> and is returned.
1845: *
1846: * The extra blank field on "alias" lines is to make the array square.
1847: */
1848: ICUResourceBundle bundle, transIDs, colBund;
1849: bundle = (ICUResourceBundle) UResourceBundle.getBundleInstance(
1850: ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, INDEX);
1851: transIDs = bundle.get(RB_RULE_BASED_IDS);
1852:
1853: int row, maxRows;
1854: maxRows = transIDs.getSize();
1855: for (row = 0; row < maxRows; row++) {
1856: colBund = transIDs.get(row);
1857: String ID = colBund.getKey();
1858: ICUResourceBundle res = colBund.get(0);
1859: String type = res.getKey();
1860: if (type.equals("file") || type.equals("internal")) {
1861: // Rest of line is <resource>:<encoding>:<direction>
1862: // pos colon c2
1863: String resString = res.getString("resource");
1864: int dir;
1865: String direction = res.getString("direction");
1866: switch (direction.charAt(0)) {
1867: case 'F':
1868: dir = FORWARD;
1869: break;
1870: case 'R':
1871: dir = REVERSE;
1872: break;
1873: default:
1874: throw new RuntimeException(
1875: "Can't parse direction: " + direction);
1876: }
1877: registry.put(ID, resString, // resource
1878: "UTF-16", // encoding
1879: dir, !type.equals("internal"));
1880: } else if (type.equals("alias")) {
1881: //'alias'; row[2]=createInstance argument
1882: String resString = res.getString();
1883: registry.put(ID, resString, true);
1884: } else {
1885: // Unknown type
1886: throw new RuntimeException("Unknow type: " + type);
1887: }
1888: }
1889:
1890: registerSpecialInverse(NullTransliterator.SHORT_ID,
1891: NullTransliterator.SHORT_ID, false);
1892:
1893: // Register non-rule-based transliterators
1894: registerClass(NullTransliterator._ID, NullTransliterator.class,
1895: null);
1896: RemoveTransliterator.register();
1897: EscapeTransliterator.register();
1898: UnescapeTransliterator.register();
1899: LowercaseTransliterator.register();
1900: UppercaseTransliterator.register();
1901: TitlecaseTransliterator.register();
1902: UnicodeNameTransliterator.register();
1903: NameUnicodeTransliterator.register();
1904: NormalizationTransliterator.register();
1905: BreakTransliterator.register();
1906: AnyTransliterator.register(); // do this last!
1907: }
1908:
1909: /**
1910: * The factory interface for transliterators. Transliterator
1911: * subclasses can register factory objects for IDs using the
1912: * registerFactory() method of Transliterator. When invoked, the
1913: * factory object will be passed the ID being instantiated. This
1914: * makes it possible to register one factory method to more than
1915: * one ID, or for a factory method to parameterize its result
1916: * based on the variant.
1917: * @stable ICU 2.0
1918: */
1919: public static interface Factory {
1920: /**
1921: * Return a transliterator for the given ID.
1922: * @stable ICU 2.0
1923: */
1924: Transliterator getInstance(String ID);
1925: }
1926: }
|