001: /*
002: *****************************************************************
003: * Copyright (c) 2002-2006, International Business Machines Corporation
004: * and others. All Rights Reserved.
005: *****************************************************************
006: * Date Name Description
007: * 06/06/2002 aliu Creation.
008: *****************************************************************
009: */
010: package com.ibm.icu.text;
011:
012: import com.ibm.icu.lang.UScript;
013: import java.lang.Math;
014: import java.util.Enumeration;
015: import java.util.HashSet;
016: import java.util.HashMap;
017: import java.util.Map;
018: import java.util.MissingResourceException;
019:
020: /**
021: * A transliterator that translates multiple input scripts to a single
022: * output script. It is named Any-T or Any-T/V, where T is the target
023: * and V is the optional variant. The target T is a script.
024: *
025: * <p>An AnyTransliterator partitions text into runs of the same
026: * script, together with adjacent COMMON or INHERITED characters.
027: * After determining the script of each run, it transliterates from
028: * that script to the given target/variant. It does so by
029: * instantiating a transliterator from the source script to the
030: * target/variant. If a run consists only of the target script,
031: * COMMON, or INHERITED characters, then the run is not changed.
032: *
033: * <p>At startup, all possible AnyTransliterators are registered with
034: * the system, as determined by examining the registered script
035: * transliterators.
036: *
037: * @since ICU 2.2
038: * @author Alan Liu
039: */
040: class AnyTransliterator extends Transliterator {
041:
042: //------------------------------------------------------------
043: // Constants
044:
045: static final char TARGET_SEP = '-';
046: static final char VARIANT_SEP = '/';
047: static final String ANY = "Any";
048: static final String NULL_ID = "Null";
049: static final String LATIN_PIVOT = "-Latin;Latin-";
050:
051: /**
052: * Cache mapping UScriptCode values to Transliterator*.
053: */
054: private Map cache;
055:
056: /**
057: * The target or target/variant string.
058: */
059: private String target;
060:
061: /**
062: * The target script code. Never USCRIPT_INVALID_CODE.
063: */
064: private int targetScript;
065:
066: /**
067: * Implements {@link Transliterator#handleTransliterate}.
068: */
069: protected void handleTransliterate(Replaceable text, Position pos,
070: boolean isIncremental) {
071: int allStart = pos.start;
072: int allLimit = pos.limit;
073:
074: ScriptRunIterator it = new ScriptRunIterator(text,
075: pos.contextStart, pos.contextLimit);
076:
077: while (it.next()) {
078: // Ignore runs in the ante context
079: if (it.limit <= allStart)
080: continue;
081:
082: // Try to instantiate transliterator from it.scriptCode to
083: // our target or target/variant
084: Transliterator t = getTransliterator(it.scriptCode);
085:
086: if (t == null) {
087: // We have no transliterator. Do nothing, but keep
088: // pos.start up to date.
089: pos.start = it.limit;
090: continue;
091: }
092:
093: // If the run end is before the transliteration limit, do
094: // a non-incremental transliteration. Otherwise do an
095: // incremental one.
096: boolean incremental = isIncremental
097: && (it.limit >= allLimit);
098:
099: pos.start = Math.max(allStart, it.start);
100: pos.limit = Math.min(allLimit, it.limit);
101: int limit = pos.limit;
102: t.filteredTransliterate(text, pos, incremental);
103: int delta = pos.limit - limit;
104: allLimit += delta;
105: it.adjustLimit(delta);
106:
107: // We're done if we enter the post context
108: if (it.limit >= allLimit)
109: break;
110: }
111:
112: // Restore limit. pos.start is fine where the last transliterator
113: // left it, or at the end of the last run.
114: pos.limit = allLimit;
115: }
116:
117: /**
118: * Private constructor
119: * @param id the ID of the form S-T or S-T/V, where T is theTarget
120: * and V is theVariant. Must not be empty.
121: * @param theTarget the target name. Must not be empty, and must
122: * name a script corresponding to theTargetScript.
123: * @param theVariant the variant name, or the empty string if
124: * there is no variant
125: * @param theTargetScript the script code corresponding to
126: * theTarget.
127: */
128: private AnyTransliterator(String id, String theTarget,
129: String theVariant, int theTargetScript) {
130: super (id, null);
131: targetScript = theTargetScript;
132: cache = new HashMap();
133:
134: target = theTarget;
135: if (theVariant.length() > 0) {
136: target = theTarget + VARIANT_SEP + theVariant;
137: }
138: }
139:
140: /**
141: * Returns a transliterator from the given source to our target or
142: * target/variant. Returns NULL if the source is the same as our
143: * target script, or if the source is USCRIPT_INVALID_CODE.
144: * Caches the result and returns the same transliterator the next
145: * time. The caller does NOT own the result and must not delete
146: * it.
147: */
148: private Transliterator getTransliterator(int source) {
149: if (source == targetScript || source == UScript.INVALID_CODE) {
150: return null;
151: }
152:
153: Integer key = new Integer(source);
154: Transliterator t = (Transliterator) cache.get(key);
155: if (t == null) {
156: String sourceName = UScript.getName(source);
157: String id = sourceName + TARGET_SEP + target;
158:
159: try {
160: t = Transliterator.getInstance(id, FORWARD);
161: } catch (RuntimeException e) {
162: }
163: if (t == null) {
164:
165: // Try to pivot around Latin, our most common script
166: id = sourceName + LATIN_PIVOT + target;
167: try {
168: t = Transliterator.getInstance(id, FORWARD);
169: } catch (RuntimeException e) {
170: }
171: }
172:
173: if (t != null) {
174: cache.put(key, t);
175: }
176: }
177:
178: return t;
179: }
180:
181: /**
182: * Registers standard transliterators with the system. Called by
183: * Transliterator during initialization. Scan all current targets
184: * and register those that are scripts T as Any-T/V.
185: */
186: static void register() {
187:
188: HashSet seen = new HashSet();
189:
190: for (Enumeration s = Transliterator.getAvailableSources(); s
191: .hasMoreElements();) {
192: String source = (String) s.nextElement();
193:
194: // Ignore the "Any" source
195: if (source.equalsIgnoreCase(ANY))
196: continue;
197:
198: for (Enumeration t = Transliterator
199: .getAvailableTargets(source); t.hasMoreElements();) {
200: String target = (String) t.nextElement();
201:
202: // Only process each target once
203: if (seen.contains(target))
204: continue;
205: seen.add(target);
206:
207: // Get the script code for the target. If not a script, ignore.
208: int targetScript = scriptNameToCode(target);
209: if (targetScript == UScript.INVALID_CODE)
210: continue;
211:
212: for (Enumeration v = Transliterator
213: .getAvailableVariants(source, target); v
214: .hasMoreElements();) {
215: String variant = (String) v.nextElement();
216:
217: String id;
218: id = TransliteratorIDParser.STVtoID(ANY, target,
219: variant);
220: AnyTransliterator trans = new AnyTransliterator(id,
221: target, variant, targetScript);
222: Transliterator.registerInstance(trans);
223: Transliterator.registerSpecialInverse(target,
224: NULL_ID, false);
225: }
226: }
227: }
228: }
229:
230: /**
231: * Return the script code for a given name, or
232: * UScript.INVALID_CODE if not found.
233: */
234: private static int scriptNameToCode(String name) {
235: try {
236: int[] codes = UScript.getCode(name);
237: return codes != null ? codes[0] : UScript.INVALID_CODE;
238: } catch (MissingResourceException e) {
239: return UScript.INVALID_CODE;
240: }
241: }
242:
243: //------------------------------------------------------------
244: // ScriptRunIterator
245:
246: /**
247: * Returns a series of ranges corresponding to scripts. They will be
248: * of the form:
249: *
250: * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
251: * | | - first run (start, limit)
252: * | | - second run (start, limit)
253: *
254: * That is, the runs will overlap. The reason for this is so that a
255: * transliterator can consider common characters both before and after
256: * the scripts.
257: */
258: private static class ScriptRunIterator {
259:
260: private Replaceable text;
261: private int textStart;
262: private int textLimit;
263:
264: /**
265: * The code of the current run, valid after next() returns. May
266: * be UScript.INVALID_CODE if and only if the entire text is
267: * COMMON/INHERITED.
268: */
269: public int scriptCode;
270:
271: /**
272: * The start of the run, inclusive, valid after next() returns.
273: */
274: public int start;
275:
276: /**
277: * The end of the run, exclusive, valid after next() returns.
278: */
279: public int limit;
280:
281: /**
282: * Constructs a run iterator over the given text from start
283: * (inclusive) to limit (exclusive).
284: */
285: public ScriptRunIterator(Replaceable text, int start, int limit) {
286: this .text = text;
287: this .textStart = start;
288: this .textLimit = limit;
289: this .limit = start;
290: }
291:
292: /**
293: * Returns TRUE if there are any more runs. TRUE is always
294: * returned at least once. Upon return, the caller should
295: * examine scriptCode, start, and limit.
296: */
297: public boolean next() {
298: int ch;
299: int s;
300:
301: scriptCode = UScript.INVALID_CODE; // don't know script yet
302: start = limit;
303:
304: // Are we done?
305: if (start == textLimit) {
306: return false;
307: }
308:
309: // Move start back to include adjacent COMMON or INHERITED
310: // characters
311: while (start > textStart) {
312: ch = text.char32At(start - 1); // look back
313: s = UScript.getScript(ch);
314: if (s == UScript.COMMON || s == UScript.INHERITED) {
315: --start;
316: } else {
317: break;
318: }
319: }
320:
321: // Move limit ahead to include COMMON, INHERITED, and characters
322: // of the current script.
323: while (limit < textLimit) {
324: ch = text.char32At(limit); // look ahead
325: s = UScript.getScript(ch);
326: if (s != UScript.COMMON && s != UScript.INHERITED) {
327: if (scriptCode == UScript.INVALID_CODE) {
328: scriptCode = s;
329: } else if (s != scriptCode) {
330: break;
331: }
332: }
333: ++limit;
334: }
335:
336: // Return TRUE even if the entire text is COMMON / INHERITED, in
337: // which case scriptCode will be UScript.INVALID_CODE.
338: return true;
339: }
340:
341: /**
342: * Adjusts internal indices for a change in the limit index of the
343: * given delta. A positive delta means the limit has increased.
344: */
345: public void adjustLimit(int delta) {
346: limit += delta;
347: textLimit += delta;
348: }
349: }
350: }
351:
352: //eof
|