001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import java.util.*;
010: import com.ibm.icu.impl.UtilityExtensions;
011:
012: /**
013: * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
014: * the transliteration in one direction from one set of characters or short
015: * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
016: * two such sets, one for the forward direction, and one for the reverse.
017: *
018: * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
019: * finding a matching rule at a given point in the text. This is accomplished
020: * by the <code>findMatch()</code> method.
021: *
022: * <p>Copyright © IBM Corporation 1999. All rights reserved.
023: *
024: * @author Alan Liu
025: */
026: class TransliterationRuleSet {
027: /**
028: * Vector of rules, in the order added.
029: */
030: private Vector ruleVector;
031:
032: /**
033: * Length of the longest preceding context
034: */
035: private int maxContextLength;
036:
037: /**
038: * Sorted and indexed table of rules. This is created by freeze() from
039: * the rules in ruleVector. rules.length >= ruleVector.size(), and the
040: * references in rules[] are aliases of the references in ruleVector.
041: * A single rule in ruleVector is listed one or more times in rules[].
042: */
043: private TransliterationRule[] rules;
044:
045: /**
046: * Index table. For text having a first character c, compute x = c&0xFF.
047: * Now use rules[index[x]..index[x+1]-1]. This index table is created by
048: * freeze().
049: */
050: private int[] index;
051:
052: private static final String COPYRIGHT = "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
053:
054: /**
055: * Construct a new empty rule set.
056: */
057: public TransliterationRuleSet() {
058: ruleVector = new Vector();
059: maxContextLength = 0;
060: }
061:
062: /**
063: * Return the maximum context length.
064: * @return the length of the longest preceding context.
065: */
066: public int getMaximumContextLength() {
067: return maxContextLength;
068: }
069:
070: /**
071: * Add a rule to this set. Rules are added in order, and order is
072: * significant.
073: * @param rule the rule to add
074: */
075: public void addRule(TransliterationRule rule) {
076: ruleVector.addElement(rule);
077: int len;
078: if ((len = rule.getAnteContextLength()) > maxContextLength) {
079: maxContextLength = len;
080: }
081:
082: rules = null;
083: }
084:
085: /**
086: * Close this rule set to further additions, check it for masked rules,
087: * and index it to optimize performance.
088: * @exception IllegalArgumentException if some rules are masked
089: */
090: public void freeze() {
091: /* Construct the rule array and index table. We reorder the
092: * rules by sorting them into 256 bins. Each bin contains all
093: * rules matching the index value for that bin. A rule
094: * matches an index value if string whose first key character
095: * has a low byte equal to the index value can match the rule.
096: *
097: * Each bin contains zero or more rules, in the same order
098: * they were found originally. However, the total rules in
099: * the bins may exceed the number in the original vector,
100: * since rules that have a variable as their first key
101: * character will generally fall into more than one bin.
102: *
103: * That is, each bin contains all rules that either have that
104: * first index value as their first key character, or have
105: * a set containing the index value as their first character.
106: */
107: int n = ruleVector.size();
108: index = new int[257]; // [sic]
109: Vector v = new Vector(2 * n); // heuristic; adjust as needed
110:
111: /* Precompute the index values. This saves a LOT of time.
112: */
113: int[] indexValue = new int[n];
114: for (int j = 0; j < n; ++j) {
115: TransliterationRule r = (TransliterationRule) ruleVector
116: .elementAt(j);
117: indexValue[j] = r.getIndexValue();
118: }
119: for (int x = 0; x < 256; ++x) {
120: index[x] = v.size();
121: for (int j = 0; j < n; ++j) {
122: if (indexValue[j] >= 0) {
123: if (indexValue[j] == x) {
124: v.addElement(ruleVector.elementAt(j));
125: }
126: } else {
127: // If the indexValue is < 0, then the first key character is
128: // a set, and we must use the more time-consuming
129: // matchesIndexValue check. In practice this happens
130: // rarely, so we seldom tread this code path.
131: TransliterationRule r = (TransliterationRule) ruleVector
132: .elementAt(j);
133: if (r.matchesIndexValue(x)) {
134: v.addElement(r);
135: }
136: }
137: }
138: }
139: index[256] = v.size();
140:
141: /* Freeze things into an array.
142: */
143: rules = new TransliterationRule[v.size()];
144: v.copyInto(rules);
145:
146: StringBuffer errors = null;
147:
148: /* Check for masking. This is MUCH faster than our old check,
149: * which was each rule against each following rule, since we
150: * only have to check for masking within each bin now. It's
151: * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
152: * count, and n2 is the per-bin rule count. But n2<<n1, so
153: * it's a big win.
154: */
155: for (int x = 0; x < 256; ++x) {
156: for (int j = index[x]; j < index[x + 1] - 1; ++j) {
157: TransliterationRule r1 = rules[j];
158: for (int k = j + 1; k < index[x + 1]; ++k) {
159: TransliterationRule r2 = rules[k];
160: if (r1.masks(r2)) {
161: if (errors == null) {
162: errors = new StringBuffer();
163: } else {
164: errors.append("\n");
165: }
166: errors.append("Rule " + r1 + " masks " + r2);
167: }
168: }
169: }
170: }
171:
172: if (errors != null) {
173: throw new IllegalArgumentException(errors.toString());
174: }
175: }
176:
177: /**
178: * Transliterate the given text with the given UTransPosition
179: * indices. Return TRUE if the transliteration should continue
180: * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
181: * Note that FALSE is only ever returned if isIncremental is TRUE.
182: * @param text the text to be transliterated
183: * @param pos the position indices, which will be updated
184: * @param incremental if TRUE, assume new text may be inserted
185: * at index.limit, and return FALSE if thre is a partial match.
186: * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
187: * indicating that transliteration should stop until more text
188: * arrives.
189: */
190: public boolean transliterate(Replaceable text,
191: Transliterator.Position pos, boolean incremental) {
192: int indexByte = text.char32At(pos.start) & 0xFF;
193: for (int i = index[indexByte]; i < index[indexByte + 1]; ++i) {
194: int m = rules[i].matchAndReplace(text, pos, incremental);
195: switch (m) {
196: case UnicodeMatcher.U_MATCH:
197: if (Transliterator.DEBUG) {
198: System.out.println((incremental ? "Rule.i: match "
199: : "Rule: match ")
200: + rules[i].toRule(true)
201: + " => "
202: + UtilityExtensions.formatInput(text, pos));
203: }
204: return true;
205: case UnicodeMatcher.U_PARTIAL_MATCH:
206: if (Transliterator.DEBUG) {
207: System.out
208: .println((incremental ? "Rule.i: partial match "
209: : "Rule: partial match ")
210: + rules[i].toRule(true)
211: + " => "
212: + UtilityExtensions.formatInput(
213: text, pos));
214: }
215: return false;
216: }
217: }
218: // No match or partial match from any rule
219: pos.start += UTF16.getCharCount(text.char32At(pos.start));
220: if (Transliterator.DEBUG) {
221: System.out.println((incremental ? "Rule.i: no match => "
222: : "Rule: no match => ")
223: + UtilityExtensions.formatInput(text, pos));
224: }
225: return true;
226: }
227:
228: /**
229: * Create rule strings that represents this rule set.
230: */
231: String toRules(boolean escapeUnprintable) {
232: int i;
233: int count = ruleVector.size();
234: StringBuffer ruleSource = new StringBuffer();
235: for (i = 0; i < count; ++i) {
236: if (i != 0) {
237: ruleSource.append('\n');
238: }
239: TransliterationRule r = (TransliterationRule) ruleVector
240: .elementAt(i);
241: ruleSource.append(r.toRule(escapeUnprintable));
242: }
243: return ruleSource.toString();
244: }
245:
246: /**
247: * Return the set of all characters that may be modified (getTarget=false)
248: * or emitted (getTarget=true) by this set.
249: */
250: UnicodeSet getSourceTargetSet(boolean getTarget) {
251: UnicodeSet set = new UnicodeSet();
252: int count = ruleVector.size();
253: for (int i = 0; i < count; ++i) {
254: TransliterationRule r = (TransliterationRule) ruleVector
255: .elementAt(i);
256: if (getTarget) {
257: r.addTargetSetTo(set);
258: } else {
259: r.addSourceSetTo(set);
260: }
261: }
262: return set;
263: }
264: }
|