001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import com.ibm.icu.impl.Utility;
010:
011: /**
012: * A transliteration rule used by
013: * <code>RuleBasedTransliterator</code>.
014: * <code>TransliterationRule</code> is an immutable object.
015: *
016: * <p>A rule consists of an input pattern and an output string. When
017: * the input pattern is matched, the output string is emitted. The
018: * input pattern consists of zero or more characters which are matched
019: * exactly (the key) and optional context. Context must match if it
020: * is specified. Context may be specified before the key, after the
021: * key, or both. The key, preceding context, and following context
022: * may contain variables. Variables represent a set of Unicode
023: * characters, such as the letters <i>a</i> through <i>z</i>.
024: * Variables are detected by looking up each character in a supplied
025: * variable list to see if it has been so defined.
026: *
027: * <p>A rule may contain segments in its input string and segment
028: * references in its output string. A segment is a substring of the
029: * input pattern, indicated by an offset and limit. The segment may
030: * be in the preceding or following context. It may not span a
031: * context boundary. A segment reference is a special character in
032: * the output string that causes a segment of the input string (not
033: * the input pattern) to be copied to the output string. The range of
034: * special characters that represent segment references is defined by
035: * RuleBasedTransliterator.Data.
036: *
037: * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
038: * string "abc.123" to "ab1.c23".
039: *
040: * <p>Copyright © IBM Corporation 1999. All rights reserved.
041: *
042: * @author Alan Liu
043: */
044: class TransliterationRule {
045:
046: // TODO Eliminate the pattern and keyLength data members. They
047: // are used only by masks() and getIndexValue() which are called
048: // only during build time, not during run-time. Perhaps these
049: // methods and pattern/keyLength can be isolated into a separate
050: // object.
051:
052: /**
053: * The match that must occur before the key, or null if there is no
054: * preceding context.
055: */
056: private StringMatcher anteContext;
057:
058: /**
059: * The matcher object for the key. If null, then the key is empty.
060: */
061: private StringMatcher key;
062:
063: /**
064: * The match that must occur after the key, or null if there is no
065: * following context.
066: */
067: private StringMatcher postContext;
068:
069: /**
070: * The object that performs the replacement if the key,
071: * anteContext, and postContext are matched. Never null.
072: */
073: private UnicodeReplacer output;
074:
075: /**
076: * The string that must be matched, consisting of the anteContext, key,
077: * and postContext, concatenated together, in that order. Some components
078: * may be empty (zero length).
079: * @see #anteContextLength
080: * @see #keyLength
081: */
082: private String pattern;
083:
084: /**
085: * An array of matcher objects corresponding to the input pattern
086: * segments. If there are no segments this is null. N.B. This is
087: * a UnicodeMatcher for generality, but in practice it is always a
088: * StringMatcher. In the future we may generalize this, but for
089: * now we sometimes cast down to StringMatcher.
090: */
091: UnicodeMatcher[] segments;
092:
093: /**
094: * The length of the string that must match before the key. If
095: * zero, then there is no matching requirement before the key.
096: * Substring [0,anteContextLength) of pattern is the anteContext.
097: */
098: private int anteContextLength;
099:
100: /**
101: * The length of the key. Substring [anteContextLength,
102: * anteContextLength + keyLength) is the key.
103: */
104: private int keyLength;
105:
106: /**
107: * Miscellaneous attributes.
108: */
109: byte flags;
110:
111: /**
112: * Flag attributes.
113: */
114: static final int ANCHOR_START = 1;
115: static final int ANCHOR_END = 2;
116:
117: /**
118: * An alias pointer to the data for this rule. The data provides
119: * lookup services for matchers and segments.
120: */
121: private final RuleBasedTransliterator.Data data;
122:
123: private static final String COPYRIGHT = "\u00A9 IBM Corporation 1999-2001. All rights reserved.";
124:
125: /**
126: * Construct a new rule with the given input, output text, and other
127: * attributes. A cursor position may be specified for the output text.
128: * @param input input string, including key and optional ante and
129: * post context
130: * @param anteContextPos offset into input to end of ante context, or -1 if
131: * none. Must be <= input.length() if not -1.
132: * @param postContextPos offset into input to start of post context, or -1
133: * if none. Must be <= input.length() if not -1, and must be >=
134: * anteContextPos.
135: * @param output output string
136: * @param cursorPos offset into output at which cursor is located, or -1 if
137: * none. If less than zero, then the cursor is placed after the
138: * <code>output</code>; that is, -1 is equivalent to
139: * <code>output.length()</code>. If greater than
140: * <code>output.length()</code> then an exception is thrown.
141: * @param cursorOffset an offset to be added to cursorPos to position the
142: * cursor either in the ante context, if < 0, or in the post context, if >
143: * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
144: * "xyz" and moves the cursor to before "a". It would have a cursorOffset
145: * of -3.
146: * @param segs array of UnicodeMatcher corresponding to input pattern
147: * segments, or null if there are none
148: * @param anchorStart true if the the rule is anchored on the left to
149: * the context start
150: * @param anchorEnd true if the rule is anchored on the right to the
151: * context limit
152: */
153: public TransliterationRule(String input, int anteContextPos,
154: int postContextPos, String output, int cursorPos,
155: int cursorOffset, UnicodeMatcher[] segs,
156: boolean anchorStart, boolean anchorEnd,
157: RuleBasedTransliterator.Data theData) {
158: data = theData;
159:
160: // Do range checks only when warranted to save time
161: if (anteContextPos < 0) {
162: anteContextLength = 0;
163: } else {
164: if (anteContextPos > input.length()) {
165: throw new IllegalArgumentException(
166: "Invalid ante context");
167: }
168: anteContextLength = anteContextPos;
169: }
170: if (postContextPos < 0) {
171: keyLength = input.length() - anteContextLength;
172: } else {
173: if (postContextPos < anteContextLength
174: || postContextPos > input.length()) {
175: throw new IllegalArgumentException(
176: "Invalid post context");
177: }
178: keyLength = postContextPos - anteContextLength;
179: }
180: if (cursorPos < 0) {
181: cursorPos = output.length();
182: } else if (cursorPos > output.length()) {
183: throw new IllegalArgumentException(
184: "Invalid cursor position");
185: }
186:
187: // We don't validate the segments array. The caller must
188: // guarantee that the segments are well-formed (that is, that
189: // all $n references in the output refer to indices of this
190: // array, and that no array elements are null).
191: this .segments = segs;
192:
193: pattern = input;
194: flags = 0;
195: if (anchorStart) {
196: flags |= ANCHOR_START;
197: }
198: if (anchorEnd) {
199: flags |= ANCHOR_END;
200: }
201:
202: anteContext = null;
203: if (anteContextLength > 0) {
204: anteContext = new StringMatcher(pattern.substring(0,
205: anteContextLength), 0, data);
206: }
207:
208: key = null;
209: if (keyLength > 0) {
210: key = new StringMatcher(pattern.substring(
211: anteContextLength, anteContextLength + keyLength),
212: 0, data);
213: }
214:
215: int postContextLength = pattern.length() - keyLength
216: - anteContextLength;
217: postContext = null;
218: if (postContextLength > 0) {
219: postContext = new StringMatcher(pattern
220: .substring(anteContextLength + keyLength), 0, data);
221: }
222:
223: this .output = new StringReplacer(output, cursorPos
224: + cursorOffset, data);
225: }
226:
227: /**
228: * Return the preceding context length. This method is needed to
229: * support the <code>Transliterator</code> method
230: * <code>getMaximumContextLength()</code>.
231: */
232: public int getAnteContextLength() {
233: return anteContextLength
234: + (((flags & ANCHOR_START) != 0) ? 1 : 0);
235: }
236:
237: /**
238: * Internal method. Returns 8-bit index value for this rule.
239: * This is the low byte of the first character of the key,
240: * unless the first character of the key is a set. If it's a
241: * set, or otherwise can match multiple keys, the index value is -1.
242: */
243: final int getIndexValue() {
244: if (anteContextLength == pattern.length()) {
245: // A pattern with just ante context {such as foo)>bar} can
246: // match any key.
247: return -1;
248: }
249: int c = UTF16.charAt(pattern, anteContextLength);
250: return data.lookupMatcher(c) == null ? (c & 0xFF) : -1;
251: }
252:
253: /**
254: * Internal method. Returns true if this rule matches the given
255: * index value. The index value is an 8-bit integer, 0..255,
256: * representing the low byte of the first character of the key.
257: * It matches this rule if it matches the first character of the
258: * key, or if the first character of the key is a set, and the set
259: * contains any character with a low byte equal to the index
260: * value. If the rule contains only ante context, as in foo)>bar,
261: * then it will match any key.
262: */
263: final boolean matchesIndexValue(int v) {
264: // Delegate to the key, or if there is none, to the postContext.
265: // If there is neither then we match any key; return true.
266: UnicodeMatcher m = (key != null) ? key : postContext;
267: return (m != null) ? m.matchesIndexValue(v) : true;
268: }
269:
270: /**
271: * Return true if this rule masks another rule. If r1 masks r2 then
272: * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
273: * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
274: * "[c]a>x" masks "[dc]a>y".
275: */
276: public boolean masks(TransliterationRule r2) {
277: /* Rule r1 masks rule r2 if the string formed of the
278: * antecontext, key, and postcontext overlaps in the following
279: * way:
280: *
281: * r1: aakkkpppp
282: * r2: aaakkkkkpppp
283: * ^
284: *
285: * The strings must be aligned at the first character of the
286: * key. The length of r1 to the left of the alignment point
287: * must be <= the length of r2 to the left; ditto for the
288: * right. The characters of r1 must equal (or be a superset
289: * of) the corresponding characters of r2. The superset
290: * operation should be performed to check for UnicodeSet
291: * masking.
292: *
293: * Anchors: Two patterns that differ only in anchors only
294: * mask one another if they are exactly equal, and r2 has
295: * all the anchors r1 has (optionally, plus some). Here Y
296: * means the row masks the column, N means it doesn't.
297: *
298: * ab ^ab ab$ ^ab$
299: * ab Y Y Y Y
300: * ^ab N Y N Y
301: * ab$ N N Y Y
302: * ^ab$ N N N Y
303: *
304: * Post context: {a}b masks ab, but not vice versa, since {a}b
305: * matches everything ab matches, and {a}b matches {|a|}b but ab
306: * does not. Pre context is different (a{b} does not align with
307: * ab).
308: */
309:
310: /* LIMITATION of the current mask algorithm: Some rule
311: * maskings are currently not detected. For example,
312: * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
313: */
314:
315: int len = pattern.length();
316: int left = anteContextLength;
317: int left2 = r2.anteContextLength;
318: int right = pattern.length() - left;
319: int right2 = r2.pattern.length() - left2;
320:
321: // TODO Clean this up -- some logic might be combinable with the
322: // next statement.
323:
324: // Test for anchor masking
325: if (left == left2 && right == right2
326: && keyLength <= r2.keyLength
327: && r2.pattern.regionMatches(0, pattern, 0, len)) {
328: // The following boolean logic implements the table above
329: return (flags == r2.flags)
330: || (!((flags & ANCHOR_START) != 0) && !((flags & ANCHOR_END) != 0))
331: || (((r2.flags & ANCHOR_START) != 0) && ((r2.flags & ANCHOR_END) != 0));
332: }
333:
334: return left <= left2
335: && (right < right2 || (right == right2 && keyLength <= r2.keyLength))
336: && r2.pattern.regionMatches(left2 - left, pattern, 0,
337: len);
338: }
339:
340: static final int posBefore(Replaceable str, int pos) {
341: return (pos > 0) ? pos
342: - UTF16.getCharCount(str.char32At(pos - 1)) : pos - 1;
343: }
344:
345: static final int posAfter(Replaceable str, int pos) {
346: return (pos >= 0 && pos < str.length()) ? pos
347: + UTF16.getCharCount(str.char32At(pos)) : pos + 1;
348: }
349:
350: /**
351: * Attempt a match and replacement at the given position. Return
352: * the degree of match between this rule and the given text. The
353: * degree of match may be mismatch, a partial match, or a full
354: * match. A mismatch means at least one character of the text
355: * does not match the context or key. A partial match means some
356: * context and key characters match, but the text is not long
357: * enough to match all of them. A full match means all context
358: * and key characters match.
359: *
360: * If a full match is obtained, perform a replacement, update pos,
361: * and return U_MATCH. Otherwise both text and pos are unchanged.
362: *
363: * @param text the text
364: * @param pos the position indices
365: * @param incremental if TRUE, test for partial matches that may
366: * be completed by additional text inserted at pos.limit.
367: * @return one of <code>U_MISMATCH</code>,
368: * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
369: * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
370: */
371: public int matchAndReplace(Replaceable text,
372: Transliterator.Position pos, boolean incremental) {
373: // Matching and replacing are done in one method because the
374: // replacement operation needs information obtained during the
375: // match. Another way to do this is to have the match method
376: // create a match result struct with relevant offsets, and to pass
377: // this into the replace method.
378:
379: // ============================ MATCH ===========================
380:
381: // Reset segment match data
382: if (segments != null) {
383: for (int i = 0; i < segments.length; ++i) {
384: ((StringMatcher) segments[i]).resetMatch();
385: }
386: }
387:
388: int keyLimit;
389: int[] intRef = new int[1];
390:
391: // ------------------------ Ante Context ------------------------
392:
393: // A mismatch in the ante context, or with the start anchor,
394: // is an outright U_MISMATCH regardless of whether we are
395: // incremental or not.
396: int oText; // offset into 'text'
397: int minOText;
398:
399: // Note (1): We process text in 16-bit code units, rather than
400: // 32-bit code points. This works because stand-ins are
401: // always in the BMP and because we are doing a literal match
402: // operation, which can be done 16-bits at a time.
403:
404: int anteLimit = posBefore(text, pos.contextStart);
405:
406: int match;
407:
408: // Start reverse match at char before pos.start
409: intRef[0] = posBefore(text, pos.start);
410:
411: if (anteContext != null) {
412: match = anteContext.matches(text, intRef, anteLimit, false);
413: if (match != UnicodeMatcher.U_MATCH) {
414: return UnicodeMatcher.U_MISMATCH;
415: }
416: }
417:
418: oText = intRef[0];
419:
420: minOText = posAfter(text, oText);
421:
422: // ------------------------ Start Anchor ------------------------
423:
424: if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
425: return UnicodeMatcher.U_MISMATCH;
426: }
427:
428: // -------------------- Key and Post Context --------------------
429:
430: intRef[0] = pos.start;
431:
432: if (key != null) {
433: match = key.matches(text, intRef, pos.limit, incremental);
434: if (match != UnicodeMatcher.U_MATCH) {
435: return match;
436: }
437: }
438:
439: keyLimit = intRef[0];
440:
441: if (postContext != null) {
442: if (incremental && keyLimit == pos.limit) {
443: // The key matches just before pos.limit, and there is
444: // a postContext. Since we are in incremental mode,
445: // we must assume more characters may be inserted at
446: // pos.limit -- this is a partial match.
447: return UnicodeMatcher.U_PARTIAL_MATCH;
448: }
449:
450: match = postContext.matches(text, intRef, pos.contextLimit,
451: incremental);
452: if (match != UnicodeMatcher.U_MATCH) {
453: return match;
454: }
455: }
456:
457: oText = intRef[0];
458:
459: // ------------------------- Stop Anchor ------------------------
460:
461: if (((flags & ANCHOR_END)) != 0) {
462: if (oText != pos.contextLimit) {
463: return UnicodeMatcher.U_MISMATCH;
464: }
465: if (incremental) {
466: return UnicodeMatcher.U_PARTIAL_MATCH;
467: }
468: }
469:
470: // =========================== REPLACE ==========================
471:
472: // We have a full match. The key is between pos.start and
473: // keyLimit.
474:
475: int newLength = output.replace(text, pos.start, keyLimit,
476: intRef);
477: int lenDelta = newLength - (keyLimit - pos.start);
478: int newStart = intRef[0];
479:
480: oText += lenDelta;
481: pos.limit += lenDelta;
482: pos.contextLimit += lenDelta;
483: // Restrict new value of start to [minOText, min(oText, pos.limit)].
484: pos.start = Math.max(minOText, Math.min(Math.min(oText,
485: pos.limit), newStart));
486: return UnicodeMatcher.U_MATCH;
487: }
488:
489: /**
490: * Create a source string that represents this rule. Append it to the
491: * given string.
492: */
493: public String toRule(boolean escapeUnprintable) {
494: // int i;
495:
496: StringBuffer rule = new StringBuffer();
497:
498: // Accumulate special characters (and non-specials following them)
499: // into quoteBuf. Append quoteBuf, within single quotes, when
500: // a non-quoted element must be inserted.
501: StringBuffer quoteBuf = new StringBuffer();
502:
503: // Do not emit the braces '{' '}' around the pattern if there
504: // is neither anteContext nor postContext.
505: boolean emitBraces = (anteContext != null)
506: || (postContext != null);
507:
508: // Emit start anchor
509: if ((flags & ANCHOR_START) != 0) {
510: rule.append('^');
511: }
512:
513: // Emit the input pattern
514: Utility.appendToRule(rule, anteContext, escapeUnprintable,
515: quoteBuf);
516:
517: if (emitBraces) {
518: Utility.appendToRule(rule, '{', true, escapeUnprintable,
519: quoteBuf);
520: }
521:
522: Utility.appendToRule(rule, key, escapeUnprintable, quoteBuf);
523:
524: if (emitBraces) {
525: Utility.appendToRule(rule, '}', true, escapeUnprintable,
526: quoteBuf);
527: }
528:
529: Utility.appendToRule(rule, postContext, escapeUnprintable,
530: quoteBuf);
531:
532: // Emit end anchor
533: if ((flags & ANCHOR_END) != 0) {
534: rule.append('$');
535: }
536:
537: Utility.appendToRule(rule, " > ", true, escapeUnprintable,
538: quoteBuf);
539:
540: // Emit the output pattern
541:
542: Utility.appendToRule(rule, output
543: .toReplacerPattern(escapeUnprintable), true,
544: escapeUnprintable, quoteBuf);
545:
546: Utility.appendToRule(rule, ';', true, escapeUnprintable,
547: quoteBuf);
548:
549: return rule.toString();
550: }
551:
552: /**
553: * Return a string representation of this object.
554: * @return string representation of this object
555: */
556: public String toString() {
557: return '{' + toRule(true) + '}';
558: }
559:
560: /**
561: * Union the set of all characters that may be modified by this rule
562: * into the given set.
563: */
564: void addSourceSetTo(UnicodeSet toUnionTo) {
565: int limit = anteContextLength + keyLength;
566: for (int i = anteContextLength; i < limit;) {
567: int ch = UTF16.charAt(pattern, i);
568: i += UTF16.getCharCount(ch);
569: UnicodeMatcher matcher = data.lookupMatcher(ch);
570: if (matcher == null) {
571: toUnionTo.add(ch);
572: } else {
573: matcher.addMatchSetTo(toUnionTo);
574: }
575: }
576: }
577:
578: /**
579: * Union the set of all characters that may be emitted by this rule
580: * into the given set.
581: */
582: void addTargetSetTo(UnicodeSet toUnionTo) {
583: output.addReplacementSetTo(toUnionTo);
584: }
585: }
|