001: /*
002: *******************************************************************************
003: * Copyright (C) 2001-2004, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import com.ibm.icu.impl.Utility;
010:
011: /**
012: * An object that matches a fixed input string, implementing the
013: * UnicodeMatcher API. This object also implements the
014: * UnicodeReplacer API, allowing it to emit the matched text as
015: * output. Since the match text may contain flexible match elements,
016: * such as UnicodeSets, the emitted text is not the match pattern, but
017: * instead a substring of the actual matched text. Following
018: * convention, the output text is the leftmost match seen up to this
019: * point.
020: *
021: * A StringMatcher may represent a segment, in which case it has a
022: * positive segment number. This affects how the matcher converts
023: * itself to a pattern but does not otherwise affect its function.
024: *
025: * A StringMatcher that is not a segment should not be used as a
026: * UnicodeReplacer.
027: */
028: class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
029:
030: /**
031: * The text to be matched.
032: */
033: private String pattern;
034:
035: /**
036: * Start offset, in the match text, of the <em>rightmost</em>
037: * match.
038: */
039: private int matchStart;
040:
041: /**
042: * Limit offset, in the match text, of the <em>rightmost</em>
043: * match.
044: */
045: private int matchLimit;
046:
047: /**
048: * The segment number, 1-based, or 0 if not a segment.
049: */
050: private int segmentNumber;
051:
052: /**
053: * Context object that maps stand-ins to matcher and replacer
054: * objects.
055: */
056: private final RuleBasedTransliterator.Data data;
057:
058: /**
059: * Construct a matcher that matches the given pattern string.
060: * @param theString the pattern to be matched, possibly containing
061: * stand-ins that represent nested UnicodeMatcher objects.
062: * @param segmentNum the segment number from 1..n, or 0 if this is
063: * not a segment.
064: * @param theData context object mapping stand-ins to
065: * UnicodeMatcher objects.
066: */
067: public StringMatcher(String theString, int segmentNum,
068: RuleBasedTransliterator.Data theData) {
069: data = theData;
070: pattern = theString;
071: matchStart = matchLimit = -1;
072: segmentNumber = segmentNum;
073: }
074:
075: /**
076: * Construct a matcher that matches a substring of the given
077: * pattern string.
078: * @param theString the pattern to be matched, possibly containing
079: * stand-ins that represent nested UnicodeMatcher objects.
080: * @param start first character of theString to be matched
081: * @param limit index after the last character of theString to be
082: * matched.
083: * @param segmentNum the segment number from 1..n, or 0 if this is
084: * not a segment.
085: * @param theData context object mapping stand-ins to
086: * UnicodeMatcher objects.
087: */
088: public StringMatcher(String theString, int start, int limit,
089: int segmentNum, RuleBasedTransliterator.Data theData) {
090: this (theString.substring(start, limit), segmentNum, theData);
091: }
092:
093: /**
094: * Implement UnicodeMatcher
095: */
096: public int matches(Replaceable text, int[] offset, int limit,
097: boolean incremental) {
098: // Note (1): We process text in 16-bit code units, rather than
099: // 32-bit code points. This works because stand-ins are
100: // always in the BMP and because we are doing a literal match
101: // operation, which can be done 16-bits at a time.
102: int i;
103: int[] cursor = new int[] { offset[0] };
104: if (limit < cursor[0]) {
105: // Match in the reverse direction
106: for (i = pattern.length() - 1; i >= 0; --i) {
107: char keyChar = pattern.charAt(i); // OK; see note (1) above
108: UnicodeMatcher subm = data.lookupMatcher(keyChar);
109: if (subm == null) {
110: if (cursor[0] > limit
111: && keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
112: --cursor[0];
113: } else {
114: return U_MISMATCH;
115: }
116: } else {
117: int m = subm.matches(text, cursor, limit,
118: incremental);
119: if (m != U_MATCH) {
120: return m;
121: }
122: }
123: }
124: // Record the match position, but adjust for a normal
125: // forward start, limit, and only if a prior match does not
126: // exist -- we want the rightmost match.
127: if (matchStart < 0) {
128: matchStart = cursor[0] + 1;
129: matchLimit = offset[0] + 1;
130: }
131: } else {
132: for (i = 0; i < pattern.length(); ++i) {
133: if (incremental && cursor[0] == limit) {
134: // We've reached the context limit without a mismatch and
135: // without completing our match.
136: return U_PARTIAL_MATCH;
137: }
138: char keyChar = pattern.charAt(i); // OK; see note (1) above
139: UnicodeMatcher subm = data.lookupMatcher(keyChar);
140: if (subm == null) {
141: // Don't need the cursor < limit check if
142: // incremental is true (because it's done above); do need
143: // it otherwise.
144: if (cursor[0] < limit
145: && keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
146: ++cursor[0];
147: } else {
148: return U_MISMATCH;
149: }
150: } else {
151: int m = subm.matches(text, cursor, limit,
152: incremental);
153: if (m != U_MATCH) {
154: return m;
155: }
156: }
157: }
158: // Record the match position
159: matchStart = offset[0];
160: matchLimit = cursor[0];
161: }
162:
163: offset[0] = cursor[0];
164: return U_MATCH;
165: }
166:
167: /**
168: * Implement UnicodeMatcher
169: */
170: public String toPattern(boolean escapeUnprintable) {
171: StringBuffer result = new StringBuffer();
172: StringBuffer quoteBuf = new StringBuffer();
173: if (segmentNumber > 0) { // i.e., if this is a segment
174: result.append('(');
175: }
176: for (int i = 0; i < pattern.length(); ++i) {
177: char keyChar = pattern.charAt(i); // OK; see note (1) above
178: UnicodeMatcher m = data.lookupMatcher(keyChar);
179: if (m == null) {
180: Utility.appendToRule(result, keyChar, false,
181: escapeUnprintable, quoteBuf);
182: } else {
183: Utility.appendToRule(result, m
184: .toPattern(escapeUnprintable), true,
185: escapeUnprintable, quoteBuf);
186: }
187: }
188: if (segmentNumber > 0) { // i.e., if this is a segment
189: result.append(')');
190: }
191: // Flush quoteBuf out to result
192: Utility.appendToRule(result, -1, true, escapeUnprintable,
193: quoteBuf);
194: return result.toString();
195: }
196:
197: /**
198: * Implement UnicodeMatcher
199: */
200: public boolean matchesIndexValue(int v) {
201: if (pattern.length() == 0) {
202: return true;
203: }
204: int c = UTF16.charAt(pattern, 0);
205: UnicodeMatcher m = data.lookupMatcher(c);
206: return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
207: }
208:
209: /**
210: * Implementation of UnicodeMatcher API. Union the set of all
211: * characters that may be matched by this object into the given
212: * set.
213: * @param toUnionTo the set into which to union the source characters
214: */
215: public void addMatchSetTo(UnicodeSet toUnionTo) {
216: int ch;
217: for (int i = 0; i < pattern.length(); i += UTF16
218: .getCharCount(ch)) {
219: ch = UTF16.charAt(pattern, i);
220: UnicodeMatcher matcher = data.lookupMatcher(ch);
221: if (matcher == null) {
222: toUnionTo.add(ch);
223: } else {
224: matcher.addMatchSetTo(toUnionTo);
225: }
226: }
227: }
228:
229: /**
230: * UnicodeReplacer API
231: */
232: public int replace(Replaceable text, int start, int limit,
233: int[] cursor) {
234:
235: int outLen = 0;
236:
237: // Copy segment with out-of-band data
238: int dest = limit;
239: // If there was no match, that means that a quantifier
240: // matched zero-length. E.g., x (a)* y matched "xy".
241: if (matchStart >= 0) {
242: if (matchStart != matchLimit) {
243: text.copy(matchStart, matchLimit, dest);
244: outLen = matchLimit - matchStart;
245: }
246: }
247:
248: text.replace(start, limit, ""); // delete original text
249:
250: return outLen;
251: }
252:
253: /**
254: * UnicodeReplacer API
255: */
256: public String toReplacerPattern(boolean escapeUnprintable) {
257: // assert(segmentNumber > 0);
258: StringBuffer rule = new StringBuffer("$");
259: Utility.appendNumber(rule, segmentNumber, 10, 1);
260: return rule.toString();
261: }
262:
263: /**
264: * Remove any match data. This must be called before performing a
265: * set of matches with this segment.
266: */
267: public void resetMatch() {
268: matchStart = matchLimit = -1;
269: }
270:
271: /**
272: * Union the set of all characters that may output by this object
273: * into the given set.
274: * @param toUnionTo the set into which to union the output characters
275: */
276: public void addReplacementSetTo(UnicodeSet toUnionTo) {
277: // The output of this replacer varies; it is the source text between
278: // matchStart and matchLimit. Since this varies depending on the
279: // input text, we can't compute it here. We can either do nothing
280: // or we can add ALL characters to the set. It's probably more useful
281: // to do nothing.
282: }
283: }
284:
285: //eof
|