001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package java.util.regex;
019:
020: /**
021: * Represents canonical decomposition of
022: * Unicode character. Is used when
023: * CANON_EQ flag of Pattern class
024: * is specified.
025: */
026: class DecomposedCharSet extends JointSet {
027:
028: /**
029: * Contains information about number of chars
030: * that were read for a codepoint last time
031: */
032: private int readCharsForCodePoint = 1;
033:
034: /**
035: * UTF-16 encoding of decomposedChar
036: */
037: private String decomposedCharUTF16 = null;
038:
039: /**
040: * Decomposition of the Unicode codepoint
041: */
042: private int[] decomposedChar;
043:
044: /**
045: * Length of useful part of decomposedChar
046: * decomposedCharLength <= decomposedChar.length
047: */
048: private int decomposedCharLength;
049:
050: public DecomposedCharSet(int[] decomposedChar,
051: int decomposedCharLength) {
052: this .decomposedChar = decomposedChar;
053: this .decomposedCharLength = decomposedCharLength;
054: }
055:
056: /**
057: * Returns the next.
058: */
059: public AbstractSet getNext() {
060: return this .next;
061: }
062:
063: /**
064: * Sets next abstract set.
065: * @param next
066: * The next to set.
067: */
068: public void setNext(AbstractSet next) {
069: this .next = next;
070: }
071:
072: public int matches(int strIndex, CharSequence testString,
073: MatchResultImpl matchResult) {
074:
075: /*
076: * All decompositions have length that
077: * is less or equal Lexer.MAX_DECOMPOSITION_LENGTH
078: */
079: int[] decCurCodePoint;
080: int[] decCodePoint = new int[Lexer.MAX_DECOMPOSITION_LENGTH];
081: int readCodePoints = 0;
082: int rightBound = matchResult.getRightBound();
083: int curChar;
084: int i = 0;
085:
086: if (strIndex >= rightBound) {
087: return -1;
088: }
089:
090: /*
091: * We read testString and decompose it gradually to compare with
092: * this decomposedChar at position strIndex
093: */
094: curChar = codePointAt(strIndex, testString, rightBound);
095: strIndex += readCharsForCodePoint;
096: decCurCodePoint = Lexer.getDecomposition(curChar);
097: if (decCurCodePoint == null) {
098: decCodePoint[readCodePoints++] = curChar;
099: } else {
100: i = decCurCodePoint.length;
101: System.arraycopy(decCurCodePoint, 0, decCodePoint, 0, i);
102: readCodePoints += i;
103: }
104:
105: if (strIndex < rightBound) {
106: curChar = codePointAt(strIndex, testString, rightBound);
107:
108: /*
109: * Read testString until we met a decomposed char boundary
110: * and decompose obtained portion of testString
111: */
112: while ((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH)
113: && !Lexer.isDecomposedCharBoundary(curChar)) {
114:
115: if (Lexer.hasDecompositionNonNullCanClass(curChar)) {
116:
117: /*
118: * A few codepoints have decompositions and non null
119: * canonical classes, we have to take them into
120: * consideration, but general rule is:
121: * if canonical class != 0 then no decomposition
122: */
123: decCurCodePoint = Lexer.getDecomposition(curChar);
124:
125: /*
126: * Length of such decomposition is 1 or 2. See
127: * UnicodeData file
128: * http://www.unicode.org/Public/4.0-Update
129: * /UnicodeData-4.0.0.txt
130: */
131: if (decCurCodePoint.length == 2) {
132: decCodePoint[readCodePoints++] = decCurCodePoint[0];
133: decCodePoint[readCodePoints++] = decCurCodePoint[1];
134: } else {
135: decCodePoint[readCodePoints++] = decCurCodePoint[0];
136: }
137: } else {
138: decCodePoint[readCodePoints++] = curChar;
139: }
140:
141: strIndex += readCharsForCodePoint;
142:
143: if (strIndex < rightBound) {
144: curChar = codePointAt(strIndex, testString,
145: rightBound);
146: } else {
147: break;
148: }
149: }
150: }
151:
152: /*
153: * Some optimization since length of decomposed char is <= 3 usually
154: */
155: switch (readCodePoints) {
156: case 0:
157: case 1:
158: case 2:
159: break;
160:
161: case 3:
162: int i1 = Lexer.getCanonicalClass(decCodePoint[1]);
163: int i2 = Lexer.getCanonicalClass(decCodePoint[2]);
164:
165: if ((i2 != 0) && (i1 > i2)) {
166: i1 = decCodePoint[1];
167: decCodePoint[1] = decCodePoint[2];
168: decCodePoint[2] = i1;
169: }
170: break;
171:
172: default:
173: decCodePoint = Lexer.getCanonicalOrder(decCodePoint,
174: readCodePoints);
175: }
176:
177: /*
178: * Compare decomposedChar with decomposed char
179: * that was just read from testString
180: */
181: if (readCodePoints != decomposedCharLength) {
182: return -1;
183: }
184:
185: for (i = 0; i < readCodePoints; i++) {
186: if (decCodePoint[i] != decomposedChar[i]) {
187: return -1;
188: }
189: }
190:
191: return next.matches(strIndex, testString, matchResult);
192: }
193:
194: /**
195: * Return UTF-16 encoding of given Unicode codepoint.
196: *
197: * @return UTF-16 encoding
198: */
199: private String getDecomposedChar() {
200: if (decomposedCharUTF16 == null) {
201: StringBuffer strBuff = new StringBuffer();
202:
203: for (int i = 0; i < decomposedCharLength; i++) {
204: strBuff.append(Character.toChars(decomposedChar[i]));
205: }
206: decomposedCharUTF16 = strBuff.toString();
207: }
208: return decomposedCharUTF16;
209: }
210:
211: protected String getName() {
212: return "decomposed char:" + getDecomposedChar(); //$NON-NLS-1$
213: }
214:
215: /**
216: * Reads Unicode codepoint from input.
217: *
218: * @param strIndex - index to read codepoint at
219: * @param testString - input
220: * @param matchResult - auxiliary object
221: * @return codepoint at given strIndex at testString and
222: */
223: public int codePointAt(int strIndex, CharSequence testString,
224: int rightBound) {
225:
226: /*
227: * We store information about number of codepoints
228: * we read at variable readCharsForCodePoint.
229: */
230: int curChar;
231:
232: readCharsForCodePoint = 1;
233: if (strIndex < rightBound - 1) {
234: char high = testString.charAt(strIndex++);
235: char low = testString.charAt(strIndex);
236:
237: if (Character.isSurrogatePair(high, low)) {
238: char[] curCodePointUTF16 = new char[] { high, low };
239: curChar = Character.codePointAt(curCodePointUTF16, 0);
240: readCharsForCodePoint = 2;
241: } else {
242: curChar = high;
243: }
244: } else {
245: curChar = testString.charAt(strIndex);
246: }
247:
248: return curChar;
249: }
250:
251: public boolean first(AbstractSet set) {
252: return (set instanceof DecomposedCharSet) ? ((DecomposedCharSet) set)
253: .getDecomposedChar().equals(getDecomposedChar())
254: : true;
255: }
256:
257: public boolean hasConsumed(MatchResultImpl matchResult) {
258: return true;
259: }
260: }
|