001 /*
002 * Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
003 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004 *
005 * This code is free software; you can redistribute it and/or modify it
006 * under the terms of the GNU General Public License version 2 only, as
007 * published by the Free Software Foundation. Sun designates this
008 * particular file as subject to the "Classpath" exception as provided
009 * by Sun in the LICENSE file that accompanied this code.
010 *
011 * This code is distributed in the hope that it will be useful, but WITHOUT
012 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
014 * version 2 for more details (a copy is included in the LICENSE file that
015 * accompanied this code).
016 *
017 * You should have received a copy of the GNU General Public License version
018 * 2 along with this work; if not, write to the Free Software Foundation,
019 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020 *
021 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022 * CA 95054 USA or visit www.sun.com if you need additional information or
023 * have any questions.
024 */
025
026 package java.lang;
027
028 import java.text.BreakIterator;
029 import java.util.HashSet;
030 import java.util.Hashtable;
031 import java.util.Iterator;
032 import java.util.Locale;
033 import sun.text.Normalizer;
034
035 /**
036 * This is a utility class for <code>String.toLowerCase()</code> and
037 * <code>String.toUpperCase()</code>, that handles special casing with
038 * conditions. In other words, it handles the mappings with conditions
039 * that are defined in
040 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
041 * Casing Properties</a> file.
042 * <p>
043 * Note that the unconditional case mappings (including 1:M mappings)
044 * are handled in <code>Character.toLower/UpperCase()</code>.
045 */
046 final class ConditionalSpecialCasing {
047
048 // context conditions.
049 final static int FINAL_CASED = 1;
050 final static int AFTER_SOFT_DOTTED = 2;
051 final static int MORE_ABOVE = 3;
052 final static int AFTER_I = 4;
053 final static int NOT_BEFORE_DOT = 5;
054
055 // combining class definitions
056 final static int COMBINING_CLASS_ABOVE = 230;
057
058 // Special case mapping entries
059 static Entry[] entry = {
060 //# ================================================================================
061 //# Conditional mappings
062 //# ================================================================================
063 new Entry(0x03A3, new char[] { 0x03C2 },
064 new char[] { 0x03A3 }, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
065
066 //# ================================================================================
067 //# Locale-sensitive mappings
068 //# ================================================================================
069 //# Lithuanian
070 new Entry(0x0307, new char[] { 0x0307 }, new char[] {},
071 "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
072 new Entry(0x0049, new char[] { 0x0069, 0x0307 },
073 new char[] { 0x0049 }, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
074 new Entry(0x004A, new char[] { 0x006A, 0x0307 },
075 new char[] { 0x004A }, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
076 new Entry(0x012E, new char[] { 0x012F, 0x0307 },
077 new char[] { 0x012E }, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
078 new Entry(0x00CC, new char[] { 0x0069, 0x0307, 0x0300 },
079 new char[] { 0x00CC }, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
080 new Entry(0x00CD, new char[] { 0x0069, 0x0307, 0x0301 },
081 new char[] { 0x00CD }, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
082 new Entry(0x0128, new char[] { 0x0069, 0x0307, 0x0303 },
083 new char[] { 0x0128 }, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
084
085 //# ================================================================================
086 //# Turkish and Azeri
087 // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
088 // new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
089 new Entry(0x0307, new char[] {}, new char[] { 0x0307 },
090 "tr", AFTER_I), // # COMBINING DOT ABOVE
091 new Entry(0x0307, new char[] {}, new char[] { 0x0307 },
092 "az", AFTER_I), // # COMBINING DOT ABOVE
093 new Entry(0x0049, new char[] { 0x0131 },
094 new char[] { 0x0049 }, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
095 new Entry(0x0049, new char[] { 0x0131 },
096 new char[] { 0x0049 }, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
097 new Entry(0x0069, new char[] { 0x0069 },
098 new char[] { 0x0130 }, "tr", 0), // # LATIN SMALL LETTER I
099 new Entry(0x0069, new char[] { 0x0069 },
100 new char[] { 0x0130 }, "az", 0) // # LATIN SMALL LETTER I
101 };
102
103 // A hash table that contains the above entries
104 static Hashtable entryTable = new Hashtable();
105 static {
106 // create hashtable from the entry
107 for (int i = 0; i < entry.length; i++) {
108 Entry cur = entry[i];
109 Integer cp = new Integer(cur.getCodePoint());
110 HashSet set = (HashSet) entryTable.get(cp);
111 if (set == null) {
112 set = new HashSet();
113 }
114 set.add(cur);
115 entryTable.put(cp, set);
116 }
117 }
118
119 static int toLowerCaseEx(String src, int index, Locale locale) {
120 char[] result = lookUpTable(src, index, locale, true);
121
122 if (result != null) {
123 if (result.length == 1) {
124 return result[0];
125 } else {
126 return Character.ERROR;
127 }
128 } else {
129 // default to Character class' one
130 return Character.toLowerCase(src.codePointAt(index));
131 }
132 }
133
134 static int toUpperCaseEx(String src, int index, Locale locale) {
135 char[] result = lookUpTable(src, index, locale, false);
136
137 if (result != null) {
138 if (result.length == 1) {
139 return result[0];
140 } else {
141 return Character.ERROR;
142 }
143 } else {
144 // default to Character class' one
145 return Character.toUpperCaseEx(src.codePointAt(index));
146 }
147 }
148
149 static char[] toLowerCaseCharArray(String src, int index,
150 Locale locale) {
151 return lookUpTable(src, index, locale, true);
152 }
153
154 static char[] toUpperCaseCharArray(String src, int index,
155 Locale locale) {
156 char[] result = lookUpTable(src, index, locale, false);
157 if (result != null) {
158 return result;
159 } else {
160 return Character.toUpperCaseCharArray(src
161 .codePointAt(index));
162 }
163 }
164
165 private static char[] lookUpTable(String src, int index,
166 Locale locale, boolean bLowerCasing) {
167 HashSet set = (HashSet) entryTable.get(new Integer(src
168 .codePointAt(index)));
169
170 if (set != null) {
171 Iterator iter = set.iterator();
172 String currentLang = locale.getLanguage();
173 while (iter.hasNext()) {
174 Entry entry = (Entry) iter.next();
175 String conditionLang = entry.getLanguage();
176 if (((conditionLang == null) || (conditionLang
177 .equals(currentLang)))
178 && isConditionMet(src, index, locale, entry
179 .getCondition())) {
180 return (bLowerCasing ? entry.getLowerCase() : entry
181 .getUpperCase());
182 }
183 }
184 }
185
186 return null;
187 }
188
189 private static boolean isConditionMet(String src, int index,
190 Locale locale, int condition) {
191 switch (condition) {
192 case FINAL_CASED:
193 return isFinalCased(src, index, locale);
194
195 case AFTER_SOFT_DOTTED:
196 return isAfterSoftDotted(src, index);
197
198 case MORE_ABOVE:
199 return isMoreAbove(src, index);
200
201 case AFTER_I:
202 return isAfterI(src, index);
203
204 case NOT_BEFORE_DOT:
205 return !isBeforeDot(src, index);
206
207 default:
208 return true;
209 }
210 }
211
212 /**
213 * Implements the "Final_Cased" condition
214 *
215 * Specification: Within the closest word boundaries containing C, there is a cased
216 * letter before C, and there is no cased letter after C.
217 *
218 * Regular Expression:
219 * Before C: [{cased==true}][{wordBoundary!=true}]*
220 * After C: !([{wordBoundary!=true}]*[{cased}])
221 */
222 private static boolean isFinalCased(String src, int index,
223 Locale locale) {
224 BreakIterator wordBoundary = BreakIterator
225 .getWordInstance(locale);
226 wordBoundary.setText(src);
227 int ch;
228
229 // Look for a preceding 'cased' letter
230 for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i); i -= Character
231 .charCount(ch)) {
232
233 ch = src.codePointBefore(i);
234 if (isCased(ch)) {
235
236 int len = src.length();
237 // Check that there is no 'cased' letter after the index
238 for (i = index
239 + Character.charCount(src.codePointAt(index)); (i < len)
240 && !wordBoundary.isBoundary(i); i += Character
241 .charCount(ch)) {
242
243 ch = src.codePointAt(i);
244 if (isCased(ch)) {
245 return false;
246 }
247 }
248
249 return true;
250 }
251 }
252
253 return false;
254 }
255
256 /**
257 * Implements the "After_I" condition
258 *
259 * Specification: The last preceding base character was an uppercase I,
260 * and there is no intervening combining character class 230 (ABOVE).
261 *
262 * Regular Expression:
263 * Before C: [I]([{cc!=230}&{cc!=0}])*
264 */
265 private static boolean isAfterI(String src, int index) {
266 int ch;
267 int cc;
268
269 // Look for the last preceding base character
270 for (int i = index; i > 0; i -= Character.charCount(ch)) {
271
272 ch = src.codePointBefore(i);
273
274 if (ch == 'I') {
275 return true;
276 } else {
277 cc = Normalizer.getCombiningClass(ch);
278 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
279 return false;
280 }
281 }
282 }
283
284 return false;
285 }
286
287 /**
288 * Implements the "After_Soft_Dotted" condition
289 *
290 * Specification: The last preceding character with combining class
291 * of zero before C was Soft_Dotted, and there is no intervening
292 * combining character class 230 (ABOVE).
293 *
294 * Regular Expression:
295 * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
296 */
297 private static boolean isAfterSoftDotted(String src, int index) {
298 int ch;
299 int cc;
300
301 // Look for the last preceding character
302 for (int i = index; i > 0; i -= Character.charCount(ch)) {
303
304 ch = src.codePointBefore(i);
305
306 if (isSoftDotted(ch)) {
307 return true;
308 } else {
309 cc = Normalizer.getCombiningClass(ch);
310 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
311 return false;
312 }
313 }
314 }
315
316 return false;
317 }
318
319 /**
320 * Implements the "More_Above" condition
321 *
322 * Specification: C is followed by one or more characters of combining
323 * class 230 (ABOVE) in the combining character sequence.
324 *
325 * Regular Expression:
326 * After C: [{cc!=0}]*[{cc==230}]
327 */
328 private static boolean isMoreAbove(String src, int index) {
329 int ch;
330 int cc;
331 int len = src.length();
332
333 // Look for a following ABOVE combining class character
334 for (int i = index
335 + Character.charCount(src.codePointAt(index)); i < len; i += Character
336 .charCount(ch)) {
337
338 ch = src.codePointAt(i);
339 cc = Normalizer.getCombiningClass(ch);
340
341 if (cc == COMBINING_CLASS_ABOVE) {
342 return true;
343 } else if (cc == 0) {
344 return false;
345 }
346 }
347
348 return false;
349 }
350
351 /**
352 * Implements the "Before_Dot" condition
353 *
354 * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
355 * Any sequence of characters with a combining class that is
356 * neither 0 nor 230 may intervene between the current character
357 * and the combining dot above.
358 *
359 * Regular Expression:
360 * After C: ([{cc!=230}&{cc!=0}])*[\u0307]
361 */
362 private static boolean isBeforeDot(String src, int index) {
363 int ch;
364 int cc;
365 int len = src.length();
366
367 // Look for a following COMBINING DOT ABOVE
368 for (int i = index
369 + Character.charCount(src.codePointAt(index)); i < len; i += Character
370 .charCount(ch)) {
371
372 ch = src.codePointAt(i);
373
374 if (ch == '\u0307') {
375 return true;
376 } else {
377 cc = Normalizer.getCombiningClass(ch);
378 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
379 return false;
380 }
381 }
382 }
383
384 return false;
385 }
386
387 /**
388 * Examines whether a character is 'cased'.
389 *
390 * A character C is defined to be 'cased' if and only if at least one of
391 * following are true for C: uppercase==true, or lowercase==true, or
392 * general_category==titlecase_letter.
393 *
394 * The uppercase and lowercase property values are specified in the data
395 * file DerivedCoreProperties.txt in the Unicode Character Database.
396 */
397 private static boolean isCased(int ch) {
398 int type = Character.getType(ch);
399 if (type == Character.LOWERCASE_LETTER
400 || type == Character.UPPERCASE_LETTER
401 || type == Character.TITLECASE_LETTER) {
402 return true;
403 } else {
404 // Check for Other_Lowercase and Other_Uppercase
405 //
406 if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
407 // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
408 return true;
409 } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
410 // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
411 return true;
412 } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
413 // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
414 return true;
415 } else if (ch == 0x0345) {
416 // COMBINING GREEK YPOGEGRAMMENI
417 return true;
418 } else if (ch == 0x037A) {
419 // GREEK YPOGEGRAMMENI
420 return true;
421 } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
422 // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
423 return true;
424 } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
425 // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
426 // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
427 return true;
428 } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
429 // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
430 // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
431 return true;
432 } else {
433 return false;
434 }
435 }
436 }
437
438 private static boolean isSoftDotted(int ch) {
439 switch (ch) {
440 case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
441 case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
442 case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
443 case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
444 case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
445 case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
446 case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
447 case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
448 case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
449 case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
450 return true;
451 default:
452 return false;
453 }
454 }
455
456 /**
457 * An internal class that represents an entry in the Special Casing Properties.
458 */
459 static class Entry {
460 int ch;
461 char[] lower;
462 char[] upper;
463 String lang;
464 int condition;
465
466 Entry(int ch, char[] lower, char[] upper, String lang,
467 int condition) {
468 this .ch = ch;
469 this .lower = lower;
470 this .upper = upper;
471 this .lang = lang;
472 this .condition = condition;
473 }
474
475 int getCodePoint() {
476 return ch;
477 }
478
479 char[] getLowerCase() {
480 return lower;
481 }
482
483 char[] getUpperCase() {
484 return upper;
485 }
486
487 String getLanguage() {
488 return lang;
489 }
490
491 int getCondition() {
492 return condition;
493 }
494 }
495 }
|