001: /**
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.test.util;
007:
008: import java.util.*;
009: import com.ibm.icu.lang.UCharacter;
010: import com.ibm.icu.text.UTF16;
011: import com.ibm.icu.text.Transliterator;
012: import com.ibm.icu.text.UnicodeSet;
013:
014: /**
015: * Incrementally returns the set of all strings that case-fold to the same value.
016: */
017: public class CaseIterator {
018:
019: // testing stuff
020: private static final boolean DEBUG = true;
021: static Transliterator toName = Transliterator
022: .getInstance("[:^ascii:] Any-Name");
023: static Transliterator toHex = Transliterator
024: .getInstance("[:^ascii:] Any-Hex");
025: static Transliterator toHex2 = Transliterator
026: .getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");
027:
028: // global tables (could be precompiled)
029: private static Map fromCaseFold = new HashMap();
030: private static Map toCaseFold = new HashMap();
031: private static int maxLength = 0;
032:
033: // This exception list is generated on the console by turning on the GENERATED flag,
034: // which MUST be false for normal operation.
035: // Once the list is generated, it is pasted in here.
036: // A bit of a cludge, but this bootstrapping is the easiest way
037: // to get around certain complications in the data.
038:
039: private static final boolean GENERATE = false;
040:
041: private static final boolean DUMP = false;
042:
043: private static String[][] exceptionList = {
044: // a\N{MODIFIER LETTER RIGHT HALF RING}
045: { "a\u02BE", "A\u02BE", "a\u02BE", },
046: // ff
047: { "ff", "FF", "Ff", "fF", "ff", },
048: // ffi
049: { "ffi", "FFI", "FFi", "FfI", "Ffi", "F\uFB01", "fFI",
050: "fFi", "ffI", "ffi", "f\uFB01", "\uFB00I",
051: "\uFB00i", },
052: // ffl
053: { "ffl", "FFL", "FFl", "FfL", "Ffl", "F\uFB02", "fFL",
054: "fFl", "ffL", "ffl", "f\uFB02", "\uFB00L",
055: "\uFB00l", },
056: // fi
057: { "fi", "FI", "Fi", "fI", "fi", },
058: // fl
059: { "fl", "FL", "Fl", "fL", "fl", },
060: // h\N{COMBINING MACRON BELOW}
061: { "h\u0331", "H\u0331", "h\u0331", },
062: // i\N{COMBINING DOT ABOVE}
063: { "i\u0307", "I\u0307", "i\u0307", },
064: // j\N{COMBINING CARON}
065: { "j\u030C", "J\u030C", "j\u030C", },
066: // ss
067: { "ss", "SS", "Ss", "S\u017F", "sS", "ss", "s\u017F",
068: "\u017FS", "\u017Fs", "\u017F\u017F", },
069: // st
070: { "st", "ST", "St", "sT", "st", "\u017FT", "\u017Ft", },
071: // t\N{COMBINING DIAERESIS}
072: { "t\u0308", "T\u0308", "t\u0308", },
073: // w\N{COMBINING RING ABOVE}
074: { "w\u030A", "W\u030A", "w\u030A", },
075: // y\N{COMBINING RING ABOVE}
076: { "y\u030A", "Y\u030A", "y\u030A", },
077: // \N{MODIFIER LETTER APOSTROPHE}n
078: { "\u02BCn", "\u02BCN", "\u02BCn", },
079: // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
080: { "\u03AC\u03B9", "\u0386\u0345", "\u0386\u0399",
081: "\u0386\u03B9", "\u0386\u1FBE", "\u03AC\u0345",
082: "\u03AC\u0399", "\u03AC\u03B9", "\u03AC\u1FBE", },
083: // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
084: { "\u03AE\u03B9", "\u0389\u0345", "\u0389\u0399",
085: "\u0389\u03B9", "\u0389\u1FBE", "\u03AE\u0345",
086: "\u03AE\u0399", "\u03AE\u03B9", "\u03AE\u1FBE", },
087: // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
088: { "\u03B1\u0342", "\u0391\u0342", "\u03B1\u0342", },
089: // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
090: { "\u03B1\u0342\u03B9", "\u0391\u0342\u0345",
091: "\u0391\u0342\u0399", "\u0391\u0342\u03B9",
092: "\u0391\u0342\u1FBE", "\u03B1\u0342\u0345",
093: "\u03B1\u0342\u0399", "\u03B1\u0342\u03B9",
094: "\u03B1\u0342\u1FBE", "\u1FB6\u0345",
095: "\u1FB6\u0399", "\u1FB6\u03B9", "\u1FB6\u1FBE", },
096: // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
097: { "\u03B1\u03B9", "\u0391\u0345", "\u0391\u0399",
098: "\u0391\u03B9", "\u0391\u1FBE", "\u03B1\u0345",
099: "\u03B1\u0399", "\u03B1\u03B9", "\u03B1\u1FBE", },
100: // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
101: { "\u03B7\u0342", "\u0397\u0342", "\u03B7\u0342", },
102: // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
103: { "\u03B7\u0342\u03B9", "\u0397\u0342\u0345",
104: "\u0397\u0342\u0399", "\u0397\u0342\u03B9",
105: "\u0397\u0342\u1FBE", "\u03B7\u0342\u0345",
106: "\u03B7\u0342\u0399", "\u03B7\u0342\u03B9",
107: "\u03B7\u0342\u1FBE", "\u1FC6\u0345",
108: "\u1FC6\u0399", "\u1FC6\u03B9", "\u1FC6\u1FBE", },
109: // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
110: { "\u03B7\u03B9", "\u0397\u0345", "\u0397\u0399",
111: "\u0397\u03B9", "\u0397\u1FBE", "\u03B7\u0345",
112: "\u03B7\u0399", "\u03B7\u03B9", "\u03B7\u1FBE", },
113: // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
114: { "\u03B9\u0308\u0300", "\u0345\u0308\u0300",
115: "\u0399\u0308\u0300", "\u03B9\u0308\u0300",
116: "\u1FBE\u0308\u0300", },
117: // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
118: { "\u03B9\u0308\u0301", "\u0345\u0308\u0301",
119: "\u0399\u0308\u0301", "\u03B9\u0308\u0301",
120: "\u1FBE\u0308\u0301", },
121: // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
122: { "\u03B9\u0308\u0342", "\u0345\u0308\u0342",
123: "\u0399\u0308\u0342", "\u03B9\u0308\u0342",
124: "\u1FBE\u0308\u0342", },
125: // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
126: { "\u03B9\u0342", "\u0345\u0342", "\u0399\u0342",
127: "\u03B9\u0342", "\u1FBE\u0342", },
128: // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
129: { "\u03C1\u0313", "\u03A1\u0313", "\u03C1\u0313",
130: "\u03F1\u0313", },
131: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
132: { "\u03C5\u0308\u0300", "\u03A5\u0308\u0300",
133: "\u03C5\u0308\u0300", },
134: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
135: { "\u03C5\u0308\u0301", "\u03A5\u0308\u0301",
136: "\u03C5\u0308\u0301", },
137: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
138: { "\u03C5\u0308\u0342", "\u03A5\u0308\u0342",
139: "\u03C5\u0308\u0342", },
140: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
141: { "\u03C5\u0313", "\u03A5\u0313", "\u03C5\u0313", },
142: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
143: { "\u03C5\u0313\u0300", "\u03A5\u0313\u0300",
144: "\u03C5\u0313\u0300", "\u1F50\u0300", },
145: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
146: { "\u03C5\u0313\u0301", "\u03A5\u0313\u0301",
147: "\u03C5\u0313\u0301", "\u1F50\u0301", },
148: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
149: { "\u03C5\u0313\u0342", "\u03A5\u0313\u0342",
150: "\u03C5\u0313\u0342", "\u1F50\u0342", },
151: // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
152: { "\u03C5\u0342", "\u03A5\u0342", "\u03C5\u0342", },
153: // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
154: { "\u03C9\u0342", "\u03A9\u0342", "\u03C9\u0342",
155: "\u2126\u0342", },
156: // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
157: { "\u03C9\u0342\u03B9", "\u03A9\u0342\u0345",
158: "\u03A9\u0342\u0399", "\u03A9\u0342\u03B9",
159: "\u03A9\u0342\u1FBE", "\u03C9\u0342\u0345",
160: "\u03C9\u0342\u0399", "\u03C9\u0342\u03B9",
161: "\u03C9\u0342\u1FBE", "\u1FF6\u0345",
162: "\u1FF6\u0399", "\u1FF6\u03B9", "\u1FF6\u1FBE",
163: "\u2126\u0342\u0345", "\u2126\u0342\u0399",
164: "\u2126\u0342\u03B9", "\u2126\u0342\u1FBE", },
165: // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
166: { "\u03C9\u03B9", "\u03A9\u0345", "\u03A9\u0399",
167: "\u03A9\u03B9", "\u03A9\u1FBE", "\u03C9\u0345",
168: "\u03C9\u0399", "\u03C9\u03B9", "\u03C9\u1FBE",
169: "\u2126\u0345", "\u2126\u0399", "\u2126\u03B9",
170: "\u2126\u1FBE", },
171: // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
172: { "\u03CE\u03B9", "\u038F\u0345", "\u038F\u0399",
173: "\u038F\u03B9", "\u038F\u1FBE", "\u03CE\u0345",
174: "\u03CE\u0399", "\u03CE\u03B9", "\u03CE\u1FBE", },
175: // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
176: { "\u0565\u0582", "\u0535\u0552", "\u0535\u0582",
177: "\u0565\u0552", "\u0565\u0582", },
178: // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
179: { "\u0574\u0565", "\u0544\u0535", "\u0544\u0565",
180: "\u0574\u0535", "\u0574\u0565", },
181: // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
182: { "\u0574\u056B", "\u0544\u053B", "\u0544\u056B",
183: "\u0574\u053B", "\u0574\u056B", },
184: // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
185: { "\u0574\u056D", "\u0544\u053D", "\u0544\u056D",
186: "\u0574\u053D", "\u0574\u056D", },
187: // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
188: { "\u0574\u0576", "\u0544\u0546", "\u0544\u0576",
189: "\u0574\u0546", "\u0574\u0576", },
190: // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
191: { "\u057E\u0576", "\u054E\u0546", "\u054E\u0576",
192: "\u057E\u0546", "\u057E\u0576", },
193: // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
194: { "\u1F00\u03B9", "\u1F00\u0345", "\u1F00\u0399",
195: "\u1F00\u03B9", "\u1F00\u1FBE", "\u1F08\u0345",
196: "\u1F08\u0399", "\u1F08\u03B9", "\u1F08\u1FBE", },
197: // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
198: { "\u1F01\u03B9", "\u1F01\u0345", "\u1F01\u0399",
199: "\u1F01\u03B9", "\u1F01\u1FBE", "\u1F09\u0345",
200: "\u1F09\u0399", "\u1F09\u03B9", "\u1F09\u1FBE", },
201: // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
202: { "\u1F02\u03B9", "\u1F02\u0345", "\u1F02\u0399",
203: "\u1F02\u03B9", "\u1F02\u1FBE", "\u1F0A\u0345",
204: "\u1F0A\u0399", "\u1F0A\u03B9", "\u1F0A\u1FBE", },
205: // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
206: { "\u1F03\u03B9", "\u1F03\u0345", "\u1F03\u0399",
207: "\u1F03\u03B9", "\u1F03\u1FBE", "\u1F0B\u0345",
208: "\u1F0B\u0399", "\u1F0B\u03B9", "\u1F0B\u1FBE", },
209: // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
210: { "\u1F04\u03B9", "\u1F04\u0345", "\u1F04\u0399",
211: "\u1F04\u03B9", "\u1F04\u1FBE", "\u1F0C\u0345",
212: "\u1F0C\u0399", "\u1F0C\u03B9", "\u1F0C\u1FBE", },
213: // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
214: { "\u1F05\u03B9", "\u1F05\u0345", "\u1F05\u0399",
215: "\u1F05\u03B9", "\u1F05\u1FBE", "\u1F0D\u0345",
216: "\u1F0D\u0399", "\u1F0D\u03B9", "\u1F0D\u1FBE", },
217: // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
218: { "\u1F06\u03B9", "\u1F06\u0345", "\u1F06\u0399",
219: "\u1F06\u03B9", "\u1F06\u1FBE", "\u1F0E\u0345",
220: "\u1F0E\u0399", "\u1F0E\u03B9", "\u1F0E\u1FBE", },
221: // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
222: { "\u1F07\u03B9", "\u1F07\u0345", "\u1F07\u0399",
223: "\u1F07\u03B9", "\u1F07\u1FBE", "\u1F0F\u0345",
224: "\u1F0F\u0399", "\u1F0F\u03B9", "\u1F0F\u1FBE", },
225: // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
226: { "\u1F20\u03B9", "\u1F20\u0345", "\u1F20\u0399",
227: "\u1F20\u03B9", "\u1F20\u1FBE", "\u1F28\u0345",
228: "\u1F28\u0399", "\u1F28\u03B9", "\u1F28\u1FBE", },
229: // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
230: { "\u1F21\u03B9", "\u1F21\u0345", "\u1F21\u0399",
231: "\u1F21\u03B9", "\u1F21\u1FBE", "\u1F29\u0345",
232: "\u1F29\u0399", "\u1F29\u03B9", "\u1F29\u1FBE", },
233: // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
234: { "\u1F22\u03B9", "\u1F22\u0345", "\u1F22\u0399",
235: "\u1F22\u03B9", "\u1F22\u1FBE", "\u1F2A\u0345",
236: "\u1F2A\u0399", "\u1F2A\u03B9", "\u1F2A\u1FBE", },
237: // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
238: { "\u1F23\u03B9", "\u1F23\u0345", "\u1F23\u0399",
239: "\u1F23\u03B9", "\u1F23\u1FBE", "\u1F2B\u0345",
240: "\u1F2B\u0399", "\u1F2B\u03B9", "\u1F2B\u1FBE", },
241: // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
242: { "\u1F24\u03B9", "\u1F24\u0345", "\u1F24\u0399",
243: "\u1F24\u03B9", "\u1F24\u1FBE", "\u1F2C\u0345",
244: "\u1F2C\u0399", "\u1F2C\u03B9", "\u1F2C\u1FBE", },
245: // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
246: { "\u1F25\u03B9", "\u1F25\u0345", "\u1F25\u0399",
247: "\u1F25\u03B9", "\u1F25\u1FBE", "\u1F2D\u0345",
248: "\u1F2D\u0399", "\u1F2D\u03B9", "\u1F2D\u1FBE", },
249: // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
250: { "\u1F26\u03B9", "\u1F26\u0345", "\u1F26\u0399",
251: "\u1F26\u03B9", "\u1F26\u1FBE", "\u1F2E\u0345",
252: "\u1F2E\u0399", "\u1F2E\u03B9", "\u1F2E\u1FBE", },
253: // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
254: { "\u1F27\u03B9", "\u1F27\u0345", "\u1F27\u0399",
255: "\u1F27\u03B9", "\u1F27\u1FBE", "\u1F2F\u0345",
256: "\u1F2F\u0399", "\u1F2F\u03B9", "\u1F2F\u1FBE", },
257: // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
258: { "\u1F60\u03B9", "\u1F60\u0345", "\u1F60\u0399",
259: "\u1F60\u03B9", "\u1F60\u1FBE", "\u1F68\u0345",
260: "\u1F68\u0399", "\u1F68\u03B9", "\u1F68\u1FBE", },
261: // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
262: { "\u1F61\u03B9", "\u1F61\u0345", "\u1F61\u0399",
263: "\u1F61\u03B9", "\u1F61\u1FBE", "\u1F69\u0345",
264: "\u1F69\u0399", "\u1F69\u03B9", "\u1F69\u1FBE", },
265: // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
266: { "\u1F62\u03B9", "\u1F62\u0345", "\u1F62\u0399",
267: "\u1F62\u03B9", "\u1F62\u1FBE", "\u1F6A\u0345",
268: "\u1F6A\u0399", "\u1F6A\u03B9", "\u1F6A\u1FBE", },
269: // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
270: { "\u1F63\u03B9", "\u1F63\u0345", "\u1F63\u0399",
271: "\u1F63\u03B9", "\u1F63\u1FBE", "\u1F6B\u0345",
272: "\u1F6B\u0399", "\u1F6B\u03B9", "\u1F6B\u1FBE", },
273: // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
274: { "\u1F64\u03B9", "\u1F64\u0345", "\u1F64\u0399",
275: "\u1F64\u03B9", "\u1F64\u1FBE", "\u1F6C\u0345",
276: "\u1F6C\u0399", "\u1F6C\u03B9", "\u1F6C\u1FBE", },
277: // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
278: { "\u1F65\u03B9", "\u1F65\u0345", "\u1F65\u0399",
279: "\u1F65\u03B9", "\u1F65\u1FBE", "\u1F6D\u0345",
280: "\u1F6D\u0399", "\u1F6D\u03B9", "\u1F6D\u1FBE", },
281: // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
282: { "\u1F66\u03B9", "\u1F66\u0345", "\u1F66\u0399",
283: "\u1F66\u03B9", "\u1F66\u1FBE", "\u1F6E\u0345",
284: "\u1F6E\u0399", "\u1F6E\u03B9", "\u1F6E\u1FBE", },
285: // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
286: { "\u1F67\u03B9", "\u1F67\u0345", "\u1F67\u0399",
287: "\u1F67\u03B9", "\u1F67\u1FBE", "\u1F6F\u0345",
288: "\u1F6F\u0399", "\u1F6F\u03B9", "\u1F6F\u1FBE", },
289: // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
290: { "\u1F70\u03B9", "\u1F70\u0345", "\u1F70\u0399",
291: "\u1F70\u03B9", "\u1F70\u1FBE", "\u1FBA\u0345",
292: "\u1FBA\u0399", "\u1FBA\u03B9", "\u1FBA\u1FBE", },
293: // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
294: { "\u1F74\u03B9", "\u1F74\u0345", "\u1F74\u0399",
295: "\u1F74\u03B9", "\u1F74\u1FBE", "\u1FCA\u0345",
296: "\u1FCA\u0399", "\u1FCA\u03B9", "\u1FCA\u1FBE", },
297: // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
298: { "\u1F7C\u03B9", "\u1F7C\u0345", "\u1F7C\u0399",
299: "\u1F7C\u03B9", "\u1F7C\u1FBE", "\u1FFA\u0345",
300: "\u1FFA\u0399", "\u1FFA\u03B9", "\u1FFA\u1FBE", }, };
301:
302: // this initializes the data used to generated the case-equivalents
303:
304: static {
305:
306: // Gather up the exceptions in a form we can use
307:
308: if (!GENERATE) {
309: for (int i = 0; i < exceptionList.length; ++i) {
310: String[] exception = exceptionList[i];
311: Set s = new HashSet();
312: // there has to be some method to do the following, but I can't find it in the collections
313: for (int j = 0; j < exception.length; ++j) {
314: s.add(exception[j]);
315: }
316: fromCaseFold.put(exception[0], s);
317: }
318: }
319:
320: // walk through all the characters, and at every case fold result,
321: // put a set of all the characters that map to that result
322:
323: boolean defaultmapping = true; // false for turkish
324: for (int i = 0; i <= 0x10FFFF; ++i) {
325: int cat = UCharacter.getType(i);
326: if (cat == Character.UNASSIGNED
327: || cat == Character.PRIVATE_USE)
328: continue;
329:
330: String cp = UTF16.valueOf(i);
331: String mapped = UCharacter.foldCase(cp, defaultmapping);
332: if (mapped.equals(cp))
333: continue;
334:
335: if (maxLength < mapped.length())
336: maxLength = mapped.length();
337:
338: // at this point, have different case folding
339:
340: Set s = (Set) fromCaseFold.get(mapped);
341: if (s == null) {
342: s = new HashSet();
343: s.add(mapped); // add the case fold result itself
344: fromCaseFold.put(mapped, s);
345: }
346: s.add(cp);
347: toCaseFold.put(cp, mapped);
348: toCaseFold.put(mapped, mapped); // add mapping to self
349: }
350:
351: // Emit the final data
352:
353: if (DUMP) {
354: System.out.println("maxLength = " + maxLength);
355:
356: System.out.println("\nfromCaseFold:");
357: Iterator it = fromCaseFold.keySet().iterator();
358: while (it.hasNext()) {
359: Object key = it.next();
360: System.out.print(" "
361: + toHex2.transliterate((String) key) + ": ");
362: Set s = (Set) fromCaseFold.get(key);
363: Iterator it2 = s.iterator();
364: boolean first = true;
365: while (it2.hasNext()) {
366: if (first) {
367: first = false;
368: } else {
369: System.out.print(", ");
370: }
371: System.out.print(toHex2.transliterate((String) it2
372: .next()));
373: }
374: System.out.println("");
375: }
376:
377: System.out.println("\ntoCaseFold:");
378: it = toCaseFold.keySet().iterator();
379: while (it.hasNext()) {
380: String key = (String) it.next();
381: String value = (String) toCaseFold.get(key);
382: System.out.println(" " + toHex2.transliterate(key)
383: + ": " + toHex2.transliterate(value));
384: }
385: }
386:
387: // Now convert all those sets into linear arrays
388: // We can't do this in place in Java, so make a temporary target array
389:
390: // Note: This could be transformed into a single array, with offsets into it.
391: // Might be best choice in C.
392:
393: Map fromCaseFold2 = new HashMap();
394: Iterator it = fromCaseFold.keySet().iterator();
395: while (it.hasNext()) {
396: Object key = it.next();
397: Set s = (Set) fromCaseFold.get(key);
398: String[] temp = new String[s.size()];
399: s.toArray(temp);
400: fromCaseFold2.put(key, temp);
401: }
402: fromCaseFold = fromCaseFold2;
403:
404: // We have processed everything, so the iterator will now work
405: // The following is normally OFF.
406: // It is here to generate (under the GENERATE flag) the static exception list.
407: // It must be at the very end of initialization, so that the iterator is functional.
408: // (easiest to do it that way)
409:
410: if (GENERATE) {
411:
412: // first get small set of items that have multiple characters
413:
414: Set multichars = new TreeSet();
415: it = fromCaseFold.keySet().iterator();
416: while (it.hasNext()) {
417: String key = (String) it.next();
418: if (UTF16.countCodePoint(key) < 2)
419: continue;
420: multichars.add(key);
421: }
422:
423: // now we will go through each of them.
424:
425: CaseIterator ci = new CaseIterator();
426: it = multichars.iterator();
427:
428: while (it.hasNext()) {
429: String key = (String) it.next();
430:
431: // here is a nasty complication. Take 'ffi' ligature. We
432: // can't just close it, since we would miss the combination
433: // that includes the 'fi' => "fi" ligature
434: // so first do a pass through, and add substring combinations
435: // we call this a 'partial closure'
436:
437: Set partialClosure = new TreeSet();
438: partialClosure.add(key);
439:
440: if (UTF16.countCodePoint(key) > 2) {
441: Iterator multiIt2 = multichars.iterator();
442: while (multiIt2.hasNext()) {
443: String otherKey = (String) multiIt2.next();
444: if (otherKey.length() >= key.length())
445: continue;
446: int pos = -1;
447: while (true) {
448: // The following is not completely general
449: // but works for the actual cased stuff,
450: // and should work for future characters, since we won't have
451: // more ligatures & other oddities.
452: pos = key.indexOf(otherKey, pos + 1);
453: if (pos < 0)
454: break;
455: int endPos = pos + otherKey.length();
456: // we know we have a proper substring,
457: // so get the combinations
458: String[] choices = (String[]) fromCaseFold
459: .get(otherKey);
460: for (int ii = 0; ii < choices.length; ++ii) {
461: String patchwork = key
462: .substring(0, pos)
463: + choices[ii]
464: + key.substring(endPos);
465: partialClosure.add(patchwork);
466: }
467: }
468: }
469: }
470:
471: // now, for each thing in the partial closure, get its
472: // case closure and add it to the final result.
473:
474: Set closure = new TreeSet(); // this will be the real closure
475: Iterator partialIt = partialClosure.iterator();
476: while (partialIt.hasNext()) {
477: String key2 = (String) partialIt.next();
478: ci.reset(key2);
479: for (String temp = ci.next(); temp != null; temp = ci
480: .next()) {
481: closure.add(temp);
482: }
483: // form closure
484: /*String[] choices = (String[]) fromCaseFold.get(key2);
485: for (int i = 0; i < choices.length; ++i) {
486: ci.reset(choices[i]);
487: String temp;
488: while (null != (temp = ci.next())) {
489: closure.add(temp);
490: }
491: }
492: */
493: }
494:
495: // print it out, so that it can be cut and pasted back into this document.
496:
497: Iterator it2 = closure.iterator();
498: System.out.println("\t// " + toName.transliterate(key));
499: System.out.print("\t{\"" + toHex.transliterate(key)
500: + "\",");
501: while (it2.hasNext()) {
502: String item = (String) it2.next();
503: System.out.print("\"" + toHex.transliterate(item)
504: + "\",");
505: }
506: System.out.println("},");
507: }
508: }
509: }
510:
511: // ============ PRIVATE CLASS DATA ============
512:
513: // pieces that we will put together
514: // is not changed during iteration
515: private int count = 0;
516: private String[][] variants;
517:
518: // state information, changes during iteration
519: private boolean done = false;
520: private int[] counts;
521:
522: // internal buffer for efficiency
523: private StringBuffer nextBuffer = new StringBuffer();
524:
525: // ========================
526:
527: /**
528: * Reset to different source. Once reset, the iteration starts from the beginning.
529: * @param source The string to get case variants for
530: */
531: public void reset(String source) {
532:
533: // allocate arrays to store pieces
534: // using length might be slightly too long, but we don't care much
535:
536: counts = new int[source.length()];
537: variants = new String[source.length()][];
538:
539: // walk through the source, and break up into pieces
540: // each piece becomes an array of equivalent values
541: // TODO: could optimized this later to coalesce all single string pieces
542:
543: String piece = null;
544: count = 0;
545: for (int i = 0; i < source.length(); i += piece.length()) {
546:
547: // find *longest* matching piece
548: String caseFold = null;
549:
550: if (GENERATE) {
551: // do exactly one CP
552: piece = UTF16.valueOf(source, i);
553: caseFold = (String) toCaseFold.get(piece);
554: } else {
555: int max = i + maxLength;
556: if (max > source.length())
557: max = source.length();
558: for (int j = max; j > i; --j) {
559: piece = source.substring(i, j);
560: caseFold = (String) toCaseFold.get(piece);
561: if (caseFold != null)
562: break;
563: }
564: }
565:
566: // if we fail, pick one code point
567: if (caseFold == null) {
568: piece = UTF16.valueOf(source, i);
569: variants[count++] = new String[] { piece }; // single item string
570: } else {
571: variants[count++] = (String[]) fromCaseFold
572: .get(caseFold);
573: }
574: }
575: reset();
576: }
577:
578: /**
579: * Restart the iteration from the beginning, but with same source
580: */
581: public void reset() {
582: done = false;
583: for (int i = 0; i < count; ++i) {
584: counts[i] = 0;
585: }
586: }
587:
588: /**
589: * Iterates through the case variants.
590: * @return next case variant. Each variant will case-fold to the same value as the source will.
591: * When the iteration is done, null is returned.
592: */
593: public String next() {
594:
595: if (done)
596: return null;
597: int i;
598:
599: // TODO Optimize so we keep the piece before and after the current position
600: // so we don't have so much concatenation
601:
602: // get the result, a concatenation
603:
604: nextBuffer.setLength(0);
605: for (i = 0; i < count; ++i) {
606: nextBuffer.append(variants[i][counts[i]]);
607: }
608:
609: // find the next right set of pieces to concatenate
610:
611: for (i = count - 1; i >= 0; --i) {
612: counts[i]++;
613: if (counts[i] < variants[i].length)
614: break;
615: counts[i] = 0;
616: }
617:
618: // if we go too far, bail
619:
620: if (i < 0) {
621: done = true;
622: }
623:
624: return nextBuffer.toString();
625: }
626:
627: /**
628: * Temporary test, just to see how the stuff works.
629: */
630: static public void main(String[] args) {
631: String[] testCases = { "fiss", "h\u03a3" };
632: CaseIterator ci = new CaseIterator();
633:
634: for (int i = 0; i < testCases.length; ++i) {
635: String item = testCases[i];
636: System.out.println();
637: System.out
638: .println("Testing: " + toName.transliterate(item));
639: System.out.println();
640: ci.reset(item);
641: int count = 0;
642: for (String temp = ci.next(); temp != null; temp = ci
643: .next()) {
644: System.out.println(toName.transliterate(temp));
645: count++;
646: }
647: System.out.println("Total: " + count);
648: }
649:
650: // generate a list of all caseless characters -- characters whose
651: // case closure is themselves.
652:
653: UnicodeSet caseless = new UnicodeSet();
654:
655: for (int i = 0; i <= 0x10FFFF; ++i) {
656: String cp = UTF16.valueOf(i);
657: ci.reset(cp);
658: int count = 0;
659: String fold = null;
660: for (String temp = ci.next(); temp != null; temp = ci
661: .next()) {
662: fold = temp;
663: if (++count > 1)
664: break;
665: }
666: if (count == 1 && fold.equals(cp)) {
667: caseless.add(i);
668: }
669: }
670:
671: System.out.println("caseless = " + caseless.toPattern(true));
672:
673: UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
674:
675: UnicodeSet a = new UnicodeSet();
676: a.set(not_lc);
677: a.removeAll(caseless);
678: System.out.println("[:^lc:] - caseless = " + a.toPattern(true));
679:
680: a.set(caseless);
681: a.removeAll(not_lc);
682: System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
683: }
684: }
|