Source Code Cross Referenced for CaseIterator.java in » Internationalization-Localization » icu4j » com » ibm » icu » dev » demo » translit » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.dev.demo.translit
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /**
002:         *******************************************************************************
003:         * Copyright (C) 1996-2004, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         */package com.ibm.icu.dev.demo.translit;
007:
008:        import java.util.*;
009:        import com.ibm.icu.lang.UCharacter;
010:        import com.ibm.icu.text.UTF16;
011:        import com.ibm.icu.text.Transliterator;
012:        import com.ibm.icu.text.UnicodeSet;
013:
014:        /**
015:         * Incrementally returns the set of all strings that case-fold to the same value.
016:         */
017:        public class CaseIterator {
018:
019:            // Debugging aids: a debug switch plus transliterators for printing non-ASCII text in a readable form.
020:            private static final boolean DEBUG = true; // debug switch (currently on)
021:            static Transliterator toName = Transliterator
022:                    .getInstance("[:^ascii:] Any-Name"); // Any-Name, limited to non-ASCII by the [:^ascii:] filter
023:            static Transliterator toHex = Transliterator
024:                    .getInstance("[:^ascii:] Any-Hex"); // Any-Hex, limited to non-ASCII by the [:^ascii:] filter
025:            static Transliterator toHex2 = Transliterator
026:                    .getInstance("[[^\u0021-\u007F]-[,]] Any-Hex"); // Any-Hex with a custom filter set; used by the DUMP output in the static initializer
027:
028:            // global tables (could be precompiled); all three are filled in by the static initializer below
029:            private static Map fromCaseFold = new HashMap(); // case-folded String -> Set of Strings that fold to it while being built; the initializer then replaces it with a map whose values are String[]
030:            private static Map toCaseFold = new HashMap(); // String -> its case-folded String; each folded form is also mapped to itself
031:            private static int maxLength = 0; // largest String.length() of any case-fold result seen during initialization
032:
033:            // This exception list is generated on the console by turning on the GENERATE flag,
034:            // which MUST be false for normal operation.
035:            // Once the list is generated, it is pasted in here.
036:            // A bit of a kludge, but this bootstrapping is the easiest way
037:            // to get around certain complications in the data.
038:
039:            private static final boolean GENERATE = false; // false: exceptionList is loaded into fromCaseFold during static initialization; true: that load is skipped and the generation pass at the end of the static initializer runs instead
040:
041:            private static final boolean DUMP = false; // true: the static initializer prints maxLength and the fromCaseFold/toCaseFold tables to System.out
042:
043:            private static String[][] exceptionList = { // when GENERATE is false, the static initializer stores each row as a Set in fromCaseFold, keyed by the row's first element; the comment above each row spells that key with character names
044:            // a\N{MODIFIER LETTER RIGHT HALF RING}
045:                    { "a\u02BE", "A\u02BE", "a\u02BE", },
046:                    // ff
047:                    { "ff", "FF", "Ff", "fF", "ff", },
048:                    // ffi
049:                    { "ffi", "FFI", "FFi", "FfI", "Ffi", "F\uFB01", "fFI",
050:                            "fFi", "ffI", "ffi", "f\uFB01", "\uFB00I",
051:                            "\uFB00i", },
052:                    // ffl
053:                    { "ffl", "FFL", "FFl", "FfL", "Ffl", "F\uFB02", "fFL",
054:                            "fFl", "ffL", "ffl", "f\uFB02", "\uFB00L",
055:                            "\uFB00l", },
056:                    // fi
057:                    { "fi", "FI", "Fi", "fI", "fi", },
058:                    // fl
059:                    { "fl", "FL", "Fl", "fL", "fl", },
060:                    // h\N{COMBINING MACRON BELOW}
061:                    { "h\u0331", "H\u0331", "h\u0331", },
062:                    // i\N{COMBINING DOT ABOVE}
063:                    { "i\u0307", "I\u0307", "i\u0307", },
064:                    // j\N{COMBINING CARON}
065:                    { "j\u030C", "J\u030C", "j\u030C", },
066:                    // ss
067:                    { "ss", "SS", "Ss", "S\u017F", "sS", "ss", "s\u017F",
068:                            "\u017FS", "\u017Fs", "\u017F\u017F", },
069:                    // st
070:                    { "st", "ST", "St", "sT", "st", "\u017FT", "\u017Ft", },
071:                    // t\N{COMBINING DIAERESIS}
072:                    { "t\u0308", "T\u0308", "t\u0308", },
073:                    // w\N{COMBINING RING ABOVE}
074:                    { "w\u030A", "W\u030A", "w\u030A", },
075:                    // y\N{COMBINING RING ABOVE}
076:                    { "y\u030A", "Y\u030A", "y\u030A", },
077:                    // \N{MODIFIER LETTER APOSTROPHE}n
078:                    { "\u02BCn", "\u02BCN", "\u02BCn", },
079:                    // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
080:                    { "\u03AC\u03B9", "\u0386\u0345", "\u0386\u0399",
081:                            "\u0386\u03B9", "\u0386\u1FBE", "\u03AC\u0345",
082:                            "\u03AC\u0399", "\u03AC\u03B9", "\u03AC\u1FBE", },
083:                    // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
084:                    { "\u03AE\u03B9", "\u0389\u0345", "\u0389\u0399",
085:                            "\u0389\u03B9", "\u0389\u1FBE", "\u03AE\u0345",
086:                            "\u03AE\u0399", "\u03AE\u03B9", "\u03AE\u1FBE", },
087:                    // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
088:                    { "\u03B1\u0342", "\u0391\u0342", "\u03B1\u0342", },
089:                    // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
090:                    { "\u03B1\u0342\u03B9", "\u0391\u0342\u0345",
091:                            "\u0391\u0342\u0399", "\u0391\u0342\u03B9",
092:                            "\u0391\u0342\u1FBE", "\u03B1\u0342\u0345",
093:                            "\u03B1\u0342\u0399", "\u03B1\u0342\u03B9",
094:                            "\u03B1\u0342\u1FBE", "\u1FB6\u0345",
095:                            "\u1FB6\u0399", "\u1FB6\u03B9", "\u1FB6\u1FBE", },
096:                    // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
097:                    { "\u03B1\u03B9", "\u0391\u0345", "\u0391\u0399",
098:                            "\u0391\u03B9", "\u0391\u1FBE", "\u03B1\u0345",
099:                            "\u03B1\u0399", "\u03B1\u03B9", "\u03B1\u1FBE", },
100:                    // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
101:                    { "\u03B7\u0342", "\u0397\u0342", "\u03B7\u0342", },
102:                    // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
103:                    { "\u03B7\u0342\u03B9", "\u0397\u0342\u0345",
104:                            "\u0397\u0342\u0399", "\u0397\u0342\u03B9",
105:                            "\u0397\u0342\u1FBE", "\u03B7\u0342\u0345",
106:                            "\u03B7\u0342\u0399", "\u03B7\u0342\u03B9",
107:                            "\u03B7\u0342\u1FBE", "\u1FC6\u0345",
108:                            "\u1FC6\u0399", "\u1FC6\u03B9", "\u1FC6\u1FBE", },
109:                    // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
110:                    { "\u03B7\u03B9", "\u0397\u0345", "\u0397\u0399",
111:                            "\u0397\u03B9", "\u0397\u1FBE", "\u03B7\u0345",
112:                            "\u03B7\u0399", "\u03B7\u03B9", "\u03B7\u1FBE", },
113:                    // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
114:                    { "\u03B9\u0308\u0300", "\u0345\u0308\u0300",
115:                            "\u0399\u0308\u0300", "\u03B9\u0308\u0300",
116:                            "\u1FBE\u0308\u0300", },
117:                    // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
118:                    { "\u03B9\u0308\u0301", "\u0345\u0308\u0301",
119:                            "\u0399\u0308\u0301", "\u03B9\u0308\u0301",
120:                            "\u1FBE\u0308\u0301", },
121:                    // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
122:                    { "\u03B9\u0308\u0342", "\u0345\u0308\u0342",
123:                            "\u0399\u0308\u0342", "\u03B9\u0308\u0342",
124:                            "\u1FBE\u0308\u0342", },
125:                    // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
126:                    { "\u03B9\u0342", "\u0345\u0342", "\u0399\u0342",
127:                            "\u03B9\u0342", "\u1FBE\u0342", },
128:                    // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
129:                    { "\u03C1\u0313", "\u03A1\u0313", "\u03C1\u0313",
130:                            "\u03F1\u0313", },
131:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
132:                    { "\u03C5\u0308\u0300", "\u03A5\u0308\u0300",
133:                            "\u03C5\u0308\u0300", },
134:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
135:                    { "\u03C5\u0308\u0301", "\u03A5\u0308\u0301",
136:                            "\u03C5\u0308\u0301", },
137:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
138:                    { "\u03C5\u0308\u0342", "\u03A5\u0308\u0342",
139:                            "\u03C5\u0308\u0342", },
140:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
141:                    { "\u03C5\u0313", "\u03A5\u0313", "\u03C5\u0313", },
142:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
143:                    { "\u03C5\u0313\u0300", "\u03A5\u0313\u0300",
144:                            "\u03C5\u0313\u0300", "\u1F50\u0300", },
145:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
146:                    { "\u03C5\u0313\u0301", "\u03A5\u0313\u0301",
147:                            "\u03C5\u0313\u0301", "\u1F50\u0301", },
148:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
149:                    { "\u03C5\u0313\u0342", "\u03A5\u0313\u0342",
150:                            "\u03C5\u0313\u0342", "\u1F50\u0342", },
151:                    // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
152:                    { "\u03C5\u0342", "\u03A5\u0342", "\u03C5\u0342", },
153:                    // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
154:                    { "\u03C9\u0342", "\u03A9\u0342", "\u03C9\u0342",
155:                            "\u2126\u0342", },
156:                    // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
157:                    { "\u03C9\u0342\u03B9", "\u03A9\u0342\u0345",
158:                            "\u03A9\u0342\u0399", "\u03A9\u0342\u03B9",
159:                            "\u03A9\u0342\u1FBE", "\u03C9\u0342\u0345",
160:                            "\u03C9\u0342\u0399", "\u03C9\u0342\u03B9",
161:                            "\u03C9\u0342\u1FBE", "\u1FF6\u0345",
162:                            "\u1FF6\u0399", "\u1FF6\u03B9", "\u1FF6\u1FBE",
163:                            "\u2126\u0342\u0345", "\u2126\u0342\u0399",
164:                            "\u2126\u0342\u03B9", "\u2126\u0342\u1FBE", },
165:                    // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
166:                    { "\u03C9\u03B9", "\u03A9\u0345", "\u03A9\u0399",
167:                            "\u03A9\u03B9", "\u03A9\u1FBE", "\u03C9\u0345",
168:                            "\u03C9\u0399", "\u03C9\u03B9", "\u03C9\u1FBE",
169:                            "\u2126\u0345", "\u2126\u0399", "\u2126\u03B9",
170:                            "\u2126\u1FBE", },
171:                    // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
172:                    { "\u03CE\u03B9", "\u038F\u0345", "\u038F\u0399",
173:                            "\u038F\u03B9", "\u038F\u1FBE", "\u03CE\u0345",
174:                            "\u03CE\u0399", "\u03CE\u03B9", "\u03CE\u1FBE", },
175:                    // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
176:                    { "\u0565\u0582", "\u0535\u0552", "\u0535\u0582",
177:                            "\u0565\u0552", "\u0565\u0582", },
178:                    // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
179:                    { "\u0574\u0565", "\u0544\u0535", "\u0544\u0565",
180:                            "\u0574\u0535", "\u0574\u0565", },
181:                    // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
182:                    { "\u0574\u056B", "\u0544\u053B", "\u0544\u056B",
183:                            "\u0574\u053B", "\u0574\u056B", },
184:                    // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
185:                    { "\u0574\u056D", "\u0544\u053D", "\u0544\u056D",
186:                            "\u0574\u053D", "\u0574\u056D", },
187:                    // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
188:                    { "\u0574\u0576", "\u0544\u0546", "\u0544\u0576",
189:                            "\u0574\u0546", "\u0574\u0576", },
190:                    // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
191:                    { "\u057E\u0576", "\u054E\u0546", "\u054E\u0576",
192:                            "\u057E\u0546", "\u057E\u0576", },
193:                    // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
194:                    { "\u1F00\u03B9", "\u1F00\u0345", "\u1F00\u0399",
195:                            "\u1F00\u03B9", "\u1F00\u1FBE", "\u1F08\u0345",
196:                            "\u1F08\u0399", "\u1F08\u03B9", "\u1F08\u1FBE", },
197:                    // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
198:                    { "\u1F01\u03B9", "\u1F01\u0345", "\u1F01\u0399",
199:                            "\u1F01\u03B9", "\u1F01\u1FBE", "\u1F09\u0345",
200:                            "\u1F09\u0399", "\u1F09\u03B9", "\u1F09\u1FBE", },
201:                    // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
202:                    { "\u1F02\u03B9", "\u1F02\u0345", "\u1F02\u0399",
203:                            "\u1F02\u03B9", "\u1F02\u1FBE", "\u1F0A\u0345",
204:                            "\u1F0A\u0399", "\u1F0A\u03B9", "\u1F0A\u1FBE", },
205:                    // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
206:                    { "\u1F03\u03B9", "\u1F03\u0345", "\u1F03\u0399",
207:                            "\u1F03\u03B9", "\u1F03\u1FBE", "\u1F0B\u0345",
208:                            "\u1F0B\u0399", "\u1F0B\u03B9", "\u1F0B\u1FBE", },
209:                    // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
210:                    { "\u1F04\u03B9", "\u1F04\u0345", "\u1F04\u0399",
211:                            "\u1F04\u03B9", "\u1F04\u1FBE", "\u1F0C\u0345",
212:                            "\u1F0C\u0399", "\u1F0C\u03B9", "\u1F0C\u1FBE", },
213:                    // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
214:                    { "\u1F05\u03B9", "\u1F05\u0345", "\u1F05\u0399",
215:                            "\u1F05\u03B9", "\u1F05\u1FBE", "\u1F0D\u0345",
216:                            "\u1F0D\u0399", "\u1F0D\u03B9", "\u1F0D\u1FBE", },
217:                    // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
218:                    { "\u1F06\u03B9", "\u1F06\u0345", "\u1F06\u0399",
219:                            "\u1F06\u03B9", "\u1F06\u1FBE", "\u1F0E\u0345",
220:                            "\u1F0E\u0399", "\u1F0E\u03B9", "\u1F0E\u1FBE", },
221:                    // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
222:                    { "\u1F07\u03B9", "\u1F07\u0345", "\u1F07\u0399",
223:                            "\u1F07\u03B9", "\u1F07\u1FBE", "\u1F0F\u0345",
224:                            "\u1F0F\u0399", "\u1F0F\u03B9", "\u1F0F\u1FBE", },
225:                    // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
226:                    { "\u1F20\u03B9", "\u1F20\u0345", "\u1F20\u0399",
227:                            "\u1F20\u03B9", "\u1F20\u1FBE", "\u1F28\u0345",
228:                            "\u1F28\u0399", "\u1F28\u03B9", "\u1F28\u1FBE", },
229:                    // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
230:                    { "\u1F21\u03B9", "\u1F21\u0345", "\u1F21\u0399",
231:                            "\u1F21\u03B9", "\u1F21\u1FBE", "\u1F29\u0345",
232:                            "\u1F29\u0399", "\u1F29\u03B9", "\u1F29\u1FBE", },
233:                    // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
234:                    { "\u1F22\u03B9", "\u1F22\u0345", "\u1F22\u0399",
235:                            "\u1F22\u03B9", "\u1F22\u1FBE", "\u1F2A\u0345",
236:                            "\u1F2A\u0399", "\u1F2A\u03B9", "\u1F2A\u1FBE", },
237:                    // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
238:                    { "\u1F23\u03B9", "\u1F23\u0345", "\u1F23\u0399",
239:                            "\u1F23\u03B9", "\u1F23\u1FBE", "\u1F2B\u0345",
240:                            "\u1F2B\u0399", "\u1F2B\u03B9", "\u1F2B\u1FBE", },
241:                    // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
242:                    { "\u1F24\u03B9", "\u1F24\u0345", "\u1F24\u0399",
243:                            "\u1F24\u03B9", "\u1F24\u1FBE", "\u1F2C\u0345",
244:                            "\u1F2C\u0399", "\u1F2C\u03B9", "\u1F2C\u1FBE", },
245:                    // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
246:                    { "\u1F25\u03B9", "\u1F25\u0345", "\u1F25\u0399",
247:                            "\u1F25\u03B9", "\u1F25\u1FBE", "\u1F2D\u0345",
248:                            "\u1F2D\u0399", "\u1F2D\u03B9", "\u1F2D\u1FBE", },
249:                    // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
250:                    { "\u1F26\u03B9", "\u1F26\u0345", "\u1F26\u0399",
251:                            "\u1F26\u03B9", "\u1F26\u1FBE", "\u1F2E\u0345",
252:                            "\u1F2E\u0399", "\u1F2E\u03B9", "\u1F2E\u1FBE", },
253:                    // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
254:                    { "\u1F27\u03B9", "\u1F27\u0345", "\u1F27\u0399",
255:                            "\u1F27\u03B9", "\u1F27\u1FBE", "\u1F2F\u0345",
256:                            "\u1F2F\u0399", "\u1F2F\u03B9", "\u1F2F\u1FBE", },
257:                    // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
258:                    { "\u1F60\u03B9", "\u1F60\u0345", "\u1F60\u0399",
259:                            "\u1F60\u03B9", "\u1F60\u1FBE", "\u1F68\u0345",
260:                            "\u1F68\u0399", "\u1F68\u03B9", "\u1F68\u1FBE", },
261:                    // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
262:                    { "\u1F61\u03B9", "\u1F61\u0345", "\u1F61\u0399",
263:                            "\u1F61\u03B9", "\u1F61\u1FBE", "\u1F69\u0345",
264:                            "\u1F69\u0399", "\u1F69\u03B9", "\u1F69\u1FBE", },
265:                    // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
266:                    { "\u1F62\u03B9", "\u1F62\u0345", "\u1F62\u0399",
267:                            "\u1F62\u03B9", "\u1F62\u1FBE", "\u1F6A\u0345",
268:                            "\u1F6A\u0399", "\u1F6A\u03B9", "\u1F6A\u1FBE", },
269:                    // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
270:                    { "\u1F63\u03B9", "\u1F63\u0345", "\u1F63\u0399",
271:                            "\u1F63\u03B9", "\u1F63\u1FBE", "\u1F6B\u0345",
272:                            "\u1F6B\u0399", "\u1F6B\u03B9", "\u1F6B\u1FBE", },
273:                    // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
274:                    { "\u1F64\u03B9", "\u1F64\u0345", "\u1F64\u0399",
275:                            "\u1F64\u03B9", "\u1F64\u1FBE", "\u1F6C\u0345",
276:                            "\u1F6C\u0399", "\u1F6C\u03B9", "\u1F6C\u1FBE", },
277:                    // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
278:                    { "\u1F65\u03B9", "\u1F65\u0345", "\u1F65\u0399",
279:                            "\u1F65\u03B9", "\u1F65\u1FBE", "\u1F6D\u0345",
280:                            "\u1F6D\u0399", "\u1F6D\u03B9", "\u1F6D\u1FBE", },
281:                    // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
282:                    { "\u1F66\u03B9", "\u1F66\u0345", "\u1F66\u0399",
283:                            "\u1F66\u03B9", "\u1F66\u1FBE", "\u1F6E\u0345",
284:                            "\u1F6E\u0399", "\u1F6E\u03B9", "\u1F6E\u1FBE", },
285:                    // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
286:                    { "\u1F67\u03B9", "\u1F67\u0345", "\u1F67\u0399",
287:                            "\u1F67\u03B9", "\u1F67\u1FBE", "\u1F6F\u0345",
288:                            "\u1F6F\u0399", "\u1F6F\u03B9", "\u1F6F\u1FBE", },
289:                    // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
290:                    { "\u1F70\u03B9", "\u1F70\u0345", "\u1F70\u0399",
291:                            "\u1F70\u03B9", "\u1F70\u1FBE", "\u1FBA\u0345",
292:                            "\u1FBA\u0399", "\u1FBA\u03B9", "\u1FBA\u1FBE", },
293:                    // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
294:                    { "\u1F74\u03B9", "\u1F74\u0345", "\u1F74\u0399",
295:                            "\u1F74\u03B9", "\u1F74\u1FBE", "\u1FCA\u0345",
296:                            "\u1FCA\u0399", "\u1FCA\u03B9", "\u1FCA\u1FBE", },
297:                    // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
298:                    { "\u1F7C\u03B9", "\u1F7C\u0345", "\u1F7C\u0399",
299:                            "\u1F7C\u03B9", "\u1F7C\u1FBE", "\u1FFA\u0345",
300:                            "\u1FFA\u0399", "\u1FFA\u03B9", "\u1FFA\u1FBE", }, };
301:
302:            // this initializes the data used to generate the case-equivalents
303:
304:            static {
305:
306:                // Gather up the exceptions in a form we can use
307:
308:                if (!GENERATE) {
309:                    for (int i = 0; i < exceptionList.length; ++i) {
310:                        String[] exception = exceptionList[i];
311:                        Set s = new HashSet();
312:                        // there has to be some method to do the following, but I can't find it in the collections
313:                        for (int j = 0; j < exception.length; ++j) {
314:                            s.add(exception[j]);
315:                        }
316:                        fromCaseFold.put(exception[0], s);
317:                    }
318:                }
319:
320:                // walk through all the characters, and at every case fold result,
321:                // put a set of all the characters that map to that result
322:
323:                boolean defaultmapping = true; // false for turkish
324:                for (int i = 0; i <= 0x10FFFF; ++i) {
325:                    int cat = UCharacter.getType(i);
326:                    if (cat == Character.UNASSIGNED
327:                            || cat == Character.PRIVATE_USE)
328:                        continue;
329:
330:                    String cp = UTF16.valueOf(i);
331:                    String mapped = UCharacter.foldCase(cp, defaultmapping);
332:                    if (mapped.equals(cp))
333:                        continue;
334:
335:                    if (maxLength < mapped.length())
336:                        maxLength = mapped.length();
337:
338:                    // at this point, have different case folding
339:
340:                    Set s = (Set) fromCaseFold.get(mapped);
341:                    if (s == null) {
342:                        s = new HashSet();
343:                        s.add(mapped); // add the case fold result itself
344:                        fromCaseFold.put(mapped, s);
345:                    }
346:                    s.add(cp);
347:                    toCaseFold.put(cp, mapped);
348:                    toCaseFold.put(mapped, mapped); // add mapping to self
349:                }
350:
351:                // Emit the final data
352:
353:                if (DUMP) {
354:                    System.out.println("maxLength = " + maxLength);
355:
356:                    System.out.println("\nfromCaseFold:");
357:                    Iterator it = fromCaseFold.keySet().iterator();
358:                    while (it.hasNext()) {
359:                        Object key = it.next();
360:                        System.out.print(" "
361:                                + toHex2.transliterate((String) key) + ": ");
362:                        Set s = (Set) fromCaseFold.get(key);
363:                        Iterator it2 = s.iterator();
364:                        boolean first = true;
365:                        while (it2.hasNext()) {
366:                            if (first) {
367:                                first = false;
368:                            } else {
369:                                System.out.print(", ");
370:                            }
371:                            System.out.print(toHex2.transliterate((String) it2
372:                                    .next()));
373:                        }
374:                        System.out.println("");
375:                    }
376:
377:                    System.out.println("\ntoCaseFold:");
378:                    it = toCaseFold.keySet().iterator();
379:                    while (it.hasNext()) {
380:                        String key = (String) it.next();
381:                        String value = (String) toCaseFold.get(key);
382:                        System.out.println(" " + toHex2.transliterate(key)
383:                                + ": " + toHex2.transliterate(value));
384:                    }
385:                }
386:
387:                // Now convert all those sets into linear arrays
388:                // We can't do this in place in Java, so make a temporary target array
389:
390:                // Note: This could be transformed into a single array, with offsets into it.
391:                // Might be best choice in C.
392:
393:                Map fromCaseFold2 = new HashMap();
394:                Iterator it = fromCaseFold.keySet().iterator();
395:                while (it.hasNext()) {
396:                    Object key = it.next();
397:                    Set s = (Set) fromCaseFold.get(key);
398:                    String[] temp = new String[s.size()];
399:                    s.toArray(temp);
400:                    fromCaseFold2.put(key, temp);
401:                }
402:                fromCaseFold = fromCaseFold2;
403:
404:                // We have processed everything, so the iterator will now work
405:                // The following is normally OFF. 
406:                // It is here to generate (under the GENERATE flag) the static exception list.
407:                // It must be at the very end of initialization, so that the iterator is functional.
408:                // (easiest to do it that way)
409:
410:                if (GENERATE) {
411:
412:                    // first get small set of items that have multiple characters
413:
414:                    Set multichars = new TreeSet();
415:                    it = fromCaseFold.keySet().iterator();
416:                    while (it.hasNext()) {
417:                        String key = (String) it.next();
418:                        if (UTF16.countCodePoint(key) < 2)
419:                            continue;
420:                        multichars.add(key);
421:                    }
422:
423:                    // now we will go through each of them.
424:
425:                    CaseIterator ci = new CaseIterator();
426:                    it = multichars.iterator();
427:
428:                    while (it.hasNext()) {
429:                        String key = (String) it.next();
430:
431:                        // here is a nasty complication. Take 'ffi' ligature. We
432:                        // can't just close it, since we would miss the combination
433:                        // that includes the 'fi' => "fi" ligature
434:                        // so first do a pass through, and add substring combinations
435:                        // we call this a 'partial closure'
436:
437:                        Set partialClosure = new TreeSet();
438:                        partialClosure.add(key);
439:
440:                        if (UTF16.countCodePoint(key) > 2) {
441:                            Iterator multiIt2 = multichars.iterator();
442:                            while (multiIt2.hasNext()) {
443:                                String otherKey = (String) multiIt2.next();
444:                                if (otherKey.length() >= key.length())
445:                                    continue;
446:                                int pos = -1;
447:                                while (true) {
448:                                    // The following is not completely general
449:                                    // but works for the actual cased stuff,
450:                                    // and should work for future characters, since we won't have
451:                                    // more ligatures & other oddities.
452:                                    pos = key.indexOf(otherKey, pos + 1);
453:                                    if (pos < 0)
454:                                        break;
455:                                    int endPos = pos + otherKey.length();
456:                                    // we know we have a proper substring,
457:                                    // so get the combinations
458:                                    String[] choices = (String[]) fromCaseFold
459:                                            .get(otherKey);
460:                                    for (int ii = 0; ii < choices.length; ++ii) {
461:                                        String patchwork = key
462:                                                .substring(0, pos)
463:                                                + choices[ii]
464:                                                + key.substring(endPos);
465:                                        partialClosure.add(patchwork);
466:                                    }
467:                                }
468:                            }
469:                        }
470:
471:                        // now, for each thing in the partial closure, get its
472:                        // case closure and add it to the final result.
473:
474:                        Set closure = new TreeSet(); // this will be the real closure
475:                        Iterator partialIt = partialClosure.iterator();
476:                        while (partialIt.hasNext()) {
477:                            String key2 = (String) partialIt.next();
478:                            ci.reset(key2);
479:                            for (String temp = ci.next(); temp != null; temp = ci
480:                                    .next()) {
481:                                closure.add(temp);
482:                            }
483:                            // form closure
484:                            /*String[] choices = (String[]) fromCaseFold.get(key2);
485:                            for (int i = 0; i < choices.length; ++i) {
486:                                ci.reset(choices[i]);
487:                                String temp;
488:                                while (null != (temp = ci.next())) {
489:                                    closure.add(temp);
490:                                }
491:                            }
492:                             */
493:                        }
494:
495:                        // print it out, so that it can be cut and pasted back into this document.
496:
497:                        Iterator it2 = closure.iterator();
498:                        System.out.println("\t// " + toName.transliterate(key));
499:                        System.out.print("\t{\"" + toHex.transliterate(key)
500:                                + "\",");
501:                        while (it2.hasNext()) {
502:                            String item = (String) it2.next();
503:                            System.out.print("\"" + toHex.transliterate(item)
504:                                    + "\",");
505:                        }
506:                        System.out.println("},");
507:                    }
508:                }
509:            }
510:
511:            // ============ PRIVATE CLASS DATA ============ 
512:
513:            // The pieces that are concatenated to form each result. They are filled in
514:            // by reset(String) and are not changed during iteration.
515:            private int count = 0; // number of entries of variants/counts currently in use
516:            private String[][] variants; // variants[i] holds the alternative strings for piece i
517:
518:            // State information; changes during iteration.
519:            private boolean done = false; // set by next() once every combination has been produced
520:            private int[] counts; // counts[i] is the index into variants[i] used for the next result
521:
522:            // Internal buffer, reused by next() to assemble each result string.
523:            private StringBuffer nextBuffer = new StringBuffer();
524:
525:            // ========================  
526:
527:            /**
528:             * Reset to different source. Once reset, the iteration starts from the beginning.
529:             * @param source The string to get case variants for
530:             */
531:            public void reset(String source) {
532:
533:                // allocate arrays to store pieces
534:                // using length might be slightly too long, but we don't care much
535:
536:                counts = new int[source.length()];
537:                variants = new String[source.length()][];
538:
539:                // walk through the source, and break up into pieces
540:                // each piece becomes an array of equivalent values
541:                // TODO: could optimized this later to coalesce all single string pieces
542:
543:                String piece = null;
544:                count = 0;
545:                for (int i = 0; i < source.length(); i += piece.length()) {
546:
547:                    // find *longest* matching piece
548:                    String caseFold = null;
549:
550:                    if (GENERATE) {
551:                        // do exactly one CP
552:                        piece = UTF16.valueOf(source, i);
553:                        caseFold = (String) toCaseFold.get(piece);
554:                    } else {
555:                        int max = i + maxLength;
556:                        if (max > source.length())
557:                            max = source.length();
558:                        for (int j = max; j > i; --j) {
559:                            piece = source.substring(i, j);
560:                            caseFold = (String) toCaseFold.get(piece);
561:                            if (caseFold != null)
562:                                break;
563:                        }
564:                    }
565:
566:                    // if we fail, pick one code point
567:                    if (caseFold == null) {
568:                        piece = UTF16.valueOf(source, i);
569:                        variants[count++] = new String[] { piece }; // single item string
570:                    } else {
571:                        variants[count++] = (String[]) fromCaseFold
572:                                .get(caseFold);
573:                    }
574:                }
575:                reset();
576:            }
577:
578:            /**
579:             * Restart the iteration from the beginning, but with same source
580:             */
581:            public void reset() {
582:                done = false;
583:                for (int i = 0; i < count; ++i) {
584:                    counts[i] = 0;
585:                }
586:            }
587:
588:            /**
589:             * Iterates through the case variants.
590:             * @return next case variant. Each variant will case-fold to the same value as the source will.
591:             * When the iteration is done, null is returned.
592:             */
593:            public String next() {
594:
595:                if (done)
596:                    return null;
597:                int i;
598:
599:                // TODO Optimize so we keep the piece before and after the current position
600:                // so we don't have so much concatenation
601:
602:                // get the result, a concatenation
603:
604:                nextBuffer.setLength(0);
605:                for (i = 0; i < count; ++i) {
606:                    nextBuffer.append(variants[i][counts[i]]);
607:                }
608:
609:                // find the next right set of pieces to concatenate
610:
611:                for (i = count - 1; i >= 0; --i) {
612:                    counts[i]++;
613:                    if (counts[i] < variants[i].length)
614:                        break;
615:                    counts[i] = 0;
616:                }
617:
618:                // if we go too far, bail
619:
620:                if (i < 0) {
621:                    done = true;
622:                }
623:
624:                return nextBuffer.toString();
625:            }
626:
627:            /**
628:             * Temporary test, just to see how the stuff works.
629:             */
630:            static public void main(String[] args) {
631:                String[] testCases = { "fiss", "h\u03a3" };
632:                CaseIterator ci = new CaseIterator();
633:
634:                for (int i = 0; i < testCases.length; ++i) {
635:                    String item = testCases[i];
636:                    System.out.println();
637:                    System.out
638:                            .println("Testing: " + toName.transliterate(item));
639:                    System.out.println();
640:                    ci.reset(item);
641:                    int count = 0;
642:                    for (String temp = ci.next(); temp != null; temp = ci
643:                            .next()) {
644:                        System.out.println(toName.transliterate(temp));
645:                        count++;
646:                    }
647:                    System.out.println("Total: " + count);
648:                }
649:
650:                // generate a list of all caseless characters -- characters whose
651:                // case closure is themselves.
652:
653:                UnicodeSet caseless = new UnicodeSet();
654:
655:                for (int i = 0; i <= 0x10FFFF; ++i) {
656:                    String cp = UTF16.valueOf(i);
657:                    ci.reset(cp);
658:                    int count = 0;
659:                    String fold = null;
660:                    for (String temp = ci.next(); temp != null; temp = ci
661:                            .next()) {
662:                        fold = temp;
663:                        if (++count > 1)
664:                            break;
665:                    }
666:                    if (count == 1 && fold.equals(cp)) {
667:                        caseless.add(i);
668:                    }
669:                }
670:
671:                System.out.println("caseless = " + caseless.toPattern(true));
672:
673:                UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
674:
675:                UnicodeSet a = new UnicodeSet();
676:                a.set(not_lc);
677:                a.removeAll(caseless);
678:                System.out.println("[:^lc:] - caseless = " + a.toPattern(true));
679:
680:                a.set(caseless);
681:                a.removeAll(not_lc);
682:                System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
683:            }
684:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.