Source Code Cross Referenced for CanonicalIterator.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         *******************************************************************************
003:         * Copyright (C) 1996-2006, International Business Machines Corporation and    *
004:         * others. All Rights Reserved.                                                *
005:         *******************************************************************************
006:         */
007:        package com.ibm.icu.text;
008:
009:        import com.ibm.icu.lang.*;
010:        import java.util.*;
011:        import com.ibm.icu.impl.NormalizerImpl;
012:        import com.ibm.icu.impl.USerializedSet;
013:        import com.ibm.icu.impl.Utility;
014:
015:        /**
016:         * This class allows one to iterate through all the strings that are canonically equivalent to a given
017:         * string. For example, here are some sample results:
018:         * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
019:         * <pre>
020:         1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
021:         2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
022:         3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
023:         4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
024:         5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
025:         6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
026:         7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
027:         8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
028:         9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
029:         10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
030:         11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
031:         12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
032:         *</pre>
033:         *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
034:         * since it has not been optimized for that situation.
035:         * @author M. Davis
036:         * @stable ICU 2.4
037:         */
038:
039:        public final class CanonicalIterator {
040:            /**
041:             * Construct a CanonicalIterator object
042:             * @param source string to get results for
043:             * @stable ICU 2.4
044:             */
045:            public CanonicalIterator(String source) {
046:                setSource(source);
047:            }
048:
049:            /**
050:             * Gets the NFD form of the current source we are iterating over.
051:             * @return gets the source: NOTE: it is the NFD form of the source originally passed in
052:             * @stable ICU 2.4
053:             */
054:            public String getSource() {
055:                return source;
056:            }
057:
058:            /**
059:             * Resets the iterator so that one can start again from the beginning.
060:             * @stable ICU 2.4
061:             */
062:            public void reset() {
063:                done = false;
064:                for (int i = 0; i < current.length; ++i) {
065:                    current[i] = 0;
066:                }
067:            }
068:
069:            /**
070:             * Get the next canonically equivalent string.
071:             * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
072:             * @return the next string that is canonically equivalent. The value null is returned when
073:             * the iteration is done.
074:             * @stable ICU 2.4
075:             */
076:            public String next() {
077:                if (done)
078:                    return null;
079:
080:                // construct return value
081:
082:                buffer.setLength(0); // delete old contents
083:                for (int i = 0; i < pieces.length; ++i) {
084:                    buffer.append(pieces[i][current[i]]);
085:                }
086:                String result = buffer.toString();
087:
088:                // find next value for next time
089:
090:                for (int i = current.length - 1;; --i) {
091:                    if (i < 0) {
092:                        done = true;
093:                        break;
094:                    }
095:                    current[i]++;
096:                    if (current[i] < pieces[i].length)
097:                        break; // got sequence
098:                    current[i] = 0;
099:                }
100:                return result;
101:            }
102:
103:            /**
104:             * Set a new source for this iterator. Allows object reuse.
105:             * @param newSource the source string to iterate against. This allows the same iterator to be used
106:             * while changing the source string, saving object creation.
107:             * @stable ICU 2.4
108:             */
109:            public void setSource(String newSource) {
110:                source = Normalizer.normalize(newSource, Normalizer.NFD);
111:                done = false;
112:
113:                // catch degenerate case
114:                if (newSource.length() == 0) {
115:                    pieces = new String[1][];
116:                    current = new int[1];
117:                    pieces[0] = new String[] { "" };
118:                    return;
119:                }
120:
121:                // find the segments
122:                List segmentList = new ArrayList();
123:                int cp;
124:                int start = 0;
125:
126:                // i should be the end of the first code point
127:                // break up the string into segements
128:
129:                int i = UTF16.findOffsetFromCodePoint(source, 1);
130:
131:                for (; i < source.length(); i += UTF16.getCharCount(cp)) {
132:                    cp = UTF16.charAt(source, i);
133:                    if (NormalizerImpl.isCanonSafeStart(cp)) {
134:                        segmentList.add(source.substring(start, i)); // add up to i
135:                        start = i;
136:                    }
137:                }
138:                segmentList.add(source.substring(start, i)); // add last one
139:
140:                // allocate the arrays, and find the strings that are CE to each segment
141:                pieces = new String[segmentList.size()][];
142:                current = new int[segmentList.size()];
143:                for (i = 0; i < pieces.length; ++i) {
144:                    if (PROGRESS)
145:                        System.out.println("SEGMENT");
146:                    pieces[i] = getEquivalents((String) segmentList.get(i));
147:                }
148:            }
149:
150:            /**
151:             * Simple implementation of permutation.
152:             * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
153:             * @param source the string to find permutations for
154:             * @param skipZeros set to true to skip characters with canonical combining class zero
155:             * @param output the set to add the results to
156:             * @internal
157:             * @deprecated This API is ICU internal only.
158:             */
159:            public static void permute(String source, boolean skipZeros,
160:                    Set output) {
161:                // TODO: optimize
162:                //if (PROGRESS) System.out.println("Permute: " + source);
163:
164:                // optimization:
165:                // if zero or one character, just return a set with it
166:                // we check for length < 2 to keep from counting code points all the time
167:                if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
168:                    output.add(source);
169:                    return;
170:                }
171:
172:                // otherwise iterate through the string, and recursively permute all the other characters
173:                Set subpermute = new HashSet();
174:                int cp;
175:                for (int i = 0; i < source.length(); i += UTF16
176:                        .getCharCount(cp)) {
177:                    cp = UTF16.charAt(source, i);
178:
179:                    // optimization:
180:                    // if the character is canonical combining class zero,
181:                    // don't permute it
182:                    if (skipZeros && i != 0
183:                            && UCharacter.getCombiningClass(cp) == 0) {
184:                        //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
185:                        continue;
186:                    }
187:
188:                    // see what the permutations of the characters before and after this one are
189:                    subpermute.clear();
190:                    permute(source.substring(0, i)
191:                            + source.substring(i + UTF16.getCharCount(cp)),
192:                            skipZeros, subpermute);
193:
194:                    // prefix this character to all of them
195:                    String chStr = UTF16.valueOf(source, i);
196:                    Iterator it = subpermute.iterator();
197:                    while (it.hasNext()) {
198:                        String piece = chStr + (String) it.next();
199:                        //if (PROGRESS) System.out.println("  Piece: " + piece);
200:                        output.add(piece);
201:                    }
202:                }
203:            }
204:
205:            // FOR TESTING
206:
207:            /*
208:             *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
209:             *@internal
210:             *
211:            public static UnicodeSet getSafeStart() {
212:                return (UnicodeSet) SAFE_START.clone();
213:            }
214:             */
215:            /*
216:             *@return the set of characters whose decompositions start with the given character
217:             *@internal
218:             *
219:            public static UnicodeSet getStarts(int cp) {
220:                UnicodeSet result = AT_START.get(cp);
221:                if (result == null) result = EMPTY;
222:                return (UnicodeSet) result.clone();
223:            }
224:             */
225:
226:            // ===================== PRIVATES ==============================
227:            // debug
228:            private static boolean PROGRESS = false; // debug progress
229:            //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
230:            private static boolean SKIP_ZEROS = true;
231:
232:            // fields
233:            private String source;
234:            private boolean done;
235:            private String[][] pieces;
236:            private int[] current;
237:            // Note: C will need two more fields, since arrays there don't have lengths
238:            // int pieces_length;
239:            // int[] pieces_lengths;
240:
241:            // transient fields
242:            private transient StringBuffer buffer = new StringBuffer();
243:
244:            // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
245:            private String[] getEquivalents(String segment) {
246:                Set result = new HashSet();
247:                Set basic = getEquivalents2(segment);
248:                Set permutations = new HashSet();
249:
250:                // now get all the permutations
251:                // add only the ones that are canonically equivalent
252:                // TODO: optimize by not permuting any class zero.
253:                Iterator it = basic.iterator();
254:                while (it.hasNext()) {
255:                    String item = (String) it.next();
256:                    permutations.clear();
257:                    permute(item, SKIP_ZEROS, permutations);
258:                    Iterator it2 = permutations.iterator();
259:                    while (it2.hasNext()) {
260:                        String possible = (String) it2.next();
261:
262:                        /*
263:                         String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
264:                         if (attempt.equals(segment)) {
265:                         */
266:                        if (Normalizer.compare(possible, segment, 0) == 0) {
267:
268:                            if (PROGRESS)
269:                                System.out.println("Adding Permutation: "
270:                                        + Utility.hex(possible));
271:                            result.add(possible);
272:
273:                        } else {
274:                            if (PROGRESS)
275:                                System.out.println("-Skipping Permutation: "
276:                                        + Utility.hex(possible));
277:                        }
278:                    }
279:                }
280:
281:                // convert into a String[] to clean up storage
282:                String[] finalResult = new String[result.size()];
283:                result.toArray(finalResult);
284:                return finalResult;
285:            }
286:
287:            private Set getEquivalents2(String segment) {
288:
289:                Set result = new HashSet();
290:
291:                if (PROGRESS)
292:                    System.out.println("Adding: " + Utility.hex(segment));
293:
294:                result.add(segment);
295:                StringBuffer workingBuffer = new StringBuffer();
296:
297:                // cycle through all the characters
298:                int cp = 0;
299:                int[] range = new int[2];
300:                for (int i = 0; i < segment.length(); i += UTF16
301:                        .getCharCount(cp)) {
302:
303:                    // see if any character is at the start of some decomposition
304:                    cp = UTF16.charAt(segment, i);
305:                    USerializedSet starts = new USerializedSet();
306:
307:                    if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
308:                        continue;
309:                    }
310:                    int j = 0;
311:                    // if so, see which decompositions match
312:                    int rangeCount = starts.countRanges();
313:                    for (j = 0; j < rangeCount; ++j) {
314:                        starts.getRange(j, range);
315:                        int end = range[1];
316:                        for (int cp2 = range[0]; cp2 <= end; ++cp2) {
317:                            Set remainder = extract(cp2, segment, i,
318:                                    workingBuffer);
319:                            if (remainder == null)
320:                                continue;
321:
322:                            // there were some matches, so add all the possibilities to the set.
323:                            String prefix = segment.substring(0, i);
324:                            prefix += UTF16.valueOf(cp2);
325:                            //int el = -1;
326:                            Iterator iter = remainder.iterator();
327:                            while (iter.hasNext()) {
328:                                String item = (String) iter.next();
329:                                String toAdd = new String(prefix);
330:                                toAdd += item;
331:                                result.add(toAdd);
332:                                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
333:                            }
334:                        }
335:                    }
336:                }
337:                return result;
338:                /*
339:                Set result = new HashSet();
340:                if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
341:                result.add(segment);
342:                StringBuffer workingBuffer = new StringBuffer();
343:
344:                // cycle through all the characters
345:                int cp;
346:
347:                for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
348:                    // see if any character is at the start of some decomposition
349:                    cp = UTF16.charAt(segment, i);
350:                    NormalizerImpl.getCanonStartSet(c,fillSet)
351:                    UnicodeSet starts = AT_START.get(cp);
352:                    if (starts == null) continue;
353:                    UnicodeSetIterator usi = new UnicodeSetIterator(starts);
354:                    // if so, see which decompositions match
355:                    while (usi.next()) {
356:                        int cp2 = usi.codepoint;
357:                        // we know that there are no strings in it
358:                        // so we don't have to check CharacterIterator.IS_STRING
359:                        Set remainder = extract(cp2, segment, i, workingBuffer);
360:                        if (remainder == null) continue;
361:
362:                        // there were some matches, so add all the possibilities to the set.
363:                        String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
364:                        Iterator it = remainder.iterator();
365:                        while (it.hasNext()) {
366:                            String item = (String) it.next();
367:                            if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
368:                            result.add(prefix + item);
369:                        }
370:                    }
371:                }
372:                return result;
373:                 */
374:            }
375:
376:            /**
377:             * See if the decomposition of cp2 is at segment starting at segmentPos
378:             * (with canonical rearrangment!)
379:             * If so, take the remainder, and return the equivalents
380:             */
381:            private Set extract(int comp, String segment, int segmentPos,
382:                    StringBuffer buffer) {
383:                if (PROGRESS)
384:                    System.out.println(" extract: "
385:                            + Utility.hex(UTF16.valueOf(comp)) + ", "
386:                            + Utility.hex(segment.substring(segmentPos)));
387:
388:                //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
389:                String decomp = Normalizer.normalize(comp, Normalizer.NFD);
390:
391:                // See if it matches the start of segment (at segmentPos)
392:                boolean ok = false;
393:                int cp;
394:                int decompPos = 0;
395:                int decompCp = UTF16.charAt(decomp, 0);
396:                decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
397:                //int decompClass = getClass(decompCp);
398:                buffer.setLength(0); // initialize working buffer, shared among callees
399:
400:                for (int i = segmentPos; i < segment.length(); i += UTF16
401:                        .getCharCount(cp)) {
402:                    cp = UTF16.charAt(segment, i);
403:                    if (cp == decompCp) { // if equal, eat another cp from decomp
404:                        if (PROGRESS)
405:                            System.out.println("  matches: "
406:                                    + Utility.hex(UTF16.valueOf(cp)));
407:                        if (decompPos == decomp.length()) { // done, have all decomp characters!
408:                            buffer.append(segment.substring(i
409:                                    + UTF16.getCharCount(cp))); // add remaining segment chars
410:                            ok = true;
411:                            break;
412:                        }
413:                        decompCp = UTF16.charAt(decomp, decompPos);
414:                        decompPos += UTF16.getCharCount(decompCp);
415:                        //decompClass = getClass(decompCp);
416:                    } else {
417:                        if (PROGRESS)
418:                            System.out.println("  buffer: "
419:                                    + Utility.hex(UTF16.valueOf(cp)));
420:                        // brute force approach
421:                        UTF16.append(buffer, cp);
422:                        /* TODO: optimize
423:                        // since we know that the classes are monotonically increasing, after zero
424:                        // e.g. 0 5 7 9 0 3
425:                        // we can do an optimization
426:                        // there are only a few cases that work: zero, less, same, greater
427:                        // if both classes are the same, we fail
428:                        // if the decomp class < the segment class, we fail
429:
430:                        segClass = getClass(cp);
431:                        if (decompClass <= segClass) return null;
432:                         */
433:                    }
434:                }
435:                if (!ok)
436:                    return null; // we failed, characters left over
437:                if (PROGRESS)
438:                    System.out.println("Matches");
439:                if (buffer.length() == 0)
440:                    return SET_WITH_NULL_STRING; // succeed, but no remainder
441:                String remainder = buffer.toString();
442:
443:                // brute force approach
444:                // to check to make sure result is canonically equivalent
445:                /*
446:                String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
447:                if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
448:                 */
449:
450:                if (0 != Normalizer.compare(UTF16.valueOf(comp) + remainder,
451:                        segment.substring(segmentPos), 0))
452:                    return null;
453:
454:                // get the remaining combinations
455:                return getEquivalents2(remainder);
456:            }
457:
458:            /*
459:            // TODO: fix once we have a codepoint interface to get the canonical combining class
460:            // TODO: Need public access to canonical combining class in UCharacter!
461:            private static int getClass(int cp) {
462:                return Normalizer.getClass((char)cp);
463:            }
464:             */
465:
466:            // ================= BUILDER =========================
467:            // TODO: Flatten this data so it doesn't have to be reconstructed each time!
468:            private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
469:            private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
470:            static {
471:                SET_WITH_NULL_STRING.add("");
472:            }
473:
474:            //  private static UnicodeSet SAFE_START = new UnicodeSet();
475:            //  private static CharMap AT_START = new CharMap();
476:
477:            // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
478:            // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
479:            //  private static int LAST_UNICODE = 0x10FFFF;
480:            /*
481:            static {
482:                buildData();
483:            }
484:             */
485:            /*
486:            private static void buildData() {
487:
488:                if (PROGRESS) System.out.println("Getting Safe Start");
489:                for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
490:                    if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
491:                    int cc = UCharacter.getCombiningClass(cp);
492:                    if (cc == 0) SAFE_START.add(cp);
493:                    // will fix to be really safe below
494:                }
495:                if (PROGRESS) System.out.println();
496:
497:                if (PROGRESS) System.out.println("Getting Containment");
498:                for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
499:                    if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
500:
501:                    if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
502:
503:                    //String istr = UTF16.valueOf(cp);
504:                    String decomp = Normalizer.normalize(cp, Normalizer.NFD);
505:                    //if (decomp.equals(istr)) continue;
506:
507:                    // add each character in the decomposition to canBeIn
508:
509:                    int component;
510:                    for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
511:                        component = UTF16.charAt(decomp, i);
512:                        if (i == 0) {
513:                            AT_START.add(component, cp);
514:                        } else if (UCharacter.getCombiningClass(component) == 0) {
515:                            SAFE_START.remove(component);
516:                        }
517:                    }
518:                }
519:                if (PROGRESS) System.out.println();
520:            }
521:                // the following is just for a map from characters to a set of characters
522:
523:            private static class CharMap {
524:                Map storage = new HashMap();
525:                MutableInt probe = new MutableInt();
526:                boolean converted = false;
527:
528:                public void add(int cp, int whatItIsIn) {
529:                    UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
530:                    if (result == null) {
531:                        result = new UnicodeSet();
532:                        storage.put(probe, result);
533:                    }
534:                    result.add(whatItIsIn);
535:                }
536:
537:                public UnicodeSet get(int cp) {
538:                    return (UnicodeSet) storage.get(probe.set(cp));
539:                }
540:            }
541:
542:            private static class MutableInt {
543:                public int contents;
544:                public int hashCode() { return contents; }
545:                public boolean equals(Object other) {
546:                    return ((MutableInt)other).contents == contents;
547:                }
548:                // allows chaining
549:                public MutableInt set(int contents) {
550:                    this.contents = contents;
551:                    return this;
552:                }
553:            }
554:             */
555:
556:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.