001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import com.ibm.icu.lang.*;
010: import java.util.*;
011: import com.ibm.icu.impl.NormalizerImpl;
012: import com.ibm.icu.impl.USerializedSet;
013: import com.ibm.icu.impl.Utility;
014:
015: /**
016: * This class allows one to iterate through all the strings that are canonically equivalent to a given
017: * string. For example, here are some sample results:
018: * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
019: * <pre>
020: 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
021: 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
022: 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
023: 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
024: 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
025: 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
026: 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
027: 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
028: 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
029: 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
030: 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
031: 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
032: *</pre>
033: *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
034: * since it has not been optimized for that situation.
035: * @author M. Davis
036: * @stable ICU 2.4
037: */
038:
039: public final class CanonicalIterator {
040: /**
041: * Construct a CanonicalIterator object
042: * @param source string to get results for
043: * @stable ICU 2.4
044: */
045: public CanonicalIterator(String source) {
046: setSource(source);
047: }
048:
049: /**
050: * Gets the NFD form of the current source we are iterating over.
051: * @return gets the source: NOTE: it is the NFD form of the source originally passed in
052: * @stable ICU 2.4
053: */
054: public String getSource() {
055: return source;
056: }
057:
058: /**
059: * Resets the iterator so that one can start again from the beginning.
060: * @stable ICU 2.4
061: */
062: public void reset() {
063: done = false;
064: for (int i = 0; i < current.length; ++i) {
065: current[i] = 0;
066: }
067: }
068:
069: /**
070: * Get the next canonically equivalent string.
071: * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
072: * @return the next string that is canonically equivalent. The value null is returned when
073: * the iteration is done.
074: * @stable ICU 2.4
075: */
076: public String next() {
077: if (done)
078: return null;
079:
080: // construct return value
081:
082: buffer.setLength(0); // delete old contents
083: for (int i = 0; i < pieces.length; ++i) {
084: buffer.append(pieces[i][current[i]]);
085: }
086: String result = buffer.toString();
087:
088: // find next value for next time
089:
090: for (int i = current.length - 1;; --i) {
091: if (i < 0) {
092: done = true;
093: break;
094: }
095: current[i]++;
096: if (current[i] < pieces[i].length)
097: break; // got sequence
098: current[i] = 0;
099: }
100: return result;
101: }
102:
103: /**
104: * Set a new source for this iterator. Allows object reuse.
105: * @param newSource the source string to iterate against. This allows the same iterator to be used
106: * while changing the source string, saving object creation.
107: * @stable ICU 2.4
108: */
109: public void setSource(String newSource) {
110: source = Normalizer.normalize(newSource, Normalizer.NFD);
111: done = false;
112:
113: // catch degenerate case
114: if (newSource.length() == 0) {
115: pieces = new String[1][];
116: current = new int[1];
117: pieces[0] = new String[] { "" };
118: return;
119: }
120:
121: // find the segments
122: List segmentList = new ArrayList();
123: int cp;
124: int start = 0;
125:
126: // i should be the end of the first code point
127: // break up the string into segements
128:
129: int i = UTF16.findOffsetFromCodePoint(source, 1);
130:
131: for (; i < source.length(); i += UTF16.getCharCount(cp)) {
132: cp = UTF16.charAt(source, i);
133: if (NormalizerImpl.isCanonSafeStart(cp)) {
134: segmentList.add(source.substring(start, i)); // add up to i
135: start = i;
136: }
137: }
138: segmentList.add(source.substring(start, i)); // add last one
139:
140: // allocate the arrays, and find the strings that are CE to each segment
141: pieces = new String[segmentList.size()][];
142: current = new int[segmentList.size()];
143: for (i = 0; i < pieces.length; ++i) {
144: if (PROGRESS)
145: System.out.println("SEGMENT");
146: pieces[i] = getEquivalents((String) segmentList.get(i));
147: }
148: }
149:
150: /**
151: * Simple implementation of permutation.
152: * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
153: * @param source the string to find permutations for
154: * @param skipZeros set to true to skip characters with canonical combining class zero
155: * @param output the set to add the results to
156: * @internal
157: * @deprecated This API is ICU internal only.
158: */
159: public static void permute(String source, boolean skipZeros,
160: Set output) {
161: // TODO: optimize
162: //if (PROGRESS) System.out.println("Permute: " + source);
163:
164: // optimization:
165: // if zero or one character, just return a set with it
166: // we check for length < 2 to keep from counting code points all the time
167: if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
168: output.add(source);
169: return;
170: }
171:
172: // otherwise iterate through the string, and recursively permute all the other characters
173: Set subpermute = new HashSet();
174: int cp;
175: for (int i = 0; i < source.length(); i += UTF16
176: .getCharCount(cp)) {
177: cp = UTF16.charAt(source, i);
178:
179: // optimization:
180: // if the character is canonical combining class zero,
181: // don't permute it
182: if (skipZeros && i != 0
183: && UCharacter.getCombiningClass(cp) == 0) {
184: //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
185: continue;
186: }
187:
188: // see what the permutations of the characters before and after this one are
189: subpermute.clear();
190: permute(source.substring(0, i)
191: + source.substring(i + UTF16.getCharCount(cp)),
192: skipZeros, subpermute);
193:
194: // prefix this character to all of them
195: String chStr = UTF16.valueOf(source, i);
196: Iterator it = subpermute.iterator();
197: while (it.hasNext()) {
198: String piece = chStr + (String) it.next();
199: //if (PROGRESS) System.out.println(" Piece: " + piece);
200: output.add(piece);
201: }
202: }
203: }
204:
205: // FOR TESTING
206:
207: /*
208: *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
209: *@internal
210: *
211: public static UnicodeSet getSafeStart() {
212: return (UnicodeSet) SAFE_START.clone();
213: }
214: */
215: /*
216: *@return the set of characters whose decompositions start with the given character
217: *@internal
218: *
219: public static UnicodeSet getStarts(int cp) {
220: UnicodeSet result = AT_START.get(cp);
221: if (result == null) result = EMPTY;
222: return (UnicodeSet) result.clone();
223: }
224: */
225:
226: // ===================== PRIVATES ==============================
227: // debug
228: private static boolean PROGRESS = false; // debug progress
229: //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
230: private static boolean SKIP_ZEROS = true;
231:
232: // fields
233: private String source;
234: private boolean done;
235: private String[][] pieces;
236: private int[] current;
237: // Note: C will need two more fields, since arrays there don't have lengths
238: // int pieces_length;
239: // int[] pieces_lengths;
240:
241: // transient fields
242: private transient StringBuffer buffer = new StringBuffer();
243:
244: // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
245: private String[] getEquivalents(String segment) {
246: Set result = new HashSet();
247: Set basic = getEquivalents2(segment);
248: Set permutations = new HashSet();
249:
250: // now get all the permutations
251: // add only the ones that are canonically equivalent
252: // TODO: optimize by not permuting any class zero.
253: Iterator it = basic.iterator();
254: while (it.hasNext()) {
255: String item = (String) it.next();
256: permutations.clear();
257: permute(item, SKIP_ZEROS, permutations);
258: Iterator it2 = permutations.iterator();
259: while (it2.hasNext()) {
260: String possible = (String) it2.next();
261:
262: /*
263: String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
264: if (attempt.equals(segment)) {
265: */
266: if (Normalizer.compare(possible, segment, 0) == 0) {
267:
268: if (PROGRESS)
269: System.out.println("Adding Permutation: "
270: + Utility.hex(possible));
271: result.add(possible);
272:
273: } else {
274: if (PROGRESS)
275: System.out.println("-Skipping Permutation: "
276: + Utility.hex(possible));
277: }
278: }
279: }
280:
281: // convert into a String[] to clean up storage
282: String[] finalResult = new String[result.size()];
283: result.toArray(finalResult);
284: return finalResult;
285: }
286:
287: private Set getEquivalents2(String segment) {
288:
289: Set result = new HashSet();
290:
291: if (PROGRESS)
292: System.out.println("Adding: " + Utility.hex(segment));
293:
294: result.add(segment);
295: StringBuffer workingBuffer = new StringBuffer();
296:
297: // cycle through all the characters
298: int cp = 0;
299: int[] range = new int[2];
300: for (int i = 0; i < segment.length(); i += UTF16
301: .getCharCount(cp)) {
302:
303: // see if any character is at the start of some decomposition
304: cp = UTF16.charAt(segment, i);
305: USerializedSet starts = new USerializedSet();
306:
307: if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
308: continue;
309: }
310: int j = 0;
311: // if so, see which decompositions match
312: int rangeCount = starts.countRanges();
313: for (j = 0; j < rangeCount; ++j) {
314: starts.getRange(j, range);
315: int end = range[1];
316: for (int cp2 = range[0]; cp2 <= end; ++cp2) {
317: Set remainder = extract(cp2, segment, i,
318: workingBuffer);
319: if (remainder == null)
320: continue;
321:
322: // there were some matches, so add all the possibilities to the set.
323: String prefix = segment.substring(0, i);
324: prefix += UTF16.valueOf(cp2);
325: //int el = -1;
326: Iterator iter = remainder.iterator();
327: while (iter.hasNext()) {
328: String item = (String) iter.next();
329: String toAdd = new String(prefix);
330: toAdd += item;
331: result.add(toAdd);
332: //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
333: }
334: }
335: }
336: }
337: return result;
338: /*
339: Set result = new HashSet();
340: if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
341: result.add(segment);
342: StringBuffer workingBuffer = new StringBuffer();
343:
344: // cycle through all the characters
345: int cp;
346:
347: for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
348: // see if any character is at the start of some decomposition
349: cp = UTF16.charAt(segment, i);
350: NormalizerImpl.getCanonStartSet(c,fillSet)
351: UnicodeSet starts = AT_START.get(cp);
352: if (starts == null) continue;
353: UnicodeSetIterator usi = new UnicodeSetIterator(starts);
354: // if so, see which decompositions match
355: while (usi.next()) {
356: int cp2 = usi.codepoint;
357: // we know that there are no strings in it
358: // so we don't have to check CharacterIterator.IS_STRING
359: Set remainder = extract(cp2, segment, i, workingBuffer);
360: if (remainder == null) continue;
361:
362: // there were some matches, so add all the possibilities to the set.
363: String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
364: Iterator it = remainder.iterator();
365: while (it.hasNext()) {
366: String item = (String) it.next();
367: if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
368: result.add(prefix + item);
369: }
370: }
371: }
372: return result;
373: */
374: }
375:
376: /**
377: * See if the decomposition of cp2 is at segment starting at segmentPos
378: * (with canonical rearrangment!)
379: * If so, take the remainder, and return the equivalents
380: */
381: private Set extract(int comp, String segment, int segmentPos,
382: StringBuffer buffer) {
383: if (PROGRESS)
384: System.out.println(" extract: "
385: + Utility.hex(UTF16.valueOf(comp)) + ", "
386: + Utility.hex(segment.substring(segmentPos)));
387:
388: //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
389: String decomp = Normalizer.normalize(comp, Normalizer.NFD);
390:
391: // See if it matches the start of segment (at segmentPos)
392: boolean ok = false;
393: int cp;
394: int decompPos = 0;
395: int decompCp = UTF16.charAt(decomp, 0);
396: decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
397: //int decompClass = getClass(decompCp);
398: buffer.setLength(0); // initialize working buffer, shared among callees
399:
400: for (int i = segmentPos; i < segment.length(); i += UTF16
401: .getCharCount(cp)) {
402: cp = UTF16.charAt(segment, i);
403: if (cp == decompCp) { // if equal, eat another cp from decomp
404: if (PROGRESS)
405: System.out.println(" matches: "
406: + Utility.hex(UTF16.valueOf(cp)));
407: if (decompPos == decomp.length()) { // done, have all decomp characters!
408: buffer.append(segment.substring(i
409: + UTF16.getCharCount(cp))); // add remaining segment chars
410: ok = true;
411: break;
412: }
413: decompCp = UTF16.charAt(decomp, decompPos);
414: decompPos += UTF16.getCharCount(decompCp);
415: //decompClass = getClass(decompCp);
416: } else {
417: if (PROGRESS)
418: System.out.println(" buffer: "
419: + Utility.hex(UTF16.valueOf(cp)));
420: // brute force approach
421: UTF16.append(buffer, cp);
422: /* TODO: optimize
423: // since we know that the classes are monotonically increasing, after zero
424: // e.g. 0 5 7 9 0 3
425: // we can do an optimization
426: // there are only a few cases that work: zero, less, same, greater
427: // if both classes are the same, we fail
428: // if the decomp class < the segment class, we fail
429:
430: segClass = getClass(cp);
431: if (decompClass <= segClass) return null;
432: */
433: }
434: }
435: if (!ok)
436: return null; // we failed, characters left over
437: if (PROGRESS)
438: System.out.println("Matches");
439: if (buffer.length() == 0)
440: return SET_WITH_NULL_STRING; // succeed, but no remainder
441: String remainder = buffer.toString();
442:
443: // brute force approach
444: // to check to make sure result is canonically equivalent
445: /*
446: String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
447: if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
448: */
449:
450: if (0 != Normalizer.compare(UTF16.valueOf(comp) + remainder,
451: segment.substring(segmentPos), 0))
452: return null;
453:
454: // get the remaining combinations
455: return getEquivalents2(remainder);
456: }
457:
458: /*
459: // TODO: fix once we have a codepoint interface to get the canonical combining class
460: // TODO: Need public access to canonical combining class in UCharacter!
461: private static int getClass(int cp) {
462: return Normalizer.getClass((char)cp);
463: }
464: */
465:
466: // ================= BUILDER =========================
467: // TODO: Flatten this data so it doesn't have to be reconstructed each time!
468: private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
469: private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
470: static {
471: SET_WITH_NULL_STRING.add("");
472: }
473:
474: // private static UnicodeSet SAFE_START = new UnicodeSet();
475: // private static CharMap AT_START = new CharMap();
476:
477: // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
478: // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
479: // private static int LAST_UNICODE = 0x10FFFF;
480: /*
481: static {
482: buildData();
483: }
484: */
485: /*
486: private static void buildData() {
487:
488: if (PROGRESS) System.out.println("Getting Safe Start");
489: for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
490: if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
491: int cc = UCharacter.getCombiningClass(cp);
492: if (cc == 0) SAFE_START.add(cp);
493: // will fix to be really safe below
494: }
495: if (PROGRESS) System.out.println();
496:
497: if (PROGRESS) System.out.println("Getting Containment");
498: for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
499: if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
500:
501: if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
502:
503: //String istr = UTF16.valueOf(cp);
504: String decomp = Normalizer.normalize(cp, Normalizer.NFD);
505: //if (decomp.equals(istr)) continue;
506:
507: // add each character in the decomposition to canBeIn
508:
509: int component;
510: for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
511: component = UTF16.charAt(decomp, i);
512: if (i == 0) {
513: AT_START.add(component, cp);
514: } else if (UCharacter.getCombiningClass(component) == 0) {
515: SAFE_START.remove(component);
516: }
517: }
518: }
519: if (PROGRESS) System.out.println();
520: }
521: // the following is just for a map from characters to a set of characters
522:
523: private static class CharMap {
524: Map storage = new HashMap();
525: MutableInt probe = new MutableInt();
526: boolean converted = false;
527:
528: public void add(int cp, int whatItIsIn) {
529: UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
530: if (result == null) {
531: result = new UnicodeSet();
532: storage.put(probe, result);
533: }
534: result.add(whatItIsIn);
535: }
536:
537: public UnicodeSet get(int cp) {
538: return (UnicodeSet) storage.get(probe.set(cp));
539: }
540: }
541:
542: private static class MutableInt {
543: public int contents;
544: public int hashCode() { return contents; }
545: public boolean equals(Object other) {
546: return ((MutableInt)other).contents == contents;
547: }
548: // allows chaining
549: public MutableInt set(int contents) {
550: this.contents = contents;
551: return this;
552: }
553: }
554: */
555:
556: }
|