001: /*
002: *******************************************************************************
003: * Copyright (C) 2002-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: /**
009: * Port From: ICU4C v2.1 : collate/CollationRegressionTest
010: * Source File: $ICU4CRoot/source/test/intltest/regcoll.cpp
011: **/package com.ibm.icu.dev.test.collator;
012:
013: import com.ibm.icu.dev.test.*;
014: import com.ibm.icu.text.*;
015: import java.util.Locale;
016: import java.util.Comparator;
017: import java.util.Arrays;
018: import java.io.*;
019:
020: public class CollationThaiTest extends TestFmwk {
021:
022: final int MAX_FAILURES_TO_SHOW = -1;
023:
024: public static void main(String[] args) throws Exception {
025: new CollationThaiTest().run(args);
026: }
027:
028: /**
029: * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
030: * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
031: */
032: public void TestCornerCases() {
033: String TESTS[] = {
034: // Shorter words precede longer
035: "\u0e01",
036: "<",
037: "\u0e01\u0e01",
038:
039: // Tone marks are considered after letters (i.e. are primary ignorable)
040: "\u0e01\u0e32",
041: "<",
042: "\u0e01\u0e49\u0e32",
043:
044: // ditto for other over-marks
045: "\u0e01\u0e32",
046: "<",
047: "\u0e01\u0e32\u0e4c",
048:
049: // commonly used mark-in-context order.
050: // In effect, marks are sorted after each syllable.
051: "\u0e01\u0e32\u0e01\u0e49\u0e32",
052: "<",
053: "\u0e01\u0e48\u0e32\u0e01\u0e49\u0e32",
054:
055: // Hyphens and other punctuation follow whitespace but come before letters
056: "\u0e01\u0e32", "<",
057: "\u0e01\u0e32-",
058: "\u0e01\u0e32-",
059: "<",
060: "\u0e01\u0e32\u0e01\u0e32",
061:
062: // Doubler follows an indentical word without the doubler
063: "\u0e01\u0e32",
064: "<",
065: "\u0e01\u0e32\u0e46",
066: "\u0e01\u0e32\u0e46",
067: "<",
068: "\u0e01\u0e32\u0e01\u0e32",
069:
070: // \u0e45 after either \u0e24 or \u0e26 is treated as a single
071: // combining character, similar to "c < ch" in traditional spanish.
072: // TODO: beef up this case
073: "\u0e24\u0e29\u0e35", "<", "\u0e24\u0e45\u0e29\u0e35",
074: "\u0e26\u0e29\u0e35", "<", "\u0e26\u0e45\u0e29\u0e35",
075:
076: // Vowels reorder, should compare \u0e2d and \u0e34
077: "\u0e40\u0e01\u0e2d", "<", "\u0e40\u0e01\u0e34",
078:
079: // Tones are compared after the rest of the word (e.g. primary ignorable)
080: "\u0e01\u0e32\u0e01\u0e48\u0e32", "<",
081: "\u0e01\u0e49\u0e32\u0e01\u0e32",
082:
083: // Periods are ignored entirely
084: "\u0e01.\u0e01.", "<", "\u0e01\u0e32", };
085:
086: RuleBasedCollator coll = null;
087: try {
088: coll = getThaiCollator();
089: } catch (Exception e) {
090: warnln("could not construct Thai collator");
091: return;
092: }
093: compareArray(coll, TESTS);
094: }
095:
096: void compareArray(RuleBasedCollator c, String[] tests) {
097: for (int i = 0; i < tests.length; i += 3) {
098: int expect = 0;
099: if (tests[i + 1].equals("<")) {
100: expect = -1;
101: } else if (tests[i + 1].equals(">")) {
102: expect = 1;
103: } else if (tests[i + 1].equals("=")) {
104: expect = 0;
105: } else {
106: // expect = Integer.decode(tests[i+1]).intValue();
107: errln("Error: unknown operator " + tests[i + 1]);
108: return;
109: }
110: String s1 = tests[i];
111: String s2 = tests[i + 2];
112: CollationTest.doTest(this , c, s1, s2, expect);
113: }
114: }
115:
116: int sign(int i) {
117: if (i < 0)
118: return -1;
119: if (i > 0)
120: return 1;
121: return 0;
122: }
123:
124: /**
125: * Read the external dictionary file, which is already in proper
126: * sorted order, and confirm that the collator compares each line as
127: * preceding the following line.
128: */
129: public void TestDictionary() {
130: RuleBasedCollator coll = null;
131: try {
132: coll = getThaiCollator();
133: } catch (Exception e) {
134: warnln("could not construct Thai collator");
135: return;
136: }
137:
138: // Read in a dictionary of Thai words
139: BufferedReader in = null;
140: String fileName = "riwords.txt";
141: try {
142: in = TestUtil.getDataReader(fileName, "UTF-8");
143: } catch (SecurityException e) {
144: warnln("Security exception encountered reading test data file.");
145: return;
146: } catch (Exception e) {
147: try {
148: if (in != null) {
149: in.close();
150: }
151: } catch (IOException ioe) {
152: }
153: errln("Error: could not open test file: " + fileName
154: + ". Aborting test.");
155: return;
156: }
157:
158: //
159: // Loop through each word in the dictionary and compare it to the previous
160: // word. They should be in sorted order.
161: //
162: String lastWord = "";
163: int line = 0;
164: int failed = 0;
165: int wordCount = 0;
166: try {
167: String word = in.readLine();
168: while (word != null) {
169: line++;
170:
171: // Skip comments and blank lines
172: if (word.length() == 0 || word.charAt(0) == 0x23) {
173: word = in.readLine();
174: continue;
175: }
176:
177: // Show the first 8 words being compared, so we can see what's happening
178: ++wordCount;
179: if (wordCount <= 8) {
180: logln("Word " + wordCount + ": " + word);
181: }
182:
183: if (lastWord.length() > 0) {
184: CollationTest
185: .doTest(this , coll, lastWord, word, -1);
186: int result = coll.compare(lastWord, word);
187:
188: if (result >= 0) {
189: failed++;
190: if (MAX_FAILURES_TO_SHOW < 0
191: || failed <= MAX_FAILURES_TO_SHOW) {
192: String msg = "--------------------------------------------\n"
193: + line
194: + " compare("
195: + lastWord
196: + ", "
197: + word
198: + ") returned "
199: + result + ", expected -1\n";
200: CollationKey k1, k2;
201: try {
202: k1 = coll.getCollationKey(lastWord);
203: k2 = coll.getCollationKey(word);
204: } catch (Exception e) {
205: errln("Fail: getCollationKey returned ");
206: return;
207: }
208: msg += "key1: " + prettify(k1) + "\n"
209: + "key2: " + prettify(k2);
210: errln(msg);
211: }
212: }
213: }
214: lastWord = word;
215: word = in.readLine();
216: }
217: } catch (IOException e) {
218: errln("IOException " + e.getMessage());
219: }
220:
221: if (failed != 0) {
222: if (failed > MAX_FAILURES_TO_SHOW) {
223: errln("Too many failures; only the first "
224: + MAX_FAILURES_TO_SHOW + " failures were shown");
225: }
226: errln("Summary: " + failed + " of " + (line - 1)
227: + " comparisons failed");
228: }
229:
230: logln("Words checked: " + wordCount);
231: }
232:
233: public void TestInvalidThai() {
234: String tests[] = { "\u0E44\u0E01\u0E44\u0E01",
235: "\u0E44\u0E01\u0E01\u0E44", "\u0E01\u0E44\u0E01\u0E44",
236: "\u0E01\u0E01\u0E44\u0E44", "\u0E44\u0E44\u0E01\u0E01",
237: "\u0E01\u0E44\u0E44\u0E01", };
238:
239: RuleBasedCollator collator;
240: StrCmp comparator;
241: try {
242: collator = getThaiCollator();
243: comparator = new StrCmp();
244: } catch (Exception e) {
245: warnln("could not construct Thai collator");
246: return;
247: }
248:
249: Arrays.sort(tests, comparator);
250:
251: for (int i = 0; i < tests.length; i++) {
252: for (int j = i + 1; j < tests.length; j++) {
253: if (collator.compare(tests[i], tests[j]) > 0) {
254: // inconsistency ordering found!
255: errln("Inconsistent ordering between strings " + i
256: + " and " + j);
257: }
258: }
259: CollationElementIterator iterator = collator
260: .getCollationElementIterator(tests[i]);
261: CollationTest.backAndForth(this , iterator);
262: }
263: }
264:
265: public void TestReordering() {
266: String tests[] = {
267: "\u0E41c\u0301",
268: "=",
269: "\u0E41\u0107", // composition
270: "\u0E41\uD835\uDFCE",
271: "<",
272: "\u0E41\uD835\uDFCF", // supplementaries
273: "\u0E41\uD834\uDD5F",
274: "=",
275: "\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
276: "\u0E41\uD87E\uDC02",
277: "=",
278: "\u0E41\u4E41", // supplementary composition decomps to BMP
279: "\u0E41\u0301",
280: "=",
281: "\u0E41\u0301", // unsafe (just checking backwards iteration)
282: "\u0E41\u0301\u0316",
283: "=",
284: "\u0E41\u0316\u0301",
285:
286: "abc\u0E41c\u0301",
287: "=",
288: "abc\u0E41\u0107", // composition
289: "abc\u0E41\uD834\uDC00",
290: "<",
291: "abc\u0E41\uD834\uDC01", // supplementaries
292: "abc\u0E41\uD834\uDD5F",
293: "=",
294: "abc\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
295: "abc\u0E41\uD87E\uDC02",
296: "=",
297: "abc\u0E41\u4E41", // supplementary composition decomps to BMP
298: "abc\u0E41\u0301",
299: "=",
300: "abc\u0E41\u0301", // unsafe (just checking backwards iteration)
301: "abc\u0E41\u0301\u0316",
302: "=",
303: "abc\u0E41\u0316\u0301",
304:
305: "\u0E41c\u0301abc",
306: "=",
307: "\u0E41\u0107abc", // composition
308: "\u0E41\uD834\uDC00abc",
309: "<",
310: "\u0E41\uD834\uDC01abc", // supplementaries
311: "\u0E41\uD834\uDD5Fabc",
312: "=",
313: "\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
314: "\u0E41\uD87E\uDC02abc",
315: "=",
316: "\u0E41\u4E41abc", // supplementary composition decomps to BMP
317: "\u0E41\u0301abc",
318: "=",
319: "\u0E41\u0301abc", // unsafe (just checking backwards iteration)
320: "\u0E41\u0301\u0316abc", "=", "\u0E41\u0316\u0301abc",
321:
322: "abc\u0E41c\u0301abc",
323: "=",
324: "abc\u0E41\u0107abc", // composition
325: "abc\u0E41\uD834\uDC00abc",
326: "<",
327: "abc\u0E41\uD834\uDC01abc", // supplementaries
328: "abc\u0E41\uD834\uDD5Fabc",
329: "=",
330: "abc\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
331: "abc\u0E41\uD87E\uDC02abc", "=",
332: "abc\u0E41\u4E41abc", // supplementary composition decomps to BMP
333: "abc\u0E41\u0301abc", "=",
334: "abc\u0E41\u0301abc", // unsafe (just checking backwards iteration)
335: "abc\u0E41\u0301\u0316abc", "=",
336: "abc\u0E41\u0316\u0301abc", };
337:
338: RuleBasedCollator collator;
339: try {
340: collator = (RuleBasedCollator) getThaiCollator();
341: } catch (Exception e) {
342: warnln("could not construct Thai collator");
343: return;
344: }
345: compareArray(collator, tests);
346:
347: String rule = "& c < ab";
348: String testcontraction[] = { "\u0E41ab", ">", "\u0E41c" };
349: try {
350: collator = new RuleBasedCollator(rule);
351: } catch (Exception e) {
352: errln("Error: could not construct collator with rule "
353: + rule);
354: return;
355: }
356: compareArray(collator, testcontraction);
357: }
358:
359: String prettify(CollationKey sourceKey) {
360: int i;
361: byte[] bytes = sourceKey.toByteArray();
362: String target = "[";
363:
364: for (i = 0; i < bytes.length; i++) {
365: target += Integer.toHexString(bytes[i]);
366: target += " ";
367: }
368: target += "]";
369: return target;
370: }
371:
372: // private inner class -------------------------------------------------
373:
374: private static final class StrCmp implements Comparator {
375: public int compare(Object string1, Object string2) {
376: return collator.compare(string1, string2);
377: }
378:
379: StrCmp() throws Exception {
380: collator = getThaiCollator();
381: }
382:
383: Collator collator;
384: }
385:
386: // private data members ------------------------------------------------
387:
388: private static RuleBasedCollator m_collator_;
389:
390: // private methods -----------------------------------------------------
391:
392: private static RuleBasedCollator getThaiCollator() throws Exception {
393: if (m_collator_ == null) {
394: m_collator_ = (RuleBasedCollator) Collator
395: .getInstance(new Locale("th", "TH", ""));
396: }
397: return m_collator_;
398: }
399: }
|