/*
 *******************************************************************************
 * Copyright (C) 1996-2005, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.dev.test.normalizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.StringCharacterIterator;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

public class ConformanceTest extends TestFmwk {

    Normalizer normalizer;

    public static void main(String[] args) throws Exception {
        new ConformanceTest().run(args);
    }

    public ConformanceTest() {
        // Doesn't matter what the string and mode are; we'll change
        // them later as needed.
        normalizer = new Normalizer("", Normalizer.NFC, 0);
    }

    // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
    static String[] moreCases = {
        // Markus 2001aug30
        "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",

        // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
        "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
    };

    /**
     * Test the conformance of Normalizer to
     * http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt
     * and http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt.
     * The test data file is opened via TestUtil.getDataReader().
     */
    public void TestConformance() throws Exception {
        runConformance("unicode/NormalizationTest.txt", 0);
    }

    public void TestConformance_3_2() throws Exception {
        runConformance("unicode/NormalizationTest-3.2.0.txt",
                Normalizer.UNICODE_3_2);
    }

    public void runConformance(String fileName, int options)
            throws Exception {
        BufferedReader input = null;
        String line = null;
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        int passCount = 0;
        int failCount = 0;
        UnicodeSet other = new UnicodeSet(0, 0x10ffff);
        int c = 0;
        try {
            input = TestUtil.getDataReader(fileName);
            for (int count = 0;; ++count) {
                line = input.readLine();
                if (line == null) {
                    // Read the extra test cases: reset count to index into
                    // moreCases the first time we reach the end of the file.
                    if (count > moreCases.length) {
                        count = 0;
                    } else if (count == moreCases.length) {
                        // all done
                        break;
                    }
                    line = moreCases[count];
                }
                if (line.length() == 0)
                    continue;

                // Expect 5 columns of this format:
                // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>

                // Skip comments
                if (line.charAt(0) == '#' || line.charAt(0) == '@')
                    continue;

                // Parse out the fields
                hexsplit(line, ';', fields, buf);

                // Remove a single code point from the "other" UnicodeSet
                if (fields[0].length() == UTF16.moveCodePointOffset(
                        fields[0], 0, 1)) {
                    c = UTF16.charAt(fields[0], 0);
                    if (0xac20 <= c && c <= 0xd73f) {
                        // not an exhaustive test run: skip most Hangul syllables
                        if (c == 0xac20) {
                            other.remove(0xac20, 0xd73f);
                        }
                        continue;
                    }
                    other.remove(c);
                }
                if (checkConformance(fields, line, options)) {
                    ++passCount;
                } else {
                    ++failCount;
                }
                if ((count % 1000) == 999) {
                    logln("Line " + (count + 1));
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
            throw new IllegalArgumentException("Couldn't read file "
                    + ex.getClass().getName() + " " + ex.getMessage()
                    + " line = " + line);
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException ignored) {
                    // nothing useful to do if close() fails
                }
            }
        }

        if (failCount != 0) {
            errln("Total: " + failCount + " lines failed, " + passCount
                    + " lines passed");
        } else {
            logln("Total: " + passCount + " lines passed");
        }
    }

    /**
     * Verify the conformance of the given line of the Unicode
     * normalization (UTR 15) test suite file. For each line,
     * there are five columns, corresponding to field[0]..field[4].
     *
     * The following invariants must be true for all conformant implementations:
     *  c2 == NFC(c1) == NFC(c2) == NFC(c3)
     *  c3 == NFD(c1) == NFD(c2) == NFD(c3)
     *  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
     *  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
     *
     * @param field the 5 columns
     * @param line the source line from the test suite file
     * @param options the normalization options to pass to the Normalizer API
     * @return true if the test passes
     */
    private boolean checkConformance(String[] field, String line,
            int options) throws Exception {
        boolean pass = true;
        StringBuffer buf = new StringBuffer(); // scratch
        String out, fcd;
        int i = 0;
        for (i = 0; i < 5; ++i) {
            if (i < 3) {
                out = Normalizer.normalize(field[i], Normalizer.NFC, options);
                pass &= assertEqual("C", field[i], out, field[1],
                        "c2!=C(c" + (i + 1));

                out = iterativeNorm(field[i], Normalizer.NFC, buf, +1, options);
                pass &= assertEqual("C(+1)", field[i], out, field[1],
                        "c2!=C(c" + (i + 1));

                out = iterativeNorm(field[i], Normalizer.NFC, buf, -1, options);
                pass &= assertEqual("C(-1)", field[i], out, field[1],
                        "c2!=C(c" + (i + 1));

                out = iterativeNorm(new StringCharacterIterator(field[i]),
                        Normalizer.NFC, buf, +1, options);
                pass &= assertEqual("C(+1)", field[i], out, field[1],
                        "c2!=C(c" + (i + 1));

                out = iterativeNorm(new StringCharacterIterator(field[i]),
                        Normalizer.NFC, buf, -1, options);
                pass &= assertEqual("C(-1)", field[i], out, field[1],
                        "c2!=C(c" + (i + 1));

                out = Normalizer.normalize(field[i], Normalizer.NFD, options);
185: pass &= assertEqual("D", field[i], out, field[2],
186: "c3!=D(c" + (i + 1));
187:
188: out = iterativeNorm(field[i], Normalizer.NFD, buf, +1,
189: options);
190: pass &= assertEqual("D(+1)", field[i], out, field[2],
191: "c3!=D(c" + (i + 1));
192:
193: out = iterativeNorm(field[i], Normalizer.NFD, buf, -1,
194: options);
195: pass &= assertEqual("D(-1)", field[i], out, field[2],
196: "c3!=D(c" + (i + 1));
197:
198: out = iterativeNorm(new StringCharacterIterator(
199: field[i]), Normalizer.NFD, buf, +1, options);
200: pass &= assertEqual("D(+1)", field[i], out, field[2],
201: "c3!=D(c" + (i + 1));
202:
203: out = iterativeNorm(new StringCharacterIterator(
204: field[i]), Normalizer.NFD, buf, -1, options);
205: pass &= assertEqual("D(-1)", field[i], out, field[2],
206: "c3!=D(c" + (i + 1));
207:
208: cross(field[2] /*NFD String*/,
209: field[1]/*NFC String*/, Normalizer.NFC);
210: cross(field[1] /*NFC String*/,
211: field[2]/*NFD String*/, Normalizer.NFD);
212: }
            out = Normalizer.normalize(field[i], Normalizer.NFKC, options);
            pass &= assertEqual("KC", field[i], out, field[3],
                    "c4!=KC(c" + (i + 1));

            out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1, options);
            pass &= assertEqual("KC(+1)", field[i], out, field[3],
                    "c4!=KC(c" + (i + 1));

            out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1, options);
            pass &= assertEqual("KC(-1)", field[i], out, field[3],
                    "c4!=KC(c" + (i + 1));

            out = iterativeNorm(new StringCharacterIterator(field[i]),
                    Normalizer.NFKC, buf, +1, options);
            pass &= assertEqual("KC(+1)", field[i], out, field[3],
                    "c4!=KC(c" + (i + 1));

            out = iterativeNorm(new StringCharacterIterator(field[i]),
                    Normalizer.NFKC, buf, -1, options);
            pass &= assertEqual("KC(-1)", field[i], out, field[3],
                    "c4!=KC(c" + (i + 1));

            out = Normalizer.normalize(field[i], Normalizer.NFKD, options);
            pass &= assertEqual("KD", field[i], out, field[4],
                    "c5!=KD(c" + (i + 1));

            out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1, options);
            pass &= assertEqual("KD(+1)", field[i], out, field[4],
                    "c5!=KD(c" + (i + 1));

            out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1, options);
            pass &= assertEqual("KD(-1)", field[i], out, field[4],
                    "c5!=KD(c" + (i + 1));

            out = iterativeNorm(new StringCharacterIterator(field[i]),
                    Normalizer.NFKD, buf, +1, options);
            pass &= assertEqual("KD(+1)", field[i], out, field[4],
                    "c5!=KD(c" + (i + 1));

            out = iterativeNorm(new StringCharacterIterator(field[i]),
                    Normalizer.NFKD, buf, -1, options);
            pass &= assertEqual("KD(-1)", field[i], out, field[4],
                    "c5!=KD(c" + (i + 1));

            cross(field[4] /* NFKD String */, field[3] /* NFKC String */,
                    Normalizer.NFKC);
            cross(field[3] /* NFKC String */, field[4] /* NFKD String */,
                    Normalizer.NFKD);
        }
        compare(field[1], field[2]);
        compare(field[0], field[1]);
        compare(field[0], field[2]);
        // test quick checks
        if (Normalizer.NO == Normalizer.quickCheck(field[1], Normalizer.NFC,
                options)) {
            errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.NFD,
                options)) {
            errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[3], Normalizer.NFKC,
                options)) {
            errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.NFKD,
                options)) {
            errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
            pass = false;
        }

        if (!Normalizer.isNormalized(field[1], Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }
        if (!field[0].equals(field[1])
                && Normalizer.isNormalized(field[0], Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
            pass = false;
        }
        if (!Normalizer.isNormalized(field[3], Normalizer.NFKC, options)) {
            errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
            pass = false;
        }
        if (!field[0].equals(field[3])
                && Normalizer.isNormalized(field[0], Normalizer.NFKC, options)) {
            errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
            pass = false;
        }
        // test the API that takes a char[]
        if (!Normalizer.isNormalized(field[1].toCharArray(), 0,
                field[1].length(), Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }
        // test the API that takes a code point
        if (!Normalizer.isNormalized(UTF16.charAt(field[1], 0),
                Normalizer.NFC, options)) {
            errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
            pass = false;
        }
        // test FCD quick check and "makeFCD"
        fcd = Normalizer.normalize(field[0], Normalizer.FCD);
        if (Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD,
                options)) {
            errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }
        // check that makeFCD returns the correct length
        {
            char[] fcd2 = new char[fcd.length() * 2];
            char[] src = field[0].toCharArray();
            int fcdLen = Normalizer.normalize(src, 0, src.length, fcd2,
                    fcd.length(), fcd2.length, Normalizer.FCD, 0);
            if (fcdLen != fcd.length()) {
                errln("makeFCD did not return the correct length");
            }
        }
        if (Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD,
                options)) {
            errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.FCD,
                options)) {
            errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }
        if (Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.FCD,
                options)) {
            errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
            pass = false;
        }

        // exercise FCD iteration in both directions (no expected output to compare)
        out = iterativeNorm(new StringCharacterIterator(field[0]),
                Normalizer.FCD, buf, +1, options);
        out = iterativeNorm(new StringCharacterIterator(field[0]),
                Normalizer.FCD, buf, -1, options);

        out = iterativeNorm(new StringCharacterIterator(field[2]),
                Normalizer.FCD, buf, +1, options);
        out = iterativeNorm(new StringCharacterIterator(field[2]),
                Normalizer.FCD, buf, -1, options);

        out = iterativeNorm(new StringCharacterIterator(field[4]),
                Normalizer.FCD, buf, +1, options);
        out = iterativeNorm(new StringCharacterIterator(field[4]),
                Normalizer.FCD, buf, -1, options);

        out = Normalizer.normalize(fcd, Normalizer.NFD);
        if (!out.equals(field[2])) {
            errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
            pass = false;
        }
        if (!pass) {
            errln("FAIL: " + line);
        }
        if (!field[0].equals(field[2])) {
            // two strings that are canonically equivalent must test
            // equal under a canonical caseless match;
            // see UAX #21 Case Mappings and Jitterbug 2021 and
            // Unicode Technical Committee meeting consensus 92-C31
            int rc;
            if ((rc = Normalizer.compare(field[0], field[2],
                    (options << Normalizer.COMPARE_NORM_OPTIONS_SHIFT)
                            | Normalizer.COMPARE_IGNORE_CASE)) != 0) {
                errln("Normalizer.compare(original, NFD, case-insensitive) returned "
                        + rc + " instead of 0 for equal");
                pass = false;
            }
        }

        return pass;
    }

    // two strings that are canonically equivalent must test
    // equal under a canonical caseless match;
    // see UAX #21 Case Mappings and Jitterbug 2021 and
    // Unicode Technical Committee meeting consensus 92-C31
    private void compare(String s1, String s2) {
        if (s1.length() == 1 && s2.length() == 1) {
            if (Normalizer.compare(UTF16.charAt(s1, 0), UTF16.charAt(s2, 0),
                    Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(int,int) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
        if (s1.length() == 1 && s2.length() > 1) {
            if (Normalizer.compare(UTF16.charAt(s1, 0), s2,
                    Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(int,String) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
        if (s1.length() > 1 && s2.length() > 1) {
            // TODO: Re-enable these tests after the UTC fixes UAX 21
            if (Normalizer.compare(s1.toCharArray(), s2.toCharArray(),
                    Normalizer.COMPARE_IGNORE_CASE) != 0) {
                errln("Normalizer.compare(char[],char[]) failed for s1: "
                        + Utility.hex(s1) + " s2: " + Utility.hex(s2));
            }
        }
    }

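    // Check a cross-normalization invariant: normalizing s1 with the given
    // mode must produce s2. Called above with the NFD/NFC (and NFKD/NFKC)
    // columns to verify, e.g., NFC(NFD(s)) == NFC(s).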
    private void cross(String s1, String s2, Normalizer.Mode mode) {
        String result = Normalizer.normalize(s1, mode);
        if (!result.equals(s2)) {
            errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
                    + Utility.hex(s2));
        }
    }

    /**
     * Do a normalization using the iterative API in the given direction.
     * @param buf scratch buffer
     * @param dir either +1 or -1
     */
    private String iterativeNorm(String str, Normalizer.Mode mode,
            StringBuffer buf, int dir, int options) throws Exception {
        normalizer.setText(str);
        normalizer.setMode(mode);
        buf.setLength(0);
        normalizer.setOption(-1, false); // reset all options
        normalizer.setOption(options, true); // set desired options

        int ch;
        if (dir > 0) {
            for (ch = normalizer.first(); ch != Normalizer.DONE;
                    ch = normalizer.next()) {
                buf.append(UTF16.valueOf(ch));
            }
        } else {
            for (ch = normalizer.last(); ch != Normalizer.DONE;
                    ch = normalizer.previous()) {
                buf.insert(0, UTF16.valueOf(ch));
            }
        }
        return buf.toString();
    }

    /**
     * Do a normalization using the iterative API in the given direction.
     * @param str a Java StringCharacterIterator
     * @param buf scratch buffer
     * @param dir either +1 or -1
     */
    private String iterativeNorm(StringCharacterIterator str,
            Normalizer.Mode mode, StringBuffer buf, int dir, int options)
            throws Exception {
        normalizer.setText(str);
        normalizer.setMode(mode);
        buf.setLength(0);
        normalizer.setOption(-1, false); // reset all options
        normalizer.setOption(options, true); // set desired options

        int ch;
        if (dir > 0) {
            for (ch = normalizer.first(); ch != Normalizer.DONE;
                    ch = normalizer.next()) {
                buf.append(UTF16.valueOf(ch));
            }
        } else {
            for (ch = normalizer.last(); ch != Normalizer.DONE;
                    ch = normalizer.previous()) {
                buf.insert(0, UTF16.valueOf(ch));
            }
        }
        return buf.toString();
    }

    /**
     * @param op name of the normalization form, e.g., "KC"
     * @param s the string being normalized
     * @param got the value received
     * @param exp the expected value
     * @param msg description of this test
     * @return true if got == exp
     */
    private boolean assertEqual(String op, String s, String got,
            String exp, String msg) {
        if (exp.equals(got)) {
            return true;
        }
        errln(" " + msg + ") " + op + "(" + s + ")=" + hex(got)
                + ", exp. " + hex(exp));
        return false;
    }

    /**
     * Split a string into pieces based on the given delimiter
     * character. Then, parse the resultant fields from hex into
     * characters. That is, "0040 0400;0C00;0899" -> new String[] {
     * "\u0040\u0400", "\u0C00", "\u0899" }. The output array is assumed to
     * be of the proper length already, and exactly output.length
     * fields are parsed. If there are too few an exception is
     * thrown. If there are too many the extras are ignored.
     *
     * @param s the input line
     * @param delimiter the field delimiter, e.g. ';'
     * @param output receives the parsed fields
     * @param buf scratch buffer
     */
    private static void hexsplit(String s, char delimiter,
            String[] output, StringBuffer buf) {
        int i;
        int pos = 0;
        for (i = 0; i < output.length; ++i) {
            int delim = s.indexOf(delimiter, pos);
            if (delim < 0) {
                throw new IllegalArgumentException("Missing field in " + s);
            }
            // Our field is from pos..delim-1.
            buf.setLength(0);

            String toHex = s.substring(pos, delim);
            pos = delim;
            int index = 0;
            int len = toHex.length();
            while (index < len) {
                if (toHex.charAt(index) == ' ') {
                    index++;
                } else {
                    int spacePos = toHex.indexOf(' ', index);
                    if (spacePos == -1) {
                        appendInt(buf, toHex.substring(index, len), s);
                        spacePos = len;
                    } else {
                        appendInt(buf, toHex.substring(index, spacePos), s);
                    }
                    index = spacePos + 1;
                }
            }

            if (buf.length() < 1) {
                throw new IllegalArgumentException("Empty field " + i
                        + " in " + s);
            }
            output[i] = buf.toString();
            ++pos; // Skip over delim
        }
    }

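    // Parse one hex code point and append it to buf as UTF-16. Code points
    // above U+FFFF are appended as a surrogate pair:
    //   lead  = 0xD800 + ((cp - 0x10000) >> 10) == (cp >> 10) + 0xD7C0
    //   trail = 0xDC00 + (cp & 0x3FF)
    // e.g. U+10400 -> D801 DC00.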
    public static void appendInt(StringBuffer buf, String strToHex, String s) {
        int hex = Integer.parseInt(strToHex, 16);
        if (hex < 0) {
            throw new IllegalArgumentException("Out of range hex " + hex
                    + " in " + s);
        } else if (hex > 0xFFFF) {
            buf.append((char) ((hex >> 10) + 0xd7c0));
            buf.append((char) ((hex & 0x3ff) | 0xdc00));
        } else {
            buf.append((char) hex);
        }
    }

    // Specific tests for debugging. These are generally failures
    // taken from the conformance file, but culled out to make
    // debugging easier. These can be eliminated without affecting
    // coverage.

    public void _hideTestCase6(int options) throws Exception {
        _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;",
                options);
    }

    public void _testOneLine(String line, int options) throws Exception {
        String[] fields = new String[5];
        StringBuffer buf = new StringBuffer();
        // Parse out the fields
        hexsplit(line, ';', fields, buf);
        checkConformance(fields, line, options);
    }

}