001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007:
008: package com.ibm.icu.dev.test.normalizer;
009:
010: import java.io.BufferedReader;
011: import java.io.IOException;
012:
013: import com.ibm.icu.dev.test.TestFmwk;
014: import com.ibm.icu.dev.test.TestUtil;
015: import com.ibm.icu.text.UTF16;
016: import com.ibm.icu.text.UnicodeSet;
017:
018: public class UnicodeNormalizerConformanceTest extends TestFmwk {
019:
020: UnicodeNormalizer normalizer_C, normalizer_D, normalizer_KC,
021: normalizer_KD;
022:
023: public static void main(String[] args) throws Exception {
024: new UnicodeNormalizerConformanceTest().run(args);
025: }
026:
027: public UnicodeNormalizerConformanceTest() {
028: // Doesn't matter what the string and mode are; we'll change
029: // them later as needed.
030: normalizer_C = new UnicodeNormalizer(UnicodeNormalizer.C, true);
031: normalizer_D = new UnicodeNormalizer(UnicodeNormalizer.D, false);
032: normalizer_KC = new UnicodeNormalizer(UnicodeNormalizer.KC,
033: false);
034: normalizer_KD = new UnicodeNormalizer(UnicodeNormalizer.KD,
035: false);
036:
037: }
038:
039: // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
040: static String[] moreCases = {
041: // Markus 2001aug30
042: "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
043:
044: // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
045: "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1" };
046:
047: /**
048: * Test the conformance of NewNormalizer to
049: * http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.
050: * This file must be located at the path specified as TEST_SUITE_FILE.
051: */
052: public void TestConformance() throws Exception {
053: BufferedReader input = null;
054: String line = null;
055: String[] fields = new String[5];
056: StringBuffer buf = new StringBuffer();
057: int passCount = 0;
058: int failCount = 0;
059: UnicodeSet other = new UnicodeSet(0, 0x10ffff);
060: int c = 0;
061: try {
062: input = TestUtil
063: .getDataReader("unicode/NormalizationTest.txt");
064: for (int count = 0;; ++count) {
065: line = input.readLine();
066: if (line == null) {
067: //read the extra test cases
068: if (count > moreCases.length) {
069: count = 0;
070: } else if (count == moreCases.length) {
071: // all done
072: break;
073: }
074: line = moreCases[count++];
075: }
076: if (line.length() == 0)
077: continue;
078:
079: // Expect 5 columns of this format:
080: // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
081:
082: // Skip comments
083: if (line.charAt(0) == '#' || line.charAt(0) == '@')
084: continue;
085:
086: // Parse out the fields
087: hexsplit(line, ';', fields, buf);
088:
089: // Remove a single code point from the "other" UnicodeSet
090: if (fields[0].length() == UTF16.moveCodePointOffset(
091: fields[0], 0, 1)) {
092: c = UTF16.charAt(fields[0], 0);
093: if (0xac20 <= c && c <= 0xd73f) {
094: // not an exhaustive test run: skip most Hangul syllables
095: if (c == 0xac20) {
096: other.remove(0xac20, 0xd73f);
097: }
098: continue;
099: }
100: other.remove(c);
101: }
102: if (checkConformance(fields, line)) {
103: ++passCount;
104: } else {
105: ++failCount;
106: }
107: if ((count % 1000) == 999) {
108: logln("Line " + (count + 1));
109: }
110: }
111: } catch (IOException ex) {
112: try {
113: input.close();
114: } catch (Exception ex2) {
115: System.out.print("");
116: }
117: ex.printStackTrace();
118: throw new IllegalArgumentException("Couldn't read file "
119: + ex.getClass().getName() + " " + ex.getMessage()
120: + " line = " + line);
121: }
122:
123: if (failCount != 0) {
124: errln("Total: " + failCount + " lines failed, " + passCount
125: + " lines passed");
126: } else {
127: logln("Total: " + passCount + " lines passed");
128: }
129: }
130:
131: /**
132: * Verify the conformance of the given line of the Unicode
133: * normalization (UTR 15) test suite file. For each line,
134: * there are five columns, corresponding to field[0]..field[4].
135: *
136: * The following invariants must be true for all conformant implementations
137: * c2 == NFC(c1) == NFC(c2) == NFC(c3)
138: * c3 == NFD(c1) == NFD(c2) == NFD(c3)
139: * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
140: * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
141: *
142: * @param field the 5 columns
143: * @param line the source line from the test suite file
144: * @return true if the test passes
145: */
146: private boolean checkConformance(String[] field, String line)
147: throws Exception {
148: boolean pass = true;
149: // StringBuffer buf = new StringBuffer(); // scratch
150: String out;
151: int i = 0;
152: for (i = 0; i < 5; ++i) {
153: if (i < 3) {
154: out = normalizer_C.normalize(field[i]);
155: pass &= assertEqual("C", field[i], out, field[1],
156: "c2!=C(c" + (i + 1));
157:
158: out = normalizer_D.normalize(field[i]);
159: pass &= assertEqual("D", field[i], out, field[2],
160: "c3!=D(c" + (i + 1));
161:
162: }
163: out = normalizer_KC.normalize(field[i]);
164: pass &= assertEqual("KC", field[i], out, field[3],
165: "c4!=KC(c" + (i + 1));
166:
167: out = normalizer_KD.normalize(field[i]);
168: pass &= assertEqual("KD", field[i], out, field[4],
169: "c5!=KD(c" + (i + 1));
170:
171: }
172:
173: if (!pass) {
174: errln("FAIL: " + line);
175: }
176:
177: return pass;
178: }
179:
180: /**
181: * @param op name of normalization form, e.g., "KC"
182: * @param s string being normalized
183: * @param got value received
184: * @param exp expected value
185: * @param msg description of this test
186: * @returns true if got == exp
187: */
188: private boolean assertEqual(String op, String s, String got,
189: String exp, String msg) {
190: if (exp.equals(got)) {
191: return true;
192: }
193: errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got)
194: + ", exp. " + hex(exp)));
195: return false;
196: }
197:
198: /**
199: * Split a string into pieces based on the given delimiter
200: * character. Then, parse the resultant fields from hex into
201: * characters. That is, "0040 0400;0C00;0899" -> new String[] {
202: * "\u0040\u0400", "\u0C00", "\u0899" }. The output is assumed to
203: * be of the proper length already, and exactly output.length
204: * fields are parsed. If there are too few an exception is
205: * thrown. If there are too many the extras are ignored.
206: *
207: * @param buf scratch buffer
208: */
209: private static void hexsplit(String s, char delimiter,
210: String[] output, StringBuffer buf) {
211: int i;
212: int pos = 0;
213: for (i = 0; i < output.length; ++i) {
214: int delim = s.indexOf(delimiter, pos);
215: if (delim < 0) {
216: throw new IllegalArgumentException("Missing field in "
217: + s);
218: }
219: // Our field is from pos..delim-1.
220: buf.setLength(0);
221:
222: String toHex = s.substring(pos, delim);
223: pos = delim;
224: int index = 0;
225: int len = toHex.length();
226: while (index < len) {
227: if (toHex.charAt(index) == ' ') {
228: index++;
229: } else {
230: int spacePos = toHex.indexOf(' ', index);
231: if (spacePos == -1) {
232: appendInt(buf, toHex.substring(index, len), s);
233: spacePos = len;
234: } else {
235: appendInt(buf,
236: toHex.substring(index, spacePos), s);
237: }
238: index = spacePos + 1;
239: }
240: }
241:
242: if (buf.length() < 1) {
243: throw new IllegalArgumentException("Empty field " + i
244: + " in " + s);
245: }
246: output[i] = buf.toString();
247: ++pos; // Skip over delim
248: }
249: }
250:
251: public static void appendInt(StringBuffer buf, String strToHex,
252: String s) {
253: int hex = Integer.parseInt(strToHex, 16);
254: if (hex < 0) {
255: throw new IllegalArgumentException("Out of range hex "
256: + hex + " in " + s);
257: } else if (hex > 0xFFFF) {
258: buf.append((char) ((hex >> 10) + 0xd7c0));
259: buf.append((char) ((hex & 0x3ff) | 0xdc00));
260: } else {
261: buf.append((char) hex);
262: }
263: }
264:
265: // Specific tests for debugging. These are generally failures
266: // taken from the conformance file, but culled out to make
267: // debugging easier. These can be eliminated without affecting
268: // coverage.
269:
270: public void _hideTestCase6() throws Exception {
271: _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
272: }
273:
274: public void _testOneLine(String line) throws Exception {
275: String[] fields = new String[5];
276: StringBuffer buf = new StringBuffer();
277: // Parse out the fields
278: hexsplit(line, ';', fields, buf);
279: checkConformance(fields, line);
280: }
281:
282: }
|