001: /*
002: *******************************************************************************
003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.dev.test.rbbi;
008:
009: //Regression testing of RuleBasedBreakIterator
010: //
011: // TODO: These tests should be mostly retired.
// Much of the test data that was originally here was removed when the RBBI rules
013: // were updated to match the Unicode boundary TRs, and the data was found to be invalid.
014: // Much of the remaining data has been moved into the rbbitst.txt test data file,
015: // which is common between ICU4C and ICU4J. The remaining test data should also be moved,
016: // or simply retired if it is no longer interesting.
017: import com.ibm.icu.dev.test.*;
018: import com.ibm.icu.text.RuleBasedBreakIterator;
019: import com.ibm.icu.text.BreakIterator;
020: import java.util.Vector;
021:
022: public class RBBITest extends TestFmwk {
023:
024: public static void main(String[] args) throws Exception {
025: new RBBITest().run(args);
026: }
027:
028: public RBBITest() {
029: }
030:
031: private static final String halfNA = "\u0928\u094d\u200d"; /*halfform NA = devanigiri NA + virama(supresses inherent vowel)+ zero width joiner */
032:
033: // tests default rules based character iteration.
034: // Builds a new iterator from the source rules in the default (prebuilt) iterator.
035: //
036: public void TestDefaultRuleBasedCharacterIteration() {
037: RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) BreakIterator
038: .getCharacterInstance();
039: logln("Testing the RBBI for character iteration by using default rules");
040:
041: //fetch the rules used to create the above RuleBasedBreakIterator
042: String defaultRules = rbbi.toString();
043:
044: RuleBasedBreakIterator charIterDefault = null;
045: try {
046: charIterDefault = new RuleBasedBreakIterator(defaultRules);
047: } catch (IllegalArgumentException iae) {
048: errln("ERROR: failed construction in TestDefaultRuleBasedCharacterIteration()"
049: + iae.toString());
050: }
051:
052: Vector chardata = new Vector();
053: chardata.addElement("H");
054: chardata.addElement("e");
055: chardata.addElement("l");
056: chardata.addElement("l");
057: chardata.addElement("o");
058: chardata.addElement("e\u0301"); //acuteE
059: chardata.addElement("&");
060: chardata.addElement("e\u0303"); //tildaE
061: //devanagiri characters for Hindi support
062: chardata.addElement("\u0906"); //devanagiri AA
063: //chardata.addElement("\u093e\u0901"); //devanagiri vowelsign AA+ chandrabindhu
064: chardata.addElement("\u0916\u0947"); //devanagiri KHA+vowelsign E
065: chardata.addElement("\u0938\u0941\u0902"); //devanagiri SA+vowelsign U + anusvara(bindu)
066: chardata.addElement("\u0926"); //devanagiri consonant DA
067: chardata.addElement("\u0930"); //devanagiri consonant RA
068: // chardata.addElement("\u0939\u094c"); //devanagiri HA+vowel sign AI
069: chardata.addElement("\u0964"); //devanagiri danda
070: //end hindi characters
071: chardata.addElement("A\u0302"); // circumflexA
072: chardata.addElement("i\u0301"); // acuteBelowI
073: // conjoining jamo...
074: chardata.addElement("\u1109\u1161\u11bc");
075: chardata.addElement("\u1112\u1161\u11bc");
076: chardata.addElement("\n");
077: chardata.addElement("\r\n"); // keep CRLF sequences together
078: chardata.addElement("S\u0300"); //graveS
079: chardata.addElement("i\u0301"); // acuteBelowI
080: chardata.addElement("!");
081:
082: // What follows is a string of Korean characters (I found it in the Yellow Pages
083: // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
084: // it correctly), first as precomposed syllables, and then as conjoining jamo.
085: // Both sequences should be semantically identical and break the same way.
086: // precomposed syllables...
087: chardata.addElement("\uc0c1");
088: chardata.addElement("\ud56d");
089: chardata.addElement(" ");
090: chardata.addElement("\ud55c");
091: chardata.addElement("\uc778");
092: chardata.addElement(" ");
093: chardata.addElement("\uc5f0");
094: chardata.addElement("\ud569");
095: chardata.addElement(" ");
096: chardata.addElement("\uc7a5");
097: chardata.addElement("\ub85c");
098: chardata.addElement("\uad50");
099: chardata.addElement("\ud68c");
100: chardata.addElement(" ");
101: // conjoining jamo...
102: chardata.addElement("\u1109\u1161\u11bc");
103: chardata.addElement("\u1112\u1161\u11bc");
104: chardata.addElement(" ");
105: chardata.addElement("\u1112\u1161\u11ab");
106: chardata.addElement("\u110b\u1175\u11ab");
107: chardata.addElement(" ");
108: chardata.addElement("\u110b\u1167\u11ab");
109: chardata.addElement("\u1112\u1161\u11b8");
110: chardata.addElement(" ");
111: chardata.addElement("\u110c\u1161\u11bc");
112: chardata.addElement("\u1105\u1169");
113: chardata.addElement("\u1100\u116d");
114: chardata.addElement("\u1112\u116c");
115:
116: generalIteratorTest(charIterDefault, chardata);
117:
118: }
119:
120: public void TestDefaultRuleBasedWordIteration() {
121: logln("Testing the RBBI for word iteration using default rules");
122: RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) BreakIterator
123: .getWordInstance();
124: //fetch the rules used to create the above RuleBasedBreakIterator
125: String defaultRules = rbbi.toString();
126:
127: RuleBasedBreakIterator wordIterDefault = null;
128: try {
129: wordIterDefault = new RuleBasedBreakIterator(defaultRules);
130: } catch (IllegalArgumentException iae) {
131: errln("ERROR: failed construction in TestDefaultRuleBasedWordIteration() -- custom rules"
132: + iae.toString());
133: }
134:
135: Vector worddata = new Vector();
136: worddata.addElement("Write");
137: worddata.addElement(" ");
138: worddata.addElement("wordrules");
139: worddata.addElement(".");
140: worddata.addElement(" ");
141: //worddata.addElement("alpha-beta-gamma");
142: worddata.addElement(" ");
143: worddata.addElement("\u092f\u0939");
144: worddata.addElement(" ");
145: worddata.addElement("\u0939\u093f" + halfNA + "\u0926\u0940");
146: worddata.addElement(" ");
147: worddata.addElement("\u0939\u0948");
148: // worddata.addElement("\u0964"); //danda followed by a space
149: worddata.addElement(" ");
150: worddata.addElement("\u0905\u093e\u092a");
151: worddata.addElement(" ");
152: worddata.addElement("\u0938\u093f\u0916\u094b\u0917\u0947");
153: worddata.addElement("?");
154: worddata.addElement(" ");
155: worddata.addElement("\r");
156: worddata.addElement("It's");
157: worddata.addElement(" ");
158: // worddata.addElement("$30.10");
159: worddata.addElement(" ");
160: worddata.addElement(" ");
161: worddata.addElement("Badges");
162: worddata.addElement("?");
163: worddata.addElement(" ");
164: worddata.addElement("BADGES");
165: worddata.addElement("!");
166: worddata.addElement("1000,233,456.000");
167: worddata.addElement(" ");
168:
169: generalIteratorTest(wordIterDefault, worddata);
170: }
171:
172: private static final String kParagraphSeparator = "\u2029";
173: private static final String kLineSeparator = "\u2028";
174:
175: public void TestDefaultRuleBasedSentenceIteration() {
176: logln("Testing the RBBI for sentence iteration using default rules");
177: RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) BreakIterator
178: .getSentenceInstance();
179:
180: //fetch the rules used to create the above RuleBasedBreakIterator
181: String defaultRules = rbbi.toString();
182: RuleBasedBreakIterator sentIterDefault = null;
183: try {
184: sentIterDefault = new RuleBasedBreakIterator(defaultRules);
185: } catch (IllegalArgumentException iae) {
186: errln("ERROR: failed construction in TestDefaultRuleBasedSentenceIteration()"
187: + iae.toString());
188: }
189:
190: Vector sentdata = new Vector();
191: sentdata.addElement("(This is it.) ");
192: sentdata.addElement("Testing the sentence iterator. ");
193: sentdata.addElement("\"This isn\'t it.\" ");
194: sentdata.addElement("Hi! ");
195: sentdata.addElement("This is a simple sample sentence. ");
196: sentdata.addElement("(This is it.) ");
197: sentdata.addElement("This is a simple sample sentence. ");
198: sentdata.addElement("\"This isn\'t it.\" ");
199: sentdata.addElement("Hi! ");
200: sentdata.addElement("This is a simple sample sentence. ");
201: sentdata
202: .addElement("It does not have to make any sense as you can see. ");
203: sentdata
204: .addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
205: sentdata.addElement("Che la dritta via aveo smarrita. ");
206: generalIteratorTest(sentIterDefault, sentdata);
207: }
208:
209: public void TestDefaultRuleBasedLineIteration() {
210: logln("Testing the RBBI for line iteration using default rules");
211: RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) RuleBasedBreakIterator
212: .getLineInstance();
213: //fetch the rules used to create the above RuleBasedBreakIterator
214: String defaultRules = rbbi.toString();
215: RuleBasedBreakIterator lineIterDefault = null;
216: try {
217: lineIterDefault = new RuleBasedBreakIterator(defaultRules);
218: } catch (IllegalArgumentException iae) {
219: errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()"
220: + iae.toString());
221: }
222:
223: Vector linedata = new Vector();
224: linedata.addElement("Multi-");
225: linedata.addElement("Level ");
226: linedata.addElement("example ");
227: linedata.addElement("of ");
228: linedata.addElement("a ");
229: linedata.addElement("semi-");
230: linedata.addElement("idiotic ");
231: linedata.addElement("non-");
232: linedata.addElement("sensical ");
233: linedata.addElement("(non-");
234: linedata.addElement("important) ");
235: linedata.addElement("sentence. ");
236:
237: linedata.addElement("Hi ");
238: linedata.addElement("Hello ");
239: linedata.addElement("How\n");
240: linedata.addElement("are\r");
241: linedata.addElement("you" + kLineSeparator);
242: linedata.addElement("fine.\t");
243: linedata.addElement("good. ");
244:
245: linedata.addElement("Now\r");
246: linedata.addElement("is\n");
247: linedata.addElement("the\r\n");
248: linedata.addElement("time\n");
249: linedata.addElement("\r");
250: linedata.addElement("for\r");
251: linedata.addElement("\r");
252: linedata.addElement("all");
253:
254: generalIteratorTest(lineIterDefault, linedata);
255:
256: }
257:
258: //=========================================================================
259: // general test subroutines
260: //=========================================================================
261:
262: private void generalIteratorTest(RuleBasedBreakIterator rbbi,
263: Vector expectedResult) {
264: StringBuffer buffer = new StringBuffer();
265: String text;
266: for (int i = 0; i < expectedResult.size(); i++) {
267: text = (String) expectedResult.elementAt(i);
268: buffer.append(text);
269: }
270: text = buffer.toString();
271: if (rbbi == null) {
272: errln("null iterator, test skipped.");
273: return;
274: }
275:
276: rbbi.setText(text);
277:
278: Vector nextResults = _testFirstAndNext(rbbi, text);
279: Vector previousResults = _testLastAndPrevious(rbbi, text);
280:
281: logln("comparing forward and backward...");
282: int errs = getErrorCount();
283: compareFragmentLists("forward iteration", "backward iteration",
284: nextResults, previousResults);
285: if (getErrorCount() == errs) {
286: logln("comparing expected and actual...");
287: compareFragmentLists("expected result", "actual result",
288: expectedResult, nextResults);
289: }
290:
291: int[] boundaries = new int[expectedResult.size() + 3];
292: boundaries[0] = RuleBasedBreakIterator.DONE;
293: boundaries[1] = 0;
294: for (int i = 0; i < expectedResult.size(); i++)
295: boundaries[i + 2] = boundaries[i + 1]
296: + ((String) expectedResult.elementAt(i)).length();
297:
298: boundaries[boundaries.length - 1] = RuleBasedBreakIterator.DONE;
299:
300: _testFollowing(rbbi, text, boundaries);
301: _testPreceding(rbbi, text, boundaries);
302: _testIsBoundary(rbbi, text, boundaries);
303:
304: doMultipleSelectionTest(rbbi, text);
305: }
306:
307: private Vector _testFirstAndNext(RuleBasedBreakIterator rbbi,
308: String text) {
309: int p = rbbi.first();
310: int lastP = p;
311: Vector result = new Vector();
312:
313: if (p != 0)
314: errln("first() returned " + p + " instead of 0");
315: while (p != RuleBasedBreakIterator.DONE) {
316: p = rbbi.next();
317: if (p != RuleBasedBreakIterator.DONE) {
318: if (p <= lastP)
319: errln("next() failed to move forward: next() on position "
320: + lastP + " yielded " + p);
321:
322: result.addElement(text.substring(lastP, p));
323: } else {
324: if (lastP != text.length())
325: errln("next() returned DONE prematurely: offset was "
326: + lastP + " instead of " + text.length());
327: }
328: lastP = p;
329: }
330: return result;
331: }
332:
333: private Vector _testLastAndPrevious(RuleBasedBreakIterator rbbi,
334: String text) {
335: int p = rbbi.last();
336: int lastP = p;
337: Vector result = new Vector();
338:
339: if (p != text.length())
340: errln("last() returned " + p + " instead of "
341: + text.length());
342: while (p != RuleBasedBreakIterator.DONE) {
343: p = rbbi.previous();
344: if (p != RuleBasedBreakIterator.DONE) {
345: if (p >= lastP)
346: errln("previous() failed to move backward: previous() on position "
347: + lastP + " yielded " + p);
348:
349: result.insertElementAt(text.substring(p, lastP), 0);
350: } else {
351: if (lastP != 0)
352: errln("previous() returned DONE prematurely: offset was "
353: + lastP + " instead of 0");
354: }
355: lastP = p;
356: }
357: return result;
358: }
359:
360: private void compareFragmentLists(String f1Name, String f2Name,
361: Vector f1, Vector f2) {
362: int p1 = 0;
363: int p2 = 0;
364: String s1;
365: String s2;
366: int t1 = 0;
367: int t2 = 0;
368:
369: while (p1 < f1.size() && p2 < f2.size()) {
370: s1 = (String) f1.elementAt(p1);
371: s2 = (String) f2.elementAt(p2);
372: t1 += s1.length();
373: t2 += s2.length();
374:
375: if (s1.equals(s2)) {
376: debugLogln(" >" + s1 + "<");
377: ++p1;
378: ++p2;
379: } else {
380: int tempT1 = t1;
381: int tempT2 = t2;
382: int tempP1 = p1;
383: int tempP2 = p2;
384:
385: while (tempT1 != tempT2 && tempP1 < f1.size()
386: && tempP2 < f2.size()) {
387: while (tempT1 < tempT2 && tempP1 < f1.size()) {
388: tempT1 += ((String) f1.elementAt(tempP1))
389: .length();
390: ++tempP1;
391: }
392: while (tempT2 < tempT1 && tempP2 < f2.size()) {
393: tempT2 += ((String) f2.elementAt(tempP2))
394: .length();
395: ++tempP2;
396: }
397: }
398: logln("*** " + f1Name + " has:");
399: while (p1 <= tempP1 && p1 < f1.size()) {
400: s1 = (String) f1.elementAt(p1);
401: t1 += s1.length();
402: debugLogln(" *** >" + s1 + "<");
403: ++p1;
404: }
405: logln("***** " + f2Name + " has:");
406: while (p2 <= tempP2 && p2 < f2.size()) {
407: s2 = (String) f2.elementAt(p2);
408: t2 += s2.length();
409: debugLogln(" ***** >" + s2 + "<");
410: ++p2;
411: }
412: errln("Discrepancy between " + f1Name + " and "
413: + f2Name);
414: }
415: }
416: }
417:
418: private void _testFollowing(RuleBasedBreakIterator rbbi,
419: String text, int[] boundaries) {
420: logln("testFollowing():");
421: int p = 2;
422: for (int i = 0; i <= text.length(); i++) {
423: if (i == boundaries[p])
424: ++p;
425: int b = rbbi.following(i);
426: logln("rbbi.following(" + i + ") -> " + b);
427: if (b != boundaries[p])
428: errln("Wrong result from following() for " + i
429: + ": expected " + boundaries[p] + ", got " + b);
430: }
431: }
432:
433: private void _testPreceding(RuleBasedBreakIterator rbbi,
434: String text, int[] boundaries) {
435: logln("testPreceding():");
436: int p = 0;
437: for (int i = 0; i <= text.length(); i++) {
438: int b = rbbi.preceding(i);
439: logln("rbbi.preceding(" + i + ") -> " + b);
440: if (b != boundaries[p])
441: errln("Wrong result from preceding() for " + i
442: + ": expected " + boundaries[p] + ", got " + b);
443: if (i == boundaries[p + 1])
444: ++p;
445: }
446: }
447:
448: private void _testIsBoundary(RuleBasedBreakIterator rbbi,
449: String text, int[] boundaries) {
450: logln("testIsBoundary():");
451: int p = 1;
452: boolean isB;
453: for (int i = 0; i <= text.length(); i++) {
454: isB = rbbi.isBoundary(i);
455: logln("rbbi.isBoundary(" + i + ") -> " + isB);
456: if (i == boundaries[p]) {
457: if (!isB)
458: errln("Wrong result from isBoundary() for " + i
459: + ": expected true, got false");
460: ++p;
461: } else {
462: if (isB)
463: errln("Wrong result from isBoundary() for " + i
464: + ": expected false, got true");
465: }
466: }
467: }
468:
469: private void doMultipleSelectionTest(
470: RuleBasedBreakIterator iterator, String testText) {
471: logln("Multiple selection test...");
472: RuleBasedBreakIterator testIterator = (RuleBasedBreakIterator) iterator
473: .clone();
474: int offset = iterator.first();
475: int testOffset;
476: int count = 0;
477:
478: do {
479: testOffset = testIterator.first();
480: testOffset = testIterator.next(count);
481: logln("next(" + count + ") -> " + testOffset);
482: if (offset != testOffset)
483: errln("next(n) and next() not returning consistent results: for step "
484: + count
485: + ", next(n) returned "
486: + testOffset
487: + " and next() had " + offset);
488:
489: if (offset != RuleBasedBreakIterator.DONE) {
490: count++;
491: offset = iterator.next();
492: }
493: } while (offset != RuleBasedBreakIterator.DONE);
494:
495: // now do it backwards...
496: offset = iterator.last();
497: count = 0;
498:
499: do {
500: testOffset = testIterator.last();
501: testOffset = testIterator.next(count);
502: logln("next(" + count + ") -> " + testOffset);
503: if (offset != testOffset)
504: errln("next(n) and next() not returning consistent results: for step "
505: + count
506: + ", next(n) returned "
507: + testOffset
508: + " and next() had " + offset);
509:
510: if (offset != RuleBasedBreakIterator.DONE) {
511: count--;
512: offset = iterator.previous();
513: }
514: } while (offset != RuleBasedBreakIterator.DONE);
515: }
516:
517: private void debugLogln(String s) {
518: final String zeros = "0000";
519: String temp;
520: StringBuffer out = new StringBuffer();
521: for (int i = 0; i < s.length(); i++) {
522: char c = s.charAt(i);
523: if (c >= ' ' && c < '\u007f')
524: out.append(c);
525: else {
526: out.append("\\u");
527: temp = Integer.toHexString((int) c);
528: out.append(zeros.substring(0, 4 - temp.length()));
529: out.append(temp);
530: }
531: }
532: logln(out.toString());
533: }
534:
535: }
|