001: /*
002: * Created on May 5, 2004
003: *
004: * Copyright (C) 2004-2006 International Business Machines Corporation and others.
005: * All Rights Reserved.
006: *
007: */
008: package com.ibm.icu.dev.test.rbbi;
009:
010: import com.ibm.icu.dev.test.TestFmwk;
011: import com.ibm.icu.impl.Utility;
012: import com.ibm.icu.text.BreakIterator;
013: import com.ibm.icu.text.RuleBasedBreakIterator;
014: import com.ibm.icu.lang.UCharacter;
015: import com.ibm.icu.text.UTF16;
016: import com.ibm.icu.util.ULocale;
017: import java.io.InputStream;
018: import java.io.InputStreamReader;
019: import java.io.IOException;
020: import java.util.Arrays;
021:
022: /**
023: * Rule based break iterator data driven test.
024: * Perform the tests from the file rbbitst.txt.
025: * The test data file is common to both ICU4C and ICU4J.
026: * See the data file for a description of the tests.
027: *
028: */
029: public class RBBITestExtended extends TestFmwk {
030:
031: public static void main(String[] args) throws Exception {
032: new RBBITestExtended().run(args);
033: }
034:
035: public RBBITestExtended() {
036: }
037:
038: static class TestParams {
039: BreakIterator bi;
040: StringBuffer dataToBreak = new StringBuffer();
041: int[] expectedBreaks = new int[1000];
042: int[] srcLine = new int[1000];
043: int[] srcCol = new int[1000];
044: ULocale currentLocale = new ULocale("en_US");
045: }
046:
047: public void TestExtended() {
048:
049: TestParams tp = new TestParams();
050:
051: //
052: // Open and read the test data file.
053: //
054: InputStreamReader isr = null;
055: StringBuffer testFileBuf = new StringBuffer();
056: try {
057: InputStream is = RBBITestExtended.class
058: .getResourceAsStream("rbbitst.txt");
059: if (is == null) {
060: errln("Could not open test data file rbbitst.txt");
061: return;
062: }
063: isr = new InputStreamReader(is, "UTF-8");
064: int c;
065: int count = 0;
066: for (;;) {
067: c = isr.read();
068: if (c < 0) {
069: break;
070: }
071: count++;
072: if (c == 0xFEFF && count == 1) {
073: // BOM in the test data file. Discard it.
074: continue;
075: }
076:
077: UTF16.append(testFileBuf, c);
078: }
079:
080: } catch (IOException e) {
081: errln(e.toString());
082: return;
083: }
084:
085: String testString = testFileBuf.toString();
086:
087: final int PARSE_COMMENT = 1;
088: final int PARSE_TAG = 2;
089: final int PARSE_DATA = 3;
090: final int PARSE_NUM = 4;
091:
092: int parseState = PARSE_TAG;
093:
094: int savedState = PARSE_TAG;
095:
096: final char CH_LF = 0x0a;
097: final char CH_CR = 0x0d;
098: final char CH_HASH = 0x23;
099: /*static const UChar CH_PERIOD = 0x2e;*/
100: final char CH_LT = 0x3c;
101: final char CH_GT = 0x3e;
102: final char CH_BACKSLASH = 0x5c;
103: final char CH_BULLET = 0x2022;
104:
105: int lineNum = 1;
106: int colStart = 0;
107: int column = 0;
108: int charIdx = 0;
109: int i;
110:
111: int tagValue = 0; // The numeric value of a <nnn> tag.
112: int len = testString.length();
113:
114: for (charIdx = 0; charIdx < len;) {
115: int c = UTF16.charAt(testString, charIdx);
116: charIdx++;
117: if (c == CH_CR && charIdx < len
118: && testString.charAt(charIdx) == CH_LF) {
119: // treat CRLF as a unit
120: c = CH_LF;
121: charIdx++;
122: }
123: if (c == CH_LF || c == CH_CR) {
124: lineNum++;
125: colStart = charIdx;
126: }
127: column = charIdx - colStart + 1;
128:
129: switch (parseState) {
130: case PARSE_COMMENT:
131: if (c == 0x0a || c == 0x0d) {
132: parseState = savedState;
133: }
134: break;
135:
136: case PARSE_TAG: {
137: if (c == CH_HASH) {
138: parseState = PARSE_COMMENT;
139: savedState = PARSE_TAG;
140: break;
141: }
142: if (UCharacter.isWhitespace(c)) {
143: break;
144: }
145: if (testString.startsWith("<word>", charIdx - 1)) {
146: tp.bi = BreakIterator
147: .getWordInstance(tp.currentLocale);
148: charIdx += 5;
149: break;
150: }
151: if (testString.startsWith("<char>", charIdx - 1)) {
152: tp.bi = BreakIterator
153: .getCharacterInstance(tp.currentLocale);
154: charIdx += 5;
155: break;
156: }
157: if (testString.startsWith("<line>", charIdx - 1)) {
158: tp.bi = BreakIterator
159: .getLineInstance(tp.currentLocale);
160: charIdx += 5;
161: break;
162: }
163: if (testString.startsWith("<sent>", charIdx - 1)) {
164: tp.bi = BreakIterator
165: .getSentenceInstance(tp.currentLocale);
166: charIdx += 5;
167: break;
168: }
169: if (testString.startsWith("<title>", charIdx - 1)) {
170: tp.bi = BreakIterator
171: .getTitleInstance(tp.currentLocale);
172: charIdx += 6;
173: break;
174: }
175: if (testString.startsWith("<locale ", charIdx - 1)) {
176: int closeIndex = testString.indexOf(">", charIdx);
177: if (closeIndex < 0) {
178: errln("line" + lineNum
179: + ": missing close on <locale tag.");
180: break;
181: }
182: String localeName = testString.substring(
183: charIdx + 6, closeIndex);
184: localeName = localeName.trim();
185: tp.currentLocale = new ULocale(localeName);
186: charIdx = closeIndex + 1;
187: break;
188: }
189: if (testString.startsWith("<data>", charIdx - 1)) {
190: parseState = PARSE_DATA;
191: charIdx += 5;
192: tp.dataToBreak.setLength(0);
193: Arrays.fill(tp.expectedBreaks, 0);
194: Arrays.fill(tp.srcCol, 0);
195: Arrays.fill(tp.srcLine, 0);
196: break;
197: }
198:
199: errln("line" + lineNum + ": Tag expected in test file.");
200: return;
201: //parseState = PARSE_COMMENT;
202: //savedState = PARSE_DATA;
203: }
204:
205: case PARSE_DATA:
206: if (c == CH_BULLET) {
207: int breakIdx = tp.dataToBreak.length();
208: tp.expectedBreaks[breakIdx] = -1;
209: tp.srcLine[breakIdx] = lineNum;
210: tp.srcCol[breakIdx] = column;
211: break;
212: }
213:
214: if (testString.startsWith("</data>", charIdx - 1)) {
215: // Add final entry to mappings from break location to source file position.
216: // Need one extra because last break position returned is after the
217: // last char in the data, not at the last char.
218: int idx = tp.dataToBreak.length();
219: tp.srcLine[idx] = lineNum;
220: tp.srcCol[idx] = column;
221:
222: parseState = PARSE_TAG;
223: charIdx += 6;
224:
225: // RUN THE TEST!
226: executeTest(tp);
227: break;
228: }
229:
230: if (testString.startsWith("\\N{", charIdx - 1)) {
231: int nameEndIdx = testString.indexOf('}', charIdx);
232: if (nameEndIdx == -1) {
233: errln("Error in named character in test file at line "
234: + lineNum + ", col " + column);
235: }
236: // Named character, e.g. \N{COMBINING GRAVE ACCENT}
237: // Get the code point from the name and insert it into the test data.
238: String charName = testString.substring(charIdx + 2,
239: nameEndIdx);
240: c = UCharacter.getCharFromName(charName);
241: if (c == -1) {
242: errln("Error in named character in test file at line "
243: + lineNum + ", col " + column);
244: } else {
245: // Named code point was recognized. Insert it
246: // into the test data.
247: UTF16.append(tp.dataToBreak, c);
248: for (i = tp.dataToBreak.length() - 1; i >= 0
249: && tp.srcLine[i] == 0; i--) {
250: tp.srcLine[i] = lineNum;
251: tp.srcCol[i] = column;
252: }
253:
254: }
255: if (nameEndIdx > charIdx) {
256: charIdx = nameEndIdx + 1;
257: }
258: break;
259: }
260:
261: if (testString.startsWith("<>", charIdx - 1)) {
262: charIdx++;
263: int breakIdx = tp.dataToBreak.length();
264: tp.expectedBreaks[breakIdx] = -1;
265: tp.srcLine[breakIdx] = lineNum;
266: tp.srcCol[breakIdx] = column;
267: break;
268: }
269:
270: if (c == CH_LT) {
271: tagValue = 0;
272: parseState = PARSE_NUM;
273: break;
274: }
275:
276: if (c == CH_HASH && column == 3) { // TODO: why is column off so far?
277: parseState = PARSE_COMMENT;
278: savedState = PARSE_DATA;
279: break;
280: }
281:
282: if (c == CH_BACKSLASH) {
283: // Check for \ at end of line, a line continuation.
284: // Advance over (discard) the newline
285: int cp = UTF16.charAt(testString, charIdx);
286: if (cp == CH_CR
287: && charIdx < len
288: && UTF16.charAt(testString, charIdx + 1) == CH_LF) {
289: // We have a CR LF
290: // Need an extra increment of the input ptr to move over both of them
291: charIdx++;
292: }
293: if (cp == CH_LF || cp == CH_CR) {
294: lineNum++;
295: column = 0;
296: charIdx++;
297: colStart = charIdx;
298: break;
299: }
300:
301: // Let unescape handle the back slash.
302: int charIdxAr[] = new int[1];
303: charIdxAr[0] = charIdx;
304: cp = Utility.unescapeAt(testString, charIdxAr);
305: if (cp != -1) {
306: // Escape sequence was recognized. Insert the char
307: // into the test data.
308: charIdx = charIdxAr[0];
309: UTF16.append(tp.dataToBreak, cp);
310: for (i = tp.dataToBreak.length() - 1; i >= 0
311: && tp.srcLine[i] == 0; i--) {
312: tp.srcLine[i] = lineNum;
313: tp.srcCol[i] = column;
314: }
315:
316: break;
317: }
318:
319: // Not a recognized backslash escape sequence.
320: // Take the next char as a literal.
321: // TODO: Should this be an error?
322: c = UTF16.charAt(testString, charIdx);
323: charIdx = UTF16.moveCodePointOffset(testString,
324: charIdx, 1);
325: }
326:
327: // Normal, non-escaped data char.
328: UTF16.append(tp.dataToBreak, c);
329:
330: // Save the mapping from offset in the data to line/column numbers in
331: // the original input file. Will be used for better error messages only.
332: // If there's an expected break before this char, the slot in the mapping
333: // vector will already be set for this char; don't overwrite it.
334: for (i = tp.dataToBreak.length() - 1; i >= 0
335: && tp.srcLine[i] == 0; i--) {
336: tp.srcLine[i] = lineNum;
337: tp.srcCol[i] = column;
338: }
339: break;
340:
341: case PARSE_NUM:
342: // We are parsing an expected numeric tag value, like <1234>,
343: // within a chunk of data.
344: if (UCharacter.isWhitespace(c)) {
345: break;
346: }
347:
348: if (c == CH_GT) {
349: // Finished the number. Add the info to the expected break data,
350: // and switch parse state back to doing plain data.
351: parseState = PARSE_DATA;
352: if (tagValue == 0) {
353: tagValue = -1;
354: }
355: int breakIdx = tp.dataToBreak.length();
356: tp.expectedBreaks[breakIdx] = tagValue;
357: tp.srcLine[breakIdx] = lineNum;
358: tp.srcCol[breakIdx] = column;
359: break;
360: }
361:
362: if (UCharacter.isDigit(c)) {
363: tagValue = tagValue * 10 + UCharacter.digit(c);
364: break;
365: }
366:
367: errln("Syntax Error in test file at line " + lineNum
368: + ", col %d" + column);
369: return;
370:
371: // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
372: // break;
373: }
374:
375: }
376: }
377:
378: void executeTest(TestParams t) {
379: int bp;
380: int prevBP;
381: int i;
382:
383: if (t.bi == null) {
384: return;
385: }
386:
387: t.bi.setText(t.dataToBreak.toString());
388: //
389: // Run the iterator forward
390: //
391: prevBP = -1;
392: for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi
393: .next()) {
394: if (prevBP == bp) {
395: // Fail for lack of forward progress.
396: errln("Forward Iteration, no forward progress. Break Pos="
397: + bp
398: + " File line,col="
399: + t.srcLine[bp]
400: + ", " + t.srcCol[bp]);
401: break;
402: }
403:
404: // Check that there were we didn't miss an expected break between the last one
405: // and this one.
406: for (i = prevBP + 1; i < bp; i++) {
407: if (t.expectedBreaks[i] != 0) {
408: errln("Forward Iteration, break expected, but not found. Pos="
409: + i
410: + " File line,col= "
411: + t.srcLine[i]
412: + ", " + t.srcCol[i]);
413: }
414: }
415:
416: // Check that the break we did find was expected
417: if (t.expectedBreaks[bp] == 0) {
418: errln("Forward Iteration, break found, but not expected. Pos="
419: + bp
420: + " File line,col= "
421: + t.srcLine[bp]
422: + ", " + t.srcCol[bp]);
423: } else {
424: // The break was expected.
425: // Check that the {nnn} tag value is correct.
426: int expectedTagVal = t.expectedBreaks[bp];
427: if (expectedTagVal == -1) {
428: expectedTagVal = 0;
429: }
430: int line = t.srcLine[bp];
431: int rs = ((RuleBasedBreakIterator) t.bi)
432: .getRuleStatus();
433: if (rs != expectedTagVal) {
434: errln("Incorrect status for forward break. Pos = "
435: + bp + ". File line,col = " + line + ", "
436: + t.srcCol[bp] + "\n"
437: + " Actual, Expected status = "
438: + rs + ", " + expectedTagVal);
439: }
440: }
441:
442: prevBP = bp;
443: }
444:
445: // Verify that there were no missed expected breaks after the last one found
446: for (i = prevBP + 1; i < t.dataToBreak.length() + 1; i++) {
447: if (t.expectedBreaks[i] != 0) {
448: errln("Forward Iteration, break expected, but not found. Pos="
449: + i
450: + " File line,col= "
451: + t.srcLine[i]
452: + ", "
453: + t.srcCol[i]);
454: }
455: }
456:
457: //
458: // Run the iterator backwards, verify that the same breaks are found.
459: //
460: prevBP = t.dataToBreak.length() + 2; // start with a phony value for the last break pos seen.
461: for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi
462: .previous()) {
463: if (prevBP == bp) {
464: // Fail for lack of progress.
465: errln("Reverse Iteration, no progress. Break Pos="
466: + bp + "File line,col=" + t.srcLine[bp] + " "
467: + t.srcCol[bp]);
468: break;
469: }
470:
471: // Check that there were we didn't miss an expected break between the last one
472: // and this one. (UVector returns zeros for index out of bounds.)
473: for (i = prevBP - 1; i > bp; i--) {
474: if (t.expectedBreaks[i] != 0) {
475: errln("Reverse Itertion, break expected, but not found. Pos="
476: + i
477: + " File line,col= "
478: + t.srcLine[i]
479: + ", " + t.srcCol[i]);
480: }
481: }
482:
483: // Check that the break we did find was expected
484: if (t.expectedBreaks[bp] == 0) {
485: errln("Reverse Itertion, break found, but not expected. Pos="
486: + bp
487: + " File line,col= "
488: + t.srcLine[bp]
489: + ", " + t.srcCol[bp]);
490: } else {
491: // The break was expected.
492: // Check that the {nnn} tag value is correct.
493: int expectedTagVal = t.expectedBreaks[bp];
494: if (expectedTagVal == -1) {
495: expectedTagVal = 0;
496: }
497: int line = t.srcLine[bp];
498: int rs = ((RuleBasedBreakIterator) t.bi)
499: .getRuleStatus();
500: if (rs != expectedTagVal) {
501: errln("Incorrect status for reverse break. Pos= "
502: + bp + "File line,col= " + line + ", "
503: + t.srcCol[bp] + "\n"
504: + " Actual, Expected status = "
505: + rs + ", " + expectedTagVal);
506: }
507: }
508:
509: prevBP = bp;
510: }
511:
512: // Verify that there were no missed breaks prior to the last one found
513: for (i = prevBP - 1; i >= 0; i--) {
514: if (t.expectedBreaks[i] != 0) {
515: errln("Forward Itertion, break expected, but not found. Pos="
516: + i
517: + " File line,col= "
518: + t.srcLine[i]
519: + ", "
520: + t.srcCol[i]);
521: }
522: }
523: }
524:
525: }
|