0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 2003-2006 International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */
0007: package com.ibm.icu.dev.test.rbbi;
0008:
0009: // Monkey testing of RuleBasedBreakIterator
0010: import com.ibm.icu.dev.test.*;
0011: import com.ibm.icu.text.BreakIterator;
0012: import com.ibm.icu.text.RuleBasedBreakIterator;
0013: import com.ibm.icu.text.UTF16;
0014: import com.ibm.icu.text.UnicodeSet;
0015: import com.ibm.icu.lang.UCharacter;
0016: import com.ibm.icu.lang.UProperty;
0017: import java.util.List;
0018: import java.util.Arrays;
0019: import java.util.ArrayList;
0020: import java.util.Locale;
0021:
0022: /**
0023: * Monkey tests for RBBI. These tests have independent implementations of
0024: * the Unicode TR boundary rules, and compare results between these and ICU's
0025: * implementation, using random data.
0026: *
0027: * Tests cover Grapheme Cluster (char), Word and Line breaks
0028: *
0029: * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
0030: *
0031: */
0032: public class RBBITestMonkey extends TestFmwk {
0033:
0034: public static void main(String[] args) {
0035: new RBBITestMonkey().run(args);
0036: }
0037:
//
// class RBBIMonkeyKind
//
// Monkey Test for Break Iteration
// Abstract interface class. Concrete derived classes independently
// implement the break rules for different iterator types.
//
// The Monkey Test itself doesn't know which type of break iterator it is
// testing, but works purely in terms of the interface defined here.
//
abstract static class RBBIMonkeyKind {

    // Return a List of UnicodeSets, representing the character classes used
    // for this type of iterator.
    abstract List charClasses();

    // Set the test text on which subsequent calls to next() will operate.
    abstract void setText(StringBuffer text);

    // Find the next break position, starting from the specified position.
    // Return -1 after reaching end of string.
    abstract int next(int i);
}
0061:
0062: /**
0063: * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
0064: */
0065: static class RBBICharMonkey extends RBBIMonkeyKind {
0066: List fSets;
0067:
0068: UnicodeSet fCRLFSet;
0069: UnicodeSet fControlSet;
0070: UnicodeSet fExtendSet;
0071: UnicodeSet fHangulSet;
0072: UnicodeSet fAnySet;
0073:
0074: StringBuffer fText;
0075:
0076: RBBICharMonkey() {
0077: fText = null;
0078: fCRLFSet = new UnicodeSet("[\\r\\n]");
0079: fControlSet = new UnicodeSet(
0080: "[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]");
0081: fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]");
0082: fHangulSet = new UnicodeSet(
0083: "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
0084: + "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]");
0085: fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
0086:
0087: fSets = new ArrayList();
0088: fSets.add(fCRLFSet);
0089: fSets.add(fControlSet);
0090: fSets.add(fExtendSet);
0091: fSets.add(fHangulSet);
0092: fSets.add(fAnySet);
0093: }
0094:
0095: void setText(StringBuffer s) {
0096: fText = s;
0097: }
0098:
0099: List charClasses() {
0100: return fSets;
0101: }
0102:
0103: int next(int i) {
0104: return nextGC(fText, i);
0105: }
0106: }
0107:
0108: /**
0109: *
0110: * Word Monkey Test Class
0111: *
0112: *
0113: *
0114: */
0115: static class RBBIWordMonkey extends RBBIMonkeyKind {
0116: List fSets;
0117: StringBuffer fText;
0118:
0119: UnicodeSet fKatakanaSet;
0120: UnicodeSet fALetterSet;
0121: UnicodeSet fMidLetterSet;
0122: UnicodeSet fMidNumSet;
0123: UnicodeSet fNumericSet;
0124: UnicodeSet fFormatSet;
0125: UnicodeSet fExtendSet;
0126: UnicodeSet fExtendNumLetSet;
0127: UnicodeSet fOtherSet;
0128:
0129: RBBIWordMonkey() {
0130: fSets = new ArrayList();
0131:
0132: fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
0133: + "[\\p{Line_Break = Complex_Context}"
0134: + "-\\p{Grapheme_Cluster_Break = Extend}"
0135: + "-\\p{Grapheme_Cluster_Break = Control}]]");
0136: fKatakanaSet = new UnicodeSet(
0137: "[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]");
0138: fMidLetterSet = new UnicodeSet(
0139: "[\\p{Word_Break = MidLetter}]");
0140: fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
0141: fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
0142: fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
0143: fExtendNumLetSet = new UnicodeSet(
0144: "[\\p{Word_Break = ExtendNumLet}]");
0145: fExtendSet = new UnicodeSet(
0146: "[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]");
0147: fOtherSet = new UnicodeSet();
0148:
0149: fOtherSet.complement();
0150: fOtherSet.removeAll(fALetterSet);
0151: fOtherSet.removeAll(fKatakanaSet);
0152: fOtherSet.removeAll(fMidLetterSet);
0153: fOtherSet.removeAll(fMidNumSet);
0154: fOtherSet.removeAll(fNumericSet);
0155: fOtherSet.removeAll(fFormatSet);
0156: fOtherSet.removeAll(fExtendSet);
0157: fOtherSet.removeAll(fExtendNumLetSet);
0158:
0159: fSets.add(fALetterSet);
0160: fSets.add(fKatakanaSet);
0161: fSets.add(fMidLetterSet);
0162: fSets.add(fMidNumSet);
0163: fSets.add(fNumericSet);
0164: fSets.add(fFormatSet);
0165: fSets.add(fExtendSet);
0166: fSets.add(fExtendNumLetSet);
0167: fSets.add(fOtherSet);
0168: }
0169:
0170: List charClasses() {
0171: return fSets;
0172: }
0173:
0174: void setText(StringBuffer s) {
0175: fText = s;
0176: }
0177:
0178: int next(int prevPos) {
0179: int p0, p1, p2, p3; // Indices of the significant code points around the
0180: // break position being tested. The candidate break
0181: // location is before p2.
0182: int breakPos = -1;
0183:
0184: int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
0185:
0186: // Prev break at end of string. return DONE.
0187: if (prevPos >= fText.length()) {
0188: return -1;
0189: }
0190: p0 = p1 = p2 = p3 = prevPos;
0191: c3 = UTF16.charAt(fText, prevPos);
0192: c0 = c1 = c2 = 0;
0193:
0194: // Loop runs once per "significant" character position in the input text.
0195: for (;;) {
0196: // Move all of the positions forward in the input string.
0197: p0 = p1;
0198: c0 = c1;
0199: p1 = p2;
0200: c1 = c2;
0201: p2 = p3;
0202: c2 = c3;
0203:
0204: // Advancd p3 by X(Extend | Format)* Rule 4
0205: do {
0206: p3 = moveIndex32(fText, p3, 1);
0207: c3 = -1;
0208: if (p3 >= fText.length()) {
0209: break;
0210: }
0211: c3 = UTF16.charAt(fText, p3);
0212: } while (setContains(fFormatSet, c3)
0213: || setContains(fExtendSet, c3));
0214:
0215: if (p1 == p2) {
0216: // Still warming up the loop. (won't work with zero length strings, but we don't care)
0217: continue;
0218: }
0219: if (p2 == fText.length()) {
0220: // Reached end of string. Always a break position.
0221: break;
0222: }
0223:
0224: // Rule (3) CR x LF
0225: // No Extend or Format characters may appear between the CR and LF,
0226: // which requires the additional check for p2 immediately following p1.
0227: //
0228: if (c1 == 0x0D && c2 == 0x0A && p1 == (p2 - 1)) {
0229: continue;
0230: }
0231:
0232: // Rule (5). ALetter x ALetter
0233: if (fALetterSet.contains(c1)
0234: && fALetterSet.contains(c2)) {
0235: continue;
0236: }
0237:
0238: // Rule (6) ALetter x MidLetter ALetter
0239: //
0240: if (fALetterSet.contains(c1)
0241: && fMidLetterSet.contains(c2)
0242: && setContains(fALetterSet, c3)) {
0243: continue;
0244: }
0245:
0246: // Rule (7) ALetter MidLetter x ALetter
0247: if (fALetterSet.contains(c0)
0248: && fMidLetterSet.contains(c1)
0249: && fALetterSet.contains(c2)) {
0250: continue;
0251: }
0252:
0253: // Rule (8) Numeric x Numeric
0254: if (fNumericSet.contains(c1)
0255: && fNumericSet.contains(c2)) {
0256: continue;
0257: }
0258:
0259: // Rule (9) ALetter x Numeric
0260: if (fALetterSet.contains(c1)
0261: && fNumericSet.contains(c2)) {
0262: continue;
0263: }
0264:
0265: // Rule (10) Numeric x ALetter
0266: if (fNumericSet.contains(c1)
0267: && fALetterSet.contains(c2)) {
0268: continue;
0269: }
0270:
0271: // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
0272: if (fNumericSet.contains(c0) && fMidNumSet.contains(c1)
0273: && fNumericSet.contains(c2)) {
0274: continue;
0275: }
0276:
0277: // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
0278: if (fNumericSet.contains(c1) && fMidNumSet.contains(c2)
0279: && setContains(fNumericSet, c3)) {
0280: continue;
0281: }
0282:
0283: // Rule (13) Katakana x Katakana
0284: if (fKatakanaSet.contains(c1)
0285: && fKatakanaSet.contains(c2)) {
0286: continue;
0287: }
0288:
0289: // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
0290: if ((fALetterSet.contains(c1)
0291: || fNumericSet.contains(c1)
0292: || fKatakanaSet.contains(c1) || fExtendNumLetSet
0293: .contains(c1))
0294: && fExtendNumLetSet.contains(c2)) {
0295: continue;
0296: }
0297: // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
0298: if (fExtendNumLetSet.contains(c1)
0299: && (fALetterSet.contains(c2)
0300: || fNumericSet.contains(c2)
0301: || fKatakanaSet.contains(c2) || fExtendNumLetSet
0302: .contains(c2))) {
0303: continue;
0304: }
0305:
0306: // Rule 14. Break found here.
0307: break;
0308: }
0309:
0310: breakPos = p2;
0311: return breakPos;
0312: }
0313:
0314: }
0315:
0316: static class RBBILineMonkey extends RBBIMonkeyKind {
0317:
0318: List fSets;
0319:
0320: UnicodeSet fBK;
0321: UnicodeSet fCR;
0322: UnicodeSet fLF;
0323: UnicodeSet fCM;
0324: UnicodeSet fNL;
0325: UnicodeSet fSG;
0326: UnicodeSet fWJ;
0327: UnicodeSet fZW;
0328: UnicodeSet fGL;
0329: UnicodeSet fCB;
0330: UnicodeSet fSP;
0331: UnicodeSet fB2;
0332: UnicodeSet fBA;
0333: UnicodeSet fBB;
0334: UnicodeSet fHY;
0335: UnicodeSet fCL;
0336: UnicodeSet fEX;
0337: UnicodeSet fIN;
0338: UnicodeSet fNS;
0339: UnicodeSet fOP;
0340: UnicodeSet fQU;
0341: UnicodeSet fIS;
0342: UnicodeSet fNU;
0343: UnicodeSet fPO;
0344: UnicodeSet fPR;
0345: UnicodeSet fSY;
0346: UnicodeSet fAI;
0347: UnicodeSet fAL;
0348: UnicodeSet fID;
0349: UnicodeSet fSA;
0350: UnicodeSet fJL;
0351: UnicodeSet fJV;
0352: UnicodeSet fJT;
0353: UnicodeSet fH2;
0354: UnicodeSet fH3;
0355: UnicodeSet fXX;
0356:
0357: StringBuffer fText;
0358: int fOrigPositions;
0359:
0360: RBBILineMonkey() {
0361: fSets = new ArrayList();
0362:
0363: fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
0364: fCR = new UnicodeSet("[\\p{Line_break=CR}]");
0365: fLF = new UnicodeSet("[\\p{Line_break=LF}]");
0366: fCM = new UnicodeSet("[\\p{Line_break=CM}]");
0367: fNL = new UnicodeSet("[\\p{Line_break=NL}]");
0368: fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
0369: fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
0370: fGL = new UnicodeSet("[\\p{Line_break=GL}]");
0371: fCB = new UnicodeSet("[\\p{Line_break=CB}]");
0372: fSP = new UnicodeSet("[\\p{Line_break=SP}]");
0373: fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
0374: fBA = new UnicodeSet("[\\p{Line_break=BA}]");
0375: fBB = new UnicodeSet("[\\p{Line_break=BB}]");
0376: fHY = new UnicodeSet("[\\p{Line_break=HY}]");
0377: fCL = new UnicodeSet("[\\p{Line_break=CL}]");
0378: fEX = new UnicodeSet("[\\p{Line_break=EX}]");
0379: fIN = new UnicodeSet("[\\p{Line_break=IN}]");
0380: fNS = new UnicodeSet("[\\p{Line_break=NS}]");
0381: fOP = new UnicodeSet("[\\p{Line_break=OP}]");
0382: fQU = new UnicodeSet("[\\p{Line_break=QU}]");
0383: fIS = new UnicodeSet("[\\p{Line_break=IS}]");
0384: fNU = new UnicodeSet("[\\p{Line_break=NU}]");
0385: fPO = new UnicodeSet("[\\p{Line_break=PO}]");
0386: fPR = new UnicodeSet("[\\p{Line_break=PR}]");
0387: fSY = new UnicodeSet("[\\p{Line_break=SY}]");
0388: fAI = new UnicodeSet("[\\p{Line_break=AI}]");
0389: fAL = new UnicodeSet("[\\p{Line_break=AL}]");
0390: fID = new UnicodeSet("[\\p{Line_break=ID}]");
0391: fSA = new UnicodeSet("[\\p{Line_break=SA}]");
0392: fJL = new UnicodeSet("[\\p{Line_break=JL}]");
0393: fJV = new UnicodeSet("[\\p{Line_break=JV}]");
0394: fJT = new UnicodeSet("[\\p{Line_break=JT}]");
0395: fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
0396: fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
0397: fSG = new UnicodeSet("[\\ud800-\\udfff]");
0398: fXX = new UnicodeSet("[\\p{Line_break=XX}]");
0399:
0400: fAL.addAll(fXX); // Default behavior for XX is identical to AL
0401: fAL.addAll(fAI); // Default behavior for AI is identical to AL
0402: fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
0403: fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
0404:
0405: fSets.add(fBK);
0406: fSets.add(fCR);
0407: fSets.add(fLF);
0408: fSets.add(fCM);
0409: fSets.add(fNL);
0410: fSets.add(fWJ);
0411: fSets.add(fZW);
0412: fSets.add(fGL);
0413: fSets.add(fCB);
0414: fSets.add(fSP);
0415: fSets.add(fB2);
0416: fSets.add(fBA);
0417: fSets.add(fBB);
0418: fSets.add(fHY);
0419: fSets.add(fH2);
0420: fSets.add(fH3);
0421: fSets.add(fCL);
0422: fSets.add(fEX);
0423: fSets.add(fIN);
0424: fSets.add(fJL);
0425: fSets.add(fJT);
0426: fSets.add(fJV);
0427: fSets.add(fNS);
0428: fSets.add(fOP);
0429: fSets.add(fQU);
0430: fSets.add(fIS);
0431: fSets.add(fNU);
0432: fSets.add(fPO);
0433: fSets.add(fPR);
0434: fSets.add(fSY);
0435: fSets.add(fAI);
0436: fSets.add(fAL);
0437: fSets.add(fID);
0438: fSets.add(fWJ);
0439: fSets.add(fSA);
0440: fSets.add(fSG);
0441:
0442: }
0443:
0444: void setText(StringBuffer s) {
0445: fText = s;
0446: }
0447:
0448: int next(int startPos) {
0449: int pos; // Index of the char following a potential break position
0450: int this Char; // Character at above position "pos"
0451:
0452: int prevPos; // Index of the char preceding a potential break position
0453: int prevChar; // Character at above position. Note that prevChar
0454: // and thisChar may not be adjacent because combining
0455: // characters between them will be ignored.
0456:
0457: int nextPos; // Index of the next character following pos.
0458: // Usually skips over combining marks.
0459: int tPos; // temp value.
0460: int c;
0461: int matchVals[] = null; // Number Expression Match Results
0462:
0463: if (startPos >= fText.length()) {
0464: return -1;
0465: }
0466:
0467: // Initial values for loop. Loop will run the first time without finding breaks,
0468: // while the invalid values shift out and the "this" and
0469: // "prev" positions are filled in with good values.
0470: pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
0471: this Char = prevChar = 0;
0472: nextPos = startPos;
0473:
0474: // Loop runs once per position in the test text, until a break position
0475: // is found. In each iteration, we are testing for a possible break
0476: // just preceding the character at index "pos". The character preceding
0477: // this char is at postion "prevPos"; because of combining sequences,
0478: // "prevPos" can be arbitrarily far before "pos".
0479: for (;;) {
0480: // Advance to the next position to be tested.
0481: prevPos = pos;
0482: prevChar = this Char;
0483: pos = nextPos;
0484: nextPos = moveIndex32(fText, pos, 1);
0485:
0486: // Rule LB2 - Break at end of text.
0487: if (pos >= fText.length()) {
0488: break;
0489: }
0490:
0491: // Rule LB 9 - adjust for combining sequences.
0492: // We do this rule out-of-order because the adjustment does
0493: // not effect the way that rules LB 3 through LB 6 match,
0494: // and doing it here rather than after LB 6 is substantially
0495: // simpler when combining sequences do occur.
0496:
0497: // LB 9 Keep combining sequences together.
0498: // advance over any CM class chars at "pos",
0499: // result is "nextPos" for the following loop iteration.
0500: this Char = UTF16.charAt(fText, pos);
0501: if (!(fSP.contains(this Char) || fBK.contains(this Char)
0502: || this Char == 0x0d || this Char == 0x0a
0503: || fNL.contains(this Char) || fZW
0504: .contains(this Char))) {
0505: for (;;) {
0506: if (nextPos == fText.length()) {
0507: break;
0508: }
0509: int nextChar = UTF16.charAt(fText, nextPos);
0510: if (!fCM.contains(nextChar)) {
0511: break;
0512: }
0513: nextPos = moveIndex32(fText, nextPos, 1);
0514: }
0515: }
0516:
0517: // LB 9 Treat X CM* as if it were X
0518: // No explicit action required.
0519:
0520: // LB 10 Treat any remaining combining mark as AL
0521: if (fCM.contains(this Char)) {
0522: this Char = 'A';
0523: }
0524:
0525: // If the loop is still warming up - if we haven't shifted the initial
0526: // -1 positions out of prevPos yet - loop back to advance the
0527: // position in the input without any further looking for breaks.
0528: if (prevPos == -1) {
0529: continue;
0530: }
0531:
0532: // LB 4 Always break after hard line breaks,
0533: if (fBK.contains(prevChar)) {
0534: break;
0535: }
0536:
0537: // LB 5 Break after CR, LF, NL, but not inside CR LF
0538: if (fCR.contains(prevChar) && fLF.contains(this Char)) {
0539: continue;
0540: }
0541: if (fCR.contains(prevChar) || fLF.contains(prevChar)
0542: || fNL.contains(prevChar)) {
0543: break;
0544: }
0545:
0546: // LB 6 Don't break before hard line breaks
0547: if (fBK.contains(this Char) || fCR.contains(this Char)
0548: || fLF.contains(this Char)
0549: || fNL.contains(this Char)) {
0550: continue;
0551: }
0552:
0553: // LB 7 Don't break before spaces or zero-width space.
0554: if (fSP.contains(this Char)) {
0555: continue;
0556: }
0557:
0558: if (fZW.contains(this Char)) {
0559: continue;
0560: }
0561:
0562: // LB 8 Break after zero width space
0563: if (fZW.contains(prevChar)) {
0564: break;
0565: }
0566:
0567: // LB 9, 10 Already done, at top of loop.
0568: //
0569:
0570: // LB 11
0571: // x WJ
0572: // WJ x
0573: if (fWJ.contains(this Char) || fWJ.contains(prevChar)) {
0574: continue;
0575: }
0576:
0577: // LB 12
0578: // (!SP) x GL
0579: // GL x
0580: if ((!fSP.contains(prevChar)) && fGL.contains(this Char)
0581: || fGL.contains(prevChar)) {
0582: continue;
0583: }
0584:
0585: // LB 13 Don't break before closings.
0586: // NU x CL and NU x IS are not matched here so that they will
0587: // fall into LB 17 and the more general number regular expression.
0588: //
0589: if (!fNU.contains(prevChar) && fCL.contains(this Char)
0590: || fEX.contains(this Char)
0591: || !fNU.contains(prevChar)
0592: && fIS.contains(this Char)
0593: || !fNU.contains(prevChar)
0594: && fSY.contains(this Char)) {
0595: continue;
0596: }
0597:
0598: // LB 14 Don't break after OP SP*
0599: // Scan backwards, checking for this sequence.
0600: // The OP char could include combining marks, so we acually check for
0601: // OP CM* SP* x
0602: tPos = prevPos;
0603: if (fSP.contains(prevChar)) {
0604: while (tPos > 0
0605: && fSP.contains(UTF16.charAt(fText, tPos))) {
0606: tPos = moveIndex32(fText, tPos, -1);
0607: }
0608: }
0609: while (tPos > 0
0610: && fCM.contains(UTF16.charAt(fText, tPos))) {
0611: tPos = moveIndex32(fText, tPos, -1);
0612: }
0613: if (fOP.contains(UTF16.charAt(fText, tPos))) {
0614: continue;
0615: }
0616:
0617: // LB 15 Do not break withing "[
0618: // QU CM* SP* x OP
0619: if (fOP.contains(this Char)) {
0620: // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
0621: tPos = prevPos;
0622: while (tPos > 0
0623: && fSP.contains(UTF16.charAt(fText, tPos))) {
0624: tPos = moveIndex32(fText, tPos, -1);
0625: }
0626: while (tPos > 0
0627: && fCM.contains(UTF16.charAt(fText, tPos))) {
0628: tPos = moveIndex32(fText, tPos, -1);
0629: }
0630: if (fQU.contains(UTF16.charAt(fText, tPos))) {
0631: continue;
0632: }
0633: }
0634:
0635: // LB 16 CL SP* x NS
0636: if (fNS.contains(this Char)) {
0637: tPos = prevPos;
0638: while (tPos > 0
0639: && fSP.contains(UTF16.charAt(fText, tPos))) {
0640: tPos = moveIndex32(fText, tPos, -1);
0641: }
0642: while (tPos > 0
0643: && fCM.contains(UTF16.charAt(fText, tPos))) {
0644: tPos = moveIndex32(fText, tPos, -1);
0645: }
0646: if (fCL.contains(UTF16.charAt(fText, tPos))) {
0647: continue;
0648: }
0649: }
0650:
0651: // LB 17 B2 SP* x B2
0652: if (fB2.contains(this Char)) {
0653: tPos = prevPos;
0654: while (tPos > 0
0655: && fSP.contains(UTF16.charAt(fText, tPos))) {
0656: tPos = moveIndex32(fText, tPos, -1);
0657: }
0658: while (tPos > 0
0659: && fCM.contains(UTF16.charAt(fText, tPos))) {
0660: tPos = moveIndex32(fText, tPos, -1);
0661: }
0662: if (fB2.contains(UTF16.charAt(fText, tPos))) {
0663: continue;
0664: }
0665: }
0666:
0667: // LB 18 break after space
0668: if (fSP.contains(prevChar)) {
0669: break;
0670: }
0671:
0672: // LB 19
0673: // x QU
0674: // QU x
0675: if (fQU.contains(this Char) || fQU.contains(prevChar)) {
0676: continue;
0677: }
0678:
0679: // LB 20 Break around a CB
0680: if (fCB.contains(this Char) || fCB.contains(prevChar)) {
0681: break;
0682: }
0683:
0684: // LB 21
0685: if (fBA.contains(this Char) || fHY.contains(this Char)
0686: || fNS.contains(this Char)
0687: || fBB.contains(prevChar)) {
0688: continue;
0689: }
0690:
0691: // LB 22
0692: if (fAL.contains(prevChar) && fIN.contains(this Char)
0693: || fID.contains(prevChar)
0694: && fIN.contains(this Char)
0695: || fIN.contains(prevChar)
0696: && fIN.contains(this Char)
0697: || fNU.contains(prevChar)
0698: && fIN.contains(this Char)) {
0699: continue;
0700: }
0701:
0702: // LB 23 ID x PO (Note: Leading CM behaves like ID)
0703: // AL x NU
0704: // NU x AL
0705: if (fID.contains(prevChar) && fPO.contains(this Char)
0706: || fAL.contains(prevChar)
0707: && fNU.contains(this Char)
0708: || fNU.contains(prevChar)
0709: && fAL.contains(this Char)) {
0710: continue;
0711: }
0712:
0713: // LB 24 Do not break between prefix and letters or ideographs.
0714: // PR x ID
0715: // PR x AL
0716: // PO x AL
0717: if (fPR.contains(prevChar) && fID.contains(this Char)
0718: || fPR.contains(prevChar)
0719: && fAL.contains(this Char)
0720: || fPO.contains(prevChar)
0721: && fAL.contains(this Char)) {
0722: continue;
0723: }
0724:
0725: // LB 25 Numbers
0726: matchVals = LBNumberCheck(fText, prevPos, matchVals);
0727: if (matchVals[0] != -1) {
0728: // Matched a number. But could have been just a single digit, which would
0729: // not represent a "no break here" between prevChar and thisChar
0730: int numEndIdx = matchVals[1]; // idx of first char following num
0731: if (numEndIdx > pos) {
0732: // Number match includes at least the two chars being checked
0733: if (numEndIdx > nextPos) {
0734: // Number match includes additional chars. Update pos and nextPos
0735: // so that next loop iteration will continue at the end of the number,
0736: // checking for breaks between last char in number & whatever follows.
0737: nextPos = numEndIdx;
0738: pos = numEndIdx;
0739: do {
0740: pos = moveIndex32(fText, pos, -1);
0741: this Char = UTF16.charAt(fText, pos);
0742: } while (fCM.contains(this Char));
0743: }
0744: continue;
0745: }
0746: }
0747:
0748: // LB 26 Do not break Korean Syllables
0749: if (fJL.contains(prevChar)
0750: && (fJL.contains(this Char)
0751: || fJV.contains(this Char)
0752: || fH2.contains(this Char) || fH3
0753: .contains(this Char))) {
0754: continue;
0755: }
0756:
0757: if ((fJV.contains(prevChar) || fH2.contains(prevChar))
0758: && (fJV.contains(this Char) || fJT
0759: .contains(this Char))) {
0760: continue;
0761: }
0762:
0763: if ((fJT.contains(prevChar) || fH3.contains(prevChar))
0764: && fJT.contains(this Char)) {
0765: continue;
0766: }
0767:
0768: // LB 27 Treat a Korean Syllable Block the same as ID
0769: if ((fJL.contains(prevChar) || fJV.contains(prevChar)
0770: || fJT.contains(prevChar)
0771: || fH2.contains(prevChar) || fH3
0772: .contains(prevChar))
0773: && fIN.contains(this Char)) {
0774: continue;
0775: }
0776: if ((fJL.contains(prevChar) || fJV.contains(prevChar)
0777: || fJT.contains(prevChar)
0778: || fH2.contains(prevChar) || fH3
0779: .contains(prevChar))
0780: && fPO.contains(this Char)) {
0781: continue;
0782: }
0783: if (fPR.contains(prevChar)
0784: && (fJL.contains(this Char)
0785: || fJV.contains(this Char)
0786: || fJT.contains(this Char)
0787: || fH2.contains(this Char) || fH3
0788: .contains(this Char))) {
0789: continue;
0790: }
0791:
0792: // LB 28 Do not break between alphabetics
0793: if (fAL.contains(prevChar) && fAL.contains(this Char)) {
0794: continue;
0795: }
0796:
0797: // LB 29 Do not break between numeric punctuation and alphabetics
0798: if (fIS.contains(prevChar) && fAL.contains(this Char)) {
0799: continue;
0800: }
0801:
0802: // LB 30 Do not break between letters, numbers or oridnary symbols and
0803: // opening or closing punctuation.
0804: // (AL | NU) x OP
0805: // CL x (AL | NU)
0806: if ((fAL.contains(prevChar) || fNU.contains(prevChar))
0807: && fOP.contains(this Char)) {
0808: continue;
0809: }
0810: if (fCL.contains(prevChar)
0811: && (fAL.contains(this Char) || fNU
0812: .contains(this Char))) {
0813: continue;
0814: }
0815:
0816: // LB 31 Break everywhere else
0817: break;
0818: }
0819:
0820: return pos;
0821: }
0822:
0823: // Match the following regular expression in the input text.
0824: // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)?
0825: // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
0826: // retVals array [0] index of the start of the match, or -1 if no match
0827: // [1] index of first char following the match.
0828: // Can not use Java regex because need supplementary character support,
0829: // and because Unicode char properties version must be the same as in
0830: // the version of ICU being tested.
0831: private int[] LBNumberCheck(StringBuffer s, int startIdx,
0832: int[] retVals) {
0833: if (retVals == null) {
0834: retVals = new int[2];
0835: }
0836: retVals[0] = -1; // Indicates no match.
0837: int matchState = 0;
0838: int idx = startIdx;
0839:
0840: matchLoop: for (idx = startIdx; idx < s.length(); idx = moveIndex32(
0841: s, idx, 1)) {
0842: int c = UTF16.charAt(s, idx);
0843: int cLBType = UCharacter.getIntPropertyValue(c,
0844: UProperty.LINE_BREAK);
0845: switch (matchState) {
0846: case 0:
0847: if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC
0848: || cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0849: matchState = 1;
0850: break;
0851: }
0852: if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
0853: matchState = 4;
0854: break;
0855: }
0856: if (cLBType == UCharacter.LineBreak.HYPHEN) {
0857: matchState = 4;
0858: break;
0859: }
0860: if (cLBType == UCharacter.LineBreak.NUMERIC) {
0861: matchState = 7;
0862: break;
0863: }
0864: break matchLoop; /* No Match */
0865:
0866: case 1:
0867: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0868: matchState = 1;
0869: break;
0870: }
0871: if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
0872: matchState = 4;
0873: break;
0874: }
0875: if (cLBType == UCharacter.LineBreak.HYPHEN) {
0876: matchState = 4;
0877: break;
0878: }
0879: if (cLBType == UCharacter.LineBreak.NUMERIC) {
0880: matchState = 7;
0881: break;
0882: }
0883: break matchLoop; /* No Match */
0884:
0885: case 4:
0886: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0887: matchState = 4;
0888: break;
0889: }
0890: if (cLBType == UCharacter.LineBreak.NUMERIC) {
0891: matchState = 7;
0892: break;
0893: }
0894: break matchLoop; /* No Match */
0895: // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)?
0896: // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
0897:
0898: case 7:
0899: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0900: matchState = 7;
0901: break;
0902: }
0903: if (cLBType == UCharacter.LineBreak.NUMERIC) {
0904: matchState = 7;
0905: break;
0906: }
0907: if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
0908: matchState = 7;
0909: break;
0910: }
0911: if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
0912: matchState = 7;
0913: break;
0914: }
0915: if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
0916: matchState = 9;
0917: break;
0918: }
0919: if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0920: matchState = 11;
0921: break;
0922: }
0923: if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
0924: matchState = 11;
0925: break;
0926: }
0927:
0928: break matchLoop; // Match Complete.
0929: case 9:
0930: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0931: matchState = 9;
0932: break;
0933: }
0934: if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0935: matchState = 11;
0936: break;
0937: }
0938: if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
0939: matchState = 11;
0940: break;
0941: }
0942: break matchLoop; // Match Complete.
0943: case 11:
0944: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0945: matchState = 11;
0946: break;
0947: }
0948: break matchLoop; // Match Complete.
0949: }
0950: }
0951: if (matchState > 4) {
0952: retVals[0] = startIdx;
0953: retVals[1] = idx;
0954: }
0955: return retVals;
0956: }
0957:
0958: List charClasses() {
0959: return fSets;
0960: }
0961:
0962: }
0963:
0964: /**
0965: *
0966: * Sentence Monkey Test Class
0967: *
0968: *
0969: *
0970: */
0971: static class RBBISentenceMonkey extends RBBIMonkeyKind {
0972: List fSets;
0973: StringBuffer fText;
0974:
0975: UnicodeSet fSepSet;
0976: UnicodeSet fFormatSet;
0977: UnicodeSet fSpSet;
0978: UnicodeSet fLowerSet;
0979: UnicodeSet fUpperSet;
0980: UnicodeSet fOLetterSet;
0981: UnicodeSet fNumericSet;
0982: UnicodeSet fATermSet;
0983: UnicodeSet fSTermSet;
0984: UnicodeSet fCloseSet;
0985: UnicodeSet fOtherSet;
0986: UnicodeSet fExtendSet;
0987:
0988: RBBISentenceMonkey() {
0989: fSets = new ArrayList();
0990:
0991: fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep}]");
0992: fFormatSet = new UnicodeSet(
0993: "[\\p{Sentence_Break = Format}]");
0994: fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
0995: fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
0996: fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
0997: fOLetterSet = new UnicodeSet(
0998: "[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]");
0999: fNumericSet = new UnicodeSet(
1000: "[\\p{Sentence_Break = Numeric}]");
1001: fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1002: fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1003: fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1004: fExtendSet = new UnicodeSet(
1005: "[\\p{Grapheme_Extend}\\uff9e\\uff9f]");
1006: fOtherSet = new UnicodeSet();
1007:
1008: fOtherSet.complement();
1009: fOtherSet.removeAll(fSepSet);
1010: fOtherSet.removeAll(fFormatSet);
1011: fOtherSet.removeAll(fSpSet);
1012: fOtherSet.removeAll(fLowerSet);
1013: fOtherSet.removeAll(fUpperSet);
1014: fOtherSet.removeAll(fOLetterSet);
1015: fOtherSet.removeAll(fNumericSet);
1016: fOtherSet.removeAll(fATermSet);
1017: fOtherSet.removeAll(fSTermSet);
1018: fOtherSet.removeAll(fCloseSet);
1019: fOtherSet.removeAll(fExtendSet);
1020:
1021: fSets.add(fSepSet);
1022: fSets.add(fFormatSet);
1023:
1024: fSets.add(fSpSet);
1025: fSets.add(fLowerSet);
1026: fSets.add(fUpperSet);
1027: fSets.add(fOLetterSet);
1028: fSets.add(fNumericSet);
1029: fSets.add(fATermSet);
1030: fSets.add(fSTermSet);
1031: fSets.add(fCloseSet);
1032: fSets.add(fOtherSet);
1033: fSets.add(fExtendSet);
1034: }
1035:
1036: List charClasses() {
1037: return fSets;
1038: }
1039:
1040: void setText(StringBuffer s) {
1041: fText = s;
1042: }
1043:
1044: // moveBack() Find the "significant" code point preceding the index i.
1045: // Skips over ($Extend | $Format)*
1046: //
1047: private int moveBack(int i) {
1048:
1049: if (i <= 0) {
1050: return -1;
1051: }
1052:
1053: int c;
1054: int j = i;
1055: do {
1056: j = moveIndex32(fText, j, -1);
1057: c = UTF16.charAt(fText, j);
1058: } while (j > 0
1059: && (fFormatSet.contains(c) || fExtendSet
1060: .contains(c)));
1061: return j;
1062: }
1063:
1064: int moveForward(int i) {
1065: if (i >= fText.length()) {
1066: return fText.length();
1067: }
1068: int c;
1069: int j = i;
1070: do {
1071: j = moveIndex32(fText, j, 1);
1072: c = cAt(j);
1073: } while (c >= 0
1074: && (fFormatSet.contains(c) || fExtendSet
1075: .contains(c)));
1076: return j;
1077:
1078: }
1079:
1080: int cAt(int pos) {
1081: if (pos < 0 || pos >= fText.length()) {
1082: return -1;
1083: }
1084: return UTF16.charAt(fText, pos);
1085: }
1086:
1087: int next(int prevPos) {
1088: int p0, p1, p2, p3; // Indices of the significant code points around the
1089: // break position being tested. The candidate break
1090: // location is before p2.
1091: int breakPos = -1;
1092:
1093: int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1094: int c;
1095:
1096: // Prev break at end of string. return DONE.
1097: if (prevPos >= fText.length()) {
1098: return -1;
1099: }
1100: p0 = p1 = p2 = p3 = prevPos;
1101: c3 = UTF16.charAt(fText, prevPos);
1102: c0 = c1 = c2 = 0;
1103:
1104: // Loop runs once per "significant" character position in the input text.
1105: for (;;) {
1106: // Move all of the positions forward in the input string.
1107: p0 = p1;
1108: c0 = c1;
1109: p1 = p2;
1110: c1 = c2;
1111: p2 = p3;
1112: c2 = c3;
1113:
1114: // Advancd p3 by X(Extend | Format)* Rule 4
1115: p3 = moveForward(p3);
1116: c3 = cAt(p3);
1117:
1118: // Rule (3) CR x LF
1119: if (c1 == 0x0d && c2 == 0x0a && p2 == (p1 + 1)) {
1120: continue;
1121: }
1122:
1123: // Rule (4) Sep <break>
1124: if (fSepSet.contains(c1)) {
1125: p2 = p1 + 1; // Separators don't combine with Extend or Format
1126: break;
1127: }
1128:
1129: if (p2 >= fText.length()) {
1130: // Reached end of string. Always a break position.
1131: break;
1132: }
1133:
1134: if (p2 == prevPos) {
1135: // Still warming up the loop. (won't work with zero length strings, but we don't care)
1136: continue;
1137: }
1138:
1139: // Rule (6). ATerm x Numeric
1140: if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
1141: continue;
1142: }
1143:
1144: // Rule (7). Upper ATerm x Uppper
1145: if (fUpperSet.contains(c0) && fATermSet.contains(c1)
1146: && fUpperSet.contains(c2)) {
1147: continue;
1148: }
1149:
1150: // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower
1151: // Note: Sterm | ATerm are added to the negated part of the expression by a
1152: // note to the Unicode 5.0 documents.
1153: int p8 = p1;
1154: while (p8 > 0 && fSpSet.contains(cAt(p8))) {
1155: p8 = moveBack(p8);
1156: }
1157: while (p8 > 0 && fCloseSet.contains(cAt(p8))) {
1158: p8 = moveBack(p8);
1159: }
1160: if (fATermSet.contains(cAt(p8))) {
1161: p8 = p2;
1162: for (;;) {
1163: c = cAt(p8);
1164: if (c == -1 || fOLetterSet.contains(c)
1165: || fUpperSet.contains(c)
1166: || fLowerSet.contains(c)
1167: || fSepSet.contains(c)
1168: || fATermSet.contains(c)
1169: || fSTermSet.contains(c)) {
1170: break;
1171: }
1172: p8 = moveForward(p8);
1173: }
1174: if (p8 < fText.length()
1175: && fLowerSet.contains(cAt(p8))) {
1176: continue;
1177: }
1178: }
1179:
1180: // Rule 8a (STerm | ATerm) Close* Sp* x (Sterm | ATerm)
1181: if (fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1182: p8 = p1;
1183: while (setContains(fSpSet, cAt(p8))) {
1184: p8 = moveBack(p8);
1185: }
1186: while (setContains(fCloseSet, cAt(p8))) {
1187: p8 = moveBack(p8);
1188: }
1189: c = cAt(p8);
1190: if (setContains(fSTermSet, c)
1191: || setContains(fATermSet, c)) {
1192: continue;
1193: }
1194: }
1195:
1196: // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep)
1197: int p9 = p1;
1198: while (p9 > 0 && fCloseSet.contains(cAt(p9))) {
1199: p9 = moveBack(p9);
1200: }
1201: c = cAt(p9);
1202: if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1203: if (fCloseSet.contains(c2) || fSpSet.contains(c2)
1204: || fSepSet.contains(c2)) {
1205: continue;
1206: }
1207: }
1208:
1209: // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep)
1210: int p10 = p1;
1211: while (p10 > 0 && fSpSet.contains(cAt(p10))) {
1212: p10 = moveBack(p10);
1213: }
1214: while (p10 > 0 && fCloseSet.contains(cAt(p10))) {
1215: p10 = moveBack(p10);
1216: }
1217: if (fSTermSet.contains(cAt(p10))
1218: || fATermSet.contains(cAt(p10))) {
1219: if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1220: continue;
1221: }
1222: }
1223:
1224: // Rule (11) (STerm | ATerm) Close* Sp* <break>
1225: int p11 = p1;
1226: while (p11 > 0 && fSpSet.contains(cAt(p11))) {
1227: p11 = moveBack(p11);
1228: }
1229: while (p11 > 0 && fCloseSet.contains(cAt(p11))) {
1230: p11 = moveBack(p11);
1231: }
1232: if (fSTermSet.contains(cAt(p11))
1233: || fATermSet.contains(cAt(p11))) {
1234: break;
1235: }
1236:
1237: // Rule (12) Any x Any
1238: continue;
1239: }
1240: breakPos = p2;
1241: return breakPos;
1242: }
1243:
1244: }
1245:
1246: /**
1247: * Move an index into a string by n code points.
1248: * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1249: * complicating usage.
1250: * @param s a Text string
1251: * @param pos The starting code unit index into the text string
1252: * @param amt The amount to adjust the string by.
1253: * @return The adjusted code unit index, pinned to the string's length, or
1254: * unchanged if input index was outside of the string.
1255: */
1256: static int moveIndex32(StringBuffer s, int pos, int amt) {
1257: int i;
1258: char c;
1259: if (amt > 0) {
1260: for (i = 0; i < amt; i++) {
1261: if (pos >= s.length()) {
1262: return s.length();
1263: }
1264: c = s.charAt(pos);
1265: pos++;
1266: if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1267: c = s.charAt(pos);
1268: if (UTF16.isTrailSurrogate(c)) {
1269: pos++;
1270: }
1271: }
1272: }
1273: } else {
1274: for (i = 0; i > amt; i--) {
1275: if (pos <= 0) {
1276: return 0;
1277: }
1278: pos--;
1279: c = s.charAt(pos);
1280: if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1281: c = s.charAt(pos);
1282: if (UTF16.isLeadSurrogate(c)) {
1283: pos--;
1284: }
1285: }
1286: }
1287: }
1288: return pos;
1289: }
1290:
1291: /**
1292: * No-exceptions form of UnicodeSet.contains(c).
1293: * Simplifies loops that terminate with an end-of-input character value.
1294: * @param s A unicode set
1295: * @param c A code point value
1296: * @return true if the set contains c.
1297: */
1298: static boolean setContains(UnicodeSet s, int c) {
1299: if (c < 0 || c > UTF16.CODEPOINT_MAX_VALUE) {
1300: return false;
1301: }
1302: return s.contains(c);
1303: }
1304:
1305: /**
1306: * return the index of the next code point in the input text.
1307: * @param i the preceding index
1308: * @return
1309: * @internal
1310: */
1311: static int nextCP(StringBuffer s, int i) {
1312: if (i == -1) {
1313: // End of Input indication. Continue to return end value.
1314: return -1;
1315: }
1316: int retVal = i + 1;
1317: if (retVal > s.length()) {
1318: return -1;
1319: }
1320: int c = UTF16.charAt(s, i);
1321: if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
1322: && UTF16.isLeadSurrogate(s.charAt(i))) {
1323: retVal++;
1324: }
1325: return retVal;
1326: }
1327:
1328: //
1329: // The following UnicodeSets are used in matching a Grapheme Cluster
1330: //
1331: private static UnicodeSet GC_Control;
1332:
1333: private static UnicodeSet GC_Extend;
1334:
1335: private static UnicodeSet GC_L;
1336:
1337: private static UnicodeSet GC_V;
1338:
1339: private static UnicodeSet GC_T;
1340:
1341: private static UnicodeSet GC_LV;
1342:
1343: private static UnicodeSet GC_LVT;
1344:
1345: protected void init() throws Exception {
1346: GC_Control = new UnicodeSet(
1347: "[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[\\p{Grapheme_Cluster_Break=Extend}]]");
1348:
1349: GC_Extend = new UnicodeSet(
1350: "[\\p{Grapheme_Cluster_Break=Extend}]");
1351:
1352: GC_L = new UnicodeSet("[[:Hangul_Syllable_Type=L:]]");
1353:
1354: GC_V = new UnicodeSet("[[:Hangul_Syllable_Type=V:]]");
1355:
1356: GC_T = new UnicodeSet("[[:Hangul_Syllable_Type=T:]]");
1357:
1358: GC_LV = new UnicodeSet("[[:Hangul_Syllable_Type=LV:]]");
1359:
1360: GC_LVT = new UnicodeSet("[[:Hangul_Syllable_Type=LVT:]]");
1361: }
1362:
1363: /**
1364: * Find the end of the extent of a grapheme cluster.
1365: * This is the reference implementation used by the monkey test for comparison
1366: * with the RBBI results.
1367: * @param s The string containing the text to be analyzed
1368: * @param i The index of the start of the grapheme cluster.
1369: * @return The index of the first code point following the grapheme cluster
1370: * @internal
1371: */
1372: private static int nextGC(StringBuffer s, int i) {
1373: if (i >= s.length() || i == -1) {
1374: return -1;
1375: }
1376:
1377: int c = UTF16.charAt(s, i);
1378: int pos = i;
1379:
1380: if (c == 0x0d) {
1381: pos = nextCP(s, i);
1382: if (pos >= s.length()) {
1383: return pos;
1384: }
1385: c = UTF16.charAt(s, pos);
1386: if (c == 0x0a) {
1387: pos = nextCP(s, pos);
1388: }
1389: return pos;
1390: }
1391:
1392: if (GC_Control.contains(c) || c == 0x0a) {
1393: pos = nextCP(s, pos);
1394: return pos;
1395: }
1396:
1397: // Little state machine to consume Hangul Syllables
1398: int hangulState = 1;
1399: state_loop: for (;;) {
1400: switch (hangulState) {
1401: case 1:
1402: if (GC_L.contains(c)) {
1403: hangulState = 2;
1404: break;
1405: }
1406: if (GC_V.contains(c) || GC_LV.contains(c)) {
1407: hangulState = 3;
1408: break;
1409: }
1410: if (GC_T.contains(c) || GC_LVT.contains(c)) {
1411: hangulState = 4;
1412: break;
1413: }
1414: break state_loop;
1415: case 2:
1416: if (GC_L.contains(c)) {
1417: // continue in state 2.
1418: break;
1419: }
1420: if (GC_V.contains(c) || GC_LV.contains(c)) {
1421: hangulState = 3;
1422: break;
1423: }
1424: if (GC_LVT.contains(c)) {
1425: hangulState = 4;
1426: break;
1427: }
1428: if (GC_Extend.contains(c)) {
1429: hangulState = 5;
1430: break;
1431: }
1432: break state_loop;
1433: case 3:
1434: if (GC_V.contains(c)) {
1435: // continue in state 3;
1436: break;
1437: }
1438: if (GC_T.contains(c)) {
1439: hangulState = 4;
1440: break;
1441: }
1442: if (GC_Extend.contains(c)) {
1443: hangulState = 5;
1444: break;
1445: }
1446: break state_loop;
1447: case 4:
1448: if (GC_T.contains(c)) {
1449: // continue in state 4
1450: break;
1451: }
1452: if (GC_Extend.contains(c)) {
1453: hangulState = 5;
1454: break;
1455: }
1456: break state_loop;
1457: case 5:
1458: if (GC_Extend.contains(c)) {
1459: hangulState = 5;
1460: break;
1461: }
1462: break state_loop;
1463: }
1464: // We have exited the switch statement, but are still in the loop.
1465: // Still in a Hangul Syllable, advance to the next code point.
1466: pos = nextCP(s, pos);
1467: if (pos >= s.length()) {
1468: break;
1469: }
1470: c = UTF16.charAt(s, pos);
1471: } // end of loop
1472:
1473: if (hangulState != 1) {
1474: // We found a Hangul. We're done.
1475: return pos;
1476: }
1477:
1478: // Ordinary characters. Consume one codepoint unconditionally, then any following Extends.
1479: for (;;) {
1480: pos = nextCP(s, pos);
1481: if (pos >= s.length()) {
1482: break;
1483: }
1484: c = UTF16.charAt(s, pos);
1485: if (GC_Extend.contains(c) == false) {
1486: break;
1487: }
1488: }
1489:
1490: return pos;
1491: }
1492:
1493: /**
1494: * random number generator. Not using Java's built-in Randoms for two reasons:
1495: * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1496: * 2. We need to get and restore the seed from values occuring in the middle
1497: * of a long sequence, to more easily reproduce failing cases.
1498: */
1499: private static int m_seed = 1;
1500:
1501: private static int m_rand() {
1502: m_seed = m_seed * 1103515245 + 12345;
1503: return (int) (m_seed >>> 16) % 32768;
1504: }
1505:
1506: /**
1507: * Run a RBBI monkey test. Common routine, for all break iterator types.
1508: * Parameters:
1509: * bi - the break iterator to use
1510: * mk - MonkeyKind, abstraction for obtaining expected results
1511: * name - Name of test (char, word, etc.) for use in error messages
1512: * seed - Seed for starting random number generator (parameter from user)
1513: * numIterations
1514: */
1515: void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name,
1516: int seed, int numIterations) {
1517: int TESTSTRINGLEN = 500;
1518: StringBuffer testText = new StringBuffer();
1519: int numCharClasses;
1520: List chClasses;
1521: int[] expected = new int[TESTSTRINGLEN * 2 + 1];
1522: int expectedCount = 0;
1523: boolean[] expectedBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1524: boolean[] forwardBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1525: boolean[] reverseBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1526: boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1527: boolean[] followingBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1528: boolean[] precedingBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1529: int i;
1530: int loopCount = 0;
1531: boolean printTestData = false;
1532: boolean printBreaksFromBI = false;
1533:
1534: m_seed = seed;
1535:
1536: numCharClasses = mk.charClasses().size();
1537: chClasses = mk.charClasses();
1538:
1539: // Verify that the character classes all have at least one member.
1540: for (i = 0; i < numCharClasses; i++) {
1541: UnicodeSet s = (UnicodeSet) chClasses.get(i);
1542: if (s == null || s.size() == 0) {
1543: errln("Character Class " + i
1544: + " is null or of zero size.");
1545: return;
1546: }
1547: }
1548:
1549: //--------------------------------------------------------------------------------------------
1550: //
1551: // Debugging settings. Comment out everything in the following block for normal operation
1552: //
1553: //--------------------------------------------------------------------------------------------
1554: // numIterations = -1;
1555: // RuleBasedBreakIterator_New.fTrace = true;
1556: // m_seed = 859056465;
1557: // TESTSTRINGLEN = 50;
1558: // printTestData = true;
1559: // printBreaksFromBI = true;
1560: // ((RuleBasedBreakIterator_New)bi).dump();
1561:
1562: //--------------------------------------------------------------------------------------------
1563: //
1564: // End of Debugging settings.
1565: //
1566: //--------------------------------------------------------------------------------------------
1567:
1568: int dotsOnLine = 0;
1569: while (loopCount < numIterations || numIterations == -1) {
1570: if (numIterations == -1 && loopCount % 10 == 0) {
1571: // If test is running in an infinite loop, display a periodic tic so
1572: // we can tell that it is making progress.
1573: System.out.print(".");
1574: if (dotsOnLine++ >= 80) {
1575: System.out.println();
1576: dotsOnLine = 0;
1577: }
1578: }
1579: // Save current random number seed, so that we can recreate the random numbers
1580: // for this loop iteration in event of an error.
1581: seed = m_seed;
1582:
1583: testText.setLength(0);
1584: // Populate a test string with data.
1585: if (printTestData) {
1586: System.out.println("Test Data string ...");
1587: }
1588: for (i = 0; i < TESTSTRINGLEN; i++) {
1589: int aClassNum = m_rand() % numCharClasses;
1590: UnicodeSet classSet = (UnicodeSet) chClasses
1591: .get(aClassNum);
1592: int charIdx = m_rand() % classSet.size();
1593: int c = classSet.charAt(charIdx);
1594: if (c < 0) { // TODO: deal with sets containing strings.
1595: errln("c < 0");
1596: }
1597: UTF16.appendCodePoint(testText, c);
1598: if (printTestData) {
1599: System.out.print(Integer.toHexString(c) + " ");
1600: }
1601: }
1602: if (printTestData) {
1603: System.out.println();
1604: }
1605:
1606: Arrays.fill(expected, 0);
1607: Arrays.fill(expectedBreaks, false);
1608: Arrays.fill(forwardBreaks, false);
1609: Arrays.fill(reverseBreaks, false);
1610: Arrays.fill(isBoundaryBreaks, false);
1611: Arrays.fill(followingBreaks, false);
1612: Arrays.fill(precedingBreaks, false);
1613:
1614: // Calculate the expected results for this test string.
1615: mk.setText(testText);
1616: expectedCount = 0;
1617: expectedBreaks[0] = true;
1618: expected[expectedCount++] = 0;
1619: int breakPos = 0;
1620: int lastBreakPos = -1;
1621: for (;;) {
1622: lastBreakPos = breakPos;
1623: breakPos = mk.next(breakPos);
1624: if (breakPos == -1) {
1625: break;
1626: }
1627: if (breakPos > testText.length()) {
1628: errln("breakPos > testText.length()");
1629: }
1630: if (lastBreakPos >= breakPos) {
1631: errln("Next() not increasing.");
1632: // break;
1633: }
1634: expectedBreaks[breakPos] = true;
1635: expected[expectedCount++] = breakPos;
1636: }
1637:
1638: // Find the break positions using forward iteration
1639: if (printBreaksFromBI) {
1640: System.out.println("Breaks from BI...");
1641: }
1642: bi.setText(testText.toString());
1643: for (i = bi.first(); i != BreakIterator.DONE; i = bi.next()) {
1644: if (i < 0 || i > testText.length()) {
1645: errln(name
1646: + " break monkey test: Out of range value returned by breakIterator::next()");
1647: break;
1648: }
1649: if (printBreaksFromBI) {
1650: System.out.print(Integer.toHexString(i) + " ");
1651: }
1652: forwardBreaks[i] = true;
1653: }
1654: if (printBreaksFromBI) {
1655: System.out.println();
1656: }
1657:
1658: // Find the break positions using reverse iteration
1659: for (i = bi.last(); i != BreakIterator.DONE; i = bi
1660: .previous()) {
1661: if (i < 0 || i > testText.length()) {
1662: errln(name
1663: + " break monkey test: Out of range value returned by breakIterator.next()"
1664: + name);
1665: break;
1666: }
1667: reverseBreaks[i] = true;
1668: }
1669:
1670: // Find the break positions using isBoundary() tests.
1671: for (i = 0; i <= testText.length(); i++) {
1672: isBoundaryBreaks[i] = bi.isBoundary(i);
1673: }
1674:
1675: // Find the break positions using the following() function.
1676: lastBreakPos = 0;
1677: followingBreaks[0] = true;
1678: for (i = 0; i < testText.length(); i++) {
1679: breakPos = bi.following(i);
1680: if (breakPos <= i || breakPos < lastBreakPos
1681: || breakPos > testText.length()
1682: || breakPos > lastBreakPos && lastBreakPos > i) {
1683: errln(name
1684: + " break monkey test: "
1685: + "Out of range value returned by BreakIterator::following().\n"
1686: + "index=" + i + "following returned="
1687: + breakPos + "lastBreak=" + lastBreakPos);
1688: precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1689: } else {
1690: followingBreaks[breakPos] = true;
1691: lastBreakPos = breakPos;
1692: }
1693: }
1694:
1695: // Find the break positions using the preceding() function.
1696: lastBreakPos = testText.length();
1697: precedingBreaks[testText.length()] = true;
1698: for (i = testText.length(); i > 0; i--) {
1699: breakPos = bi.preceding(i);
1700: if (breakPos >= i || breakPos > lastBreakPos
1701: || breakPos < 0 || breakPos < lastBreakPos
1702: && lastBreakPos < i) {
1703: errln(name
1704: + " break monkey test: "
1705: + "Out of range value returned by BreakIterator::preceding().\n"
1706: + "index=" + i + "preceding returned="
1707: + breakPos + "lastBreak=" + lastBreakPos);
1708: precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1709: } else {
1710: precedingBreaks[breakPos] = true;
1711: lastBreakPos = breakPos;
1712: }
1713: }
1714:
1715: // Compare the expected and actual results.
1716: for (i = 0; i <= testText.length(); i++) {
1717: String errorType = null;
1718: if (forwardBreaks[i] != expectedBreaks[i]) {
1719: errorType = "next()";
1720: } else if (reverseBreaks[i] != forwardBreaks[i]) {
1721: errorType = "previous()";
1722: } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1723: errorType = "isBoundary()";
1724: } else if (followingBreaks[i] != expectedBreaks[i]) {
1725: errorType = "following()";
1726: } else if (precedingBreaks[i] != expectedBreaks[i]) {
1727: errorType = "preceding()";
1728: }
1729:
1730: if (errorType != null) {
1731: // Format a range of the test text that includes the failure as
1732: // a data item that can be included in the rbbi test data file.
1733:
1734: // Start of the range is the last point where expected and actual results
1735: // both agreed that there was a break position.
1736: int startContext = i;
1737: int count = 0;
1738: for (;;) {
1739: if (startContext == 0) {
1740: break;
1741: }
1742: startContext--;
1743: if (expectedBreaks[startContext]) {
1744: if (count == 2)
1745: break;
1746: count++;
1747: }
1748: }
1749:
1750: // End of range is two expected breaks past the start position.
1751: int endContext = i + 1;
1752: int ci;
1753: for (ci = 0; ci < 2; ci++) { // Number of items to include in error text.
1754: for (;;) {
1755: if (endContext >= testText.length()) {
1756: break;
1757: }
1758: if (expectedBreaks[endContext - 1]) {
1759: if (count == 0)
1760: break;
1761: count--;
1762: }
1763: endContext++;
1764: }
1765: }
1766:
1767: // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1768: StringBuffer errorText = new StringBuffer();
1769: errorText.append("<data>");
1770:
1771: String hexChars = "0123456789abcdef";
1772: int c; // Char from test data
1773: int bn;
1774: for (ci = startContext; ci <= endContext
1775: && ci != -1; ci = nextCP(testText, ci)) {
1776: if (ci == i) {
1777: // This is the location of the error.
1778: errorText.append("<?>");
1779: } else if (expectedBreaks[ci]) {
1780: // This a non-error expected break position.
1781: errorText.append("<>");
1782: }
1783: if (ci < testText.length()) {
1784: c = UTF16.charAt(testText, ci);
1785: if (c < 0x10000) {
1786: errorText.append("\\u");
1787: for (bn = 12; bn >= 0; bn -= 4) {
1788: errorText
1789: .append(hexChars
1790: .charAt((((int) c) >> bn) & 0xf));
1791: }
1792: } else {
1793: errorText.append("\\U");
1794: for (bn = 28; bn >= 0; bn -= 4) {
1795: errorText
1796: .append(hexChars
1797: .charAt((((int) c) >> bn) & 0xf));
1798: }
1799: }
1800: }
1801: }
1802: if (ci == testText.length() && ci != -1) {
1803: errorText.append("<>");
1804: }
1805: errorText.append("</data>\n");
1806:
1807: // Output the error
1808: errln(name
1809: + " break monkey test error. "
1810: + (expectedBreaks[i] ? "Break expected but not found."
1811: : "Break found but not expected.")
1812: + "\nOperation = " + errorType
1813: + "; random seed = " + seed
1814: + "; buf Idx = " + i + "\n" + errorText);
1815: break;
1816: }
1817: }
1818:
1819: loopCount++;
1820: }
1821: }
1822:
1823: public void TestCharMonkey() {
1824:
1825: int loopCount = 500;
1826: int seed = 1;
1827:
1828: if (params.inclusion >= 9) {
1829: loopCount = 10000;
1830: }
1831:
1832: RBBICharMonkey m = new RBBICharMonkey();
1833: BreakIterator bi = BreakIterator
1834: .getCharacterInstance(Locale.US);
1835: RunMonkey(bi, m, "char", seed, loopCount);
1836: }
1837:
1838: public void TestWordMonkey() {
1839:
1840: int loopCount = 500;
1841: int seed = 1;
1842:
1843: if (params.inclusion >= 9) {
1844: loopCount = 10000;
1845: }
1846:
1847: logln("Word Break Monkey Test");
1848: RBBIWordMonkey m = new RBBIWordMonkey();
1849: BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1850: RunMonkey(bi, m, "word", seed, loopCount);
1851: }
1852:
1853: public void TestLineMonkey() {
1854:
1855: int loopCount = 500;
1856: int seed = 1;
1857:
1858: if (params.inclusion >= 9) {
1859: loopCount = 10000;
1860: }
1861:
1862: logln("Line Break Monkey Test");
1863: RBBILineMonkey m = new RBBILineMonkey();
1864: BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1865: if (params == null) {
1866: loopCount = 50;
1867: }
1868: RunMonkey(bi, m, "line", seed, loopCount);
1869: }
1870:
1871: public void TestSentMonkey() {
1872:
1873: int loopCount = 500;
1874: int seed = 1;
1875:
1876: if (params.inclusion >= 9) {
1877: loopCount = 3000;
1878: }
1879:
1880: logln("Sentence Break Monkey Test");
1881: RBBISentenceMonkey m = new RBBISentenceMonkey();
1882: BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1883: if (params == null) {
1884: loopCount = 30;
1885: }
1886: RunMonkey(bi, m, "sent", seed, loopCount);
1887: }
1888:
1889: //
1890: // Round-trip monkey tests.
1891: // Verify that break iterators created from the rule source from the default
1892: // break iterators still pass the monkey test for the iterator type.
1893: //
1894: // This is a major test for the Rule Compiler. The default break iterators are built
1895: // from pre-compiled binary rule data that was created using ICU4C; these
1896: // round-trip rule recompile tests verify that the Java rule compiler can
1897: // rebuild break iterators from the original source rules.
1898: //
1899: public void TestRTCharMonkey() {
1900:
1901: int loopCount = 200;
1902: int seed = 1;
1903:
1904: if (params.inclusion >= 9) {
1905: loopCount = 2000;
1906: }
1907:
1908: RBBICharMonkey m = new RBBICharMonkey();
1909: BreakIterator bi = BreakIterator
1910: .getCharacterInstance(Locale.US);
1911: String rules = bi.toString();
1912: BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1913: RunMonkey(rtbi, m, "char", seed, loopCount);
1914: }
1915:
1916: public void TestRTWordMonkey() {
1917:
1918: int loopCount = 200;
1919: int seed = 1;
1920:
1921: if (params.inclusion >= 9) {
1922: loopCount = 2000;
1923: }
1924:
1925: logln("Word Break Monkey Test");
1926: RBBIWordMonkey m = new RBBIWordMonkey();
1927: BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1928: String rules = bi.toString();
1929: BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1930: RunMonkey(rtbi, m, "word", seed, loopCount);
1931: }
1932:
1933: public void TestRTLineMonkey() {
1934:
1935: int loopCount = 200;
1936: int seed = 1;
1937:
1938: if (params.inclusion >= 9) {
1939: loopCount = 2000;
1940: }
1941:
1942: logln("Line Break Monkey Test");
1943: RBBILineMonkey m = new RBBILineMonkey();
1944: BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1945: String rules = bi.toString();
1946: BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1947: if (params == null) {
1948: loopCount = 50;
1949: }
1950: RunMonkey(rtbi, m, "line", seed, loopCount);
1951: }
1952:
1953: public void TestRTSentMonkey() {
1954:
1955: int loopCount = 200;
1956: int seed = 1;
1957:
1958: if (params.inclusion >= 9) {
1959: loopCount = 1000;
1960: }
1961:
1962: logln("Sentence Break Monkey Test");
1963: RBBISentenceMonkey m = new RBBISentenceMonkey();
1964: BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1965: String rules = bi.toString();
1966: BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1967: if (params == null) {
1968: loopCount = 30;
1969: }
1970: RunMonkey(rtbi, m, "sent", seed, loopCount);
1971: }
1972:
1973: }
|