Source Code Cross Referenced for RBBITestMonkey.java in  » Internationalization-Localization » icu4j » com » ibm » icu » dev » test » rbbi » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.dev.test.rbbi 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         *******************************************************************************
0003:         * Copyright (C) 2003-2006 International Business Machines Corporation and     *
0004:         * others. All Rights Reserved.                                                *
0005:         *******************************************************************************
0006:         */
0007:        package com.ibm.icu.dev.test.rbbi;
0008:
0009:        // Monkey testing of RuleBasedBreakIterator
0010:        import com.ibm.icu.dev.test.*;
0011:        import com.ibm.icu.text.BreakIterator;
0012:        import com.ibm.icu.text.RuleBasedBreakIterator;
0013:        import com.ibm.icu.text.UTF16;
0014:        import com.ibm.icu.text.UnicodeSet;
0015:        import com.ibm.icu.lang.UCharacter;
0016:        import com.ibm.icu.lang.UProperty;
0017:        import java.util.List;
0018:        import java.util.Arrays;
0019:        import java.util.ArrayList;
0020:        import java.util.Locale;
0021:
0022:        /**
0023:         * Monkey tests for RBBI.  These tests have independent implementations of
0024:         * the Unicode TR boundary rules, and compare results between these and ICU's
0025:         * implementation, using random data.
0026:         * 
0027:         * Tests cover Grapheme Cluster (char), Word and Line breaks
0028:         * 
0029:         * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
0030:         *
0031:         */
0032:        public class RBBITestMonkey extends TestFmwk {
0033:
0034:            public static void main(String[] args) {
0035:                // Command-line entry point: create the test object and pass it the arguments.
                     RBBITestMonkey monkeyTest = new RBBITestMonkey();
                     monkeyTest.run(args);
0036:            }
0037:
0038:            /**
0039:             * RBBIMonkeyKind: the abstract contract shared by every monkey-test flavor.
0040:             * Each concrete subclass independently implements the break rules for one
0041:             * type of break iterator.  The monkey test itself does not know which type it
0042:             * is testing; it works purely in terms of the three operations declared here.
0043:             */
0044:            static abstract class RBBIMonkeyKind {
0045:
0046:                /** Sets the test text on which subsequent calls to next() will operate. */
0047:                abstract void setText(StringBuffer text);
0048:
0049:                /**
0050:                 * Finds the next break position, starting from the specified position.
0051:                 * Returns -1 after reaching the end of the string.
0052:                 */
0053:                abstract int next(int i);
0054:
0055:                /**
0056:                 * Returns a List of UnicodeSets representing the character classes used
0057:                 * for this type of iterator.
0058:                 */
0059:                abstract List charClasses();
0060:            }
0061:
0062:            /**
0063:             * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
                  * The constructor builds the character classes returned by charClasses();
                  * next() hands the boundary search itself to nextGC().
0064:             */
0065:            static class RBBICharMonkey extends RBBIMonkeyKind {
0066:                List fSets; // the five UnicodeSets below, in the order the constructor adds them
0067:
0068:                UnicodeSet fCRLFSet;
0069:                UnicodeSet fControlSet;
0070:                UnicodeSet fExtendSet;
0071:                UnicodeSet fHangulSet;
0072:                UnicodeSet fAnySet;
0073:
0074:                StringBuffer fText; // text under test; null until setText() is called
0075:
0076:                RBBICharMonkey() {
0077:                    fText = null;
0078:                    fCRLFSet = new UnicodeSet("[\\r\\n]");
                         // Line/paragraph separators, controls and format characters, minus CR and LF.
0079:                    fControlSet = new UnicodeSet(
0080:                            "[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]");
0081:                    fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]");
                         // All five Hangul syllable types: L, V, T, LV and LVT.  The V type must be
                         // named explicitly; listing L twice would leave the vowel jamo out of the set.
0082:                    fHangulSet = new UnicodeSet(
0083:                            "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}"
0084:                                    + "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]");
0085:                    fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
0086:
0087:                    fSets = new ArrayList();
0088:                    fSets.add(fCRLFSet);
0089:                    fSets.add(fControlSet);
0090:                    fSets.add(fExtendSet);
0091:                    fSets.add(fHangulSet);
0092:                    fSets.add(fAnySet);
0093:                }
0094:
                     // Stores a reference to the text that later next() calls will scan (no copy is made).
0095:                void setText(StringBuffer s) {
0096:                    fText = s;
0097:                }
0098:
                     // Returns the list built by the constructor itself, not a copy.
0099:                List charClasses() {
0100:                    return fSets;
0101:                }
0102:
                     // Returns whatever nextGC() reports for the current text, starting from index i.
0103:                int next(int i) {
0104:                    return nextGC(fText, i);
0105:                }
0106:            }
0107:
0108:            /**
0109:             * Word Monkey Test Class.
0110:             *
0111:             * Holds one UnicodeSet per character class consulted by the word break rules
0112:             * (all built in the constructor) and implements next() by sliding a window of
0113:             * four significant code points over the text and applying the numbered rules in order.
0114:             */
0115:            static class RBBIWordMonkey extends RBBIMonkeyKind {
0116:                List fSets; // every set below, in the order the constructor adds them
0117:                StringBuffer fText; // text under test; assigned by setText()
0118:
0119:                UnicodeSet fKatakanaSet;
0120:                UnicodeSet fALetterSet;
0121:                UnicodeSet fMidLetterSet;
0122:                UnicodeSet fMidNumSet;
0123:                UnicodeSet fNumericSet;
0124:                UnicodeSet fFormatSet;
0125:                UnicodeSet fExtendSet;
0126:                UnicodeSet fExtendNumLetSet;
0127:                UnicodeSet fOtherSet; // everything that is in none of the eight sets above
0128:
0129:                RBBIWordMonkey() {
0130:                    fSets = new ArrayList();
0131:
                         // ALetter is widened with the Line_Break = Complex_Context characters,
                         // except those whose Grapheme_Cluster_Break is Extend or Control.
0132:                    fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
0133:                            + "[\\p{Line_Break = Complex_Context}"
0134:                            + "-\\p{Grapheme_Cluster_Break = Extend}"
0135:                            + "-\\p{Grapheme_Cluster_Break = Control}]]");
                         // U+FF9E and U+FF9F are taken out of Katakana here and put into fExtendSet below.
0136:                    fKatakanaSet = new UnicodeSet(
0137:                            "[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]");
0138:                    fMidLetterSet = new UnicodeSet(
0139:                            "[\\p{Word_Break = MidLetter}]");
0140:                    fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
0141:                    fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
0142:                    fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
0143:                    fExtendNumLetSet = new UnicodeSet(
0144:                            "[\\p{Word_Break = ExtendNumLet}]");
0145:                    fExtendSet = new UnicodeSet(
0146:                            "[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]");
0147:                    fOtherSet = new UnicodeSet();
0148:
                         // Complement the freshly created set, then subtract every named class,
                         // leaving only the characters that belong to none of them.
0149:                    fOtherSet.complement();
0150:                    fOtherSet.removeAll(fALetterSet);
0151:                    fOtherSet.removeAll(fKatakanaSet);
0152:                    fOtherSet.removeAll(fMidLetterSet);
0153:                    fOtherSet.removeAll(fMidNumSet);
0154:                    fOtherSet.removeAll(fNumericSet);
0155:                    fOtherSet.removeAll(fFormatSet);
0156:                    fOtherSet.removeAll(fExtendSet);
0157:                    fOtherSet.removeAll(fExtendNumLetSet);
0158:
0159:                    fSets.add(fALetterSet);
0160:                    fSets.add(fKatakanaSet);
0161:                    fSets.add(fMidLetterSet);
0162:                    fSets.add(fMidNumSet);
0163:                    fSets.add(fNumericSet);
0164:                    fSets.add(fFormatSet);
0165:                    fSets.add(fExtendSet);
0166:                    fSets.add(fExtendNumLetSet);
0167:                    fSets.add(fOtherSet);
0168:                }
0169:
                     // Returns the list built by the constructor itself, not a copy.
0170:                List charClasses() {
0171:                    return fSets;
0172:                }
0173:
                     // Stores a reference to the text that later next() calls will scan (no copy is made).
0174:                void setText(StringBuffer s) {
0175:                    fText = s;
0176:                }
0177:
                     // Returns the index of the first word break after prevPos, or -1 when
                     // prevPos is already at or beyond the end of the text.
0178:                int next(int prevPos) {
0179:                    int p0, p1, p2, p3; // Indices of the significant code points around the
0180:                    //   break position being tested.  The candidate break
0181:                    //   location is before p2.
0182:                    int breakPos = -1;
0183:
0184:                    int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
0185:
0186:                    // Previous break was at the end of the string: nothing left, return -1.
0187:                    if (prevPos >= fText.length()) {
0188:                        return -1;
0189:                    }
                         // Only p3/c3 hold real data at this point; the other slots are filled
                         // as the window shifts during the first passes through the loop.
0190:                    p0 = p1 = p2 = p3 = prevPos;
0191:                    c3 = UTF16.charAt(fText, prevPos);
0192:                    c0 = c1 = c2 = 0;
0193:
0194:                    // Loop runs once per "significant" character position in the input text.
0195:                    for (;;) {
0196:                        // Move all of the positions forward in the input string.
0197:                        p0 = p1;
0198:                        c0 = c1;
0199:                        p1 = p2;
0200:                        c1 = c2;
0201:                        p2 = p3;
0202:                        c2 = c3;
0203:
0204:                        // Advance p3 by    X(Extend | Format)*   Rule 4
                             // c3 is left at -1 when p3 runs off the end of the text, which is why
                             // every later test on c3 goes through setContains() instead of calling
                             // contains() directly (presumably setContains() tolerates -1 - confirm).
0205:                        do {
0206:                            p3 = moveIndex32(fText, p3, 1);
0207:                            c3 = -1;
0208:                            if (p3 >= fText.length()) {
0209:                                break;
0210:                            }
0211:                            c3 = UTF16.charAt(fText, p3);
0212:                        } while (setContains(fFormatSet, c3)
0213:                                || setContains(fExtendSet, c3));
0214:
0215:                        if (p1 == p2) {
0216:                            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
0217:                            continue;
0218:                        }
0219:                        if (p2 == fText.length()) {
0220:                            // Reached end of string.  Always a break position.
0221:                            break;
0222:                        }
0223:
                             // Each rule below that matches means: no break before p2, keep scanning.
0224:                        // Rule (3)   CR x LF
0225:                        //     No Extend or Format characters may appear between the CR and LF,
0226:                        //     which requires the additional check for p2 immediately following p1.
0227:                        //
0228:                        if (c1 == 0x0D && c2 == 0x0A && p1 == (p2 - 1)) {
0229:                            continue;
0230:                        }
0231:
0232:                        // Rule (5).   ALetter x ALetter
0233:                        if (fALetterSet.contains(c1)
0234:                                && fALetterSet.contains(c2)) {
0235:                            continue;
0236:                        }
0237:
0238:                        // Rule (6)  ALetter  x  MidLetter  ALetter
0239:                        //
0240:                        if (fALetterSet.contains(c1)
0241:                                && fMidLetterSet.contains(c2)
0242:                                && setContains(fALetterSet, c3)) {
0243:                            continue;
0244:                        }
0245:
0246:                        // Rule (7)  ALetter MidLetter   x  ALetter
0247:                        if (fALetterSet.contains(c0)
0248:                                && fMidLetterSet.contains(c1)
0249:                                && fALetterSet.contains(c2)) {
0250:                            continue;
0251:                        }
0252:
0253:                        //  Rule (8)    Numeric x Numeric
0254:                        if (fNumericSet.contains(c1)
0255:                                && fNumericSet.contains(c2)) {
0256:                            continue;
0257:                        }
0258:
0259:                        // Rule (9)    ALetter x Numeric
0260:                        if (fALetterSet.contains(c1)
0261:                                && fNumericSet.contains(c2)) {
0262:                            continue;
0263:                        }
0264:
0265:                        // Rule (10)    Numeric x ALetter
0266:                        if (fNumericSet.contains(c1)
0267:                                && fALetterSet.contains(c2)) {
0268:                            continue;
0269:                        }
0270:
0271:                        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
                             // NOTE(review): only fMidNumSet is consulted here and in rule 12; this
                             // class declares no separate MidNumLet set.
0272:                        if (fNumericSet.contains(c0) && fMidNumSet.contains(c1)
0273:                                && fNumericSet.contains(c2)) {
0274:                            continue;
0275:                        }
0276:
0277:                        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
0278:                        if (fNumericSet.contains(c1) && fMidNumSet.contains(c2)
0279:                                && setContains(fNumericSet, c3)) {
0280:                            continue;
0281:                        }
0282:
0283:                        // Rule (13)  Katakana x Katakana
0284:                        if (fKatakanaSet.contains(c1)
0285:                                && fKatakanaSet.contains(c2)) {
0286:                            continue;
0287:                        }
0288:
0289:                        // Rule 13a  (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
0290:                        if ((fALetterSet.contains(c1)
0291:                                || fNumericSet.contains(c1)
0292:                                || fKatakanaSet.contains(c1) || fExtendNumLetSet
0293:                                .contains(c1))
0294:                                && fExtendNumLetSet.contains(c2)) {
0295:                            continue;
0296:                        }
0297:                        // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
0298:                        if (fExtendNumLetSet.contains(c1)
0299:                                && (fALetterSet.contains(c2)
0300:                                        || fNumericSet.contains(c2)
0301:                                        || fKatakanaSet.contains(c2) || fExtendNumLetSet
0302:                                        .contains(c2))) {
0303:                            continue;
0304:                        }
0305:
0306:                        // Rule 14.  No rule suppressed the break, so a break is found here.
0307:                        break;
0308:                    }
0309:
                         // Both exits from the loop leave the break position in p2.
0310:                    breakPos = p2;
0311:                    return breakPos;
0312:                }
0313:
0314:            }
0315:
0316:            static class RBBILineMonkey extends RBBIMonkeyKind {
0317:
0318:                List fSets; // filled by the constructor with most of the sets below (fXX is merged into fAL instead)
0319:
                     // One UnicodeSet per Line_Break property value.  The constructor builds each
                     // one from the property value named by the field's two-letter suffix, except
                     // where a comment below says otherwise.
0320:                UnicodeSet fBK;
0321:                UnicodeSet fCR;
0322:                UnicodeSet fLF;
0323:                UnicodeSet fCM;
0324:                UnicodeSet fNL;
0325:                UnicodeSet fSG; // built from the surrogate range U+D800-U+DFFF, not from a property
0326:                UnicodeSet fWJ;
0327:                UnicodeSet fZW;
0328:                UnicodeSet fGL;
0329:                UnicodeSet fCB;
0330:                UnicodeSet fSP;
0331:                UnicodeSet fB2;
0332:                UnicodeSet fBA;
0333:                UnicodeSet fBB;
0334:                UnicodeSet fHY;
0335:                UnicodeSet fCL;
0336:                UnicodeSet fEX;
0337:                UnicodeSet fIN;
0338:                UnicodeSet fNS;
0339:                UnicodeSet fOP;
0340:                UnicodeSet fQU;
0341:                UnicodeSet fIS;
0342:                UnicodeSet fNU;
0343:                UnicodeSet fPO;
0344:                UnicodeSet fPR;
0345:                UnicodeSet fSY;
0346:                UnicodeSet fAI;
0347:                UnicodeSet fAL; // the constructor also adds fXX, fAI, fSA and fSG into this set
0348:                UnicodeSet fID;
0349:                UnicodeSet fSA;
0350:                UnicodeSet fJL;
0351:                UnicodeSet fJV;
0352:                UnicodeSet fJT;
0353:                UnicodeSet fH2;
0354:                UnicodeSet fH3;
0355:                UnicodeSet fXX;
0356:
0357:                StringBuffer fText; // text under test; assigned by setText()
0358:                int fOrigPositions; // NOTE(review): not referenced in the visible part of this class - confirm it is used
0359:
0360:                RBBILineMonkey() {
0361:                    // Build one set per Line_Break property value, plus fSG for the
0362:                    // surrogate code unit range.
0363:                    fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
0364:                    fCR = new UnicodeSet("[\\p{Line_break=CR}]");
0365:                    fLF = new UnicodeSet("[\\p{Line_break=LF}]");
0366:                    fCM = new UnicodeSet("[\\p{Line_break=CM}]");
0367:                    fNL = new UnicodeSet("[\\p{Line_break=NL}]");
0368:                    fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
0369:                    fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
0370:                    fGL = new UnicodeSet("[\\p{Line_break=GL}]");
0371:                    fCB = new UnicodeSet("[\\p{Line_break=CB}]");
0372:                    fSP = new UnicodeSet("[\\p{Line_break=SP}]");
0373:                    fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
0374:                    fBA = new UnicodeSet("[\\p{Line_break=BA}]");
0375:                    fBB = new UnicodeSet("[\\p{Line_break=BB}]");
0376:                    fHY = new UnicodeSet("[\\p{Line_break=HY}]");
0377:                    fCL = new UnicodeSet("[\\p{Line_break=CL}]");
0378:                    fEX = new UnicodeSet("[\\p{Line_break=EX}]");
0379:                    fIN = new UnicodeSet("[\\p{Line_break=IN}]");
0380:                    fNS = new UnicodeSet("[\\p{Line_break=NS}]");
0381:                    fOP = new UnicodeSet("[\\p{Line_break=OP}]");
0382:                    fQU = new UnicodeSet("[\\p{Line_break=QU}]");
0383:                    fIS = new UnicodeSet("[\\p{Line_break=IS}]");
0384:                    fNU = new UnicodeSet("[\\p{Line_break=NU}]");
0385:                    fPO = new UnicodeSet("[\\p{Line_break=PO}]");
0386:                    fPR = new UnicodeSet("[\\p{Line_break=PR}]");
0387:                    fSY = new UnicodeSet("[\\p{Line_break=SY}]");
0388:                    fAI = new UnicodeSet("[\\p{Line_break=AI}]");
0389:                    fAL = new UnicodeSet("[\\p{Line_break=AL}]");
0390:                    fID = new UnicodeSet("[\\p{Line_break=ID}]");
0391:                    fSA = new UnicodeSet("[\\p{Line_break=SA}]");
0392:                    fJL = new UnicodeSet("[\\p{Line_break=JL}]");
0393:                    fJV = new UnicodeSet("[\\p{Line_break=JV}]");
0394:                    fJT = new UnicodeSet("[\\p{Line_break=JT}]");
0395:                    fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
0396:                    fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
0397:                    fSG = new UnicodeSet("[\\ud800-\\udfff]");
0398:                    fXX = new UnicodeSet("[\\p{Line_break=XX}]");
0399:
0400:                    // XX, AI, SA and SG (unpaired surrogates) all default to AL behavior,
0401:                    // so their members are folded into fAL.
0402:                    fAL.addAll(fXX);
0403:                    fAL.addAll(fAI);
0404:                    fAL.addAll(fSA);
0405:                    fAL.addAll(fSG);
0406:
0407:                    // Contents of fSets, in order.  fXX has no entry of its own, and fWJ
0408:                    // occurs twice.
0409:                    // NOTE(review): the second fWJ looks accidental; kept so the list is unchanged.
0410:                    UnicodeSet[] inListOrder = {
0411:                        fBK, fCR, fLF, fCM, fNL, fWJ, fZW, fGL, fCB, fSP,
0412:                        fB2, fBA, fBB, fHY, fH2, fH3, fCL, fEX, fIN, fJL,
0413:                        fJT, fJV, fNS, fOP, fQU, fIS, fNU, fPO, fPR, fSY,
0414:                        fAI, fAL, fID, fWJ, fSA, fSG
0415:                    };
0416:                    fSets = new ArrayList();
0417:                    for (int k = 0; k < inListOrder.length; k++) {
0418:                        fSets.add(inListOrder[k]);
0419:                    }
0442:                }
0443:
0444:                void setText(StringBuffer s) {
0445:                    this.fText = s; // keeps a reference to the caller's buffer; no copy is made
0446:                }
0447:
0448:                int next(int startPos) {
0449:                    int pos; //  Index of the char following a potential break position
0450:                    int this Char; //  Character at above position "pos"
0451:
0452:                    int prevPos; //  Index of the char preceding a potential break position
0453:                    int prevChar; //  Character at above position.  Note that prevChar
0454:                    //   and thisChar may not be adjacent because combining
0455:                    //   characters between them will be ignored.
0456:
0457:                    int nextPos; //  Index of the next character following pos.
0458:                    //     Usually skips over combining marks.
0459:                    int tPos; //  temp value.
0460:                    int c;
0461:                    int matchVals[] = null; // Number  Expression Match Results
0462:
0463:                    if (startPos >= fText.length()) {
0464:                        return -1;
0465:                    }
0466:
0467:                    // Initial values for loop.  Loop will run the first time without finding breaks,
0468:                    //                           while the invalid values shift out and the "this" and
0469:                    //                           "prev" positions are filled in with good values.
0470:                    pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
0471:                    this Char = prevChar = 0;
0472:                    nextPos = startPos;
0473:
0474:                    // Loop runs once per position in the test text, until a break position
0475:                    //  is found.  In each iteration, we are testing for a possible break
0476:                    //  just preceding the character at index "pos".  The character preceding
0477:                    //  this char is at postion "prevPos"; because of combining sequences,
0478:                    //  "prevPos" can be arbitrarily far before "pos".
0479:                    for (;;) {
0480:                        // Advance to the next position to be tested.
0481:                        prevPos = pos;
0482:                        prevChar = this Char;
0483:                        pos = nextPos;
0484:                        nextPos = moveIndex32(fText, pos, 1);
0485:
0486:                        // Rule LB2 - Break at end of text.
0487:                        if (pos >= fText.length()) {
0488:                            break;
0489:                        }
0490:
0491:                        // Rule LB 9 - adjust for combining sequences.
0492:                        //             We do this rule out-of-order because the adjustment does
0493:                        //             not effect the way that rules LB 3 through LB 6 match,
0494:                        //             and doing it here rather than after LB 6 is substantially
0495:                        //             simpler when combining sequences do occur.
0496:
0497:                        // LB 9         Keep combining sequences together.
0498:                        //              advance over any CM class chars at "pos", 
0499:                        //              result is "nextPos" for the following loop iteration.
0500:                        this Char = UTF16.charAt(fText, pos);
0501:                        if (!(fSP.contains(this Char) || fBK.contains(this Char)
0502:                                || this Char == 0x0d || this Char == 0x0a
0503:                                || fNL.contains(this Char) || fZW
0504:                                .contains(this Char))) {
0505:                            for (;;) {
0506:                                if (nextPos == fText.length()) {
0507:                                    break;
0508:                                }
0509:                                int nextChar = UTF16.charAt(fText, nextPos);
0510:                                if (!fCM.contains(nextChar)) {
0511:                                    break;
0512:                                }
0513:                                nextPos = moveIndex32(fText, nextPos, 1);
0514:                            }
0515:                        }
0516:
0517:                        // LB 9 Treat X CM* as if it were X
0518:                        //        No explicit action required.
0519:
0520:                        // LB 10     Treat any remaining combining mark as AL
0521:                        if (fCM.contains(this Char)) {
0522:                            this Char = 'A';
0523:                        }
0524:
0525:                        // If the loop is still warming up - if we haven't shifted the initial
0526:                        //   -1 positions out of prevPos yet - loop back to advance the
0527:                        //    position in the input without any further looking for breaks.
0528:                        if (prevPos == -1) {
0529:                            continue;
0530:                        }
0531:
0532:                        // LB 4  Always break after hard line breaks,
0533:                        if (fBK.contains(prevChar)) {
0534:                            break;
0535:                        }
0536:
0537:                        // LB 5  Break after CR, LF, NL, but not inside CR LF
0538:                        if (fCR.contains(prevChar) && fLF.contains(this Char)) {
0539:                            continue;
0540:                        }
0541:                        if (fCR.contains(prevChar) || fLF.contains(prevChar)
0542:                                || fNL.contains(prevChar)) {
0543:                            break;
0544:                        }
0545:
0546:                        // LB 6  Don't break before hard line breaks
0547:                        if (fBK.contains(this Char) || fCR.contains(this Char)
0548:                                || fLF.contains(this Char)
0549:                                || fNL.contains(this Char)) {
0550:                            continue;
0551:                        }
0552:
0553:                        // LB 7  Don't break before spaces or zero-width space.
0554:                        if (fSP.contains(this Char)) {
0555:                            continue;
0556:                        }
0557:
0558:                        if (fZW.contains(this Char)) {
0559:                            continue;
0560:                        }
0561:
0562:                        // LB 8  Break after zero width space
0563:                        if (fZW.contains(prevChar)) {
0564:                            break;
0565:                        }
0566:
0567:                        //  LB 9, 10  Already done, at top of loop.
0568:                        //
0569:
0570:                        // LB 11
0571:                        //    x  WJ
0572:                        //    WJ  x
0573:                        if (fWJ.contains(this Char) || fWJ.contains(prevChar)) {
0574:                            continue;
0575:                        }
0576:
0577:                        // LB 12
0578:                        //        (!SP) x GL
0579:                        //        GL x
0580:                        if ((!fSP.contains(prevChar)) && fGL.contains(this Char)
0581:                                || fGL.contains(prevChar)) {
0582:                            continue;
0583:                        }
0584:
0585:                        // LB 13  Don't break before closings.
0586:                        //       NU x CL  and NU x IS are not matched here so that they will
0587:                        //       fall into LB 17 and the more general number regular expression.
0588:                        //
0589:                        if (!fNU.contains(prevChar) && fCL.contains(this Char)
0590:                                || fEX.contains(this Char)
0591:                                || !fNU.contains(prevChar)
0592:                                && fIS.contains(this Char)
0593:                                || !fNU.contains(prevChar)
0594:                                && fSY.contains(this Char)) {
0595:                            continue;
0596:                        }
0597:
0598:                        // LB 14  Don't break after OP SP*
0599:                        //       Scan backwards, checking for this sequence.
0600:                        //       The OP char could include combining marks, so we acually check for
0601:                        //           OP CM* SP* x
0602:                        tPos = prevPos;
0603:                        if (fSP.contains(prevChar)) {
0604:                            while (tPos > 0
0605:                                    && fSP.contains(UTF16.charAt(fText, tPos))) {
0606:                                tPos = moveIndex32(fText, tPos, -1);
0607:                            }
0608:                        }
0609:                        while (tPos > 0
0610:                                && fCM.contains(UTF16.charAt(fText, tPos))) {
0611:                            tPos = moveIndex32(fText, tPos, -1);
0612:                        }
0613:                        if (fOP.contains(UTF16.charAt(fText, tPos))) {
0614:                            continue;
0615:                        }
0616:
0617:                        // LB 15 Do not break within "[
0618:                        //       QU CM* SP* x OP
0619:                        if (fOP.contains(this Char)) {
0620:                            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
0621:                            tPos = prevPos;
0622:                            while (tPos > 0
0623:                                    && fSP.contains(UTF16.charAt(fText, tPos))) {
0624:                                tPos = moveIndex32(fText, tPos, -1);
0625:                            }
0626:                            while (tPos > 0
0627:                                    && fCM.contains(UTF16.charAt(fText, tPos))) {
0628:                                tPos = moveIndex32(fText, tPos, -1);
0629:                            }
0630:                            if (fQU.contains(UTF16.charAt(fText, tPos))) {
0631:                                continue;
0632:                            }
0633:                        }
0634:
0635:                        // LB 16   CL SP* x NS
0636:                        if (fNS.contains(this Char)) {
0637:                            tPos = prevPos;
0638:                            while (tPos > 0
0639:                                    && fSP.contains(UTF16.charAt(fText, tPos))) {
0640:                                tPos = moveIndex32(fText, tPos, -1);
0641:                            }
0642:                            while (tPos > 0
0643:                                    && fCM.contains(UTF16.charAt(fText, tPos))) {
0644:                                tPos = moveIndex32(fText, tPos, -1);
0645:                            }
0646:                            if (fCL.contains(UTF16.charAt(fText, tPos))) {
0647:                                continue;
0648:                            }
0649:                        }
0650:
0651:                        // LB 17        B2 SP* x B2
0652:                        if (fB2.contains(this Char)) {
0653:                            tPos = prevPos;
0654:                            while (tPos > 0
0655:                                    && fSP.contains(UTF16.charAt(fText, tPos))) {
0656:                                tPos = moveIndex32(fText, tPos, -1);
0657:                            }
0658:                            while (tPos > 0
0659:                                    && fCM.contains(UTF16.charAt(fText, tPos))) {
0660:                                tPos = moveIndex32(fText, tPos, -1);
0661:                            }
0662:                            if (fB2.contains(UTF16.charAt(fText, tPos))) {
0663:                                continue;
0664:                            }
0665:                        }
0666:
0667:                        // LB 18    break after space
0668:                        if (fSP.contains(prevChar)) {
0669:                            break;
0670:                        }
0671:
0672:                        // LB 19
0673:                        //    x   QU
0674:                        //    QU  x
0675:                        if (fQU.contains(this Char) || fQU.contains(prevChar)) {
0676:                            continue;
0677:                        }
0678:
0679:                        // LB 20  Break around a CB
0680:                        if (fCB.contains(this Char) || fCB.contains(prevChar)) {
0681:                            break;
0682:                        }
0683:
0684:                        // LB 21
0685:                        if (fBA.contains(this Char) || fHY.contains(this Char)
0686:                                || fNS.contains(this Char)
0687:                                || fBB.contains(prevChar)) {
0688:                            continue;
0689:                        }
0690:
0691:                        // LB 22
0692:                        if (fAL.contains(prevChar) && fIN.contains(this Char)
0693:                                || fID.contains(prevChar)
0694:                                && fIN.contains(this Char)
0695:                                || fIN.contains(prevChar)
0696:                                && fIN.contains(this Char)
0697:                                || fNU.contains(prevChar)
0698:                                && fIN.contains(this Char)) {
0699:                            continue;
0700:                        }
0701:
0702:                        // LB 23    ID x PO    (Note:  Leading CM behaves like ID)
0703:                        //          AL x NU
0704:                        //          NU x AL
0705:                        if (fID.contains(prevChar) && fPO.contains(this Char)
0706:                                || fAL.contains(prevChar)
0707:                                && fNU.contains(this Char)
0708:                                || fNU.contains(prevChar)
0709:                                && fAL.contains(this Char)) {
0710:                            continue;
0711:                        }
0712:
0713:                        // LB 24  Do not break between prefix and letters or ideographs.
0714:                        //        PR x ID
0715:                        //        PR x AL
0716:                        //        PO x AL
0717:                        if (fPR.contains(prevChar) && fID.contains(this Char)
0718:                                || fPR.contains(prevChar)
0719:                                && fAL.contains(this Char)
0720:                                || fPO.contains(prevChar)
0721:                                && fAL.contains(this Char)) {
0722:                            continue;
0723:                        }
0724:
0725:                        // LB 25    Numbers
0726:                        matchVals = LBNumberCheck(fText, prevPos, matchVals);
0727:                        if (matchVals[0] != -1) {
0728:                            // Matched a number.  But could have been just a single digit, which would
0729:                            //    not represent a "no break here" between prevChar and thisChar
0730:                            int numEndIdx = matchVals[1]; // idx of first char following num
0731:                            if (numEndIdx > pos) {
0732:                                // Number match includes at least the two chars being checked
0733:                                if (numEndIdx > nextPos) {
0734:                                    // Number match includes additional chars.  Update pos and nextPos
0735:                                    //   so that next loop iteration will continue at the end of the number,
0736:                                    //   checking for breaks between last char in number & whatever follows.
0737:                                    nextPos = numEndIdx;
0738:                                    pos = numEndIdx;
0739:                                    do {
0740:                                        pos = moveIndex32(fText, pos, -1);
0741:                                        this Char = UTF16.charAt(fText, pos);
0742:                                    } while (fCM.contains(this Char));
0743:                                }
0744:                                continue;
0745:                            }
0746:                        }
0747:
0748:                        // LB 26  Do not break Korean Syllables
0749:                        if (fJL.contains(prevChar)
0750:                                && (fJL.contains(this Char)
0751:                                        || fJV.contains(this Char)
0752:                                        || fH2.contains(this Char) || fH3
0753:                                        .contains(this Char))) {
0754:                            continue;
0755:                        }
0756:
0757:                        if ((fJV.contains(prevChar) || fH2.contains(prevChar))
0758:                                && (fJV.contains(this Char) || fJT
0759:                                        .contains(this Char))) {
0760:                            continue;
0761:                        }
0762:
0763:                        if ((fJT.contains(prevChar) || fH3.contains(prevChar))
0764:                                && fJT.contains(this Char)) {
0765:                            continue;
0766:                        }
0767:
0768:                        // LB 27 Treat a Korean Syllable Block the same as ID
0769:                        if ((fJL.contains(prevChar) || fJV.contains(prevChar)
0770:                                || fJT.contains(prevChar)
0771:                                || fH2.contains(prevChar) || fH3
0772:                                .contains(prevChar))
0773:                                && fIN.contains(this Char)) {
0774:                            continue;
0775:                        }
0776:                        if ((fJL.contains(prevChar) || fJV.contains(prevChar)
0777:                                || fJT.contains(prevChar)
0778:                                || fH2.contains(prevChar) || fH3
0779:                                .contains(prevChar))
0780:                                && fPO.contains(this Char)) {
0781:                            continue;
0782:                        }
0783:                        if (fPR.contains(prevChar)
0784:                                && (fJL.contains(this Char)
0785:                                        || fJV.contains(this Char)
0786:                                        || fJT.contains(this Char)
0787:                                        || fH2.contains(this Char) || fH3
0788:                                        .contains(this Char))) {
0789:                            continue;
0790:                        }
0791:
0792:                        // LB 28 Do not break between alphabetics
0793:                        if (fAL.contains(prevChar) && fAL.contains(this Char)) {
0794:                            continue;
0795:                        }
0796:
0797:                        // LB 29  Do not break between numeric punctuation and alphabetics
0798:                        if (fIS.contains(prevChar) && fAL.contains(this Char)) {
0799:                            continue;
0800:                        }
0801:
0802:                        // LB 30  Do not break between letters, numbers or ordinary symbols and
0803:                        //        opening or closing punctuation.
0804:                        //        (AL | NU) x OP
0805:                        //        CL x (AL | NU)
0806:                        if ((fAL.contains(prevChar) || fNU.contains(prevChar))
0807:                                && fOP.contains(this Char)) {
0808:                            continue;
0809:                        }
0810:                        if (fCL.contains(prevChar)
0811:                                && (fAL.contains(this Char) || fNU
0812:                                        .contains(this Char))) {
0813:                            continue;
0814:                        }
0815:
0816:                        // LB 31    Break everywhere else
0817:                        break;
0818:                    }
0819:
0820:                    return pos;
0821:                }
0822:
0823:                // Match the following regular expression in the input text.
0824:                //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
0825:                //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
0826:                //  retVals array  [0]  index of the start of the match, or -1 if no match
0827:                //                 [1]  index of first char following the match.
0828:                //  Can not use Java regex because need supplementary character support,
0829:                //     and because Unicode char properties version must be the same as in
0830:                //     the version of ICU being tested.
0831:                private int[] LBNumberCheck(StringBuffer s, int startIdx,
0832:                        int[] retVals) {
0833:                    if (retVals == null) {
0834:                        retVals = new int[2];
0835:                    }
0836:                    retVals[0] = -1; // Indicates no match.
0837:                    int matchState = 0;
0838:                    int idx = startIdx;
0839:
0840:                    matchLoop: for (idx = startIdx; idx < s.length(); idx = moveIndex32(
0841:                            s, idx, 1)) {
0842:                        int c = UTF16.charAt(s, idx);
0843:                        int cLBType = UCharacter.getIntPropertyValue(c,
0844:                                UProperty.LINE_BREAK);
0845:                        switch (matchState) {
0846:                        case 0:
0847:                            if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC
0848:                                    || cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0849:                                matchState = 1;
0850:                                break;
0851:                            }
0852:                            if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
0853:                                matchState = 4;
0854:                                break;
0855:                            }
0856:                            if (cLBType == UCharacter.LineBreak.HYPHEN) {
0857:                                matchState = 4;
0858:                                break;
0859:                            }
0860:                            if (cLBType == UCharacter.LineBreak.NUMERIC) {
0861:                                matchState = 7;
0862:                                break;
0863:                            }
0864:                            break matchLoop; /* No Match  */
0865:
0866:                        case 1:
0867:                            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0868:                                matchState = 1;
0869:                                break;
0870:                            }
0871:                            if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
0872:                                matchState = 4;
0873:                                break;
0874:                            }
0875:                            if (cLBType == UCharacter.LineBreak.HYPHEN) {
0876:                                matchState = 4;
0877:                                break;
0878:                            }
0879:                            if (cLBType == UCharacter.LineBreak.NUMERIC) {
0880:                                matchState = 7;
0881:                                break;
0882:                            }
0883:                            break matchLoop; /* No Match  */
0884:
0885:                        case 4:
0886:                            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0887:                                matchState = 4;
0888:                                break;
0889:                            }
0890:                            if (cLBType == UCharacter.LineBreak.NUMERIC) {
0891:                                matchState = 7;
0892:                                break;
0893:                            }
0894:                            break matchLoop; /* No Match  */
0895:                        //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
0896:                        //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
0897:
0898:                        case 7:
0899:                            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0900:                                matchState = 7;
0901:                                break;
0902:                            }
0903:                            if (cLBType == UCharacter.LineBreak.NUMERIC) {
0904:                                matchState = 7;
0905:                                break;
0906:                            }
0907:                            if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
0908:                                matchState = 7;
0909:                                break;
0910:                            }
0911:                            if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
0912:                                matchState = 7;
0913:                                break;
0914:                            }
0915:                            if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
0916:                                matchState = 9;
0917:                                break;
0918:                            }
0919:                            if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0920:                                matchState = 11;
0921:                                break;
0922:                            }
0923:                            if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
0924:                                matchState = 11;
0925:                                break;
0926:                            }
0927:
0928:                            break matchLoop; // Match Complete.
0929:                        case 9:
0930:                            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0931:                                matchState = 9;
0932:                                break;
0933:                            }
0934:                            if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
0935:                                matchState = 11;
0936:                                break;
0937:                            }
0938:                            if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
0939:                                matchState = 11;
0940:                                break;
0941:                            }
0942:                            break matchLoop; // Match Complete.
0943:                        case 11:
0944:                            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
0945:                                matchState = 11;
0946:                                break;
0947:                            }
0948:                            break matchLoop; // Match Complete.
0949:                        }
0950:                    }
0951:                    if (matchState > 4) {
0952:                        retVals[0] = startIdx;
0953:                        retVals[1] = idx;
0954:                    }
0955:                    return retVals;
0956:                }
0957:
0958:                List charClasses() {
0959:                    return fSets;
0960:                }
0961:
0962:            }
0963:
0964:            /**
0965:             * 
0966:             * Sentence Monkey Test Class
0967:             *
0968:             * 
0969:             * 
0970:             */
0971:            static class RBBISentenceMonkey extends RBBIMonkeyKind {
0972:                List fSets;
0973:                StringBuffer fText;
0974:
0975:                UnicodeSet fSepSet;
0976:                UnicodeSet fFormatSet;
0977:                UnicodeSet fSpSet;
0978:                UnicodeSet fLowerSet;
0979:                UnicodeSet fUpperSet;
0980:                UnicodeSet fOLetterSet;
0981:                UnicodeSet fNumericSet;
0982:                UnicodeSet fATermSet;
0983:                UnicodeSet fSTermSet;
0984:                UnicodeSet fCloseSet;
0985:                UnicodeSet fOtherSet;
0986:                UnicodeSet fExtendSet;
0987:
0988:                RBBISentenceMonkey() {
0989:                    fSets = new ArrayList();
0990:
0991:                    fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep}]");
0992:                    fFormatSet = new UnicodeSet(
0993:                            "[\\p{Sentence_Break = Format}]");
0994:                    fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
0995:                    fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
0996:                    fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
0997:                    fOLetterSet = new UnicodeSet(
0998:                            "[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]");
0999:                    fNumericSet = new UnicodeSet(
1000:                            "[\\p{Sentence_Break = Numeric}]");
1001:                    fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1002:                    fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1003:                    fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1004:                    fExtendSet = new UnicodeSet(
1005:                            "[\\p{Grapheme_Extend}\\uff9e\\uff9f]");
1006:                    fOtherSet = new UnicodeSet();
1007:
1008:                    fOtherSet.complement();
1009:                    fOtherSet.removeAll(fSepSet);
1010:                    fOtherSet.removeAll(fFormatSet);
1011:                    fOtherSet.removeAll(fSpSet);
1012:                    fOtherSet.removeAll(fLowerSet);
1013:                    fOtherSet.removeAll(fUpperSet);
1014:                    fOtherSet.removeAll(fOLetterSet);
1015:                    fOtherSet.removeAll(fNumericSet);
1016:                    fOtherSet.removeAll(fATermSet);
1017:                    fOtherSet.removeAll(fSTermSet);
1018:                    fOtherSet.removeAll(fCloseSet);
1019:                    fOtherSet.removeAll(fExtendSet);
1020:
1021:                    fSets.add(fSepSet);
1022:                    fSets.add(fFormatSet);
1023:
1024:                    fSets.add(fSpSet);
1025:                    fSets.add(fLowerSet);
1026:                    fSets.add(fUpperSet);
1027:                    fSets.add(fOLetterSet);
1028:                    fSets.add(fNumericSet);
1029:                    fSets.add(fATermSet);
1030:                    fSets.add(fSTermSet);
1031:                    fSets.add(fCloseSet);
1032:                    fSets.add(fOtherSet);
1033:                    fSets.add(fExtendSet);
1034:                }
1035:
1036:                List charClasses() {
1037:                    return fSets;
1038:                }
1039:
1040:                void setText(StringBuffer s) {
1041:                    fText = s;
1042:                }
1043:
1044:                //      moveBack()   Find the "significant" code point preceding the index i.
1045:                //      Skips over ($Extend | $Format)*
1046:                // 
1047:                private int moveBack(int i) {
1048:
1049:                    if (i <= 0) {
1050:                        return -1;
1051:                    }
1052:
1053:                    int c;
1054:                    int j = i;
1055:                    do {
1056:                        j = moveIndex32(fText, j, -1);
1057:                        c = UTF16.charAt(fText, j);
1058:                    } while (j > 0
1059:                            && (fFormatSet.contains(c) || fExtendSet
1060:                                    .contains(c)));
1061:                    return j;
1062:                }
1063:
1064:                int moveForward(int i) {
1065:                    if (i >= fText.length()) {
1066:                        return fText.length();
1067:                    }
1068:                    int c;
1069:                    int j = i;
1070:                    do {
1071:                        j = moveIndex32(fText, j, 1);
1072:                        c = cAt(j);
1073:                    } while (c >= 0
1074:                            && (fFormatSet.contains(c) || fExtendSet
1075:                                    .contains(c)));
1076:                    return j;
1077:
1078:                }
1079:
1080:                int cAt(int pos) {
1081:                    if (pos < 0 || pos >= fText.length()) {
1082:                        return -1;
1083:                    }
1084:                    return UTF16.charAt(fText, pos);
1085:                }
1086:
1087:                int next(int prevPos) {
1088:                    int p0, p1, p2, p3; // Indices of the significant code points around the 
1089:                    //   break position being tested.  The candidate break
1090:                    //   location is before p2.
1091:                    int breakPos = -1;
1092:
1093:                    int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1094:                    int c;
1095:
1096:                    // Prev break at end of string.  return DONE.
1097:                    if (prevPos >= fText.length()) {
1098:                        return -1;
1099:                    }
1100:                    p0 = p1 = p2 = p3 = prevPos;
1101:                    c3 = UTF16.charAt(fText, prevPos);
1102:                    c0 = c1 = c2 = 0;
1103:
1104:                    // Loop runs once per "significant" character position in the input text.
1105:                    for (;;) {
1106:                        // Move all of the positions forward in the input string.
1107:                        p0 = p1;
1108:                        c0 = c1;
1109:                        p1 = p2;
1110:                        c1 = c2;
1111:                        p2 = p3;
1112:                        c2 = c3;
1113:
1114:                        // Advancd p3 by  X(Extend | Format)*   Rule 4
1115:                        p3 = moveForward(p3);
1116:                        c3 = cAt(p3);
1117:
1118:                        // Rule (3) CR x LF
1119:                        if (c1 == 0x0d && c2 == 0x0a && p2 == (p1 + 1)) {
1120:                            continue;
1121:                        }
1122:
1123:                        // Rule (4)    Sep  <break>
1124:                        if (fSepSet.contains(c1)) {
1125:                            p2 = p1 + 1; // Separators don't combine with Extend or Format
1126:                            break;
1127:                        }
1128:
1129:                        if (p2 >= fText.length()) {
1130:                            // Reached end of string.  Always a break position.
1131:                            break;
1132:                        }
1133:
1134:                        if (p2 == prevPos) {
1135:                            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1136:                            continue;
1137:                        }
1138:
1139:                        // Rule (6).   ATerm x Numeric
1140:                        if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
1141:                            continue;
1142:                        }
1143:
1144:                        // Rule (7).  Upper ATerm  x  Uppper
1145:                        if (fUpperSet.contains(c0) && fATermSet.contains(c1)
1146:                                && fUpperSet.contains(c2)) {
1147:                            continue;
1148:                        }
1149:
1150:                        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
1151:                        //           Note:  Sterm | ATerm are added to the negated part of the expression by a 
1152:                        //                  note to the Unicode 5.0 documents.
1153:                        int p8 = p1;
1154:                        while (p8 > 0 && fSpSet.contains(cAt(p8))) {
1155:                            p8 = moveBack(p8);
1156:                        }
1157:                        while (p8 > 0 && fCloseSet.contains(cAt(p8))) {
1158:                            p8 = moveBack(p8);
1159:                        }
1160:                        if (fATermSet.contains(cAt(p8))) {
1161:                            p8 = p2;
1162:                            for (;;) {
1163:                                c = cAt(p8);
1164:                                if (c == -1 || fOLetterSet.contains(c)
1165:                                        || fUpperSet.contains(c)
1166:                                        || fLowerSet.contains(c)
1167:                                        || fSepSet.contains(c)
1168:                                        || fATermSet.contains(c)
1169:                                        || fSTermSet.contains(c)) {
1170:                                    break;
1171:                                }
1172:                                p8 = moveForward(p8);
1173:                            }
1174:                            if (p8 < fText.length()
1175:                                    && fLowerSet.contains(cAt(p8))) {
1176:                                continue;
1177:                            }
1178:                        }
1179:
1180:                        // Rule 8a  (STerm | ATerm) Close* Sp* x (Sterm | ATerm)
1181:                        if (fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1182:                            p8 = p1;
1183:                            while (setContains(fSpSet, cAt(p8))) {
1184:                                p8 = moveBack(p8);
1185:                            }
1186:                            while (setContains(fCloseSet, cAt(p8))) {
1187:                                p8 = moveBack(p8);
1188:                            }
1189:                            c = cAt(p8);
1190:                            if (setContains(fSTermSet, c)
1191:                                    || setContains(fATermSet, c)) {
1192:                                continue;
1193:                            }
1194:                        }
1195:
1196:                        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep)
1197:                        int p9 = p1;
1198:                        while (p9 > 0 && fCloseSet.contains(cAt(p9))) {
1199:                            p9 = moveBack(p9);
1200:                        }
1201:                        c = cAt(p9);
1202:                        if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1203:                            if (fCloseSet.contains(c2) || fSpSet.contains(c2)
1204:                                    || fSepSet.contains(c2)) {
1205:                                continue;
1206:                            }
1207:                        }
1208:
1209:                        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep)
1210:                        int p10 = p1;
1211:                        while (p10 > 0 && fSpSet.contains(cAt(p10))) {
1212:                            p10 = moveBack(p10);
1213:                        }
1214:                        while (p10 > 0 && fCloseSet.contains(cAt(p10))) {
1215:                            p10 = moveBack(p10);
1216:                        }
1217:                        if (fSTermSet.contains(cAt(p10))
1218:                                || fATermSet.contains(cAt(p10))) {
1219:                            if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1220:                                continue;
1221:                            }
1222:                        }
1223:
1224:                        // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
1225:                        int p11 = p1;
1226:                        while (p11 > 0 && fSpSet.contains(cAt(p11))) {
1227:                            p11 = moveBack(p11);
1228:                        }
1229:                        while (p11 > 0 && fCloseSet.contains(cAt(p11))) {
1230:                            p11 = moveBack(p11);
1231:                        }
1232:                        if (fSTermSet.contains(cAt(p11))
1233:                                || fATermSet.contains(cAt(p11))) {
1234:                            break;
1235:                        }
1236:
1237:                        //  Rule (12)  Any x Any
1238:                        continue;
1239:                    }
1240:                    breakPos = p2;
1241:                    return breakPos;
1242:                }
1243:
1244:            }
1245:
1246:            /**
1247:             * Move an index into a string by n code points.
1248:             *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1249:             *   complicating usage.
1250:             * @param s   a Text string
1251:             * @param pos The starting code unit index into the text string
1252:             * @param amt The amount to adjust the string by.
1253:             * @return    The adjusted code unit index, pinned to the string's length, or
1254:             *            unchanged if input index was outside of the string.
1255:             */
1256:            static int moveIndex32(StringBuffer s, int pos, int amt) {
1257:                int i;
1258:                char c;
1259:                if (amt > 0) {
1260:                    for (i = 0; i < amt; i++) {
1261:                        if (pos >= s.length()) {
1262:                            return s.length();
1263:                        }
1264:                        c = s.charAt(pos);
1265:                        pos++;
1266:                        if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1267:                            c = s.charAt(pos);
1268:                            if (UTF16.isTrailSurrogate(c)) {
1269:                                pos++;
1270:                            }
1271:                        }
1272:                    }
1273:                } else {
1274:                    for (i = 0; i > amt; i--) {
1275:                        if (pos <= 0) {
1276:                            return 0;
1277:                        }
1278:                        pos--;
1279:                        c = s.charAt(pos);
1280:                        if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1281:                            c = s.charAt(pos);
1282:                            if (UTF16.isLeadSurrogate(c)) {
1283:                                pos--;
1284:                            }
1285:                        }
1286:                    }
1287:                }
1288:                return pos;
1289:            }
1290:
1291:            /**
1292:             * No-exceptions form of UnicodeSet.contains(c).
1293:             *    Simplifies loops that terminate with an end-of-input character value.
1294:             * @param s  A unicode set
1295:             * @param c  A code point value
1296:             * @return   true if the set contains c.
1297:             */
1298:            static boolean setContains(UnicodeSet s, int c) {
1299:                if (c < 0 || c > UTF16.CODEPOINT_MAX_VALUE) {
1300:                    return false;
1301:                }
1302:                return s.contains(c);
1303:            }
1304:
1305:            /**
1306:             * return the index of the next code point in the input text.
1307:             * @param i the preceding index
1308:             * @return
1309:             * @internal
1310:             */
1311:            static int nextCP(StringBuffer s, int i) {
1312:                if (i == -1) {
1313:                    // End of Input indication.  Continue to return end value.
1314:                    return -1;
1315:                }
1316:                int retVal = i + 1;
1317:                if (retVal > s.length()) {
1318:                    return -1;
1319:                }
1320:                int c = UTF16.charAt(s, i);
1321:                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
1322:                        && UTF16.isLeadSurrogate(s.charAt(i))) {
1323:                    retVal++;
1324:                }
1325:                return retVal;
1326:            }
1327:
1328:            //
1329:            //  The following UnicodeSets are used in matching a Grapheme Cluster
1330:            //
1331:            private static UnicodeSet GC_Control;
1332:
1333:            private static UnicodeSet GC_Extend;
1334:
1335:            private static UnicodeSet GC_L;
1336:
1337:            private static UnicodeSet GC_V;
1338:
1339:            private static UnicodeSet GC_T;
1340:
1341:            private static UnicodeSet GC_LV;
1342:
1343:            private static UnicodeSet GC_LVT;
1344:
1345:            protected void init() throws Exception {
1346:                GC_Control = new UnicodeSet(
1347:                        "[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[\\p{Grapheme_Cluster_Break=Extend}]]");
1348:
1349:                GC_Extend = new UnicodeSet(
1350:                        "[\\p{Grapheme_Cluster_Break=Extend}]");
1351:
1352:                GC_L = new UnicodeSet("[[:Hangul_Syllable_Type=L:]]");
1353:
1354:                GC_V = new UnicodeSet("[[:Hangul_Syllable_Type=V:]]");
1355:
1356:                GC_T = new UnicodeSet("[[:Hangul_Syllable_Type=T:]]");
1357:
1358:                GC_LV = new UnicodeSet("[[:Hangul_Syllable_Type=LV:]]");
1359:
1360:                GC_LVT = new UnicodeSet("[[:Hangul_Syllable_Type=LVT:]]");
1361:            }
1362:
1363:            /**
1364:             * Find the end of the extent of a grapheme cluster.
1365:             * This is the reference implementation used by the monkey test for comparison
1366:             * with the RBBI results.
1367:             * @param s  The string containing the text to be analyzed  
1368:             * @param i  The index of the start of the grapheme cluster.
1369:             * @return   The index of the first code point following the grapheme cluster
1370:             * @internal
1371:             */
1372:            private static int nextGC(StringBuffer s, int i) {
1373:                if (i >= s.length() || i == -1) {
1374:                    return -1;
1375:                }
1376:
1377:                int c = UTF16.charAt(s, i);
1378:                int pos = i;
1379:
1380:                if (c == 0x0d) {
1381:                    pos = nextCP(s, i);
1382:                    if (pos >= s.length()) {
1383:                        return pos;
1384:                    }
1385:                    c = UTF16.charAt(s, pos);
1386:                    if (c == 0x0a) {
1387:                        pos = nextCP(s, pos);
1388:                    }
1389:                    return pos;
1390:                }
1391:
1392:                if (GC_Control.contains(c) || c == 0x0a) {
1393:                    pos = nextCP(s, pos);
1394:                    return pos;
1395:                }
1396:
1397:                // Little state machine to consume Hangul Syllables
1398:                int hangulState = 1;
1399:                state_loop: for (;;) {
1400:                    switch (hangulState) {
1401:                    case 1:
1402:                        if (GC_L.contains(c)) {
1403:                            hangulState = 2;
1404:                            break;
1405:                        }
1406:                        if (GC_V.contains(c) || GC_LV.contains(c)) {
1407:                            hangulState = 3;
1408:                            break;
1409:                        }
1410:                        if (GC_T.contains(c) || GC_LVT.contains(c)) {
1411:                            hangulState = 4;
1412:                            break;
1413:                        }
1414:                        break state_loop;
1415:                    case 2:
1416:                        if (GC_L.contains(c)) {
1417:                            // continue in state 2.
1418:                            break;
1419:                        }
1420:                        if (GC_V.contains(c) || GC_LV.contains(c)) {
1421:                            hangulState = 3;
1422:                            break;
1423:                        }
1424:                        if (GC_LVT.contains(c)) {
1425:                            hangulState = 4;
1426:                            break;
1427:                        }
1428:                        if (GC_Extend.contains(c)) {
1429:                            hangulState = 5;
1430:                            break;
1431:                        }
1432:                        break state_loop;
1433:                    case 3:
1434:                        if (GC_V.contains(c)) {
1435:                            // continue in state 3;
1436:                            break;
1437:                        }
1438:                        if (GC_T.contains(c)) {
1439:                            hangulState = 4;
1440:                            break;
1441:                        }
1442:                        if (GC_Extend.contains(c)) {
1443:                            hangulState = 5;
1444:                            break;
1445:                        }
1446:                        break state_loop;
1447:                    case 4:
1448:                        if (GC_T.contains(c)) {
1449:                            // continue in state 4
1450:                            break;
1451:                        }
1452:                        if (GC_Extend.contains(c)) {
1453:                            hangulState = 5;
1454:                            break;
1455:                        }
1456:                        break state_loop;
1457:                    case 5:
1458:                        if (GC_Extend.contains(c)) {
1459:                            hangulState = 5;
1460:                            break;
1461:                        }
1462:                        break state_loop;
1463:                    }
1464:                    // We have exited the switch statement, but are still in the loop.
1465:                    // Still in a Hangul Syllable, advance to the next code point.
1466:                    pos = nextCP(s, pos);
1467:                    if (pos >= s.length()) {
1468:                        break;
1469:                    }
1470:                    c = UTF16.charAt(s, pos);
1471:                } // end of loop
1472:
1473:                if (hangulState != 1) {
1474:                    // We found a Hangul.  We're done.
1475:                    return pos;
1476:                }
1477:
1478:                // Ordinary characters.  Consume one codepoint unconditionally, then any following Extends.
1479:                for (;;) {
1480:                    pos = nextCP(s, pos);
1481:                    if (pos >= s.length()) {
1482:                        break;
1483:                    }
1484:                    c = UTF16.charAt(s, pos);
1485:                    if (GC_Extend.contains(c) == false) {
1486:                        break;
1487:                    }
1488:                }
1489:
1490:                return pos;
1491:            }
1492:
1493:            /**
1494:             * random number generator.  Not using Java's built-in Randoms for two reasons:
1495:             *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1496:             *    2.  We need to get and restore the seed from values occuring in the middle
1497:             *        of a long sequence, to more easily reproduce failing cases.
1498:             */
1499:            private static int m_seed = 1;
1500:
1501:            private static int m_rand() {
1502:                m_seed = m_seed * 1103515245 + 12345;
1503:                return (int) (m_seed >>> 16) % 32768;
1504:            }
1505:
1506:            /**
1507:             *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1508:             *    Parameters:
1509:             *       bi      - the break iterator to use
1510:             *       mk      - MonkeyKind, abstraction for obtaining expected results
1511:             *       name    - Name of test (char, word, etc.) for use in error messages
1512:             *       seed    - Seed for starting random number generator (parameter from user)
1513:             *       numIterations
1514:             */
1515:            void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name,
1516:                    int seed, int numIterations) {
1517:                int TESTSTRINGLEN = 500;
1518:                StringBuffer testText = new StringBuffer();
1519:                int numCharClasses;
1520:                List chClasses;
1521:                int[] expected = new int[TESTSTRINGLEN * 2 + 1];
1522:                int expectedCount = 0;
1523:                boolean[] expectedBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1524:                boolean[] forwardBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1525:                boolean[] reverseBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1526:                boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1527:                boolean[] followingBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1528:                boolean[] precedingBreaks = new boolean[TESTSTRINGLEN * 2 + 1];
1529:                int i;
1530:                int loopCount = 0;
1531:                boolean printTestData = false;
1532:                boolean printBreaksFromBI = false;
1533:
1534:                m_seed = seed;
1535:
1536:                numCharClasses = mk.charClasses().size();
1537:                chClasses = mk.charClasses();
1538:
1539:                // Verify that the character classes all have at least one member.
1540:                for (i = 0; i < numCharClasses; i++) {
1541:                    UnicodeSet s = (UnicodeSet) chClasses.get(i);
1542:                    if (s == null || s.size() == 0) {
1543:                        errln("Character Class " + i
1544:                                + " is null or of zero size.");
1545:                        return;
1546:                    }
1547:                }
1548:
1549:                //--------------------------------------------------------------------------------------------
1550:                //
1551:                //  Debugging settings.  Comment out everything in the following block for normal operation
1552:                //
1553:                //--------------------------------------------------------------------------------------------
1554:                // numIterations = -1;  
1555:                // RuleBasedBreakIterator_New.fTrace = true;
1556:                // m_seed = 859056465;
1557:                // TESTSTRINGLEN = 50;
1558:                // printTestData = true;
1559:                // printBreaksFromBI = true;
1560:                // ((RuleBasedBreakIterator_New)bi).dump();
1561:
1562:                //--------------------------------------------------------------------------------------------
1563:                //
1564:                //  End of Debugging settings.  
1565:                //
1566:                //--------------------------------------------------------------------------------------------
1567:
1568:                int dotsOnLine = 0;
1569:                while (loopCount < numIterations || numIterations == -1) {
1570:                    if (numIterations == -1 && loopCount % 10 == 0) {
1571:                        // If test is running in an infinite loop, display a periodic tic so
1572:                        //   we can tell that it is making progress.
1573:                        System.out.print(".");
1574:                        if (dotsOnLine++ >= 80) {
1575:                            System.out.println();
1576:                            dotsOnLine = 0;
1577:                        }
1578:                    }
1579:                    // Save current random number seed, so that we can recreate the random numbers
1580:                    //   for this loop iteration in event of an error.
1581:                    seed = m_seed;
1582:
1583:                    testText.setLength(0);
1584:                    // Populate a test string with data.
1585:                    if (printTestData) {
1586:                        System.out.println("Test Data string ...");
1587:                    }
1588:                    for (i = 0; i < TESTSTRINGLEN; i++) {
1589:                        int aClassNum = m_rand() % numCharClasses;
1590:                        UnicodeSet classSet = (UnicodeSet) chClasses
1591:                                .get(aClassNum);
1592:                        int charIdx = m_rand() % classSet.size();
1593:                        int c = classSet.charAt(charIdx);
1594:                        if (c < 0) { // TODO:  deal with sets containing strings.
1595:                            errln("c < 0");
1596:                        }
1597:                        UTF16.appendCodePoint(testText, c);
1598:                        if (printTestData) {
1599:                            System.out.print(Integer.toHexString(c) + " ");
1600:                        }
1601:                    }
1602:                    if (printTestData) {
1603:                        System.out.println();
1604:                    }
1605:
1606:                    Arrays.fill(expected, 0);
1607:                    Arrays.fill(expectedBreaks, false);
1608:                    Arrays.fill(forwardBreaks, false);
1609:                    Arrays.fill(reverseBreaks, false);
1610:                    Arrays.fill(isBoundaryBreaks, false);
1611:                    Arrays.fill(followingBreaks, false);
1612:                    Arrays.fill(precedingBreaks, false);
1613:
1614:                    // Calculate the expected results for this test string.
1615:                    mk.setText(testText);
1616:                    expectedCount = 0;
1617:                    expectedBreaks[0] = true;
1618:                    expected[expectedCount++] = 0;
1619:                    int breakPos = 0;
1620:                    int lastBreakPos = -1;
1621:                    for (;;) {
1622:                        lastBreakPos = breakPos;
1623:                        breakPos = mk.next(breakPos);
1624:                        if (breakPos == -1) {
1625:                            break;
1626:                        }
1627:                        if (breakPos > testText.length()) {
1628:                            errln("breakPos > testText.length()");
1629:                        }
1630:                        if (lastBreakPos >= breakPos) {
1631:                            errln("Next() not increasing.");
1632:                            // break;
1633:                        }
1634:                        expectedBreaks[breakPos] = true;
1635:                        expected[expectedCount++] = breakPos;
1636:                    }
1637:
1638:                    // Find the break positions using forward iteration
1639:                    if (printBreaksFromBI) {
1640:                        System.out.println("Breaks from BI...");
1641:                    }
1642:                    bi.setText(testText.toString());
1643:                    for (i = bi.first(); i != BreakIterator.DONE; i = bi.next()) {
1644:                        if (i < 0 || i > testText.length()) {
1645:                            errln(name
1646:                                    + " break monkey test: Out of range value returned by breakIterator::next()");
1647:                            break;
1648:                        }
1649:                        if (printBreaksFromBI) {
1650:                            System.out.print(Integer.toHexString(i) + " ");
1651:                        }
1652:                        forwardBreaks[i] = true;
1653:                    }
1654:                    if (printBreaksFromBI) {
1655:                        System.out.println();
1656:                    }
1657:
1658:                    // Find the break positions using reverse iteration
1659:                    for (i = bi.last(); i != BreakIterator.DONE; i = bi
1660:                            .previous()) {
1661:                        if (i < 0 || i > testText.length()) {
1662:                            errln(name
1663:                                    + " break monkey test: Out of range value returned by breakIterator.next()"
1664:                                    + name);
1665:                            break;
1666:                        }
1667:                        reverseBreaks[i] = true;
1668:                    }
1669:
1670:                    // Find the break positions using isBoundary() tests.
1671:                    for (i = 0; i <= testText.length(); i++) {
1672:                        isBoundaryBreaks[i] = bi.isBoundary(i);
1673:                    }
1674:
1675:                    // Find the break positions using the following() function.
1676:                    lastBreakPos = 0;
1677:                    followingBreaks[0] = true;
1678:                    for (i = 0; i < testText.length(); i++) {
1679:                        breakPos = bi.following(i);
1680:                        if (breakPos <= i || breakPos < lastBreakPos
1681:                                || breakPos > testText.length()
1682:                                || breakPos > lastBreakPos && lastBreakPos > i) {
1683:                            errln(name
1684:                                    + " break monkey test: "
1685:                                    + "Out of range value returned by BreakIterator::following().\n"
1686:                                    + "index=" + i + "following returned="
1687:                                    + breakPos + "lastBreak=" + lastBreakPos);
1688:                            precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1689:                        } else {
1690:                            followingBreaks[breakPos] = true;
1691:                            lastBreakPos = breakPos;
1692:                        }
1693:                    }
1694:
1695:                    // Find the break positions using the preceding() function.
1696:                    lastBreakPos = testText.length();
1697:                    precedingBreaks[testText.length()] = true;
1698:                    for (i = testText.length(); i > 0; i--) {
1699:                        breakPos = bi.preceding(i);
1700:                        if (breakPos >= i || breakPos > lastBreakPos
1701:                                || breakPos < 0 || breakPos < lastBreakPos
1702:                                && lastBreakPos < i) {
1703:                            errln(name
1704:                                    + " break monkey test: "
1705:                                    + "Out of range value returned by BreakIterator::preceding().\n"
1706:                                    + "index=" + i + "preceding returned="
1707:                                    + breakPos + "lastBreak=" + lastBreakPos);
1708:                            precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1709:                        } else {
1710:                            precedingBreaks[breakPos] = true;
1711:                            lastBreakPos = breakPos;
1712:                        }
1713:                    }
1714:
1715:                    // Compare the expected and actual results.
1716:                    for (i = 0; i <= testText.length(); i++) {
1717:                        String errorType = null;
1718:                        if (forwardBreaks[i] != expectedBreaks[i]) {
1719:                            errorType = "next()";
1720:                        } else if (reverseBreaks[i] != forwardBreaks[i]) {
1721:                            errorType = "previous()";
1722:                        } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1723:                            errorType = "isBoundary()";
1724:                        } else if (followingBreaks[i] != expectedBreaks[i]) {
1725:                            errorType = "following()";
1726:                        } else if (precedingBreaks[i] != expectedBreaks[i]) {
1727:                            errorType = "preceding()";
1728:                        }
1729:
1730:                        if (errorType != null) {
1731:                            // Format a range of the test text that includes the failure as
1732:                            //  a data item that can be included in the rbbi test data file.
1733:
1734:                            // Start of the range is the last point where expected and actual results
1735:                            //   both agreed that there was a break position.
1736:                            int startContext = i;
1737:                            int count = 0;
1738:                            for (;;) {
1739:                                if (startContext == 0) {
1740:                                    break;
1741:                                }
1742:                                startContext--;
1743:                                if (expectedBreaks[startContext]) {
1744:                                    if (count == 2)
1745:                                        break;
1746:                                    count++;
1747:                                }
1748:                            }
1749:
1750:                            // End of range is two expected breaks past the start position.
1751:                            int endContext = i + 1;
1752:                            int ci;
1753:                            for (ci = 0; ci < 2; ci++) { // Number of items to include in error text.
1754:                                for (;;) {
1755:                                    if (endContext >= testText.length()) {
1756:                                        break;
1757:                                    }
1758:                                    if (expectedBreaks[endContext - 1]) {
1759:                                        if (count == 0)
1760:                                            break;
1761:                                        count--;
1762:                                    }
1763:                                    endContext++;
1764:                                }
1765:                            }
1766:
1767:                            // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1768:                            StringBuffer errorText = new StringBuffer();
1769:                            errorText.append("<data>");
1770:
1771:                            String hexChars = "0123456789abcdef";
1772:                            int c; // Char from test data
1773:                            int bn;
1774:                            for (ci = startContext; ci <= endContext
1775:                                    && ci != -1; ci = nextCP(testText, ci)) {
1776:                                if (ci == i) {
1777:                                    // This is the location of the error.
1778:                                    errorText.append("<?>");
1779:                                } else if (expectedBreaks[ci]) {
1780:                                    // This a non-error expected break position.
1781:                                    errorText.append("<>");
1782:                                }
1783:                                if (ci < testText.length()) {
1784:                                    c = UTF16.charAt(testText, ci);
1785:                                    if (c < 0x10000) {
1786:                                        errorText.append("\\u");
1787:                                        for (bn = 12; bn >= 0; bn -= 4) {
1788:                                            errorText
1789:                                                    .append(hexChars
1790:                                                            .charAt((((int) c) >> bn) & 0xf));
1791:                                        }
1792:                                    } else {
1793:                                        errorText.append("\\U");
1794:                                        for (bn = 28; bn >= 0; bn -= 4) {
1795:                                            errorText
1796:                                                    .append(hexChars
1797:                                                            .charAt((((int) c) >> bn) & 0xf));
1798:                                        }
1799:                                    }
1800:                                }
1801:                            }
1802:                            if (ci == testText.length() && ci != -1) {
1803:                                errorText.append("<>");
1804:                            }
1805:                            errorText.append("</data>\n");
1806:
1807:                            // Output the error
1808:                            errln(name
1809:                                    + " break monkey test error.  "
1810:                                    + (expectedBreaks[i] ? "Break expected but not found."
1811:                                            : "Break found but not expected.")
1812:                                    + "\nOperation = " + errorType
1813:                                    + "; random seed = " + seed
1814:                                    + ";  buf Idx = " + i + "\n" + errorText);
1815:                            break;
1816:                        }
1817:                    }
1818:
1819:                    loopCount++;
1820:                }
1821:            }
1822:
1823:            public void TestCharMonkey() {
1824:
1825:                int loopCount = 500;
1826:                int seed = 1;
1827:
1828:                if (params.inclusion >= 9) {
1829:                    loopCount = 10000;
1830:                }
1831:
1832:                RBBICharMonkey m = new RBBICharMonkey();
1833:                BreakIterator bi = BreakIterator
1834:                        .getCharacterInstance(Locale.US);
1835:                RunMonkey(bi, m, "char", seed, loopCount);
1836:            }
1837:
1838:            public void TestWordMonkey() {
1839:
1840:                int loopCount = 500;
1841:                int seed = 1;
1842:
1843:                if (params.inclusion >= 9) {
1844:                    loopCount = 10000;
1845:                }
1846:
1847:                logln("Word Break Monkey Test");
1848:                RBBIWordMonkey m = new RBBIWordMonkey();
1849:                BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1850:                RunMonkey(bi, m, "word", seed, loopCount);
1851:            }
1852:
1853:            public void TestLineMonkey() {
1854:
1855:                int loopCount = 500;
1856:                int seed = 1;
1857:
1858:                if (params.inclusion >= 9) {
1859:                    loopCount = 10000;
1860:                }
1861:
1862:                logln("Line Break Monkey Test");
1863:                RBBILineMonkey m = new RBBILineMonkey();
1864:                BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1865:                if (params == null) {
1866:                    loopCount = 50;
1867:                }
1868:                RunMonkey(bi, m, "line", seed, loopCount);
1869:            }
1870:
1871:            public void TestSentMonkey() {
1872:
1873:                int loopCount = 500;
1874:                int seed = 1;
1875:
1876:                if (params.inclusion >= 9) {
1877:                    loopCount = 3000;
1878:                }
1879:
1880:                logln("Sentence Break Monkey Test");
1881:                RBBISentenceMonkey m = new RBBISentenceMonkey();
1882:                BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1883:                if (params == null) {
1884:                    loopCount = 30;
1885:                }
1886:                RunMonkey(bi, m, "sent", seed, loopCount);
1887:            }
1888:
1889:            //
1890:            //  Round-trip monkey tests.
1891:            //  Verify that break iterators created from the rule source from the default
1892:            //    break iterators still pass the monkey test for the iterator type.
1893:            //
1894:            //  This is a major test for the Rule Compiler.  The default break iterators are built
1895:            //  from pre-compiled binary rule data that was created using ICU4C; these
1896:            //  round-trip rule recompile tests verify that the Java rule compiler can
1897:            //  rebuild break iterators from the original source rules.
1898:            //
1899:            public void TestRTCharMonkey() {
1900:
1901:                int loopCount = 200;
1902:                int seed = 1;
1903:
1904:                if (params.inclusion >= 9) {
1905:                    loopCount = 2000;
1906:                }
1907:
1908:                RBBICharMonkey m = new RBBICharMonkey();
1909:                BreakIterator bi = BreakIterator
1910:                        .getCharacterInstance(Locale.US);
1911:                String rules = bi.toString();
1912:                BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1913:                RunMonkey(rtbi, m, "char", seed, loopCount);
1914:            }
1915:
1916:            public void TestRTWordMonkey() {
1917:
1918:                int loopCount = 200;
1919:                int seed = 1;
1920:
1921:                if (params.inclusion >= 9) {
1922:                    loopCount = 2000;
1923:                }
1924:
1925:                logln("Word Break Monkey Test");
1926:                RBBIWordMonkey m = new RBBIWordMonkey();
1927:                BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1928:                String rules = bi.toString();
1929:                BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1930:                RunMonkey(rtbi, m, "word", seed, loopCount);
1931:            }
1932:
1933:            public void TestRTLineMonkey() {
1934:
1935:                int loopCount = 200;
1936:                int seed = 1;
1937:
1938:                if (params.inclusion >= 9) {
1939:                    loopCount = 2000;
1940:                }
1941:
1942:                logln("Line Break Monkey Test");
1943:                RBBILineMonkey m = new RBBILineMonkey();
1944:                BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1945:                String rules = bi.toString();
1946:                BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1947:                if (params == null) {
1948:                    loopCount = 50;
1949:                }
1950:                RunMonkey(rtbi, m, "line", seed, loopCount);
1951:            }
1952:
1953:            public void TestRTSentMonkey() {
1954:
1955:                int loopCount = 200;
1956:                int seed = 1;
1957:
1958:                if (params.inclusion >= 9) {
1959:                    loopCount = 1000;
1960:                }
1961:
1962:                logln("Sentence Break Monkey Test");
1963:                RBBISentenceMonkey m = new RBBISentenceMonkey();
1964:                BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1965:                String rules = bi.toString();
1966:                BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1967:                if (params == null) {
1968:                    loopCount = 30;
1969:                }
1970:                RunMonkey(rtbi, m, "sent", seed, loopCount);
1971:            }
1972:
1973:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.