Source Code Cross Referenced for RuleBasedBreakIterator.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        //##header
0002:        /*
0003:         *******************************************************************************
0004:         * Copyright (C) 2005-2006 International Business Machines Corporation and          *
0005:         * others. All Rights Reserved.                                                *
0006:         *******************************************************************************
0007:         */
0008:        package com.ibm.icu.text;
0009:
0010:        import java.text.CharacterIterator;
0011:        import java.io.IOException;
0012:        import java.io.InputStream;
0013:        import java.io.OutputStream;
0014:        import java.io.ByteArrayInputStream;
0015:        import java.io.ByteArrayOutputStream;
0016:
0017:        import com.ibm.icu.impl.Assert;
0018:
0019:        /**
0020:         * Rule Based Break Iterator 
0021:         * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
0022:         * 
0023:         * @stable ICU 2.0
0024:         */
0025:        public class RuleBasedBreakIterator extends BreakIterator {
0026:
0027:            //=======================================================================
0028:            // Constructors & Factories
0029:            //=======================================================================
0030:
0031:            /** 
0032:             * @internal 
0033:             * @deprecated This API is ICU internal only.
0034:             */
0035:            public RuleBasedBreakIterator() {
0036:            }
0037:
0038:            /**
0039:             * Create a break iterator from a precompiled set of rules.
0040:             * @internal
0041:             * @deprecated This API is ICU internal only.
0042:             */
0043:            public static RuleBasedBreakIterator getInstanceFromCompiledRules(
0044:                    InputStream is) throws IOException {
0045:                RuleBasedBreakIterator This = new RuleBasedBreakIterator();
0046:                This.fRData = RBBIDataWrapper.get(is);
0047:                return This;
0048:            }
0049:
/**
 * Copy constructor used by {@link #clone()}.  The rule data object is shared
 * with {@code other}; the text iterator is cloned so that the two break
 * iterators can be positioned independently.
 * @param other the iterator to copy from
 */
private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
    // TODO: check types.
    fRData = other.fRData;
    // The null test has to look at the source's iterator.  Our own fText is
    // never null at this point (its field initializer has already run), so
    // testing it guarded nothing and a null other.fText ended in a
    // NullPointerException.
    if (other.fText != null) {
        fText = (CharacterIterator) (other.fText.clone());
    } else {
        // Mirror the source exactly so that equals() sees matching text state.
        fText = null;
    }
}
0057:
0058:            /**
0059:             * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
0060:             * @param rules The break rules to be used.
0061:             * @param parseError  In the event of a syntax error in the rules, provides the location
0062:             *                    within the rules of the problem.
0063:             * @param status Information on any errors encountered.
0064:             * @stable ICU 2.2
0065:             */
0066:            public RuleBasedBreakIterator(String rules) {
0067:                init();
0068:                try {
0069:                    ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
0070:                    compileRules(rules, ruleOS);
0071:                    byte[] ruleBA = ruleOS.toByteArray();
0072:                    InputStream ruleIS = new ByteArrayInputStream(ruleBA);
0073:                    fRData = RBBIDataWrapper.get(ruleIS);
0074:                } catch (IOException e) {
0075:                    // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
0076:                    //  causing bogus compiled rules to be produced, but with no compile error raised.
0077:                    //#ifdef FOUNDATION
0078:                    //##            RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error:");
0079:                    //#else
0080:                    RuntimeException rte = new RuntimeException(
0081:                            "RuleBasedBreakIterator rule compilation internal error:",
0082:                            e);
0083:                    //#endif
0084:                    throw rte;
0085:                }
0086:            }
0087:
0088:            //=======================================================================
0089:            // Boilerplate
0090:            //=======================================================================
0091:
0092:            /**
0093:             * Clones this iterator.
0094:             * @return A newly-constructed RuleBasedBreakIterator with the same
0095:             * behavior as this one.
0096:             * @stable ICU 2.0
0097:             */
0098:            public Object clone() {
0099:                RuleBasedBreakIterator result = new RuleBasedBreakIterator(this );
0100:                return result;
0101:            }
0102:
0103:            /**
0104:             * Returns true if both BreakIterators are of the same class, have the same
0105:             * rules, and iterate over the same text.
0106:             * @stable ICU 2.0
0107:             */
0108:            public boolean equals(Object that) {
0109:                try {
0110:                    RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
0111:                    if (fRData != other.fRData
0112:                            && (fRData == null || other.fRData == null)) {
0113:                        return false;
0114:                    }
0115:                    if (fRData != null
0116:                            && other.fRData != null
0117:                            && (!fRData.fRuleSource
0118:                                    .equals(other.fRData.fRuleSource))) {
0119:                        return false;
0120:                    }
0121:                    if (fText == null && other.fText == null) {
0122:                        return true;
0123:                    }
0124:                    if (fText == null || other.fText == null) {
0125:                        return false;
0126:                    }
0127:                    return fText.equals(other.fText);
0128:                } catch (ClassCastException e) {
0129:                    return false;
0130:                }
0131:            }
0132:
0133:            /**
0134:             * Returns the description (rules) used to create this iterator.
0135:             * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
0136:             * @stable ICU 2.0
0137:             */
0138:            public String toString() {
0139:                String retStr = null;
0140:                if (fRData != null) {
0141:                    retStr = fRData.fRuleSource;
0142:                }
0143:                return retStr;
0144:            }
0145:
0146:            /**
0147:             * Compute a hashcode for this BreakIterator
0148:             * @return A hash code
0149:             * @stable ICU 2.0
0150:             */
0151:            public int hashCode() {
0152:                return fRData.fRuleSource.hashCode();
0153:            }
0154:
0155:            /** 
0156:             * Tag value for "words" that do not fit into any of the other categories. 
0157:             * Includes spaces and most punctuation. 
0158:             * @draft ICU 3.0 
0159:             * @provisional This is a draft API and might change in a future release of ICU.
0160:             */
0161:            public static final int WORD_NONE = 0;
0162:
0163:            /**
0164:             * Upper bound for tags for uncategorized words (same value as WORD_NUMBER). 
0165:             * @draft ICU 3.0 
0166:             * @provisional This is a draft API and might change in a future release of ICU.
0167:             */
0168:            public static final int WORD_NONE_LIMIT = 100;
0169:
0170:            /**
0171:             * Tag value for words that appear to be numbers, lower limit. 
0172:             * @draft ICU 3.0 
0173:             * @provisional This is a draft API and might change in a future release of ICU.
0174:             */
0175:            public static final int WORD_NUMBER = 100;
0176:
0177:            /** 
0178:             * Tag value for words that appear to be numbers, upper limit.
0179:             * @draft ICU 3.0 
0180:             * @provisional This is a draft API and might change in a future release of ICU.
0181:             */
0182:            public static final int WORD_NUMBER_LIMIT = 200;
0183:
0184:            /** 
0185:             * Tag value for words that contain letters, excluding
0186:             * hiragana, katakana or ideographic characters, lower limit. 
0187:             * @draft ICU 3.0 
0188:             * @provisional This is a draft API and might change in a future release of ICU.
0189:             */
0190:            public static final int WORD_LETTER = 200;
0191:
0192:            /** 
0193:             * Tag value for words containing letters, upper limit. 
0194:             * @draft ICU 3.0 
0195:             * @provisional This is a draft API and might change in a future release of ICU.
0196:             */
0197:            public static final int WORD_LETTER_LIMIT = 300;
0198:
0199:            /** 
0200:             * Tag value for words containing kana characters, lower limit.
0201:             * @draft ICU 3.0 
0202:             * @provisional This is a draft API and might change in a future release of ICU.
0203:             */
0204:            public static final int WORD_KANA = 300;
0205:
0206:            /** 
0207:             * Tag value for words containing kana characters, upper limit.
0208:             * @draft ICU 3.0 
0209:             * @provisional This is a draft API and might change in a future release of ICU.
0210:             */
0211:            public static final int WORD_KANA_LIMIT = 400;
0212:
0213:            /**
0214:             * Tag value for words containing ideographic characters, lower limit.
0215:             * @draft ICU 3.0 
0216:             * @provisional This is a draft API and might change in a future release of ICU.
0217:             */
0218:            public static final int WORD_IDEO = 400;
0219:
0220:            /**
0221:             * Tag value for words containing ideographic characters, upper limit.
0222:             * @draft ICU 3.0 
0223:             * @provisional This is a draft API and might change in a future release of ICU.
0224:             */
0225:            public static final int WORD_IDEO_LIMIT = 500;
0226:
0227:            private static final int START_STATE = 1; // The state number of the starting state
0228:            private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
0229:
0230:            // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
0231:            //               of user text.  An int holding one of these three values keeps track
0232:            //               of where we are.  The state machine only fetches user text input while in RUN mode.
0233:            private static final int RBBI_START = 0;
0234:            private static final int RBBI_RUN = 1;
0235:            private static final int RBBI_END = 2;
0236:
0237:            /**
0238:             * The character iterator through which this BreakIterator accesses the text.
0239:             * Starts out as an iterator over the empty string.
0240:             * @internal
0241:             * @deprecated This API is ICU internal only.
0242:             */
0243:            private CharacterIterator fText = new java.text.StringCharacterIterator(
0244:                    "");
0245:
0246:            /**
0247:             * The rule data for this BreakIterator instance.
0248:             * @internal
0249:             * @deprecated This API is ICU internal only.
0250:             */
0251:            protected RBBIDataWrapper fRData;
0252:
0253:            /** Index of the Rule {tag} values for the most recent match. 
0254:             *  @internal
0255:             * @deprecated This API is ICU internal only.
0256:             */
0257:            private int fLastRuleStatusIndex;
0258:
0259:            /**
0260:             * Rule tag value valid flag.
0261:             * Some iterator operations don't intrinsically set the correct tag value.
0262:             * This flag lets us lazily compute the value if we are ever asked for it.
0263:             * @internal
0264:             * @deprecated This API is ICU internal only.
0265:             */
0266:            private boolean fLastStatusIndexValid;
0267:
0268:            /**
0269:             * Counter for the number of characters encountered with the "dictionary"
0270:             *   flag set.  Normal RBBI iterators don't use it, although the code
0271:             *   for updating it is live.  Dictionary Based break iterators (a subclass
0272:             *   of us) access this field directly.
0273:             * @internal
0274:             * @deprecated This API is ICU internal only.
0275:             */
0276:            protected int fDictionaryCharCount;
0277:
0278:            /**
0279:             * Debugging flag.  Trace operation of state machine when true.  Shared by all instances (static).
0280:             * @internal
0281:             * @deprecated This API is ICU internal only.
0282:             */
0283:            public static boolean fTrace;
0284:
0285:            /**
0286:             * Dump the contents of the state table and character classes for this break iterator.
0287:             * For debugging only.
0288:             * @internal
0289:             * @deprecated This API is ICU internal only.
0290:             */
0291:            public void dump() {
0292:                this .fRData.dump();
0293:            }
0294:
0295:            private static boolean debugInitDone = false;
0296:
0297:            private void init() {
0298:                fLastStatusIndexValid = true;
0299:                fDictionaryCharCount = 0;
0300:
0301:                if (debugInitDone == false) {
0302:                    String debugEnv = System.getProperty("U_RBBIDEBUG");
0303:                    if (debugEnv != null && debugEnv.indexOf("trace") >= 0) {
0304:                        fTrace = true;
0305:                    }
0306:                    debugInitDone = true;
0307:                }
0308:            }
0309:
0310:            private static void compileRules(String rules,
0311:                    OutputStream ruleBinary) throws IOException {
0312:                RBBIRuleBuilder.compileRules(rules, ruleBinary);
0313:            }
0314:
0315:            //=======================================================================
0316:            // BreakIterator overrides
0317:            //=======================================================================
0318:
0319:            /**
0320:             * Sets the current iteration position to the beginning of the text.
0321:             * (i.e., the CharacterIterator's starting offset).
0322:             * @return The offset of the beginning of the text.
0323:             * @stable ICU 2.0
0324:             */
0325:            public int first() {
0326:                fLastRuleStatusIndex = 0;
0327:                fLastStatusIndexValid = true;
0328:                if (fText == null) {
0329:                    return BreakIterator.DONE;
0330:                }
0331:                fText.first();
0332:                return fText.getIndex();
0333:            }
0334:
0335:            /**
0336:             * Sets the current iteration position to the end of the text.
0337:             * (i.e., the CharacterIterator's ending offset).
0338:             * @return The text's past-the-end offset.
0339:             * @stable ICU 2.0
0340:             */
0341:            public int last() {
0342:                if (fText == null) {
0343:                    fLastRuleStatusIndex = 0;
0344:                    fLastStatusIndexValid = true;
0345:                    return BreakIterator.DONE;
0346:                }
0347:
0348:                // I'm not sure why, but t.last() returns the offset of the last character,
0349:                // rather than the past-the-end offset
0350:                //
0351:                //   (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
0352:                //     will work correctly.)
0353:
0354:                fLastStatusIndexValid = false;
0355:                int pos = fText.getEndIndex();
0356:                fText.setIndex(pos);
0357:                return pos;
0358:            }
0359:
0360:            /**
0361:             * Advances the iterator either forward or backward the specified number of steps.
0362:             * Negative values move backward, and positive values move forward.  This is
0363:             * equivalent to repeatedly calling next() or previous().
0364:             * @param n The number of steps to move.  The sign indicates the direction
0365:             * (negative is backwards, and positive is forwards).
0366:             * @return The character offset of the boundary position n boundaries away from
0367:             * the current one.
0368:             * @stable ICU 2.0
0369:             */
0370:            public int next(int n) {
0371:                int result = current();
0372:                while (n > 0) {
0373:                    result = handleNext();
0374:                    --n;
0375:                }
0376:                while (n < 0) {
0377:                    result = previous();
0378:                    ++n;
0379:                }
0380:                return result;
0381:            }
0382:
0383:            /**
0384:             * Advances the iterator to the next boundary position.
0385:             * @return The position of the first boundary after this one.
0386:             * @stable ICU 2.0
0387:             */
0388:            public int next() {
0389:                return handleNext();
0390:            }
0391:
0392:            /**
0393:             * Moves the iterator backwards, to the last boundary preceding this one.
0394:             * @return The position of the last boundary position preceding this one.
0395:             * @stable ICU 2.0
0396:             */
0397:            public int previous() {
0398:                // if we're already sitting at the beginning of the text, return DONE
0399:                if (fText == null || current() == fText.getBeginIndex()) {
0400:                    fLastRuleStatusIndex = 0;
0401:                    fLastStatusIndexValid = true;
0402:                    return BreakIterator.DONE;
0403:                }
0404:
0405:                if (fRData.fSRTable != null || fRData.fSFTable != null) {
0406:                    return handlePrevious(fRData.fRTable);
0407:                }
0408:
0409:                // old rule syntax
0410:                // set things up.  handlePrevious() will back us up to some valid
0411:                // break position before the current position (we back our internal
0412:                // iterator up one step to prevent handlePrevious() from returning
0413:                // the current position), but not necessarily the last one before
0414:                // where we started
0415:
0416:                int start = current();
0417:
0418:                CIPrevious32(fText);
0419:                int lastResult = handlePrevious(fRData.fRTable);
0420:                if (lastResult == BreakIterator.DONE) {
0421:                    lastResult = fText.getBeginIndex();
0422:                    fText.setIndex(lastResult);
0423:                }
0424:                int result = lastResult;
0425:                int lastTag = 0;
0426:                boolean breakTagValid = false;
0427:
0428:                // iterate forward from the known break position until we pass our
0429:                // starting point.  The last break position before the starting
0430:                // point is our return value
0431:
0432:                for (;;) {
0433:                    result = handleNext();
0434:                    if (result == BreakIterator.DONE || result >= start) {
0435:                        break;
0436:                    }
0437:                    lastResult = result;
0438:                    lastTag = fLastRuleStatusIndex;
0439:                    breakTagValid = true;
0440:                }
0441:
0442:                // fLastBreakTag wants to have the value for section of text preceding
0443:                // the result position that we are to return (in lastResult.)  If
0444:                // the backwards rules overshot and the above loop had to do two or more
0445:                // handleNext()s to move up to the desired return position, we will have a valid
0446:                // tag value. But, if handlePrevious() took us to exactly the correct result positon,
0447:                // we wont have a tag value for that position, which is only set by handleNext().
0448:
0449:                // set the current iteration position to be the last break position
0450:                // before where we started, and then return that value
0451:                fText.setIndex(lastResult);
0452:                fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
0453:                fLastStatusIndexValid = breakTagValid;
0454:                return lastResult;
0455:            }
0456:
0457:            /**
0458:             * Sets the iterator to refer to the first boundary position following
0459:             * the specified position.
0460:             * @param offset The position from which to begin searching for a break position.
0461:             * @return The position of the first break after the current position.
0462:             * @stable ICU 2.0
0463:             */
0464:            public int following(int offset) {
0465:                // if the offset passed in is already past the end of the text,
0466:                // just return DONE; if it's before the beginning, return the
0467:                // text's starting offset
0468:                fLastRuleStatusIndex = 0;
0469:                fLastStatusIndexValid = true;
0470:                if (fText == null || offset >= fText.getEndIndex()) {
0471:                    last();
0472:                    return next();
0473:                } else if (offset < fText.getBeginIndex()) {
0474:                    return first();
0475:                }
0476:
0477:                // otherwise, set our internal iteration position (temporarily)
0478:                // to the position passed in.  If this is the _beginning_ position,
0479:                // then we can just use next() to get our return value
0480:
0481:                int result = 0;
0482:
0483:                if (fRData.fSRTable != null) {
0484:                    // Safe Point Reverse rules exist.
0485:                    //   This allows us to use the optimum algorithm.
0486:                    fText.setIndex(offset);
0487:                    // move forward one codepoint to prepare for moving back to a
0488:                    // safe point.
0489:                    // this handles offset being between a supplementary character
0490:                    CINext32(fText);
0491:                    // handlePrevious will move most of the time to < 1 boundary away
0492:                    handlePrevious(fRData.fSRTable);
0493:                    result = next();
0494:                    while (result <= offset) {
0495:                        result = next();
0496:                    }
0497:                    return result;
0498:                }
0499:                if (fRData.fSFTable != null) {
0500:                    // No Safe point reverse table, but there is a safe pt forward table.
0501:                    // 
0502:                    fText.setIndex(offset);
0503:                    CIPrevious32(fText);
0504:                    // handle next will give result >= offset
0505:                    handleNext(fRData.fSFTable);
0506:                    // previous will give result 0 or 1 boundary away from offset,
0507:                    // most of the time
0508:                    // we have to
0509:                    int oldresult = previous();
0510:                    while (oldresult > offset) {
0511:                        result = previous();
0512:                        if (result <= offset) {
0513:                            return oldresult;
0514:                        }
0515:                        oldresult = result;
0516:                    }
0517:                    result = next();
0518:                    if (result <= offset) {
0519:                        return next();
0520:                    }
0521:                    return result;
0522:                }
0523:                // otherwise, we have to sync up first.  Use handlePrevious() to back
0524:                // us up to a known break position before the specified position (if
0525:                // we can determine that the specified position is a break position,
0526:                // we don't back up at all).  This may or may not be the last break
0527:                // position at or before our starting position.  Advance forward
0528:                // from here until we've passed the starting position.  The position
0529:                // we stop on will be the first break position after the specified one.
0530:                // old rule syntax
0531:
0532:                fText.setIndex(offset);
0533:                if (offset == fText.getBeginIndex()) {
0534:                    return handleNext();
0535:                }
0536:                result = previous();
0537:
0538:                while (result != BreakIterator.DONE && result <= offset) {
0539:                    result = next();
0540:                }
0541:
0542:                return result;
0543:            }
0544:
0545:            /**
0546:             * Sets the iterator to refer to the last boundary position before the
0547:             * specified position.
0548:             * @param offset The position to begin searching for a break from.
0549:             * @return The position of the last boundary before the starting position.
0550:             * @stable ICU 2.0
0551:             */
0552:            public int preceding(int offset) {
0553:                // if the offset passed in is already past the end of the text,
0554:                // just return DONE; if it's before the beginning, return the
0555:
0556:                // text's starting offset
0557:                if (fText == null || offset > fText.getEndIndex()) {
0558:                    // return BreakIterator::DONE;
0559:                    return last();
0560:                } else if (offset < fText.getBeginIndex()) {
0561:                    return first();
0562:                }
0563:
0564:                // if we start by updating the current iteration position to the
0565:                // position specified by the caller, we can just use previous()
0566:                // to carry out this operation
0567:
0568:                int result;
0569:                if (fRData.fSFTable != null) {
0570:                    /// todo synwee
0571:                    // new rule syntax
0572:                    fText.setIndex(offset);
0573:                    // move backwards one codepoint to prepare for moving forwards to a
0574:                    // safe point.
0575:                    // this handles offset being between a supplementary character
0576:                    CIPrevious32(fText);
0577:                    handleNext(fRData.fSFTable);
0578:                    result = previous();
0579:                    while (result >= offset) {
0580:                        result = previous();
0581:                    }
0582:                    return result;
0583:                }
0584:                if (fRData.fSRTable != null) {
0585:                    // backup plan if forward safe table is not available
0586:                    fText.setIndex(offset);
0587:                    CINext32(fText);
0588:                    // handle previous will give result <= offset
0589:                    handlePrevious(fRData.fSRTable);
0590:
0591:                    // next will give result 0 or 1 boundary away from offset,
0592:                    // most of the time
0593:                    // we have to
0594:                    int oldresult = next();
0595:                    while (oldresult < offset) {
0596:                        result = next();
0597:                        if (result >= offset) {
0598:                            return oldresult;
0599:                        }
0600:                        oldresult = result;
0601:                    }
0602:                    result = previous();
0603:                    if (result >= offset) {
0604:                        return previous();
0605:                    }
0606:                    return result;
0607:                }
0608:
0609:                // old rule syntax
0610:                fText.setIndex(offset);
0611:                return previous();
0612:            }
0613:
0614:            /**
0615:             * Throw IllegalArgumentException unless begin <= offset < end.
0616:             * @stable ICU 2.0
0617:             */
0618:            protected static final void checkOffset(int offset,
0619:                    CharacterIterator text) {
0620:                if (offset < text.getBeginIndex()
0621:                        || offset > text.getEndIndex()) {
0622:                    throw new IllegalArgumentException("offset out of bounds");
0623:                }
0624:            }
0625:
0626:            /**
0627:             * Tests whether the specified position is a boundary position.  As a side
0628:             * effect, the iterator is left pointing to the first boundary position at
0629:             * or after "offset".
0630:             * @param offset the offset to check; it is validated with checkOffset first.
0631:             * @return true if "offset" is a boundary position.
0632:             * @stable ICU 2.0
0633:             */
0634:            public boolean isBoundary(int offset) {
0635:                checkOffset(offset, fText);
0636:
0637:                // Both ends of the text are always reported as boundaries.  first() and
0638:                // last() are still invoked for their side effects on the current
0639:                // position and tag values.
0640:                if (offset == fText.getBeginIndex()) {
0641:                    first();
0642:                    return true;
0643:                } else if (offset == fText.getEndIndex()) {
0644:                    last();
0645:                    return true;
0646:                }
0647:
0648:                // General case: step back one code point from "offset" (so that an
0649:                // offset just past a surrogate pair backs up over the whole pair) and
0650:                // ask following() for the next boundary after that position.  "offset"
0651:                // is a boundary exactly when that is where following() lands.
0652:                //
0653:                // TODO:  check whether it is safe to revert to the simpler
0654:                //            return following(offset - 1) == offset;
0655:                //         The safe rules may take care of unpaired surrogates ok.
0656:                fText.setIndex(offset);
0657:                CIPrevious32(fText);
0658:                final int precedingCodePoint = fText.getIndex();
0659:                return following(precedingCodePoint) == offset;
0660:            }
0661:
0662:            /**
0663:             * Reports the current iteration position.
0664:             * @return The index of the underlying text iterator, or DONE if no text is set.
0665:             * @stable ICU 2.0
0666:             */
0667:            public int current() {
0668:                return (fText == null) ? BreakIterator.DONE : fText.getIndex();
0669:            }
0670:
0671:            private void makeRuleStatusValid() {
0672:                if (fLastStatusIndexValid == false) {
0673:                    //  No cached status is available.
0674:                    if (fText == null || current() == fText.getBeginIndex()) {
0675:                        //  At start of text, or there is no text.  Status is always zero.
0676:                        fLastRuleStatusIndex = 0;
0677:                        fLastStatusIndexValid = true;
0678:                    } else {
0679:                        //  Not at start of text.  Find status the tedious way.
0680:                        int pa = current();
0681:                        previous();
0682:                        int pb = next();
0683:                        Assert.assrt(pa == pb);
0684:                    }
0685:                    Assert.assrt(fLastStatusIndexValid == true);
0686:                    Assert
0687:                            .assrt(fLastRuleStatusIndex >= 0
0688:                                    && fLastRuleStatusIndex < fRData.fStatusTable.length);
0689:                }
0690:            }
0691:
0692:            /**
0693:             * Return the status tag from the break rule that determined the most recently
0694:             * returned break position.  The values appear in the rule source
0695:             * within brackets, {123}, for example.  For rules that do not specify a
0696:             * status, a default value of 0 is returned.  If more than one rule applies,
0697:             * the numerically largest of the possible status values is returned.
0698:             * <p>
0699:             * Of the standard types of ICU break iterators, only the word break
0700:             * iterator provides status values.  The values are defined in
0701:             * class RuleBasedBreakIterator, and allow distinguishing between words
0702:             * that contain alphabetic letters, "words" that appear to be numbers,
0703:             * punctuation and spaces, words containing ideographic characters, and
0704:             * more.  Call <code>getRuleStatus</code> after obtaining a boundary
0705:             * position from <code>next()</code>, <code>previous()</code>, or
0706:             * any other break iterator function that returns a boundary position.
0707:             * <p>
0708:             * @return the status from the break rule that determined the most recently
0709:             * returned break position.
0710:             *
0711:             * @draft ICU 3.0
0712:             * @provisional This is a draft API and might change in a future release of ICU.
0713:             */
0714:
0715:            public int getRuleStatus() {
0716:                makeRuleStatusValid();
0717:                //   Layout of a status record in fRData.fStatusTable:
0718:                //           Count N         <--  fLastRuleStatusIndex points here.
0719:                //           Status val 0
0720:                //           Status val 1
0721:                //              ...
0722:                //           Status val N-1  <--  the value we need to return
0723:                //   The values are sorted in ascending order, so the last entry of the
0724:                //   record, found N slots past the count, is the largest one.
0725:                final int recordStart = fLastRuleStatusIndex;
0726:                final int valueCount = fRData.fStatusTable[recordStart];
0727:
0728:                // Index of "Status val N-1".
0729:                return fRData.fStatusTable[recordStart + valueCount];
0730:            }
0731:
0732:            /**
0733:             * Get the status (tag) values from the break rule(s) that determined the most
0734:             * recently returned break position.  The values appear in the rule source
0735:             * within brackets, {123}, for example.  The default status value for rules
0736:             * that do not explicitly provide one is zero.
0737:             * <p>
0738:             * The status values used by the standard ICU break rules are defined
0739:             * as public constants in class RuleBasedBreakIterator.
0740:             * <p>
0741:             * If the size of the output array is insufficient to hold the data,
0742:             *  the output will be truncated to the available length.  No exception
0743:             *  will be thrown.
0744:             *
0745:             * @param fillInArray an array to be filled in with the status values; may be null to get only the count.
0746:             * @return          The number of rule status values from rules that determined
0747:             *                  the most recent boundary returned by the break iterator.
0748:             *                  In the event that the array is too small, the return value
0749:             *                  is the total number of status values that were available,
0750:             *                  not the reduced number that were actually returned.
0751:             * @draft ICU 3.0
0752:             * @provisional This is a draft API and might change in a future release of ICU.
0753:             */
0754:            public int getRuleStatusVec(int[] fillInArray) {
0755:                makeRuleStatusValid();
0756:                final int available = fRData.fStatusTable[fLastRuleStatusIndex];
0757:                if (fillInArray == null) {
0758:                    return available; // Nothing to fill in; just report the count.
0759:                }
0760:                int src = fLastRuleStatusIndex + 1; // The values start one slot after the count.
0761:                for (int dst = 0; dst < available && dst < fillInArray.length; dst++, src++) {
0762:                    fillInArray[dst] = fRData.fStatusTable[src];
0763:                }
0764:                return available;
0765:            }
0766:
0767:            /**
0768:             * Return the CharacterIterator over the text being analyzed.  This is the
0769:             * very iterator this object uses internally, not a copy, so changing its
0770:             * state can have undefined consequences.  If you need to change it,
0771:             * clone it first.
0772:             * @return An iterator over the text being analyzed.
0773:             * @stable ICU 2.0
0774:             */
0775:            public CharacterIterator getText() {
0776:                return this.fText;
0777:            }
0778:
0779:            /**
0780:             * Point the iterator at a new piece of text to analyze, then reset
0781:             * the current iteration position to the beginning of that text.
0782:             * @param newText An iterator over the text to analyze.
0783:             * @stable ICU 2.0
0784:             */
0785:            public void setText(CharacterIterator newText) {
0786:                this.fText = newText;
0787:                first();
0788:            }
0789:
0790:            /**
0791:             * Debug, trace and dump options, read from the "U_RBBIDEBUG"
0792:             * system property when this class is initialized.
0793:             * @internal
0794:             * @deprecated This API is ICU internal only.
0795:             */
0796:            protected static String fDebugEnv = System.getProperty("U_RBBIDEBUG");
0797:
0798:            // 32 bit Char value returned from when an iterator has run out of range.
0799:            //     Positive value so fast case (not end, not surrogate) can be checked
0800:            //     with a single test.
0801:            private static int CI_DONE32 = 0x7fffffff;
0802:
0803:            /**
0804:             * Move the iterator forward to the next code point, and return that code point,
0805:             *   leaving the iterator positioned at char returned.
0806:             *   For Supplementary chars, the iterator is left positioned at the lead surrogate.
0807:             * @param ci  The character iterator
0808:             * @return    The next code point.
0809:             */
0810:            static int CINext32(CharacterIterator ci) {
0811:                // If the current position is at a surrogate pair, move to the trail surrogate
0812:                //   which leaves it in positon for underlying iterator's next() to work.
0813:                int c = ci.current();
0814:                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE
0815:                        && c <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
0816:                    c = ci.next();
0817:                    if (c < UTF16.TRAIL_SURROGATE_MIN_VALUE
0818:                            || c > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
0819:                        c = ci.previous();
0820:                    }
0821:                }
0822:
0823:                // For BMP chars, this next() is the real deal.
0824:                c = ci.next();
0825:
0826:                // If we might have a lead surrogate, we need to peak ahead to get the trail 
0827:                //  even though we don't want to really be positioned there.
0828:                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
0829:                    c = CINextTrail32(ci, c);
0830:                }
0831:
0832:                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
0833:                    // We got a supplementary char.  Back the iterator up to the postion
0834:                    // of the lead surrogate.
0835:                    ci.previous();
0836:                }
0837:                return c;
0838:            }
0839:
0840:            // Out-of-line portion of the in-line Next32 code.
0841:            // The call site does an initial ci.next() and calls this function
0842:            //    if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
0843:            // NOTE:  we leave the underlying char iterator positioned in the
0844:            //        middle of a surroage pair.  ci.next() will work correctly
0845:            //        from there, but the ci.getIndex() will be wrong, and needs
0846:            //        adjustment.
0847:            private static int CINextTrail32(CharacterIterator ci, int lead) {
0848:                int retVal = lead;
0849:                if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
0850:                    char cTrail = ci.next();
0851:                    if (UTF16.isTrailSurrogate(cTrail)) {
0852:                        retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
0853:                                + (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
0854:                                + UTF16.SUPPLEMENTARY_MIN_VALUE;
0855:                    } else {
0856:                        ci.previous();
0857:                    }
0858:                } else {
0859:                    if (lead == CharacterIterator.DONE
0860:                            && ci.getIndex() >= ci.getEndIndex()) {
0861:                        retVal = CI_DONE32;
0862:                    }
0863:                }
0864:                return retVal;
0865:            }
0866:
0867:            private static int CIPrevious32(CharacterIterator ci) {
0868:                if (ci.getIndex() <= ci.getBeginIndex()) {
0869:                    return CI_DONE32;
0870:                }
0871:                char trail = ci.previous();
0872:                int retVal = trail;
0873:                if (UTF16.isTrailSurrogate(trail)
0874:                        && ci.getIndex() > ci.getBeginIndex()) {
0875:                    char lead = ci.previous();
0876:                    if (UTF16.isLeadSurrogate(lead)) {
0877:                        retVal = (((int) lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
0878:                                + ((int) trail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
0879:                                + UTF16.SUPPLEMENTARY_MIN_VALUE;
0880:                    } else {
0881:                        ci.next();
0882:                    }
0883:                }
0884:                return retVal;
0885:            }
0886:
0887:            static int CICurrent32(CharacterIterator ci) {
0888:                char lead = ci.current();
0889:                int retVal = lead;
0890:                if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
0891:                    return retVal;
0892:                }
0893:                if (UTF16.isLeadSurrogate(lead)) {
0894:                    int trail = (int) ci.next();
0895:                    ci.previous();
0896:                    if (UTF16.isTrailSurrogate((char) trail)) {
0897:                        retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
0898:                                + (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
0899:                                + UTF16.SUPPLEMENTARY_MIN_VALUE;
0900:                    }
0901:                } else {
0902:                    if (lead == CharacterIterator.DONE) {
0903:                        if (ci.getIndex() >= ci.getEndIndex()) {
0904:                            retVal = CI_DONE32;
0905:                        }
0906:                    }
0907:                }
0908:                return retVal;
0909:            }
0910:
0911:            //-----------------------------------------------------------------------------------
0912:            //
0913:            //      handleNext(void)    Entry point that all forward iteration goes through.
0914:            //                          It runs the state machine on the forward table, fRData.fFTable.
0915:            //                          NOTE:  This function is overridden by the dictionary based
0916:            //                                 break iterator (dbbi).  User level API functions go to the
0917:            //                                 dbbi implementation when the break iterator type is dbbi.
0918:            //                                 The DBBI implementation sometimes explicitly calls back
0919:            //                                 to here, its inherited handleNext().
0920:            //-----------------------------------------------------------------------------------
0921:            int handleNext() {
0922:                return this.handleNext(this.fRData.fFTable);
0923:            }
0924:
0925:            /**
0926:             * The State Machine Engine for moving forward is here.
0927:             * This function is the heart of the RBBI run time engine.
0928:             * <p>
0929:             * A note on supplementary characters and the position of underlying
0930:             * Java CharacterIterator:   Normally, a character iterator is positioned at
0931:             * the char most recently returned by next().  Within this function, when
0932:             * a supplementary char is being processed, the char iterator is left
0933:             * sitting on the trail surrogate, in the middle of the code point.
0934:             * This is different from everywhere else, where an iterator always
0935:             * points at the lead surrogate of a supplementary.
0936:             *
0937:             * @param stateTable the state transition table to run; rows are addressed via fRData.getRowIndex(state).
0938:             * @return the new iterator position, or BreakIterator.DONE if there is no text or the iterator is already at its end.
0939:             */
0940:            private int handleNext(short stateTable[]) {
0941:                int state;
0942:                short category = 0;
0943:                int mode;
0944:                int row;
0945:                int c;
0946:                int lookaheadStatus = 0; // nonzero while a look-ahead rule match is pending
0947:                int lookaheadTagIdx = 0; // status index saved together with the pending look-ahead
0948:                int result = 0; // best boundary position found so far
0949:                int initialPosition = 0;
0950:                int lookaheadResult = 0; // text position saved together with the pending look-ahead
0951:                boolean lookAheadHardBreak = (stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
0952:
0953:                if (fTrace) {
0954:                    System.out
0955:                            .println("Handle Next   pos      char  state category");
0956:                }
0957:
0958:                // No matter what, handleNext always correctly sets the break tag value.
0959:                fLastStatusIndexValid = true;
0960:                fLastRuleStatusIndex = 0;
0961:
0962:                // With no text to analyze there is nothing to find; return DONE.
0963:                if (fText == null) {
0964:                    fLastRuleStatusIndex = 0;
0965:                    return BreakIterator.DONE;
0966:                }
0967:
0968:                // Set up the starting char.  If we're already at the end of the text, return DONE.
0969:                initialPosition = fText.getIndex();
0970:                result = initialPosition;
0971:                c = fText.current();
0972:                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
0973:                    c = CINextTrail32(fText, c);
0974:                    if (c == CI_DONE32) {
0975:                        fLastRuleStatusIndex = 0;
0976:                        return BreakIterator.DONE;
0977:                    }
0978:                }
0979:
0980:                // Set the initial state for the state machine
0981:                state = START_STATE;
0982:                row = fRData.getRowIndex(state);
0983:                category = 3; // placeholder: in RBBI_RUN mode it is replaced by the trie lookup (or by 1 at end of input) before use
0984:                mode = RBBI_RUN;
0985:                if ((stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
0986:                    category = 2;
0987:                    mode = RBBI_START;
0988:                }
0989:
0990:                // loop until we reach the end of the text or transition to state 0
0991:                while (state != STOP_STATE) {
0992:                    if (c == CI_DONE32) {
0993:                        // Reached end of input string.
0994:                        if (mode == RBBI_END) {
0995:                            // We have already run the loop one last time with the
0996:                            // character set to the pseudo {eof} value. Now it is time
0997:                            // to unconditionally bail out.
0998:
0999:                            if (lookaheadResult > result) {
1000:                                // We ran off the end of the string with a pending
1001:                                // look-ahead match.
1002:                                // Treat this as if the look-ahead condition had been
1003:                                // met, and return
1004:                                // the match at the / position from the look-ahead rule.
1005:                                result = lookaheadResult;
1006:                                fLastRuleStatusIndex = lookaheadTagIdx;
1007:                                lookaheadStatus = 0;
1008:                            } else if (result == initialPosition) {
1009:                                // Ran off end, no match found.
1010:                                // move forward one
1011:                                fText.setIndex(initialPosition);
1012:                                CINext32(fText);
1013:                            }
1014:                            break;
1015:                        }
1016:                        // Run the loop one last time with the fake end-of-input character category
1017:                        mode = RBBI_END;
1018:                        category = 1;
1019:                    }
1020:
1021:                    // Get the char category.  An incoming category of 1 or 2 means that
1022:                    //      we are preset for doing the beginning or end of input, and
1023:                    //      that we shouldn't get a category from an actual text input character.
1024:                    //
1025:                    if (mode == RBBI_RUN) {
1026:                        // look up the current character's character category, which tells us
1027:                        // which column in the state table to look at.
1028:                        //
1029:                        category = (short) fRData.fTrie.getCodePointValue(c);
1030:
1031:                        // Check the dictionary bit in the character's category.
1032:                        //    Counter is only used by dictionary based iterators (subclasses).
1033:                        //    Chars that need to be handled by a dictionary have a flag bit set
1034:                        //    in their category values.
1035:                        //
1036:                        if ((category & 0x4000) != 0) {
1037:                            fDictionaryCharCount++;
1038:                            //  Mask (AND) off the dictionary flag bit.
1039:                            category &= ~0x4000;
1040:                        }
1041:                    }
1042:
1043:                    if (fTrace) {
1044:                        System.out.print("            "
1045:                                + RBBIDataWrapper.intToString(fText.getIndex(),
1046:                                        5));
1047:                        System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1048:                        System.out.println(RBBIDataWrapper
1049:                                .intToString(state, 7)
1050:                                + RBBIDataWrapper.intToString(category, 6));
1051:                    }
1052:
1053:                    // look up a state transition in the state table
1054:                    //     state = row->fNextState[category];
1055:                    state = stateTable[row + RBBIDataWrapper.NEXTSTATES
1056:                            + category];
1057:                    row = fRData.getRowIndex(state);
1058:
1059:                    // Advance to the next character.
1060:                    // If this is a beginning-of-input loop iteration, don't advance.
1061:                    //    The next iteration will be processing the first real input character.
1062:                    if (mode == RBBI_RUN) {
1063:                        c = (int) fText.next();
1064:                        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
1065:                            c = CINextTrail32(fText, c);
1066:                        }
1067:                    } else {
1068:                        if (mode == RBBI_START) {
1069:                            mode = RBBI_RUN;
1070:                        }
1071:                    }
1072:
1073:                    if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1074:                        // Match found, common case
1075:                        result = fText.getIndex();
1076:                        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
1077:                                && c != CI_DONE32) {
1078:                            // The iterator has been left in the middle of a surrogate pair.
1079:                            // We want the start of it.
1080:                            result--;
1081:                        }
1082:
1083:                        //  Remember the break status (tag) values.
1084:                        fLastRuleStatusIndex = stateTable[row
1085:                                + RBBIDataWrapper.TAGIDX];
1086:                    }
1087:
1088:                    if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1089:                        if (lookaheadStatus != 0
1090:                                && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1091:                            // Lookahead match is completed.  Set the result accordingly, but only
1092:                            // if no other rule has matched further in the mean time.
1093:                            result = lookaheadResult;
1094:                            fLastRuleStatusIndex = lookaheadTagIdx;
1095:                            lookaheadStatus = 0;
1096:                            // TODO: make a standalone hard break in a rule work.
1097:                            if (lookAheadHardBreak) {
1098:                                return result;
1099:                            }
1100:                            // Look-ahead completed, but other rules may match further.  Continue on.
1101:                            //   TODO:  junk this feature?  I don't think it's used anywhere.
1102:                            continue;
1103:                        }
1104:
1105:                        lookaheadResult = fText.getIndex();
1106:                        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
1107:                                && c != CI_DONE32) {
1108:                            // The iterator has been left in the middle of a surrogate pair.
1109:                            // We want the beginning  of it.
1110:                            lookaheadResult--;
1111:                        }
1112:                        lookaheadStatus = stateTable[row
1113:                                + RBBIDataWrapper.LOOKAHEAD];
1114:                        lookaheadTagIdx = stateTable[row
1115:                                + RBBIDataWrapper.TAGIDX];
1116:                        continue;
1117:                    }
1118:
1119:                    if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1120:                        // Because this is an accepting state, any in-progress look-ahead match
1121:                        //   is no longer relevant.  Clear out the pending lookahead status.
1122:                        lookaheadStatus = 0;
1123:                    }
1124:
1125:                } // End of state machine main loop
1126:
1127:                // The state machine is done.  Check whether it found a match...
1128:
1129:                // If the iterator failed to advance in the match engine, force it ahead by one.
1130:                //   (This really indicates a defect in the break rules.  They should always match
1131:                //    at least one character.)
1132:                if (result == initialPosition) {
1133:                    result = fText.setIndex(initialPosition); // NOTE(review): stores the char returned by setIndex; overwritten two lines below
1134:                    CINext32(fText);
1135:                    result = fText.getIndex();
1136:                }
1137:
1138:                // Leave the iterator at our result position.
1139:                //   (we may have advanced beyond the last accepting position chasing after
1140:                //    longer matches that never completed.)
1141:                fText.setIndex(result);
1142:                if (fTrace) {
1143:                    System.out.println("result = " + result);
1144:                }
1145:                return result;
1146:            }
1147:
1148:            private int handlePrevious(short stateTable[]) {
1149:                int state;
1150:                int category = 0;
1151:                int mode;
1152:                int row;
1153:                int c;
1154:                int lookaheadStatus = 0;
1155:                int result = 0;
1156:                int initialPosition = 0;
1157:                int lookaheadResult = 0;
1158:                boolean lookAheadHardBreak = (stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1159:
1160:                if (fText == null || stateTable == null) {
1161:                    return 0;
1162:                }
1163:                // handlePrevious() never gets the rule status.
1164:                // Flag the status as invalid; if the user ever asks for status, we will need
1165:                // to back up, then re-find the break position using handleNext(), which does
1166:                // get the status value.
1167:                fLastStatusIndexValid = false;
1168:                fLastRuleStatusIndex = 0;
1169:
1170:                // set up the starting char
1171:                initialPosition = fText.getIndex();
1172:                result = initialPosition;
1173:                c = CIPrevious32(fText);
1174:
1175:                // Set up the initial state for the state machine
1176:                state = START_STATE;
1177:                row = fRData.getRowIndex(state);
1178:                category = 3; // TODO:  obsolete?  from the old start/run mode scheme?
1179:                mode = RBBI_RUN;
1180:                if ((stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1181:                    category = 2;
1182:                    mode = RBBI_START;
1183:                }
1184:
1185:                if (fTrace) {
1186:                    System.out
1187:                            .println("Handle Prev   pos   char  state category ");
1188:                }
1189:
1190:                // loop until we reach the beginning of the text or transition to state 0
1191:                //
1192:                mainLoop: for (;;) {
1193:                    innerBlock: {
1194:                        if (c == CI_DONE32) {
1195:                            // Reached end of input string.
1196:                            if (mode == RBBI_END
1197:                                    || fRData.fHeader.fVersion == 1) {
1198:                                // Either this is the old (ICU 3.2 and earlier) format data which
1199:                                // does not support explicit support for matching {eof}, or
1200:                                // we have already done the {eof} iteration.  Now is the time
1201:                                // to unconditionally bail out.
1202:                                if (lookaheadResult < result) {
1203:                                    // We ran off the end of the string with a pending look-ahead match.
1204:                                    // Treat this as if the look-ahead condition had been met, and return
1205:                                    //  the match at the / position from the look-ahead rule.
1206:                                    result = lookaheadResult;
1207:                                    lookaheadStatus = 0;
1208:                                } else if (result == initialPosition) {
1209:                                    // Ran off start, no match found.
1210:                                    // Move one position (towards the start, since we are doing previous.)
1211:                                    fText.setIndex(initialPosition);
1212:                                    CIPrevious32(fText);
1213:                                }
1214:                                break mainLoop;
1215:                            }
1216:                            mode = RBBI_END;
1217:                            category = 1;
1218:                        }
1219:
1220:                        if (mode == RBBI_RUN) {
1221:                            // look up the current character's category, which tells us
1222:                            // which column in the state table to look at.
1223:                            //
1224:                            category = (short) fRData.fTrie
1225:                                    .getCodePointValue(c);
1226:
1227:                            // Check the dictionary bit in the character's category.
1228:                            //    Counter is only used by dictionary based iterators (subclasses).
1229:                            //    Chars that need to be handled by a dictionary have a flag bit set
1230:                            //    in their category values.
1231:                            //
1232:                            if ((category & 0x4000) != 0) {
1233:                                fDictionaryCharCount++;
1234:                                //  And off the dictionary flag bit.
1235:                                category &= ~0x4000;
1236:                            }
1237:                        }
1238:
1239:                        if (fTrace) {
1240:                            System.out.print("             " + fText.getIndex()
1241:                                    + "   ");
1242:                            if (0x20 <= c && c < 0x7f) {
1243:                                System.out.print("  " + c + "  ");
1244:                            } else {
1245:                                System.out.print(" " + Integer.toHexString(c)
1246:                                        + " ");
1247:                            }
1248:                            System.out.println(" " + state + "  " + category
1249:                                    + " ");
1250:                        }
1251:
1252:                        // State Transition - move machine to its next state
1253:                        //
1254:                        state = stateTable[row + RBBIDataWrapper.NEXTSTATES
1255:                                + category];
1256:                        row = fRData.getRowIndex(state);
1257:
1258:                        if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1259:                            // Match found, common case, could have lookahead so we move
1260:                            // on to check it
1261:                            result = fText.getIndex();
1262:                        }
1263:
1264:                        if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1265:                            if (lookaheadStatus != 0
1266:                                    && stateTable[row
1267:                                            + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1268:                                // Lookahead match is completed. Set the result
1269:                                // accordingly, but only
1270:                                // if no other rule has matched further in the mean
1271:                                // time.
1272:                                result = lookaheadResult;
1273:                                lookaheadStatus = 0;
1274:                                // TODO: make a standalone hard break in a rule work.
1275:
1276:                                if (lookAheadHardBreak) {
1277:                                    break mainLoop;
1278:                                }
1279:                                // Look-ahead completed, but other rules may match further.
1280:                                // Continue on.
1281:                                // TODO: junk this feature?  I don't think that it's used anywhere.
1282:                                break innerBlock;
1283:                            }
1284:                            // Hit a possible look-ahead match. We are at the
1285:                            // position of the '/'. Remember this position.
1286:                            lookaheadResult = fText.getIndex();
1287:                            lookaheadStatus = stateTable[row
1288:                                    + RBBIDataWrapper.LOOKAHEAD];
1289:                            break innerBlock;
1290:                        }
1291:
1292:                        // not lookahead...
1293:                        if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1294:                            // This is a plain (non-look-ahead) accepting state.
1295:                            if (!lookAheadHardBreak) {
1296:                                // Clear out any pending look-ahead matches,
1297:                                // but only if not doing the lookAheadHardBreak option
1298:                                // which needs to force a break no matter what is going
1299:                                // on with the rest of the match, i.e. we can't abandon
1300:                                // a partially completed look-ahead match because
1301:                                // some other rule matched further than the '/' position
1302:                                // in the look-ahead match.
1303:                                lookaheadStatus = 0;
1304:                            }
1305:                        }
1306:
1307:                    } // end of innerBlock.  "break innerBlock" in above code comes out here.
1308:
1309:                    if (state == STOP_STATE) {
1310:                        // Normal loop exit is here
1311:                        break mainLoop;
1312:                    }
1313:
1314:                    // then move iterator position backwards one character
1315:                    //
1316:                    if (mode == RBBI_RUN) {
1317:                        c = CIPrevious32(fText);
1318:                    } else {
1319:                        if (mode == RBBI_START) {
1320:                            mode = RBBI_RUN;
1321:                        }
1322:                    }
1323:
1324:                } // End of the main loop.
1325:
1326:                // The state machine is done.  Check whether it found a match...
1327:                //
1328:                // If the iterator failed to advance in the match engine, force it ahead by one.
1329:                //   (This really indicates a defect in the break rules.  They should always match
1330:                //    at least one character.)
1331:                if (result == initialPosition) {
1332:                    result = fText.setIndex(initialPosition);
1333:                    CIPrevious32(fText);
1334:                    result = fText.getIndex();
1335:                }
1336:
1337:                fText.setIndex(result);
1338:                if (fTrace) {
1339:                    System.out.println("Result = " + result);
1340:                }
1341:
1342:                return result;
1343:            }
1344:
1345:            //-------------------------------------------------------------------------------
1346:
1347:            //
1348:
1349:            //  isDictionaryChar      Return true if the category lookup for this char
1350:
1351:            //                        indicates that it is in the set of dictionary lookup
1352:
1353:            //                        chars.
1354:
1355:            //
1356:
1357:            //                        This function is intended for use by dictionary based
1358:
1359:            //                        break iterators.
1360:
1361:            //
1362:
1363:            //-------------------------------------------------------------------------------
1364:
1365:            boolean isDictionaryChar(int c) {
1366:
1367:                short category = (short) fRData.fTrie.getCodePointValue(c);
1368:
1369:                return (category & 0x4000) != 0;
1370:
1371:            }
1372:
1373:        }
1374:        //eof
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.