0001: //##header
0002: /*
0003: *******************************************************************************
0004: * Copyright (C) 2005-2006 International Business Machines Corporation and *
0005: * others. All Rights Reserved. *
0006: *******************************************************************************
0007: */
0008: package com.ibm.icu.text;
0009:
0010: import java.text.CharacterIterator;
0011: import java.io.IOException;
0012: import java.io.InputStream;
0013: import java.io.OutputStream;
0014: import java.io.ByteArrayInputStream;
0015: import java.io.ByteArrayOutputStream;
0016:
0017: import com.ibm.icu.impl.Assert;
0018:
0019: /**
0020: * Rule Based Break Iterator
0021: * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
0022: *
0023: * @stable ICU 2.0
0024: */
0025: public class RuleBasedBreakIterator extends BreakIterator {
0026:
0027: //=======================================================================
0028: // Constructors & Factories
0029: //=======================================================================
0030:
/**
 * Creates an iterator with no rule data attached: {@code fRData} is left
 * {@code null}, and the text is the empty-string iterator installed by the
 * field initializer. {@link #getInstanceFromCompiledRules(InputStream)} uses
 * this constructor and fills in the rule data afterwards.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
public RuleBasedBreakIterator() {
}
0037:
0038: /**
0039: * Create a break iterator from a precompiled set of rules.
0040: * @internal
0041: * @deprecated This API is ICU internal only.
0042: */
0043: public static RuleBasedBreakIterator getInstanceFromCompiledRules(
0044: InputStream is) throws IOException {
0045: RuleBasedBreakIterator This = new RuleBasedBreakIterator();
0046: This.fRData = RBBIDataWrapper.get(is);
0047: return This;
0048: }
0049:
/**
 * Copy constructor used by {@link #clone()}. The rule data object is shared
 * with {@code other}; the text iterator is cloned so that the two break
 * iterators can move independently.
 *
 * @param other the iterator to copy
 */
private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
    // TODO: check types.
    fRData = other.fRData;
    // The null test must look at the SOURCE iterator's text. This object's
    // own fText is always non-null here (the field initializer installs an
    // empty-string iterator), so testing it would dereference a null
    // other.fText. A null text in the source is carried over as null so that
    // the copy compares equal to the original.
    if (other.fText != null) {
        fText = (CharacterIterator) (other.fText.clone());
    } else {
        fText = null;
    }
}
0057:
/**
 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
 * The rules are compiled into an in-memory binary image, and the rule data is
 * then loaded back from that image.
 * @param rules The break rules to be used.
 * @throws RuntimeException if loading the freshly compiled rule image raises an
 *         IOException (indicates an internal error in the rule compiler).
 * @stable ICU 2.2
 */
public RuleBasedBreakIterator(String rules) {
    init();
    try {
        // Compile to a byte array, then read the compiled form back in.
        ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
        compileRules(rules, ruleOS);
        byte[] ruleBA = ruleOS.toByteArray();
        InputStream ruleIS = new ByteArrayInputStream(ruleBA);
        fRData = RBBIDataWrapper.get(ruleIS);
    } catch (IOException e) {
        // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
        // causing bogus compiled rules to be produced, but with no compile error raised.
        // NOTE: the //#ifdef ... //#endif lines below are build-preprocessor directives,
        // not ordinary comments; the FOUNDATION variant omits the exception cause.
//#ifdef FOUNDATION
//## RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error:");
//#else
        RuntimeException rte = new RuntimeException(
                "RuleBasedBreakIterator rule compilation internal error:",
                e);
//#endif
        throw rte;
    }
}
0087:
0088: //=======================================================================
0089: // Boilerplate
0090: //=======================================================================
0091:
0092: /**
0093: * Clones this iterator.
0094: * @return A newly-constructed RuleBasedBreakIterator with the same
0095: * behavior as this one.
0096: * @stable ICU 2.0
0097: */
0098: public Object clone() {
0099: RuleBasedBreakIterator result = new RuleBasedBreakIterator(this );
0100: return result;
0101: }
0102:
0103: /**
0104: * Returns true if both BreakIterators are of the same class, have the same
0105: * rules, and iterate over the same text.
0106: * @stable ICU 2.0
0107: */
0108: public boolean equals(Object that) {
0109: try {
0110: RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
0111: if (fRData != other.fRData
0112: && (fRData == null || other.fRData == null)) {
0113: return false;
0114: }
0115: if (fRData != null
0116: && other.fRData != null
0117: && (!fRData.fRuleSource
0118: .equals(other.fRData.fRuleSource))) {
0119: return false;
0120: }
0121: if (fText == null && other.fText == null) {
0122: return true;
0123: }
0124: if (fText == null || other.fText == null) {
0125: return false;
0126: }
0127: return fText.equals(other.fText);
0128: } catch (ClassCastException e) {
0129: return false;
0130: }
0131: }
0132:
0133: /**
0134: * Returns the description (rules) used to create this iterator.
0135: * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
0136: * @stable ICU 2.0
0137: */
0138: public String toString() {
0139: String retStr = null;
0140: if (fRData != null) {
0141: retStr = fRData.fRuleSource;
0142: }
0143: return retStr;
0144: }
0145:
0146: /**
0147: * Compute a hashcode for this BreakIterator
0148: * @return A hash code
0149: * @stable ICU 2.0
0150: */
0151: public int hashCode() {
0152: return fRData.fRuleSource.hashCode();
0153: }
0154:
/**
 * Tag value for "words" that do not fit into any of other categories.
 * Includes spaces and most punctuation. Lower limit of the range.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_NONE = 0;

/**
 * Upper bound for tags for uncategorized words.
 * Has the same value as {@link #WORD_NUMBER}, so the bound is presumably
 * exclusive.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_NONE_LIMIT = 100;

/**
 * Tag value for words that appear to be numbers, lower limit.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_NUMBER = 100;

/**
 * Tag value for words that appear to be numbers, upper limit.
 * Has the same value as {@link #WORD_LETTER}.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_NUMBER_LIMIT = 200;

/**
 * Tag value for words that contain letters, excluding
 * hiragana, katakana or ideographic characters, lower limit.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_LETTER = 200;

/**
 * Tag value for words containing letters, upper limit.
 * Has the same value as {@link #WORD_KANA}.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_LETTER_LIMIT = 300;

/**
 * Tag value for words containing kana characters, lower limit.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_KANA = 300;

/**
 * Tag value for words containing kana characters, upper limit.
 * Has the same value as {@link #WORD_IDEO}.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_KANA_LIMIT = 400;

/**
 * Tag value for words containing ideographic characters, lower limit.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_IDEO = 400;

/**
 * Tag value for words containing ideographic characters, upper limit.
 * @draft ICU 3.0
 * @provisional This is a draft API and might change in a future release of ICU.
 */
public static final int WORD_IDEO_LIMIT = 500;
0226:
// State numbers used by the rule state machine.
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"

// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
// of user text. A variable holding one of these values keeps track of where we
// are. The state machine only fetches user text input while in RUN mode.
private static final int RBBI_START = 0; // extra iteration before the user text
private static final int RBBI_RUN = 1; // consuming user text
private static final int RBBI_END = 2; // extra iteration after the user text
0236:
/**
 * The character iterator through which this BreakIterator accesses the text.
 * Starts out as an iterator over the empty string; replaced by
 * {@link #setText(CharacterIterator)}, which may also set it to null
 * (several methods in this class guard against a null value).
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
private CharacterIterator fText = new java.text.StringCharacterIterator(
        "");

/**
 * The rule data for this BreakIterator instance.
 * Remains null for an instance created by the no-argument constructor
 * until it is assigned.
 * @internal
 * @deprecated This API is ICU internal only.
 */
protected RBBIDataWrapper fRData;

/** Index of the Rule {tag} values for the most recent match.
 * Used as an index into fRData.fStatusTable by getRuleStatus() and
 * getRuleStatusVec().
 * @internal
 * @deprecated This API is ICU internal only.
 */
private int fLastRuleStatusIndex;

/**
 * Rule tag value valid flag.
 * Some iterator operations don't intrinsically set the correct tag value.
 * This flag lets us lazily compute the value if we are ever asked for it
 * (see makeRuleStatusValid()).
 * @internal
 * @deprecated This API is ICU internal only.
 */
private boolean fLastStatusIndexValid;

/**
 * Counter for the number of characters encountered with the "dictionary"
 * flag set. Normal RBBI iterators don't use it, although the code
 * for updating it is live. Dictionary Based break iterators (a subclass
 * of us) access this field directly.
 * @internal
 * @deprecated This API is ICU internal only.
 */
protected int fDictionaryCharCount;

/**
 * Debugging flag. Trace operation of state machine when true.
 * Static, so it applies to every iterator instance; init() turns it on
 * when the "U_RBBIDEBUG" system property contains "trace".
 * @internal
 * @deprecated This API is ICU internal only.
 */
public static boolean fTrace;
0284:
0285: /**
0286: * Dump the contents of the state table and character classes for this break iterator.
0287: * For debugging only.
0288: * @internal
0289: * @deprecated This API is ICU internal only.
0290: */
0291: public void dump() {
0292: this .fRData.dump();
0293: }
0294:
0295: private static boolean debugInitDone = false;
0296:
0297: private void init() {
0298: fLastStatusIndexValid = true;
0299: fDictionaryCharCount = 0;
0300:
0301: if (debugInitDone == false) {
0302: String debugEnv = System.getProperty("U_RBBIDEBUG");
0303: if (debugEnv != null && debugEnv.indexOf("trace") >= 0) {
0304: fTrace = true;
0305: }
0306: debugInitDone = true;
0307: }
0308: }
0309:
0310: private static void compileRules(String rules,
0311: OutputStream ruleBinary) throws IOException {
0312: RBBIRuleBuilder.compileRules(rules, ruleBinary);
0313: }
0314:
0315: //=======================================================================
0316: // BreakIterator overrides
0317: //=======================================================================
0318:
0319: /**
0320: * Sets the current iteration position to the beginning of the text.
0321: * (i.e., the CharacterIterator's starting offset).
0322: * @return The offset of the beginning of the text.
0323: * @stable ICU 2.0
0324: */
0325: public int first() {
0326: fLastRuleStatusIndex = 0;
0327: fLastStatusIndexValid = true;
0328: if (fText == null) {
0329: return BreakIterator.DONE;
0330: }
0331: fText.first();
0332: return fText.getIndex();
0333: }
0334:
0335: /**
0336: * Sets the current iteration position to the end of the text.
0337: * (i.e., the CharacterIterator's ending offset).
0338: * @return The text's past-the-end offset.
0339: * @stable ICU 2.0
0340: */
0341: public int last() {
0342: if (fText == null) {
0343: fLastRuleStatusIndex = 0;
0344: fLastStatusIndexValid = true;
0345: return BreakIterator.DONE;
0346: }
0347:
0348: // I'm not sure why, but t.last() returns the offset of the last character,
0349: // rather than the past-the-end offset
0350: //
0351: // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
0352: // will work correctly.)
0353:
0354: fLastStatusIndexValid = false;
0355: int pos = fText.getEndIndex();
0356: fText.setIndex(pos);
0357: return pos;
0358: }
0359:
0360: /**
0361: * Advances the iterator either forward or backward the specified number of steps.
0362: * Negative values move backward, and positive values move forward. This is
0363: * equivalent to repeatedly calling next() or previous().
0364: * @param n The number of steps to move. The sign indicates the direction
0365: * (negative is backwards, and positive is forwards).
0366: * @return The character offset of the boundary position n boundaries away from
0367: * the current one.
0368: * @stable ICU 2.0
0369: */
0370: public int next(int n) {
0371: int result = current();
0372: while (n > 0) {
0373: result = handleNext();
0374: --n;
0375: }
0376: while (n < 0) {
0377: result = previous();
0378: ++n;
0379: }
0380: return result;
0381: }
0382:
0383: /**
0384: * Advances the iterator to the next boundary position.
0385: * @return The position of the first boundary after this one.
0386: * @stable ICU 2.0
0387: */
0388: public int next() {
0389: return handleNext();
0390: }
0391:
/**
 * Moves the iterator backwards, to the last boundary preceding this one.
 * When the rule data has a safe-reverse or safe-forward table, the reverse
 * state machine is used directly. Otherwise (old rule syntax) the reverse
 * rules only find some earlier boundary, and the exact one is located by
 * iterating forward from there.
 * @return The position of the last boundary position preceding this one,
 * or DONE if the iterator is already at the beginning of the text or there
 * is no text.
 * @stable ICU 2.0
 */
public int previous() {
    // if we're already sitting at the beginning of the text, return DONE
    if (fText == null || current() == fText.getBeginIndex()) {
        fLastRuleStatusIndex = 0;
        fLastStatusIndexValid = true;
        return BreakIterator.DONE;
    }

    // New rule syntax: safe tables exist, so the reverse table can be run directly.
    if (fRData.fSRTable != null || fRData.fSFTable != null) {
        return handlePrevious(fRData.fRTable);
    }

    // old rule syntax
    // set things up. handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
    // iterator up one step to prevent handlePrevious() from returning
    // the current position), but not necessarily the last one before
    // where we started

    int start = current();

    CIPrevious32(fText);
    int lastResult = handlePrevious(fRData.fRTable);
    if (lastResult == BreakIterator.DONE) {
        // No earlier boundary found; fall back to the start of the text.
        lastResult = fText.getBeginIndex();
        fText.setIndex(lastResult);
    }
    int result = lastResult;
    int lastTag = 0;
    boolean breakTagValid = false;

    // iterate forward from the known break position until we pass our
    // starting point. The last break position before the starting
    // point is our return value

    for (;;) {
        result = handleNext();
        if (result == BreakIterator.DONE || result >= start) {
            break;
        }
        lastResult = result;
        lastTag = fLastRuleStatusIndex;
        breakTagValid = true;
    }

    // fLastRuleStatusIndex wants to have the value for the section of text preceding
    // the result position that we are to return (in lastResult.) If
    // the backwards rules overshot and the above loop had to do two or more
    // handleNext()s to move up to the desired return position, we will have a valid
    // tag value. But, if handlePrevious() took us to exactly the correct result position,
    // we won't have a tag value for that position, which is only set by handleNext().

    // set the current iteration position to be the last break position
    // before where we started, and then return that value
    fText.setIndex(lastResult);
    fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
    fLastStatusIndexValid = breakTagValid;
    return lastResult;
}
0456:
/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 * Three strategies are used, depending on which tables the rule data
 * provides: the safe-reverse table, the safe-forward table, or (old rule
 * syntax) previous() followed by forward iteration.
 * @param offset The position from which to begin searching for a break position.
 * @return The position of the first break after the specified position.
 * @stable ICU 2.0
 */
public int following(int offset) {
    // if there is no text, or the offset passed in is at or past the end of
    // the text, move to the end and return whatever next() reports from
    // there; if it's before the beginning, return the text's starting offset
    fLastRuleStatusIndex = 0;
    fLastStatusIndexValid = true;
    if (fText == null || offset >= fText.getEndIndex()) {
        last();
        return next();
    } else if (offset < fText.getBeginIndex()) {
        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in. If this is the _beginning_ position,
    // then we can just use next() to get our return value

    int result = 0;

    if (fRData.fSRTable != null) {
        // Safe Point Reverse rules exist.
        // This allows us to use the optimum algorithm.
        fText.setIndex(offset);
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being between the two halves of a
        // supplementary character
        CINext32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fRData.fSRTable);
        result = next();
        // Step forward until we are strictly past the requested offset.
        while (result <= offset) {
            result = next();
        }
        return result;
    }
    if (fRData.fSFTable != null) {
        // No Safe point reverse table, but there is a safe pt forward table.
        //
        fText.setIndex(offset);
        CIPrevious32(fText);
        // handle next will give result >= offset
        handleNext(fRData.fSFTable);
        // previous will give result 0 or 1 boundary away from offset,
        // most of the time; walk backwards until we reach a boundary at or
        // before offset, remembering the last boundary that was after it.
        int oldresult = previous();
        while (oldresult > offset) {
            result = previous();
            if (result <= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        // previous() already landed at or before offset; step forward again.
        result = next();
        if (result <= offset) {
            return next();
        }
        return result;
    }
    // otherwise, we have to sync up first. Use handlePrevious() to back
    // us up to a known break position before the specified position (if
    // we can determine that the specified position is a break position,
    // we don't back up at all). This may or may not be the last break
    // position at or before our starting position. Advance forward
    // from here until we've passed the starting position. The position
    // we stop on will be the first break position after the specified one.
    // old rule syntax

    fText.setIndex(offset);
    if (offset == fText.getBeginIndex()) {
        return handleNext();
    }
    result = previous();

    while (result != BreakIterator.DONE && result <= offset) {
        result = next();
    }

    return result;
}
0544:
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 * Three strategies are used, depending on which tables the rule data
 * provides: the safe-forward table, the safe-reverse table, or (old rule
 * syntax) a plain previous() from the given offset.
 * @param offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 * @stable ICU 2.0
 */
public int preceding(int offset) {
    // if there is no text, or the offset passed in is past the end of the
    // text, return the result of last(); if it's before the beginning,
    // return the text's starting offset
    if (fText == null || offset > fText.getEndIndex()) {
        // return BreakIterator::DONE;
        return last();
    } else if (offset < fText.getBeginIndex()) {
        return first();
    }

    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation

    int result;
    if (fRData.fSFTable != null) {
        /// todo synwee
        // new rule syntax
        fText.setIndex(offset);
        // move backwards one codepoint to prepare for moving forwards to a
        // safe point.
        // this handles offset being between the two halves of a
        // supplementary character
        CIPrevious32(fText);
        handleNext(fRData.fSFTable);
        result = previous();
        // Step backward until we are strictly before the requested offset.
        while (result >= offset) {
            result = previous();
        }
        return result;
    }
    if (fRData.fSRTable != null) {
        // backup plan if forward safe table is not available
        fText.setIndex(offset);
        CINext32(fText);
        // handle previous will give result <= offset
        handlePrevious(fRData.fSRTable);

        // next will give result 0 or 1 boundary away from offset,
        // most of the time; walk forwards until we reach a boundary at or
        // after offset, remembering the last boundary that was before it.
        int oldresult = next();
        while (oldresult < offset) {
            result = next();
            if (result >= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        // next() already landed at or after offset; step back again.
        result = previous();
        if (result >= offset) {
            return previous();
        }
        return result;
    }

    // old rule syntax
    fText.setIndex(offset);
    return previous();
}
0613:
0614: /**
0615: * Throw IllegalArgumentException unless begin <= offset < end.
0616: * @stable ICU 2.0
0617: */
0618: protected static final void checkOffset(int offset,
0619: CharacterIterator text) {
0620: if (offset < text.getBeginIndex()
0621: || offset > text.getEndIndex()) {
0622: throw new IllegalArgumentException("offset out of bounds");
0623: }
0624: }
0625:
0626: /**
0627: * Returns true if the specfied position is a boundary position. As a side
0628: * effect, leaves the iterator pointing to the first boundary position at
0629: * or after "offset".
0630: * @param offset the offset to check.
0631: * @return True if "offset" is a boundary position.
0632: * @stable ICU 2.0
0633: */
0634: public boolean isBoundary(int offset) {
0635: checkOffset(offset, fText);
0636:
0637: // the beginning index of the iterator is always a boundary position by definition
0638: if (offset == fText.getBeginIndex()) {
0639: first(); // For side effects on current position, tag values.
0640: return true;
0641: }
0642:
0643: if (offset == fText.getEndIndex()) {
0644: last(); // For side effects on current position, tag values.
0645: return true;
0646: }
0647:
0648: // otherwise, we can use following() on the position before the specified
0649: // one and return true if the position we get back is the one the user
0650: // specified
0651:
0652: // return following(offset - 1) == offset;
0653: // TODO: check whether it is safe to revert to the simpler offset-1 code
0654: // The safe rules may take care of unpaired surrogates ok.
0655: fText.setIndex(offset);
0656: CIPrevious32(fText);
0657: int pos = fText.getIndex();
0658: boolean result = following(pos) == offset;
0659: return result;
0660: }
0661:
0662: /**
0663: * Returns the current iteration position.
0664: * @return The current iteration position.
0665: * @stable ICU 2.0
0666: */
0667: public int current() {
0668: return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
0669: }
0670:
0671: private void makeRuleStatusValid() {
0672: if (fLastStatusIndexValid == false) {
0673: // No cached status is available.
0674: if (fText == null || current() == fText.getBeginIndex()) {
0675: // At start of text, or there is no text. Status is always zero.
0676: fLastRuleStatusIndex = 0;
0677: fLastStatusIndexValid = true;
0678: } else {
0679: // Not at start of text. Find status the tedious way.
0680: int pa = current();
0681: previous();
0682: int pb = next();
0683: Assert.assrt(pa == pb);
0684: }
0685: Assert.assrt(fLastStatusIndexValid == true);
0686: Assert
0687: .assrt(fLastRuleStatusIndex >= 0
0688: && fLastRuleStatusIndex < fRData.fStatusTable.length);
0689: }
0690: }
0691:
0692: /**
0693: * Return the status tag from the break rule that determined the most recently
0694: * returned break position. The values appear in the rule source
0695: * within brackets, {123}, for example. For rules that do not specify a
0696: * status, a default value of 0 is returned. If more than one rule applies,
0697: * the numerically largest of the possible status values is returned.
0698: * <p>
0699: * Of the standard types of ICU break iterators, only the word break
0700: * iterator provides status values. The values are defined in
0701: * class RuleBasedBreakIterator, and allow distinguishing between words
0702: * that contain alphabetic letters, "words" that appear to be numbers,
0703: * punctuation and spaces, words containing ideographic characters, and
0704: * more. Call <code>getRuleStatus</code> after obtaining a boundary
0705: * position from <code>next()<code>, <code>previous()</code>, or
0706: * any other break iterator functions that returns a boundary position.
0707: * <p>
0708: * @return the status from the break rule that determined the most recently
0709: * returned break position.
0710: *
0711: * @draft ICU 3.0
0712: * @provisional This is a draft API and might change in a future release of ICU.
0713: */
0714:
0715: public int getRuleStatus() {
0716: makeRuleStatusValid();
0717: // Status records have this form:
0718: // Count N <-- fLastRuleStatusIndex points here.
0719: // Status val 0
0720: // Status val 1
0721: // ...
0722: // Status val N-1 <-- the value we need to return
0723: // The status values are sorted in ascending order.
0724: // This function returns the last (largest) of the array of status values.
0725: int idx = fLastRuleStatusIndex
0726: + fRData.fStatusTable[fLastRuleStatusIndex];
0727: int tagVal = fRData.fStatusTable[idx];
0728:
0729: return tagVal;
0730: }
0731:
0732: /**
0733: * Get the status (tag) values from the break rule(s) that determined the most
0734: * recently returned break position. The values appear in the rule source
0735: * within brackets, {123}, for example. The default status value for rules
0736: * that do not explicitly provide one is zero.
0737: * <p>
0738: * The status values used by the standard ICU break rules are defined
0739: * as public constants in class RuleBasedBreakIterator.
0740: * <p>
0741: * If the size of the output array is insufficient to hold the data,
0742: * the output will be truncated to the available length. No exception
0743: * will be thrown.
0744: *
0745: * @param fillInArray an array to be filled in with the status values.
0746: * @return The number of rule status values from rules that determined
0747: * the most recent boundary returned by the break iterator.
0748: * In the event that the array is too small, the return value
0749: * is the total number of status values that were available,
0750: * not the reduced number that were actually returned.
0751: * @draft ICU 3.0
0752: * @provisional This is a draft API and might change in a future release of ICU.
0753: */
0754: public int getRuleStatusVec(int[] fillInArray) {
0755: makeRuleStatusValid();
0756: int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
0757: if (fillInArray != null) {
0758: int numToCopy = Math.min(numStatusVals, fillInArray.length);
0759: for (int i = 0; i < numToCopy; i++) {
0760: fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex
0761: + i + 1];
0762: }
0763: }
0764: return numStatusVals;
0765: }
0766:
0767: /**
0768: * Return a CharacterIterator over the text being analyzed. This version
0769: * of this method returns the actual CharacterIterator we're using internally.
0770: * Changing the state of this iterator can have undefined consequences. If
0771: * you need to change it, clone it first.
0772: * @return An iterator over the text being analyzed.
0773: * @stable ICU 2.0
0774: */
0775: public CharacterIterator getText() {
0776: return fText;
0777: }
0778:
0779: /**
0780: * Set the iterator to analyze a new piece of text. This function resets
0781: * the current iteration position to the beginning of the text.
0782: * @param newText An iterator over the text to analyze.
0783: * @stable ICU 2.0
0784: */
0785: public void setText(CharacterIterator newText) {
0786: fText = newText;
0787: this .first();
0788: }
0789:
/**
 * Control debug, trace and dump options.
 * Holds the value of the "U_RBBIDEBUG" system property as read when this
 * class is initialized; null when the property is not set.
 * @internal
 * @deprecated This API is ICU internal only.
 */
protected static String fDebugEnv = System
        .getProperty("U_RBBIDEBUG");

// 32 bit Char value returned from the CI*32 helpers when an iterator has run out of range.
// Positive value so fast case (not end, not surrogate) can be checked
// with a single test.
private static int CI_DONE32 = 0x7fffffff;
0802:
/**
 * Move the iterator forward to the next code point, and return that code point,
 * leaving the iterator positioned at char returned.
 * For Supplementary chars, the iterator is left positioned at the lead surrogate.
 * @param ci The character iterator
 * @return The next code point, or CI_DONE32 when the iterator has moved
 *         past the end of the text.
 */
static int CINext32(CharacterIterator ci) {
    // If the current position is at a surrogate pair, move to the trail surrogate
    // which leaves it in position for underlying iterator's next() to work.
    int c = ci.current();
    if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE
            && c <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        c = ci.next();
        if (c < UTF16.TRAIL_SURROGATE_MIN_VALUE
                || c > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
            // Unpaired lead surrogate: undo the step onto the following char.
            c = ci.previous();
        }
    }

    // For BMP chars, this next() is the real deal.
    c = ci.next();

    // If we might have a lead surrogate, we need to peek ahead to get the trail
    // even though we don't want to really be positioned there.
    // (This test is also true for CharacterIterator.DONE, which CINextTrail32
    // maps to CI_DONE32 when the iterator is at the end.)
    if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
        c = CINextTrail32(ci, c);
    }

    if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
        // We got a supplementary char. Back the iterator up to the position
        // of the lead surrogate.
        ci.previous();
    }
    return c;
}
0839:
// Out-of-line portion of the in-line Next32 code.
// The call site does an initial ci.next() and calls this function
// if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
// Returns the supplementary code point when "lead" is followed by a trail
// surrogate, CI_DONE32 when "lead" is the DONE value and the iterator is at
// or past the end of the text, and "lead" unchanged otherwise.
// NOTE: we leave the underlying char iterator positioned in the
// middle of a surrogate pair. ci.next() will work correctly
// from there, but the ci.getIndex() will be wrong, and needs
// adjustment.
private static int CINextTrail32(CharacterIterator ci, int lead) {
    int retVal = lead;
    if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        // "lead" really is a lead surrogate; look at the char after it.
        char cTrail = ci.next();
        if (UTF16.isTrailSurrogate(cTrail)) {
            retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
                    + (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
                    + UTF16.SUPPLEMENTARY_MIN_VALUE;
        } else {
            // Unpaired lead surrogate: undo the look-ahead.
            ci.previous();
        }
    } else {
        // Distinguish a genuine end of text from a DONE char value
        // occurring inside the text.
        if (lead == CharacterIterator.DONE
                && ci.getIndex() >= ci.getEndIndex()) {
            retVal = CI_DONE32;
        }
    }
    return retVal;
}
0866:
0867: private static int CIPrevious32(CharacterIterator ci) {
0868: if (ci.getIndex() <= ci.getBeginIndex()) {
0869: return CI_DONE32;
0870: }
0871: char trail = ci.previous();
0872: int retVal = trail;
0873: if (UTF16.isTrailSurrogate(trail)
0874: && ci.getIndex() > ci.getBeginIndex()) {
0875: char lead = ci.previous();
0876: if (UTF16.isLeadSurrogate(lead)) {
0877: retVal = (((int) lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
0878: + ((int) trail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
0879: + UTF16.SUPPLEMENTARY_MIN_VALUE;
0880: } else {
0881: ci.next();
0882: }
0883: }
0884: return retVal;
0885: }
0886:
0887: static int CICurrent32(CharacterIterator ci) {
0888: char lead = ci.current();
0889: int retVal = lead;
0890: if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
0891: return retVal;
0892: }
0893: if (UTF16.isLeadSurrogate(lead)) {
0894: int trail = (int) ci.next();
0895: ci.previous();
0896: if (UTF16.isTrailSurrogate((char) trail)) {
0897: retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
0898: + (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
0899: + UTF16.SUPPLEMENTARY_MIN_VALUE;
0900: }
0901: } else {
0902: if (lead == CharacterIterator.DONE) {
0903: if (ci.getIndex() >= ci.getEndIndex()) {
0904: retVal = CI_DONE32;
0905: }
0906: }
0907: }
0908: return retVal;
0909: }
0910:
0911: //-----------------------------------------------------------------------------------
0912: //
0913: // handleNext(void) All forward iteration vectors through this function.
0914: // NOTE: This function is overridden by the dictionary base break iterator.
0915: // User level API functions go to the dbbi implementation
0916: // when the break iterator type is dbbi.
0917: // The DBBI implementation sometimes explicitly calls back to here,
0918: // its inherited handleNext().
0919: //
0920: //-----------------------------------------------------------------------------------
0921: int handleNext() {
0922: return handleNext(fRData.fFTable);
0923: }
0924:
/**
 * The State Machine Engine for moving forward is here.
 * This function is the heart of the RBBI run time engine.
 *
 * Starting from the current position of fText, the characters are fed
 * one code point at a time through the given state table until the machine
 * reaches STOP_STATE or the end of the text. The position of the last
 * accepting state seen (or of a completed look-ahead match) becomes the
 * result. fLastRuleStatusIndex is set to the tag index of the rule that
 * produced the result, and fLastStatusIndexValid is set to true.
 *
 * A note on supplementary characters and the position of underlying
 * Java CharacterIterator: Normally, a character iterator is positioned at
 * the char most recently returned by next(). Within this function, when
 * a supplementary char is being processed, the char iterator is left
 * sitting on the trail surrogate, in the middle of the code point.
 * This is different from everywhere else, where an iterator always
 * points at the lead surrogate of a supplementary.
 *
 * @param stateTable the state table to run; the flags word at
 *                   RBBIDataWrapper.FLAGS + 1 is read before anything else,
 *                   so the table must not be null
 * @return the new iterator position, or BreakIterator.DONE if there is no
 *         text or the iterator is already at the end of the text
 */
private int handleNext(short stateTable[]) {
    int state;
    short category = 0;
    int mode;
    int row;
    int c;
    int lookaheadStatus = 0;
    int lookaheadTagIdx = 0;
    int result = 0;
    int initialPosition = 0;
    int lookaheadResult = 0;
    // When set in the table flags, a completed look-ahead match returns
    // immediately instead of letting other rules try to match further.
    boolean lookAheadHardBreak = (stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;

    if (fTrace) {
        System.out
                .println("Handle Next pos char state category");
    }

    // No matter what, handleNext always correctly sets the break tag value.
    fLastStatusIndexValid = true;
    fLastRuleStatusIndex = 0;

    // If there is no text to iterate over, return DONE.
    if (fText == null) {
        fLastRuleStatusIndex = 0;
        return BreakIterator.DONE;
    }

    // Set up the starting char
    initialPosition = fText.getIndex();
    result = initialPosition;
    c = fText.current();
    if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
        // Either a surrogate (combine it with its trail) or possibly the
        // iterator's DONE value; CINextTrail32 sorts out which.
        c = CINextTrail32(fText, c);
        if (c == CI_DONE32) {
            // Already at the end of the text: nothing to iterate over.
            fLastRuleStatusIndex = 0;
            return BreakIterator.DONE;
        }
    }

    // Set the initial state for the state machine
    state = START_STATE;
    row = fRData.getRowIndex(state);
    category = 3;
    mode = RBBI_RUN;
    if ((stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
        // The rules ask for a beginning-of-input iteration: run the first
        // pass of the loop with the preset category 2 instead of the
        // category of a real character.
        category = 2;
        mode = RBBI_START;
    }

    // loop until we reach the end of the text or transition to state 0
    while (state != STOP_STATE) {
        if (c == CI_DONE32) {
            // Reached end of input string.
            if (mode == RBBI_END) {
                // We have already run the loop one last time with the
                // character set to the pseudo {eof} value. Now it is time
                // to unconditionally bail out.

                if (lookaheadResult > result) {
                    // We ran off the end of the string with a pending
                    // look-ahead match.
                    // Treat this as if the look-ahead condition had been
                    // met, and return
                    // the match at the / position from the look-ahead rule.
                    result = lookaheadResult;
                    fLastRuleStatusIndex = lookaheadTagIdx;
                    lookaheadStatus = 0;
                } else if (result == initialPosition) {
                    // Ran off end, no match found.
                    // move forward one
                    // (result itself is left unchanged here; the check after
                    // the loop repeats this step and then updates result.)
                    fText.setIndex(initialPosition);
                    CINext32(fText);
                }
                break;
            }
            // Run the loop one last time with the fake end-of-input character category
            mode = RBBI_END;
            category = 1;
        }

        // Get the char category. An incoming category of 1 or 2 means that
        // we are preset for doing the beginning or end of input, and
        // that we shouldn't get a category from an actual text input character.
        //
        if (mode == RBBI_RUN) {
            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            //
            category = (short) fRData.fTrie.getCodePointValue(c);

            // Check the dictionary bit in the character's category.
            // Counter is only used by dictionary based iterators (subclasses).
            // Chars that need to be handled by a dictionary have a flag bit set
            // in their category values.
            //
            if ((category & 0x4000) != 0) {
                fDictionaryCharCount++;
                // And off the dictionary flag bit.
                category &= ~0x4000;
            }
        }

        if (fTrace) {
            System.out.print(" "
                    + RBBIDataWrapper.intToString(fText.getIndex(),
                            5));
            System.out.print(RBBIDataWrapper.intToHexString(c, 10));
            System.out.println(RBBIDataWrapper
                    .intToString(state, 7)
                    + RBBIDataWrapper.intToString(category, 6));
        }

        // look up a state transition in the state table
        // state = row->fNextState[category];
        state = stateTable[row + RBBIDataWrapper.NEXTSTATES
                + category];
        row = fRData.getRowIndex(state);

        // Advance to the next character.
        // If this is a beginning-of-input loop iteration, don't advance.
        // The next iteration will be processing the first real input character.
        if (mode == RBBI_RUN) {
            c = (int) fText.next();
            if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                c = CINextTrail32(fText, c);
            }
        } else {
            if (mode == RBBI_START) {
                mode = RBBI_RUN;
            }
        }

        // Note: from here on fText has already moved past the character that
        // caused the transition, so fText.getIndex() is the position just
        // after that character.
        if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
            // Match found, common case
            result = fText.getIndex();
            if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
                    && c != CI_DONE32) {
                // The iterator has been left in the middle of a surrogate pair.
                // We want the start of it.
                result--;
            }

            // Remember the break status (tag) values.
            fLastRuleStatusIndex = stateTable[row
                    + RBBIDataWrapper.TAGIDX];
        }

        if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
            if (lookaheadStatus != 0
                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                // Lookahead match is completed. Set the result accordingly, but only
                // if no other rule has matched further in the mean time.
                result = lookaheadResult;
                fLastRuleStatusIndex = lookaheadTagIdx;
                lookaheadStatus = 0;
                // TODO: make a standalone hard break in a rule work.
                if (lookAheadHardBreak) {
                    // Note: this early return skips the fText.setIndex(result)
                    // done at the bottom of the function.
                    return result;
                }
                // Look-ahead completed, but other rules may match further. Continue on.
                // TODO: junk this feature? I don't think it's used anywhere.
                continue;
            }

            // Hit a possible look-ahead match: remember the position, the
            // look-ahead id to wait for, and the rule's tag index.
            lookaheadResult = fText.getIndex();
            if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE
                    && c != CI_DONE32) {
                // The iterator has been left in the middle of a surrogate pair.
                // We want the beginning of it.
                lookaheadResult--;
            }
            lookaheadStatus = stateTable[row
                    + RBBIDataWrapper.LOOKAHEAD];
            lookaheadTagIdx = stateTable[row
                    + RBBIDataWrapper.TAGIDX];
            continue;
        }

        if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
            // Because this is an accepting state, any in-progress look-ahead match
            // is no longer relevant. Clear out the pending lookahead status.
            lookaheadStatus = 0;
        }

    } // End of state machine main loop

    // The state machine is done. Check whether it found a match...

    // If the iterator failed to advance in the match engine, force it ahead by one.
    // (This really indicates a defect in the break rules. They should always match
    // at least one character.)
    if (result == initialPosition) {
        // The value stored into result on the next line is overwritten two
        // lines below; only the repositioning of fText matters.
        result = fText.setIndex(initialPosition);
        CINext32(fText);
        result = fText.getIndex();
    }

    // Leave the iterator at our result position.
    // (we may have advanced beyond the last accepting position chasing after
    // longer matches that never completed.)
    fText.setIndex(result);
    if (fTrace) {
        System.out.println("result = " + result);
    }
    return result;
}
1147:
1148: private int handlePrevious(short stateTable[]) {
1149: int state;
1150: int category = 0;
1151: int mode;
1152: int row;
1153: int c;
1154: int lookaheadStatus = 0;
1155: int result = 0;
1156: int initialPosition = 0;
1157: int lookaheadResult = 0;
1158: boolean lookAheadHardBreak = (stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1159:
1160: if (fText == null || stateTable == null) {
1161: return 0;
1162: }
1163: // handlePrevious() never gets the rule status.
1164: // Flag the status as invalid; if the user ever asks for status, we will need
1165: // to back up, then re-find the break position using handleNext(), which does
1166: // get the status value.
1167: fLastStatusIndexValid = false;
1168: fLastRuleStatusIndex = 0;
1169:
1170: // set up the starting char
1171: initialPosition = fText.getIndex();
1172: result = initialPosition;
1173: c = CIPrevious32(fText);
1174:
1175: // Set up the initial state for the state machine
1176: state = START_STATE;
1177: row = fRData.getRowIndex(state);
1178: category = 3; // TODO: obsolete? from the old start/run mode scheme?
1179: mode = RBBI_RUN;
1180: if ((stateTable[RBBIDataWrapper.FLAGS + 1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1181: category = 2;
1182: mode = RBBI_START;
1183: }
1184:
1185: if (fTrace) {
1186: System.out
1187: .println("Handle Prev pos char state category ");
1188: }
1189:
1190: // loop until we reach the beginning of the text or transition to state 0
1191: //
1192: mainLoop: for (;;) {
1193: innerBlock: {
1194: if (c == CI_DONE32) {
1195: // Reached end of input string.
1196: if (mode == RBBI_END
1197: || fRData.fHeader.fVersion == 1) {
1198: // Either this is the old (ICU 3.2 and earlier) format data which
1199: // does not support explicit support for matching {eof}, or
1200: // we have already done the {eof} iteration. Now is the time
1201: // to unconditionally bail out.
1202: if (lookaheadResult < result) {
1203: // We ran off the end of the string with a pending look-ahead match.
1204: // Treat this as if the look-ahead condition had been met, and return
1205: // the match at the / position from the look-ahead rule.
1206: result = lookaheadResult;
1207: lookaheadStatus = 0;
1208: } else if (result == initialPosition) {
1209: // Ran off start, no match found.
1210: // Move one position (towards the start, since we are doing previous.)
1211: fText.setIndex(initialPosition);
1212: CIPrevious32(fText);
1213: }
1214: break mainLoop;
1215: }
1216: mode = RBBI_END;
1217: category = 1;
1218: }
1219:
1220: if (mode == RBBI_RUN) {
1221: // look up the current character's category, which tells us
1222: // which column in the state table to look at.
1223: //
1224: category = (short) fRData.fTrie
1225: .getCodePointValue(c);
1226:
1227: // Check the dictionary bit in the character's category.
1228: // Counter is only used by dictionary based iterators (subclasses).
1229: // Chars that need to be handled by a dictionary have a flag bit set
1230: // in their category values.
1231: //
1232: if ((category & 0x4000) != 0) {
1233: fDictionaryCharCount++;
1234: // And off the dictionary flag bit.
1235: category &= ~0x4000;
1236: }
1237: }
1238:
1239: if (fTrace) {
1240: System.out.print(" " + fText.getIndex()
1241: + " ");
1242: if (0x20 <= c && c < 0x7f) {
1243: System.out.print(" " + c + " ");
1244: } else {
1245: System.out.print(" " + Integer.toHexString(c)
1246: + " ");
1247: }
1248: System.out.println(" " + state + " " + category
1249: + " ");
1250: }
1251:
1252: // State Transition - move machine to its next state
1253: //
1254: state = stateTable[row + RBBIDataWrapper.NEXTSTATES
1255: + category];
1256: row = fRData.getRowIndex(state);
1257:
1258: if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1259: // Match found, common case, could have lookahead so we move
1260: // on to check it
1261: result = fText.getIndex();
1262: }
1263:
1264: if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1265: if (lookaheadStatus != 0
1266: && stateTable[row
1267: + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1268: // Lookahead match is completed. Set the result
1269: // accordingly, but only
1270: // if no other rule has matched further in the mean
1271: // time.
1272: result = lookaheadResult;
1273: lookaheadStatus = 0;
1274: // TODO: make a standalone hard break in a rule work.
1275:
1276: if (lookAheadHardBreak) {
1277: break mainLoop;
1278: }
1279: // Look-ahead completed, but other rules may match further.
1280: // Continue on.
1281: // TODO: junk this feature? I don't think that it's used anywhere.
1282: break innerBlock;
1283: }
1284: // Hit a possible look-ahead match. We are at the
1285: // position of the '/'. Remember this position.
1286: lookaheadResult = fText.getIndex();
1287: lookaheadStatus = stateTable[row
1288: + RBBIDataWrapper.LOOKAHEAD];
1289: break innerBlock;
1290: }
1291:
1292: // not lookahead...
1293: if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1294: // This is a plain (non-look-ahead) accepting state.
1295: if (!lookAheadHardBreak) {
1296: // Clear out any pending look-ahead matches,
1297: // but only if not doing the lookAheadHardBreak option
1298: // which needs to force a break no matter what is going
1299: // on with the rest of the match, i.e. we can't abandon
1300: // a partially completed look-ahead match because
1301: // some other rule matched further than the '/' position
1302: // in the look-ahead match.
1303: lookaheadStatus = 0;
1304: }
1305: }
1306:
1307: } // end of innerBlock. "break innerBlock" in above code comes out here.
1308:
1309: if (state == STOP_STATE) {
1310: // Normal loop exit is here
1311: break mainLoop;
1312: }
1313:
1314: // then move iterator position backwards one character
1315: //
1316: if (mode == RBBI_RUN) {
1317: c = CIPrevious32(fText);
1318: } else {
1319: if (mode == RBBI_START) {
1320: mode = RBBI_RUN;
1321: }
1322: }
1323:
1324: } // End of the main loop.
1325:
1326: // The state machine is done. Check whether it found a match...
1327: //
1328: // If the iterator failed to advance in the match engine, force it ahead by one.
1329: // (This really indicates a defect in the break rules. They should always match
1330: // at least one character.)
1331: if (result == initialPosition) {
1332: result = fText.setIndex(initialPosition);
1333: CIPrevious32(fText);
1334: result = fText.getIndex();
1335: }
1336:
1337: fText.setIndex(result);
1338: if (fTrace) {
1339: System.out.println("Result = " + result);
1340: }
1341:
1342: return result;
1343: }
1344:
1345: //-------------------------------------------------------------------------------
1346:
1347: //
1348:
1349: // isDictionaryChar Return true if the category lookup for this char
1350:
1351: // indicates that it is in the set of dictionary lookup
1352:
1353: // chars.
1354:
1355: //
1356:
1357: // This function is intended for use by dictionary based
1358:
1359: // break iterators.
1360:
1361: //
1362:
1363: //-------------------------------------------------------------------------------
1364:
1365: boolean isDictionaryChar(int c) {
1366:
1367: short category = (short) fRData.fTrie.getCodePointValue(c);
1368:
1369: return (category & 0x4000) != 0;
1370:
1371: }
1372:
1373: }
1374: //eof
|