0001: /*
0002: *
0003: * @(#)RuleBasedBreakIterator.java 1.18 06/10/11
0004: *
0005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
0006: * Reserved. Use is subject to license terms.
0007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
0008: *
0009: * This program is free software; you can redistribute it and/or
0010: * modify it under the terms of the GNU General Public License version
0011: * 2 only, as published by the Free Software Foundation.
0012: *
0013: * This program is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of
0015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
0016: * General Public License version 2 for more details (a copy is
0017: * included at /legal/license.txt).
0018: *
0019: * You should have received a copy of the GNU General Public License
0020: * version 2 along with this work; if not, write to the Free Software
0021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
0022: * 02110-1301 USA
0023: *
0024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
0025: * Clara, CA 95054 or visit www.sun.com if you need additional
0026: * information or have any questions.
0027: */
0028:
0029: /*
0030: * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
0031: * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
0032: *
0033: * The original version of this source code and documentation
0034: * is copyrighted and owned by Taligent, Inc., a wholly-owned
0035: * subsidiary of IBM. These materials are provided under terms
0036: * of a License Agreement between Taligent and Sun. This technology
0037: * is protected by multiple US and International patents.
0038: *
0039: * This notice and attribution to Taligent may not be removed.
0040: * Taligent is a registered trademark of Taligent, Inc.
0041: */
0042:
0043: package java.text;
0044:
0045: import java.util.Vector;
0046: import java.util.Stack;
0047: import java.util.Hashtable;
0048: import java.util.Enumeration;
0049: import java.text.CharacterIterator;
0050: import java.text.StringCharacterIterator;
0051: import sun.text.CompactByteArray;
0052:
0053: /**
0054: * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
0055: *
0056: * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
0057: * and <i>regular expressions.</i></p>
0058: *
0059: * <p>A substitution rule defines a name that can be used in place of an expression. It
0060: * consists of a name, which is a string of characters contained in angle brackets, an equals
0061: * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
0062: * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
0063: * square brackets. A substitution is visible after its definition, and is filled in using
0064: * simple textual substitution. Substitution definitions can contain other substitutions, as
0065: * long as those substitutions have been defined first. Substitutions are generally used to
0066: * make the regular expressions (which can get quite complex) shorted and easier to read.
0067: * They typically define either character categories or commonly-used subexpressions.</p>
0068: *
0069: * <p>There is one special substitution. If the description defines a substitution
0070: * called "<ignore>", the expression must be a [] expression, and the
0071: * expression defines a set of characters (the "<em>ignore characters</em>") that
0072: * will be transparent to the BreakIterator. A sequence of characters will break the
0073: * same way it would if any ignore characters it contains are taken out. Break
0074: * positions never occur befoer ignore characters.</p>
0075: *
0076: * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
0077: * defines a sequence of characters to be kept together. With one significant exception, the
0078: * iterator uses a longest-possible-match algorithm when matching text to regular
0079: * expressions. The iterator also treats descriptions containing multiple regular expressions
0080: * as if they were ORed together (i.e., as if they were separated by |).</p>
0081: *
0082: * <p>The special characters recognized by the regular-expression parser are as follows:</p>
0083: *
0084: * <blockquote>
0085: * <table border="1" width="100%">
0086: * <tr>
0087: * <td width="6%">*</td>
0088: * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
0089: * of times (including not at all).</td>
0090: * </tr>
0091: * <tr>
0092: * <td width="6%">{}</td>
0093: * <td width="94%">Encloses a sequence of characters that is optional.</td>
0094: * </tr>
0095: * <tr>
0096: * <td width="6%">()</td>
0097: * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
0098: * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
0099: * the ends of expressions containing |.</td>
0100: * </tr>
0101: * <tr>
0102: * <td width="6%">|</td>
0103: * <td width="94%">Separates two alternative sequences of characters. Either one
0104: * sequence or the other, but not both, matches this expression. The | character can
0105: * only occur inside ().</td>
0106: * </tr>
0107: * <tr>
0108: * <td width="6%">.</td>
0109: * <td width="94%">Matches any character.</td>
0110: * </tr>
0111: * <tr>
0112: * <td width="6%">*?</td>
0113: * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
0114: * when there is overlap between the last group of characters in the expression preceding the
0115: * * and the first group of characters following the *. When there is this kind of
0116: * overlap, * will match the longest sequence of characters that match the expression before
0117: * the *, and *? will match the shortest sequence of characters matching the expression
0118: * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
0119: * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",
0120: * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
0121: * </tr>
0122: * <tr>
0123: * <td width="6%">[]</td>
0124: * <td width="94%">Specifies a group of alternative characters. A [] expression will
0125: * match any single character that is specified in the [] expression. For more on the
0126: * syntax of [] expressions, see below.</td>
0127: * </tr>
0128: * <tr>
0129: * <td width="6%">/</td>
0130: * <td width="94%">Specifies where the break position should go if text matches this
0131: * expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run
0132: * of letters, followed by a run of whitespace, followed by a digit, but the break position
0133: * will actually go before the whitespace). Expressions that don't contain / put the
0134: * break position at the end of the matching text.</td>
0135: * </tr>
0136: * <tr>
0137: * <td width="6%">\</td>
0138: * <td width="94%">Escape character. The \ itself is ignored, but causes the next
0139: * character to be treated as literal character. This has no effect for many
0140: * characters, but for the characters listed above, this deprives them of their special
0141: * meaning. (There are no special escape sequences for Unicode characters, or tabs and
0142: * newlines; these are all handled by a higher-level protocol. In a Java string,
0143: * "\n" will be converted to a literal newline character by the time the
0144: * regular-expression parser sees it. Of course, this means that \ sequences that are
0145: * visible to the regexp parser must be written as \\ when inside a Java string.) All
0146: * characters in the ASCII range except for letters, digits, and control characters are
0147: * reserved characters to the parser and must be preceded by \ even if they currently don't
0148: * mean anything.</td>
0149: * </tr>
0150: * <tr>
0151: * <td width="6%">!</td>
0152: * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
0153: * parser that this expression specifies the backwards-iteration behavior of the iterator,
0154: * and not its normal iteration behavior. This is generally only used in situations
0155: * where the automatically-generated backwards-iteration brhavior doesn't produce
0156: * satisfactory results and must be supplemented with extra client-specified rules.</td>
0157: * </tr>
0158: * <tr>
0159: * <td width="6%"><em>(all others)</em></td>
0160: * <td width="94%">All other characters are treated as literal characters, which must match
0161: * the corresponding character(s) in the text exactly.</td>
0162: * </tr>
0163: * </table>
0164: * </blockquote>
0165: *
0166: * <p>Within a [] expression, a number of other special characters can be used to specify
0167: * groups of characters:</p>
0168: *
0169: * <blockquote>
0170: * <table border="1" width="100%">
0171: * <tr>
0172: * <td width="6%">-</td>
0173: * <td width="94%">Specifies a range of matching characters. For example
0174: * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
0175: * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
0176: * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
0177: * it include accented letters such as a-umlaut.</td>
0178: * </tr>
0179: * <tr>
0180: * <td width="6%">::</td>
0181: * <td width="94%">A pair of colons containing a one- or two-letter code matches all
0182: * characters in the corresponding Unicode category. The two-letter codes are the same
0183: * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
0184: * matches all currency symbols and all math symbols). Specifying a one-letter code is
0185: * the same as specifying all two-letter codes that begin with that letter (for example,
0186: * "[:L:]" matches all letters, and is equivalent to
0187: * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
0188: * category code or a single letter that begins a Unicode category code is illegal within
0189: * colons.</td>
0190: * </tr>
0191: * <tr>
0192: * <td width="6%">[]</td>
0193: * <td width="94%">[] expressions can nest. This has no effect, except when used in
0194: * conjunction with the ^ token.</td>
0195: * </tr>
0196: * <tr>
0197: * <td width="6%">^</td>
0198: * <td width="94%">Excludes the character (or the characters in the [] expression) following
0199: * it from the group of characters. For example, "[a-z^p]" matches all Latin
0200: * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
0201: * except the Han ideographs.</td>
0202: * </tr>
0203: * <tr>
0204: * <td width="6%"><em>(all others)</em></td>
0205: * <td width="94%">All other characters are treated as literal characters. (For
0206: * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
0207: * </tr>
0208: * </table>
0209: * </blockquote>
0210: *
0211: * <p>For a more complete explanation, see <a
0212: * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
0213: * For examples, see the resource data (which is annotated).</p>
0214: *
0215: * @author Richard Gillam
0216: * @version $RCSFile$ $Revision: 1.1 $ $Date: 1998/11/05 19:32:04 $
0217: */
0218: class RuleBasedBreakIterator extends BreakIterator {
0219:
0220: /**
0221: * A token used as a character-category value to identify ignore characters
0222: */
0223: protected static final byte IGNORE = -1;
0224:
0225: /**
0226: * The state number of the starting state
0227: */
0228: private static final short START_STATE = 1;
0229:
0230: /**
0231: * The state-transition value indicating "stop"
0232: */
0233: private static final short STOP_STATE = 0;
0234:
0235: /**
0236: * The textual description this iterator was created from
0237: */
0238: private String description;
0239:
0240: /**
0241: * A table that indexes from character values to character category numbers
0242: */
0243: private CompactByteArray charCategoryTable = null;
0244:
0245: /**
0246: * The table of state transitions used for forward iteration
0247: */
0248: private short[] stateTable = null;
0249:
0250: /**
0251: * The table of state transitions used to sync up the iterator with the
0252: * text in backwards and random-access iteration
0253: */
0254: private short[] backwardsStateTable = null;
0255:
0256: /**
0257: * A list of flags indicating which states in the state table are accepting
0258: * ("end") states
0259: */
0260: private boolean[] endStates = null;
0261:
0262: /**
0263: * A list of flags indicating which states in the state table are
0264: * lookahead states (states which turn lookahead on and off)
0265: */
0266: private boolean[] lookaheadStates = null;
0267:
0268: /**
0269: * The number of character categories (and, thus, the number of columns in
0270: * the state tables)
0271: */
0272: private int numCategories;
0273:
0274: /**
0275: * The character iterator through which this BreakIterator accesses the text
0276: */
0277: private CharacterIterator text = null;
0278:
0279: //=======================================================================
0280: // constructors
0281: //=======================================================================
0282:
0283: /**
0284: * Constructs a RuleBasedBreakIterator according to the description
0285: * provided. If the description is malformed, throws an
0286: * IllegalArgumentException. Normally, instead of constructing a
0287: * RuleBasedBreakIterator directory, you'll use the factory methods
0288: * on BreakIterator to create one indirectly from a description
0289: * in the framework's resource files. You'd use this when you want
0290: * special behavior not provided by the built-in iterators.
0291: */
0292: public RuleBasedBreakIterator(String description) {
0293: this .description = description;
0294:
0295: // the actual work is done by the Builder class
0296: Builder builder = makeBuilder();
0297: builder.buildBreakIterator();
0298: }
0299:
0300: /**
0301: * Creates a Builder.
0302: */
0303: protected Builder makeBuilder() {
0304: return new Builder();
0305: }
0306:
0307: //=======================================================================
0308: // boilerplate
0309: //=======================================================================
0310: /**
0311: * Clones this iterator.
0312: * @return A newly-constructed RuleBasedBreakIterator with the same
0313: * behavior as this one.
0314: */
0315: public Object clone() {
0316: RuleBasedBreakIterator result = (RuleBasedBreakIterator) super
0317: .clone();
0318: if (text != null) {
0319: result.text = (CharacterIterator) text.clone();
0320: }
0321: return result;
0322: }
0323:
0324: /**
0325: * Returns true if both BreakIterators are of the same class, have the same
0326: * rules, and iterate over the same text.
0327: */
0328: public boolean equals(Object that) {
0329: try {
0330: RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
0331: if (!description.equals(other.description)) {
0332: return false;
0333: }
0334: if (text == null) {
0335: return other.text == null;
0336: } else {
0337: return text.equals(other.text);
0338: }
0339: } catch (ClassCastException e) {
0340: return false;
0341: }
0342: }
0343:
0344: /**
0345: * Returns the description used to create this iterator
0346: */
0347: public String toString() {
0348: return description;
0349: }
0350:
0351: /**
0352: * Compute a hashcode for this BreakIterator
0353: * @return A hash code
0354: */
0355: public int hashCode() {
0356: return description.hashCode();
0357: }
0358:
0359: //=======================================================================
0360: // BreakIterator overrides
0361: //=======================================================================
0362:
0363: /**
0364: * Sets the current iteration position to the beginning of the text.
0365: * (i.e., the CharacterIterator's starting offset).
0366: * @return The offset of the beginning of the text.
0367: */
0368: public int first() {
0369: CharacterIterator t = getText();
0370:
0371: t.first();
0372: return t.getIndex();
0373: }
0374:
0375: /**
0376: * Sets the current iteration position to the end of the text.
0377: * (i.e., the CharacterIterator's ending offset).
0378: * @return The text's past-the-end offset.
0379: */
0380: public int last() {
0381: CharacterIterator t = getText();
0382:
0383: // t.last() returns the offset of the last character,
0384: // rather than the past-the-end offset
0385: t.setIndex(t.getEndIndex());
0386: return t.getIndex();
0387: }
0388:
0389: /**
0390: * Advances the iterator either forward or backward the specified number of steps.
0391: * Negative values move backward, and positive values move forward. This is
0392: * equivalent to repeatedly calling next() or previous().
0393: * @param n The number of steps to move. The sign indicates the direction
0394: * (negative is backwards, and positive is forwards).
0395: * @return The character offset of the boundary position n boundaries away from
0396: * the current one.
0397: */
0398: public int next(int n) {
0399: int result = current();
0400: while (n > 0) {
0401: result = handleNext();
0402: --n;
0403: }
0404: while (n < 0) {
0405: result = previous();
0406: ++n;
0407: }
0408: return result;
0409: }
0410:
0411: /**
0412: * Advances the iterator to the next boundary position.
0413: * @return The position of the first boundary after this one.
0414: */
0415: public int next() {
0416: return handleNext();
0417: }
0418:
0419: /**
0420: * Advances the iterator backwards, to the last boundary preceding this one.
0421: * @return The position of the last boundary position preceding this one.
0422: */
0423: public int previous() {
0424: // if we're already sitting at the beginning of the text, return DONE
0425: CharacterIterator text = getText();
0426: if (current() == text.getBeginIndex()) {
0427: return BreakIterator.DONE;
0428: }
0429:
0430: // set things up. handlePrevious() will back us up to some valid
0431: // break position before the current position (we back our internal
0432: // iterator up one step to prevent handlePrevious() from returning
0433: // the current position), but not necessarily the last one before
0434: // where we started
0435: int start = current();
0436: text.previous();
0437: int lastResult = handlePrevious();
0438: int result = lastResult;
0439:
0440: // iterate forward from the known break position until we pass our
0441: // starting point. The last break position before the starting
0442: // point is our return value
0443: while (result != BreakIterator.DONE && result < start) {
0444: lastResult = result;
0445: result = handleNext();
0446: }
0447:
0448: // set the current iteration position to be the last break position
0449: // before where we started, and then return that value
0450: text.setIndex(lastResult);
0451: return lastResult;
0452: }
0453:
0454: /**
0455: * Throw IllegalArgumentException unless begin <= offset < end.
0456: */
0457: protected static final void checkOffset(int offset,
0458: CharacterIterator text) {
0459: if (offset < text.getBeginIndex()
0460: || offset >= text.getEndIndex()) {
0461: throw new IllegalArgumentException("offset out of bounds");
0462: }
0463: }
0464:
0465: /**
0466: * Sets the iterator to refer to the first boundary position following
0467: * the specified position.
0468: * @offset The position from which to begin searching for a break position.
0469: * @return The position of the first break after the current position.
0470: */
0471: public int following(int offset) {
0472:
0473: CharacterIterator text = getText();
0474: checkOffset(offset, text);
0475:
0476: // Set our internal iteration position (temporarily)
0477: // to the position passed in. If this is the _beginning_ position,
0478: // then we can just use next() to get our return value
0479: text.setIndex(offset);
0480: if (offset == text.getBeginIndex()) {
0481: return handleNext();
0482: }
0483:
0484: // otherwise, we have to sync up first. Use handlePrevious() to back
0485: // us up to a known break position before the specified position (if
0486: // we can determine that the specified position is a break position,
0487: // we don't back up at all). This may or may not be the last break
0488: // position at or before our starting position. Advance forward
0489: // from here until we've passed the starting position. The position
0490: // we stop on will be the first break position after the specified one.
0491: int result = handlePrevious();
0492: while (result != BreakIterator.DONE && result <= offset) {
0493: result = handleNext();
0494: }
0495: return result;
0496: }
0497:
0498: /**
0499: * Sets the iterator to refer to the last boundary position before the
0500: * specified position.
0501: * @offset The position to begin searching for a break from.
0502: * @return The position of the last boundary before the starting position.
0503: */
0504: public int preceding(int offset) {
0505: // if we start by updating the current iteration position to the
0506: // position specified by the caller, we can just use previous()
0507: // to carry out this operation
0508: CharacterIterator text = getText();
0509: checkOffset(offset, text);
0510: text.setIndex(offset);
0511: return previous();
0512: }
0513:
0514: /**
0515: * Returns true if the specfied position is a boundary position. As a side
0516: * effect, leaves the iterator pointing to the first boundary position at
0517: * or after "offset".
0518: * @param offset the offset to check.
0519: * @return True if "offset" is a boundary position.
0520: */
0521: public boolean isBoundary(int offset) {
0522: CharacterIterator text = getText();
0523: checkOffset(offset, text);
0524: if (offset == text.getBeginIndex()) {
0525: return true;
0526: }
0527:
0528: // to check whether this is a boundary, we can use following() on the
0529: // position before the specified one and return true if the position we
0530: // get back is the one the user specified
0531: else {
0532: return following(offset - 1) == offset;
0533: }
0534: }
0535:
0536: /**
0537: * Returns the current iteration position.
0538: * @return The current iteration position.
0539: */
0540: public int current() {
0541: return getText().getIndex();
0542: }
0543:
0544: /**
0545: * Return a CharacterIterator over the text being analyzed. This version
0546: * of this method returns the actual CharacterIterator we're using internally.
0547: * Changing the state of this iterator can have undefined consequences. If
0548: * you need to change it, clone it first.
0549: * @return An iterator over the text being analyzed.
0550: */
0551: public CharacterIterator getText() {
0552: // The iterator is initialized pointing to no text at all, so if this
0553: // function is called while we're in that state, we have to fudge an
0554: // an iterator to return.
0555: if (text == null) {
0556: text = new StringCharacterIterator("");
0557: }
0558: return text;
0559: }
0560:
0561: /**
0562: * Set the iterator to analyze a new piece of text. This function resets
0563: * the current iteration position to the beginning of the text.
0564: * @param newText An iterator over the text to analyze.
0565: */
0566: public void setText(CharacterIterator newText) {
0567: // Test iterator to see if we need to wrap it in a SafeCharIterator.
0568: // The correct behavior for CharacterIterators is to allow the
0569: // position to be set to the endpoint of the iterator. Many
0570: // CharacterIterators do not uphold this, so this is a workaround
0571: // to permit them to use this class.
0572: int end = newText.getEndIndex();
0573: boolean goodIterator;
0574: try {
0575: newText.setIndex(end); // some buggy iterators throw an exception here
0576: goodIterator = newText.getIndex() == end;
0577: } catch (IllegalArgumentException e) {
0578: goodIterator = false;
0579: }
0580:
0581: if (goodIterator) {
0582: text = newText;
0583: } else {
0584: text = new SafeCharIterator(newText);
0585: }
0586: text.first();
0587: }
0588:
0589: //=======================================================================
0590: // implementation
0591: //=======================================================================
0592:
0593: /**
0594: * This method is the actual implementation of the next() method. All iteration
0595: * vectors through here. This method initializes the state machine to state 1
0596: * and advances through the text character by character until we reach the end
0597: * of the text or the state machine transitions to state 0. We update our return
0598: * value every time the state machine passes through a possible end state.
0599: */
0600: protected int handleNext() {
0601: // if we're already at the end of the text, return DONE.
0602: CharacterIterator text = getText();
0603: if (text.getIndex() == text.getEndIndex()) {
0604: return BreakIterator.DONE;
0605: }
0606:
0607: // no matter what, we always advance at least one character forward
0608: int result = text.getIndex() + 1;
0609: int lookaheadResult = 0;
0610:
0611: // begin in state 1
0612: int state = START_STATE;
0613: int category;
0614: char c = text.current();
0615:
0616: // loop until we reach the end of the text or transition to state 0
0617: while (c != CharacterIterator.DONE && state != STOP_STATE) {
0618:
0619: // look up the current character's character category (which tells us
0620: // which column in the state table to look at)
0621: category = lookupCategory(c);
0622:
0623: // if the character isn't an ignore character, look up a state
0624: // transition in the state table
0625: if (category != IGNORE) {
0626: state = lookupState(state, category);
0627: }
0628:
0629: // if the state we've just transitioned to is a lookahead state,
0630: // (but not also an end state), save its position. If it's
0631: // both a lookahead state and an end state, update the break position
0632: // to the last saved lookup-state position
0633: if (lookaheadStates[state]) {
0634: if (endStates[state]) {
0635: result = lookaheadResult;
0636: } else {
0637: lookaheadResult = text.getIndex() + 1;
0638: }
0639: }
0640:
0641: // otherwise, if the state we've just transitioned to is an accepting
0642: // state, update the break position to be the current iteration position
0643: else {
0644: if (endStates[state]) {
0645: result = text.getIndex() + 1;
0646: }
0647: }
0648:
0649: c = text.next();
0650: }
0651:
0652: // if we've run off the end of the text, and the very last character took us into
0653: // a lookahead state, advance the break position to the lookahead position
0654: // (the theory here is that if there are no characters at all after the lookahead
0655: // position, that always matches the lookahead criteria)
0656: if (c == CharacterIterator.DONE
0657: && lookaheadResult == text.getEndIndex()) {
0658: result = lookaheadResult;
0659: }
0660:
0661: text.setIndex(result);
0662: return result;
0663: }
0664:
0665: /**
0666: * This method backs the iterator back up to a "safe position" in the text.
0667: * This is a position that we know, without any context, must be a break position.
0668: * The various calling methods then iterate forward from this safe position to
0669: * the appropriate position to return. (For more information, see the description
0670: * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
0671: */
0672: protected int handlePrevious() {
0673: CharacterIterator text = getText();
0674: int state = START_STATE;
0675: int category = 0;
0676: int lastCategory = 0;
0677: char c = text.current();
0678:
0679: // loop until we reach the beginning of the text or transition to state 0
0680: while (c != CharacterIterator.DONE && state != STOP_STATE) {
0681:
0682: // save the last character's category and look up the current
0683: // character's category
0684: lastCategory = category;
0685: category = lookupCategory(c);
0686:
0687: // if the current character isn't an ignore character, look up a
0688: // state transition in the backwards state table
0689: if (category != IGNORE) {
0690: state = lookupBackwardState(state, category);
0691: }
0692:
0693: // then advance one character backwards
0694: c = text.previous();
0695: }
0696:
0697: // if we didn't march off the beginning of the text, we're either one or two
0698: // positions away from the real break position. (One because of the call to
0699: // previous() at the end of the loop above, and another because the character
0700: // that takes us into the stop state will always be the character BEFORE
0701: // the break position.)
0702: if (c != CharacterIterator.DONE) {
0703: if (lastCategory != IGNORE) {
0704: text.setIndex(text.getIndex() + 2);
0705: } else {
0706: text.next();
0707: }
0708: }
0709: return text.getIndex();
0710: }
0711:
0712: /**
0713: * Looks up a character's category (i.e., its category for breaking purposes,
0714: * not its Unicode category)
0715: */
0716: protected int lookupCategory(char c) {
0717: return charCategoryTable.elementAt(c);
0718: }
0719:
0720: /**
0721: * Given a current state and a character category, looks up the
0722: * next state to transition to in the state table.
0723: */
0724: protected int lookupState(int state, int category) {
0725: return stateTable[state * numCategories + category];
0726: }
0727:
0728: /**
0729: * Given a current state and a character category, looks up the
0730: * next state to transition to in the backwards state table.
0731: */
0732: protected int lookupBackwardState(int state, int category) {
0733: return backwardsStateTable[state * numCategories + category];
0734: }
0735:
0736: //=======================================================================
0737: // RuleBasedBreakIterator.Builder
0738: //=======================================================================
0739: /**
0740: * The Builder class has the job of constructing a RuleBasedBreakIterator from a
0741: * textual description. A Builder is constructed by RuleBasedBreakIterator's
0742: * constructor, which uses it to construct the iterator itself and then throws it
0743: * away.
0744: * <p>The construction logic is separated out into its own class for two primary
0745: * reasons:
0746: * <ul><li>The construction logic is quite sophisticated and large. Separating it
0747: * out into its own class means the code must only be loaded into memory while a
0748: * RuleBasedBreakIterator is being constructed, and can be purged after that.
0749: * <li>There is a fair amount of state that must be maintained throughout the
0750: * construction process that is not needed by the iterator after construction.
0751: * Separating this state out into another class prevents all of the functions that
0752: * construct the iterator from having to have really long parameter lists,
0753: * (hopefully) contributing to readability and maintainability.</ul>
0754: * <p>It'd be really nice if this could be an independent class rather than an
0755: * inner class, because that would shorten the source file considerably, but
0756: * making Builder an inner class of RuleBasedBreakIterator allows it direct access
0757: * to RuleBasedBreakIterator's private members, which saves us from having to
0758: * provide some kind of "back door" to the Builder class that could then also be
0759: * used by other classes.
0760: */
0761: protected class Builder {
0762: /**
0763: * A temporary holding place used for calculating the character categories.
0764: * This object contains CharSet objects.
0765: */
0766: protected Vector categories = null;
0767:
0768: /**
0769: * A table used to map parts of regexp text to lists of character categories,
0770: * rather than having to figure them out from scratch each time
0771: */
0772: protected Hashtable expressions = null;
0773:
0774: /**
0775: * A temporary holding place for the list of ignore characters
0776: */
0777: protected CharSet ignoreChars = null;
0778:
0779: /**
0780: * A temporary holding place where the forward state table is built
0781: */
0782: protected Vector tempStateTable = null;
0783:
0784: /**
0785: * A list of all the states that have to be filled in with transitions to the
0786: * next state that is created. Used when building the state table from the
0787: * regular expressions.
0788: */
0789: protected Vector decisionPointList = null;
0790:
0791: /**
0792: * A stack for holding decision point lists. This is used to handle nested
0793: * parentheses and braces in regexps.
0794: */
0795: protected Stack decisionPointStack = null;
0796:
0797: /**
0798: * A list of states that loop back on themselves. Used to handle .*?
0799: */
0800: protected Vector loopingStates = null;
0801:
0802: /**
0803: * Looping states actually have to be backfilled later in the process
0804: * than everything else. This is where a the list of states to backfill
0805: * is accumulated. This is also used to handle .*?
0806: */
0807: protected Vector statesToBackfill = null;
0808:
0809: /**
0810: * A list mapping pairs of state numbers for states that are to be combined
0811: * to the state number of the state representing their combination. Used
0812: * in the process of making the state table deterministic to prevent
0813: * infinite recursion.
0814: */
0815: protected Vector mergeList = null;
0816:
0817: /**
0818: * A flag that is used to indicate when the list of looping states can
0819: * be reset.
0820: */
0821: protected boolean clearLoopingStates = false;
0822:
0823: /**
0824: * A bit mask used to indicate a bit in the table's flags column that marks a
0825: * state as an accepting state.
0826: */
0827: protected static final int END_STATE_FLAG = 0x8000;
0828:
0829: /**
0830: * A bit mask used to indicate a bit in the table's flags column that marks a
0831: * state as one the builder shouldn't loop to any looping states
0832: */
0833: protected static final int DONT_LOOP_FLAG = 0x4000;
0834:
0835: /**
0836: * A bit mask used to indicate a bit in the table's flags column that marks a
0837: * state as a lookahead state.
0838: */
0839: protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;
0840:
0841: /**
0842: * A bit mask representing the union of the mask values listed above.
0843: * Used for clearing or masking off the flag bits.
0844: */
0845: protected static final int ALL_FLAGS = END_STATE_FLAG
0846: | LOOKAHEAD_STATE_FLAG | DONT_LOOP_FLAG;
0847:
0848: /**
0849: * No special construction is required for the Builder.
0850: */
0851: public Builder() {
0852: }
0853:
0854: /**
0855: * This is the main function for setting up the BreakIterator's tables. It
0856: * just vectors different parts of the job off to other functions.
0857: */
0858: public void buildBreakIterator() {
0859: Vector tempRuleList = buildRuleList(description);
0860: buildCharCategories(tempRuleList);
0861: buildStateTable(tempRuleList);
0862: buildBackwardsStateTable(tempRuleList);
0863: }
0864:
0865: /**
0866: * Thus function has three main purposes:
0867: * <ul><li>Perform general syntax checking on the description, so the rest of the
0868: * build code can assume that it's parsing a legal description.
0869: * <li>Split the description into separate rules
0870: * <li>Perform variable-name substitutions (so that no one else sees variable names)
0871: * </ul>
0872: */
0873: private Vector buildRuleList(String description) {
0874: // invariants:
0875: // - parentheses must be balanced: ()[]{}<>
0876: // - nothing can be nested inside <>
0877: // - nothing can be nested inside [] except more []s
0878: // - pairs of ()[]{}<> must not be empty
0879: // - ; can only occur at the outer level
0880: // - | can only appear inside ()
0881: // - only one = or / can occur in a single rule
0882: // - = and / cannot both occur in the same rule
0883: // - <> can only occur on the left side of a = expression
0884: // (because we'll perform substitutions to eliminate them other places)
0885: // - the left-hand side of a = expression can only be a single character
0886: // (possibly with \) or text inside <>
0887: // - the right-hand side of a = expression must be enclosed in [] or ()
0888: // - * may not occur at the beginning of a rule, nor may it follow
0889: // =, /, (, (, |, }, ;, or *
0890: // - ? may only follow *
0891: // - the rule list must contain at least one / rule
0892: // - no rule may be empty
0893: // - all printing characters in the ASCII range except letters and digits
0894: // are reserved and must be preceded by \
0895: // - ! may only occur at the beginning of a rule
0896:
0897: // set up a vector to contain the broken-up description (each entry in the
0898: // vector is a separate rule) and a stack for keeping track of opening
0899: // punctuation
0900: Vector tempRuleList = new Vector();
0901: Stack parenStack = new Stack();
0902:
0903: int p = 0;
0904: int ruleStart = 0;
0905: char c = '\u0000';
0906: char lastC = '\u0000';
0907: char lastOpen = '\u0000';
0908: boolean haveEquals = false;
0909: boolean havePipe = false;
0910: boolean sawVarName = false;
0911: final String charsThatCantPrecedeAsterisk = "=/{(|}*;\u0000";
0912:
0913: // if the description doesn't end with a semicolon, tack a semicolon onto the end
0914: if (description.length() != 0
0915: && description.charAt(description.length() - 1) != ';') {
0916: description = description + ";";
0917: }
0918:
0919: // for each character, do...
0920: while (p < description.length()) {
0921: c = description.charAt(p);
0922: switch (c) {
0923: // if the character is a backslash, skip the character that follows it
0924: // (it'll get treated as a literal character)
0925: case '\\':
0926: ++p;
0927: break;
0928:
0929: // if the character is opening punctuation, verify that no nesting
0930: // rules are broken, and push the character onto the stack
0931: case '{':
0932: case '<':
0933: case '[':
0934: case '(':
0935: if (lastOpen == '<') {
0936: error("Can't nest brackets inside <>", p,
0937: description);
0938: }
0939: if (lastOpen == '[' && c != '[') {
0940: error("Can't nest anything in [] but []", p,
0941: description);
0942: }
0943:
0944: // if we see < anywhere except on the left-hand side of =,
0945: // we must be seeing a variable name that was never defined
0946: if (c == '<' && (haveEquals || havePipe)) {
0947: error("Unknown variable name", p, description);
0948: }
0949:
0950: lastOpen = c;
0951: parenStack.push(new Character(c));
0952: if (c == '<') {
0953: sawVarName = true;
0954: }
0955: break;
0956:
0957: // if the character is closing punctuation, verify that it matches the
0958: // last opening punctuation we saw, and that the brackets contain
0959: // something, then pop the stack
0960: case '}':
0961: case '>':
0962: case ']':
0963: case ')':
0964: char expectedClose = '\u0000';
0965: switch (lastOpen) {
0966: case '{':
0967: expectedClose = '}';
0968: break;
0969: case '[':
0970: expectedClose = ']';
0971: break;
0972: case '(':
0973: expectedClose = ')';
0974: break;
0975: case '<':
0976: expectedClose = '>';
0977: break;
0978: }
0979: if (c != expectedClose) {
0980: error("Unbalanced parentheses", p, description);
0981: }
0982: if (lastC == lastOpen) {
0983: error("Parens don't contain anything", p,
0984: description);
0985: }
0986: parenStack.pop();
0987: if (!parenStack.empty()) {
0988: lastOpen = ((Character) (parenStack.peek()))
0989: .charValue();
0990: } else {
0991: lastOpen = '\u0000';
0992: }
0993:
0994: break;
0995:
0996: // if the character is an asterisk, make sure it occurs in a place
0997: // where an asterisk can legally go
0998: case '*':
0999: if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1) {
1000: error("Misplaced asterisk", p, description);
1001: }
1002: break;
1003:
1004: // if the character is a question mark, make sure it follows an asterisk
1005: case '?':
1006: if (lastC != '*') {
1007: error("Misplaced ?", p, description);
1008: }
1009: break;
1010:
1011: // if the character is an equals sign, make sure we haven't seen another
1012: // equals sign or a slash yet
1013: case '=':
1014: if (haveEquals || havePipe) {
1015: error("More than one = or / in rule", p,
1016: description);
1017: }
1018: haveEquals = true;
1019: break;
1020:
1021: // if the character is a slash, make sure we haven't seen another slash
1022: // or an equals sign yet
1023: case '/':
1024: if (haveEquals || havePipe) {
1025: error("More than one = or / in rule", p,
1026: description);
1027: }
1028: if (sawVarName) {
1029: error("Unknown variable name", p, description);
1030: }
1031: havePipe = true;
1032: break;
1033:
1034: // if the character is an exclamation point, make sure it occurs only
1035: // at the beginning of a rule
1036: case '!':
1037: if (lastC != ';' && lastC != '\u0000') {
1038: error(
1039: "! can only occur at the beginning of a rule",
1040: p, description);
1041: }
1042: break;
1043:
1044: // we don't have to do anything special on a period
1045: case '.':
1046: break;
1047:
1048: // if the character is a syntax character that can only occur
1049: // inside [], make sure that it does in fact only occur inside [].
1050: case '^':
1051: case '-':
1052: case ':':
1053: if (lastOpen != '[' && lastOpen != '<') {
1054: error("Illegal character", p, description);
1055: }
1056: break;
1057:
1058: // if the character is a semicolon, do the following...
1059: case ';':
1060: // make sure the rule contains something and that there are no
1061: // unbalanced parentheses or brackets
1062: if (lastC == ';' || lastC == '\u0000') {
1063: error("Empty rule", p, description);
1064: }
1065: if (!parenStack.empty()) {
1066: error("Unbalanced parenheses", p, description);
1067: }
1068:
1069: if (parenStack.empty()) {
1070: // if the rule contained an = sign, call processSubstitution()
1071: // to replace the substitution name with the substitution text
1072: // wherever it appears in the description
1073: if (haveEquals) {
1074: description = processSubstitution(
1075: description.substring(ruleStart, p),
1076: description, p + 1);
1077: } else {
1078: // otherwise, check to make sure the rule doesn't reference
1079: // any undefined substitutions
1080: if (sawVarName) {
1081: error("Unknown variable name", p,
1082: description);
1083: }
1084:
1085: // then add it to tempRuleList
1086: tempRuleList.addElement(description
1087: .substring(ruleStart, p));
1088: }
1089:
1090: // and reset everything to process the next rule
1091: ruleStart = p + 1;
1092: haveEquals = havePipe = sawVarName = false;
1093: }
1094: break;
1095:
1096: // if the character is a vertical bar, check to make sure that it
1097: // occurs inside a () expression and that the character that precedes
1098: // it isn't also a vertical bar
1099: case '|':
1100: if (lastC == '|') {
1101: error("Empty alternative", p, description);
1102: }
1103: if (parenStack.empty() || lastOpen != '(') {
1104: error("Misplaced |", p, description);
1105: }
1106: break;
1107:
1108: // if the character is anything else (escaped characters are
1109: // skipped and don't make it here), it's an error
1110: default:
1111: if (c >= ' ' && c < '\u007f'
1112: && !Character.isLetter(c)
1113: && !Character.isDigit(c)) {
1114: error("Illegal character", p, description);
1115: }
1116: break;
1117: }
1118: lastC = c;
1119: ++p;
1120: }
1121: if (tempRuleList.size() == 0) {
1122: error("No valid rules in description", p, description);
1123: }
1124: return tempRuleList;
1125: }
1126:
1127: /**
1128: * This function performs variable-name substitutions. First it does syntax
1129: * checking on the variable-name definition. If it's syntactically valid, it
1130: * then goes through the remainder of the description and does a simple
1131: * find-and-replace of the variable name with its text. (The variable text
1132: * must be enclosed in either [] or () for this to work.)
1133: */
1134: protected String processSubstitution(String substitutionRule,
1135: String description, int startPos) {
1136: // isolate out the text on either side of the equals sign
1137: String replace;
1138: String replaceWith;
1139: int equalPos = substitutionRule.indexOf('=');
1140: replace = substitutionRule.substring(0, equalPos);
1141: replaceWith = substitutionRule.substring(equalPos + 1);
1142:
1143: // check to see whether the substitution name is something we've declared
1144: // to be "special". For RuleBasedBreakIterator itself, this is "<ignore>".
1145: // This function takes care of any extra processing that has to be done
1146: // with "special" substitution names.
1147: handleSpecialSubstitution(replace, replaceWith, startPos,
1148: description);
1149:
1150: // perform various other syntax checks on the rule
1151: if (replaceWith.length() == 0) {
1152: error("Nothing on right-hand side of =", startPos,
1153: description);
1154: }
1155: if (replace.length() == 0) {
1156: error("Nothing on left-hand side of =", startPos,
1157: description);
1158: }
1159: if (replace.length() == 2 && replace.charAt(0) != '\\') {
1160: error("Illegal left-hand side for =", startPos,
1161: description);
1162: }
1163: if (replace.length() >= 3 && replace.charAt(0) != '<'
1164: && replace.charAt(equalPos - 1) != '>') {
1165: error("Illegal left-hand side for =", startPos,
1166: description);
1167: }
1168: if (!(replaceWith.charAt(0) == '[' && replaceWith
1169: .charAt(replaceWith.length() - 1) == ']')
1170: && !(replaceWith.charAt(0) == '(' && replaceWith
1171: .charAt(replaceWith.length() - 1) == ')')) {
1172: error("Illegal right-hand side for =", startPos,
1173: description);
1174: }
1175:
1176: // now go through the rest of the description (which hasn't been broken up
1177: // into separate rules yet) and replace every occurrence of the
1178: // substitution name with the substitution body
1179: StringBuffer result = new StringBuffer();
1180: result.append(description.substring(0, startPos));
1181: int lastPos = startPos;
1182: int pos = description.indexOf(replace, startPos);
1183: while (pos != -1) {
1184: result.append(description.substring(lastPos, pos));
1185: result.append(replaceWith);
1186: lastPos = pos + replace.length();
1187: pos = description.indexOf(replace, lastPos);
1188: }
1189: result.append(description.substring(lastPos));
1190: return result.toString();
1191: }
1192:
1193: /**
1194: * This function defines a protocol for handling substitution names that
1195: * are "special," i.e., that have some property beyond just being
1196: * substitutions. At the RuleBasedBreakIterator level, we have one
1197: * special substitution name, "<ignore>". Subclasses can override this
1198: * function to add more. Any special processing that has to go on beyond
1199: * that which is done by the normal substitution-processing code is done
1200: * here.
1201: */
1202: protected void handleSpecialSubstitution(String replace,
1203: String replaceWith, int startPos, String description) {
1204: // if we get a definition for a substitution called "ignore", it defines
1205: // the ignore characters for the iterator. Check to make sure the expression
1206: // is a [] expression, and if it is, parse it and store the characters off
1207: // to the side.
1208: if (replace.equals("<ignore>")) {
1209: if (replaceWith.charAt(0) == '(') {
1210: error("Ignore group can't be enclosed in (",
1211: startPos, description);
1212: }
1213: ignoreChars = CharSet.parseString(replaceWith);
1214: }
1215: }
1216:
1217: /**
1218: * This function builds the character category table. On entry,
1219: * tempRuleList is a vector of break rules that has had variable names substituted.
1220: * On exit, the charCategoryTable data member has been initialized to hold the
1221: * character category table, and tempRuleList's rules have been munged to contain
1222: * character category numbers everywhere a literal character or a [] expression
1223: * originally occurred.
1224: */
1225: protected void buildCharCategories(Vector tempRuleList) {
1226: int bracketLevel = 0;
1227: int p = 0;
1228: int lineNum = 0;
1229:
1230: // build hash table of every literal character or [] expression in the rule list
1231: // and use CharSet.parseString() to derive a CharSet object representing the
1232: // characters each refers to
1233: expressions = new Hashtable();
1234: while (lineNum < tempRuleList.size()) {
1235: String line = (String) (tempRuleList.elementAt(lineNum));
1236: p = 0;
1237: while (p < line.length()) {
1238: char c = line.charAt(p);
1239: switch (c) {
1240: // skip over all syntax characters except [
1241: case '{':
1242: case '}':
1243: case '(':
1244: case ')':
1245: case '*':
1246: case '.':
1247: case '/':
1248: case '|':
1249: case ';':
1250: case '?':
1251: case '!':
1252: break;
1253:
1254: // for [, find the matching ] (taking nested [] pairs into account)
1255: // and add the whole expression to the expression list
1256: case '[':
1257: int q = p + 1;
1258: ++bracketLevel;
1259: while (q < line.length() && bracketLevel != 0) {
1260: c = line.charAt(q);
1261: switch (c) {
1262: case '\\':
1263: q++;
1264: break;
1265: case '[':
1266: ++bracketLevel;
1267: break;
1268: case ']':
1269: --bracketLevel;
1270: break;
1271: }
1272: ++q;
1273: }
1274: if (expressions.get(line.substring(p, q)) == null) {
1275: expressions.put(line.substring(p, q),
1276: CharSet.parseString(line.substring(
1277: p, q)));
1278: }
1279: p = q - 1;
1280: break;
1281:
1282: // for \ sequences, just move to the next character and treat
1283: // it as a single character
1284: case '\\':
1285: ++p;
1286: c = line.charAt(p);
1287: // DON'T break; fall through into "default" clause
1288:
1289: // for an isolated single character, add it to the expression list
1290: default:
1291: expressions.put(line.substring(p, p + 1),
1292: CharSet.parseString(line.substring(p,
1293: p + 1)));
1294: break;
1295: }
1296: ++p;
1297: }
1298: ++lineNum;
1299: }
1300: // dump CharSet's internal expression cache
1301: CharSet.releaseExpressionCache();
1302:
1303: // create the temporary category table (which is a vector of CharSet objects)
1304: categories = new Vector();
1305: if (ignoreChars != null) {
1306: categories.addElement(ignoreChars);
1307: } else {
1308: categories.addElement(new CharSet());
1309: }
1310: ignoreChars = null;
1311:
1312: // this is a hook to allow subclasses to add categories on their own
1313: mungeExpressionList(expressions);
1314:
1315: // Derive the character categories. Go through the existing character categories
1316: // looking for overlap. Any time there's overlap, we create a new character
1317: // category for the characters that overlapped and remove them from their original
1318: // category. At the end, any characters that are left in the expression haven't
1319: // been mentioned in any category, so another new category is created for them.
1320: // For example, if the first expression is [abc], then a, b, and c will be placed
1321: // into a single character category. If the next expression is [bcd], we will first
1322: // remove b and c from their existing category (leaving a behind), create a new
1323: // category for b and c, and then create another new category for d (which hadn't
1324: // been mentioned in the previous expression).
1325: // At no time should a character ever occur in more than one character category.
1326:
1327: // for each expression in the expressions list, do...
1328: Enumeration iter = expressions.elements();
1329: while (iter.hasMoreElements()) {
1330: // initialize the working char set to the chars in the current expression
1331: CharSet e = (CharSet) iter.nextElement();
1332:
1333: // for each category in the category list, do...
1334: for (int j = categories.size() - 1; !e.empty() && j > 0; j--) {
1335:
1336: // if there's overlap between the current working set of chars
1337: // and the current category...
1338: CharSet that = (CharSet) (categories.elementAt(j));
1339: if (!that.intersection(e).empty()) {
1340:
1341: // add a new category for the characters that were in the
1342: // current category but not in the working char set
1343: CharSet temp = that.difference(e);
1344: if (!temp.empty()) {
1345: categories.addElement(temp);
1346: }
1347:
1348: // remove those characters from the working char set and replace
1349: // the current category with the characters that it did
1350: // have in common with the current working char set
1351: temp = e.intersection(that);
1352: e = e.difference(that);
1353: if (!temp.equals(that)) {
1354: categories.setElementAt(temp, j);
1355: }
1356: }
1357: }
1358:
1359: // if there are still characters left in the working char set,
1360: // add a new category containing them
1361: if (!e.empty()) {
1362: categories.addElement(e);
1363: }
1364: }
1365:
1366: // we have the ignore characters stored in position 0. Make an extra pass through
1367: // the character category list and remove anything from the ignore list that shows
1368: // up in some other category
1369: CharSet allChars = new CharSet();
1370: for (int i = 1; i < categories.size(); i++)
1371: allChars = allChars.union((CharSet) (categories
1372: .elementAt(i)));
1373: CharSet ignoreChars = (CharSet) (categories.elementAt(0));
1374: ignoreChars = ignoreChars.difference(allChars);
1375: categories.setElementAt(ignoreChars, 0);
1376:
1377: // now that we've derived the character categories, go back through the expression
1378: // list and replace each CharSet object with a String that represents the
1379: // character categories that expression refers to. The String is encoded: each
1380: // character is a character category number (plus 0x100 to avoid confusing them
1381: // with syntax characters in the rule grammar)
1382: iter = expressions.keys();
1383: while (iter.hasMoreElements()) {
1384: String key = (String) iter.nextElement();
1385: CharSet cs = (CharSet) expressions.get(key);
1386: StringBuffer cats = new StringBuffer();
1387:
1388: // for each category...
1389: for (int j = 0; j < categories.size(); j++) {
1390:
1391: // if the current expression contains characters in that category...
1392: CharSet temp = cs
1393: .intersection((CharSet) (categories
1394: .elementAt(j)));
1395: if (!temp.empty()) {
1396:
1397: // then add the encoded category number to the String for this
1398: // expression
1399: cats.append((char) (0x100 + j));
1400: if (temp.equals(cs)) {
1401: break;
1402: }
1403: }
1404: }
1405:
1406: // once we've finished building the encoded String for this expression,
1407: // replace the CharSet object with it
1408: expressions.put(key, cats.toString());
1409: }
1410:
1411: // and finally, we turn the temporary category table into a permanent category
1412: // table, which is a CompactByteArray. (we skip category 0, which by definition
1413: // refers to all characters not mentioned specifically in the rules)
1414: charCategoryTable = new CompactByteArray((byte) 0);
1415:
1416: // for each category...
1417: for (int i = 0; i < categories.size(); i++) {
1418: CharSet chars = (CharSet) (categories.elementAt(i));
1419:
1420: // go through the character ranges in the category one by one...
1421: Enumeration enum_ = chars.getChars();
1422: while (enum_.hasMoreElements()) {
1423: char[] range = (char[]) (enum_.nextElement());
1424:
1425: // and set the corresponding elements in the CompactArray accordingly
1426: if (i != 0) {
1427: charCategoryTable.setElementAt(range[0],
1428: range[1], (byte) i);
1429: }
1430:
1431: // (category 0 is special-- it's the hiding place for the ignore
1432: // characters, whose real category number in the CompactArray is
1433: // -1 [this is because category 0 contains all characters not
1434: // specifically mentioned anywhere in the rules] )
1435: else {
1436: charCategoryTable.setElementAt(range[0],
1437: range[1], IGNORE);
1438: }
1439: }
1440: }
1441:
1442: // once we've populated the CompactArray, compact it
1443: charCategoryTable.compact();
1444:
1445: // initialize numCategories
1446: numCategories = categories.size();
1447: }
1448:
1449: protected void mungeExpressionList(Hashtable expressions) {
1450: // empty in the parent class. This function provides a hook for subclasses
1451: // to mess with the character category table.
1452: }
1453:
1454: /**
1455: * This is the function that builds the forward state table. Most of the real
1456: * work is done in parseRule(), which is called once for each rule in the
1457: * description.
1458: */
1459: private void buildStateTable(Vector tempRuleList) {
1460: // initialize our temporary state table, and fill it with two states:
1461: // state 0 is a dummy state that allows state 1 to be the starting state
1462: // and 0 to represent "stop". State 1 is added here to seed things
1463: // before we start parsing
1464: tempStateTable = new Vector();
1465: tempStateTable.addElement(new short[numCategories + 1]);
1466: tempStateTable.addElement(new short[numCategories + 1]);
1467:
1468: // call parseRule() for every rule in the rule list (except those which
1469: // start with !, which are actually backwards-iteration rules)
1470: for (int i = 0; i < tempRuleList.size(); i++) {
1471: String rule = (String) tempRuleList.elementAt(i);
1472: if (rule.charAt(0) != '!') {
1473: parseRule(rule, true);
1474: }
1475: }
1476:
1477: // finally, use finishBuildingStateTable() to minimize the number of
1478: // states in the table and perform some other cleanup work
1479: finishBuildingStateTable(true);
1480: }
1481:
1482: /**
1483: * This is where most of the work really happens. This routine parses a single
1484: * rule in the rule description, adding and modifying states in the state
1485: * table according to the new expression. The state table is kept deterministic
1486: * throughout the whole operation, although some ugly postprocessing is needed
1487: * to handle the *? token.
1488: */
1489: private void parseRule(String rule, boolean forward) {
1490: // algorithm notes:
1491: // - The basic idea here is to read successive character-category groups
1492: // from the input string. For each group, you create a state and point
1493: // the appropriate entries in the previous state to it. This produces a
1494: // straight line from the start state to the end state. The {}, *, and (|)
1495: // idioms produce branches in this straight line. These branches (states
1496: // that can transition to more than one other state) are called "decision
1497: // points." A list of decision points is kept. This contains a list of
1498: // all states that can transition to the next state to be created. For a
1499: // straight line progression, the only thing in the decision-point list is
1500: // the current state. But if there's a branch, the decision-point list
1501: // will contain all of the beginning points of the branch when the next
1502: // state to be created represents the end point of the branch. A stack is
1503: // used to save decision point lists in the presence of nested parentheses
1504: // and the like. For example, when a { is encountered, the current decision
1505: // point list is saved on the stack and restored when the corresponding }
1506: // is encountered. This way, after the } is read, the decision point list
1507: // will contain both the state right before the } _and_ the state before
1508: // the whole {} expression. Both of these states can transition to the next
1509: // state after the {} expression.
1510: // - one complication arises when we have to stamp a transition value into
1511: // an array cell that already contains one. The updateStateTable() and
1512: // mergeStates() functions handle this case. Their basic approach is to
1513: // create a new state that combines the two states that conflict and point
1514: // at it when necessary. This happens recursively, so if the merged states
1515: // also conflict, they're resolved in the same way, and so on. There are
1516: // a number of tests aimed at preventing infinite recursion.
1517: // - another complication arises with repeating characters. It's somewhat
1518: // ambiguous whether the user wants a greedy or non-greedy match in these cases.
1519: // (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in
1520: // "abc" or the LONGEST sequence of letters ending in "abc". We've adopted
1521: // the *? to mean "shortest" and * by itself to mean "longest". (You get the
1522: // same result with both if there's no overlap between the repeating character
1523: // group and the group immediately following it.) Handling the *? token is
1524: // rather complicated and involves keeping track of whether a state needs to
1525: // be merged (as described above) or merely overwritten when you update one of
1526: // its cells, and copying the contents of a state that loops with a *? token
1527: // into some of the states that follow it after the rest of the table-building
1528: // process is complete ("backfilling").
1529: // implementation notes:
1530: // - This function assumes syntax checking has been performed on the input string
1531: // prior to its being passed in here. It assumes that parentheses are
1532: // balanced, all literal characters are enclosed in [] and turned into category
1533: // numbers, that there are no illegal characters or character sequences, and so
1534: // on. Violation of these invariants will lead to undefined behavior.
1535: // - It'd probably be better to use linked lists rather than Vector and Stack
1536: // to maintain the decision point list and stack. I went for simplicity in
1537: // this initial implementation. If performance is critical enough, we can go
1538: // back and fix this later.
1539: // -There are a number of important limitations on the *? token. It does not work
1540: // right when followed by a repeating character sequence (e.g., ".*?(abc)*")
1541: // (although it does work right when followed by a single repeating character).
1542: // It will not always work right when nested in parentheses or braces (although
1543: // sometimes it will). It also will not work right if the group of repeating
1544: // characters and the group of characters that follows overlap partially
1545: // (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for
1546: // describing breaking rules we know about, so we left them out for
1547: // expeditiousness.
1548: // - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"--
1549: // that is, if the string ends in "aaaabc", the break will go before the first
1550: // "a" rather than the last one. Both of these are limitations in the design
1551: // of RuleBasedBreakIterator and not limitations of the rule parser.
1552:
1553: int p = 0;
1554: int currentState = 1; // don't use state number 0; 0 means "stop"
1555: int lastState = currentState;
1556: String pendingChars = "";
1557:
1558: decisionPointStack = new Stack();
1559: decisionPointList = new Vector();
1560: loopingStates = new Vector();
1561: statesToBackfill = new Vector();
1562:
1563: short[] state;
1564: boolean sawEarlyBreak = false;
1565:
1566: // if we're adding rules to the backward state table, mark the initial state
1567: // as a looping state
1568: if (!forward) {
1569: loopingStates.addElement(new Integer(1));
1570: }
1571:
1572: // put the current state on the decision point list before we start
1573: decisionPointList.addElement(new Integer(currentState)); // we want currentState to
1574: // be 1 here...
1575: currentState = tempStateTable.size() - 1; // but after that, we want it to be
1576: // 1 less than the state number of the next state
1577: while (p < rule.length()) {
1578: char c = rule.charAt(p);
1579: clearLoopingStates = false;
1580:
1581: // this section handles literal characters, escaped characters (which are
1582: // effectively literal characters too), the . token, and [] expressions
1583: if (c == '[' || c == '\\' || Character.isLetter(c)
1584: || Character.isDigit(c) || c < ' ' || c == '.'
1585: || c >= '\u007f') {
1586:
1587: // if we're not on a period, isolate the expression and look up
1588: // the corresponding category list
1589: if (c != '.') {
1590: int q = p;
1591:
1592: // if we're on a backslash, the expression is the character
1593: // after the backslash
1594: if (c == '\\') {
1595: q = p + 2;
1596: ++p;
1597: }
1598:
1599: // if we're on an opening bracket, scan to the closing bracket
1600: // to isolate the expression
1601: else if (c == '[') {
1602: int bracketLevel = 1;
1603: while (bracketLevel > 0) {
1604: ++q;
1605: c = rule.charAt(q);
1606: if (c == '[') {
1607: ++bracketLevel;
1608: } else if (c == ']') {
1609: --bracketLevel;
1610: } else if (c == '\\') {
1611: ++q;
1612: }
1613: }
1614: ++q;
1615: }
1616:
1617: // otherwise, the expression is just the character itself
1618: else {
1619: q = p + 1;
1620: }
1621:
1622: // look up the category list for the expression and store it
1623: // in pendingChars
1624: pendingChars = (String) expressions.get(rule
1625: .substring(p, q));
1626:
1627: // advance the current position past the expression
1628: p = q - 1;
1629: }
1630:
1631: // if the character we're on is a period, we end up down here
1632: else {
1633: int rowNum = ((Integer) decisionPointList
1634: .lastElement()).intValue();
1635: state = (short[]) tempStateTable
1636: .elementAt(rowNum);
1637:
1638: // if the period is followed by an asterisk, then just set the current
1639: // state to loop back on itself
1640: if (p + 1 < rule.length()
1641: && rule.charAt(p + 1) == '*'
1642: && state[0] != 0) {
1643: decisionPointList.addElement(new Integer(
1644: state[0]));
1645: pendingChars = "";
1646: ++p;
1647: }
1648:
1649: // otherwise, fabricate a category list ("pendingChars") with
1650: // every category in it
1651: else {
1652: StringBuffer temp = new StringBuffer();
1653: for (int i = 0; i < numCategories; i++)
1654: temp.append((char) (i + 0x100));
1655: pendingChars = temp.toString();
1656: }
1657: }
1658:
1659: // we'll end up in here for all expressions except for .*, which is
1660: // special-cased above
1661: if (pendingChars.length() != 0) {
1662:
1663: // if the expression is followed by an asterisk, then push a copy
1664: // of the current desicion point list onto the stack (this is
1665: // the same thing we do on an opening brace)
1666: if (p + 1 < rule.length()
1667: && rule.charAt(p + 1) == '*') {
1668: decisionPointStack.push(decisionPointList
1669: .clone());
1670: }
1671:
1672: // create a new state, add it to the list of states to backfill
1673: // if we have looping states to worry about, set its "don't make
1674: // me an accepting state" flag if we've seen a slash, and add
1675: // it to the end of the state table
1676: int newState = tempStateTable.size();
1677: if (loopingStates.size() != 0) {
1678: statesToBackfill.addElement(new Integer(
1679: newState));
1680: }
1681: state = new short[numCategories + 1];
1682: if (sawEarlyBreak) {
1683: state[numCategories] = DONT_LOOP_FLAG;
1684: }
1685: tempStateTable.addElement(state);
1686:
1687: // update everybody in the decision point list to point to
1688: // the new state (this also performs all the reconciliation
1689: // needed to make the table deterministic), then clear the
1690: // decision point list
1691: updateStateTable(decisionPointList,
1692: pendingChars, (short) newState);
1693: decisionPointList.removeAllElements();
1694:
1695: // add all states created since the last literal character we've
1696: // seen to the decision point list
1697: lastState = currentState;
1698: do {
1699: ++currentState;
1700: decisionPointList.addElement(new Integer(
1701: currentState));
1702: } while (currentState + 1 < tempStateTable
1703: .size());
1704: }
1705: }
1706:
1707: // a { marks the beginning of an optional run of characters. Push a
1708: // copy of the current decision point list onto the stack. This saves
1709: // it, preventing it from being affected by whatever's inside the parentheses.
1710: // This decision point list is restored when a } is encountered.
1711: else if (c == '{') {
1712: decisionPointStack.push(decisionPointList.clone());
1713: }
1714:
1715: // a } marks the end of an optional run of characters. Pop the last decision
1716: // point list off the stack and merge it with the current decision point list.
1717: // a * denotes a repeating character or group (* after () is handled separately
1718: // below). In addition to restoring the decision point list, modify the
1719: // current state to point to itself on the appropriate character categories.
1720: else if (c == '}' || c == '*') {
1721: // when there's a *, update the current state to loop back on itself
1722: // on the character categories that caused us to enter this state
1723: if (c == '*') {
1724: for (int i = lastState + 1; i < tempStateTable
1725: .size(); i++) {
1726: Vector temp = new Vector();
1727: temp.addElement(new Integer(i));
1728: updateStateTable(temp, pendingChars,
1729: (short) (lastState + 1));
1730: }
1731: }
1732:
1733: // pop the top element off the decision point stack and merge
1734: // it with the current decision point list (this causes the divergent
1735: // paths through the state table to come together again on the next
1736: // new state)
1737: Vector temp = (Vector) decisionPointStack.pop();
1738: for (int i = 0; i < decisionPointList.size(); i++)
1739: temp.addElement(decisionPointList.elementAt(i));
1740: decisionPointList = temp;
1741: }
1742:
1743: // a ? after a * modifies the behavior of * in cases where there is overlap
1744: // between the set of characters that repeat and the characters which follow.
1745: // Without the ?, all states following the repeating state, up to a state which
1746: // is reached by a character that doesn't overlap, will loop back into the
1747: // repeating state. With the ?, the mark states following the *? DON'T loop
1748: // back into the repeating state. Thus, "[a-z]*xyz" will match the longest
1749: // sequence of letters that ends in "xyz," while "[a-z]*? will match the
1750: // _shortest_ sequence of letters that ends in "xyz".
1751: // We use extra bookkeeping to achieve this effect, since everything else works
1752: // according to the "longest possible match" principle. The basic principle
1753: // is that transitions out of a looping state are written in over the looping
1754: // value instead of being reconciled, and that we copy the contents of the
1755: // looping state into empty cells of all non-terminal states that follow the
1756: // looping state.
1757: else if (c == '?') {
1758: setLoopingStates(decisionPointList,
1759: decisionPointList);
1760: }
1761:
1762: // a ( marks the beginning of a sequence of characters. Parentheses can either
1763: // contain several alternative character sequences (i.e., "(ab|cd|ef)"), or
1764: // they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus,
1765: // A () group can have multiple entry and exit points. To keep track of this,
1766: // we reserve TWO spots on the decision-point stack. The top of the stack is
1767: // the list of exit points, which becomes the current decision point list when
1768: // the ) is reached. The next entry down is the decision point list at the
1769: // beginning of the (), which becomes the current decision point list at every
1770: // entry point.
1771: // In addition to keeping track of the exit points and the active decision
1772: // points before the ( (i.e., the places from which the () can be entered),
1773: // we need to keep track of the entry points in case the expression loops
1774: // (i.e., is followed by *). We do that by creating a dummy state in the
1775: // state table and adding it to the decision point list (BEFORE it's duplicated
1776: // on the stack). Nobody points to this state, so it'll get optimized out
1777: // at the end. It exists only to hold the entry points in case the ()
1778: // expression loops.
1779: else if (c == '(') {
1780:
1781: // add a new state to the state table to hold the entry points into
1782: // the () expression
1783: tempStateTable
1784: .addElement(new short[numCategories + 1]);
1785:
1786: // we have to adjust lastState and currentState to account for the
1787: // new dummy state
1788: lastState = currentState;
1789: ++currentState;
1790:
1791: // add the current state to the decision point list (add it at the
1792: // BEGINNING so we can find it later)
1793: decisionPointList.insertElementAt(new Integer(
1794: currentState), 0);
1795:
1796: // finally, push a copy of the current decision point list onto the
1797: // stack (this keeps track of the active decision point list before
1798: // the () expression), followed by an empty decision point list
1799: // (this will hold the exit points)
1800: decisionPointStack.push(decisionPointList.clone());
1801: decisionPointStack.push(new Vector());
1802: }
1803:
1804: // a | separates alternative character sequences in a () expression. When
1805: // a | is encountered, we add the current decision point list to the exit-point
1806: // list, and restore the decision point list to its state prior to the (.
1807: else if (c == '|') {
1808:
1809: // pick out the top two decision point lists on the stack
1810: Vector oneDown = (Vector) decisionPointStack.pop();
1811: Vector twoDown = (Vector) decisionPointStack.peek();
1812: decisionPointStack.push(oneDown);
1813:
1814: // append the current decision point list to the list below it
1815: // on the stack (the list of exit points), and restore the
1816: // current decision point list to its state before the () expression
1817: for (int i = 0; i < decisionPointList.size(); i++)
1818: oneDown.addElement(decisionPointList
1819: .elementAt(i));
1820: decisionPointList = (Vector) twoDown.clone();
1821: }
1822:
1823: // a ) marks the end of a sequence of characters. We do one of two things
1824: // depending on whether the sequence repeats (i.e., whether the ) is followed
1825: // by *): If the sequence doesn't repeat, then the exit-point list is merged
1826: // with the current decision point list and the decision point list from before
1827: // the () is thrown away. If the sequence does repeat, then we fish out the
1828: // state we were in before the ( and copy all of its forward transitions
1829: // (i.e., every transition added by the () expression) into every state in the
1830: // exit-point list and the current decision point list. The current decision
1831: // point list is then merged with both the exit-point list AND the saved version
1832: // of the decision point list from before the (). Then we throw out the *.
1833: else if (c == ')') {
1834:
1835: // pull the exit point list off the stack, merge it with the current
1836: // decision point list, and make the merged version the current
1837: // decision point list
1838: Vector exitPoints = (Vector) decisionPointStack
1839: .pop();
1840: for (int i = 0; i < decisionPointList.size(); i++)
1841: exitPoints.addElement(decisionPointList
1842: .elementAt(i));
1843: decisionPointList = exitPoints;
1844:
1845: // if the ) isn't followed by a *, then all we have to do is throw
1846: // away the other list on the decision point stack, and we're done
1847: if (p + 1 >= rule.length()
1848: || rule.charAt(p + 1) != '*') {
1849: decisionPointStack.pop();
1850: }
1851:
1852: // but if the sequence repeats, we have a lot more work to do...
1853: else {
1854:
1855: // now exitPoints and decisionPointList have to point to equivalent
1856: // vectors, but not the SAME vector
1857: exitPoints = (Vector) decisionPointList.clone();
1858:
1859: // pop the original decision point list off the stack
1860: Vector temp = (Vector) decisionPointStack.pop();
1861:
1862: // we squirreled away the row number of our entry point list
1863: // at the beginning of the original decision point list. Fish
1864: // that state number out and retrieve the entry point list
1865: int tempStateNum = ((Integer) temp
1866: .firstElement()).intValue();
1867: short[] tempState = (short[]) tempStateTable
1868: .elementAt(tempStateNum);
1869:
1870: // merge the original decision point list with the current
1871: // decision point list
1872: for (int i = 0; i < decisionPointList.size(); i++)
1873: temp.addElement(decisionPointList
1874: .elementAt(i));
1875: decisionPointList = temp;
1876:
1877: // finally, copy every forward reference from the entry point
1878: // list into every state in the new decision point list
1879: for (int i = 0; i < tempState.length; i++) {
1880: if (tempState[i] > tempStateNum) {
1881: updateStateTable(exitPoints,
1882: new Character(
1883: (char) (i + 0x100))
1884: .toString(),
1885: tempState[i]);
1886: }
1887: }
1888:
1889: // update lastState and currentState, and throw away the *
1890: lastState = currentState;
1891: currentState = tempStateTable.size() - 1;
1892: ++p;
1893: }
1894: }
1895:
1896: // a / marks the position where the break is to go if the character sequence
1897: // matches this rule. We update the flag word of every state on the decision
1898: // point list to mark them as ending states, and take note of the fact that
1899: // we've seen the slash
1900: else if (c == '/') {
1901: sawEarlyBreak = true;
1902: for (int i = 0; i < decisionPointList.size(); i++) {
1903: state = (short[]) tempStateTable
1904: .elementAt(((Integer) decisionPointList
1905: .elementAt(i)).intValue());
1906: state[numCategories] |= LOOKAHEAD_STATE_FLAG;
1907: }
1908: }
1909:
1910: // if we get here without executing any of the above clauses, we have a
1911: // syntax error. However, for now we just ignore the offending character
1912: // and move on
1913:
1914: // clearLoopingStates is a signal back from updateStateTable() that we've
1915: // transitioned to a state that won't loop back to the current looping
1916: // state. (In other words, we've gotten to a point where we can no longer
1917: // go back into a *? we saw earlier.) Clear out the list of looping states
1918: // and backfill any states that need to be backfilled.
1919: if (clearLoopingStates) {
1920: setLoopingStates(null, decisionPointList);
1921: }
1922:
1923: // advance to the next character, now that we've processed the current
1924: // character
1925: ++p;
1926: }
1927:
1928: // this takes care of backfilling any states that still need to be backfilled
1929: setLoopingStates(null, decisionPointList);
1930:
1931: // when we reach the end of the string, we do a postprocessing step to mark the
1932: // end states. The decision point list contains every state that can transition
1933: // to the end state-- that is, every state that is the last state in a sequence
1934: // that matches the rule. All of these states are considered "mark states"
1935: // or "accepting states"-- that is, states that cause the position returned from
1936: // next() to be updated. A mark state represents a possible break position.
1937: // This allows us to look ahead and remember how far the rule matched
1938: // before following the new branch (see next() for more information).
1939: // The temporary state table has an extra "flag column" at the end where this
1940: // information is stored. We mark the end states by setting a flag in their
1941: // flag column.
1942: // Now if we saw the / in the rule, then everything after it is lookahead
1943: // material and the break really goes where the slash is. In this case,
1944: // we mark these states as BOTH accepting states and lookahead states. This
1945: // signals that these states cause the break position to be updated to the
1946: // position of the slash rather than the current break position.
1947: for (int i = 0; i < decisionPointList.size(); i++) {
1948: int rowNum = ((Integer) decisionPointList.elementAt(i))
1949: .intValue();
1950: state = (short[]) tempStateTable.elementAt(rowNum);
1951: state[numCategories] |= END_STATE_FLAG;
1952: if (sawEarlyBreak) {
1953: state[numCategories] |= LOOKAHEAD_STATE_FLAG;
1954: }
1955: }
1956: }
1957:
1958: /**
1959: * Update entries in the state table, and merge states when necessary to keep
1960: * the table deterministic.
1961: * @param rows The list of rows that need updating (the decision point list)
1962: * @param pendingChars A character category list, encoded in a String. This is the
1963: * list of the columns that need updating.
1964: * @param newValue Update the cells specfied above to contain this value
1965: */
1966: private void updateStateTable(Vector rows, String pendingChars,
1967: short newValue) {
1968: // create a dummy state that has the specified row number (newValue) in
1969: // the cells that need to be updated (those specified by pendingChars)
1970: // and 0 in the other cells
1971: short[] newValues = new short[numCategories + 1];
1972: for (int i = 0; i < pendingChars.length(); i++)
1973: newValues[(int) (pendingChars.charAt(i)) - 0x100] = newValue;
1974:
1975: // go through the list of rows to update, and update them by calling
1976: // mergeStates() to merge them the the dummy state we created
1977: for (int i = 0; i < rows.size(); i++) {
1978: mergeStates(((Integer) rows.elementAt(i)).intValue(),
1979: newValues, rows);
1980: }
1981: }
1982:
1983: /**
1984: * The real work of making the state table deterministic happens here. This function
1985: * merges a state in the state table (specified by rowNum) with a state that is
1986: * passed in (newValues). The basic process is to copy the nonzero cells in newStates
1987: * into the state in the state table (we'll call that oldValues). If there's a
1988: * collision (i.e., if the same cell has a nonzero value in both states, and it's
1989: * not the SAME value), then we have to reconcile the collision. We do this by
1990: * creating a new state, adding it to the end of the state table, and using this
1991: * function recursively to merge the original two states into a single, combined
1992: * state. This process may happen recursively (i.e., each successive level may
1993: * involve collisions). To prevent infinite recursion, we keep a log of merge
1994: * operations. Any time we're merging two states we've merged before, we can just
1995: * supply the row number for the result of that merge operation rather than creating
1996: * a new state just like it.
1997: * @param rowNum The row number in the state table of the state to be updated
1998: * @param newValues The state to merge it with.
1999: * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
2000: * (itself a copy of the decision point list from parseRule()). Newly-created
2001: * states get added to the decision point list if their "parents" were on it.
2002: */
2003: private void mergeStates(int rowNum, short[] newValues,
2004: Vector rowsBeingUpdated) {
2005: short[] oldValues = (short[]) (tempStateTable
2006: .elementAt(rowNum));
2007: boolean isLoopingState = loopingStates
2008: .contains(new Integer(rowNum));
2009:
2010: // for each of the cells in the rows we're reconciling, do...
2011: for (int i = 0; i < oldValues.length; i++) {
2012:
2013: // if they contain the same value, we don't have to do anything
2014: if (oldValues[i] == newValues[i]) {
2015: continue;
2016: }
2017:
2018: // if oldValues is a looping state and the state the current cell points to
2019: // is too, then we can just stomp over the current value of that cell (and
2020: // set the clear-looping-states flag if necessary)
2021: else if (isLoopingState
2022: && loopingStates.contains(new Integer(
2023: oldValues[i]))) {
2024: if (newValues[i] != 0) {
2025: if (oldValues[i] == 0) {
2026: clearLoopingStates = true;
2027: }
2028: oldValues[i] = newValues[i];
2029: }
2030: }
2031:
2032: // if the current cell in oldValues is 0, copy in the corresponding value
2033: // from newValues
2034: else if (oldValues[i] == 0) {
2035: oldValues[i] = newValues[i];
2036: }
2037:
2038: // the last column of each row is the flag column. Take care to merge the
2039: // flag words correctly
2040: else if (i == numCategories) {
2041: oldValues[i] = (short) ((newValues[i] & ALL_FLAGS) | oldValues[i]);
2042: }
2043:
2044: // if both newValues and oldValues have a nonzero value in the current
2045: // cell, and it isn't the same value both places...
2046: else if (oldValues[i] != 0 && newValues[i] != 0) {
2047:
2048: // look up this pair of cell values in the merge list. If it's
2049: // found, update the cell in oldValues to point to the merged state
2050: int combinedRowNum = searchMergeList(oldValues[i],
2051: newValues[i]);
2052: if (combinedRowNum != 0) {
2053: oldValues[i] = (short) combinedRowNum;
2054: }
2055:
2056: // otherwise, we have to reconcile them...
2057: else {
2058: // copy our row numbers into variables to make things easier
2059: int oldRowNum = oldValues[i];
2060: int newRowNum = newValues[i];
2061: combinedRowNum = tempStateTable.size();
2062:
2063: // add this pair of row numbers to the merge list (create it first
2064: // if we haven't created the merge list yet)
2065: if (mergeList == null) {
2066: mergeList = new Vector();
2067: }
2068: mergeList.addElement(new int[] { oldRowNum,
2069: newRowNum, combinedRowNum });
2070:
2071: // create a new row to represent the merged state, and copy the
2072: // contents of oldRow into it, then add it to the end of the
2073: // state table and update the original row (oldValues) to point
2074: // to the new, merged, state
2075: short[] newRow = new short[numCategories + 1];
2076: short[] oldRow = (short[]) (tempStateTable
2077: .elementAt(oldRowNum));
2078: System.arraycopy(oldRow, 0, newRow, 0,
2079: numCategories + 1);
2080: tempStateTable.addElement(newRow);
2081: oldValues[i] = (short) combinedRowNum;
2082:
2083: // if the decision point list contains either of the parent rows,
2084: // update it to include the new row as well
2085: if ((decisionPointList.contains(new Integer(
2086: oldRowNum)) || decisionPointList
2087: .contains(new Integer(newRowNum)))
2088: && !decisionPointList
2089: .contains(new Integer(
2090: combinedRowNum))) {
2091: decisionPointList.addElement(new Integer(
2092: combinedRowNum));
2093: }
2094:
2095: // do the same thing with the list of rows being updated
2096: if ((rowsBeingUpdated.contains(new Integer(
2097: oldRowNum)) || rowsBeingUpdated
2098: .contains(new Integer(newRowNum)))
2099: && !rowsBeingUpdated
2100: .contains(new Integer(
2101: combinedRowNum))) {
2102: decisionPointList.addElement(new Integer(
2103: combinedRowNum));
2104: }
2105: // now (groan) do the same thing for all the entries on the
2106: // decision point stack
2107: for (int k = 0; k < decisionPointStack.size(); k++) {
2108: Vector dpl = (Vector) decisionPointStack
2109: .elementAt(k);
2110: if ((dpl.contains(new Integer(oldRowNum)) || dpl
2111: .contains(new Integer(newRowNum)))
2112: && !dpl.contains(new Integer(
2113: combinedRowNum))) {
2114: dpl.addElement(new Integer(
2115: combinedRowNum));
2116: }
2117: }
2118:
2119: // FINALLY (puff puff puff), call mergeStates() recursively to copy
2120: // the row referred to by newValues into the new row and resolve any
2121: // conflicts that come up at that level
2122: mergeStates(combinedRowNum,
2123: (short[]) (tempStateTable
2124: .elementAt(newValues[i])),
2125: rowsBeingUpdated);
2126: }
2127: }
2128: }
2129: return;
2130: }
2131:
2132: /**
2133: * The merge list is a list of pairs of rows that have been merged somewhere in
2134: * the process of building this state table, along with the row number of the
2135: * row containing the merged state. This function looks up a pair of row numbers
2136: * and returns the row number of the row they combine into. (It returns 0 if
2137: * this pair of rows isn't in the merge list.)
2138: */
2139: private int searchMergeList(int a, int b) {
2140: // if there is no merge list, there obviously isn't anything in it
2141: if (mergeList == null) {
2142: return 0;
2143: }
2144:
2145: // otherwise, for each element in the merge list...
2146: else {
2147: int[] entry;
2148: for (int i = 0; i < mergeList.size(); i++) {
2149: entry = (int[]) (mergeList.elementAt(i));
2150:
2151: // we have a hit if the two row numbers match the two row numbers
2152: // in the beginning of the entry (the two that combine), in either
2153: // order
2154: if ((entry[0] == a && entry[1] == b)
2155: || (entry[0] == b && entry[1] == a)) {
2156: return entry[2];
2157: }
2158:
2159: // we also have a hit if one of the two row numbers matches the marged
2160: // row number and the other one matches one of the original row numbers
2161: if ((entry[2] == a && (entry[0] == b || entry[1] == b))) {
2162: return entry[2];
2163: }
2164: if ((entry[2] == b && (entry[0] == a || entry[1] == a))) {
2165: return entry[2];
2166: }
2167: }
2168: return 0;
2169: }
2170: }
2171:
2172: /**
2173: * This function is used to update the list of current loooping states (i.e.,
2174: * states that are controlled by a *? construct). It backfills values from
2175: * the looping states into unpopulated cells of the states that are currently
2176: * marked for backfilling, and then updates the list of looping states to be
2177: * the new list
2178: * @param newLoopingStates The list of new looping states
2179: * @param endStates The list of states to treat as end states (states that
2180: * can exit the loop).
2181: */
2182: private void setLoopingStates(Vector newLoopingStates,
2183: Vector endStates) {
2184:
2185: // if the current list of looping states isn't empty, we have to backfill
2186: // values from the looping states into the states that are waiting to be
2187: // backfilled
2188: if (!loopingStates.isEmpty()) {
2189: int loopingState = ((Integer) loopingStates
2190: .lastElement()).intValue();
2191: int rowNum;
2192:
2193: // don't backfill into an end state OR any state reachable from an end state
2194: // (since the search for reachable states is recursive, it's split out into
2195: // a separate function, eliminateBackfillStates(), below)
2196: for (int i = 0; i < endStates.size(); i++) {
2197: eliminateBackfillStates(((Integer) endStates
2198: .elementAt(i)).intValue());
2199: }
2200:
2201: // we DON'T actually backfill the states that need to be backfilled here.
2202: // Instead, we MARK them for backfilling. The reason for this is that if
2203: // there are multiple rules in the state-table description, the looping
2204: // states may have some of their values changed by a succeeding rule, and
2205: // this wouldn't be reflected in the backfilled states. We mark a state
2206: // for backfilling by putting the row number of the state to copy from
2207: // into the flag cell at the end of the row
2208: for (int i = 0; i < statesToBackfill.size(); i++) {
2209: rowNum = ((Integer) statesToBackfill.elementAt(i))
2210: .intValue();
2211: short[] state = (short[]) tempStateTable
2212: .elementAt(rowNum);
2213: state[numCategories] = (short) ((state[numCategories] & ALL_FLAGS) | loopingState);
2214: }
2215: statesToBackfill.removeAllElements();
2216: loopingStates.removeAllElements();
2217: }
2218:
2219: if (newLoopingStates != null) {
2220: loopingStates = (Vector) newLoopingStates.clone();
2221: }
2222: }
2223:
2224: /**
2225: * This removes "ending states" and states reachable from them from the
2226: * list of states to backfill.
2227: * @param The row number of the state to remove from the backfill list
2228: */
2229: private void eliminateBackfillStates(int baseState) {
2230:
2231: // don't do anything unless this state is actually in the backfill list...
2232: if (statesToBackfill.contains(new Integer(baseState))) {
2233:
2234: // if it is, take it out
2235: statesToBackfill.removeElement(new Integer(baseState));
2236:
2237: // then go through and recursively call this function for every
2238: // state that the base state points to
2239: short[] state = (short[]) tempStateTable
2240: .elementAt(baseState);
2241: for (int i = 0; i < numCategories; i++) {
2242: if (state[i] != 0) {
2243: eliminateBackfillStates(state[i]);
2244: }
2245: }
2246: }
2247: }
2248:
2249: /**
2250: * This function completes the backfilling process by actually doing the
2251: * backfilling on the states that are marked for it
2252: */
2253: private void backfillLoopingStates() {
2254: short[] state;
2255: short[] loopingState = null;
2256: int loopingStateRowNum = 0;
2257: int fromState;
2258:
2259: // for each state in the state table...
2260: for (int i = 0; i < tempStateTable.size(); i++) {
2261: state = (short[]) tempStateTable.elementAt(i);
2262:
2263: // check the state's flag word to see if it's marked for backfilling
2264: // (it's marked for backfilling if any bits other than the two high-order
2265: // bits are set-- if they are, then the flag word, minus the two high bits,
2266: // is the row number to copy from)
2267: fromState = state[numCategories] & ~ALL_FLAGS;
2268: if (fromState > 0) {
2269:
2270: // load up the state to copy from (if we haven't already)
2271: if (fromState != loopingStateRowNum) {
2272: loopingStateRowNum = fromState;
2273: loopingState = (short[]) tempStateTable
2274: .elementAt(loopingStateRowNum);
2275: }
2276:
2277: // clear out the backfill part of the flag word
2278: state[numCategories] &= ALL_FLAGS;
2279:
2280: // then fill all zero cells in the current state with values
2281: // from the corresponding cells of the fromState
2282: for (int j = 0; j < state.length; j++) {
2283: if (state[j] == 0) {
2284: state[j] = loopingState[j];
2285: } else if (state[j] == DONT_LOOP_FLAG) {
2286: state[j] = 0;
2287: }
2288: }
2289: }
2290: }
2291: }
2292:
2293: /**
2294: * This function completes the state-table-building process by doing several
2295: * postprocessing steps and copying everything into its final resting place
2296: * in the iterator itself
2297: * @param forward True if we're working on the forward state table
2298: */
2299: private void finishBuildingStateTable(boolean forward) {
2300: // start by backfilling the looping states
2301: backfillLoopingStates();
2302:
2303: int[] rowNumMap = new int[tempStateTable.size()];
2304: Stack rowsToFollow = new Stack();
2305: rowsToFollow.push(new Integer(1));
2306: rowNumMap[1] = 1;
2307:
2308: // determine which states are no longer reachable from the start state
2309: // (the reachable states will have their row numbers in the row number
2310: // map, and the nonreachable states will have zero in the row number map)
2311: while (rowsToFollow.size() != 0) {
2312: int rowNum = ((Integer) rowsToFollow.pop()).intValue();
2313: short[] row = (short[]) (tempStateTable
2314: .elementAt(rowNum));
2315:
2316: for (int i = 0; i < numCategories; i++) {
2317: if (row[i] != 0) {
2318: if (rowNumMap[row[i]] == 0) {
2319: rowNumMap[row[i]] = row[i];
2320: rowsToFollow.push(new Integer(row[i]));
2321: }
2322: }
2323: }
2324: }
2325:
2326: boolean madeChange;
2327: int newRowNum;
2328:
2329: // algorithm for minimizing the number of states in the table adapted from
2330: // Aho & Ullman, "Principles of Compiler Design"
2331: // The basic idea here is to organize the states into classes. When we're done,
2332: // all states in the same class can be considered identical and all but one eliminated.
2333:
2334: // initially assign states to classes based on the number of populated cells they
2335: // contain (the class number is the number of populated cells)
2336: int[] stateClasses = new int[tempStateTable.size()];
2337: int nextClass = numCategories + 1;
2338: short[] state1, state2;
2339: for (int i = 1; i < stateClasses.length; i++) {
2340: if (rowNumMap[i] == 0) {
2341: continue;
2342: }
2343: state1 = (short[]) tempStateTable.elementAt(i);
2344: for (int j = 0; j < numCategories; j++) {
2345: if (state1[j] != 0) {
2346: ++stateClasses[i];
2347: }
2348: }
2349: if (stateClasses[i] == 0) {
2350: stateClasses[i] = nextClass;
2351: }
2352: }
2353: ++nextClass;
2354:
2355: // then, for each class, elect the first member of that class as that class's
2356: // "representative". For each member of the class, compare it to the "representative."
2357: // If there's a column position where the state being tested transitions to a
2358: // state in a DIFFERENT class from the class where the "representative" transitions,
2359: // then move the state into a new class. Repeat this process until no new classes
2360: // are created.
2361: int currentClass;
2362: int lastClass;
2363: boolean split;
2364:
2365: do {
2366: currentClass = 1;
2367: lastClass = nextClass;
2368: while (currentClass < nextClass) {
2369: split = false;
2370: state1 = state2 = null;
2371: for (int i = 0; i < stateClasses.length; i++) {
2372: if (stateClasses[i] == currentClass) {
2373: if (state1 == null) {
2374: state1 = (short[]) tempStateTable
2375: .elementAt(i);
2376: } else {
2377: state2 = (short[]) tempStateTable
2378: .elementAt(i);
2379: for (int j = 0; j < state2.length; j++) {
2380: if ((j == numCategories
2381: && state1[j] != state2[j] && forward)
2382: || (j != numCategories && stateClasses[state1[j]] != stateClasses[state2[j]])) {
2383: stateClasses[i] = nextClass;
2384: split = true;
2385: break;
2386: }
2387: }
2388: }
2389: }
2390: }
2391: if (split) {
2392: ++nextClass;
2393: }
2394: ++currentClass;
2395: }
2396: } while (lastClass != nextClass);
2397:
2398: // at this point, all of the states in a class except the first one (the
2399: //"representative") can be eliminated, so update the row-number map accordingly
2400: int[] representatives = new int[nextClass];
2401: for (int i = 1; i < stateClasses.length; i++)
2402: if (representatives[stateClasses[i]] == 0) {
2403: representatives[stateClasses[i]] = i;
2404: } else {
2405: rowNumMap[i] = representatives[stateClasses[i]];
2406: }
2407:
2408: // renumber all remaining rows...
2409: // first drop all that are either unreferenced or not a class representative
2410: for (int i = 1; i < rowNumMap.length; i++) {
2411: if (rowNumMap[i] != i) {
2412: tempStateTable.setElementAt(null, i);
2413: }
2414: }
2415:
2416: // then calculate everybody's new row number and update the row
2417: // number map appropriately (the first pass updates the row numbers
2418: // of all the class representatives [the rows we're keeping], and the
2419: // second pass updates the cross references for all the rows that
2420: // are being deleted)
2421: newRowNum = 1;
2422: for (int i = 1; i < rowNumMap.length; i++) {
2423: if (tempStateTable.elementAt(i) != null) {
2424: rowNumMap[i] = newRowNum++;
2425: }
2426: }
2427: for (int i = 1; i < rowNumMap.length; i++) {
2428: if (tempStateTable.elementAt(i) == null) {
2429: rowNumMap[i] = rowNumMap[rowNumMap[i]];
2430: }
2431: }
2432:
2433: // allocate the permanent state table, and copy the remaining rows into it
2434: // (adjusting all the cell values, of course)
2435:
2436: // this section does that for the forward state table
2437: if (forward) {
2438: endStates = new boolean[newRowNum];
2439: lookaheadStates = new boolean[newRowNum];
2440: stateTable = new short[newRowNum * numCategories];
2441: int p = 0;
2442: int p2 = 0;
2443: for (int i = 0; i < tempStateTable.size(); i++) {
2444: short[] row = (short[]) (tempStateTable
2445: .elementAt(i));
2446: if (row == null) {
2447: continue;
2448: }
2449: for (int j = 0; j < numCategories; j++) {
2450: stateTable[p] = (short) (rowNumMap[row[j]]);
2451: ++p;
2452: }
2453: endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0);
2454: lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0);
2455: ++p2;
2456: }
2457: }
2458:
2459: // and this section does it for the backward state table
2460: else {
2461: backwardsStateTable = new short[newRowNum
2462: * numCategories];
2463: int p = 0;
2464: for (int i = 0; i < tempStateTable.size(); i++) {
2465: short[] row = (short[]) (tempStateTable
2466: .elementAt(i));
2467: if (row == null) {
2468: continue;
2469: }
2470: for (int j = 0; j < numCategories; j++) {
2471: backwardsStateTable[p] = (short) (rowNumMap[row[j]]);
2472: ++p;
2473: }
2474: }
2475: }
2476: }
2477:
2478: /**
2479: * This function builds the backward state table from the forward state
2480: * table and any additional rules (identified by the ! on the front)
2481: * supplied in the description
2482: */
2483: private void buildBackwardsStateTable(Vector tempRuleList) {
2484:
2485: // create the temporary state table and seed it with two rows (row 0
2486: // isn't used for anything, and we have to create row 1 (the initial
2487: // state) before we can do anything else
2488: tempStateTable = new Vector();
2489: tempStateTable.addElement(new short[numCategories + 1]);
2490: tempStateTable.addElement(new short[numCategories + 1]);
2491:
2492: // although the backwards state table is built automatically from the forward
2493: // state table, there are some situations (the default sentence-break rules,
2494: // for example) where this doesn't yield enough stop states, causing a dramatic
2495: // drop in performance. To help with these cases, the user may supply
2496: // supplemental rules that are added to the backward state table. These have
2497: // the same syntax as the normal break rules, but begin with '!' to distinguish
2498: // them from normal break rules
2499: for (int i = 0; i < tempRuleList.size(); i++) {
2500: String rule = (String) tempRuleList.elementAt(i);
2501: if (rule.charAt(0) == '!') {
2502: parseRule(rule.substring(1), false);
2503: }
2504: }
2505: backfillLoopingStates();
2506:
2507: // Backwards iteration is qualitatively different from forwards iteration.
2508: // This is because backwards iteration has to be made to operate from no context
2509: // at all-- the user should be able to ask BreakIterator for the break position
2510: // immediately on either side of some arbitrary offset in the text. The
2511: // forward iteration table doesn't let us do that-- it assumes complete
2512: // information on the context, which means starting from the beginning of the
2513: // document.
2514: // The way we do backward and random-access iteration is to back up from the
2515: // current (or user-specified) position until we see something we're sure is
2516: // a break position (it may not be the last break position immediately
2517: // preceding our starting point, however). Then we roll forward from there to
2518: // locate the actual break position we're after.
2519: // This means that the backwards state table doesn't have to identify every
2520: // break position, allowing the building algorithm to be much simpler. Here,
2521: // we use a "pairs" approach, scanning the forward-iteration state table for
2522: // pairs of character categories we ALWAYS break between, and building a state
2523: // table from that information. No context is required-- all this state table
2524: // looks at is a pair of adjacent characters.
2525:
2526: // It's possible that the user has supplied supplementary rules (see above).
2527: // This has to be done first to keep parseRule() and friends from becoming
2528: // EVEN MORE complicated. The automatically-generated states are appended
2529: // onto the end of the state table, and then the two sets of rules are
2530: // stitched together at the end. Take note of the row number of the
2531: // first row of the auromatically-generated part.
2532: int backTableOffset = tempStateTable.size();
2533: if (backTableOffset > 2) {
2534: ++backTableOffset;
2535: }
2536:
2537: // the automatically-generated part of the table models a two-dimensional
2538: // array where the two dimensions represent the two characters we're currently
2539: // looking at. To model this as a state table, we actually need one additional
2540: // row to represent the initial state. It gets populated with the row numbers
2541: // of the other rows (in order).
2542: for (int i = 0; i < numCategories + 1; i++)
2543: tempStateTable.addElement(new short[numCategories + 1]);
2544:
2545: short[] state = (short[]) tempStateTable
2546: .elementAt(backTableOffset - 1);
2547: for (int i = 0; i < numCategories; i++)
2548: state[i] = (short) (i + backTableOffset);
2549:
2550: // scavenge the forward state table for pairs of character categories
2551: // that always have a break between them. The algorithm is as follows:
2552: // Look down each column in the state table. For each nonzero cell in
2553: // that column, look up the row it points to. For each nonzero cell in
2554: // that row, populate a cell in the backwards state table: the row number
2555: // of that cell is the number of the column we were scanning (plus the
2556: // offset that locates this sub-table), and the column number of that cell
2557: // is the column number of the nonzero cell we just found. This cell is
2558: // populated with its own column number (adjusted according to the actual
2559: // location of the sub-table). This process will produce a state table
2560: // whose behavior is the same as looking up successive pairs of characters
2561: // in an array of Booleans to determine whether there is a break.
2562: int numRows = stateTable.length / numCategories;
2563: for (int column = 0; column < numCategories; column++) {
2564: for (int row = 0; row < numRows; row++) {
2565: int nextRow = lookupState(row, column);
2566: if (nextRow != 0) {
2567: for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) {
2568: int cellValue = lookupState(nextRow,
2569: nextColumn);
2570: if (cellValue != 0) {
2571: state = (short[]) tempStateTable
2572: .elementAt(nextColumn
2573: + backTableOffset);
2574: state[column] = (short) (column + backTableOffset);
2575: }
2576: }
2577: }
2578: }
2579: }
2580:
2581: // if the user specified some backward-iteration rules with the ! token,
2582: // we have to merge the resulting state table with the auto-generated one
2583: // above. First copy the populated cells from row 1 over the populated
2584: // cells in the auto-generated table. Then copy values from row 1 of the
2585: // auto-generated table into all of the the unpopulated cells of the
2586: // rule-based table.
2587: if (backTableOffset > 1) {
2588:
2589: // for every row in the auto-generated sub-table, if a cell is
2590: // populated that is also populated in row 1 of the rule-based
2591: // sub-table, copy the value from row 1 over the value in the
2592: // auto-generated sub-table
2593: state = (short[]) tempStateTable.elementAt(1);
2594: for (int i = backTableOffset - 1; i < tempStateTable
2595: .size(); i++) {
2596: short[] state2 = (short[]) tempStateTable
2597: .elementAt(i);
2598: for (int j = 0; j < numCategories; j++) {
2599: if (state[j] != 0 && state2[j] != 0) {
2600: state2[j] = state[j];
2601: }
2602: }
2603: }
2604:
2605: // now, for every row in the rule-based sub-table that is not
2606: // an end state, fill in all unpopulated cells with the values
2607: // of the corresponding cells in the first row of the auto-
2608: // generated sub-table.
2609: state = (short[]) tempStateTable
2610: .elementAt(backTableOffset - 1);
2611: for (int i = 1; i < backTableOffset - 1; i++) {
2612: short[] state2 = (short[]) tempStateTable
2613: .elementAt(i);
2614: if ((state2[numCategories] & END_STATE_FLAG) == 0) {
2615: for (int j = 0; j < numCategories; j++) {
2616: if (state2[j] == 0) {
2617: state2[j] = state[j];
2618: }
2619: }
2620: }
2621: }
2622: }
2623:
2624: // finally, clean everything up and copy it into the actual BreakIterator
2625: // by calling finishBuildingStateTable()
2626: finishBuildingStateTable(false);
2627: }
2628:
2629: /**
2630: * Throws an IllegalArgumentException representing a syntax error in the rule
2631: * description. The exception's message contains some debugging information.
2632: * @param message A message describing the problem
2633: * @param position The position in the description where the problem was
2634: * discovered
2635: * @param context The string containing the error
2636: */
2637: protected void error(String message, int position,
2638: String context) {
2639: throw new IllegalArgumentException(
2640: "Parse error at position (" + position + "): "
2641: + message + "\n"
2642: + context.substring(0, position)
2643: + " -here- " + context.substring(position));
2644: }
2645:
2646: }
2647:
2648: /*
2649: * This class exists to work around a bug in incorrect implementations
2650: * of CharacterIterator, which incorrectly handle setIndex(endIndex).
2651: * This iterator relies only on base.setIndex(n) where n is less than
2652: * endIndex.
2653: *
2654: * One caveat: if the base iterator's begin and end indices change
2655: * the change will not be reflected by this wrapper. Does that matter?
2656: */
2657: private static final class SafeCharIterator implements
2658: CharacterIterator, Cloneable {
2659:
2660: private CharacterIterator base;
2661: private int rangeStart;
2662: private int rangeLimit;
2663: private int currentIndex;
2664:
2665: SafeCharIterator(CharacterIterator base) {
2666: this .base = base;
2667: this .rangeStart = base.getBeginIndex();
2668: this .rangeLimit = base.getEndIndex();
2669: this .currentIndex = base.getIndex();
2670: }
2671:
2672: public char first() {
2673: return setIndex(rangeStart);
2674: }
2675:
2676: public char last() {
2677: return setIndex(rangeLimit - 1);
2678: }
2679:
2680: public char current() {
2681: if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
2682: return DONE;
2683: } else {
2684: return base.setIndex(currentIndex);
2685: }
2686: }
2687:
2688: public char next() {
2689:
2690: currentIndex++;
2691: if (currentIndex >= rangeLimit) {
2692: currentIndex = rangeLimit;
2693: return DONE;
2694: } else {
2695: return base.setIndex(currentIndex);
2696: }
2697: }
2698:
2699: public char previous() {
2700:
2701: currentIndex--;
2702: if (currentIndex < rangeStart) {
2703: currentIndex = rangeStart;
2704: return DONE;
2705: } else {
2706: return base.setIndex(currentIndex);
2707: }
2708: }
2709:
2710: public char setIndex(int i) {
2711:
2712: if (i < rangeStart || i > rangeLimit) {
2713: throw new IllegalArgumentException("Invalid position");
2714: }
2715: currentIndex = i;
2716: return current();
2717: }
2718:
2719: public int getBeginIndex() {
2720: return rangeStart;
2721: }
2722:
2723: public int getEndIndex() {
2724: return rangeLimit;
2725: }
2726:
2727: public int getIndex() {
2728: return currentIndex;
2729: }
2730:
2731: public Object clone() {
2732:
2733: SafeCharIterator copy = null;
2734: try {
2735: copy = (SafeCharIterator) super .clone();
2736: } catch (CloneNotSupportedException e) {
2737: throw new Error("Clone not supported: " + e);
2738: }
2739:
2740: CharacterIterator copyOfBase = (CharacterIterator) base
2741: .clone();
2742: copy.base = copyOfBase;
2743: return copy;
2744: }
2745: }
2746: }
|