0001: /*
0002: *******************************************************************************
0003: * Copyright (C) 2003-2006,
0004: * International Business Machines Corporation and others. All Rights Reserved.
0005: *******************************************************************************
0006: */
0007:
0008: package com.ibm.icu.text;
0009:
0010: import java.util.HashMap;
0011: import java.lang.IllegalArgumentException;
0012: import java.text.ParsePosition;
0013:
0014: import com.ibm.icu.lang.UCharacter;
0015: import com.ibm.icu.impl.Assert;
0016: import com.ibm.icu.impl.Utility;
0017:
0018: /**
0019: * This class is part of the Rule Based Break Iterator rule compiler.
0020: * It scans the rules and builds the parse tree.
0021: * There is no public API here.
0022: * @internal
0023: */
0024: class RBBIRuleScanner {
0025:
0026: private final int kStackSize = 100; // The size of the state stack for
0027:
0028: // rules parsing. Corresponds roughly
0029: // to the depth of parentheses nesting
0030: // that is allowed in the rules.
0031:
0032: static class RBBIRuleChar {
0033: int fChar;
0034: boolean fEscaped;
0035: }
0036:
0037: RBBIRuleBuilder fRB; // The rule builder that we are part of.
0038:
0039: int fScanIndex; // Index of current character being processed
0040: // in the rule input string.
0041: int fNextIndex; // Index of the next character, which
0042: // is the first character not yet scanned.
0043: boolean fQuoteMode; // Scan is in a 'quoted region'
0044: int fLineNum; // Line number in input file.
0045: int fCharNum; // Char position within the line.
0046: int fLastChar; // Previous char, needed to count CR-LF
0047: // as a single line, not two.
0048:
0049: RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
0050: // processing.
0051: String fVarName; // $variableName, valid when we've just
0052: // scanned one.
0053:
0054: short fStack[] = new short[kStackSize]; // State stack, holds state pushes
0055: int fStackPtr; // and pops as specified in the state
0056: // transition rules.
0057:
0058: RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
0059: // during the parse of a rule
0060: int fNodeStackPtr;
0061:
0062: boolean fReverseRule; // True if the rule currently being scanned
0063: // is a reverse direction rule (if it
0064: // starts with a '!')
0065:
0066: boolean fLookAheadRule; // True if the rule includes a '/'
0067: // somewhere within it.
0068:
0069: RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
0070: // $variable symbols.
0071:
0072: HashMap fSetTable = new HashMap(); // UnicocodeSet hash table, holds indexes to
0073: // the sets created while parsing rules.
0074: // The key is the string used for creating
0075: // the set.
0076:
0077: UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
0078: // the scanning of RBBI rules. The
0079: // indicies for these are assigned by the
0080: // perl script that builds the state tables.
0081: // See rbbirpt.h.
0082:
0083: int fRuleNum; // Counts each rule as it is scanned.
0084:
0085: int fOptionStart; // Input index of start of a !!option
0086: // keyword, while being scanned.
0087:
0088: static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
0089: static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
0090: static private String gRuleSet_digit_char_pattern = "[0-9]";
0091: static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
0092: static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
0093: static private String kAny = "any";
0094:
0095: //----------------------------------------------------------------------------------------
0096: //
0097: // Constructor.
0098: //
0099: //----------------------------------------------------------------------------------------
0100: RBBIRuleScanner(RBBIRuleBuilder rb) {
0101: fRB = rb;
0102: fLineNum = 1;
0103:
0104: //
0105: // Set up the constant Unicode Sets.
0106: // Note: These could be made static and shared among
0107: // all instances of RBBIRuleScanners.
0108: fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(
0109: gRuleSet_rule_char_pattern);
0110: fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(
0111: gRuleSet_white_space_pattern);
0112: fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(
0113: gRuleSet_name_char_pattern);
0114: fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(
0115: gRuleSet_name_start_char_pattern);
0116: fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(
0117: gRuleSet_digit_char_pattern);
0118:
0119: fSymbolTable = new RBBISymbolTable(this , rb.fRules);
0120: }
0121:
0122: //----------------------------------------------------------------------------------------
0123: //
0124: // doParseAction Do some action during rule parsing.
0125: // Called by the parse state machine.
0126: // Actions build the parse tree and Unicode Sets,
0127: // and maintain the parse stack for nested expressions.
0128: //
0129: //----------------------------------------------------------------------------------------
0130: boolean doParseActions(int action) {
0131: RBBINode n = null;
0132:
0133: boolean returnVal = true;
0134:
0135: switch (action) {
0136:
0137: case RBBIRuleParseTable.doExprStart:
0138: pushNewNode(RBBINode.opStart);
0139: fRuleNum++;
0140: break;
0141:
0142: case RBBIRuleParseTable.doExprOrOperator: {
0143: fixOpStack(RBBINode.precOpCat);
0144: RBBINode operandNode = fNodeStack[fNodeStackPtr--];
0145: RBBINode orNode = pushNewNode(RBBINode.opOr);
0146: orNode.fLeftChild = operandNode;
0147: operandNode.fParent = orNode;
0148: }
0149: break;
0150:
0151: case RBBIRuleParseTable.doExprCatOperator:
0152: // concatenation operator.
0153: // For the implicit concatenation of adjacent terms in an expression that are
0154: // not separated by any other operator. Action is invoked between the
0155: // actions for the two terms.
0156: {
0157: fixOpStack(RBBINode.precOpCat);
0158: RBBINode operandNode = fNodeStack[fNodeStackPtr--];
0159: RBBINode catNode = pushNewNode(RBBINode.opCat);
0160: catNode.fLeftChild = operandNode;
0161: operandNode.fParent = catNode;
0162: }
0163: break;
0164:
0165: case RBBIRuleParseTable.doLParen:
0166: // Open Paren.
0167: // The openParen node is a dummy operation type with a low precedence,
0168: // which has the affect of ensuring that any real binary op that
0169: // follows within the parens binds more tightly to the operands than
0170: // stuff outside of the parens.
0171: pushNewNode(RBBINode.opLParen);
0172: break;
0173:
0174: case RBBIRuleParseTable.doExprRParen:
0175: fixOpStack(RBBINode.precLParen);
0176: break;
0177:
0178: case RBBIRuleParseTable.doNOP:
0179: break;
0180:
0181: case RBBIRuleParseTable.doStartAssign:
0182: // We've just scanned "$variable = "
0183: // The top of the node stack has the $variable ref node.
0184:
0185: // Save the start position of the RHS text in the StartExpression node
0186: // that precedes the $variableReference node on the stack.
0187: // This will eventually be used when saving the full $variable replacement
0188: // text as a string.
0189: n = fNodeStack[fNodeStackPtr - 1];
0190: n.fFirstPos = fNextIndex; // move past the '='
0191:
0192: // Push a new start-of-expression node; needed to keep parse of the
0193: // RHS expression happy.
0194: pushNewNode(RBBINode.opStart);
0195: break;
0196:
0197: case RBBIRuleParseTable.doEndAssign: {
0198: // We have reached the end of an assignement statement.
0199: // Current scan char is the ';' that terminates the assignment.
0200:
0201: // Terminate expression, leaves expression parse tree rooted in TOS node.
0202: fixOpStack(RBBINode.precStart);
0203:
0204: RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
0205: RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
0206: RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
0207:
0208: // Save original text of right side of assignment, excluding the terminating ';'
0209: // in the root of the node for the right-hand-side expression.
0210: RHSExprNode.fFirstPos = startExprNode.fFirstPos;
0211: RHSExprNode.fLastPos = fScanIndex;
0212: // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, RHSExprNode.fLastPos, RHSExprNode.fText);
0213: RHSExprNode.fText = fRB.fRules.substring(
0214: RHSExprNode.fFirstPos, RHSExprNode.fLastPos);
0215:
0216: // Expression parse tree becomes l. child of the $variable reference node.
0217: varRefNode.fLeftChild = RHSExprNode;
0218: RHSExprNode.fParent = varRefNode;
0219:
0220: // Make a symbol table entry for the $variableRef node.
0221: fSymbolTable.addEntry(varRefNode.fText, varRefNode);
0222:
0223: // Clean up the stack.
0224: fNodeStackPtr -= 3;
0225: break;
0226: }
0227:
0228: case RBBIRuleParseTable.doEndOfRule: {
0229: fixOpStack(RBBINode.precStart); // Terminate expression, leaves expression
0230:
0231: if (fRB.fDebugEnv != null
0232: && fRB.fDebugEnv.indexOf("rtree") >= 0) {
0233: printNodeStack("end of rule");
0234: }
0235: Assert.assrt(fNodeStackPtr == 1);
0236:
0237: // If this rule includes a look-ahead '/', add a endMark node to the
0238: // expression tree.
0239: if (fLookAheadRule) {
0240: RBBINode this Rule = fNodeStack[fNodeStackPtr];
0241: RBBINode endNode = pushNewNode(RBBINode.endMark);
0242: RBBINode catNode = pushNewNode(RBBINode.opCat);
0243: fNodeStackPtr -= 2;
0244: catNode.fLeftChild = this Rule;
0245: catNode.fRightChild = endNode;
0246: fNodeStack[fNodeStackPtr] = catNode;
0247: endNode.fVal = fRuleNum;
0248: endNode.fLookAheadEnd = true;
0249: }
0250:
0251: // All rule expressions are ORed together.
0252: // The ';' that terminates an expression really just functions as a '|' with
0253: // a low operator prededence.
0254: //
0255: // Each of the four sets of rules are collected separately.
0256: // (forward, reverse, safe_forward, safe_reverse)
0257: // OR this rule into the appropriate group of them.
0258: //
0259:
0260: int destRules = (fReverseRule ? fRB.fReverseTree
0261: : fRB.fDefaultTree);
0262:
0263: if (fRB.fTreeRoots[destRules] != null) {
0264: // This is not the first rule encounted.
0265: // OR previous stuff (from *destRules)
0266: // with the current rule expression (on the Node Stack)
0267: // with the resulting OR expression going to *destRules
0268: //
0269: RBBINode this Rule = fNodeStack[fNodeStackPtr];
0270: RBBINode prevRules = fRB.fTreeRoots[destRules];
0271: RBBINode orNode = pushNewNode(RBBINode.opOr);
0272: orNode.fLeftChild = prevRules;
0273: prevRules.fParent = orNode;
0274: orNode.fRightChild = this Rule;
0275: this Rule.fParent = orNode;
0276: fRB.fTreeRoots[destRules] = orNode;
0277: } else {
0278: // This is the first rule encountered (for this direction).
0279: // Just move its parse tree from the stack to *destRules.
0280: fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
0281: }
0282: fReverseRule = false; // in preparation for the next rule.
0283: fLookAheadRule = false;
0284: fNodeStackPtr = 0;
0285: }
0286: break;
0287:
0288: case RBBIRuleParseTable.doRuleError:
0289: error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
0290: returnVal = false;
0291: break;
0292:
0293: case RBBIRuleParseTable.doVariableNameExpectedErr:
0294: error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
0295: break;
0296:
0297: //
0298: // Unary operands + ? *
0299: // These all appear after the operand to which they apply.
0300: // When we hit one, the operand (may be a whole sub expression)
0301: // will be on the top of the stack.
0302: // Unary Operator becomes TOS, with the old TOS as its one child.
0303: case RBBIRuleParseTable.doUnaryOpPlus: {
0304: RBBINode operandNode = fNodeStack[fNodeStackPtr--];
0305: RBBINode plusNode = pushNewNode(RBBINode.opPlus);
0306: plusNode.fLeftChild = operandNode;
0307: operandNode.fParent = plusNode;
0308: }
0309: break;
0310:
0311: case RBBIRuleParseTable.doUnaryOpQuestion: {
0312: RBBINode operandNode = fNodeStack[fNodeStackPtr--];
0313: RBBINode qNode = pushNewNode(RBBINode.opQuestion);
0314: qNode.fLeftChild = operandNode;
0315: operandNode.fParent = qNode;
0316: }
0317: break;
0318:
0319: case RBBIRuleParseTable.doUnaryOpStar: {
0320: RBBINode operandNode = fNodeStack[fNodeStackPtr--];
0321: RBBINode starNode = pushNewNode(RBBINode.opStar);
0322: starNode.fLeftChild = operandNode;
0323: operandNode.fParent = starNode;
0324: }
0325: break;
0326:
0327: case RBBIRuleParseTable.doRuleChar:
0328: // A "Rule Character" is any single character that is a literal part
0329: // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
0330: // These are pretty uncommon in break rules; the terms are more commonly
0331: // sets. To keep things uniform, treat these characters like as
0332: // sets that just happen to contain only one character.
0333: {
0334: n = pushNewNode(RBBINode.setRef);
0335: String s = (new StringBuffer().append((char) fC.fChar))
0336: .toString();
0337: findSetFor(s, n, null);
0338: n.fFirstPos = fScanIndex;
0339: n.fLastPos = fNextIndex;
0340: n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
0341: break;
0342: }
0343:
0344: case RBBIRuleParseTable.doDotAny:
0345: // scanned a ".", meaning match any single character.
0346: {
0347: n = pushNewNode(RBBINode.setRef);
0348: findSetFor(kAny, n, null);
0349: n.fFirstPos = fScanIndex;
0350: n.fLastPos = fNextIndex;
0351: n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
0352: break;
0353: }
0354:
0355: case RBBIRuleParseTable.doSlash:
0356: // Scanned a '/', which identifies a look-ahead break position in a rule.
0357: n = pushNewNode(RBBINode.lookAhead);
0358: n.fVal = fRuleNum;
0359: n.fFirstPos = fScanIndex;
0360: n.fLastPos = fNextIndex;
0361: n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
0362: fLookAheadRule = true;
0363: break;
0364:
0365: case RBBIRuleParseTable.doStartTagValue:
0366: // Scanned a '{', the opening delimiter for a tag value within a rule.
0367: n = pushNewNode(RBBINode.tag);
0368: n.fVal = 0;
0369: n.fFirstPos = fScanIndex;
0370: n.fLastPos = fNextIndex;
0371: break;
0372:
0373: case RBBIRuleParseTable.doTagDigit:
0374: // Just scanned a decimal digit that's part of a tag value
0375: {
0376: n = fNodeStack[fNodeStackPtr];
0377: int v = Character.digit((char) fC.fChar, 10);
0378: n.fVal = n.fVal * 10 + v;
0379: break;
0380: }
0381:
0382: case RBBIRuleParseTable.doTagValue:
0383: n = fNodeStack[fNodeStackPtr];
0384: n.fLastPos = fNextIndex;
0385: n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
0386: break;
0387:
0388: case RBBIRuleParseTable.doTagExpectedError:
0389: error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
0390: returnVal = false;
0391: break;
0392:
0393: case RBBIRuleParseTable.doOptionStart:
0394: // Scanning a !!option. At the start of string.
0395: fOptionStart = fScanIndex;
0396: break;
0397:
0398: case RBBIRuleParseTable.doOptionEnd: {
0399: String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
0400: if (opt.equals("chain")) {
0401: fRB.fChainRules = true;
0402: } else if (opt.equals("LBCMNoChain")) {
0403: fRB.fLBCMNoChain = true;
0404: } else if (opt.equals("forward")) {
0405: fRB.fDefaultTree = fRB.fForwardTree;
0406: } else if (opt.equals("reverse")) {
0407: fRB.fDefaultTree = fRB.fReverseTree;
0408: } else if (opt.equals("safe_forward")) {
0409: fRB.fDefaultTree = fRB.fSafeFwdTree;
0410: } else if (opt.equals("safe_reverse")) {
0411: fRB.fDefaultTree = fRB.fSafeRevTree;
0412: } else if (opt.equals("lookAheadHardBreak")) {
0413: fRB.fLookAheadHardBreak = true;
0414: } else {
0415: error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
0416: }
0417: break;
0418: }
0419:
0420: case RBBIRuleParseTable.doReverseDir:
0421: fReverseRule = true;
0422: break;
0423:
0424: case RBBIRuleParseTable.doStartVariableName:
0425: n = pushNewNode(RBBINode.varRef);
0426: n.fFirstPos = fScanIndex;
0427: break;
0428:
0429: case RBBIRuleParseTable.doEndVariableName:
0430: n = fNodeStack[fNodeStackPtr];
0431: if (n == null || n.fType != RBBINode.varRef) {
0432: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0433: break;
0434: }
0435: n.fLastPos = fScanIndex;
0436: n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
0437: // Look the newly scanned name up in the symbol table
0438: // If there's an entry, set the l. child of the var ref to the replacement expression.
0439: // (We also pass through here when scanning assignments, but no harm is done, other
0440: // than a slight wasted effort that seems hard to avoid. Lookup will be null)
0441: n.fLeftChild = fSymbolTable.lookupNode(n.fText);
0442: break;
0443:
0444: case RBBIRuleParseTable.doCheckVarDef:
0445: n = fNodeStack[fNodeStackPtr];
0446: if (n.fLeftChild == null) {
0447: error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
0448: returnVal = false;
0449: }
0450: break;
0451:
0452: case RBBIRuleParseTable.doExprFinished:
0453: break;
0454:
0455: case RBBIRuleParseTable.doRuleErrorAssignExpr:
0456: error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
0457: returnVal = false;
0458: break;
0459:
0460: case RBBIRuleParseTable.doExit:
0461: returnVal = false;
0462: break;
0463:
0464: case RBBIRuleParseTable.doScanUnicodeSet:
0465: scanSet();
0466: break;
0467:
0468: default:
0469: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0470: returnVal = false;
0471: break;
0472: }
0473: return returnVal;
0474: }
0475:
0476: //----------------------------------------------------------------------------------------
0477: //
0478: // Error Throw and IllegalArgumentException in response to a rule parse error.
0479: //
0480: //----------------------------------------------------------------------------------------
0481: void error(int e) {
0482: String s = "Error " + e + " at line " + fLineNum + " column "
0483: + fCharNum;
0484: IllegalArgumentException ex = new IllegalArgumentException(s);
0485: throw ex;
0486:
0487: }
0488:
0489: //----------------------------------------------------------------------------------------
0490: //
0491: // fixOpStack The parse stack holds partially assembled chunks of the parse tree.
0492: // An entry on the stack may be as small as a single setRef node,
0493: // or as large as the parse tree
0494: // for an entire expression (this will be the one item left on the stack
0495: // when the parsing of an RBBI rule completes.
0496: //
0497: // This function is called when a binary operator is encountered.
0498: // It looks back up the stack for operators that are not yet associated
0499: // with a right operand, and if the precedence of the stacked operator >=
0500: // the precedence of the current operator, binds the operand left,
0501: // to the previously encountered operator.
0502: //
0503: //----------------------------------------------------------------------------------------
0504: void fixOpStack(int p) {
0505: RBBINode n;
0506: // printNodeStack("entering fixOpStack()");
0507: for (;;) {
0508: n = fNodeStack[fNodeStackPtr - 1]; // an operator node
0509: if (n.fPrecedence == 0) {
0510: System.out
0511: .print("RBBIRuleScanner.fixOpStack, bad operator node");
0512: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0513: return;
0514: }
0515:
0516: if (n.fPrecedence < p
0517: || n.fPrecedence <= RBBINode.precLParen) {
0518: // The most recent operand goes with the current operator,
0519: // not with the previously stacked one.
0520: break;
0521: }
0522: // Stack operator is a binary op ( '|' or concatenation)
0523: // TOS operand becomes right child of this operator.
0524: // Resulting subexpression becomes the TOS operand.
0525: n.fRightChild = fNodeStack[fNodeStackPtr];
0526: fNodeStack[fNodeStackPtr].fParent = n;
0527: fNodeStackPtr--;
0528: // printNodeStack("looping in fixOpStack() ");
0529: }
0530:
0531: if (p <= RBBINode.precLParen) {
0532: // Scan is at a right paren or end of expression.
0533: // The scanned item must match the stack, or else there was an error.
0534: // Discard the left paren (or start expr) node from the stack,
0535: // leaving the completed (sub)expression as TOS.
0536: if (n.fPrecedence != p) {
0537: // Right paren encountered matched start of expression node, or
0538: // end of expression matched with a left paren node.
0539: error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
0540: }
0541: fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
0542: fNodeStackPtr--;
0543: // Delete the now-discarded LParen or Start node.
0544: // delete n;
0545: }
0546: // printNodeStack("leaving fixOpStack()");
0547: }
0548:
0549: //----------------------------------------------------------------------------
0550: //
0551: // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
0552: // been encountered. The val Node will be of nodetype uset
0553: // and contain pointers to the actual UnicodeSets.
0554: // The Key is the source string for initializing the set.
0555: //
0556: // The hash table is used to avoid creating duplicate
0557: // unnamed (not $var references) UnicodeSets.
0558: //
0559: //----------------------------------------------------------------------------
0560: static class RBBISetTableEl {
0561: String key;
0562: RBBINode val;
0563: }
0564:
0565: //----------------------------------------------------------------------------------------
0566: //
0567: // findSetFor given a String,
0568: // - find the corresponding Unicode Set (uset node)
0569: // (create one if necessary)
0570: // - Set fLeftChild of the caller's node (should be a setRef node)
0571: // to the uset node
0572: // Maintain a hash table of uset nodes, so the same one is always used
0573: // for the same string.
0574: // If a "to adopt" set is provided and we haven't seen this key before,
0575: // add the provided set to the hash table.
0576: // If the string is one (32 bit) char in length, the set contains
0577: // just one element which is the char in question.
0578: // If the string is "any", return a set containing all chars.
0579: //
0580: //----------------------------------------------------------------------------------------
0581: void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
0582:
0583: RBBISetTableEl el;
0584:
0585: // First check whether we've already cached a set for this string.
0586: // If so, just use the cached set in the new node.
0587: // delete any set provided by the caller, since we own it.
0588: el = (RBBISetTableEl) fSetTable.get(s);
0589: if (el != null) {
0590: node.fLeftChild = el.val;
0591: Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
0592: return;
0593: }
0594:
0595: // Haven't seen this set before.
0596: // If the caller didn't provide us with a prebuilt set,
0597: // create a new UnicodeSet now.
0598: if (setToAdopt == null) {
0599: if (s.equals(kAny)) {
0600: setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
0601: } else {
0602: int c;
0603: c = UTF16.charAt(s, 0);
0604: setToAdopt = new UnicodeSet(c, c);
0605: }
0606: }
0607:
0608: //
0609: // Make a new uset node to refer to this UnicodeSet
0610: // This new uset node becomes the child of the caller's setReference node.
0611: //
0612: RBBINode usetNode = new RBBINode(RBBINode.uset);
0613: usetNode.fInputSet = setToAdopt;
0614: usetNode.fParent = node;
0615: node.fLeftChild = usetNode;
0616: usetNode.fText = s;
0617:
0618: //
0619: // Add the new uset node to the list of all uset nodes.
0620: //
0621: fRB.fUSetNodes.add(usetNode);
0622:
0623: //
0624: // Add the new set to the set hash table.
0625: //
0626: el = new RBBISetTableEl();
0627: el.key = s;
0628: el.val = usetNode;
0629: fSetTable.put(el.key, el);
0630:
0631: return;
0632: }
0633:
0634: //
0635: // Assorted Unicode character constants.
0636: // Numeric because there is no portable way to enter them as literals.
0637: // (Think EBCDIC).
0638: //
0639: static final int chNEL = 0x85; // NEL newline variant
0640: static final int chLS = 0x2028; // Unicode Line Separator
0641:
0642: //----------------------------------------------------------------------------------------
0643: //
0644: // stripRules Return a rules string without unnecessary
0645: // characters.
0646: //
0647: //----------------------------------------------------------------------------------------
0648: static String stripRules(String rules) {
0649: StringBuffer strippedRules = new StringBuffer();
0650: int rulesLength = rules.length();
0651: for (int idx = 0; idx < rulesLength;) {
0652: char ch = rules.charAt(idx++);
0653: if (ch == '#') {
0654: while (idx < rulesLength && ch != '\r' && ch != '\n'
0655: && ch != chNEL) {
0656: ch = rules.charAt(idx++);
0657: }
0658: }
0659: if (!UCharacter.isISOControl(ch)) {
0660: strippedRules.append(ch);
0661: }
0662: }
0663: return strippedRules.toString();
0664: }
0665:
0666: //----------------------------------------------------------------------------------------
0667: //
0668: // nextCharLL Low Level Next Char from rule input source.
0669: // Get a char from the input character iterator,
0670: // keep track of input position for error reporting.
0671: //
0672: //----------------------------------------------------------------------------------------
0673: int nextCharLL() {
0674: int ch;
0675:
0676: if (fNextIndex >= fRB.fRules.length()) {
0677: return -1;
0678: }
0679: ch = UTF16.charAt(fRB.fRules, fNextIndex);
0680: fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex,
0681: 1);
0682:
0683: if (ch == '\r' || ch == chNEL || ch == chLS || ch == '\n'
0684: && fLastChar != '\r') {
0685: // Character is starting a new line. Bump up the line number, and
0686: // reset the column to 0.
0687: fLineNum++;
0688: fCharNum = 0;
0689: if (fQuoteMode) {
0690: error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
0691: fQuoteMode = false;
0692: }
0693: } else {
0694: // Character is not starting a new line. Except in the case of a
0695: // LF following a CR, increment the column position.
0696: if (ch != '\n') {
0697: fCharNum++;
0698: }
0699: }
0700: fLastChar = ch;
0701: return ch;
0702: }
0703:
0704: //---------------------------------------------------------------------------------
0705: //
0706: // nextChar for rules scanning. At this level, we handle stripping
0707: // out comments and processing backslash character escapes.
0708: // The rest of the rules grammar is handled at the next level up.
0709: //
0710: //---------------------------------------------------------------------------------
0711: void nextChar(RBBIRuleChar c) {
0712:
0713: // Unicode Character constants needed for the processing done by nextChar(),
0714: // in hex because literals wont work on EBCDIC machines.
0715:
0716: fScanIndex = fNextIndex;
0717: c.fChar = nextCharLL();
0718: c.fEscaped = false;
0719:
0720: //
0721: // check for '' sequence.
0722: // These are recognized in all contexts, whether in quoted text or not.
0723: //
0724: if (c.fChar == '\'') {
0725: if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
0726: c.fChar = nextCharLL(); // get nextChar officially so character counts
0727: c.fEscaped = true; // stay correct.
0728: } else {
0729: // Single quote, by itself.
0730: // Toggle quoting mode.
0731: // Return either '(' or ')', because quotes cause a grouping of the quoted text.
0732: fQuoteMode = !fQuoteMode;
0733: if (fQuoteMode == true) {
0734: c.fChar = '(';
0735: } else {
0736: c.fChar = ')';
0737: }
0738: c.fEscaped = false; // The paren that we return is not escaped.
0739: return;
0740: }
0741: }
0742:
0743: if (fQuoteMode) {
0744: c.fEscaped = true;
0745: } else {
0746: // We are not in a 'quoted region' of the source.
0747: //
0748: if (c.fChar == '#') {
0749: // Start of a comment. Consume the rest of it.
0750: // The new-line char that terminates the comment is always returned.
0751: // It will be treated as white-space, and serves to break up anything
0752: // that might otherwise incorrectly clump together with a comment in
0753: // the middle (a variable name, for example.)
0754: for (;;) {
0755: c.fChar = nextCharLL();
0756: if (c.fChar == (int) -1
0757: || // EOF
0758: c.fChar == '\r' || c.fChar == '\n'
0759: || c.fChar == chNEL || c.fChar == chLS) {
0760: break;
0761: }
0762: }
0763: }
0764: if (c.fChar == (int) -1) {
0765: return;
0766: }
0767:
0768: //
0769: // check for backslash escaped characters.
0770: // Use String.unescapeAt() to handle them.
0771: //
0772: if (c.fChar == '\\') {
0773: c.fEscaped = true;
0774: int[] unescapeIndex = new int[1];
0775: unescapeIndex[0] = fNextIndex;
0776: c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
0777: if (unescapeIndex[0] == fNextIndex) {
0778: error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
0779: }
0780:
0781: fCharNum += unescapeIndex[0] - fNextIndex;
0782: fNextIndex = unescapeIndex[0];
0783: }
0784: }
0785: // putc(c.fChar, stdout);
0786: }
0787:
0788: //---------------------------------------------------------------------------------
0789: //
0790: // Parse RBBI rules. The state machine for rules parsing is here.
0791: // The state tables are hand-written in the file rbbirpt.txt,
0792: // and converted to the form used here by a perl
0793: // script rbbicst.pl
0794: //
0795: //---------------------------------------------------------------------------------
0796: void parse() {
0797: int state;
0798: RBBIRuleParseTable.RBBIRuleTableElement tableEl;
0799:
0800: state = 1;
0801: nextChar(fC);
0802: //
0803: // Main loop for the rule parsing state machine.
0804: // Runs once per state transition.
0805: // Each time through optionally performs, depending on the state table,
0806: // - an advance to the the next input char
0807: // - an action to be performed.
0808: // - pushing or popping a state to/from the local state return stack.
0809: //
0810: for (;;) {
0811: // Quit if state == 0. This is the normal way to exit the state machine.
0812: //
0813: if (state == 0) {
0814: break;
0815: }
0816:
0817: // Find the state table element that matches the input char from the rule, or the
0818: // class of the input character. Start with the first table row for this
0819: // state, then linearly scan forward until we find a row that matches the
0820: // character. The last row for each state always matches all characters, so
0821: // the search will stop there, if not before.
0822: //
0823: tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
0824: if (fRB.fDebugEnv != null
0825: && fRB.fDebugEnv.indexOf("scan") >= 0) {
0826: System.out.println("char, line, col = (\'"
0827: + (char) fC.fChar + "\', " + fLineNum + ", "
0828: + fCharNum + " state = "
0829: + tableEl.fStateName);
0830: }
0831:
0832: for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
0833: tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
0834: if (fRB.fDebugEnv != null
0835: && fRB.fDebugEnv.indexOf("scan") >= 0) {
0836: System.out.print(".");
0837: }
0838: if (tableEl.fCharClass < 127 && fC.fEscaped == false
0839: && tableEl.fCharClass == fC.fChar) {
0840: // Table row specified an individual character, not a set, and
0841: // the input character is not escaped, and
0842: // the input character matched it.
0843: break;
0844: }
0845: if (tableEl.fCharClass == 255) {
0846: // Table row specified default, match anything character class.
0847: break;
0848: }
0849: if (tableEl.fCharClass == 254 && fC.fEscaped) {
0850: // Table row specified "escaped" and the char was escaped.
0851: break;
0852: }
0853: if (tableEl.fCharClass == 253 && fC.fEscaped
0854: && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
0855: // Table row specified "escaped P" and the char is either 'p' or 'P'.
0856: break;
0857: }
0858: if (tableEl.fCharClass == 252 && fC.fChar == (int) -1) {
0859: // Table row specified eof and we hit eof on the input.
0860: break;
0861: }
0862:
0863: if (tableEl.fCharClass >= 128
0864: && tableEl.fCharClass < 240 && // Table specs a char class &&
0865: fC.fEscaped == false && // char is not escaped &&
0866: fC.fChar != (int) -1) { // char is not EOF
0867: UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
0868: if (uniset.contains(fC.fChar)) {
0869: // Table row specified a character class, or set of characters,
0870: // and the current char matches it.
0871: break;
0872: }
0873: }
0874: }
0875:
0876: if (fRB.fDebugEnv != null
0877: && fRB.fDebugEnv.indexOf("scan") >= 0) {
0878: System.out.println("");
0879: }
0880: //
0881: // We've found the row of the state table that matches the current input
0882: // character from the rules string.
0883: // Perform any action specified by this row in the state table.
0884: if (doParseActions(tableEl.fAction) == false) {
0885: // Break out of the state machine loop if the
0886: // the action signalled some kind of error, or
0887: // the action was to exit, occurs on normal end-of-rules-input.
0888: break;
0889: }
0890:
0891: if (tableEl.fPushState != 0) {
0892: fStackPtr++;
0893: if (fStackPtr >= kStackSize) {
0894: System.out
0895: .println("RBBIRuleScanner.parse() - state stack overflow.");
0896: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0897: }
0898: fStack[fStackPtr] = tableEl.fPushState;
0899: }
0900:
0901: if (tableEl.fNextChar) {
0902: nextChar(fC);
0903: }
0904:
0905: // Get the next state from the table entry, or from the
0906: // state stack if the next state was specified as "pop".
0907: if (tableEl.fNextState != 255) {
0908: state = tableEl.fNextState;
0909: } else {
0910: state = fStack[fStackPtr];
0911: fStackPtr--;
0912: if (fStackPtr < 0) {
0913: System.out
0914: .println("RBBIRuleScanner.parse() - state stack underflow.");
0915: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0916: }
0917: }
0918:
0919: }
0920:
0921: //
0922: // If there were NO user specified reverse rules, set up the equivalent of ".*;"
0923: //
0924: if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
0925: fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
0926: RBBINode operand = pushNewNode(RBBINode.setRef);
0927: findSetFor(kAny, operand, null);
0928: fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
0929: operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
0930: fNodeStackPtr -= 2;
0931: }
0932:
0933: //
0934: // Parsing of the input RBBI rules is complete.
0935: // We now have a parse tree for the rule expressions
0936: // and a list of all UnicodeSets that are referenced.
0937: //
0938: if (fRB.fDebugEnv != null
0939: && fRB.fDebugEnv.indexOf("symbols") >= 0) {
0940: fSymbolTable.rbbiSymtablePrint();
0941: }
0942: if (fRB.fDebugEnv != null
0943: && fRB.fDebugEnv.indexOf("ptree") >= 0) {
0944: System.out.println("Completed Forward Rules Parse Tree...");
0945: fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree]
0946: .printTree(true);
0947: System.out
0948: .println("\nCompleted Reverse Rules Parse Tree...");
0949: fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]
0950: .printTree(true);
0951: System.out
0952: .println("\nCompleted Safe Point Forward Rules Parse Tree...");
0953: if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
0954: System.out.println(" -- null -- ");
0955: } else {
0956: fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree]
0957: .printTree(true);
0958: }
0959: System.out
0960: .println("\nCompleted Safe Point Reverse Rules Parse Tree...");
0961: if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
0962: System.out.println(" -- null -- ");
0963: } else {
0964: fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree]
0965: .printTree(true);
0966: }
0967: }
0968: }
0969:
0970: //---------------------------------------------------------------------------------
0971: //
0972: // printNodeStack for debugging...
0973: //
0974: //---------------------------------------------------------------------------------
0975: void printNodeStack(String title) {
0976: int i;
0977: System.out.println(title + ". Dumping node stack...\n");
0978: for (i = fNodeStackPtr; i > 0; i--) {
0979: fNodeStack[i].printTree(true);
0980: }
0981: }
0982:
0983: //---------------------------------------------------------------------------------
0984: //
0985: // pushNewNode create a new RBBINode of the specified type and push it
0986: // onto the stack of nodes.
0987: //
0988: //---------------------------------------------------------------------------------
0989: RBBINode pushNewNode(int nodeType) {
0990: fNodeStackPtr++;
0991: if (fNodeStackPtr >= kStackSize) {
0992: System.out
0993: .println("RBBIRuleScanner.pushNewNode - stack overflow.");
0994: error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
0995: }
0996: fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
0997: return fNodeStack[fNodeStackPtr];
0998: }
0999:
1000: //---------------------------------------------------------------------------------
1001: //
1002: // scanSet Construct a UnicodeSet from the text at the current scan
1003: // position. Advance the scan position to the first character
1004: // after the set.
1005: //
1006: // A new RBBI setref node referring to the set is pushed onto the node
1007: // stack.
1008: //
1009: // The scan position is normally under the control of the state machine
1010: // that controls rule parsing. UnicodeSets, however, are parsed by
1011: // the UnicodeSet constructor, not by the RBBI rule parser.
1012: //
1013: //---------------------------------------------------------------------------------
1014: void scanSet() {
1015: UnicodeSet uset = null;
1016: int startPos;
1017: ParsePosition pos = new ParsePosition(fScanIndex);
1018: int i;
1019:
1020: startPos = fScanIndex;
1021: try {
1022: uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable,
1023: UnicodeSet.IGNORE_SPACE);
1024: } catch (Exception e) { // TODO: catch fewer exception types.
1025: // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1026: error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1027: }
1028:
1029: // Verify that the set contains at least one code point.
1030: //
1031: if (uset.isEmpty()) {
1032: // This set is empty.
1033: // Make it an error, because it almost certainly is not what the user wanted.
1034: // Also, avoids having to think about corner cases in the tree manipulation code
1035: // that occurs later on.
1036: // TODO: this shouldn't be an error; it does happen.
1037: error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1038: }
1039:
1040: // Advance the RBBI parse postion over the UnicodeSet pattern.
1041: // Don't just set fScanIndex because the line/char positions maintained
1042: // for error reporting would be thrown off.
1043: i = pos.getIndex();
1044: for (;;) {
1045: if (fNextIndex >= i) {
1046: break;
1047: }
1048: nextCharLL();
1049: }
1050:
1051: RBBINode n;
1052:
1053: n = pushNewNode(RBBINode.setRef);
1054: n.fFirstPos = startPos;
1055: n.fLastPos = fNextIndex;
1056: n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1057: // findSetFor() serves several purposes here:
1058: // - Adopts storage for the UnicodeSet, will be responsible for deleting.
1059: // - Mantains collection of all sets in use, needed later for establishing
1060: // character categories for run time engine.
1061: // - Eliminates mulitiple instances of the same set.
1062: // - Creates a new uset node if necessary (if this isn't a duplicate.)
1063: findSetFor(n.fText, n, uset);
1064: }
1065:
1066: }
|