0001: /*
0002: **********************************************************************
0003: * Copyright (c) 2002-2006, International Business Machines
0004: * Corporation and others. All Rights Reserved.
0005: **********************************************************************
0006: */
0007:
0008: package com.ibm.icu.text;
0009:
0010: import java.util.List;
0011: import java.util.ArrayList;
0012: import java.util.Set;
0013: import java.util.HashSet;
0014: import java.util.SortedSet;
0015: import java.util.TreeSet;
0016: import java.util.Iterator;
0017: import java.util.HashMap;
0018: import java.util.Map;
0019: import java.util.Collection;
0020:
0021: import com.ibm.icu.impl.Assert;
0022: import com.ibm.icu.lang.UCharacter;
0023: import com.ibm.icu.lang.UProperty;
0024:
0025: //
0026: // class RBBITableBuilder is part of the RBBI rule compiler.
0027: // It builds the state transition table used by the RBBI runtime
0028: // from the expression syntax tree generated by the rule scanner.
0029: //
0030: // This class is part of the RBBI implementation only.
0031: // There is no user-visible public API here.
0032: //
0033: class RBBITableBuilder {
0034:
0035: //
0036: // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
0037: // one for each state.
0038: static class RBBIStateDescriptor {
0039: boolean fMarked;
0040: int fAccepting;
0041: int fLookAhead;
0042: SortedSet fTagVals;
0043: int fTagsIdx;
0044: Set fPositions; // Set of parse tree positions associated
0045: // with this state. Unordered (it's a set).
0046: // UVector contents are RBBINode *
0047:
0048: int[] fDtran; // Transitions out of this state.
0049:
0050: // indexed by input character
0051: // contents is int index of dest state
0052: // in RBBITableBuilder.fDStates
0053:
0054: RBBIStateDescriptor(int maxInputSymbol) {
0055: fTagVals = new TreeSet();
0056: fPositions = new HashSet();
0057: fDtran = new int[maxInputSymbol + 1]; // fDtran needs to be pre-sized.
0058: // It is indexed by input symbols, and will
0059: // hold the next state number for each
0060: // symbol.
0061: }
0062: }
0063:
0064: private RBBIRuleBuilder fRB;
0065: private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
0066: // for the parse tree to operate on.
0067: // Too bad Java can't do indirection more easily!
0068:
0069: private List fDStates; // D states (Aho's terminology)
0070:
0071: // Index is state number
0072: // Contents are RBBIStateDescriptor pointers.
0073:
0074: //-----------------------------------------------------------------------------
0075: //
0076: // Constructor for RBBITableBuilder.
0077: //
0078: // rootNode is an index into the array of root nodes that is held by
0079: // the overall RBBIRuleBuilder.
0080: //-----------------------------------------------------------------------------
0081: RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
0082: fRootIx = rootNodeIx;
0083: fRB = rb;
0084: fDStates = new ArrayList();
0085: }
0086:
0087: //-----------------------------------------------------------------------------
0088: //
0089: // RBBITableBuilder::build - This is the main function for building the DFA state transtion
0090: // table from the RBBI rules parse tree.
0091: //
0092: //-----------------------------------------------------------------------------
0093: void build() {
0094: // If there were no rules, just return. This situation can easily arise
0095: // for the reverse rules.
0096: if (fRB.fTreeRoots[fRootIx] == null) {
0097: return;
0098: }
0099:
0100: //
0101: // Walk through the tree, replacing any references to $variables with a copy of the
0102: // parse tree for the substition expression.
0103: //
0104: fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx]
0105: .flattenVariables();
0106: if (fRB.fDebugEnv != null
0107: && fRB.fDebugEnv.indexOf("ftree") >= 0) {
0108: System.out
0109: .println("Parse tree after flattening variable references.");
0110: fRB.fTreeRoots[fRootIx].printTree(true);
0111: }
0112:
0113: //
0114: // If the rules contained any references to {bof}
0115: // add a {bof} <cat> <former root of tree> to the
0116: // tree. Means that all matches must start out with the
0117: // {bof} fake character.
0118: //
0119: if (fRB.fSetBuilder.sawBOF()) {
0120: RBBINode bofTop = new RBBINode(RBBINode.opCat);
0121: RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
0122: bofTop.fLeftChild = bofLeaf;
0123: bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
0124: bofLeaf.fParent = bofTop;
0125: bofLeaf.fVal = 2; // Reserved value for {bof}.
0126: fRB.fTreeRoots[fRootIx] = bofTop;
0127: }
0128:
0129: //
0130: // Add a unique right-end marker to the expression.
0131: // Appears as a cat-node, left child being the original tree,
0132: // right child being the end marker.
0133: //
0134: RBBINode cn = new RBBINode(RBBINode.opCat);
0135: cn.fLeftChild = fRB.fTreeRoots[fRootIx];
0136: fRB.fTreeRoots[fRootIx].fParent = cn;
0137: cn.fRightChild = new RBBINode(RBBINode.endMark);
0138: cn.fRightChild.fParent = cn;
0139: fRB.fTreeRoots[fRootIx] = cn;
0140:
0141: //
0142: // Replace all references to UnicodeSets with the tree for the equivalent
0143: // expression.
0144: //
0145: fRB.fTreeRoots[fRootIx].flattenSets();
0146: if (fRB.fDebugEnv != null
0147: && fRB.fDebugEnv.indexOf("stree") >= 0) {
0148: System.out
0149: .println("Parse tree after flattening Unicode Set references.");
0150: fRB.fTreeRoots[fRootIx].printTree(true);
0151: }
0152:
0153: //
0154: // calculate the functions nullable, firstpos, lastpos and followpos on
0155: // nodes in the parse tree.
0156: // See the alogrithm description in Aho.
0157: // Understanding how this works by looking at the code alone will be
0158: // nearly impossible.
0159: //
0160: calcNullable(fRB.fTreeRoots[fRootIx]);
0161: calcFirstPos(fRB.fTreeRoots[fRootIx]);
0162: calcLastPos(fRB.fTreeRoots[fRootIx]);
0163: calcFollowPos(fRB.fTreeRoots[fRootIx]);
0164: if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("pos") >= 0) {
0165: System.out.print("\n");
0166: printPosSets(fRB.fTreeRoots[fRootIx]);
0167: }
0168:
0169: //
0170: // For "chained" rules, modify the followPos sets
0171: //
0172: if (fRB.fChainRules) {
0173: calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
0174: }
0175:
0176: //
0177: // BOF (start of input) test fixup.
0178: //
0179: if (fRB.fSetBuilder.sawBOF()) {
0180: bofFixup();
0181: }
0182:
0183: //
0184: // Build the DFA state transition tables.
0185: //
0186: buildStateTable();
0187: flagAcceptingStates();
0188: flagLookAheadStates();
0189: flagTaggedStates();
0190:
0191: //
0192: // Update the global table of rule status {tag} values
0193: // The rule builder has a global vector of status values that are common
0194: // for all tables. Merge the ones from this table into the global set.
0195: //
0196: mergeRuleStatusVals();
0197:
0198: if (fRB.fDebugEnv != null
0199: && fRB.fDebugEnv.indexOf("states") >= 0) {
0200: printStates();
0201: }
0202: ;
0203: }
0204:
0205: //-----------------------------------------------------------------------------
0206: //
0207: // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
0208: //
0209: //-----------------------------------------------------------------------------
0210: void calcNullable(RBBINode n) {
0211: if (n == null) {
0212: return;
0213: }
0214: if (n.fType == RBBINode.setRef || n.fType == RBBINode.endMark) {
0215: // These are non-empty leaf node types.
0216: n.fNullable = false;
0217: return;
0218: }
0219:
0220: if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
0221: // Lookahead marker node. It's a leaf, so no recursion on children.
0222: // It's nullable because it does not match any literal text from the input stream.
0223: n.fNullable = true;
0224: return;
0225: }
0226:
0227: // The node is not a leaf.
0228: // Calculate nullable on its children.
0229: calcNullable(n.fLeftChild);
0230: calcNullable(n.fRightChild);
0231:
0232: // Apply functions from table 3.40 in Aho
0233: if (n.fType == RBBINode.opOr) {
0234: n.fNullable = n.fLeftChild.fNullable
0235: || n.fRightChild.fNullable;
0236: } else if (n.fType == RBBINode.opCat) {
0237: n.fNullable = n.fLeftChild.fNullable
0238: && n.fRightChild.fNullable;
0239: } else if (n.fType == RBBINode.opStar
0240: || n.fType == RBBINode.opQuestion) {
0241: n.fNullable = true;
0242: } else {
0243: n.fNullable = false;
0244: }
0245: }
0246:
0247: //-----------------------------------------------------------------------------
0248: //
0249: // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
0250: //
0251: //-----------------------------------------------------------------------------
0252: void calcFirstPos(RBBINode n) {
0253: if (n == null) {
0254: return;
0255: }
0256: if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark
0257: || n.fType == RBBINode.lookAhead
0258: || n.fType == RBBINode.tag) {
0259: // These are non-empty leaf node types.
0260: n.fFirstPosSet.add(n);
0261: return;
0262: }
0263:
0264: // The node is not a leaf.
0265: // Calculate firstPos on its children.
0266: calcFirstPos(n.fLeftChild);
0267: calcFirstPos(n.fRightChild);
0268:
0269: // Apply functions from table 3.40 in Aho
0270: if (n.fType == RBBINode.opOr) {
0271: n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0272: n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
0273: } else if (n.fType == RBBINode.opCat) {
0274: n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0275: if (n.fLeftChild.fNullable) {
0276: n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
0277: }
0278: } else if (n.fType == RBBINode.opStar
0279: || n.fType == RBBINode.opQuestion
0280: || n.fType == RBBINode.opPlus) {
0281: n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0282: }
0283: }
0284:
0285: //-----------------------------------------------------------------------------
0286: //
0287: // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
0288: //
0289: //-----------------------------------------------------------------------------
0290: void calcLastPos(RBBINode n) {
0291: if (n == null) {
0292: return;
0293: }
0294: if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark
0295: || n.fType == RBBINode.lookAhead
0296: || n.fType == RBBINode.tag) {
0297: // These are non-empty leaf node types.
0298: n.fLastPosSet.add(n);
0299: return;
0300: }
0301:
0302: // The node is not a leaf.
0303: // Calculate lastPos on its children.
0304: calcLastPos(n.fLeftChild);
0305: calcLastPos(n.fRightChild);
0306:
0307: // Apply functions from table 3.40 in Aho
0308: if (n.fType == RBBINode.opOr) {
0309: n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0310: n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
0311: } else if (n.fType == RBBINode.opCat) {
0312: n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
0313: if (n.fRightChild.fNullable) {
0314: n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0315: }
0316: } else if (n.fType == RBBINode.opStar
0317: || n.fType == RBBINode.opQuestion
0318: || n.fType == RBBINode.opPlus) {
0319: n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0320: }
0321: }
0322:
0323: //-----------------------------------------------------------------------------
0324: //
0325: // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
0326: //
0327: //-----------------------------------------------------------------------------
0328: void calcFollowPos(RBBINode n) {
0329: if (n == null || n.fType == RBBINode.leafChar
0330: || n.fType == RBBINode.endMark) {
0331: return;
0332: }
0333:
0334: calcFollowPos(n.fLeftChild);
0335: calcFollowPos(n.fRightChild);
0336:
0337: // Aho rule #1
0338: if (n.fType == RBBINode.opCat) {
0339: RBBINode i; // is 'i' in Aho's description
0340:
0341: Set LastPosOfLeftChild = n.fLeftChild.fLastPosSet;
0342:
0343: Iterator ix = LastPosOfLeftChild.iterator();
0344: while (ix.hasNext()) {
0345: i = (RBBINode) ix.next();
0346: i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
0347: }
0348: }
0349:
0350: // Aho rule #2
0351: if (n.fType == RBBINode.opStar || n.fType == RBBINode.opPlus) {
0352: RBBINode i; // again, n and i are the names from Aho's description.
0353: Iterator ix = n.fLastPosSet.iterator();
0354: while (ix.hasNext()) {
0355: i = (RBBINode) ix.next();
0356: i.fFollowPos.addAll(n.fFirstPosSet);
0357: }
0358:
0359: }
0360:
0361: }
0362:
0363: //-----------------------------------------------------------------------------
0364: //
0365: // calcChainedFollowPos. Modify the previously calculated followPos sets
0366: // to implement rule chaining. NOT described by Aho
0367: //
0368: //-----------------------------------------------------------------------------
0369: void calcChainedFollowPos(RBBINode tree) {
0370:
0371: List endMarkerNodes = new ArrayList();
0372: List leafNodes = new ArrayList();
0373:
0374: // get a list of all endmarker nodes.
0375: tree.findNodes(endMarkerNodes, RBBINode.endMark);
0376:
0377: // get a list all leaf nodes
0378: tree.findNodes(leafNodes, RBBINode.leafChar);
0379:
0380: // Get all nodes that can be the start a match, which is FirstPosition()
0381: // of the portion of the tree corresponding to user-written rules.
0382: // See the tree description in bofFixup().
0383: RBBINode userRuleRoot = tree;
0384: if (fRB.fSetBuilder.sawBOF()) {
0385: userRuleRoot = tree.fLeftChild.fRightChild;
0386: }
0387: Assert.assrt(userRuleRoot != null);
0388: Set matchStartNodes = userRuleRoot.fFirstPosSet;
0389:
0390: // Iteratate over all leaf nodes,
0391: //
0392: Iterator endNodeIx = leafNodes.iterator();
0393:
0394: while (endNodeIx.hasNext()) {
0395: RBBINode tNode = (RBBINode) endNodeIx.next();
0396: RBBINode endNode = null;
0397:
0398: // Identify leaf nodes that correspond to overall rule match positions.
0399: // These include an endMarkerNode in their followPos sets.
0400: Iterator i = endMarkerNodes.iterator();
0401: while (i.hasNext()) {
0402: RBBINode endMarkerNode = (RBBINode) i.next();
0403: if (tNode.fFollowPos.contains(endMarkerNode)) {
0404: endNode = tNode;
0405: break;
0406: }
0407: }
0408: if (endNode == null) {
0409: // node wasn't an end node. Try again with the next.
0410: continue;
0411: }
0412:
0413: // We've got a node that can end a match.
0414:
0415: // Line Break Specific hack: If this node's val correspond to the $CM char class,
0416: // don't chain from it.
0417: // TODO: Add rule syntax for this behavior, get specifics out of here and
0418: // into the rule file.
0419: if (fRB.fLBCMNoChain) {
0420: int c = this .fRB.fSetBuilder.getFirstChar(endNode.fVal);
0421: if (c != -1) {
0422: // c == -1 occurs with sets containing only the {eof} marker string.
0423: int cLBProp = UCharacter.getIntPropertyValue(c,
0424: UProperty.LINE_BREAK);
0425: if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
0426: continue;
0427: }
0428: }
0429: }
0430:
0431: // Now iterate over the nodes that can start a match, looking for ones
0432: // with the same char class as our ending node.
0433: RBBINode startNode;
0434: Iterator startNodeIx = matchStartNodes.iterator();
0435: while (startNodeIx.hasNext()) {
0436: startNode = (RBBINode) startNodeIx.next();
0437: if (startNode.fType != RBBINode.leafChar) {
0438: continue;
0439: }
0440:
0441: if (endNode.fVal == startNode.fVal) {
0442: // The end val (character class) of one possible match is the
0443: // same as the start of another.
0444:
0445: // Add all nodes from the followPos of the start node to the
0446: // followPos set of the end node, which will have the effect of
0447: // letting matches transition from a match state at endNode
0448: // to the second char of a match starting with startNode.
0449: endNode.fFollowPos.addAll(startNode.fFollowPos);
0450: }
0451: }
0452: }
0453: }
0454:
0455: //-----------------------------------------------------------------------------
0456: //
0457: // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
0458: // Do an swizzle similar to chaining, modifying the followPos set of
0459: // the bofNode to include the followPos nodes from other {bot} nodes
0460: // scattered through the tree.
0461: //
0462: // This function has much in common with calcChainedFollowPos().
0463: //
0464: //-----------------------------------------------------------------------------
0465: void bofFixup() {
0466: //
0467: // The parse tree looks like this ...
0468: // fTree root --. <cat>
0469: // / \
0470: // <cat> <#end node>
0471: // / \
0472: // <bofNode> rest
0473: // of tree
0474: //
0475: // We will be adding things to the followPos set of the <bofNode>
0476: //
0477: RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
0478: Assert.assrt(bofNode.fType == RBBINode.leafChar);
0479: Assert.assrt(bofNode.fVal == 2);
0480:
0481: // Get all nodes that can be the start a match of the user-written rules
0482: // (excluding the fake bofNode)
0483: // We want the nodes that can start a match in the
0484: // part labeled "rest of tree"
0485: //
0486: Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
0487: Iterator startNodeIt = matchStartNodes.iterator();
0488: while (startNodeIt.hasNext()) {
0489: RBBINode startNode = (RBBINode) startNodeIt.next();
0490: if (startNode.fType != RBBINode.leafChar) {
0491: continue;
0492: }
0493:
0494: if (startNode.fVal == bofNode.fVal) {
0495: // We found a leaf node corresponding to a {bof} that was
0496: // explicitly written into a rule.
0497: // Add everything from the followPos set of this node to the
0498: // followPos set of the fake bofNode at the start of the tree.
0499: //
0500: bofNode.fFollowPos.addAll(startNode.fFollowPos);
0501: }
0502: }
0503: }
0504:
0505: //-----------------------------------------------------------------------------
0506: //
0507: // buildStateTable() Determine the set of runtime DFA states and the
0508: // transition tables for these states, by the algorithm
0509: // of fig. 3.44 in Aho.
0510: //
0511: // Most of the comments are quotes of Aho's psuedo-code.
0512: //
0513: //-----------------------------------------------------------------------------
0514: void buildStateTable() {
0515: //
0516: // Add a dummy state 0 - the stop state. Not from Aho.
0517: int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
0518: RBBIStateDescriptor failState = new RBBIStateDescriptor(
0519: lastInputSymbol);
0520: fDStates.add(failState);
0521:
0522: // initially, the only unmarked state in Dstates is firstpos(root),
0523: // where toot is the root of the syntax tree for (r)#;
0524: RBBIStateDescriptor initialState = new RBBIStateDescriptor(
0525: lastInputSymbol);
0526: initialState.fPositions
0527: .addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
0528: fDStates.add(initialState);
0529:
0530: // while there is an unmarked state T in Dstates do begin
0531: for (;;) {
0532: RBBIStateDescriptor T = null;
0533: int tx;
0534: for (tx = 1; tx < fDStates.size(); tx++) {
0535: RBBIStateDescriptor temp = (RBBIStateDescriptor) fDStates
0536: .get(tx);
0537: if (temp.fMarked == false) {
0538: T = temp;
0539: break;
0540: }
0541: }
0542: if (T == null) {
0543: break;
0544: }
0545:
0546: // mark T;
0547: T.fMarked = true;
0548:
0549: // for each input symbol a do begin
0550: int a;
0551: for (a = 1; a <= lastInputSymbol; a++) {
0552: // let U be the set of positions that are in followpos(p)
0553: // for some position p in T
0554: // such that the symbol at position p is a;
0555: Set U = null;
0556: RBBINode p;
0557: Iterator pit = T.fPositions.iterator();
0558: while (pit.hasNext()) {
0559: p = (RBBINode) pit.next();
0560: if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
0561: if (U == null) {
0562: U = new HashSet();
0563: }
0564: U.addAll(p.fFollowPos);
0565: }
0566: }
0567:
0568: // if U is not empty and not in DStates then
0569: int ux = 0;
0570: boolean UinDstates = false;
0571: if (U != null) {
0572: Assert.assrt(U.size() > 0);
0573: int ix;
0574: for (ix = 0; ix < fDStates.size(); ix++) {
0575: RBBIStateDescriptor temp2;
0576: temp2 = (RBBIStateDescriptor) fDStates.get(ix);
0577: if (U.equals(temp2.fPositions)) {
0578: U = temp2.fPositions;
0579: ux = ix;
0580: UinDstates = true;
0581: break;
0582: }
0583: }
0584:
0585: // Add U as an unmarked state to Dstates
0586: if (!UinDstates) {
0587: RBBIStateDescriptor newState = new RBBIStateDescriptor(
0588: lastInputSymbol);
0589: newState.fPositions = U;
0590: fDStates.add(newState);
0591: ux = fDStates.size() - 1;
0592: }
0593:
0594: // Dtran[T, a] := U;
0595: T.fDtran[a] = ux;
0596: }
0597: }
0598: }
0599: }
0600:
0601: //-----------------------------------------------------------------------------
0602: //
0603: // flagAcceptingStates Identify accepting states.
0604: // First get a list of all of the end marker nodes.
0605: // Then, for each state s,
0606: // if s contains one of the end marker nodes in its list of tree positions then
0607: // s is an accepting state.
0608: //
0609: //-----------------------------------------------------------------------------
0610: void flagAcceptingStates() {
0611: List endMarkerNodes = new ArrayList();
0612: RBBINode endMarker;
0613: int i;
0614: int n;
0615:
0616: fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes,
0617: RBBINode.endMark);
0618:
0619: for (i = 0; i < endMarkerNodes.size(); i++) {
0620: endMarker = (RBBINode) endMarkerNodes.get(i);
0621: for (n = 0; n < fDStates.size(); n++) {
0622: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0623: .get(n);
0624: //if (sd.fPositions.indexOf(endMarker) >= 0) {
0625: if (sd.fPositions.contains(endMarker)) {
0626: // Any non-zero value for fAccepting means this is an accepting node.
0627: // The value is what will be returned to the user as the break status.
0628: // If no other value was specified, force it to -1.
0629:
0630: if (sd.fAccepting == 0) {
0631: // State hasn't been marked as accepting yet. Do it now.
0632: sd.fAccepting = endMarker.fVal;
0633: if (sd.fAccepting == 0) {
0634: sd.fAccepting = -1;
0635: }
0636: }
0637: if (sd.fAccepting == -1 && endMarker.fVal != 0) {
0638: // Both lookahead and non-lookahead accepting for this state.
0639: // Favor the look-ahead. Expedient for line break.
0640: // TODO: need a more elegant resolution for conflicting rules.
0641: sd.fAccepting = endMarker.fVal;
0642: }
0643: // implicit else:
0644: // if sd.fAccepting already had a value other than 0 or -1, leave it be.
0645:
0646: // If the end marker node is from a look-ahead rule, set
0647: // the fLookAhead field or this state also.
0648: if (endMarker.fLookAheadEnd) {
0649: // TODO: don't change value if already set?
0650: // TODO: allow for more than one active look-ahead rule in engine.
0651: // Make value here an index to a side array in engine?
0652: sd.fLookAhead = sd.fAccepting;
0653: }
0654: }
0655: }
0656: }
0657: }
0658:
0659: //-----------------------------------------------------------------------------
0660: //
0661: // flagLookAheadStates Very similar to flagAcceptingStates, above.
0662: //
0663: //-----------------------------------------------------------------------------
0664: void flagLookAheadStates() {
0665: List lookAheadNodes = new ArrayList();
0666: RBBINode lookAheadNode;
0667: int i;
0668: int n;
0669:
0670: fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes,
0671: RBBINode.lookAhead);
0672: for (i = 0; i < lookAheadNodes.size(); i++) {
0673: lookAheadNode = (RBBINode) lookAheadNodes.get(i);
0674:
0675: for (n = 0; n < fDStates.size(); n++) {
0676: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0677: .get(n);
0678: if (sd.fPositions.contains(lookAheadNode)) {
0679: sd.fLookAhead = lookAheadNode.fVal;
0680: }
0681: }
0682: }
0683: }
0684:
0685: //-----------------------------------------------------------------------------
0686: //
0687: // flagTaggedStates
0688: //
0689: //-----------------------------------------------------------------------------
0690: void flagTaggedStates() {
0691: List tagNodes = new ArrayList();
0692: RBBINode tagNode;
0693: int i;
0694: int n;
0695:
0696: fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
0697: for (i = 0; i < tagNodes.size(); i++) { // For each tag node t (all of 'em)
0698: tagNode = (RBBINode) tagNodes.get(i);
0699:
0700: for (n = 0; n < fDStates.size(); n++) { // For each state s (row in the state table)
0701: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0702: .get(n);
0703: if (sd.fPositions.contains(tagNode)) { // if s include the tag node t
0704: sd.fTagVals.add(new Integer(tagNode.fVal));
0705: }
0706: }
0707: }
0708: }
0709:
0710: //-----------------------------------------------------------------------------
0711: //
0712: // mergeRuleStatusVals
0713: //
0714: // Allocate positions in the global array of rule status {tag} values
0715: //
0716: // The RBBI runtime uses an array of {sets of status values} that can
0717: // be returned for boundaries. Each accepting state that has non-zero
0718: // status includes an index into this array. The format of the array
0719: // is
0720: // Num of status values in group 1
0721: // status val
0722: // status val
0723: // ...
0724: // Num of status vals in group 2
0725: // status val
0726: // status val
0727: // ...
0728: // etc.
0729: //
0730: //
0731: //-----------------------------------------------------------------------------
0732:
0733: void mergeRuleStatusVals() {
0734: //
0735: // The basic outline of what happens here is this...
0736: //
0737: // for each state in this state table
0738: // if the status tag list for this state is in the global statuses list
0739: // record where and
0740: // continue with the next state
0741: // else
0742: // add the tag list for this state to the global list.
0743: //
0744: int i;
0745: int n;
0746:
0747: // Pre-load a single tag of {0} into the table.
0748: // We will need this as a default, for rule sets with no explicit tagging,
0749: // or with explicit tagging of {0}.
0750: if (fRB.fRuleStatusVals.size() == 0) {
0751: fRB.fRuleStatusVals.add(new Integer(1)); // Num of statuses in group
0752: fRB.fRuleStatusVals.add(new Integer(0)); // and our single status of zero
0753:
0754: SortedSet s0 = new TreeSet();
0755: Integer izero = new Integer(0);
0756: fRB.fStatusSets.put(s0, izero);
0757: SortedSet s1 = new TreeSet();
0758: s1.add(izero);
0759: fRB.fStatusSets.put(s0, izero);
0760: }
0761:
0762: // For each state, check whether the state's status tag values are
0763: // already entered into the status values array, and add them if not.
0764: for (n = 0; n < fDStates.size(); n++) {
0765: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0766: .get(n);
0767: Set statusVals = sd.fTagVals;
0768: Integer arrayIndexI = (Integer) fRB.fStatusSets
0769: .get(statusVals);
0770: if (arrayIndexI == null) {
0771: // This is the first encounter of this set of status values.
0772: // Add them to the statusSets map, This map associates
0773: // the set of status values with an index in the runtime status
0774: // values array.
0775: arrayIndexI = new Integer(fRB.fRuleStatusVals.size());
0776: fRB.fStatusSets.put(statusVals, arrayIndexI);
0777:
0778: // Add the new set of status values to the vector of values that
0779: // will eventually become the array used by the runtime engine.
0780: fRB.fRuleStatusVals.add(new Integer(statusVals.size()));
0781: Iterator it = statusVals.iterator();
0782: while (it.hasNext()) {
0783: fRB.fRuleStatusVals.add(it.next());
0784: }
0785:
0786: }
0787:
0788: // Save the runtime array index back into the state descriptor.
0789: sd.fTagsIdx = arrayIndexI.intValue();
0790: }
0791: }
0792:
0793: //-----------------------------------------------------------------------------
0794: //
0795: // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
0796: // for each node in the tree.
0797: //
0798: //-----------------------------------------------------------------------------
0799:
0800: void printPosSets(RBBINode n) {
0801: if (n == null) {
0802: return;
0803: }
0804: RBBINode.printNode(n);
0805: System.out.print(" Nullable: " + n.fNullable);
0806:
0807: System.out.print(" firstpos: ");
0808: printSet(n.fFirstPosSet);
0809:
0810: System.out.print(" lastpos: ");
0811: printSet(n.fLastPosSet);
0812:
0813: System.out.print(" followpos: ");
0814: printSet(n.fFollowPos);
0815:
0816: printPosSets(n.fLeftChild);
0817: printPosSets(n.fRightChild);
0818: }
0819:
0820: //-----------------------------------------------------------------------------
0821: //
0822: // getTableSize() Calculate the size in bytes of the runtime form of this
0823: // state transition table.
0824: //
0825: // Note: Refer to common/rbbidata.h from ICU4C for the declarations
0826: // of the structures being matched by this calculation.
0827: //
0828: //-----------------------------------------------------------------------------
0829: int getTableSize() {
0830: int size = 0;
0831: int numRows;
0832: int numCols;
0833: int rowSize;
0834:
0835: if (fRB.fTreeRoots[fRootIx] == null) {
0836: return 0;
0837: }
0838:
0839: size = /*sizeof(RBBIStateTable) - 4 */16; // The header, with no rows to the table.
0840:
0841: numRows = fDStates.size();
0842: numCols = fRB.fSetBuilder.getNumCharCategories();
0843:
0844: // Note The declaration of RBBIStateTableRow is for a table of two columns.
0845: // Therefore we subtract two from numCols when determining
0846: // how much storage to add to a row for the total columns.
0847: // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
0848: rowSize = 8 + 2 * numCols;
0849: size += numRows * rowSize;
0850: while (size % 8 > 0) { // Size must be multiple of 8 bytes in size.
0851: size++;
0852: }
0853:
0854: return size;
0855: }
0856:
0857: //-----------------------------------------------------------------------------
0858: //
0859: // exportTable() export the state transition table in the ICU4C format.
0860: //
0861: // Most of the table is 16 bit shorts. This function exports
0862: // the whole thing as an array of shorts.
0863: //
0864: // The size of the array must be rounded up to a multiple of
0865: // 8 bytes.
0866: //
0867: // See struct RBBIStateTable in ICU4C, common/rbbidata.h
0868: //
0869: //-----------------------------------------------------------------------------
0870:
0871: short[] exportTable() {
0872: int state;
0873: int col;
0874:
0875: if (fRB.fTreeRoots[fRootIx] == null) {
0876: return new short[0];
0877: }
0878:
0879: Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff
0880: && fDStates.size() < 0x7fff);
0881:
0882: int numStates = fDStates.size();
0883:
0884: // Size of table size in shorts.
0885: // the "4" is the size of struct RBBIStateTableRow, the row header part only.
0886: int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
0887: int tableSize = getTableSize() / 2;
0888:
0889: short[] table = new short[tableSize];
0890:
0891: //
0892: // Fill in the header fields.
0893: // Annoying because they really want to be ints, not shorts.
0894: //
0895: // RBBIStateTable.fNumStates
0896: table[RBBIDataWrapper.NUMSTATES] = (short) (numStates >>> 16);
0897: table[RBBIDataWrapper.NUMSTATES + 1] = (short) (numStates & 0x0000ffff);
0898:
0899: // RBBIStateTable.fRowLen
0900: table[RBBIDataWrapper.ROWLEN] = (short) (rowLen >>> 16);
0901: table[RBBIDataWrapper.ROWLEN + 1] = (short) (rowLen & 0x0000ffff);
0902:
0903: // RBBIStateTable.fFlags
0904: int flags = 0;
0905: if (fRB.fLookAheadHardBreak) {
0906: flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
0907: }
0908: if (fRB.fSetBuilder.sawBOF()) {
0909: flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
0910: }
0911: table[RBBIDataWrapper.FLAGS] = (short) (flags >>> 16);
0912: table[RBBIDataWrapper.FLAGS + 1] = (short) (flags & 0x0000ffff);
0913:
0914: int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
0915: for (state = 0; state < numStates; state++) {
0916: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0917: .get(state);
0918: int row = 8 + state * rowLen;
0919: Assert.assrt(-32768 < sd.fAccepting
0920: && sd.fAccepting <= 32767);
0921: Assert.assrt(-32768 < sd.fLookAhead
0922: && sd.fLookAhead <= 32767);
0923: table[row + RBBIDataWrapper.ACCEPTING] = (short) sd.fAccepting;
0924: table[row + RBBIDataWrapper.LOOKAHEAD] = (short) sd.fLookAhead;
0925: table[row + RBBIDataWrapper.TAGIDX] = (short) sd.fTagsIdx;
0926: for (col = 0; col < numCharCategories; col++) {
0927: table[row + RBBIDataWrapper.NEXTSTATES + col] = (short) sd.fDtran[col];
0928: }
0929: }
0930: return table;
0931: }
0932:
0933: //-----------------------------------------------------------------------------
0934: //
0935: // printSet Debug function. Print the contents of a set of Nodes
0936: //
0937: //-----------------------------------------------------------------------------
0938:
0939: void printSet(Collection s) {
0940: Iterator it = s.iterator();
0941: while (it.hasNext()) {
0942: RBBINode n = (RBBINode) it.next();
0943: RBBINode.printInt(n.fSerialNum, 8);
0944: }
0945: System.out.println();
0946: }
0947:
0948: //-----------------------------------------------------------------------------
0949: //
0950: // printStates Debug Function. Dump the fully constructed state transition table.
0951: //
0952: //-----------------------------------------------------------------------------
0953:
0954: void printStates() {
0955: int c; // input "character"
0956: int n; // state number
0957:
0958: System.out
0959: .print("state | i n p u t s y m b o l s \n");
0960: System.out.print(" | Acc LA Tag");
0961: for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0962: RBBINode.printInt((int) c, 3);
0963: }
0964: System.out.print("\n");
0965: System.out.print(" |---------------");
0966: for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0967: System.out.print("---");
0968: }
0969: System.out.print("\n");
0970:
0971: for (n = 0; n < fDStates.size(); n++) {
0972: RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0973: .get(n);
0974: RBBINode.printInt(n, 5);
0975: System.out.print(" | ");
0976:
0977: RBBINode.printInt(sd.fAccepting, 3);
0978: RBBINode.printInt(sd.fLookAhead, 4);
0979: RBBINode.printInt(sd.fTagsIdx, 6);
0980: System.out.print(" ");
0981: for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0982: RBBINode.printInt(sd.fDtran[c], 3);
0983: }
0984: System.out.print("\n");
0985: }
0986: System.out.print("\n\n");
0987: }
0988:
0989: //-----------------------------------------------------------------------------
0990: //
0991: // printRuleStatusTable Debug Function. Dump the common rule status table
0992: //
0993: //-----------------------------------------------------------------------------
0994:
0995: void printRuleStatusTable() {
0996: int this Record = 0;
0997: int nextRecord = 0;
0998: int i;
0999: List tbl = fRB.fRuleStatusVals;
1000:
1001: System.out.print("index | tags \n");
1002: System.out.print("-------------------\n");
1003:
1004: while (nextRecord < tbl.size()) {
1005: this Record = nextRecord;
1006: nextRecord = this Record
1007: + ((Integer) tbl.get(this Record)).intValue() + 1;
1008: RBBINode.printInt(this Record, 7);
1009: for (i = this Record + 1; i < nextRecord; i++) {
1010: int val = ((Integer) tbl.get(i)).intValue();
1011: RBBINode.printInt(val, 7);
1012: }
1013: System.out.print("\n");
1014: }
1015: System.out.print("\n\n");
1016: }
1017:
1018: }
|