Source Code Cross Referenced for RBBITableBuilder.java in  » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         **********************************************************************
0003:         *   Copyright (c) 2002-2006, International Business Machines
0004:         *   Corporation and others.  All Rights Reserved.
0005:         **********************************************************************
0006:         */
0007:
0008:        package com.ibm.icu.text;
0009:
0010:        import java.util.List;
0011:        import java.util.ArrayList;
0012:        import java.util.Set;
0013:        import java.util.HashSet;
0014:        import java.util.SortedSet;
0015:        import java.util.TreeSet;
0016:        import java.util.Iterator;
0017:        import java.util.HashMap;
0018:        import java.util.Map;
0019:        import java.util.Collection;
0020:
0021:        import com.ibm.icu.impl.Assert;
0022:        import com.ibm.icu.lang.UCharacter;
0023:        import com.ibm.icu.lang.UProperty;
0024:
0025:        //
0026:        //  class RBBITableBuilder is part of the RBBI rule compiler.
0027:        //                         It builds the state transition table used by the RBBI runtime
0028:        //                         from the expression syntax tree generated by the rule scanner.
0029:        //
0030:        //                         This class is part of the RBBI implementation only.
0031:        //                         There is no user-visible public API here.
0032:        //
0033:        class RBBITableBuilder {
0034:
0035:            //
0036:            //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
0037:            //                        one for each state.
0038:            static class RBBIStateDescriptor {
0039:                boolean fMarked;
0040:                int fAccepting;
0041:                int fLookAhead;
0042:                SortedSet fTagVals;
0043:                int fTagsIdx;
0044:                Set fPositions; // Set of parse tree positions associated
0045:                //   with this state.  Unordered (it's a set).
0046:                //   UVector contents are RBBINode *
0047:
0048:                int[] fDtran; // Transitions out of this state.
0049:
0050:                //   indexed by input character
0051:                //   contents is int index of dest state
0052:                //   in RBBITableBuilder.fDStates
0053:
0054:                RBBIStateDescriptor(int maxInputSymbol) {
0055:                    fTagVals = new TreeSet();
0056:                    fPositions = new HashSet();
0057:                    fDtran = new int[maxInputSymbol + 1]; // fDtran needs to be pre-sized.
0058:                    //   It is indexed by input symbols, and will
0059:                    //   hold  the next state number for each
0060:                    //   symbol.
0061:                }
0062:            }
0063:
0064:            private RBBIRuleBuilder fRB;
0065:            private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
0066:            //   for the parse tree to operate on.
0067:            //   Too bad Java can't do indirection more easily!
0068:
0069:            private List fDStates; //  D states (Aho's terminology)
0070:
0071:            //  Index is state number
0072:            //  Contents are RBBIStateDescriptor pointers.
0073:
0074:            //-----------------------------------------------------------------------------
0075:            //
0076:            //  Constructor    for RBBITableBuilder.
0077:            //
0078:            //                 rootNode is an index into the array of root nodes that is held by
0079:            //                          the overall RBBIRuleBuilder.
0080:            //-----------------------------------------------------------------------------
0081:            RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
0082:                fRootIx = rootNodeIx;
0083:                fRB = rb;
0084:                fDStates = new ArrayList();
0085:            }
0086:
0087:            //-----------------------------------------------------------------------------
0088:            //
0089:            //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
0090:            //                               table from the RBBI rules parse tree.
0091:            //
0092:            //-----------------------------------------------------------------------------
0093:            void build() {
0094:                // If there were no rules, just return.  This situation can easily arise
0095:                //   for the reverse rules.
0096:                if (fRB.fTreeRoots[fRootIx] == null) {
0097:                    return;
0098:                }
0099:
0100:                //
0101:                // Walk through the tree, replacing any references to $variables with a copy of the
0102:                //   parse tree for the substition expression.
0103:                //
0104:                fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx]
0105:                        .flattenVariables();
0106:                if (fRB.fDebugEnv != null
0107:                        && fRB.fDebugEnv.indexOf("ftree") >= 0) {
0108:                    System.out
0109:                            .println("Parse tree after flattening variable references.");
0110:                    fRB.fTreeRoots[fRootIx].printTree(true);
0111:                }
0112:
0113:                //
0114:                // If the rules contained any references to {bof} 
0115:                //   add a {bof} <cat> <former root of tree> to the
0116:                //   tree.  Means that all matches must start out with the 
0117:                //   {bof} fake character.
0118:                // 
0119:                if (fRB.fSetBuilder.sawBOF()) {
0120:                    RBBINode bofTop = new RBBINode(RBBINode.opCat);
0121:                    RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
0122:                    bofTop.fLeftChild = bofLeaf;
0123:                    bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
0124:                    bofLeaf.fParent = bofTop;
0125:                    bofLeaf.fVal = 2; // Reserved value for {bof}.
0126:                    fRB.fTreeRoots[fRootIx] = bofTop;
0127:                }
0128:
0129:                //
0130:                // Add a unique right-end marker to the expression.
0131:                //   Appears as a cat-node, left child being the original tree,
0132:                //   right child being the end marker.
0133:                //
0134:                RBBINode cn = new RBBINode(RBBINode.opCat);
0135:                cn.fLeftChild = fRB.fTreeRoots[fRootIx];
0136:                fRB.fTreeRoots[fRootIx].fParent = cn;
0137:                cn.fRightChild = new RBBINode(RBBINode.endMark);
0138:                cn.fRightChild.fParent = cn;
0139:                fRB.fTreeRoots[fRootIx] = cn;
0140:
0141:                //
0142:                //  Replace all references to UnicodeSets with the tree for the equivalent
0143:                //      expression.
0144:                //
0145:                fRB.fTreeRoots[fRootIx].flattenSets();
0146:                if (fRB.fDebugEnv != null
0147:                        && fRB.fDebugEnv.indexOf("stree") >= 0) {
0148:                    System.out
0149:                            .println("Parse tree after flattening Unicode Set references.");
0150:                    fRB.fTreeRoots[fRootIx].printTree(true);
0151:                }
0152:
0153:                //
0154:                // calculate the functions nullable, firstpos, lastpos and followpos on
0155:                // nodes in the parse tree.
0156:                //    See the alogrithm description in Aho.
0157:                //    Understanding how this works by looking at the code alone will be
0158:                //       nearly impossible.
0159:                //
0160:                calcNullable(fRB.fTreeRoots[fRootIx]);
0161:                calcFirstPos(fRB.fTreeRoots[fRootIx]);
0162:                calcLastPos(fRB.fTreeRoots[fRootIx]);
0163:                calcFollowPos(fRB.fTreeRoots[fRootIx]);
0164:                if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("pos") >= 0) {
0165:                    System.out.print("\n");
0166:                    printPosSets(fRB.fTreeRoots[fRootIx]);
0167:                }
0168:
0169:                //
0170:                //  For "chained" rules, modify the followPos sets
0171:                //
0172:                if (fRB.fChainRules) {
0173:                    calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
0174:                }
0175:
0176:                //
0177:                //  BOF (start of input) test fixup.
0178:                //
0179:                if (fRB.fSetBuilder.sawBOF()) {
0180:                    bofFixup();
0181:                }
0182:
0183:                //
0184:                // Build the DFA state transition tables.
0185:                //
0186:                buildStateTable();
0187:                flagAcceptingStates();
0188:                flagLookAheadStates();
0189:                flagTaggedStates();
0190:
0191:                //
0192:                // Update the global table of rule status {tag} values
0193:                // The rule builder has a global vector of status values that are common
0194:                //    for all tables.  Merge the ones from this table into the global set.
0195:                //
0196:                mergeRuleStatusVals();
0197:
0198:                if (fRB.fDebugEnv != null
0199:                        && fRB.fDebugEnv.indexOf("states") >= 0) {
0200:                    printStates();
0201:                }
0202:                ;
0203:            }
0204:
0205:            //-----------------------------------------------------------------------------
0206:            //
0207:            //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
0208:            //
0209:            //-----------------------------------------------------------------------------
0210:            void calcNullable(RBBINode n) {
0211:                if (n == null) {
0212:                    return;
0213:                }
0214:                if (n.fType == RBBINode.setRef || n.fType == RBBINode.endMark) {
0215:                    // These are non-empty leaf node types.
0216:                    n.fNullable = false;
0217:                    return;
0218:                }
0219:
0220:                if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
0221:                    // Lookahead marker node.  It's a leaf, so no recursion on children.
0222:                    // It's nullable because it does not match any literal text from the input stream.
0223:                    n.fNullable = true;
0224:                    return;
0225:                }
0226:
0227:                // The node is not a leaf.
0228:                //  Calculate nullable on its children.
0229:                calcNullable(n.fLeftChild);
0230:                calcNullable(n.fRightChild);
0231:
0232:                // Apply functions from table 3.40 in Aho
0233:                if (n.fType == RBBINode.opOr) {
0234:                    n.fNullable = n.fLeftChild.fNullable
0235:                            || n.fRightChild.fNullable;
0236:                } else if (n.fType == RBBINode.opCat) {
0237:                    n.fNullable = n.fLeftChild.fNullable
0238:                            && n.fRightChild.fNullable;
0239:                } else if (n.fType == RBBINode.opStar
0240:                        || n.fType == RBBINode.opQuestion) {
0241:                    n.fNullable = true;
0242:                } else {
0243:                    n.fNullable = false;
0244:                }
0245:            }
0246:
0247:            //-----------------------------------------------------------------------------
0248:            //
0249:            //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
0250:            //
0251:            //-----------------------------------------------------------------------------
0252:            void calcFirstPos(RBBINode n) {
0253:                if (n == null) {
0254:                    return;
0255:                }
0256:                if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark
0257:                        || n.fType == RBBINode.lookAhead
0258:                        || n.fType == RBBINode.tag) {
0259:                    // These are non-empty leaf node types.
0260:                    n.fFirstPosSet.add(n);
0261:                    return;
0262:                }
0263:
0264:                // The node is not a leaf.
0265:                //  Calculate firstPos on its children.
0266:                calcFirstPos(n.fLeftChild);
0267:                calcFirstPos(n.fRightChild);
0268:
0269:                // Apply functions from table 3.40 in Aho
0270:                if (n.fType == RBBINode.opOr) {
0271:                    n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0272:                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
0273:                } else if (n.fType == RBBINode.opCat) {
0274:                    n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0275:                    if (n.fLeftChild.fNullable) {
0276:                        n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
0277:                    }
0278:                } else if (n.fType == RBBINode.opStar
0279:                        || n.fType == RBBINode.opQuestion
0280:                        || n.fType == RBBINode.opPlus) {
0281:                    n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
0282:                }
0283:            }
0284:
0285:            //-----------------------------------------------------------------------------
0286:            //
0287:            //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
0288:            //
0289:            //-----------------------------------------------------------------------------
0290:            void calcLastPos(RBBINode n) {
0291:                if (n == null) {
0292:                    return;
0293:                }
0294:                if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark
0295:                        || n.fType == RBBINode.lookAhead
0296:                        || n.fType == RBBINode.tag) {
0297:                    // These are non-empty leaf node types.
0298:                    n.fLastPosSet.add(n);
0299:                    return;
0300:                }
0301:
0302:                // The node is not a leaf.
0303:                //  Calculate lastPos on its children.
0304:                calcLastPos(n.fLeftChild);
0305:                calcLastPos(n.fRightChild);
0306:
0307:                // Apply functions from table 3.40 in Aho
0308:                if (n.fType == RBBINode.opOr) {
0309:                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0310:                    n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
0311:                } else if (n.fType == RBBINode.opCat) {
0312:                    n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
0313:                    if (n.fRightChild.fNullable) {
0314:                        n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0315:                    }
0316:                } else if (n.fType == RBBINode.opStar
0317:                        || n.fType == RBBINode.opQuestion
0318:                        || n.fType == RBBINode.opPlus) {
0319:                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
0320:                }
0321:            }
0322:
0323:            //-----------------------------------------------------------------------------
0324:            //
0325:            //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
0326:            //
0327:            //-----------------------------------------------------------------------------
0328:            void calcFollowPos(RBBINode n) {
0329:                if (n == null || n.fType == RBBINode.leafChar
0330:                        || n.fType == RBBINode.endMark) {
0331:                    return;
0332:                }
0333:
0334:                calcFollowPos(n.fLeftChild);
0335:                calcFollowPos(n.fRightChild);
0336:
0337:                // Aho rule #1
0338:                if (n.fType == RBBINode.opCat) {
0339:                    RBBINode i; // is 'i' in Aho's description
0340:
0341:                    Set LastPosOfLeftChild = n.fLeftChild.fLastPosSet;
0342:
0343:                    Iterator ix = LastPosOfLeftChild.iterator();
0344:                    while (ix.hasNext()) {
0345:                        i = (RBBINode) ix.next();
0346:                        i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
0347:                    }
0348:                }
0349:
0350:                // Aho rule #2
0351:                if (n.fType == RBBINode.opStar || n.fType == RBBINode.opPlus) {
0352:                    RBBINode i; // again, n and i are the names from Aho's description.
0353:                    Iterator ix = n.fLastPosSet.iterator();
0354:                    while (ix.hasNext()) {
0355:                        i = (RBBINode) ix.next();
0356:                        i.fFollowPos.addAll(n.fFirstPosSet);
0357:                    }
0358:
0359:                }
0360:
0361:            }
0362:
0363:            //-----------------------------------------------------------------------------
0364:            //
0365:            //   calcChainedFollowPos.    Modify the previously calculated followPos sets
0366:            //                            to implement rule chaining.  NOT described by Aho
0367:            //
0368:            //-----------------------------------------------------------------------------
0369:            void calcChainedFollowPos(RBBINode tree) {
0370:
0371:                List endMarkerNodes = new ArrayList();
0372:                List leafNodes = new ArrayList();
0373:
0374:                // get a list of all endmarker nodes.
0375:                tree.findNodes(endMarkerNodes, RBBINode.endMark);
0376:
0377:                // get a list all leaf nodes
0378:                tree.findNodes(leafNodes, RBBINode.leafChar);
0379:
0380:                // Get all nodes that can be the start a match, which is FirstPosition()
0381:                // of the portion of the tree corresponding to user-written rules.
0382:                // See the tree description in bofFixup().
0383:                RBBINode userRuleRoot = tree;
0384:                if (fRB.fSetBuilder.sawBOF()) {
0385:                    userRuleRoot = tree.fLeftChild.fRightChild;
0386:                }
0387:                Assert.assrt(userRuleRoot != null);
0388:                Set matchStartNodes = userRuleRoot.fFirstPosSet;
0389:
0390:                // Iteratate over all leaf nodes,
0391:                //
0392:                Iterator endNodeIx = leafNodes.iterator();
0393:
0394:                while (endNodeIx.hasNext()) {
0395:                    RBBINode tNode = (RBBINode) endNodeIx.next();
0396:                    RBBINode endNode = null;
0397:
0398:                    // Identify leaf nodes that correspond to overall rule match positions.
0399:                    //   These include an endMarkerNode in their followPos sets.
0400:                    Iterator i = endMarkerNodes.iterator();
0401:                    while (i.hasNext()) {
0402:                        RBBINode endMarkerNode = (RBBINode) i.next();
0403:                        if (tNode.fFollowPos.contains(endMarkerNode)) {
0404:                            endNode = tNode;
0405:                            break;
0406:                        }
0407:                    }
0408:                    if (endNode == null) {
0409:                        // node wasn't an end node.  Try again with the next.
0410:                        continue;
0411:                    }
0412:
0413:                    // We've got a node that can end a match.
0414:
0415:                    // Line Break Specific hack:  If this node's val correspond to the $CM char class,
0416:                    //                            don't chain from it.
0417:                    // TODO:  Add rule syntax for this behavior, get specifics out of here and
0418:                    //        into the rule file.
0419:                    if (fRB.fLBCMNoChain) {
0420:                        int c = this .fRB.fSetBuilder.getFirstChar(endNode.fVal);
0421:                        if (c != -1) {
0422:                            // c == -1 occurs with sets containing only the {eof} marker string.
0423:                            int cLBProp = UCharacter.getIntPropertyValue(c,
0424:                                    UProperty.LINE_BREAK);
0425:                            if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
0426:                                continue;
0427:                            }
0428:                        }
0429:                    }
0430:
0431:                    // Now iterate over the nodes that can start a match, looking for ones
0432:                    //   with the same char class as our ending node.
0433:                    RBBINode startNode;
0434:                    Iterator startNodeIx = matchStartNodes.iterator();
0435:                    while (startNodeIx.hasNext()) {
0436:                        startNode = (RBBINode) startNodeIx.next();
0437:                        if (startNode.fType != RBBINode.leafChar) {
0438:                            continue;
0439:                        }
0440:
0441:                        if (endNode.fVal == startNode.fVal) {
0442:                            // The end val (character class) of one possible match is the
0443:                            //   same as the start of another.
0444:
0445:                            // Add all nodes from the followPos of the start node to the
0446:                            //  followPos set of the end node, which will have the effect of
0447:                            //  letting matches transition from a match state at endNode
0448:                            //  to the second char of a match starting with startNode.
0449:                            endNode.fFollowPos.addAll(startNode.fFollowPos);
0450:                        }
0451:                    }
0452:                }
0453:            }
0454:
0455:            //-----------------------------------------------------------------------------
0456:            //
0457:            //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
0458:            //                Do an swizzle similar to chaining, modifying the followPos set of
0459:            //                the bofNode to include the followPos nodes from other {bot} nodes
0460:            //                scattered through the tree.
0461:            //
0462:            //                This function has much in common with calcChainedFollowPos().
0463:            //
0464:            //-----------------------------------------------------------------------------
0465:            void bofFixup() {
0466:                //
0467:                //   The parse tree looks like this ...
0468:                //         fTree root  --.       <cat>
0469:                //                               /     \   
0470:                //                            <cat>   <#end node>
0471:                //                           /     \   
0472:                //                     <bofNode>   rest
0473:                //                               of tree
0474:                //
0475:                //    We will be adding things to the followPos set of the <bofNode>
0476:                //
0477:                RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
0478:                Assert.assrt(bofNode.fType == RBBINode.leafChar);
0479:                Assert.assrt(bofNode.fVal == 2);
0480:
0481:                // Get all nodes that can be the start a match of the user-written rules
0482:                //  (excluding the fake bofNode)
0483:                //  We want the nodes that can start a match in the
0484:                //     part labeled "rest of tree"
0485:                // 
0486:                Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
0487:                Iterator startNodeIt = matchStartNodes.iterator();
0488:                while (startNodeIt.hasNext()) {
0489:                    RBBINode startNode = (RBBINode) startNodeIt.next();
0490:                    if (startNode.fType != RBBINode.leafChar) {
0491:                        continue;
0492:                    }
0493:
0494:                    if (startNode.fVal == bofNode.fVal) {
0495:                        //  We found a leaf node corresponding to a {bof} that was
0496:                        //    explicitly written into a rule.
0497:                        //  Add everything from the followPos set of this node to the
0498:                        //    followPos set of the fake bofNode at the start of the tree.
0499:                        //  
0500:                        bofNode.fFollowPos.addAll(startNode.fFollowPos);
0501:                    }
0502:                }
0503:            }
0504:
0505:            //-----------------------------------------------------------------------------
0506:            //
0507:            //   buildStateTable()    Determine the set of runtime DFA states and the
0508:            //                        transition tables for these states, by the algorithm
0509:            //                        of fig. 3.44 in Aho.
0510:            //
0511:            //                        Most of the comments are quotes of Aho's psuedo-code.
0512:            //
0513:            //-----------------------------------------------------------------------------
0514:            void buildStateTable() {
0515:                //
0516:                // Add a dummy state 0 - the stop state.  Not from Aho.
0517:                int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
0518:                RBBIStateDescriptor failState = new RBBIStateDescriptor(
0519:                        lastInputSymbol);
0520:                fDStates.add(failState);
0521:
0522:                // initially, the only unmarked state in Dstates is firstpos(root),
0523:                //       where toot is the root of the syntax tree for (r)#;
0524:                RBBIStateDescriptor initialState = new RBBIStateDescriptor(
0525:                        lastInputSymbol);
0526:                initialState.fPositions
0527:                        .addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
0528:                fDStates.add(initialState);
0529:
0530:                // while there is an unmarked state T in Dstates do begin
0531:                for (;;) {
0532:                    RBBIStateDescriptor T = null;
0533:                    int tx;
0534:                    for (tx = 1; tx < fDStates.size(); tx++) {
0535:                        RBBIStateDescriptor temp = (RBBIStateDescriptor) fDStates
0536:                                .get(tx);
0537:                        if (temp.fMarked == false) {
0538:                            T = temp;
0539:                            break;
0540:                        }
0541:                    }
0542:                    if (T == null) {
0543:                        break;
0544:                    }
0545:
0546:                    // mark T;
0547:                    T.fMarked = true;
0548:
0549:                    // for each input symbol a do begin
0550:                    int a;
0551:                    for (a = 1; a <= lastInputSymbol; a++) {
0552:                        // let U be the set of positions that are in followpos(p)
0553:                        //    for some position p in T
0554:                        //    such that the symbol at position p is a;
0555:                        Set U = null;
0556:                        RBBINode p;
0557:                        Iterator pit = T.fPositions.iterator();
0558:                        while (pit.hasNext()) {
0559:                            p = (RBBINode) pit.next();
0560:                            if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
0561:                                if (U == null) {
0562:                                    U = new HashSet();
0563:                                }
0564:                                U.addAll(p.fFollowPos);
0565:                            }
0566:                        }
0567:
0568:                        // if U is not empty and not in DStates then
0569:                        int ux = 0;
0570:                        boolean UinDstates = false;
0571:                        if (U != null) {
0572:                            Assert.assrt(U.size() > 0);
0573:                            int ix;
0574:                            for (ix = 0; ix < fDStates.size(); ix++) {
0575:                                RBBIStateDescriptor temp2;
0576:                                temp2 = (RBBIStateDescriptor) fDStates.get(ix);
0577:                                if (U.equals(temp2.fPositions)) {
0578:                                    U = temp2.fPositions;
0579:                                    ux = ix;
0580:                                    UinDstates = true;
0581:                                    break;
0582:                                }
0583:                            }
0584:
0585:                            // Add U as an unmarked state to Dstates
0586:                            if (!UinDstates) {
0587:                                RBBIStateDescriptor newState = new RBBIStateDescriptor(
0588:                                        lastInputSymbol);
0589:                                newState.fPositions = U;
0590:                                fDStates.add(newState);
0591:                                ux = fDStates.size() - 1;
0592:                            }
0593:
0594:                            // Dtran[T, a] := U;
0595:                            T.fDtran[a] = ux;
0596:                        }
0597:                    }
0598:                }
0599:            }
0600:
0601:            //-----------------------------------------------------------------------------
0602:            //
0603:            //   flagAcceptingStates    Identify accepting states.
0604:            //                          First get a list of all of the end marker nodes.
0605:            //                          Then, for each state s,
0606:            //                              if s contains one of the end marker nodes in its list of tree positions then
0607:            //                                  s is an accepting state.
0608:            //
0609:            //-----------------------------------------------------------------------------
0610:            void flagAcceptingStates() {
0611:                List endMarkerNodes = new ArrayList();
0612:                RBBINode endMarker;
0613:                int i;
0614:                int n;
0615:
0616:                fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes,
0617:                        RBBINode.endMark);
0618:
0619:                for (i = 0; i < endMarkerNodes.size(); i++) {
0620:                    endMarker = (RBBINode) endMarkerNodes.get(i);
0621:                    for (n = 0; n < fDStates.size(); n++) {
0622:                        RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0623:                                .get(n);
0624:                        //if (sd.fPositions.indexOf(endMarker) >= 0) {
0625:                        if (sd.fPositions.contains(endMarker)) {
0626:                            // Any non-zero value for fAccepting means this is an accepting node.
0627:                            // The value is what will be returned to the user as the break status.
0628:                            // If no other value was specified, force it to -1.
0629:
0630:                            if (sd.fAccepting == 0) {
0631:                                // State hasn't been marked as accepting yet.  Do it now.
0632:                                sd.fAccepting = endMarker.fVal;
0633:                                if (sd.fAccepting == 0) {
0634:                                    sd.fAccepting = -1;
0635:                                }
0636:                            }
0637:                            if (sd.fAccepting == -1 && endMarker.fVal != 0) {
0638:                                // Both lookahead and non-lookahead accepting for this state.
0639:                                // Favor the look-ahead.  Expedient for line break.
0640:                                // TODO:  need a more elegant resolution for conflicting rules.
0641:                                sd.fAccepting = endMarker.fVal;
0642:                            }
0643:                            // implicit else:
0644:                            // if sd.fAccepting already had a value other than 0 or -1, leave it be.
0645:
0646:                            // If the end marker node is from a look-ahead rule, set
0647:                            //   the fLookAhead field or this state also.
0648:                            if (endMarker.fLookAheadEnd) {
0649:                                // TODO:  don't change value if already set?
0650:                                // TODO:  allow for more than one active look-ahead rule in engine.
0651:                                //        Make value here an index to a side array in engine?
0652:                                sd.fLookAhead = sd.fAccepting;
0653:                            }
0654:                        }
0655:                    }
0656:                }
0657:            }
0658:
0659:            //-----------------------------------------------------------------------------
0660:            //
0661:            //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
0662:            //
0663:            //-----------------------------------------------------------------------------
0664:            void flagLookAheadStates() {
0665:                List lookAheadNodes = new ArrayList();
0666:                RBBINode lookAheadNode;
0667:                int i;
0668:                int n;
0669:
0670:                fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes,
0671:                        RBBINode.lookAhead);
0672:                for (i = 0; i < lookAheadNodes.size(); i++) {
0673:                    lookAheadNode = (RBBINode) lookAheadNodes.get(i);
0674:
0675:                    for (n = 0; n < fDStates.size(); n++) {
0676:                        RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0677:                                .get(n);
0678:                        if (sd.fPositions.contains(lookAheadNode)) {
0679:                            sd.fLookAhead = lookAheadNode.fVal;
0680:                        }
0681:                    }
0682:                }
0683:            }
0684:
0685:            //-----------------------------------------------------------------------------
0686:            //
0687:            //    flagTaggedStates
0688:            //
0689:            //-----------------------------------------------------------------------------
0690:            void flagTaggedStates() {
0691:                List tagNodes = new ArrayList();
0692:                RBBINode tagNode;
0693:                int i;
0694:                int n;
0695:
0696:                fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
0697:                for (i = 0; i < tagNodes.size(); i++) { // For each tag node t (all of 'em)
0698:                    tagNode = (RBBINode) tagNodes.get(i);
0699:
0700:                    for (n = 0; n < fDStates.size(); n++) { //    For each state  s (row in the state table)
0701:                        RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0702:                                .get(n);
0703:                        if (sd.fPositions.contains(tagNode)) { //       if  s include the tag node t
0704:                            sd.fTagVals.add(new Integer(tagNode.fVal));
0705:                        }
0706:                    }
0707:                }
0708:            }
0709:
0710:            //-----------------------------------------------------------------------------
0711:            //
0712:            //  mergeRuleStatusVals
0713:            //
0714:            //      Allocate positions in the  global array of rule status {tag} values
0715:            //
0716:            //      The RBBI runtime uses an array of {sets of status values} that can
0717:            //      be returned for boundaries.  Each accepting state that has non-zero
0718:            //      status includes an index into this array.  The format of the array
0719:            //      is 
0720:            //           Num of status values in group 1
0721:            //              status val
0722:            //              status val
0723:            //              ...
0724:            //           Num of status vals in group 2
0725:            //              status val
0726:            //              status val
0727:            //              ...
0728:            //           etc.
0729:            //
0730:            //
0731:            //-----------------------------------------------------------------------------
0732:
0733:            void mergeRuleStatusVals() {
0734:                //
0735:                //  The basic outline of what happens here is this...
0736:                //
0737:                //    for each state in this state table
0738:                //       if the status tag list for this state is in the global statuses list
0739:                //           record where and
0740:                //           continue with the next state
0741:                //       else
0742:                //           add the tag list for this state to the global list.
0743:                //
0744:                int i;
0745:                int n;
0746:
0747:                // Pre-load a single tag of {0} into the table.
0748:                //   We will need this as a default, for rule sets with no explicit tagging,
0749:                //   or with explicit tagging of {0}.
0750:                if (fRB.fRuleStatusVals.size() == 0) {
0751:                    fRB.fRuleStatusVals.add(new Integer(1)); // Num of statuses in group
0752:                    fRB.fRuleStatusVals.add(new Integer(0)); //   and our single status of zero
0753:
0754:                    SortedSet s0 = new TreeSet();
0755:                    Integer izero = new Integer(0);
0756:                    fRB.fStatusSets.put(s0, izero);
0757:                    SortedSet s1 = new TreeSet();
0758:                    s1.add(izero);
0759:                    fRB.fStatusSets.put(s0, izero);
0760:                }
0761:
0762:                //    For each state, check whether the state's status tag values are
0763:                //       already entered into the status values array, and add them if not.
0764:                for (n = 0; n < fDStates.size(); n++) {
0765:                    RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0766:                            .get(n);
0767:                    Set statusVals = sd.fTagVals;
0768:                    Integer arrayIndexI = (Integer) fRB.fStatusSets
0769:                            .get(statusVals);
0770:                    if (arrayIndexI == null) {
0771:                        // This is the first encounter of this set of status values.
0772:                        //   Add them to the statusSets map, This map associates
0773:                        //   the set of status values with an index in the runtime status 
0774:                        //   values array.
0775:                        arrayIndexI = new Integer(fRB.fRuleStatusVals.size());
0776:                        fRB.fStatusSets.put(statusVals, arrayIndexI);
0777:
0778:                        // Add the new set of status values to the vector of values that
0779:                        //   will eventually become the array used by the runtime engine.
0780:                        fRB.fRuleStatusVals.add(new Integer(statusVals.size()));
0781:                        Iterator it = statusVals.iterator();
0782:                        while (it.hasNext()) {
0783:                            fRB.fRuleStatusVals.add(it.next());
0784:                        }
0785:
0786:                    }
0787:
0788:                    // Save the runtime array index back into the state descriptor.
0789:                    sd.fTagsIdx = arrayIndexI.intValue();
0790:                }
0791:            }
0792:
0793:            //-----------------------------------------------------------------------------
0794:            //
0795:            //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
0796:            //                 for each node in the tree.
0797:            //
0798:            //-----------------------------------------------------------------------------
0799:
0800:            void printPosSets(RBBINode n) {
0801:                if (n == null) {
0802:                    return;
0803:                }
0804:                RBBINode.printNode(n);
0805:                System.out.print("         Nullable:  " + n.fNullable);
0806:
0807:                System.out.print("         firstpos:  ");
0808:                printSet(n.fFirstPosSet);
0809:
0810:                System.out.print("         lastpos:   ");
0811:                printSet(n.fLastPosSet);
0812:
0813:                System.out.print("         followpos: ");
0814:                printSet(n.fFollowPos);
0815:
0816:                printPosSets(n.fLeftChild);
0817:                printPosSets(n.fRightChild);
0818:            }
0819:
0820:            //-----------------------------------------------------------------------------
0821:            //
0822:            //   getTableSize()    Calculate the size in bytes of the runtime form of this
0823:            //                     state transition table.
0824:            //
0825:            //          Note:  Refer to common/rbbidata.h from ICU4C for the declarations
0826:            //                 of the structures being matched by this calculation.
0827:            //
0828:            //-----------------------------------------------------------------------------
0829:            int getTableSize() {
0830:                int size = 0;
0831:                int numRows;
0832:                int numCols;
0833:                int rowSize;
0834:
0835:                if (fRB.fTreeRoots[fRootIx] == null) {
0836:                    return 0;
0837:                }
0838:
0839:                size = /*sizeof(RBBIStateTable) - 4 */16; // The header, with no rows to the table.
0840:
0841:                numRows = fDStates.size();
0842:                numCols = fRB.fSetBuilder.getNumCharCategories();
0843:
0844:                //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
0845:                //        Therefore we subtract two from numCols when determining
0846:                //        how much storage to add to a row for the total columns.
0847:                // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
0848:                rowSize = 8 + 2 * numCols;
0849:                size += numRows * rowSize;
0850:                while (size % 8 > 0) { // Size must be multiple of 8 bytes in size.
0851:                    size++;
0852:                }
0853:
0854:                return size;
0855:            }
0856:
0857:            //-----------------------------------------------------------------------------
0858:            //
0859:            //   exportTable()    export the state transition table in the ICU4C format.
0860:            //
0861:            //                    Most of the table is 16 bit shorts.  This function exports
0862:            //                    the whole thing as an array of shorts.
0863:            //
0864:            //                    The size of the array must be rounded up to a multiple of
0865:            //                    8 bytes.
0866:            //
0867:            //                    See struct RBBIStateTable in ICU4C, common/rbbidata.h
0868:            //
0869:            //-----------------------------------------------------------------------------
0870:
0871:            short[] exportTable() {
0872:                int state;
0873:                int col;
0874:
0875:                if (fRB.fTreeRoots[fRootIx] == null) {
0876:                    return new short[0];
0877:                }
0878:
0879:                Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff
0880:                        && fDStates.size() < 0x7fff);
0881:
0882:                int numStates = fDStates.size();
0883:
0884:                // Size of table size in shorts.
0885:                //  the "4" is the size of struct RBBIStateTableRow, the row header part only.
0886:                int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
0887:                int tableSize = getTableSize() / 2;
0888:
0889:                short[] table = new short[tableSize];
0890:
0891:                //
0892:                // Fill in the header fields.
0893:                //      Annoying because they really want to be ints, not shorts.
0894:                //
0895:                // RBBIStateTable.fNumStates
0896:                table[RBBIDataWrapper.NUMSTATES] = (short) (numStates >>> 16);
0897:                table[RBBIDataWrapper.NUMSTATES + 1] = (short) (numStates & 0x0000ffff);
0898:
0899:                // RBBIStateTable.fRowLen
0900:                table[RBBIDataWrapper.ROWLEN] = (short) (rowLen >>> 16);
0901:                table[RBBIDataWrapper.ROWLEN + 1] = (short) (rowLen & 0x0000ffff);
0902:
0903:                // RBBIStateTable.fFlags
0904:                int flags = 0;
0905:                if (fRB.fLookAheadHardBreak) {
0906:                    flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
0907:                }
0908:                if (fRB.fSetBuilder.sawBOF()) {
0909:                    flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
0910:                }
0911:                table[RBBIDataWrapper.FLAGS] = (short) (flags >>> 16);
0912:                table[RBBIDataWrapper.FLAGS + 1] = (short) (flags & 0x0000ffff);
0913:
0914:                int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
0915:                for (state = 0; state < numStates; state++) {
0916:                    RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0917:                            .get(state);
0918:                    int row = 8 + state * rowLen;
0919:                    Assert.assrt(-32768 < sd.fAccepting
0920:                            && sd.fAccepting <= 32767);
0921:                    Assert.assrt(-32768 < sd.fLookAhead
0922:                            && sd.fLookAhead <= 32767);
0923:                    table[row + RBBIDataWrapper.ACCEPTING] = (short) sd.fAccepting;
0924:                    table[row + RBBIDataWrapper.LOOKAHEAD] = (short) sd.fLookAhead;
0925:                    table[row + RBBIDataWrapper.TAGIDX] = (short) sd.fTagsIdx;
0926:                    for (col = 0; col < numCharCategories; col++) {
0927:                        table[row + RBBIDataWrapper.NEXTSTATES + col] = (short) sd.fDtran[col];
0928:                    }
0929:                }
0930:                return table;
0931:            }
0932:
0933:            //-----------------------------------------------------------------------------
0934:            //
0935:            //   printSet    Debug function.   Print the contents of a set of Nodes
0936:            //
0937:            //-----------------------------------------------------------------------------
0938:
0939:            void printSet(Collection s) {
0940:                Iterator it = s.iterator();
0941:                while (it.hasNext()) {
0942:                    RBBINode n = (RBBINode) it.next();
0943:                    RBBINode.printInt(n.fSerialNum, 8);
0944:                }
0945:                System.out.println();
0946:            }
0947:
0948:            //-----------------------------------------------------------------------------
0949:            //
0950:            //   printStates    Debug Function.  Dump the fully constructed state transition table.
0951:            //
0952:            //-----------------------------------------------------------------------------
0953:
0954:            void printStates() {
0955:                int c; // input "character"
0956:                int n; // state number
0957:
0958:                System.out
0959:                        .print("state |           i n p u t     s y m b o l s \n");
0960:                System.out.print("      | Acc  LA    Tag");
0961:                for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0962:                    RBBINode.printInt((int) c, 3);
0963:                }
0964:                System.out.print("\n");
0965:                System.out.print("      |---------------");
0966:                for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0967:                    System.out.print("---");
0968:                }
0969:                System.out.print("\n");
0970:
0971:                for (n = 0; n < fDStates.size(); n++) {
0972:                    RBBIStateDescriptor sd = (RBBIStateDescriptor) fDStates
0973:                            .get(n);
0974:                    RBBINode.printInt(n, 5);
0975:                    System.out.print(" | ");
0976:
0977:                    RBBINode.printInt(sd.fAccepting, 3);
0978:                    RBBINode.printInt(sd.fLookAhead, 4);
0979:                    RBBINode.printInt(sd.fTagsIdx, 6);
0980:                    System.out.print(" ");
0981:                    for (c = 0; c < fRB.fSetBuilder.getNumCharCategories(); c++) {
0982:                        RBBINode.printInt(sd.fDtran[c], 3);
0983:                    }
0984:                    System.out.print("\n");
0985:                }
0986:                System.out.print("\n\n");
0987:            }
0988:
0989:            //-----------------------------------------------------------------------------
0990:            //
0991:            //   printRuleStatusTable    Debug Function.  Dump the common rule status table
0992:            //
0993:            //-----------------------------------------------------------------------------
0994:
0995:            void printRuleStatusTable() {
0996:                int this Record = 0;
0997:                int nextRecord = 0;
0998:                int i;
0999:                List tbl = fRB.fRuleStatusVals;
1000:
1001:                System.out.print("index |  tags \n");
1002:                System.out.print("-------------------\n");
1003:
1004:                while (nextRecord < tbl.size()) {
1005:                    this Record = nextRecord;
1006:                    nextRecord = this Record
1007:                            + ((Integer) tbl.get(this Record)).intValue() + 1;
1008:                    RBBINode.printInt(this Record, 7);
1009:                    for (i = this Record + 1; i < nextRecord; i++) {
1010:                        int val = ((Integer) tbl.get(i)).intValue();
1011:                        RBBINode.printInt(val, 7);
1012:                    }
1013:                    System.out.print("\n");
1014:                }
1015:                System.out.print("\n\n");
1016:            }
1017:
1018:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.