Source Code Cross Referenced for TransliteratorParser.java in » Internationalization-Localization » icu4j » com » ibm » icu » text » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Internationalization Localization » icu4j » com.ibm.icu.text 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         **********************************************************************
0003:         *   Copyright (c) 2001-2006, International Business Machines
0004:         *   Corporation and others.  All Rights Reserved.
0005:         **********************************************************************
0006:         */
0007:        package com.ibm.icu.text;
0008:
0009:        import com.ibm.icu.impl.data.ResourceReader;
0010:        import com.ibm.icu.impl.Utility;
0011:        import java.util.Vector;
0012:        import java.util.Hashtable;
0013:        import java.text.ParsePosition;
0014:        import com.ibm.icu.lang.*;
0015:        import com.ibm.icu.impl.UCharacterProperty;
0016:
0017:        class TransliteratorParser {
0018:
0019:            //----------------------------------------------------------------------
0020:            // Data members
0021:            //----------------------------------------------------------------------
0022:
0023:            /**
0024:             * PUBLIC data member.
0025:             * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
0026:             * of rules in the rule set
0027:             */
0028:            public Vector dataVector;
0029:
0030:            /**
0031:             * PUBLIC data member.
0032:             * A Vector of Strings containing all of the ID blocks in the rule set
0033:             */
0034:            public Vector idBlockVector;
0035:
0036:            /**
0037:             * The current data object for which we are parsing rules
0038:             */
0039:            private RuleBasedTransliterator.Data curData;
0040:
0041:            /**
0042:             * PUBLIC data member containing the parsed compound filter, if any.
0043:             */
0044:            public UnicodeSet compoundFilter;
0045:
0046:            private int direction;
0047:
0048:            /**
0049:             * Temporary symbol table used during parsing.
0050:             */
0051:            private ParseData parseData;
0052:
0053:            /**
0054:             * Temporary vector of set variables.  When parsing is complete, this
0055:             * is copied into the array data.variables.  As with data.variables,
0056:             * element 0 corresponds to character data.variablesBase.
0057:             */
0058:            private Vector variablesVector;
0059:
0060:            /**
0061:             * Temporary table of variable names.  When parsing is complete, this is
0062:             * copied into data.variableNames.
0063:             */
0064:            private Hashtable variableNames;
0065:
0066:            /**
0067:             * String of standins for segments.  Used during the parsing of a single
0068:             * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
0069:             * to StringMatcher object segmentObjects.elementAt(0), etc.
0070:             */
0071:            private StringBuffer segmentStandins;
0072:
0073:            /**
0074:             * Vector of StringMatcher objects for segments.  Used during the
0075:             * parsing of a single rule.  
0076:             * segmentStandins.charAt(0) is the standin for "$1" and corresponds
0077:             * to StringMatcher object segmentObjects.elementAt(0), etc.
0078:             */
0079:            private Vector segmentObjects;
0080:
0081:            /**
0082:             * The next available stand-in for variables.  This starts at some point in
0083:             * the private use area (discovered dynamically) and increments up toward
0084:             * <code>variableLimit</code>.  At any point during parsing, available
0085:             * variables are <code>variableNext..variableLimit-1</code>.
0086:             */
0087:            private char variableNext;
0088:
0089:            /**
0090:             * The last available stand-in for variables.  This is discovered
0091:             * dynamically.  At any point during parsing, available variables are
0092:             * <code>variableNext..variableLimit-1</code>.  During variable definition
0093:             * we use the special value variableLimit-1 as a placeholder.
0094:             */
0095:            private char variableLimit;
0096:
0097:            /**
0098:             * When we encounter an undefined variable, we do not immediately signal
0099:             * an error, in case we are defining this variable, e.g., "$a = [a-z];".
0100:             * Instead, we save the name of the undefined variable, and substitute
0101:             * in the placeholder char variableLimit - 1, and decrement
0102:             * variableLimit.
0103:             */
0104:            private String undefinedVariableName;
0105:
0106:            /**
0107:             * The stand-in character for the 'dot' set, represented by '.' in
0108:             * patterns.  This is allocated the first time it is needed, and
0109:             * reused thereafter.
0110:             */
0111:            private int dotStandIn = -1;
0112:
0113:            //----------------------------------------------------------------------
0114:            // Constants
0115:            //----------------------------------------------------------------------
0116:
0117:            // Indicator for ID blocks
0118:            private static final String ID_TOKEN = "::";
0119:            private static final int ID_TOKEN_LEN = 2;
0120:
0121:            /*
0122:             (reserved for future expansion)
0123:             // markers for beginning and end of rule groups
0124:             private static final String BEGIN_TOKEN = "BEGIN";
0125:             private static final String END_TOKEN = "END";
0126:             */
0127:
0128:            // Operators
0129:            private static final char VARIABLE_DEF_OP = '=';
0130:            private static final char FORWARD_RULE_OP = '>';
0131:            private static final char REVERSE_RULE_OP = '<';
0132:            private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
0133:
0134:            private static final String OPERATORS = "=><\u2190\u2192\u2194";
0135:            private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
0136:
0137:            // Other special characters
0138:            private static final char QUOTE = '\'';
0139:            private static final char ESCAPE = '\\';
0140:            private static final char END_OF_RULE = ';';
0141:            private static final char RULE_COMMENT_CHAR = '#';
0142:
0143:            private static final char CONTEXT_ANTE = '{'; // ante{key
0144:            private static final char CONTEXT_POST = '}'; // key}post
0145:            private static final char CURSOR_POS = '|';
0146:            private static final char CURSOR_OFFSET = '@';
0147:            private static final char ANCHOR_START = '^';
0148:
0149:            private static final char KLEENE_STAR = '*';
0150:            private static final char ONE_OR_MORE = '+';
0151:            private static final char ZERO_OR_ONE = '?';
0152:
0153:            private static final char DOT = '.';
0154:            private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";
0155:
0156:            // By definition, the ANCHOR_END special character is a
0157:            // trailing SymbolTable.SYMBOL_REF character.
0158:            // private static final char ANCHOR_END       = '$';
0159:
0160:            // Segments of the input string are delimited by "(" and ")".  In the
0161:            // output string these segments are referenced as "$1", "$2", etc.
0162:            private static final char SEGMENT_OPEN = '(';
0163:            private static final char SEGMENT_CLOSE = ')';
0164:
0165:            // A function is denoted &Source-Target/Variant(text)
0166:            private static final char FUNCTION = '&';
0167:
0168:            // Aliases for some of the syntax characters. These are provided so
0169:            // transliteration rules can be expressed in XML without clashing with
0170:            // XML syntax characters '<', '>', and '&'.
0171:            private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
0172:            private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
0173:            private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
0174:            private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)
0175:
0176:            // Special characters disallowed at the top level
0177:            private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
0178:
0179:            // Special characters disallowed within a segment
0180:            private static UnicodeSet ILLEGAL_SEG = new UnicodeSet(
0181:                    "[\\{\\}\\|\\@]");
0182:
0183:            // Special characters disallowed within a function argument
0184:            private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet(
0185:                    "[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
0186:
0187:            //----------------------------------------------------------------------
0188:            // class ParseData
0189:            //----------------------------------------------------------------------
0190:
0191:            /**
0192:             * This class implements the SymbolTable interface.  It is used
0193:             * during parsing to give UnicodeSet access to variables that
0194:             * have been defined so far.  Note that it uses variablesVector,
0195:             * _not_ data.variables.
0196:             */
0197:            private class ParseData implements  SymbolTable {
0198:
0199:                /**
0200:                 * Implement SymbolTable API.
0201:                 */
0202:                public char[] lookup(String name) {
0203:                    return (char[]) variableNames.get(name);
0204:                }
0205:
0206:                /**
0207:                 * Implement SymbolTable API.
0208:                 */
0209:                public UnicodeMatcher lookupMatcher(int ch) {
0210:                    // Note that we cannot use data.lookup() because the
0211:                    // set array has not been constructed yet.
0212:                    int i = ch - curData.variablesBase;
0213:                    if (i >= 0 && i < variablesVector.size()) {
0214:                        return (UnicodeMatcher) variablesVector.elementAt(i);
0215:                    }
0216:                    return null;
0217:                }
0218:
0219:                /**
0220:                 * Implement SymbolTable API.  Parse out a symbol reference
0221:                 * name.
0222:                 */
0223:                public String parseReference(String text, ParsePosition pos,
0224:                        int limit) {
0225:                    int start = pos.getIndex();
0226:                    int i = start;
0227:                    while (i < limit) {
0228:                        char c = text.charAt(i);
0229:                        if ((i == start && !Character
0230:                                .isUnicodeIdentifierStart(c))
0231:                                || !Character.isUnicodeIdentifierPart(c)) {
0232:                            break;
0233:                        }
0234:                        ++i;
0235:                    }
0236:                    if (i == start) { // No valid name chars
0237:                        return null;
0238:                    }
0239:                    pos.setIndex(i);
0240:                    return text.substring(start, i);
0241:                }
0242:
0243:                /**
0244:                 * Return true if the given character is a matcher standin or a plain
0245:                 * character (non standin).
0246:                 */
0247:                public boolean isMatcher(int ch) {
0248:                    // Note that we cannot use data.lookup() because the
0249:                    // set array has not been constructed yet.
0250:                    int i = ch - curData.variablesBase;
0251:                    if (i >= 0 && i < variablesVector.size()) {
0252:                        return variablesVector.elementAt(i) instanceof  UnicodeMatcher;
0253:                    }
0254:                    return true;
0255:                }
0256:
0257:                /**
0258:                 * Return true if the given character is a replacer standin or a plain
0259:                 * character (non standin).
0260:                 */
0261:                public boolean isReplacer(int ch) {
0262:                    // Note that we cannot use data.lookup() because the
0263:                    // set array has not been constructed yet.
0264:                    int i = ch - curData.variablesBase;
0265:                    if (i >= 0 && i < variablesVector.size()) {
0266:                        return variablesVector.elementAt(i) instanceof  UnicodeReplacer;
0267:                    }
0268:                    return true;
0269:                }
0270:            }
0271:
0272:            //----------------------------------------------------------------------
0273:            // classes RuleBody, RuleArray, and RuleReader
0274:            //----------------------------------------------------------------------
0275:
0276:            /**
0277:             * A private abstract class representing the interface to rule
0278:             * source code that is broken up into lines.  Handles the
0279:             * folding of lines terminated by a backslash.  This folding
0280:             * is limited; it does not account for comments, quotes, or
0281:             * escapes, so its use to be limited.
0282:             */
0283:            private static abstract class RuleBody {
0284:
0285:                /**
0286:                 * Retrieve the next line of the source, or return null if
0287:                 * none.  Folds lines terminated by a backslash into the
0288:                 * next line, without regard for comments, quotes, or
0289:                 * escapes.
0290:                 */
0291:                String nextLine() {
0292:                    String s = handleNextLine();
0293:                    if (s != null && s.length() > 0
0294:                            && s.charAt(s.length() - 1) == '\\') {
0295:
0296:                        StringBuffer b = new StringBuffer(s);
0297:                        do {
0298:                            b.deleteCharAt(b.length() - 1);
0299:                            s = handleNextLine();
0300:                            if (s == null) {
0301:                                break;
0302:                            }
0303:                            b.append(s);
0304:                        } while (s.length() > 0
0305:                                && s.charAt(s.length() - 1) == '\\');
0306:
0307:                        s = b.toString();
0308:                    }
0309:                    return s;
0310:                }
0311:
0312:                /**
0313:                 * Reset to the first line of the source.
0314:                 */
0315:                abstract void reset();
0316:
0317:                /**
0318:                 * Subclass method to return the next line of the source.
0319:                 */
0320:                abstract String handleNextLine();
0321:            }
0322:
0323:            /**
0324:             * RuleBody subclass for a String[] array.
0325:             */
0326:            private static class RuleArray extends RuleBody {
0327:                String[] array;
0328:                int i;
0329:
0330:                public RuleArray(String[] array) {
0331:                    this .array = array;
0332:                    i = 0;
0333:                }
0334:
0335:                public String handleNextLine() {
0336:                    return (i < array.length) ? array[i++] : null;
0337:                }
0338:
0339:                public void reset() {
0340:                    i = 0;
0341:                }
0342:            }
0343:
0344:            /**
0345:             * RuleBody subclass for a ResourceReader.
0346:             */
0347:            private static class RuleReader extends RuleBody {
0348:                ResourceReader reader;
0349:
0350:                public RuleReader(ResourceReader reader) {
0351:                    this .reader = reader;
0352:                }
0353:
0354:                public String handleNextLine() {
0355:                    try {
0356:                        return reader.readLine();
0357:                    } catch (java.io.IOException e) {
0358:                    }
0359:                    return null;
0360:                }
0361:
0362:                public void reset() {
0363:                    reader.reset();
0364:                }
0365:            }
0366:
0367:            //----------------------------------------------------------------------
0368:            // class RuleHalf
0369:            //----------------------------------------------------------------------
0370:
0371:            /**
0372:             * A class representing one side of a rule.  This class knows how to
0373:             * parse half of a rule.  It is tightly coupled to the method
0374:             * TransliteratorParser.parseRule().
0375:             */
0376:            private static class RuleHalf {
0377:
0378:                public String text;
0379:
0380:                public int cursor = -1; // position of cursor in text
0381:                public int ante = -1; // position of ante context marker '{' in text
0382:                public int post = -1; // position of post context marker '}' in text
0383:
0384:                // Record the offset to the cursor either to the left or to the
0385:                // right of the key.  This is indicated by characters on the output
0386:                // side that allow the cursor to be positioned arbitrarily within
0387:                // the matching text.  For example, abc{def} > | @@@ xyz; changes
0388:                // def to xyz and moves the cursor to before abc.  Offset characters
0389:                // must be at the start or end, and they cannot move the cursor past
0390:                // the ante- or postcontext text.  Placeholders are only valid in
0391:                // output text.  The length of the ante and post context is
0392:                // determined at runtime, because of supplementals and quantifiers.
0393:                public int cursorOffset = 0; // only nonzero on output side
0394:
0395:                // Position of first CURSOR_OFFSET on _right_.  This will be -1
0396:                // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
0397:                private int cursorOffsetPos = 0;
0398:
0399:                public boolean anchorStart = false;
0400:                public boolean anchorEnd = false;
0401:
0402:                /**
0403:                 * The segment number from 1..n of the next '(' we see
0404:                 * during parsing; 1-based.
0405:                 */
0406:                private int nextSegmentNumber = 1;
0407:
0408:                /**
0409:                 * Parse one side of a rule, stopping at either the limit,
0410:                 * the END_OF_RULE character, or an operator.
0411:                 * @return the index after the terminating character, or
0412:                 * if limit was reached, limit
0413:                 */
0414:                public int parse(String rule, int pos, int limit,
0415:                        TransliteratorParser parser) {
0416:                    int start = pos;
0417:                    StringBuffer buf = new StringBuffer();
0418:                    pos = parseSection(rule, pos, limit, parser, buf,
0419:                            ILLEGAL_TOP, false);
0420:                    text = buf.toString();
0421:
0422:                    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
0423:                        syntaxError("Misplaced " + CURSOR_POS, rule, start);
0424:                    }
0425:
0426:                    return pos;
0427:                }
0428:
0429:                /**
0430:                 * Parse a section of one side of a rule, stopping at either
0431:                 * the limit, the END_OF_RULE character, an operator, or a
0432:                 * segment close character.  This method parses both a
0433:                 * top-level rule half and a segment within such a rule half.
0434:                 * It calls itself recursively to parse segments and nested
0435:                 * segments.
0436:                 * @param buf buffer into which to accumulate the rule pattern
0437:                 * characters, either literal characters from the rule or
0438:                 * standins for UnicodeMatcher objects including segments.
0439:                 * @param illegal the set of special characters that is illegal during
0440:                 * this parse.
0441:                 * @param isSegment if true, then we've already seen a '(' and
0442:                 * pos on entry points right after it.  Accumulate everything
0443:                 * up to the closing ')', put it in a segment matcher object,
0444:                 * generate a standin for it, and add the standin to buf.  As
0445:                 * a side effect, update the segments vector with a reference
0446:                 * to the segment matcher.  This works recursively for nested
0447:                 * segments.  If isSegment is false, just accumulate
0448:                 * characters into buf.
0449:                 * @return the index after the terminating character, or
0450:                 * if limit was reached, limit
0451:                 */
0452:                private int parseSection(String rule, int pos, int limit,
0453:                        TransliteratorParser parser, StringBuffer buf,
0454:                        UnicodeSet illegal, boolean isSegment) {
0455:                    int start = pos;
0456:                    ParsePosition pp = null;
0457:                    int quoteStart = -1; // Most recent 'single quoted string'
0458:                    int quoteLimit = -1;
0459:                    int varStart = -1; // Most recent $variableReference
0460:                    int varLimit = -1;
0461:                    int[] iref = new int[1];
0462:                    int bufStart = buf.length();
0463:
0464:                    main: while (pos < limit) {
0465:                        // Since all syntax characters are in the BMP, fetching
0466:                        // 16-bit code units suffices here.
0467:                        char c = rule.charAt(pos++);
0468:                        if (UCharacterProperty.isRuleWhiteSpace(c)) {
0469:                            continue;
0470:                        }
0471:                        // HALF_ENDERS is all chars that end a rule half: "<>=;"
0472:                        if (HALF_ENDERS.indexOf(c) >= 0) {
0473:                            if (isSegment) {
0474:                                syntaxError("Unclosed segment", rule, start);
0475:                            }
0476:                            break main;
0477:                        }
0478:                        if (anchorEnd) {
0479:                            // Text after a presumed end anchor is a syntax err
0480:                            syntaxError("Malformed variable reference", rule,
0481:                                    start);
0482:                        }
0483:                        if (UnicodeSet.resemblesPattern(rule, pos - 1)) {
0484:                            if (pp == null) {
0485:                                pp = new ParsePosition(0);
0486:                            }
0487:                            pp.setIndex(pos - 1); // Backup to opening '['
0488:                            buf.append(parser.parseSet(rule, pp));
0489:                            pos = pp.getIndex();
0490:                            continue;
0491:                        }
0492:                        // Handle escapes
0493:                        if (c == ESCAPE) {
0494:                            if (pos == limit) {
0495:                                syntaxError("Trailing backslash", rule, start);
0496:                            }
0497:                            iref[0] = pos;
0498:                            int escaped = Utility.unescapeAt(rule, iref);
0499:                            pos = iref[0];
0500:                            if (escaped == -1) {
0501:                                syntaxError("Malformed escape", rule, start);
0502:                            }
0503:                            parser.checkVariableRange(escaped, rule, start);
0504:                            UTF16.append(buf, escaped);
0505:                            continue;
0506:                        }
0507:                        // Handle quoted matter
0508:                        if (c == QUOTE) {
0509:                            int iq = rule.indexOf(QUOTE, pos);
0510:                            if (iq == pos) {
0511:                                buf.append(c); // Parse [''] outside quotes as [']
0512:                                ++pos;
0513:                            } else {
0514:                                /* This loop picks up a run of quoted text of the
0515:                                 * form 'aaaa' each time through.  If this run
0516:                                 * hasn't really ended ('aaaa''bbbb') then it keeps
0517:                                 * looping, each time adding on a new run.  When it
0518:                                 * reaches the final quote it breaks.
0519:                                 */
0520:                                quoteStart = buf.length();
0521:                                for (;;) {
0522:                                    if (iq < 0) {
0523:                                        syntaxError("Unterminated quote", rule,
0524:                                                start);
0525:                                    }
0526:                                    buf.append(rule.substring(pos, iq));
0527:                                    pos = iq + 1;
0528:                                    if (pos < limit
0529:                                            && rule.charAt(pos) == QUOTE) {
0530:                                        // Parse [''] inside quotes as [']
0531:                                        iq = rule.indexOf(QUOTE, pos + 1);
0532:                                        // Continue looping
0533:                                    } else {
0534:                                        break;
0535:                                    }
0536:                                }
0537:                                quoteLimit = buf.length();
0538:
0539:                                for (iq = quoteStart; iq < quoteLimit; ++iq) {
0540:                                    parser.checkVariableRange(buf.charAt(iq),
0541:                                            rule, start);
0542:                                }
0543:                            }
0544:                            continue;
0545:                        }
0546:
0547:                        parser.checkVariableRange(c, rule, start);
0548:
0549:                        if (illegal.contains(c)) {
0550:                            syntaxError("Illegal character '" + c + '\'', rule,
0551:                                    start);
0552:                        }
0553:
0554:                        switch (c) {
0555:
0556:                        //------------------------------------------------------
0557:                        // Elements allowed within and out of segments
0558:                        //------------------------------------------------------
0559:                        case ANCHOR_START:
0560:                            if (buf.length() == 0 && !anchorStart) {
0561:                                anchorStart = true;
0562:                            } else {
0563:                                syntaxError("Misplaced anchor start", rule,
0564:                                        start);
0565:                            }
0566:                            break;
0567:                        case SEGMENT_OPEN: {
0568:                            // bufSegStart is the offset in buf to the first
0569:                            // character of the segment we are parsing.
0570:                            int bufSegStart = buf.length();
0571:
0572:                            // Record segment number now, since nextSegmentNumber
0573:                            // will be incremented during the call to parseSection
0574:                            // if there are nested segments.
0575:                            int segmentNumber = nextSegmentNumber++; // 1-based
0576:
0577:                            // Parse the segment
0578:                            pos = parseSection(rule, pos, limit, parser, buf,
0579:                                    ILLEGAL_SEG, true);
0580:
0581:                            // After parsing a segment, the relevant characters are
0582:                            // in buf, starting at offset bufSegStart.  Extract them
0583:                            // into a string matcher, and replace them with a
0584:                            // standin for that matcher.
0585:                            StringMatcher m = new StringMatcher(buf
0586:                                    .substring(bufSegStart), segmentNumber,
0587:                                    parser.curData);
0588:
0589:                            // Record and associate object and segment number
0590:                            parser.setSegmentObject(segmentNumber, m);
0591:                            buf.setLength(bufSegStart);
0592:                            buf.append(parser.getSegmentStandin(segmentNumber));
0593:                        }
0594:                            break;
0595:                        case FUNCTION:
0596:                        case ALT_FUNCTION: {
0597:                            iref[0] = pos;
0598:                            TransliteratorIDParser.SingleID single = TransliteratorIDParser
0599:                                    .parseFilterID(rule, iref);
0600:                            // The next character MUST be a segment open
0601:                            if (single == null
0602:                                    || !Utility.parseChar(rule, iref,
0603:                                            SEGMENT_OPEN)) {
0604:                                syntaxError("Invalid function", rule, start);
0605:                            }
0606:
0607:                            Transliterator t = single.getInstance();
0608:                            if (t == null) {
0609:                                syntaxError("Invalid function ID", rule, start);
0610:                            }
0611:
0612:                            // bufSegStart is the offset in buf to the first
0613:                            // character of the segment we are parsing.
0614:                            int bufSegStart = buf.length();
0615:
0616:                            // Parse the segment
0617:                            pos = parseSection(rule, iref[0], limit, parser,
0618:                                    buf, ILLEGAL_FUNC, true);
0619:
0620:                            // After parsing a segment, the relevant characters are
0621:                            // in buf, starting at offset bufSegStart.
0622:                            FunctionReplacer r = new FunctionReplacer(t,
0623:                                    new StringReplacer(buf
0624:                                            .substring(bufSegStart),
0625:                                            parser.curData));
0626:
0627:                            // Replace the buffer contents with a stand-in
0628:                            buf.setLength(bufSegStart);
0629:                            buf.append(parser.generateStandInFor(r));
0630:                        }
0631:                            break;
0632:                        case SymbolTable.SYMBOL_REF:
0633:                            // Handle variable references and segment references "$1" .. "$9"
0634:                        {
0635:                            // A variable reference must be followed immediately
0636:                            // by a Unicode identifier start and zero or more
0637:                            // Unicode identifier part characters, or by a digit
0638:                            // 1..9 if it is a segment reference.
0639:                            if (pos == limit) {
0640:                                // A variable ref character at the end acts as
0641:                                // an anchor to the context limit, as in perl.
0642:                                anchorEnd = true;
0643:                                break;
0644:                            }
0645:                            // Parse "$1" "$2" .. "$9" .. (no upper limit)
0646:                            c = rule.charAt(pos);
0647:                            int r = UCharacter.digit(c, 10);
0648:                            if (r >= 1 && r <= 9) {
0649:                                iref[0] = pos;
0650:                                r = Utility.parseNumber(rule, iref, 10);
0651:                                if (r < 0) {
0652:                                    syntaxError("Undefined segment reference",
0653:                                            rule, start);
0654:                                }
0655:                                pos = iref[0];
0656:                                buf.append(parser.getSegmentStandin(r));
0657:                            } else {
0658:                                if (pp == null) { // Lazy create
0659:                                    pp = new ParsePosition(0);
0660:                                }
0661:                                pp.setIndex(pos);
0662:                                String name = parser.parseData.parseReference(
0663:                                        rule, pp, limit);
0664:                                if (name == null) {
0665:                                    // This means the '$' was not followed by a
0666:                                    // valid name.  Try to interpret it as an
0667:                                    // end anchor then.  If this also doesn't work
0668:                                    // (if we see a following character) then signal
0669:                                    // an error.
0670:                                    anchorEnd = true;
0671:                                    break;
0672:                                }
0673:                                pos = pp.getIndex();
0674:                                // If this is a variable definition statement,
0675:                                // then the LHS variable will be undefined.  In
0676:                                // that case appendVariableDef() will append the
0677:                                // special placeholder char variableLimit-1.
0678:                                varStart = buf.length();
0679:                                parser.appendVariableDef(name, buf);
0680:                                varLimit = buf.length();
0681:                            }
0682:                        }
0683:                            break;
0684:                        case DOT:
0685:                            buf.append(parser.getDotStandIn());
0686:                            break;
0687:                        case KLEENE_STAR:
0688:                        case ONE_OR_MORE:
0689:                        case ZERO_OR_ONE:
0690:                            // Quantifiers.  We handle single characters, quoted strings,
0691:                            // variable references, and segments.
0692:                            //  a+      matches  aaa
0693:                            //  'foo'+  matches  foofoofoo
0694:                            //  $v+     matches  xyxyxy if $v == xy
0695:                            //  (seg)+  matches  segsegseg
0696:                        {
0697:                            if (isSegment && buf.length() == bufStart) {
0698:                                // The */+ immediately follows '('
0699:                                syntaxError("Misplaced quantifier", rule, start);
0700:                                break;
0701:                            }
0702:
0703:                            int qstart, qlimit;
0704:                            // The */+ follows an isolated character or quote
0705:                            // or variable reference
0706:                            if (buf.length() == quoteLimit) {
0707:                                // The */+ follows a 'quoted string'
0708:                                qstart = quoteStart;
0709:                                qlimit = quoteLimit;
0710:                            } else if (buf.length() == varLimit) {
0711:                                // The */+ follows a $variableReference
0712:                                qstart = varStart;
0713:                                qlimit = varLimit;
0714:                            } else {
0715:                                // The */+ follows a single character, possibly
0716:                                // a segment standin
0717:                                qstart = buf.length() - 1;
0718:                                qlimit = qstart + 1;
0719:                            }
0720:
0721:                            UnicodeMatcher m;
0722:                            try {
0723:                                m = new StringMatcher(buf.toString(), qstart,
0724:                                        qlimit, 0, parser.curData);
0725:                            } catch (RuntimeException e) {
0726:                                throw new IllegalArgumentException(
0727:                                        "Failure in rule: "
0728:                                                + rule.substring(pos, limit));
0729:                            }
0730:                            int min = 0;
0731:                            int max = Quantifier.MAX;
0732:                            switch (c) {
0733:                            case ONE_OR_MORE:
0734:                                min = 1;
0735:                                break;
0736:                            case ZERO_OR_ONE:
0737:                                min = 0;
0738:                                max = 1;
0739:                                break;
0740:                            // case KLEENE_STAR:
0741:                            //    do nothing -- min, max already set
0742:                            }
0743:                            m = new Quantifier(m, min, max);
0744:                            buf.setLength(qstart);
0745:                            buf.append(parser.generateStandInFor(m));
0746:                        }
0747:                            break;
0748:
0749:                        //------------------------------------------------------
0750:                        // Elements allowed ONLY WITHIN segments
0751:                        //------------------------------------------------------
0752:                        case SEGMENT_CLOSE:
0753:                            // assert(isSegment);
0754:                            // We're done parsing a segment.
0755:                            break main;
0756:
0757:                        //------------------------------------------------------
0758:                        // Elements allowed ONLY OUTSIDE segments
0759:                        //------------------------------------------------------
0760:                        case CONTEXT_ANTE:
0761:                            if (ante >= 0) {
0762:                                syntaxError("Multiple ante contexts", rule,
0763:                                        start);
0764:                            }
0765:                            ante = buf.length();
0766:                            break;
0767:                        case CONTEXT_POST:
0768:                            if (post >= 0) {
0769:                                syntaxError("Multiple post contexts", rule,
0770:                                        start);
0771:                            }
0772:                            post = buf.length();
0773:                            break;
0774:                        case CURSOR_POS:
0775:                            if (cursor >= 0) {
0776:                                syntaxError("Multiple cursors", rule, start);
0777:                            }
0778:                            cursor = buf.length();
0779:                            break;
0780:                        case CURSOR_OFFSET:
0781:                            if (cursorOffset < 0) {
0782:                                if (buf.length() > 0) {
0783:                                    syntaxError("Misplaced " + c, rule, start);
0784:                                }
0785:                                --cursorOffset;
0786:                            } else if (cursorOffset > 0) {
0787:                                if (buf.length() != cursorOffsetPos
0788:                                        || cursor >= 0) {
0789:                                    syntaxError("Misplaced " + c, rule, start);
0790:                                }
0791:                                ++cursorOffset;
0792:                            } else {
0793:                                if (cursor == 0 && buf.length() == 0) {
0794:                                    cursorOffset = -1;
0795:                                } else if (cursor < 0) {
0796:                                    cursorOffsetPos = buf.length();
0797:                                    cursorOffset = 1;
0798:                                } else {
0799:                                    syntaxError("Misplaced " + c, rule, start);
0800:                                }
0801:                            }
0802:                            break;
0803:
0804:                        //------------------------------------------------------
0805:                        // Non-special characters
0806:                        //------------------------------------------------------
0807:                        default:
0808:                            // Disallow unquoted characters other than [0-9A-Za-z]
0809:                            // in the printable ASCII range.  These characters are
0810:                            // reserved for possible future use.
0811:                            if (c >= 0x0021
0812:                                    && c <= 0x007E
0813:                                    && !((c >= '0' && c <= '9')
0814:                                            || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
0815:                                syntaxError("Unquoted " + c, rule, start);
0816:                            }
0817:                            buf.append(c);
0818:                            break;
0819:                        }
0820:                    }
0821:                    return pos;
0822:                }
0823:
0824:                /**
0825:                 * Remove context.
0826:                 */
0827:                void removeContext() {
0828:                    text = text.substring(ante < 0 ? 0 : ante, post < 0 ? text
0829:                            .length() : post);
0830:                    ante = post = -1;
0831:                    anchorStart = anchorEnd = false;
0832:                }
0833:
0834:                /**
0835:                 * Return true if this half looks like valid output, that is, every
0836:                 * code point of text passes parser.parseData.isReplacer() (no
0837:                 * quantifiers or other input-only elements).  Empty text is valid.
0838:                 */
0839:                public boolean isValidOutput(TransliteratorParser parser) {
0840:                    boolean allReplacers = true;
0841:                    for (int at = 0; allReplacers && at < text.length();) {
0842:                        final int cp = UTF16.charAt(text, at);
0843:                        at += UTF16.getCharCount(cp); // advance by the length of cp
0844:                        allReplacers = parser.parseData.isReplacer(cp);
0845:                    }
0846:                    return allReplacers;
0847:                }
0848:
0849:                /**
0850:                 * Return true if this half looks like valid input, that is, every
0851:                 * code point of text passes parser.parseData.isMatcher() (no
0852:                 * functions or other output-only elements).  Empty text is valid.
0853:                 */
0854:                public boolean isValidInput(TransliteratorParser parser) {
0855:                    boolean allMatchers = true;
0856:                    for (int at = 0; allMatchers && at < text.length();) {
0857:                        final int cp = UTF16.charAt(text, at);
0858:                        at += UTF16.getCharCount(cp); // advance by the length of cp
0859:                        allMatchers = parser.parseData.isMatcher(cp);
0860:                    }
0861:                    return allMatchers;
0862:                }
0863:            }
0864:
0865:            //----------------------------------------------------------------------
0866:            // PUBLIC methods
0867:            //----------------------------------------------------------------------
0868:
0869:            /**
0870:             * Constructor.  The body is empty; parseRules() assigns the parse state.
0871:             */
0872:            public TransliteratorParser() {
0873:            }
0874:
0875:            /**
0876:             * Parse rules held in a single string; afterwards, examine the public data members.
0877:             */
0878:            public void parse(String rules, int direction) {
0879:                final String[] soleEntry = { rules };
0880:                parseRules(new RuleArray(soleEntry), direction);
0881:            }
0882:
0883:            /**
0884:             * Parse rules supplied by a ResourceReader; afterwards, examine the public data members.
0885:             */
0886:            public void parse(ResourceReader rules, int direction) {
0887:                final RuleBody body = new RuleReader(rules);
0888:                parseRules(body, direction);
0889:            }
0890:
0891:            //----------------------------------------------------------------------
0892:            // PRIVATE methods
0893:            //----------------------------------------------------------------------
0894:
0895:            /**
0896:             * Parse an array of zero or more rules.  The strings in the array are
0897:             * treated as if they were concatenated together, with rule terminators
0898:             * inserted between array elements if not present already.
0899:             *
0900:             * Any previous results are discarded: dataVector, idBlockVector,
0901:             * compoundFilter and the variable tables are reset on entry.
0902:             * A syntax error does not stop the parse at once; error messages are
0903:             * collected and thrown together after parsing ends.
0904:             *
0905:             * @param ruleArray source of the rule lines; reset() is called on it first
0906:             * @param dir parse direction, compared against Transliterator.FORWARD
0907:             * @exception IllegalArgumentException if there is a syntax error in the
0908:             * rules; the message holds every collected error, one per line
0909:             */
0910:            void parseRules(RuleBody ruleArray, int dir) {
0911:                boolean parsingIDs = true;
0912:                int ruleCount = 0;
0913:
0914:                dataVector = new Vector();
0915:                idBlockVector = new Vector();
0916:                curData = null;
0917:                direction = dir;
0918:                compoundFilter = null;
0919:                variablesVector = new Vector();
0920:                variableNames = new Hashtable();
0921:                parseData = new ParseData();
0922:
0923:                StringBuffer errors = null;
0924:                int errorCount = 0;
0925:
0926:                ruleArray.reset();
0927:
0928:                StringBuffer idBlockResult = new StringBuffer();
0929:
0930:                // compoundFilterOffset remembers the value of ruleCount at the
0931:                // moment a global filter was accepted, i.e. the 1-based ordinal
0932:                // of that statement.  It is validated after the loop: the filter
0933:                // must be the first statement when parsing FORWARD and the last
0934:                // counted statement when parsing REVERSE.
0935:                int compoundFilterOffset = -1;
0936:
0937:                main: for (;;) {
0938:                    String rule = ruleArray.nextLine();
0939:                    if (rule == null) {
0940:                        break;
0941:                    }
0942:                    int pos = 0;
0943:                    int limit = rule.length();
0944:                    while (pos < limit) {
0945:                        char c = rule.charAt(pos++);
0946:                        if (UCharacterProperty.isRuleWhiteSpace(c)) {
0947:                            continue;
0948:                        }
0949:                        // Skip lines starting with the comment character
0950:                        if (c == RULE_COMMENT_CHAR) {
0951:                            pos = rule.indexOf("\n", pos) + 1;
0952:                            if (pos == 0) {
0953:                                break; // No "\n" found; rest of rule is a comment
0954:                            }
0955:                            continue; // Either fall out or restart with next line
0956:                        }
0957:
0958:                        // skip empty rules
0959:                        if (c == END_OF_RULE) {
0960:                            continue;
0961:                        }
0962:
0963:                        // Often a rule file contains multiple errors.  It's
0964:                        // convenient to the rule author if these are all reported
0965:                        // at once.  We keep parsing rules even after a failure, up
0966:                        // to a specified limit, and report all errors at once.
0967:                        try {
0968:                            ++ruleCount;
0969:
0970:                            // We've found the start of a rule or ID.  c is its first
0971:                            // character, and pos points past c.
0972:                            --pos;
0973:                            // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
0974:                            // chars left.
0975:                            if ((pos + ID_TOKEN_LEN + 1) <= limit
0976:                                    && rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
0977:                                pos += ID_TOKEN_LEN;
0978:                                // Skip white space after the token, testing the bound
0979:                                // before each read: white space running to the end of
0980:                                // the rule must not index past the last character.
0981:                                while (pos < limit
0982:                                        && UCharacterProperty.isRuleWhiteSpace(rule.charAt(pos))) {
0983:                                    ++pos;
0984:                                }
0985:                                int[] p = new int[] { pos };
0986:
0987:                                if (!parsingIDs) {
0988:                                    if (curData != null) {
0989:                                        if (direction == Transliterator.FORWARD) {
0990:                                            dataVector.add(curData);
0991:                                        } else {
0992:                                            dataVector.insertElementAt(curData, 0);
0993:                                        }
0994:                                        curData = null;
0995:                                    }
0996:                                    parsingIDs = true;
0997:                                }
0998:
0999:                                TransliteratorIDParser.SingleID id = TransliteratorIDParser
1000:                                        .parseSingleID(rule, p, direction);
1001:                                if (p[0] != pos
1002:                                        && Utility.parseChar(rule, p, END_OF_RULE)) {
1003:                                    // Successful ::ID parse.
1004:                                    if (direction == Transliterator.FORWARD) {
1005:                                        idBlockResult.append(id.canonID).append(END_OF_RULE);
1006:                                    } else {
1007:                                        idBlockResult.insert(0, id.canonID + END_OF_RULE);
1008:                                    }
1009:                                } else {
1010:                                    // Couldn't parse an ID.  Try to parse a global filter
1011:                                    int[] withParens = new int[] { -1 };
1012:                                    UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(
1013:                                            rule, p, direction, withParens, null);
1014:                                    if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
1015:                                        // Accept the filter only when withParens[0] == 0 while
1016:                                        // parsing FORWARD, or != 0 otherwise; else ignore it.
1017:                                        if ((direction == Transliterator.FORWARD) == (withParens[0] == 0)) {
1018:                                            if (compoundFilter != null) {
1019:                                                // Multiple compound filters
1020:                                                syntaxError("Multiple global filters", rule, pos);
1021:                                            }
1022:                                            compoundFilter = f;
1023:                                            compoundFilterOffset = ruleCount;
1024:                                        }
1025:                                    } else {
1026:                                        // Invalid ::id
1027:                                        // Can be parsed as neither an ID nor a global filter
1028:                                        syntaxError("Invalid ::ID", rule, pos);
1029:                                    }
1030:                                }
1031:
1032:                                pos = p[0];
1033:                            } else {
1034:                                if (parsingIDs) {
1035:                                    // First non-ID statement after an ID block: store the
1036:                                    // block text (it may be empty) and open a new data set.
1037:                                    if (direction == Transliterator.FORWARD) {
1038:                                        idBlockVector.add(idBlockResult.toString());
1039:                                    } else {
1040:                                        idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1041:                                    }
1042:                                    idBlockResult.delete(0, idBlockResult.length());
1043:                                    parsingIDs = false;
1044:                                    curData = new RuleBasedTransliterator.Data();
1045:
1046:                                    // By default, rules use part of the private use area
1047:                                    // E000..F8FF for variables and other stand-ins.  Currently
1048:                                    // the range F000..F8FF is typically sufficient.  The 'use
1049:                                    // variable range' pragma allows rule sets to modify this.
1050:                                    setVariableRange(0xF000, 0xF8FF);
1051:                                }
1052:
1053:                                if (resemblesPragma(rule, pos, limit)) {
1054:                                    int ppp = parsePragma(rule, pos, limit);
1055:                                    if (ppp < 0) {
1056:                                        syntaxError("Unrecognized pragma", rule, pos);
1057:                                    }
1058:                                    pos = ppp;
1059:                                } else {
1060:                                    // Parse a rule
1061:                                    pos = parseRule(rule, pos, limit);
1062:                                }
1063:                            }
1064:                        } catch (IllegalArgumentException e) {
1065:                            // Past the error limit: note it and stop.  errors cannot be
1066:                            // null here because the first recorded error created it.
1067:                            if (errorCount == 30) {
1068:                                errors.append("\nMore than 30 errors; further messages squelched");
1069:                                break main;
1070:                            }
1071:                            if (errors == null) {
1072:                                errors = new StringBuffer(e.getMessage());
1073:                            } else {
1074:                                errors.append("\n").append(e.getMessage());
1075:                            }
1076:                            ++errorCount;
1077:                            pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1078:                        }
1079:                    }
1080:                }
1081:
1082:                if (parsingIDs && idBlockResult.length() > 0) {
1083:                    // Input ended inside an ID block: store what was gathered.
1084:                    if (direction == Transliterator.FORWARD) {
1085:                        idBlockVector.add(idBlockResult.toString());
1086:                    } else {
1087:                        idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1088:                    }
1089:                } else if (!parsingIDs && curData != null) {
1090:                    // Input ended inside a rule set: store its data.
1091:                    if (direction == Transliterator.FORWARD) {
1092:                        dataVector.add(curData);
1093:                    } else {
1094:                        dataVector.insertElementAt(curData, 0);
1095:                    }
1096:                }
1097:
1098:                // Give each data set its own copy of the variable objects and names
1099:                for (int i = 0; i < dataVector.size(); i++) {
1100:                    RuleBasedTransliterator.Data data =
1101:                            (RuleBasedTransliterator.Data) dataVector.get(i);
1102:                    data.variables = new Object[variablesVector.size()];
1103:                    variablesVector.copyInto(data.variables);
1104:                    data.variableNames = new Hashtable();
1105:                    data.variableNames.putAll(variableNames);
1106:                }
1107:                variablesVector = null;
1108:
1109:                // Do more syntax checking and index the rules
1110:                try {
1111:                    if (compoundFilter != null) {
1112:                        // The global filter must have been statement number 1
1113:                        // (FORWARD) or the last counted statement (REVERSE).
1114:                        if ((direction == Transliterator.FORWARD && compoundFilterOffset != 1)
1115:                                || (direction == Transliterator.REVERSE
1116:                                        && compoundFilterOffset != ruleCount)) {
1117:                            throw new IllegalArgumentException("Compound filters misplaced");
1118:                        }
1119:                    }
1120:
1121:                    for (int i = 0; i < dataVector.size(); i++) {
1122:                        RuleBasedTransliterator.Data data =
1123:                                (RuleBasedTransliterator.Data) dataVector.get(i);
1124:                        data.ruleSet.freeze();
1125:                    }
1126:
1127:                    // A list holding just one empty ID block is reduced to an empty list
1128:                    if (idBlockVector.size() == 1
1129:                            && ((String) idBlockVector.get(0)).length() == 0) {
1130:                        idBlockVector.remove(0);
1131:                    }
1132:                } catch (IllegalArgumentException e) {
1133:                    if (errors == null) {
1134:                        errors = new StringBuffer(e.getMessage());
1135:                    } else {
1136:                        errors.append("\n").append(e.getMessage());
1137:                    }
1138:                }
1139:
1140:                if (errors != null) {
1141:                    throw new IllegalArgumentException(errors.toString());
1142:                }
1143:            }
1144:
1145:            /**
1146:             * MAIN PARSER.  Parse the next rule in the given rule string, starting
1147:             * at pos.  Return the index after the last character parsed.  Do not
1148:             * parse characters at or after limit.
1149:             *
1150:             * Important:  The character at pos must be a non-whitespace character
1151:             * that is not the comment character.
1152:             *
1153:             * This method handles quoting, escaping, and whitespace removal.  It
1154:             * parses the end-of-rule character.  It recognizes context and cursor
1155:             * indicators.  Once it does a lexical breakdown of the rule at pos, it
1156:             * creates a rule object and adds it to our rule list.
1157:             *
1158:             * This method is tightly coupled to the inner class RuleHalf.
1159:             */
1160:            private int parseRule(String rule, int pos, int limit) {
1161:                // Locate the left side, operator, and right side
1162:                int start = pos;
1163:                char operator = 0;
1164:
1165:                // Set up segments data
1166:                segmentStandins = new StringBuffer();
1167:                segmentObjects = new Vector();
1168:
1169:                RuleHalf left = new RuleHalf();
1170:                RuleHalf right = new RuleHalf();
1171:
1172:                undefinedVariableName = null;
1173:                pos = left.parse(rule, pos, limit, this );
1174:
1175:                if (pos == limit
1176:                        || OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1177:                    syntaxError("No operator pos=" + pos, rule, start);
1178:                }
1179:                ++pos;
1180:
1181:                // Found an operator char.  Check for forward-reverse operator.
1182:                if (operator == REVERSE_RULE_OP
1183:                        && (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1184:                    ++pos;
1185:                    operator = FWDREV_RULE_OP;
1186:                }
1187:
1188:                // Translate alternate op characters.
1189:                switch (operator) {
1190:                case ALT_FORWARD_RULE_OP:
1191:                    operator = FORWARD_RULE_OP;
1192:                    break;
1193:                case ALT_REVERSE_RULE_OP:
1194:                    operator = REVERSE_RULE_OP;
1195:                    break;
1196:                case ALT_FWDREV_RULE_OP:
1197:                    operator = FWDREV_RULE_OP;
1198:                    break;
1199:                }
1200:
1201:                pos = right.parse(rule, pos, limit, this );
1202:
1203:                if (pos < limit) {
1204:                    if (rule.charAt(--pos) == END_OF_RULE) {
1205:                        ++pos;
1206:                    } else {
1207:                        // RuleHalf parser must have terminated at an operator
1208:                        syntaxError("Unquoted operator", rule, start);
1209:                    }
1210:                }
1211:
1212:                if (operator == VARIABLE_DEF_OP) {
1213:                    // LHS is the name.  RHS is a single character, either a literal
1214:                    // or a set (already parsed).  If RHS is longer than one
1215:                    // character, it is either a multi-character string, or multiple
1216:                    // sets, or a mixture of chars and sets -- syntax error.
1217:
1218:                    // We expect to see a single undefined variable (the one being
1219:                    // defined).
1220:                    if (undefinedVariableName == null) {
1221:                        syntaxError("Missing '$' or duplicate definition",
1222:                                rule, start);
1223:                    }
1224:                    if (left.text.length() != 1
1225:                            || left.text.charAt(0) != variableLimit) {
1226:                        syntaxError("Malformed LHS", rule, start);
1227:                    }
1228:                    if (left.anchorStart || left.anchorEnd || right.anchorStart
1229:                            || right.anchorEnd) {
1230:                        syntaxError("Malformed variable def", rule, start);
1231:                    }
1232:                    // We allow anything on the right, including an empty string.
1233:                    int n = right.text.length();
1234:                    char[] value = new char[n];
1235:                    right.text.getChars(0, n, value, 0);
1236:                    variableNames.put(undefinedVariableName, value);
1237:
1238:                    ++variableLimit;
1239:                    return pos;
1240:                }
1241:
1242:                // If this is not a variable definition rule, we shouldn't have
1243:                // any undefined variable names.
1244:                if (undefinedVariableName != null) {
1245:                    syntaxError("Undefined variable $" + undefinedVariableName,
1246:                            rule, start);
1247:                }
1248:
1249:                // Verify segments
1250:                if (segmentStandins.length() > segmentObjects.size()) {
1251:                    syntaxError("Undefined segment reference", rule, start);
1252:                }
1253:                for (int i = 0; i < segmentStandins.length(); ++i) {
1254:                    if (segmentStandins.charAt(i) == 0) {
1255:                        syntaxError("Internal error", rule, start); // will never happen
1256:                    }
1257:                }
1258:                for (int i = 0; i < segmentObjects.size(); ++i) {
1259:                    if (segmentObjects.elementAt(i) == null) {
1260:                        syntaxError("Internal error", rule, start); // will never happen
1261:                    }
1262:                }
1263:
1264:                // If the direction we want doesn't match the rule
1265:                // direction, do nothing.
1266:                if (operator != FWDREV_RULE_OP
1267:                        && ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1268:                    return pos;
1269:                }
1270:
1271:                // Transform the rule into a forward rule by swapping the
1272:                // sides if necessary.
1273:                if (direction == Transliterator.REVERSE) {
1274:                    RuleHalf temp = left;
1275:                    left = right;
1276:                    right = temp;
1277:                }
1278:
1279:                // Remove non-applicable elements in forward-reverse
1280:                // rules.  Bidirectional rules ignore elements that do not
1281:                // apply.
1282:                if (operator == FWDREV_RULE_OP) {
1283:                    right.removeContext();
1284:                    left.cursor = -1;
1285:                    left.cursorOffset = 0;
1286:                }
1287:
1288:                // Normalize context
1289:                if (left.ante < 0) {
1290:                    left.ante = 0;
1291:                }
1292:                if (left.post < 0) {
1293:                    left.post = left.text.length();
1294:                }
1295:
1296:                // Context is only allowed on the input side.  Cursors are only
1297:                // allowed on the output side.  Segment delimiters can only appear
1298:                // on the left, and references on the right.  Cursor offset
1299:                // cannot appear without an explicit cursor.  Cursor offset
1300:                // cannot place the cursor outside the limits of the context.
1301:                // Anchors are only allowed on the input side.
1302:                if (right.ante >= 0
1303:                        || right.post >= 0
1304:                        || left.cursor >= 0
1305:                        || (right.cursorOffset != 0 && right.cursor < 0)
1306:                        ||
1307:                        // - The following two checks were used to ensure that the
1308:                        // - the cursor offset stayed within the ante- or postcontext.
1309:                        // - However, with the addition of quantifiers, we have to
1310:                        // - allow arbitrary cursor offsets and do runtime checking.
1311:                        //(right.cursorOffset > (left.text.length() - left.post)) ||
1312:                        //(-right.cursorOffset > left.ante) ||
1313:                        right.anchorStart || right.anchorEnd
1314:                        || !left.isValidInput(this )
1315:                        || !right.isValidOutput(this ) || left.ante > left.post) {
1316:                    syntaxError("Malformed rule", rule, start);
1317:                }
1318:
1319:                // Flatten segment objects vector to an array
1320:                UnicodeMatcher[] segmentsArray = null;
1321:                if (segmentObjects.size() > 0) {
1322:                    segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1323:                    segmentObjects.toArray(segmentsArray);
1324:                }
1325:
1326:                curData.ruleSet.addRule(new TransliterationRule(left.text,
1327:                        left.ante, left.post, right.text, right.cursor,
1328:                        right.cursorOffset, segmentsArray, left.anchorStart,
1329:                        left.anchorEnd, curData));
1330:
1331:                return pos;
1332:            }
1333:
1334:            /**
1335:             * Set the variable range to [start, end] (inclusive).
1336:             */
1337:            private void setVariableRange(int start, int end) {
1338:                if (start > end || start < 0 || end > 0xFFFF) {
1339:                    throw new IllegalArgumentException(
1340:                            "Invalid variable range " + start + ", " + end);
1341:                }
1342:
1343:                curData.variablesBase = (char) start; // first private use
1344:
1345:                if (dataVector.size() == 0) {
1346:                    variableNext = (char) start;
1347:                    variableLimit = (char) (end + 1);
1348:                }
1349:            }
1350:
1351:            /**
1352:             * Assert that the given character is NOT within the variable range.
1353:             * If it is, signal an error.  This is neccesary to ensure that the
1354:             * variable range does not overlap characters used in a rule.
1355:             */
1356:            private void checkVariableRange(int ch, String rule, int start) {
1357:                if (ch >= curData.variablesBase && ch < variableLimit) {
1358:                    syntaxError("Variable range character in rule", rule, start);
1359:                }
1360:            }
1361:
1362:            // (The following method is part of an unimplemented feature.
1363:            // Remove this clover pragma after the feature is implemented.
1364:            // 2003-06-11 ICU 2.6 Alan)
1365:            ///CLOVER:OFF
1366:            /**
1367:             * Set the maximum backup to 'backup', in response to a pragma
1368:             * statement.
1369:             */
1370:            private void pragmaMaximumBackup(int backup) {
1371:                //TODO Finish
1372:                throw new IllegalArgumentException(
1373:                        "use maximum backup pragma not implemented yet");
1374:            }
1375:
1376:            ///CLOVER:ON
1377:
1378:            // (The following method is part of an unimplemented feature.
1379:            // Remove this clover pragma after the feature is implemented.
1380:            // 2003-06-11 ICU 2.6 Alan)
1381:            ///CLOVER:OFF
1382:            /**
1383:             * Begin normalizing all rules using the given mode, in response
1384:             * to a pragma statement.
1385:             */
1386:            private void pragmaNormalizeRules(Normalizer.Mode mode) {
1387:                //TODO Finish
1388:                throw new IllegalArgumentException(
1389:                        "use normalize rules pragma not implemented yet");
1390:            }
1391:
1392:            ///CLOVER:ON
1393:
1394:            /**
1395:             * Return true if the given rule looks like a pragma.
1396:             * @param pos offset to the first non-whitespace character
1397:             * of the rule.
1398:             * @param limit pointer past the last character of the rule.
1399:             */
1400:            static boolean resemblesPragma(String rule, int pos, int limit) {
1401:                // Must start with /use\s/i
1402:                return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1403:            }
1404:
1405:            /**
1406:             * Parse a pragma.  This method assumes resemblesPragma() has
1407:             * already returned true.
1408:             * @param pos offset to the first non-whitespace character
1409:             * of the rule.
1410:             * @param limit pointer past the last character of the rule.
1411:             * @return the position index after the final ';' of the pragma,
1412:             * or -1 on failure.
1413:             */
1414:            private int parsePragma(String rule, int pos, int limit) {
1415:                int[] array = new int[2];
1416:
1417:                // resemblesPragma() has already returned true, so we
1418:                // know that pos points to /use\s/i; we can skip 4 characters
1419:                // immediately
1420:                pos += 4;
1421:
1422:                // Here are the pragmas we recognize:
1423:                // use variable range 0xE000 0xEFFF;
1424:                // use maximum backup 16;
1425:                // use nfd rules;
1426:                int p = Utility.parsePattern(rule, pos, limit,
1427:                        "~variable range # #~;", array);
1428:                if (p >= 0) {
1429:                    setVariableRange(array[0], array[1]);
1430:                    return p;
1431:                }
1432:
1433:                p = Utility.parsePattern(rule, pos, limit,
1434:                        "~maximum backup #~;", array);
1435:                if (p >= 0) {
1436:                    pragmaMaximumBackup(array[0]);
1437:                    return p;
1438:                }
1439:
1440:                p = Utility
1441:                        .parsePattern(rule, pos, limit, "~nfd rules~;", null);
1442:                if (p >= 0) {
1443:                    pragmaNormalizeRules(Normalizer.NFD);
1444:                    return p;
1445:                }
1446:
1447:                p = Utility
1448:                        .parsePattern(rule, pos, limit, "~nfc rules~;", null);
1449:                if (p >= 0) {
1450:                    pragmaNormalizeRules(Normalizer.NFC);
1451:                    return p;
1452:                }
1453:
1454:                // Syntax error: unable to parse pragma
1455:                return -1;
1456:            }
1457:
1458:            /**
1459:             * Throw an exception indicating a syntax error.  Search the rule string
1460:             * for the probable end of the rule.  Of course, if the error is that
1461:             * the end of rule marker is missing, then the rule end will not be found.
1462:             * In any case the rule start will be correctly reported.
1463:             * @param msg error description
1464:             * @param rule pattern string
1465:             * @param start position of first character of current rule
1466:             */
1467:            static final void syntaxError(String msg, String rule, int start) {
1468:                int end = ruleEnd(rule, start, rule.length());
1469:                throw new IllegalArgumentException(msg + " in \""
1470:                        + Utility.escape(rule.substring(start, end)) + '"');
1471:            }
1472:
1473:            static final int ruleEnd(String rule, int start, int limit) {
1474:                int end = Utility.quotedIndexOf(rule, start, limit, ";");
1475:                if (end < 0) {
1476:                    end = limit;
1477:                }
1478:                return end;
1479:            }
1480:
1481:            /**
1482:             * Parse a UnicodeSet out, store it, and return the stand-in character
1483:             * used to represent it.
1484:             */
1485:            private final char parseSet(String rule, ParsePosition pos) {
1486:                UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1487:                if (variableNext >= variableLimit) {
1488:                    throw new RuntimeException(
1489:                            "Private use variables exhausted");
1490:                }
1491:                set.compact();
1492:                return generateStandInFor(set);
1493:            }
1494:
1495:            /**
1496:             * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1497:             * Store the object.
1498:             */
1499:            char generateStandInFor(Object obj) {
1500:                // assert(obj != null);
1501:
1502:                // Look up previous stand-in, if any.  This is a short list
1503:                // (typical n is 0, 1, or 2); linear search is optimal.
1504:                for (int i = 0; i < variablesVector.size(); ++i) {
1505:                    if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison
1506:                        return (char) (curData.variablesBase + i);
1507:                    }
1508:                }
1509:
1510:                if (variableNext >= variableLimit) {
1511:                    throw new RuntimeException("Variable range exhausted");
1512:                }
1513:                variablesVector.addElement(obj);
1514:                return variableNext++;
1515:            }
1516:
1517:            /**
1518:             * Return the standin for segment seg (1-based).
1519:             */
1520:            public char getSegmentStandin(int seg) {
1521:                if (segmentStandins.length() < seg) {
1522:                    segmentStandins.setLength(seg);
1523:                }
1524:                char c = segmentStandins.charAt(seg - 1);
1525:                if (c == 0) {
1526:                    if (variableNext >= variableLimit) {
1527:                        throw new RuntimeException("Variable range exhausted");
1528:                    }
1529:                    c = variableNext++;
1530:                    // Set a placeholder in the master variables vector that will be
1531:                    // filled in later by setSegmentObject().  We know that we will get
1532:                    // called first because setSegmentObject() will call us.
1533:                    variablesVector.addElement(null);
1534:                    segmentStandins.setCharAt(seg - 1, c);
1535:                }
1536:                return c;
1537:            }
1538:
1539:            /**
1540:             * Set the object for segment seg (1-based).
1541:             */
1542:            public void setSegmentObject(int seg, StringMatcher obj) {
1543:                // Since we call parseSection() recursively, nested
1544:                // segments will result in segment i+1 getting parsed
1545:                // and stored before segment i; be careful with the
1546:                // vector handling here.
1547:                if (segmentObjects.size() < seg) {
1548:                    segmentObjects.setSize(seg);
1549:                }
1550:                int index = getSegmentStandin(seg) - curData.variablesBase;
1551:                if (segmentObjects.elementAt(seg - 1) != null
1552:                        || variablesVector.elementAt(index) != null) {
1553:                    throw new RuntimeException(); // should never happen
1554:                }
1555:                segmentObjects.setElementAt(obj, seg - 1);
1556:                variablesVector.setElementAt(obj, index);
1557:            }
1558:
1559:            /**
1560:             * Return the stand-in for the dot set.  It is allocated the first
1561:             * time and reused thereafter.
1562:             */
1563:            char getDotStandIn() {
1564:                if (dotStandIn == -1) {
1565:                    dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1566:                }
1567:                return (char) dotStandIn;
1568:            }
1569:
1570:            /**
1571:             * Append the value of the given variable name to the given
1572:             * StringBuffer.
1573:             * @exception IllegalArgumentException if the name is unknown.
1574:             */
1575:            private void appendVariableDef(String name, StringBuffer buf) {
1576:                char[] ch = (char[]) variableNames.get(name);
1577:                if (ch == null) {
1578:                    // We allow one undefined variable so that variable definition
1579:                    // statements work.  For the first undefined variable we return
1580:                    // the special placeholder variableLimit-1, and save the variable
1581:                    // name.
1582:                    if (undefinedVariableName == null) {
1583:                        undefinedVariableName = name;
1584:                        if (variableNext >= variableLimit) {
1585:                            throw new RuntimeException(
1586:                                    "Private use variables exhausted");
1587:                        }
1588:                        buf.append((char) --variableLimit);
1589:                    } else {
1590:                        throw new IllegalArgumentException(
1591:                                "Undefined variable $" + name);
1592:                    }
1593:                } else {
1594:                    buf.append(ch);
1595:                }
1596:            }
1597:        }
1598:
1599:        //eof
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.