0001: /*
0002: **********************************************************************
0003: * Copyright (c) 2001-2006, International Business Machines
0004: * Corporation and others. All Rights Reserved.
0005: **********************************************************************
0006: */
0007: package com.ibm.icu.text;
0008:
0009: import com.ibm.icu.impl.data.ResourceReader;
0010: import com.ibm.icu.impl.Utility;
0011: import java.util.Vector;
0012: import java.util.Hashtable;
0013: import java.text.ParsePosition;
0014: import com.ibm.icu.lang.*;
0015: import com.ibm.icu.impl.UCharacterProperty;
0016:
0017: class TransliteratorParser {
0018:
0019: //----------------------------------------------------------------------
0020: // Data members
0021: //----------------------------------------------------------------------
0022:
0023: /**
0024: * PUBLIC data member.
0025: * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
0026: * of rules in the rule set. A fresh Vector is assigned on every call to parseRules().
0027: */
0028: public Vector dataVector;
0029:
0030: /**
0031: * PUBLIC data member.
0032: * A Vector of Strings containing all of the ID blocks in the rule set. A fresh Vector is assigned on every call to parseRules().
0033: */
0034: public Vector idBlockVector;
0035:
0036: /**
0037: * The current data object for which we are parsing rules. Reset to null at the start of parseRules().
0038: */
0039: private RuleBasedTransliterator.Data curData;
0040:
0041: /**
0042: * PUBLIC data member containing the parsed compound filter, if any. Reset to null at the start of parseRules().
0043: */
0044: public UnicodeSet compoundFilter;
0045:
0046: private int direction; // copied from the dir argument of parseRules(); compared there against Transliterator.FORWARD
0047:
0048: /**
0049: * Temporary symbol table used during parsing. A new ParseData is created by each call to parseRules().
0050: */
0051: private ParseData parseData;
0052:
0053: /**
0054: * Temporary vector of set variables. When parsing is complete, this
0055: * is copied into the array data.variables. As with data.variables,
0056: * element 0 corresponds to character data.variablesBase. ParseData indexes it with (ch - curData.variablesBase).
0057: */
0058: private Vector variablesVector;
0059:
0060: /**
0061: * Temporary table of variable names. When parsing is complete, this is
0062: * copied into data.variableNames. ParseData.lookup() casts the stored values to char[].
0063: */
0064: private Hashtable variableNames;
0065:
0066: /**
0067: * String of standins for segments. Used during the parsing of a single
0068: * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
0069: * to StringMatcher object segmentObjects.elementAt(0), etc.
0070: */
0071: private StringBuffer segmentStandins;
0072:
0073: /**
0074: * Vector of StringMatcher objects for segments. Used during the
0075: * parsing of a single rule.
0076: * segmentStandins.charAt(0) is the standin for "$1" and corresponds
0077: * to StringMatcher object segmentObjects.elementAt(0), etc.
0078: */
0079: private Vector segmentObjects;
0080:
0081: /**
0082: * The next available stand-in for variables. This starts at some point in
0083: * the private use area (discovered dynamically) and increments up toward
0084: * <code>variableLimit</code>. At any point during parsing, available
0085: * variables are <code>variableNext..variableLimit-1</code>.
0086: */
0087: private char variableNext;
0088:
0089: /**
0090: * The last available stand-in for variables. This is discovered
0091: * dynamically. At any point during parsing, available variables are
0092: * <code>variableNext..variableLimit-1</code>. During variable definition
0093: * we use the special value variableLimit-1 as a placeholder.
0094: */
0095: private char variableLimit;
0096:
0097: /**
0098: * When we encounter an undefined variable, we do not immediately signal
0099: * an error, in case we are defining this variable, e.g., "$a = [a-z];".
0100: * Instead, we save the name of the undefined variable, and substitute
0101: * in the placeholder char variableLimit - 1, and decrement
0102: * variableLimit.
0103: */
0104: private String undefinedVariableName;
0105:
0106: /**
0107: * The stand-in character for the 'dot' set, represented by '.' in
0108: * patterns. This is allocated the first time it is needed, and
0109: * reused thereafter.
0110: */
0111: private int dotStandIn = -1; // starts at -1; presumably meaning "not yet allocated" (the allocating code is not in this part of the file)
0112:
0113: //----------------------------------------------------------------------
0114: // Constants
0115: //----------------------------------------------------------------------
0116:
0117: // Indicator for ID blocks
0118: private static final String ID_TOKEN = "::";
0119: private static final int ID_TOKEN_LEN = 2;
0120:
0121: /*
0122: (reserved for future expansion)
0123: // markers for beginning and end of rule groups
0124: private static final String BEGIN_TOKEN = "BEGIN";
0125: private static final String END_TOKEN = "END";
0126: */
0127:
0128: // Operators
0129: private static final char VARIABLE_DEF_OP = '=';
0130: private static final char FORWARD_RULE_OP = '>';
0131: private static final char REVERSE_RULE_OP = '<';
0132: private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
0133:
0134: private static final String OPERATORS = "=><\u2190\u2192\u2194";
0135: private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
0136:
0137: // Other special characters
0138: private static final char QUOTE = '\'';
0139: private static final char ESCAPE = '\\';
0140: private static final char END_OF_RULE = ';';
0141: private static final char RULE_COMMENT_CHAR = '#';
0142:
0143: private static final char CONTEXT_ANTE = '{'; // ante{key
0144: private static final char CONTEXT_POST = '}'; // key}post
0145: private static final char CURSOR_POS = '|';
0146: private static final char CURSOR_OFFSET = '@';
0147: private static final char ANCHOR_START = '^';
0148:
0149: private static final char KLEENE_STAR = '*';
0150: private static final char ONE_OR_MORE = '+';
0151: private static final char ZERO_OR_ONE = '?';
0152:
0153: private static final char DOT = '.';
0154: private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";
0155:
0156: // By definition, the ANCHOR_END special character is a
0157: // trailing SymbolTable.SYMBOL_REF character.
0158: // private static final char ANCHOR_END = '$';
0159:
0160: // Segments of the input string are delimited by "(" and ")". In the
0161: // output string these segments are referenced as "$1", "$2", etc.
0162: private static final char SEGMENT_OPEN = '(';
0163: private static final char SEGMENT_CLOSE = ')';
0164:
0165: // A function is denoted &Source-Target/Variant(text)
0166: private static final char FUNCTION = '&';
0167:
0168: // Aliases for some of the syntax characters. These are provided so
0169: // transliteration rules can be expressed in XML without clashing with
0170: // XML syntax characters '<', '>', and '&'.
0171: private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
0172: private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
0173: private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
0174: private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)
0175:
0176: // Special characters disallowed at the top level
0177: private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
0178:
0179: // Special characters disallowed within a segment
0180: private static UnicodeSet ILLEGAL_SEG = new UnicodeSet(
0181: "[\\{\\}\\|\\@]");
0182:
0183: // Special characters disallowed within a function argument
0184: private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet(
0185: "[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
0186:
0187: //----------------------------------------------------------------------
0188: // class ParseData
0189: //----------------------------------------------------------------------
0190:
0191: /**
0192: * This class implements the SymbolTable interface. It is used
0193: * during parsing to give UnicodeSet access to variables that
0194: * have been defined so far. Note that it uses variablesVector,
0195: * _not_ data.variables.
0196: */
0197: private class ParseData implements SymbolTable {
0198:
0199: /**
0200: * Implement SymbolTable API.
0201: */
0202: public char[] lookup(String name) {
0203: return (char[]) variableNames.get(name);
0204: }
0205:
0206: /**
0207: * Implement SymbolTable API.
0208: */
0209: public UnicodeMatcher lookupMatcher(int ch) {
0210: // Note that we cannot use data.lookup() because the
0211: // set array has not been constructed yet.
0212: int i = ch - curData.variablesBase;
0213: if (i >= 0 && i < variablesVector.size()) {
0214: return (UnicodeMatcher) variablesVector.elementAt(i);
0215: }
0216: return null;
0217: }
0218:
0219: /**
0220: * Implement SymbolTable API. Parse out a symbol reference
0221: * name.
0222: */
0223: public String parseReference(String text, ParsePosition pos,
0224: int limit) {
0225: int start = pos.getIndex();
0226: int i = start;
0227: while (i < limit) {
0228: char c = text.charAt(i);
0229: if ((i == start && !Character
0230: .isUnicodeIdentifierStart(c))
0231: || !Character.isUnicodeIdentifierPart(c)) {
0232: break;
0233: }
0234: ++i;
0235: }
0236: if (i == start) { // No valid name chars
0237: return null;
0238: }
0239: pos.setIndex(i);
0240: return text.substring(start, i);
0241: }
0242:
0243: /**
0244: * Return true if the given character is a matcher standin or a plain
0245: * character (non standin).
0246: */
0247: public boolean isMatcher(int ch) {
0248: // Note that we cannot use data.lookup() because the
0249: // set array has not been constructed yet.
0250: int i = ch - curData.variablesBase;
0251: if (i >= 0 && i < variablesVector.size()) {
0252: return variablesVector.elementAt(i) instanceof UnicodeMatcher;
0253: }
0254: return true;
0255: }
0256:
0257: /**
0258: * Return true if the given character is a replacer standin or a plain
0259: * character (non standin).
0260: */
0261: public boolean isReplacer(int ch) {
0262: // Note that we cannot use data.lookup() because the
0263: // set array has not been constructed yet.
0264: int i = ch - curData.variablesBase;
0265: if (i >= 0 && i < variablesVector.size()) {
0266: return variablesVector.elementAt(i) instanceof UnicodeReplacer;
0267: }
0268: return true;
0269: }
0270: }
0271:
0272: //----------------------------------------------------------------------
0273: // classes RuleBody, RuleArray, and RuleReader
0274: //----------------------------------------------------------------------
0275:
0276: /**
0277: * A private abstract class representing the interface to rule
0278: * source code that is broken up into lines. Handles the
0279: * folding of lines terminated by a backslash. This folding
0280: * is limited; it does not account for comments, quotes, or
0281: * escapes, so its use to be limited.
0282: */
0283: private static abstract class RuleBody {
0284:
0285: /**
0286: * Retrieve the next line of the source, or return null if
0287: * none. Folds lines terminated by a backslash into the
0288: * next line, without regard for comments, quotes, or
0289: * escapes.
0290: */
0291: String nextLine() {
0292: String s = handleNextLine();
0293: if (s != null && s.length() > 0
0294: && s.charAt(s.length() - 1) == '\\') {
0295:
0296: StringBuffer b = new StringBuffer(s);
0297: do {
0298: b.deleteCharAt(b.length() - 1);
0299: s = handleNextLine();
0300: if (s == null) {
0301: break;
0302: }
0303: b.append(s);
0304: } while (s.length() > 0
0305: && s.charAt(s.length() - 1) == '\\');
0306:
0307: s = b.toString();
0308: }
0309: return s;
0310: }
0311:
0312: /**
0313: * Reset to the first line of the source.
0314: */
0315: abstract void reset();
0316:
0317: /**
0318: * Subclass method to return the next line of the source.
0319: */
0320: abstract String handleNextLine();
0321: }
0322:
0323: /**
0324: * RuleBody subclass for a String[] array.
0325: */
0326: private static class RuleArray extends RuleBody {
0327: String[] array;
0328: int i;
0329:
0330: public RuleArray(String[] array) {
0331: this .array = array;
0332: i = 0;
0333: }
0334:
0335: public String handleNextLine() {
0336: return (i < array.length) ? array[i++] : null;
0337: }
0338:
0339: public void reset() {
0340: i = 0;
0341: }
0342: }
0343:
0344: /**
0345: * RuleBody subclass for a ResourceReader.
0346: */
0347: private static class RuleReader extends RuleBody {
0348: ResourceReader reader;
0349:
0350: public RuleReader(ResourceReader reader) {
0351: this .reader = reader;
0352: }
0353:
0354: public String handleNextLine() {
0355: try {
0356: return reader.readLine();
0357: } catch (java.io.IOException e) {
0358: }
0359: return null;
0360: }
0361:
0362: public void reset() {
0363: reader.reset();
0364: }
0365: }
0366:
0367: //----------------------------------------------------------------------
0368: // class RuleHalf
0369: //----------------------------------------------------------------------
0370:
0371: /**
0372: * A class representing one side of a rule. This class knows how to
0373: * parse half of a rule. It is tightly coupled to the method
0374: * TransliteratorParser.parseRule().
0375: */
0376: private static class RuleHalf {
0377:
0378: public String text;
0379:
0380: public int cursor = -1; // position of cursor in text
0381: public int ante = -1; // position of ante context marker '{' in text
0382: public int post = -1; // position of post context marker '}' in text
0383:
0384: // Record the offset to the cursor either to the left or to the
0385: // right of the key. This is indicated by characters on the output
0386: // side that allow the cursor to be positioned arbitrarily within
0387: // the matching text. For example, abc{def} > | @@@ xyz; changes
0388: // def to xyz and moves the cursor to before abc. Offset characters
0389: // must be at the start or end, and they cannot move the cursor past
0390: // the ante- or postcontext text. Placeholders are only valid in
0391: // output text. The length of the ante and post context is
0392: // determined at runtime, because of supplementals and quantifiers.
0393: public int cursorOffset = 0; // only nonzero on output side
0394:
0395: // Position of first CURSOR_OFFSET on _right_. This will be -1
0396: // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
0397: private int cursorOffsetPos = 0;
0398:
0399: public boolean anchorStart = false;
0400: public boolean anchorEnd = false;
0401:
0402: /**
0403: * The segment number from 1..n of the next '(' we see
0404: * during parsing; 1-based.
0405: */
0406: private int nextSegmentNumber = 1;
0407:
0408: /**
0409: * Parse one side of a rule, stopping at either the limit,
0410: * the END_OF_RULE character, or an operator.
0411: * @return the index after the terminating character, or
0412: * if limit was reached, limit
0413: */
0414: public int parse(String rule, int pos, int limit,
0415: TransliteratorParser parser) {
0416: int start = pos;
0417: StringBuffer buf = new StringBuffer();
0418: pos = parseSection(rule, pos, limit, parser, buf,
0419: ILLEGAL_TOP, false);
0420: text = buf.toString();
0421:
0422: if (cursorOffset > 0 && cursor != cursorOffsetPos) {
0423: syntaxError("Misplaced " + CURSOR_POS, rule, start);
0424: }
0425:
0426: return pos;
0427: }
0428:
0429: /**
0430: * Parse a section of one side of a rule, stopping at either
0431: * the limit, the END_OF_RULE character, an operator, or a
0432: * segment close character. This method parses both a
0433: * top-level rule half and a segment within such a rule half.
0434: * It calls itself recursively to parse segments and nested
0435: * segments.
0436: * @param buf buffer into which to accumulate the rule pattern
0437: * characters, either literal characters from the rule or
0438: * standins for UnicodeMatcher objects including segments.
0439: * @param illegal the set of special characters that is illegal during
0440: * this parse.
0441: * @param isSegment if true, then we've already seen a '(' and
0442: * pos on entry points right after it. Accumulate everything
0443: * up to the closing ')', put it in a segment matcher object,
0444: * generate a standin for it, and add the standin to buf. As
0445: * a side effect, update the segments vector with a reference
0446: * to the segment matcher. This works recursively for nested
0447: * segments. If isSegment is false, just accumulate
0448: * characters into buf.
0449: * @return the index after the terminating character, or
0450: * if limit was reached, limit
0451: */
0452: private int parseSection(String rule, int pos, int limit,
0453: TransliteratorParser parser, StringBuffer buf,
0454: UnicodeSet illegal, boolean isSegment) {
0455: int start = pos;
0456: ParsePosition pp = null;
0457: int quoteStart = -1; // Most recent 'single quoted string'
0458: int quoteLimit = -1;
0459: int varStart = -1; // Most recent $variableReference
0460: int varLimit = -1;
0461: int[] iref = new int[1];
0462: int bufStart = buf.length();
0463:
0464: main: while (pos < limit) {
0465: // Since all syntax characters are in the BMP, fetching
0466: // 16-bit code units suffices here.
0467: char c = rule.charAt(pos++);
0468: if (UCharacterProperty.isRuleWhiteSpace(c)) {
0469: continue;
0470: }
0471: // HALF_ENDERS is all chars that end a rule half: "<>=;"
0472: if (HALF_ENDERS.indexOf(c) >= 0) {
0473: if (isSegment) {
0474: syntaxError("Unclosed segment", rule, start);
0475: }
0476: break main;
0477: }
0478: if (anchorEnd) {
0479: // Text after a presumed end anchor is a syntax err
0480: syntaxError("Malformed variable reference", rule,
0481: start);
0482: }
0483: if (UnicodeSet.resemblesPattern(rule, pos - 1)) {
0484: if (pp == null) {
0485: pp = new ParsePosition(0);
0486: }
0487: pp.setIndex(pos - 1); // Backup to opening '['
0488: buf.append(parser.parseSet(rule, pp));
0489: pos = pp.getIndex();
0490: continue;
0491: }
0492: // Handle escapes
0493: if (c == ESCAPE) {
0494: if (pos == limit) {
0495: syntaxError("Trailing backslash", rule, start);
0496: }
0497: iref[0] = pos;
0498: int escaped = Utility.unescapeAt(rule, iref);
0499: pos = iref[0];
0500: if (escaped == -1) {
0501: syntaxError("Malformed escape", rule, start);
0502: }
0503: parser.checkVariableRange(escaped, rule, start);
0504: UTF16.append(buf, escaped);
0505: continue;
0506: }
0507: // Handle quoted matter
0508: if (c == QUOTE) {
0509: int iq = rule.indexOf(QUOTE, pos);
0510: if (iq == pos) {
0511: buf.append(c); // Parse [''] outside quotes as [']
0512: ++pos;
0513: } else {
0514: /* This loop picks up a run of quoted text of the
0515: * form 'aaaa' each time through. If this run
0516: * hasn't really ended ('aaaa''bbbb') then it keeps
0517: * looping, each time adding on a new run. When it
0518: * reaches the final quote it breaks.
0519: */
0520: quoteStart = buf.length();
0521: for (;;) {
0522: if (iq < 0) {
0523: syntaxError("Unterminated quote", rule,
0524: start);
0525: }
0526: buf.append(rule.substring(pos, iq));
0527: pos = iq + 1;
0528: if (pos < limit
0529: && rule.charAt(pos) == QUOTE) {
0530: // Parse [''] inside quotes as [']
0531: iq = rule.indexOf(QUOTE, pos + 1);
0532: // Continue looping
0533: } else {
0534: break;
0535: }
0536: }
0537: quoteLimit = buf.length();
0538:
0539: for (iq = quoteStart; iq < quoteLimit; ++iq) {
0540: parser.checkVariableRange(buf.charAt(iq),
0541: rule, start);
0542: }
0543: }
0544: continue;
0545: }
0546:
0547: parser.checkVariableRange(c, rule, start);
0548:
0549: if (illegal.contains(c)) {
0550: syntaxError("Illegal character '" + c + '\'', rule,
0551: start);
0552: }
0553:
0554: switch (c) {
0555:
0556: //------------------------------------------------------
0557: // Elements allowed within and out of segments
0558: //------------------------------------------------------
0559: case ANCHOR_START:
0560: if (buf.length() == 0 && !anchorStart) {
0561: anchorStart = true;
0562: } else {
0563: syntaxError("Misplaced anchor start", rule,
0564: start);
0565: }
0566: break;
0567: case SEGMENT_OPEN: {
0568: // bufSegStart is the offset in buf to the first
0569: // character of the segment we are parsing.
0570: int bufSegStart = buf.length();
0571:
0572: // Record segment number now, since nextSegmentNumber
0573: // will be incremented during the call to parseSection
0574: // if there are nested segments.
0575: int segmentNumber = nextSegmentNumber++; // 1-based
0576:
0577: // Parse the segment
0578: pos = parseSection(rule, pos, limit, parser, buf,
0579: ILLEGAL_SEG, true);
0580:
0581: // After parsing a segment, the relevant characters are
0582: // in buf, starting at offset bufSegStart. Extract them
0583: // into a string matcher, and replace them with a
0584: // standin for that matcher.
0585: StringMatcher m = new StringMatcher(buf
0586: .substring(bufSegStart), segmentNumber,
0587: parser.curData);
0588:
0589: // Record and associate object and segment number
0590: parser.setSegmentObject(segmentNumber, m);
0591: buf.setLength(bufSegStart);
0592: buf.append(parser.getSegmentStandin(segmentNumber));
0593: }
0594: break;
0595: case FUNCTION:
0596: case ALT_FUNCTION: {
0597: iref[0] = pos;
0598: TransliteratorIDParser.SingleID single = TransliteratorIDParser
0599: .parseFilterID(rule, iref);
0600: // The next character MUST be a segment open
0601: if (single == null
0602: || !Utility.parseChar(rule, iref,
0603: SEGMENT_OPEN)) {
0604: syntaxError("Invalid function", rule, start);
0605: }
0606:
0607: Transliterator t = single.getInstance();
0608: if (t == null) {
0609: syntaxError("Invalid function ID", rule, start);
0610: }
0611:
0612: // bufSegStart is the offset in buf to the first
0613: // character of the segment we are parsing.
0614: int bufSegStart = buf.length();
0615:
0616: // Parse the segment
0617: pos = parseSection(rule, iref[0], limit, parser,
0618: buf, ILLEGAL_FUNC, true);
0619:
0620: // After parsing a segment, the relevant characters are
0621: // in buf, starting at offset bufSegStart.
0622: FunctionReplacer r = new FunctionReplacer(t,
0623: new StringReplacer(buf
0624: .substring(bufSegStart),
0625: parser.curData));
0626:
0627: // Replace the buffer contents with a stand-in
0628: buf.setLength(bufSegStart);
0629: buf.append(parser.generateStandInFor(r));
0630: }
0631: break;
0632: case SymbolTable.SYMBOL_REF:
0633: // Handle variable references and segment references "$1" .. "$9"
0634: {
0635: // A variable reference must be followed immediately
0636: // by a Unicode identifier start and zero or more
0637: // Unicode identifier part characters, or by a digit
0638: // 1..9 if it is a segment reference.
0639: if (pos == limit) {
0640: // A variable ref character at the end acts as
0641: // an anchor to the context limit, as in perl.
0642: anchorEnd = true;
0643: break;
0644: }
0645: // Parse "$1" "$2" .. "$9" .. (no upper limit)
0646: c = rule.charAt(pos);
0647: int r = UCharacter.digit(c, 10);
0648: if (r >= 1 && r <= 9) {
0649: iref[0] = pos;
0650: r = Utility.parseNumber(rule, iref, 10);
0651: if (r < 0) {
0652: syntaxError("Undefined segment reference",
0653: rule, start);
0654: }
0655: pos = iref[0];
0656: buf.append(parser.getSegmentStandin(r));
0657: } else {
0658: if (pp == null) { // Lazy create
0659: pp = new ParsePosition(0);
0660: }
0661: pp.setIndex(pos);
0662: String name = parser.parseData.parseReference(
0663: rule, pp, limit);
0664: if (name == null) {
0665: // This means the '$' was not followed by a
0666: // valid name. Try to interpret it as an
0667: // end anchor then. If this also doesn't work
0668: // (if we see a following character) then signal
0669: // an error.
0670: anchorEnd = true;
0671: break;
0672: }
0673: pos = pp.getIndex();
0674: // If this is a variable definition statement,
0675: // then the LHS variable will be undefined. In
0676: // that case appendVariableDef() will append the
0677: // special placeholder char variableLimit-1.
0678: varStart = buf.length();
0679: parser.appendVariableDef(name, buf);
0680: varLimit = buf.length();
0681: }
0682: }
0683: break;
0684: case DOT:
0685: buf.append(parser.getDotStandIn());
0686: break;
0687: case KLEENE_STAR:
0688: case ONE_OR_MORE:
0689: case ZERO_OR_ONE:
0690: // Quantifiers. We handle single characters, quoted strings,
0691: // variable references, and segments.
0692: // a+ matches aaa
0693: // 'foo'+ matches foofoofoo
0694: // $v+ matches xyxyxy if $v == xy
0695: // (seg)+ matches segsegseg
0696: {
0697: if (isSegment && buf.length() == bufStart) {
0698: // The */+ immediately follows '('
0699: syntaxError("Misplaced quantifier", rule, start);
0700: break;
0701: }
0702:
0703: int qstart, qlimit;
0704: // The */+ follows an isolated character or quote
0705: // or variable reference
0706: if (buf.length() == quoteLimit) {
0707: // The */+ follows a 'quoted string'
0708: qstart = quoteStart;
0709: qlimit = quoteLimit;
0710: } else if (buf.length() == varLimit) {
0711: // The */+ follows a $variableReference
0712: qstart = varStart;
0713: qlimit = varLimit;
0714: } else {
0715: // The */+ follows a single character, possibly
0716: // a segment standin
0717: qstart = buf.length() - 1;
0718: qlimit = qstart + 1;
0719: }
0720:
0721: UnicodeMatcher m;
0722: try {
0723: m = new StringMatcher(buf.toString(), qstart,
0724: qlimit, 0, parser.curData);
0725: } catch (RuntimeException e) {
0726: throw new IllegalArgumentException(
0727: "Failure in rule: "
0728: + rule.substring(pos, limit));
0729: }
0730: int min = 0;
0731: int max = Quantifier.MAX;
0732: switch (c) {
0733: case ONE_OR_MORE:
0734: min = 1;
0735: break;
0736: case ZERO_OR_ONE:
0737: min = 0;
0738: max = 1;
0739: break;
0740: // case KLEENE_STAR:
0741: // do nothing -- min, max already set
0742: }
0743: m = new Quantifier(m, min, max);
0744: buf.setLength(qstart);
0745: buf.append(parser.generateStandInFor(m));
0746: }
0747: break;
0748:
0749: //------------------------------------------------------
0750: // Elements allowed ONLY WITHIN segments
0751: //------------------------------------------------------
0752: case SEGMENT_CLOSE:
0753: // assert(isSegment);
0754: // We're done parsing a segment.
0755: break main;
0756:
0757: //------------------------------------------------------
0758: // Elements allowed ONLY OUTSIDE segments
0759: //------------------------------------------------------
0760: case CONTEXT_ANTE:
0761: if (ante >= 0) {
0762: syntaxError("Multiple ante contexts", rule,
0763: start);
0764: }
0765: ante = buf.length();
0766: break;
0767: case CONTEXT_POST:
0768: if (post >= 0) {
0769: syntaxError("Multiple post contexts", rule,
0770: start);
0771: }
0772: post = buf.length();
0773: break;
0774: case CURSOR_POS:
0775: if (cursor >= 0) {
0776: syntaxError("Multiple cursors", rule, start);
0777: }
0778: cursor = buf.length();
0779: break;
0780: case CURSOR_OFFSET:
0781: if (cursorOffset < 0) {
0782: if (buf.length() > 0) {
0783: syntaxError("Misplaced " + c, rule, start);
0784: }
0785: --cursorOffset;
0786: } else if (cursorOffset > 0) {
0787: if (buf.length() != cursorOffsetPos
0788: || cursor >= 0) {
0789: syntaxError("Misplaced " + c, rule, start);
0790: }
0791: ++cursorOffset;
0792: } else {
0793: if (cursor == 0 && buf.length() == 0) {
0794: cursorOffset = -1;
0795: } else if (cursor < 0) {
0796: cursorOffsetPos = buf.length();
0797: cursorOffset = 1;
0798: } else {
0799: syntaxError("Misplaced " + c, rule, start);
0800: }
0801: }
0802: break;
0803:
0804: //------------------------------------------------------
0805: // Non-special characters
0806: //------------------------------------------------------
0807: default:
0808: // Disallow unquoted characters other than [0-9A-Za-z]
0809: // in the printable ASCII range. These characters are
0810: // reserved for possible future use.
0811: if (c >= 0x0021
0812: && c <= 0x007E
0813: && !((c >= '0' && c <= '9')
0814: || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
0815: syntaxError("Unquoted " + c, rule, start);
0816: }
0817: buf.append(c);
0818: break;
0819: }
0820: }
0821: return pos;
0822: }
0823:
0824: /**
0825: * Remove context.
0826: */
0827: void removeContext() {
0828: text = text.substring(ante < 0 ? 0 : ante, post < 0 ? text
0829: .length() : post);
0830: ante = post = -1;
0831: anchorStart = anchorEnd = false;
0832: }
0833:
0834: /**
0835: * Return true if this half looks like valid output, that is, does not
0836: * contain quantifiers or other special input-only elements.
0837: */
0838: public boolean isValidOutput(TransliteratorParser parser) {
0839: for (int i = 0; i < text.length();) {
0840: int c = UTF16.charAt(text, i);
0841: i += UTF16.getCharCount(c);
0842: if (!parser.parseData.isReplacer(c)) {
0843: return false;
0844: }
0845: }
0846: return true;
0847: }
0848:
0849: /**
0850: * Return true if this half looks like valid input, that is, does not
0851: * contain functions or other special output-only elements.
0852: */
0853: public boolean isValidInput(TransliteratorParser parser) {
0854: for (int i = 0; i < text.length();) {
0855: int c = UTF16.charAt(text, i);
0856: i += UTF16.getCharCount(c);
0857: if (!parser.parseData.isMatcher(c)) {
0858: return false;
0859: }
0860: }
0861: return true;
0862: }
0863: }
0864:
0865: //----------------------------------------------------------------------
0866: // PUBLIC methods
0867: //----------------------------------------------------------------------
0868:
/**
 * Creates a parser. No parse state is set up here; parseRules()
 * assigns the working fields each time it runs.
 */
public TransliteratorParser() {
    super();
}
0874:
0875: /**
0876: * Parse a set of rules. After the parse completes, examine the public
0877: * data members for results.
0878: */
0879: public void parse(String rules, int direction) {
0880: parseRules(new RuleArray(new String[] { rules }), direction);
0881: }
0882:
0883: /**
0884: * Parse a set of rules. After the parse completes, examine the public
0885: * data members for results.
0886: */
0887: public void parse(ResourceReader rules, int direction) {
0888: parseRules(new RuleReader(rules), direction);
0889: }
0890:
0891: //----------------------------------------------------------------------
0892: // PRIVATE methods
0893: //----------------------------------------------------------------------
0894:
0895: /**
0896: * Parse an array of zero or more rules. The strings in the array are
0897: * treated as if they were concatenated together, with rule terminators
0898: * inserted between array elements if not present already.
0899: *
0900: * Any previous rules are discarded. Typically this method is called exactly
0901: * once, during construction.
0902: *
0903: * The member this.data will be set to null if there are no rules.
0904: *
0905: * @exception IllegalArgumentException if there is a syntax error in the
0906: * rules
0907: */
0908: void parseRules(RuleBody ruleArray, int dir) {
0909: boolean parsingIDs = true;
0910: boolean inBeginEndBlock = false;
0911: int ruleCount = 0;
0912:
0913: dataVector = new Vector();
0914: idBlockVector = new Vector();
0915: curData = null;
0916: direction = dir;
0917: compoundFilter = null;
0918: variablesVector = new Vector();
0919: variableNames = new Hashtable();
0920: parseData = new ParseData();
0921:
0922: StringBuffer errors = null;
0923: int errorCount = 0;
0924:
0925: ruleArray.reset();
0926:
0927: StringBuffer idBlockResult = new StringBuffer();
0928:
0929: // The compound filter offset is an index into idBlockResult.
0930: // If it is 0, then the compound filter occurred at the start,
0931: // and it is the offset to the _start_ of the compound filter
0932: // pattern. Otherwise it is the offset to the _limit_ of the
0933: // compound filter pattern within idBlockResult.
0934: this .compoundFilter = null;
0935: int compoundFilterOffset = -1;
0936:
0937: main: for (;;) {
0938: String rule = ruleArray.nextLine();
0939: if (rule == null) {
0940: break;
0941: }
0942: int pos = 0;
0943: int limit = rule.length();
0944: while (pos < limit) {
0945: char c = rule.charAt(pos++);
0946: if (UCharacterProperty.isRuleWhiteSpace(c)) {
0947: continue;
0948: }
0949: // Skip lines starting with the comment character
0950: if (c == RULE_COMMENT_CHAR) {
0951: pos = rule.indexOf("\n", pos) + 1;
0952: if (pos == 0) {
0953: break; // No "\n" found; rest of rule is a commnet
0954: }
0955: continue; // Either fall out or restart with next line
0956: }
0957:
0958: // skip empty rules
0959: if (c == END_OF_RULE)
0960: continue;
0961:
0962: // Often a rule file contains multiple errors. It's
0963: // convenient to the rule author if these are all reported
0964: // at once. We keep parsing rules even after a failure, up
0965: // to a specified limit, and report all errors at once.
0966: try {
0967: ++ruleCount;
0968:
0969: // We've found the start of a rule or ID. c is its first
0970: // character, and pos points past c.
0971: --pos;
0972: // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
0973: // chars left.
0974: if ((pos + ID_TOKEN_LEN + 1) <= limit
0975: && rule.regionMatches(pos, ID_TOKEN, 0,
0976: ID_TOKEN_LEN)) {
0977: pos += ID_TOKEN_LEN;
0978: c = rule.charAt(pos);
0979: while (UCharacterProperty.isRuleWhiteSpace(c)
0980: && pos < limit) {
0981: ++pos;
0982: c = rule.charAt(pos);
0983: }
0984: int[] p = new int[] { pos };
0985:
0986: if (!parsingIDs) {
0987: if (curData != null) {
0988: if (direction == Transliterator.FORWARD)
0989: dataVector.add(curData);
0990: else
0991: dataVector.insertElementAt(curData,
0992: 0);
0993: curData = null;
0994: }
0995: parsingIDs = true;
0996: }
0997:
0998: TransliteratorIDParser.SingleID id = TransliteratorIDParser
0999: .parseSingleID(rule, p, direction);
1000: if (p[0] != pos
1001: && Utility.parseChar(rule, p,
1002: END_OF_RULE)) {
1003: // Successful ::ID parse.
1004:
1005: if (direction == Transliterator.FORWARD) {
1006: idBlockResult.append(id.canonID)
1007: .append(END_OF_RULE);
1008: } else {
1009: idBlockResult.insert(0, id.canonID
1010: + END_OF_RULE);
1011: }
1012:
1013: } else {
1014: // Couldn't parse an ID. Try to parse a global filter
1015: int[] withParens = new int[] { -1 };
1016: UnicodeSet f = TransliteratorIDParser
1017: .parseGlobalFilter(rule, p,
1018: direction, withParens, null);
1019: if (f != null
1020: && Utility.parseChar(rule, p,
1021: END_OF_RULE)) {
1022: if ((direction == Transliterator.FORWARD) == (withParens[0] == 0)) {
1023: if (compoundFilter != null) {
1024: // Multiple compound filters
1025: syntaxError(
1026: "Multiple global filters",
1027: rule, pos);
1028: }
1029: compoundFilter = f;
1030: compoundFilterOffset = ruleCount;
1031: }
1032: } else {
1033: // Invalid ::id
1034: // Can be parsed as neither an ID nor a global filter
1035: syntaxError("Invalid ::ID", rule, pos);
1036: }
1037: }
1038:
1039: pos = p[0];
1040: } else {
1041: if (parsingIDs) {
1042: if (direction == Transliterator.FORWARD)
1043: idBlockVector.add(idBlockResult
1044: .toString());
1045: else
1046: idBlockVector.insertElementAt(
1047: idBlockResult.toString(), 0);
1048: idBlockResult.delete(0, idBlockResult
1049: .length());
1050: parsingIDs = false;
1051: curData = new RuleBasedTransliterator.Data();
1052:
1053: // By default, rules use part of the private use area
1054: // E000..F8FF for variables and other stand-ins. Currently
1055: // the range F000..F8FF is typically sufficient. The 'use
1056: // variable range' pragma allows rule sets to modify this.
1057: setVariableRange(0xF000, 0xF8FF);
1058: }
1059:
1060: if (resemblesPragma(rule, pos, limit)) {
1061: int ppp = parsePragma(rule, pos, limit);
1062: if (ppp < 0) {
1063: syntaxError("Unrecognized pragma",
1064: rule, pos);
1065: }
1066: pos = ppp;
1067: // Parse a rule
1068: } else {
1069: pos = parseRule(rule, pos, limit);
1070: }
1071: }
1072: } catch (IllegalArgumentException e) {
1073: if (errorCount == 30) {
1074: errors
1075: .append("\nMore than 30 errors; further messages squelched");
1076: break main;
1077: }
1078: if (errors == null) {
1079: errors = new StringBuffer(e.getMessage());
1080: } else {
1081: errors.append("\n" + e.getMessage());
1082: }
1083: ++errorCount;
1084: pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1085: }
1086: }
1087: }
1088: if (parsingIDs && idBlockResult.length() > 0) {
1089: if (direction == Transliterator.FORWARD)
1090: idBlockVector.add(idBlockResult.toString());
1091: else
1092: idBlockVector.insertElementAt(idBlockResult.toString(),
1093: 0);
1094: } else if (!parsingIDs && curData != null) {
1095: if (direction == Transliterator.FORWARD)
1096: dataVector.add(curData);
1097: else
1098: dataVector.insertElementAt(curData, 0);
1099: }
1100:
1101: // Convert the set vector to an array
1102: for (int i = 0; i < dataVector.size(); i++) {
1103: RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data) dataVector
1104: .get(i);
1105: data.variables = new Object[variablesVector.size()];
1106: variablesVector.copyInto(data.variables);
1107: data.variableNames = new Hashtable();
1108: data.variableNames.putAll(variableNames);
1109: }
1110: variablesVector = null;
1111:
1112: // Do more syntax checking and index the rules
1113: try {
1114: if (compoundFilter != null) {
1115: if ((direction == Transliterator.FORWARD && compoundFilterOffset != 1)
1116: || (direction == Transliterator.REVERSE && compoundFilterOffset != ruleCount)) {
1117: throw new IllegalArgumentException(
1118: "Compound filters misplaced");
1119: }
1120: }
1121:
1122: for (int i = 0; i < dataVector.size(); i++) {
1123: RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data) dataVector
1124: .get(i);
1125: data.ruleSet.freeze();
1126: }
1127:
1128: if (idBlockVector.size() == 1
1129: && ((String) idBlockVector.get(0)).length() == 0)
1130: idBlockVector.remove(0);
1131:
1132: } catch (IllegalArgumentException e) {
1133: if (errors == null) {
1134: errors = new StringBuffer(e.getMessage());
1135: } else {
1136: errors.append("\n").append(e.getMessage());
1137: }
1138: }
1139:
1140: if (errors != null) {
1141: throw new IllegalArgumentException(errors.toString());
1142: }
1143: }
1144:
1145: /**
1146: * MAIN PARSER. Parse the next rule in the given rule string, starting
1147: * at pos. Return the index after the last character parsed. Do not
1148: * parse characters at or after limit.
1149: *
1150: * Important: The character at pos must be a non-whitespace character
1151: * that is not the comment character.
1152: *
1153: * This method handles quoting, escaping, and whitespace removal. It
1154: * parses the end-of-rule character. It recognizes context and cursor
1155: * indicators. Once it does a lexical breakdown of the rule at pos, it
1156: * creates a rule object and adds it to our rule list.
1157: *
1158: * This method is tightly coupled to the inner class RuleHalf.
1159: */
1160: private int parseRule(String rule, int pos, int limit) {
1161: // Locate the left side, operator, and right side
1162: int start = pos;
1163: char operator = 0;
1164:
1165: // Set up segments data
1166: segmentStandins = new StringBuffer();
1167: segmentObjects = new Vector();
1168:
1169: RuleHalf left = new RuleHalf();
1170: RuleHalf right = new RuleHalf();
1171:
1172: undefinedVariableName = null;
1173: pos = left.parse(rule, pos, limit, this );
1174:
1175: if (pos == limit
1176: || OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1177: syntaxError("No operator pos=" + pos, rule, start);
1178: }
1179: ++pos;
1180:
1181: // Found an operator char. Check for forward-reverse operator.
1182: if (operator == REVERSE_RULE_OP
1183: && (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1184: ++pos;
1185: operator = FWDREV_RULE_OP;
1186: }
1187:
1188: // Translate alternate op characters.
1189: switch (operator) {
1190: case ALT_FORWARD_RULE_OP:
1191: operator = FORWARD_RULE_OP;
1192: break;
1193: case ALT_REVERSE_RULE_OP:
1194: operator = REVERSE_RULE_OP;
1195: break;
1196: case ALT_FWDREV_RULE_OP:
1197: operator = FWDREV_RULE_OP;
1198: break;
1199: }
1200:
1201: pos = right.parse(rule, pos, limit, this );
1202:
1203: if (pos < limit) {
1204: if (rule.charAt(--pos) == END_OF_RULE) {
1205: ++pos;
1206: } else {
1207: // RuleHalf parser must have terminated at an operator
1208: syntaxError("Unquoted operator", rule, start);
1209: }
1210: }
1211:
1212: if (operator == VARIABLE_DEF_OP) {
1213: // LHS is the name. RHS is a single character, either a literal
1214: // or a set (already parsed). If RHS is longer than one
1215: // character, it is either a multi-character string, or multiple
1216: // sets, or a mixture of chars and sets -- syntax error.
1217:
1218: // We expect to see a single undefined variable (the one being
1219: // defined).
1220: if (undefinedVariableName == null) {
1221: syntaxError("Missing '$' or duplicate definition",
1222: rule, start);
1223: }
1224: if (left.text.length() != 1
1225: || left.text.charAt(0) != variableLimit) {
1226: syntaxError("Malformed LHS", rule, start);
1227: }
1228: if (left.anchorStart || left.anchorEnd || right.anchorStart
1229: || right.anchorEnd) {
1230: syntaxError("Malformed variable def", rule, start);
1231: }
1232: // We allow anything on the right, including an empty string.
1233: int n = right.text.length();
1234: char[] value = new char[n];
1235: right.text.getChars(0, n, value, 0);
1236: variableNames.put(undefinedVariableName, value);
1237:
1238: ++variableLimit;
1239: return pos;
1240: }
1241:
1242: // If this is not a variable definition rule, we shouldn't have
1243: // any undefined variable names.
1244: if (undefinedVariableName != null) {
1245: syntaxError("Undefined variable $" + undefinedVariableName,
1246: rule, start);
1247: }
1248:
1249: // Verify segments
1250: if (segmentStandins.length() > segmentObjects.size()) {
1251: syntaxError("Undefined segment reference", rule, start);
1252: }
1253: for (int i = 0; i < segmentStandins.length(); ++i) {
1254: if (segmentStandins.charAt(i) == 0) {
1255: syntaxError("Internal error", rule, start); // will never happen
1256: }
1257: }
1258: for (int i = 0; i < segmentObjects.size(); ++i) {
1259: if (segmentObjects.elementAt(i) == null) {
1260: syntaxError("Internal error", rule, start); // will never happen
1261: }
1262: }
1263:
1264: // If the direction we want doesn't match the rule
1265: // direction, do nothing.
1266: if (operator != FWDREV_RULE_OP
1267: && ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1268: return pos;
1269: }
1270:
1271: // Transform the rule into a forward rule by swapping the
1272: // sides if necessary.
1273: if (direction == Transliterator.REVERSE) {
1274: RuleHalf temp = left;
1275: left = right;
1276: right = temp;
1277: }
1278:
1279: // Remove non-applicable elements in forward-reverse
1280: // rules. Bidirectional rules ignore elements that do not
1281: // apply.
1282: if (operator == FWDREV_RULE_OP) {
1283: right.removeContext();
1284: left.cursor = -1;
1285: left.cursorOffset = 0;
1286: }
1287:
1288: // Normalize context
1289: if (left.ante < 0) {
1290: left.ante = 0;
1291: }
1292: if (left.post < 0) {
1293: left.post = left.text.length();
1294: }
1295:
1296: // Context is only allowed on the input side. Cursors are only
1297: // allowed on the output side. Segment delimiters can only appear
1298: // on the left, and references on the right. Cursor offset
1299: // cannot appear without an explicit cursor. Cursor offset
1300: // cannot place the cursor outside the limits of the context.
1301: // Anchors are only allowed on the input side.
1302: if (right.ante >= 0
1303: || right.post >= 0
1304: || left.cursor >= 0
1305: || (right.cursorOffset != 0 && right.cursor < 0)
1306: ||
1307: // - The following two checks were used to ensure that the
1308: // - the cursor offset stayed within the ante- or postcontext.
1309: // - However, with the addition of quantifiers, we have to
1310: // - allow arbitrary cursor offsets and do runtime checking.
1311: //(right.cursorOffset > (left.text.length() - left.post)) ||
1312: //(-right.cursorOffset > left.ante) ||
1313: right.anchorStart || right.anchorEnd
1314: || !left.isValidInput(this )
1315: || !right.isValidOutput(this ) || left.ante > left.post) {
1316: syntaxError("Malformed rule", rule, start);
1317: }
1318:
1319: // Flatten segment objects vector to an array
1320: UnicodeMatcher[] segmentsArray = null;
1321: if (segmentObjects.size() > 0) {
1322: segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1323: segmentObjects.toArray(segmentsArray);
1324: }
1325:
1326: curData.ruleSet.addRule(new TransliterationRule(left.text,
1327: left.ante, left.post, right.text, right.cursor,
1328: right.cursorOffset, segmentsArray, left.anchorStart,
1329: left.anchorEnd, curData));
1330:
1331: return pos;
1332: }
1333:
1334: /**
1335: * Set the variable range to [start, end] (inclusive).
1336: */
1337: private void setVariableRange(int start, int end) {
1338: if (start > end || start < 0 || end > 0xFFFF) {
1339: throw new IllegalArgumentException(
1340: "Invalid variable range " + start + ", " + end);
1341: }
1342:
1343: curData.variablesBase = (char) start; // first private use
1344:
1345: if (dataVector.size() == 0) {
1346: variableNext = (char) start;
1347: variableLimit = (char) (end + 1);
1348: }
1349: }
1350:
1351: /**
1352: * Assert that the given character is NOT within the variable range.
1353: * If it is, signal an error. This is neccesary to ensure that the
1354: * variable range does not overlap characters used in a rule.
1355: */
1356: private void checkVariableRange(int ch, String rule, int start) {
1357: if (ch >= curData.variablesBase && ch < variableLimit) {
1358: syntaxError("Variable range character in rule", rule, start);
1359: }
1360: }
1361:
1362: // (The following method is part of an unimplemented feature.
1363: // Remove this clover pragma after the feature is implemented.
1364: // 2003-06-11 ICU 2.6 Alan)
1365: ///CLOVER:OFF
1366: /**
1367: * Set the maximum backup to 'backup', in response to a pragma
1368: * statement.
1369: */
1370: private void pragmaMaximumBackup(int backup) {
1371: //TODO Finish
1372: throw new IllegalArgumentException(
1373: "use maximum backup pragma not implemented yet");
1374: }
1375:
1376: ///CLOVER:ON
1377:
1378: // (The following method is part of an unimplemented feature.
1379: // Remove this clover pragma after the feature is implemented.
1380: // 2003-06-11 ICU 2.6 Alan)
1381: ///CLOVER:OFF
1382: /**
1383: * Begin normalizing all rules using the given mode, in response
1384: * to a pragma statement.
1385: */
1386: private void pragmaNormalizeRules(Normalizer.Mode mode) {
1387: //TODO Finish
1388: throw new IllegalArgumentException(
1389: "use normalize rules pragma not implemented yet");
1390: }
1391:
1392: ///CLOVER:ON
1393:
1394: /**
1395: * Return true if the given rule looks like a pragma.
1396: * @param pos offset to the first non-whitespace character
1397: * of the rule.
1398: * @param limit pointer past the last character of the rule.
1399: */
1400: static boolean resemblesPragma(String rule, int pos, int limit) {
1401: // Must start with /use\s/i
1402: return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1403: }
1404:
1405: /**
1406: * Parse a pragma. This method assumes resemblesPragma() has
1407: * already returned true.
1408: * @param pos offset to the first non-whitespace character
1409: * of the rule.
1410: * @param limit pointer past the last character of the rule.
1411: * @return the position index after the final ';' of the pragma,
1412: * or -1 on failure.
1413: */
1414: private int parsePragma(String rule, int pos, int limit) {
1415: int[] array = new int[2];
1416:
1417: // resemblesPragma() has already returned true, so we
1418: // know that pos points to /use\s/i; we can skip 4 characters
1419: // immediately
1420: pos += 4;
1421:
1422: // Here are the pragmas we recognize:
1423: // use variable range 0xE000 0xEFFF;
1424: // use maximum backup 16;
1425: // use nfd rules;
1426: int p = Utility.parsePattern(rule, pos, limit,
1427: "~variable range # #~;", array);
1428: if (p >= 0) {
1429: setVariableRange(array[0], array[1]);
1430: return p;
1431: }
1432:
1433: p = Utility.parsePattern(rule, pos, limit,
1434: "~maximum backup #~;", array);
1435: if (p >= 0) {
1436: pragmaMaximumBackup(array[0]);
1437: return p;
1438: }
1439:
1440: p = Utility
1441: .parsePattern(rule, pos, limit, "~nfd rules~;", null);
1442: if (p >= 0) {
1443: pragmaNormalizeRules(Normalizer.NFD);
1444: return p;
1445: }
1446:
1447: p = Utility
1448: .parsePattern(rule, pos, limit, "~nfc rules~;", null);
1449: if (p >= 0) {
1450: pragmaNormalizeRules(Normalizer.NFC);
1451: return p;
1452: }
1453:
1454: // Syntax error: unable to parse pragma
1455: return -1;
1456: }
1457:
1458: /**
1459: * Throw an exception indicating a syntax error. Search the rule string
1460: * for the probable end of the rule. Of course, if the error is that
1461: * the end of rule marker is missing, then the rule end will not be found.
1462: * In any case the rule start will be correctly reported.
1463: * @param msg error description
1464: * @param rule pattern string
1465: * @param start position of first character of current rule
1466: */
1467: static final void syntaxError(String msg, String rule, int start) {
1468: int end = ruleEnd(rule, start, rule.length());
1469: throw new IllegalArgumentException(msg + " in \""
1470: + Utility.escape(rule.substring(start, end)) + '"');
1471: }
1472:
1473: static final int ruleEnd(String rule, int start, int limit) {
1474: int end = Utility.quotedIndexOf(rule, start, limit, ";");
1475: if (end < 0) {
1476: end = limit;
1477: }
1478: return end;
1479: }
1480:
1481: /**
1482: * Parse a UnicodeSet out, store it, and return the stand-in character
1483: * used to represent it.
1484: */
1485: private final char parseSet(String rule, ParsePosition pos) {
1486: UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1487: if (variableNext >= variableLimit) {
1488: throw new RuntimeException(
1489: "Private use variables exhausted");
1490: }
1491: set.compact();
1492: return generateStandInFor(set);
1493: }
1494:
1495: /**
1496: * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1497: * Store the object.
1498: */
1499: char generateStandInFor(Object obj) {
1500: // assert(obj != null);
1501:
1502: // Look up previous stand-in, if any. This is a short list
1503: // (typical n is 0, 1, or 2); linear search is optimal.
1504: for (int i = 0; i < variablesVector.size(); ++i) {
1505: if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison
1506: return (char) (curData.variablesBase + i);
1507: }
1508: }
1509:
1510: if (variableNext >= variableLimit) {
1511: throw new RuntimeException("Variable range exhausted");
1512: }
1513: variablesVector.addElement(obj);
1514: return variableNext++;
1515: }
1516:
1517: /**
1518: * Return the standin for segment seg (1-based).
1519: */
1520: public char getSegmentStandin(int seg) {
1521: if (segmentStandins.length() < seg) {
1522: segmentStandins.setLength(seg);
1523: }
1524: char c = segmentStandins.charAt(seg - 1);
1525: if (c == 0) {
1526: if (variableNext >= variableLimit) {
1527: throw new RuntimeException("Variable range exhausted");
1528: }
1529: c = variableNext++;
1530: // Set a placeholder in the master variables vector that will be
1531: // filled in later by setSegmentObject(). We know that we will get
1532: // called first because setSegmentObject() will call us.
1533: variablesVector.addElement(null);
1534: segmentStandins.setCharAt(seg - 1, c);
1535: }
1536: return c;
1537: }
1538:
1539: /**
1540: * Set the object for segment seg (1-based).
1541: */
1542: public void setSegmentObject(int seg, StringMatcher obj) {
1543: // Since we call parseSection() recursively, nested
1544: // segments will result in segment i+1 getting parsed
1545: // and stored before segment i; be careful with the
1546: // vector handling here.
1547: if (segmentObjects.size() < seg) {
1548: segmentObjects.setSize(seg);
1549: }
1550: int index = getSegmentStandin(seg) - curData.variablesBase;
1551: if (segmentObjects.elementAt(seg - 1) != null
1552: || variablesVector.elementAt(index) != null) {
1553: throw new RuntimeException(); // should never happen
1554: }
1555: segmentObjects.setElementAt(obj, seg - 1);
1556: variablesVector.setElementAt(obj, index);
1557: }
1558:
1559: /**
1560: * Return the stand-in for the dot set. It is allocated the first
1561: * time and reused thereafter.
1562: */
1563: char getDotStandIn() {
1564: if (dotStandIn == -1) {
1565: dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1566: }
1567: return (char) dotStandIn;
1568: }
1569:
1570: /**
1571: * Append the value of the given variable name to the given
1572: * StringBuffer.
1573: * @exception IllegalArgumentException if the name is unknown.
1574: */
1575: private void appendVariableDef(String name, StringBuffer buf) {
1576: char[] ch = (char[]) variableNames.get(name);
1577: if (ch == null) {
1578: // We allow one undefined variable so that variable definition
1579: // statements work. For the first undefined variable we return
1580: // the special placeholder variableLimit-1, and save the variable
1581: // name.
1582: if (undefinedVariableName == null) {
1583: undefinedVariableName = name;
1584: if (variableNext >= variableLimit) {
1585: throw new RuntimeException(
1586: "Private use variables exhausted");
1587: }
1588: buf.append((char) --variableLimit);
1589: } else {
1590: throw new IllegalArgumentException(
1591: "Undefined variable $" + name);
1592: }
1593: } else {
1594: buf.append(ch);
1595: }
1596: }
1597: }
1598:
1599: //eof
|