0001: package net.sf.saxon.type;
0002:
0003: import net.sf.saxon.om.XMLChar;
0004: import net.sf.saxon.om.FastStringBuffer;
0005:
0006: import java.math.BigDecimal;
0007: import java.util.*;
0008:
0009: /**
0010: * This class translates XML Schema regex syntax into JDK 1.4 regex syntax.
0011: * Author: James Clark
0012: * Modified by Michael Kay (a) to integrate the code into Saxon, and (b) to support XPath additions
0013: * to the XML Schema regex syntax.
0014: */
0015: public class RegexTranslator {
0016:
0017: /**
0018: * Translates XML Schema regexes into <code>java.util.regex</code> regexes.
0019: *
0020: * @see java.util.regex.Pattern
0021: * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
0022: */
0023:
// The regular expression being translated.
0024: private final CharSequence regExp;
// Whether the XPath extensions to the schema regex syntax are enabled; assigned by translate().
0025: private boolean isXPath;
// Index of the next character to read from regExp; advance() keeps incrementing it at end of input.
0026: private int pos = 0;
// Cached regExp.length().
0027: private final int length;
// Character currently being examined; EOS once the input is exhausted.
0028: private char curChar;
// Set to true by advance() once the input is exhausted.
0029: private boolean eos = false;
// Accumulates the translated expression that translate() returns.
0030: private final FastStringBuffer result = new FastStringBuffer(32);
0031:
// Single-letter category codes; categoryCharClasses reserves one slot per letter.
0032: private static final String categories = "LMNPZSC";
0033: private static final CharClass[] categoryCharClasses = new CharClass[categories
0034: .length()];
// Two-letter subcategory codes concatenated into one string; subCategoryCharClasses
// reserves one slot per two-letter code (hence length() / 2).
0035: private static final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
0036: private static final CharClass[] subCategoryCharClasses = new CharClass[subCategories
0037: .length() / 2];
0038:
// Lowest and highest code points outside the Basic Multilingual Plane (BMP).
0039: private static final int NONBMP_MIN = 0x10000;
0040: private static final int NONBMP_MAX = 0x10FFFF;
// Bounds of the second (low) half of a surrogate pair; compared against
// XMLChar.lowSurrogate() results when non-BMP ranges are split into surrogate ranges.
0041: private static final char SURROGATE2_MIN = '\uDC00';
0042: private static final char SURROGATE2_MAX = '\uDFFF';
0043:
0044: //static final Localizer localizer = new Localizer(RegexTranslator.class);
0045:
// Block names handled by this translator, excluding those listed in specialBlockNames
// (which covers blocks including ranges outside the BMP).
// NOTE(review): "Specials" appears twice in this list (last line) - presumably deliberate,
// e.g. a block made of two separate ranges; confirm against the code that indexes this array.
0046: private static final String[] blockNames = { "BasicLatin",
0047: "Latin-1Supplement", "LatinExtended-A", "LatinExtended-B",
0048: "IPAExtensions", "SpacingModifierLetters",
0049: "CombiningDiacriticalMarks", "Greek", "Cyrillic",
0050: "Armenian", "Hebrew", "Arabic", "Syriac", "Thaana",
0051: "Devanagari", "Bengali", "Gurmukhi", "Gujarati", "Oriya",
0052: "Tamil", "Telugu", "Kannada", "Malayalam", "Sinhala",
0053: "Thai", "Lao", "Tibetan", "Myanmar", "Georgian",
0054: "HangulJamo", "Ethiopic", "Cherokee",
0055: "UnifiedCanadianAboriginalSyllabics", "Ogham", "Runic",
0056: "Khmer", "Mongolian", "LatinExtendedAdditional",
0057: "GreekExtended", "GeneralPunctuation",
0058: "SuperscriptsandSubscripts", "CurrencySymbols",
0059: "CombiningMarksforSymbols", "LetterlikeSymbols",
0060: "NumberForms", "Arrows", "MathematicalOperators",
0061: "MiscellaneousTechnical", "ControlPictures",
0062: "OpticalCharacterRecognition", "EnclosedAlphanumerics",
0063: "BoxDrawing", "BlockElements", "GeometricShapes",
0064: "MiscellaneousSymbols", "Dingbats", "BraillePatterns",
0065: "CJKRadicalsSupplement", "KangxiRadicals",
0066: "IdeographicDescriptionCharacters",
0067: "CJKSymbolsandPunctuation", "Hiragana",
0068: "Katakana",
0069: "Bopomofo",
0070: "HangulCompatibilityJamo",
0071: "Kanbun",
0072: "BopomofoExtended",
0073: "EnclosedCJKLettersandMonths",
0074: "CJKCompatibility",
0075: "CJKUnifiedIdeographsExtensionA",
0076: "CJKUnifiedIdeographs",
0077: "YiSyllables",
0078: "YiRadicals",
0079: "HangulSyllables",
0080: // surrogates are excluded because there are never any *characters* with codes in the surrogate range
0081: // "PrivateUse" is excluded because Unicode 3.1 adds non-BMP ranges to it (see specialBlockNames)
0082: "CJKCompatibilityIdeographs",
0083: "AlphabeticPresentationForms", "ArabicPresentationForms-A",
0084: "CombiningHalfMarks", "CJKCompatibilityForms",
0085: "SmallFormVariants", "ArabicPresentationForms-B",
0086: "Specials", "HalfwidthandFullwidthForms", "Specials" };
0087:
0088: /**
0089: * Names of blocks including ranges outside the BMP. The entry at index i is described
0089: * by the CharClass at index i of specialBlockCharClasses; the last three (surrogate)
0089: * names map to the empty class there.
0090: */
0091: private static final String[] specialBlockNames = { "OldItalic",
0092: "Gothic", "Deseret", "ByzantineMusicalSymbols",
0093: "MusicalSymbols", "MathematicalAlphanumericSymbols",
0094: "CJKUnifiedIdeographsExtensionB",
0095: "CJKCompatibilityIdeographsSupplement", "Tags",
0096: "PrivateUse", "HighSurrogates", "HighPrivateUseSurrogates",
0097: "LowSurrogates", };
0098:
0099: // This file was automatically generated by CategoriesGen
0100:
// Two-letter category names concatenated into one string; the name at character offset 2*i
// labels the array CATEGORY_RANGES[i] (each inner array below is tagged with its name).
0101: static final String CATEGORY_NAMES = "NoLoMnCfLlNlPoLuMcNdSoSmCo";
0102:
// Non-BMP code points per category. Each inner array appears to be a flat list of inclusive
// (first, last) pairs - single code points are written as a repeated value - TODO confirm
// against the code that consumes this table.
0103: static final int[][] CATEGORY_RANGES = {
0104: {
0105: // No
0106: 0x10107, 0x10133, 0x10320, 0x10323 },
0107: {
0108: // Lo
0109: 0x10000, 0x1000b, 0x1000d, 0x10026, 0x10028,
0110: 0x1003a, 0x1003c, 0x1003d, 0x1003f, 0x1004d,
0111: 0x10050, 0x1005d, 0x10080, 0x100fa, 0x10300,
0112: 0x1031e, 0x10330, 0x10349, 0x10380, 0x1039d,
0113: 0x10450, 0x1049d, 0x10800, 0x10805, 0x10808,
0114: 0x10808, 0x1080a, 0x10835, 0x10837, 0x10838,
0115: 0x1083c, 0x1083c, 0x1083f, 0x1083f, 0x20000,
0116: 0x2a6d6, 0x2f800, 0x2fa1d },
0117: {
0118: // Mn
0119: 0x1d167, 0x1d169, 0x1d17b, 0x1d182, 0x1d185,
0120: 0x1d18b, 0x1d1aa, 0x1d1ad, 0xe0100, 0xe01ef },
0121: {
0122: // Cf
0123: 0x1d173, 0x1d17a, 0xe0001, 0xe0001, 0xe0020,
0124: 0xe007f },
0125: {
0126: // Ll
0127: 0x10428, 0x1044f, 0x1d41a, 0x1d433, 0x1d44e,
0128: 0x1d454, 0x1d456, 0x1d467, 0x1d482, 0x1d49b,
0129: 0x1d4b6, 0x1d4b9, 0x1d4bb, 0x1d4bb, 0x1d4bd,
0130: 0x1d4c3, 0x1d4c5, 0x1d4cf, 0x1d4ea, 0x1d503,
0131: 0x1d51e, 0x1d537, 0x1d552, 0x1d56b, 0x1d586,
0132: 0x1d59f, 0x1d5ba, 0x1d5d3, 0x1d5ee, 0x1d607,
0133: 0x1d622, 0x1d63b, 0x1d656, 0x1d66f, 0x1d68a,
0134: 0x1d6a3, 0x1d6c2, 0x1d6da, 0x1d6dc, 0x1d6e1,
0135: 0x1d6fc, 0x1d714, 0x1d716, 0x1d71b, 0x1d736,
0136: 0x1d74e, 0x1d750, 0x1d755, 0x1d770, 0x1d788,
0137: 0x1d78a, 0x1d78f, 0x1d7aa, 0x1d7c2, 0x1d7c4,
0138: 0x1d7c9 },
0139: {
0140: // Nl
0141: 0x1034a, 0x1034a },
0142: {
0143: // Po
0144: 0x10100, 0x10101, 0x1039f, 0x1039f },
0145: {
0146: // Lu
0147: 0x10400, 0x10427, 0x1d400, 0x1d419, 0x1d434,
0148: 0x1d44d, 0x1d468, 0x1d481, 0x1d49c, 0x1d49c,
0149: 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a2, 0x1d4a5,
0150: 0x1d4a6, 0x1d4a9, 0x1d4ac, 0x1d4ae, 0x1d4b5,
0151: 0x1d4d0, 0x1d4e9, 0x1d504, 0x1d505, 0x1d507,
0152: 0x1d50a, 0x1d50d, 0x1d514, 0x1d516, 0x1d51c,
0153: 0x1d538, 0x1d539, 0x1d53b, 0x1d53e, 0x1d540,
0154: 0x1d544, 0x1d546, 0x1d546, 0x1d54a, 0x1d550,
0155: 0x1d56c, 0x1d585, 0x1d5a0, 0x1d5b9, 0x1d5d4,
0156: 0x1d5ed, 0x1d608, 0x1d621, 0x1d63c, 0x1d655,
0157: 0x1d670, 0x1d689, 0x1d6a8, 0x1d6c0, 0x1d6e2,
0158: 0x1d6fa, 0x1d71c, 0x1d734, 0x1d756, 0x1d76e,
0159: 0x1d790, 0x1d7a8 },
0160: {
0161: // Mc
0162: 0x1d165, 0x1d166, 0x1d16d, 0x1d172 },
0163: {
0164: // Nd
0165: 0x104a0, 0x104a9, 0x1d7ce, 0x1d7ff },
0166: {
0167: // So
0168: 0x10102, 0x10102, 0x10137, 0x1013f, 0x1d000,
0169: 0x1d0f5, 0x1d100, 0x1d126, 0x1d12a, 0x1d164,
0170: 0x1d16a, 0x1d16c, 0x1d183, 0x1d184, 0x1d18c,
0171: 0x1d1a9, 0x1d1ae, 0x1d1dd, 0x1d300, 0x1d356 },
0172: {
0173: // Sm
0174: 0x1d6c1, 0x1d6c1, 0x1d6db, 0x1d6db, 0x1d6fb,
0175: 0x1d6fb, 0x1d715, 0x1d715, 0x1d735, 0x1d735,
0176: 0x1d74f, 0x1d74f, 0x1d76f, 0x1d76f, 0x1d789,
0177: 0x1d789, 0x1d7a9, 0x1d7a9, 0x1d7c3, 0x1d7c3 }, {
0178: // Co
0179: 0xf0000, 0xffffd, 0x100000, 0x10fffd } };
0180:
0181: // end of generated code
0182:
0183: /**
0184: * CharClass for each block name in specialBlockNames, in the same order.
0184: * "PrivateUse" is the union of one BMP range and two non-BMP ranges; the three
0184: * surrogate block names are mapped to the empty class.
0185: */
0186: private static final CharClass[] specialBlockCharClasses = {
0187: new CharRange(0x10300, 0x1032F),
0188: new CharRange(0x10330, 0x1034F),
0189: new CharRange(0x10400, 0x1044F),
0190: new CharRange(0x1D000, 0x1D0FF),
0191: new CharRange(0x1D100, 0x1D1FF),
0192: new CharRange(0x1D400, 0x1D7FF),
0193: new CharRange(0x20000, 0x2A6D6),
0194: new CharRange(0x2F800, 0x2FA1F),
0195: new CharRange(0xE0000, 0xE007F),
0196: new Union(new CharClass[] { new CharRange(0xE000, 0xF8FF),
0197: new CharRange(0xF0000, 0xFFFFD),
0198: new CharRange(0x100000, 0x10FFFD) }),
0199: Empty.getInstance(), Empty.getInstance(),
0200: Empty.getInstance() };
0201:
// Everything except line feed and carriage return.
0202: private static final CharClass DOT = new Complement(new Union(
0203: new CharClass[] { new SingleChar('\n'),
0204: new SingleChar('\r') }));
0205:
// The "Nd" property, and its complement.
0206: private static final CharClass ESC_d = new Property("Nd");
0207:
0208: private static final CharClass ESC_D = new Complement(ESC_d);
0209:
// Union of the P, Z and C categories, built with computeCategoryCharClass rather than
// Property objects (the earlier Property-based form is kept in the comment below).
0210: private static final CharClass ESC_W = new Union(new CharClass[] {
0211: computeCategoryCharClass('P'),
0212: computeCategoryCharClass('Z'),
0213: computeCategoryCharClass('C') });
0214: //was: new Property("P"), new Property("Z"), new Property("C") }
0215:
// Complement of ESC_W.
0216: private static final CharClass ESC_w = new Complement(ESC_W);
0217:
// Space, line feed, carriage return and tab.
0218: private static final CharClass ESC_s = new Union(new CharClass[] {
0219: new SingleChar(' '), new SingleChar('\n'),
0220: new SingleChar('\r'), new SingleChar('\t') });
0221:
0222: // This file was automatically generated by NamingExceptionsGen
0223: // class NamingExceptions {
// Inputs to makeCharClass for ESC_i: category names, extra included characters, and
// excluded ranges. The *_EXCLUDE_RANGES strings appear to hold consecutive (first, last)
// character pairs - TODO confirm against makeCharClass.
0224: static final String NMSTRT_INCLUDES = "\u003A\u005F\u02BB\u02BC\u02BD\u02BE\u02BF\u02C0\u02C1\u0559"
0225: + "\u06E5\u06E6\u212E";
0226: static final String NMSTRT_EXCLUDE_RANGES = "\u00AA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149\u017F\u017F"
0227: + "\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233\u02A9\u02AD"
0228: + "\u03D7\u03D7\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1"
0229: + "\u0400\u0400\u040D\u040D\u0450\u0450\u045D\u045D\u048C\u048F"
0230: + "\u04EC\u04ED\u0587\u0587\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF"
0231: + "\u06FA\u07A5\u0950\u0950\u0AD0\u0AD0\u0D85\u0DC6\u0E2F\u0E2F"
0232: + "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u1055\u1101\u1101\u1104\u1104"
0233: + "\u1108\u1108\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D"
0234: + "\u113F\u113F\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153"
0235: + "\u1156\u1158\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168"
0236: + "\u116A\u116C\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2"
0237: + "\u11A9\u11AA\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB"
0238: + "\u11C3\u11EA\u11EC\u11EF\u11F1\u11F8\u1200\u18A8\u207F\u2124"
0239: + "\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u3006\u3038\u303A"
0240: + "\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
0241: static final String NMSTRT_CATEGORIES = "LlLuLoLtNl";
// The same three inputs for ESC_c.
0242: static final String NMCHAR_INCLUDES = "\u002D\u002E\u003A\u005F\u00B7\u0387\u212E";
0243: static final String NMCHAR_EXCLUDE_RANGES = "\u00AA\u00B5\u00BA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149"
0244: + "\u017F\u017F\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233"
0245: + "\u02A9\u02B8\u02E0\u02EE\u0346\u034E\u0362\u037A\u03D7\u03D7"
0246: + "\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1\u0400\u0400"
0247: + "\u040D\u040D\u0450\u0450\u045D\u045D\u0488\u048F\u04EC\u04ED"
0248: + "\u0587\u0587\u0653\u0655\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF"
0249: + "\u06FA\u07B0\u0950\u0950\u0AD0\u0AD0\u0D82\u0DF3\u0E2F\u0E2F"
0250: + "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u0F6A\u0F96\u0F96\u0FAE\u0FB0"
0251: + "\u0FB8\u0FB8\u0FBA\u1059\u1101\u1101\u1104\u1104\u1108\u1108"
0252: + "\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D\u113F\u113F"
0253: + "\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153\u1156\u1158"
0254: + "\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168\u116A\u116C"
0255: + "\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2\u11A9\u11AA"
0256: + "\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB\u11C3\u11EA"
0257: + "\u11EC\u11EF\u11F1\u11F8\u1200\u18A9\u207F\u207F\u20DD\u20E0"
0258: + "\u20E2\u2124\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u2183"
0259: + "\u3006\u3006\u3038\u303A\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
0260: static final String NMCHAR_CATEGORIES = "LlLuLoLtNlMcMeMnLmNd";
0261: // end of generated code
0262:
// Complement of ESC_s.
0263: private static final CharClass ESC_S = new Complement(ESC_s);
0264:
// Built by makeCharClass from the NMSTRT_* constants (presumably XML name-start
// characters - confirm against makeCharClass); ESC_I is its complement.
0265: private static final CharClass ESC_i = makeCharClass(
0266: NMSTRT_CATEGORIES, NMSTRT_INCLUDES, NMSTRT_EXCLUDE_RANGES);
0267:
0268: private static final CharClass ESC_I = new Complement(ESC_i);
0269:
// Built by makeCharClass from the NMCHAR_* constants (presumably XML name
// characters - confirm against makeCharClass); ESC_C is its complement.
0270: private static final CharClass ESC_c = makeCharClass(
0271: NMCHAR_CATEGORIES, NMCHAR_INCLUDES, NMCHAR_EXCLUDE_RANGES);
0272:
0273: private static final CharClass ESC_C = new Complement(ESC_c);
0274:
// Sentinel that advance() stores in curChar once the input is exhausted.
0275: private static final char EOS = '\0';
0276:
/**
 * Creates a translator for the given expression and loads its first character
 * (or the end-of-input state, if the expression is empty).
 *
 * @param regExp the regular expression to translate
 */
private RegexTranslator(CharSequence regExp) {
    this.regExp = regExp;
    length = regExp.length();
    // Prime curChar so that parsing can start immediately.
    advance();
}
0282:
0283: /**
0284: * Translates a regular expression in the syntax of XML Schemas Part 2 into a regular
0285: * expression in the syntax of <code>java.util.regex.Pattern</code>. The translation
0286: * assumes that the string to be matched against the regex uses surrogate pairs correctly.
0287: * If the string comes from XML content, a conforming XML parser will automatically
0288: * check this; if the string comes from elsewhere, it may be necessary to check
0289: * surrogate usage before matching.
0290: *
0291: * @param regexp a String containing a regular expression in the syntax of XML Schemas Part 2
0292: * @param xpath a boolean indicating whether the XPath 2.0 F+O extensions to the schema
0293: * regex syntax are permitted
0294: * @return a String containing a regular expression in the syntax of java.util.regex.Pattern
0295: * @throws RegexSyntaxException if <code>regexp</code> is not a regular expression in the
0296: * syntax of XML Schemas Part 2
0297: * @see java.util.regex.Pattern
0298: * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
0299: */
0300: public static String translate(CharSequence regexp, boolean xpath)
0301: throws RegexSyntaxException {
0302: RegexTranslator tr = new RegexTranslator(regexp);
0303: tr.isXPath = xpath;
0304: tr.translateTop();
0305: return tr.result.toString();
0306: }
0307:
0308: private void advance() {
0309: if (pos < length)
0310: curChar = regExp.charAt(pos++);
0311: else {
0312: pos++;
0313: curChar = EOS;
0314: eos = true;
0315: }
0316: }
0317:
0318: private void translateTop() throws RegexSyntaxException {
0319: translateRegExp();
0320: if (!eos)
0321: throw makeException("expected end of string");
0322: }
0323:
0324: private void translateRegExp() throws RegexSyntaxException {
0325: translateBranch();
0326: while (curChar == '|') {
0327: copyCurChar();
0328: translateBranch();
0329: }
0330: }
0331:
0332: private void translateBranch() throws RegexSyntaxException {
0333: while (translateAtom())
0334: translateQuantifier();
0335: }
0336:
0337: private void translateQuantifier() throws RegexSyntaxException {
0338: switch (curChar) {
0339: case '*':
0340: case '?':
0341: case '+':
0342: copyCurChar();
0343: break;
0344: case '{':
0345: copyCurChar();
0346: translateQuantity();
0347: expect('}');
0348: copyCurChar();
0349: break;
0350: default:
0351: return;
0352: }
0353: if (curChar == '?' && isXPath) {
0354: copyCurChar();
0355: }
0356: }
0357:
0358: private void translateQuantity() throws RegexSyntaxException {
0359: String lower = parseQuantExact().toString();
0360: int lowerValue = -1;
0361: try {
0362: lowerValue = Integer.parseInt(lower);
0363: result.append(lower);
0364: } catch (NumberFormatException e) {
0365: // JDK 1.4 cannot handle ranges bigger than this
0366: result.append("" + Integer.MAX_VALUE);
0367: }
0368: if (curChar == ',') {
0369: copyCurChar();
0370: if (curChar != '}') {
0371: String upper = parseQuantExact().toString();
0372: try {
0373: int upperValue = Integer.parseInt(upper);
0374: result.append(upper);
0375: if (lowerValue < 0 || upperValue < lowerValue)
0376: throw makeException("invalid range in quantifier");
0377: } catch (NumberFormatException e) {
0378: result.append("" + Integer.MAX_VALUE);
0379: if (lowerValue < 0
0380: && new BigDecimal(lower)
0381: .compareTo(new BigDecimal(upper)) > 0)
0382: throw makeException("invalid range in quantifier");
0383: }
0384: }
0385: }
0386: }
0387:
0388: private CharSequence parseQuantExact() throws RegexSyntaxException {
0389: FastStringBuffer buf = new FastStringBuffer(10);
0390: do {
0391: if ("0123456789".indexOf(curChar) < 0)
0392: throw makeException("expected digit in quantifier");
0393: buf.append(curChar);
0394: advance();
0395: } while (curChar != ',' && curChar != '}');
0396: return buf;
0397: }
0398:
0399: private void copyCurChar() {
0400: result.append(curChar);
0401: advance();
0402: }
0403:
// Coverage levels a CharClass reports for the BMP and for the non-BMP code points.
// The numeric values matter: Complement and Subtraction negate them, Union takes the
// maximum and Subtraction the minimum.
0404: static final int NONE = -1;
0405: static final int SOME = 0;
0406: static final int ALL = 1;
0407:
// Character classes matching any first (high) and any second (low) surrogate.
0408: static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]";
0409: static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
// A class that can never match: the NUL character intersected with "not NUL".
0410: static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";
0411:
0412: static final class Range implements Comparable {
0413: private final int min;
0414: private final int max;
0415:
0416: Range(int min, int max) {
0417: this .min = min;
0418: this .max = max;
0419: }
0420:
0421: int getMin() {
0422: return min;
0423: }
0424:
0425: int getMax() {
0426: return max;
0427: }
0428:
0429: public int compareTo(Object o) {
0430: Range other = (Range) o;
0431: if (this .min < other.min)
0432: return -1;
0433: if (this .min > other.min)
0434: return 1;
0435: if (this .max > other.max)
0436: return -1;
0437: if (this .max < other.max)
0438: return 1;
0439: return 0;
0440: }
0441: }
0442:
// A set of characters, described separately for the BMP and for the code points above it.
// Subclasses supply the BMP part as regex text (outputBmp / outputComplementBmp) and the
// non-BMP part as a list of Range objects (addNonBmpRanges); output() combines the two.
0443: static abstract class CharClass {
0444:
// NONE, SOME or ALL: how much of the BMP this class covers.
0445: private final int containsBmp;
0446: // if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
0447: // contain all the high surrogates
// NONE, SOME or ALL: how much of the non-BMP code points this class covers.
0448: private final int containsNonBmp;
0449:
0450: protected CharClass(int containsBmp, int containsNonBmp) {
0451: this .containsBmp = containsBmp;
0452: this .containsNonBmp = containsNonBmp;
0453: }
0454:
0455: int getContainsBmp() {
0456: return containsBmp;
0457: }
0458:
0459: int getContainsNonBmp() {
0460: return containsNonBmp;
0461: }
0462:
// Appends regex text for this class to buf. Non-BMP code points are written as a high
// surrogate (or class of high surrogates) followed by a low-surrogate class.
0463: final void output(FastStringBuffer buf) {
0464: switch (containsNonBmp) {
0465: case NONE:
// BMP only: either the subclass's own BMP text, or a class that never matches.
0466: if (containsBmp == NONE)
0467: buf.append(NOT_ALLOWED_CLASS);
0468: else
0469: outputBmp(buf);
0470: break;
0471: case ALL:
// Every non-BMP code point: any surrogate pair, or (relying on the BMP text containing
// the high surrogates, as noted on containsNonBmp) the BMP text with an optional low surrogate.
0472: buf.append("(?:");
0473: if (containsBmp == NONE) {
0474: buf.append(SURROGATES1_CLASS);
0475: buf.append(SURROGATES2_CLASS);
0476: } else {
0477: outputBmp(buf);
0478: buf.append(SURROGATES2_CLASS);
0479: buf.append('?');
0480: }
0481: buf.append(')');
0482: break;
0483: case SOME:
// Some non-BMP code points: a '|'-separated alternation of the BMP text, the fully
// covered high surrogates, and the partially covered ones.
0484: buf.append("(?:");
0485: boolean needSep = false;
0486: if (containsBmp != NONE) {
0487: needSep = true;
0488: outputBmp(buf);
0489: }
0490: List ranges = new ArrayList(10);
0491: addNonBmpRanges(ranges);
0492: sortRangeList(ranges);
// hi holds (first, last) pairs of high surrogates that may be followed by any low surrogate.
0493: String hi = highSurrogateRanges(ranges);
0494: if (hi.length() > 0) {
0495: if (needSep)
0496: buf.append('|');
0497: else
0498: needSep = true;
0499: buf.append('[');
0500: for (int i = 0, len = hi.length(); i < len; i += 2) {
0501: char min = hi.charAt(i);
0502: char max = hi.charAt(i + 1);
0503: if (min == max)
0504: buf.append(min);
0505: else {
0506: buf.append(min);
0507: buf.append('-');
0508: buf.append(max);
0509: }
0510: }
0511: buf.append(']');
0512: buf.append(SURROGATES2_CLASS);
0513: }
// lo holds (high surrogate, first low, last low) triples; consecutive triples that share
// a high surrogate are merged into one bracketed low-surrogate class.
0514: String lo = lowSurrogateRanges(ranges);
0515: for (int i = 0, len = lo.length(); i < len; i += 3) {
0516: if (needSep)
0517: buf.append('|');
0518: else
0519: needSep = true;
0520: buf.append(lo.charAt(i));
0521: char min = lo.charAt(i + 1);
0522: char max = lo.charAt(i + 2);
// A single low surrogate with no further triple for this high surrogate needs no brackets.
0523: if (min == max
0524: && (i + 3 >= len || lo.charAt(i + 3) != lo
0525: .charAt(i)))
0526: buf.append(min);
0527: else {
0528: buf.append('[');
0529: for (;;) {
0530: if (min == max)
0531: buf.append(min);
0532: else {
0533: buf.append(min);
0534: buf.append('-');
0535: buf.append(max);
0536: }
0537: if (i + 3 >= len
0538: || lo.charAt(i + 3) != lo.charAt(i))
0539: break;
0540: i += 3;
0541: min = lo.charAt(i + 1);
0542: max = lo.charAt(i + 2);
0543: }
0544: buf.append(']');
0545: }
0546: }
// Nothing was written at all: emit a class that never matches.
0547: if (!needSep)
0548: buf.append(NOT_ALLOWED_CLASS);
0549: buf.append(')');
0550: break;
0551: }
0552: }
0553:
// Returns (first, last) pairs of high surrogates whose whole low-surrogate range lies
// inside one of the given ranges. A high surrogate at either end of a range is dropped
// when that range covers only part of its low surrogates.
0554: static String highSurrogateRanges(List ranges) {
0555: FastStringBuffer highRanges = new FastStringBuffer(ranges
0556: .size() * 2);
0557: for (int i = 0, len = ranges.size(); i < len; i++) {
0558: Range r = (Range) ranges.get(i);
0559: char min1 = XMLChar.highSurrogate(r.getMin());
0560: char min2 = XMLChar.lowSurrogate(r.getMin());
0561: char max1 = XMLChar.highSurrogate(r.getMax());
0562: char max2 = XMLChar.lowSurrogate(r.getMax());
0563: if (min2 != SURROGATE2_MIN)
0564: min1++;
0565: if (max2 != SURROGATE2_MAX)
0566: max1--;
0567: if (max1 >= min1) {
0568: highRanges.append(min1);
0569: highRanges.append(max1);
0570: }
0571: }
0572: return highRanges.toString();
0573: }
0574:
// Returns (high surrogate, first low, last low) triples for the high surrogates that the
// given ranges cover only partially, i.e. those left out by highSurrogateRanges.
0575: static String lowSurrogateRanges(List ranges) {
0576: FastStringBuffer lowRanges = new FastStringBuffer(ranges
0577: .size() * 2);
0578: for (int i = 0, len = ranges.size(); i < len; i++) {
0579: Range r = (Range) ranges.get(i);
0580: char min1 = XMLChar.highSurrogate(r.getMin());
0581: char min2 = XMLChar.lowSurrogate(r.getMin());
0582: char max1 = XMLChar.highSurrogate(r.getMax());
0583: char max2 = XMLChar.lowSurrogate(r.getMax());
0584: if (min1 == max1) {
// The range lies under one high surrogate: one triple, unless it covers all low surrogates.
0585: if (min2 != SURROGATE2_MIN
0586: || max2 != SURROGATE2_MAX) {
0587: lowRanges.append(min1);
0588: lowRanges.append(min2);
0589: lowRanges.append(max2);
0590: }
0591: } else {
// The range spans several high surrogates: only its first and last can be partial.
0592: if (min2 != SURROGATE2_MIN) {
0593: lowRanges.append(min1);
0594: lowRanges.append(min2);
0595: lowRanges.append(SURROGATE2_MAX);
0596: }
0597: if (max2 != SURROGATE2_MAX) {
0598: lowRanges.append(max1);
0599: lowRanges.append(SURROGATE2_MIN);
0600: lowRanges.append(max2);
0601: }
0602: }
0603: }
0604: return lowRanges.toString();
0605: }
0606:
// Appends regex text matching the BMP part of this class.
0607: abstract void outputBmp(FastStringBuffer buf);
0608:
// Appends regex text matching the BMP characters that are NOT in this class.
0609: abstract void outputComplementBmp(FastStringBuffer buf);
0610:
// Returns the single character of a one-character class; -1 by default.
0611: int getSingleChar() {
0612: return -1;
0613: }
0614:
// Adds this class's non-BMP code points to the list as Range objects; nothing by default.
0615: void addNonBmpRanges(List ranges) {
0616: }
0617:
// Sorts the list of Range objects and merges overlapping or adjacent ranges, in place.
0618: static void sortRangeList(List ranges) {
0619: Collections.sort(ranges);
0620: int toIndex = 0;
0621: int fromIndex = 0;
0622: int len = ranges.size();
0623: while (fromIndex < len) {
0624: Range r = (Range) ranges.get(fromIndex);
0625: int min = r.getMin();
0626: int max = r.getMax();
// Absorb every following range that overlaps or directly touches [min, max].
0627: while (++fromIndex < len) {
0628: Range r2 = (Range) ranges.get(fromIndex);
0629: if (r2.getMin() > max + 1)
0630: break;
0631: if (r2.getMax() > max)
0632: max = r2.getMax();
0633: }
0634: if (max != r.getMax())
0635: r = new Range(min, max);
0636: ranges.set(toIndex++, r);
0637: }
// Drop the entries left over at the tail after merging.
0638: while (len > toIndex)
0639: ranges.remove(--len);
0640: }
0641:
0642: }
0643:
0644: static abstract class SimpleCharClass extends CharClass {
0645: SimpleCharClass(int containsBmp, int containsNonBmp) {
0646: super (containsBmp, containsNonBmp);
0647: }
0648:
0649: void outputBmp(FastStringBuffer buf) {
0650: buf.append('[');
0651: inClassOutputBmp(buf);
0652: buf.append(']');
0653: }
0654:
0655: // must not call if containsBmp == ALL
0656: void outputComplementBmp(FastStringBuffer buf) {
0657: if (getContainsBmp() == NONE)
0658: buf.append("[\u0000-\uFFFF]");
0659: else {
0660: buf.append("[^");
0661: inClassOutputBmp(buf);
0662: buf.append(']');
0663: }
0664: }
0665:
0666: abstract void inClassOutputBmp(FastStringBuffer buf);
0667: }
0668:
0669: static class SingleChar extends SimpleCharClass {
0670: private final char c;
0671:
0672: SingleChar(char c) {
0673: super (SOME, NONE);
0674: this .c = c;
0675: }
0676:
0677: int getSingleChar() {
0678: return c;
0679: }
0680:
0681: void outputBmp(FastStringBuffer buf) {
0682: inClassOutputBmp(buf);
0683: }
0684:
0685: void inClassOutputBmp(FastStringBuffer buf) {
0686: if (isJavaMetaChar(c)) {
0687: buf.append('\\');
0688: buf.append(c);
0689: } else {
0690: switch (c) {
0691: case '\r':
0692: buf.append("\\r");
0693: break;
0694: case '\n':
0695: buf.append("\\n");
0696: break;
0697: case '\t':
0698: buf.append("\\t");
0699: break;
0700: case ' ':
0701: buf.append("\\x20");
0702: break;
0703: default:
0704: buf.append(c);
0705: }
0706: }
0707: return;
0708: }
0709:
0710: }
0711:
0712: static class WideSingleChar extends SimpleCharClass {
0713: private final int c;
0714:
0715: WideSingleChar(int c) {
0716: super (NONE, SOME);
0717: this .c = c;
0718: }
0719:
0720: void inClassOutputBmp(FastStringBuffer buf) {
0721: throw new RuntimeException("BMP output botch");
0722: }
0723:
0724: int getSingleChar() {
0725: return c;
0726: }
0727:
0728: void addNonBmpRanges(List ranges) {
0729: ranges.add(new Range(c, c));
0730: }
0731: }
0732:
0733: static class Empty extends SimpleCharClass {
0734: private static final Empty instance = new Empty();
0735:
0736: private Empty() {
0737: super (NONE, NONE);
0738: }
0739:
0740: static Empty getInstance() {
0741: return instance;
0742: }
0743:
0744: void inClassOutputBmp(FastStringBuffer buf) {
0745: throw new RuntimeException("BMP output botch");
0746: }
0747:
0748: }
0749:
0750: static class CharRange extends SimpleCharClass {
0751: private final int lower;
0752: private final int upper;
0753:
0754: CharRange(int lower, int upper) {
0755: super (lower < NONBMP_MIN ? SOME : NONE,
0756: // don't use ALL here, because that requires that the BMP class contains high surrogates
0757: upper >= NONBMP_MIN ? SOME : NONE);
0758: this .lower = lower;
0759: this .upper = upper;
0760: }
0761:
0762: void inClassOutputBmp(FastStringBuffer buf) {
0763: if (lower >= NONBMP_MIN)
0764: throw new RuntimeException("BMP output botch");
0765: if (isJavaMetaChar((char) lower))
0766: buf.append('\\');
0767: buf.append((char) lower);
0768: buf.append('-');
0769: if (upper < NONBMP_MIN) {
0770: if (isJavaMetaChar((char) upper))
0771: buf.append('\\');
0772: buf.append((char) upper);
0773: } else
0774: buf.append('\uFFFF');
0775: }
0776:
0777: void addNonBmpRanges(List ranges) {
0778: if (upper >= NONBMP_MIN)
0779: ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN
0780: : lower, upper));
0781: }
0782: }
0783:
0784: static class Property extends SimpleCharClass {
0785: private final String name;
0786:
0787: Property(String name) {
0788: super (SOME, NONE);
0789: this .name = name;
0790: }
0791:
0792: void outputBmp(FastStringBuffer buf) {
0793: inClassOutputBmp(buf);
0794: }
0795:
0796: void inClassOutputBmp(FastStringBuffer buf) {
0797: buf.append("\\p{");
0798: buf.append(name);
0799: buf.append('}');
0800: }
0801:
0802: void outputComplementBmp(FastStringBuffer buf) {
0803: buf.append("\\P{");
0804: buf.append(name);
0805: buf.append('}');
0806: }
0807: }
0808:
0809: static class Subtraction extends CharClass {
0810: private final CharClass cc1;
0811: private final CharClass cc2;
0812:
0813: Subtraction(CharClass cc1, CharClass cc2) {
0814: // min corresponds to intersection
0815: // complement corresponds to negation
0816: super (
0817: Math.min(cc1.getContainsBmp(), -cc2
0818: .getContainsBmp()), Math.min(cc1
0819: .getContainsNonBmp(), -cc2
0820: .getContainsNonBmp()));
0821: this .cc1 = cc1;
0822: this .cc2 = cc2;
0823: }
0824:
0825: void outputBmp(FastStringBuffer buf) {
0826: buf.append('[');
0827: cc1.outputBmp(buf);
0828: buf.append("&&");
0829: cc2.outputComplementBmp(buf);
0830: buf.append(']');
0831: }
0832:
0833: void outputComplementBmp(FastStringBuffer buf) {
0834: buf.append('[');
0835: cc1.outputComplementBmp(buf);
0836: cc2.outputBmp(buf);
0837: buf.append(']');
0838: }
0839:
0840: void addNonBmpRanges(List ranges) {
0841: List posList = new Vector();
0842: cc1.addNonBmpRanges(posList);
0843: List negList = new Vector();
0844: cc2.addNonBmpRanges(negList);
0845: sortRangeList(posList);
0846: sortRangeList(negList);
0847: Iterator negIter = negList.iterator();
0848: Range negRange;
0849: if (negIter.hasNext())
0850: negRange = (Range) negIter.next();
0851: else
0852: negRange = null;
0853: for (int i = 0, len = posList.size(); i < len; i++) {
0854: Range posRange = (Range) posList.get(i);
0855: while (negRange != null
0856: && negRange.getMax() < posRange.getMin()) {
0857: if (negIter.hasNext())
0858: negRange = (Range) negIter.next();
0859: else
0860: negRange = null;
0861: }
0862: // if negRange != null, negRange.max >= posRange.min
0863: int min = posRange.getMin();
0864: while (negRange != null
0865: && negRange.getMin() <= posRange.getMax()) {
0866: if (min < negRange.getMin()) {
0867: ranges
0868: .add(new Range(min,
0869: negRange.getMin() - 1));
0870: }
0871: min = negRange.getMax() + 1;
0872: if (min > posRange.getMax())
0873: break;
0874: if (negIter.hasNext())
0875: negRange = (Range) negIter.next();
0876: else
0877: negRange = null;
0878: }
0879: if (min <= posRange.getMax())
0880: ranges.add(new Range(min, posRange.getMax()));
0881: }
0882: }
0883: }
0884:
0885: static class Union extends CharClass {
0886: private final List members;
0887:
0888: Union(CharClass[] v) {
0889: this (toList(v));
0890: }
0891:
0892: private static List toList(CharClass[] v) {
0893: List members = new Vector();
0894: for (int i = 0; i < v.length; i++)
0895: members.add(v[i]);
0896: return members;
0897: }
0898:
0899: Union(List members) {
0900: super (computeContainsBmp(members),
0901: computeContainsNonBmp(members));
0902: this .members = members;
0903: }
0904:
0905: void outputBmp(FastStringBuffer buf) {
0906: buf.append('[');
0907: for (int i = 0, len = members.size(); i < len; i++) {
0908: CharClass cc = (CharClass) members.get(i);
0909: if (cc.getContainsBmp() != NONE) {
0910: if (cc instanceof SimpleCharClass)
0911: ((SimpleCharClass) cc).inClassOutputBmp(buf);
0912: else
0913: cc.outputBmp(buf);
0914: }
0915: }
0916: buf.append(']');
0917: }
0918:
0919: void outputComplementBmp(FastStringBuffer buf) {
0920: boolean first = true;
0921: int len = members.size();
0922: for (int i = 0; i < len; i++) {
0923: CharClass cc = (CharClass) members.get(i);
0924: if (cc.getContainsBmp() != NONE
0925: && cc instanceof SimpleCharClass) {
0926: if (first) {
0927: buf.append("[^");
0928: first = false;
0929: }
0930: ((SimpleCharClass) cc).inClassOutputBmp(buf);
0931: }
0932: }
0933: for (int i = 0; i < len; i++) {
0934: CharClass cc = (CharClass) members.get(i);
0935: if (cc.getContainsBmp() != NONE
0936: && !(cc instanceof SimpleCharClass)) {
0937: if (first) {
0938: buf.append('[');
0939: first = false;
0940: } else
0941: buf.append("&&");
0942: // can't have any members that are ALL, because that would make this ALL, which violates
0943: // the precondition for outputComplementBmp
0944: cc.outputComplementBmp(buf);
0945: }
0946: }
0947: if (first == true)
0948: // all members are NONE, so this is NONE, so complement is everything
0949: buf.append("[\u0000-\uFFFF]");
0950: else
0951: buf.append(']');
0952: }
0953:
0954: void addNonBmpRanges(List ranges) {
0955: for (int i = 0, len = members.size(); i < len; i++)
0956: ((CharClass) members.get(i)).addNonBmpRanges(ranges);
0957: }
0958:
0959: private static int computeContainsBmp(List members) {
0960: int ret = NONE;
0961: for (int i = 0, len = members.size(); i < len; i++)
0962: ret = Math.max(ret, ((CharClass) members.get(i))
0963: .getContainsBmp());
0964: return ret;
0965: }
0966:
0967: private static int computeContainsNonBmp(List members) {
0968: int ret = NONE;
0969: for (int i = 0, len = members.size(); i < len; i++)
0970: ret = Math.max(ret, ((CharClass) members.get(i))
0971: .getContainsNonBmp());
0972: return ret;
0973: }
0974: }
0975:
0976: static class BackReference extends CharClass {
0977: private final int i;
0978:
0979: BackReference(int i) {
0980: super (SOME, NONE);
0981: this .i = i;
0982: }
0983:
0984: void outputBmp(FastStringBuffer buf) {
0985: inClassOutputBmp(buf);
0986: }
0987:
0988: void outputComplementBmp(FastStringBuffer buf) {
0989: inClassOutputBmp(buf);
0990: }
0991:
0992: void inClassOutputBmp(FastStringBuffer buf) {
0993: buf.append("\\" + i);
0994: }
0995: }
0996:
0997: /**
0998: * Thrown when an syntactically incorrect regular expression is detected.
0999: */
1000: public static class RegexSyntaxException extends Exception {
1001: private final int position;
1002:
1003: /**
1004: * Represents an unknown position within a string containing a regular expression.
1005: */
1006: public static final int UNKNOWN_POSITION = -1;
1007:
1008: public RegexSyntaxException(String detail) {
1009: this (detail, UNKNOWN_POSITION);
1010: }
1011:
1012: public RegexSyntaxException(String detail, int position) {
1013: super (detail);
1014: this .position = position;
1015: }
1016:
1017: /**
1018: * Returns the index into the regular expression where the error was detected
1019: * or <code>UNKNOWN_POSITION</code> if this is unknown.
1020: *
1021: * @return the index into the regular expression where the error was detected,
1022: * or <code>UNKNOWNN_POSITION</code> if this is unknown
1023: */
1024: public int getPosition() {
1025: return position;
1026: }
1027: }
1028:
1029: // public static class Localizer {
1030: // private final Class cls;
1031: // private ResourceBundle bundle;
1032: //
1033: // public Localizer(Class cls) {
1034: // this.cls = cls;
1035: // }
1036: //
1037: // public String message(String key) {
1038: // return MessageFormat.format(getBundle().getString(key), new Object[]{});
1039: // }
1040: //
1041: // public String message(String key, Object arg) {
1042: // return MessageFormat.format(getBundle().getString(key),
1043: // new Object[]{arg});
1044: // }
1045: //
1046: // public String message(String key, Object arg1, Object arg2) {
1047: // return MessageFormat.format(getBundle().getString(key),
1048: // new Object[]{arg1, arg2});
1049: // }
1050: //
1051: // public String message(String key, Object[] args) {
1052: // return MessageFormat.format(getBundle().getString(key), args);
1053: // }
1054: //
1055: // private ResourceBundle getBundle() {
1056: // if (bundle == null) {
1057: // String s = cls.getName();
1058: // int i = s.lastIndexOf('.');
1059: // if (i > 0)
1060: // s = s.substring(0, i + 1);
1061: // else
1062: // s = "";
1063: // bundle = ResourceBundle.getBundle(s + "resources.Messages");
1064: // }
1065: // return bundle;
1066: // }
1067: // }
1068:
1069: static class Complement extends CharClass {
1070: private final CharClass cc;
1071:
1072: Complement(CharClass cc) {
1073: super (-cc.getContainsBmp(), -cc.getContainsNonBmp());
1074: this .cc = cc;
1075: }
1076:
1077: void outputBmp(FastStringBuffer buf) {
1078: cc.outputComplementBmp(buf);
1079: }
1080:
1081: void outputComplementBmp(FastStringBuffer buf) {
1082: cc.outputBmp(buf);
1083: }
1084:
1085: void addNonBmpRanges(List ranges) {
1086: List tem = new Vector();
1087: cc.addNonBmpRanges(tem);
1088: sortRangeList(tem);
1089: int c = NONBMP_MIN;
1090: for (int i = 0, len = tem.size(); i < len; i++) {
1091: Range r = (Range) tem.get(i);
1092: if (r.getMin() > c)
1093: ranges.add(new Range(c, r.getMin() - 1));
1094: c = r.getMax() + 1;
1095: }
1096: if (c != NONBMP_MAX + 1)
1097: ranges.add(new Range(c, NONBMP_MAX));
1098: }
1099: }
1100:
1101: private boolean translateAtom() throws RegexSyntaxException {
1102: switch (curChar) {
1103: case EOS:
1104: if (!eos)
1105: break;
1106: // fall through
1107: case '?':
1108: case '*':
1109: case '+':
1110: case ')':
1111: case '{':
1112: case '}':
1113: case '|':
1114: case ']':
1115: return false;
1116: case '(':
1117: copyCurChar();
1118: translateRegExp();
1119: expect(')');
1120: copyCurChar();
1121: return true;
1122: case '\\':
1123: advance();
1124: parseEsc().output(result);
1125: return true;
1126: case '[':
1127: advance();
1128: parseCharClassExpr().output(result);
1129: return true;
1130: case '.':
1131: if (isXPath) {
1132: // Note: "." matches a surrogate pair under JDK 1.5, but not under JDK 1.4
1133: // We'll live with this problem until 1.4 goes away...
1134: break;
1135: } else {
1136: DOT.output(result);
1137: advance();
1138: return true;
1139: }
1140: case '$':
1141: case '^':
1142: if (isXPath) {
1143: copyCurChar();
1144: return true;
1145: }
1146: result.append('\\');
1147: break;
1148: }
1149: copyCurChar();
1150: return true;
1151: }
1152:
1153: static private CharClass makeCharClass(String categories,
1154: String includes, String excludeRanges) {
1155: List includeList = new Vector();
1156: for (int i = 0, len = categories.length(); i < len; i += 2)
1157: includeList
1158: .add(new Property(categories.substring(i, i + 2)));
1159: for (int i = 0, len = includes.length(); i < len; i++) {
1160: int j = i + 1;
1161: for (; j < len
1162: && includes.charAt(j) - includes.charAt(i) == j - i; j++)
1163: ;
1164: --j;
1165: if (i == j - 1)
1166: --j;
1167: if (i == j)
1168: includeList.add(new SingleChar(includes.charAt(i)));
1169: else
1170: includeList.add(new CharRange(includes.charAt(i),
1171: includes.charAt(j)));
1172: i = j;
1173: }
1174: List excludeList = new Vector();
1175: for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
1176: char min = excludeRanges.charAt(i);
1177: char max = excludeRanges.charAt(i + 1);
1178: if (min == max)
1179: excludeList.add(new SingleChar(min));
1180: else if (min == max - 1) {
1181: excludeList.add(new SingleChar(min));
1182: excludeList.add(new SingleChar(max));
1183: } else
1184: excludeList.add(new CharRange(min, max));
1185: }
1186: return new Subtraction(new Union(includeList), new Union(
1187: excludeList));
1188: }
1189:
1190: private CharClass parseEsc() throws RegexSyntaxException {
1191: switch (curChar) {
1192: case 'n':
1193: advance();
1194: return new SingleChar('\n');
1195: case 'r':
1196: advance();
1197: return new SingleChar('\r');
1198: case 't':
1199: advance();
1200: return new SingleChar('\t');
1201: case '\\':
1202: case '|':
1203: case '.':
1204: case '-':
1205: case '^':
1206: case '?':
1207: case '*':
1208: case '+':
1209: case '(':
1210: case ')':
1211: case '{':
1212: case '}':
1213: case '[':
1214: case ']':
1215: break;
1216: case 's':
1217: advance();
1218: return ESC_s;
1219: case 'S':
1220: advance();
1221: return ESC_S;
1222: case 'i':
1223: advance();
1224: return ESC_i;
1225: case 'I':
1226: advance();
1227: return ESC_I;
1228: case 'c':
1229: advance();
1230: return ESC_c;
1231: case 'C':
1232: advance();
1233: return ESC_C;
1234: case 'd':
1235: advance();
1236: return ESC_d;
1237: case 'D':
1238: advance();
1239: return ESC_D;
1240: case 'w':
1241: advance();
1242: return ESC_w;
1243: case 'W':
1244: advance();
1245: return ESC_W;
1246: case 'p':
1247: advance();
1248: return parseProp();
1249: case 'P':
1250: advance();
1251: return new Complement(parseProp());
1252: case '0':
1253: case '1':
1254: case '2':
1255: case '3':
1256: case '4':
1257: case '5':
1258: case '6':
1259: case '7':
1260: case '8':
1261: case '9':
1262: if (isXPath) {
1263: char c = curChar;
1264: advance();
1265: return new BackReference(c - '0');
1266: } else {
1267: throw makeException("digit not allowed after \\");
1268: }
1269: case '$':
1270: if (isXPath) {
1271: break;
1272: }
1273: // otherwise fall through
1274: default:
1275: throw makeException("invalid escape sequence");
1276: }
1277: CharClass tem = new SingleChar(curChar);
1278: advance();
1279: return tem;
1280: }
1281:
1282: private CharClass parseProp() throws RegexSyntaxException {
1283: expect('{');
1284: int start = pos;
1285: for (;;) {
1286: advance();
1287: if (curChar == '}')
1288: break;
1289: if (!isAsciiAlnum(curChar) && curChar != '-')
1290: expect('}');
1291: }
1292: String propertyName = regExp.subSequence(start, pos - 1)
1293: .toString();
1294: advance();
1295: switch (propertyName.length()) {
1296: case 0:
1297: throw makeException("empty property name");
1298: case 2:
1299: int sci = subCategories.indexOf(propertyName);
1300: if (sci < 0 || sci % 2 == 1)
1301: throw makeException("unknown category");
1302: return getSubCategoryCharClass(sci / 2);
1303: case 1:
1304: int ci = categories.indexOf(propertyName.charAt(0));
1305: if (ci < 0)
1306: throw makeException("unknown category", propertyName);
1307: return getCategoryCharClass(ci);
1308: default:
1309: if (!propertyName.startsWith("Is"))
1310: break;
1311: String blockName = propertyName.substring(2);
1312: for (int i = 0; i < specialBlockNames.length; i++)
1313: if (blockName.equals(specialBlockNames[i]))
1314: return specialBlockCharClasses[i];
1315: if (!isBlock(blockName))
1316: throw makeException("invalid block name", blockName);
1317: return new Property("In" + blockName);
1318: }
1319: throw makeException("invalid property name", propertyName);
1320: }
1321:
1322: private static boolean isBlock(String name) {
1323: for (int i = 0; i < blockNames.length; i++)
1324: if (name.equals(blockNames[i]))
1325: return true;
1326: return false;
1327: }
1328:
1329: private static boolean isAsciiAlnum(char c) {
1330: if ('a' <= c && c <= 'z')
1331: return true;
1332: if ('A' <= c && c <= 'Z')
1333: return true;
1334: if ('0' <= c && c <= '9')
1335: return true;
1336: return false;
1337: }
1338:
1339: private void expect(char c) throws RegexSyntaxException {
1340: if (curChar != c)
1341: throw makeException("expected",
1342: new String(new char[] { c }));
1343: }
1344:
1345: private CharClass parseCharClassExpr() throws RegexSyntaxException {
1346: boolean compl;
1347: if (curChar == '^') {
1348: advance();
1349: compl = true;
1350: } else
1351: compl = false;
1352: List members = new ArrayList(10);
1353: boolean first = true;
1354: do {
1355: CharClass lower = parseCharClassEscOrXmlChar(first);
1356: first = false;
1357: members.add(lower);
1358: if (curChar == '-') {
1359: advance();
1360: if (curChar == ']') { // MHK: [+-] is reallowed by Schema Oct 2004 2nd edition
1361: break;
1362: }
1363: if (curChar == '[') {
1364: break;
1365: }
1366: CharClass upper = parseCharClassEscOrXmlChar(first);
1367: if (lower.getSingleChar() < 0
1368: || upper.getSingleChar() < 0)
1369: throw makeException("multi_range");
1370: if (lower.getSingleChar() > upper.getSingleChar())
1371: throw makeException("invalid range (start > end)");
1372: members.set(members.size() - 1, new CharRange(lower
1373: .getSingleChar(), upper.getSingleChar()));
1374: if (curChar == '-') {
1375: advance();
1376: expect('[');
1377: break;
1378: }
1379: }
1380: } while (curChar != ']');
1381: CharClass result;
1382: if (members.size() == 1)
1383: result = (CharClass) members.get(0);
1384: else
1385: result = new Union(members);
1386: if (compl)
1387: result = new Complement(result);
1388: if (curChar == '[') {
1389: advance();
1390: result = new Subtraction(result, parseCharClassExpr());
1391: expect(']');
1392: }
1393: advance();
1394: return result;
1395: }
1396:
1397: private CharClass parseCharClassEscOrXmlChar(boolean first)
1398: throws RegexSyntaxException {
1399: switch (curChar) {
1400: case EOS:
1401: if (eos)
1402: expect(']');
1403: break;
1404: case '\\':
1405: advance();
1406: return parseEsc();
1407: case '[':
1408: case ']':
1409: throw makeException("character must be escaped",
1410: new String(new char[] { curChar }));
1411: case '-':
1412: if (!first) {
1413: throw makeException("character must be escaped",
1414: new String(new char[] { curChar }));
1415: }
1416: break;
1417: }
1418: CharClass tem;
1419: if (XMLChar.isSurrogate(curChar)) {
1420: if (!XMLChar.isHighSurrogate(curChar))
1421: throw makeException("invalid surrogate pair");
1422: char c1 = curChar;
1423: advance();
1424: if (!XMLChar.isLowSurrogate(curChar))
1425: throw makeException("invalid surrogate pair");
1426: tem = new WideSingleChar(XMLChar.supplemental(c1, curChar));
1427: } else
1428: tem = new SingleChar(curChar);
1429: advance();
1430: return tem;
1431: }
1432:
1433: private RegexSyntaxException makeException(String key) {
1434: return new RegexSyntaxException("Error at character "
1435: + (pos - 1) + " in regular expression: " + key);
1436: }
1437:
1438: private RegexSyntaxException makeException(String key, String arg) {
1439: return new RegexSyntaxException("Error at character "
1440: + (pos - 1) + " in regular expression: " + key + " ("
1441: + arg + ')');
1442: }
1443:
1444: private static boolean isJavaMetaChar(char c) {
1445: switch (c) {
1446: case '\\':
1447: case '^':
1448: case '?':
1449: case '*':
1450: case '+':
1451: case '(':
1452: case ')':
1453: case '{':
1454: case '}':
1455: case '|':
1456: case '[':
1457: case ']':
1458: case '-':
1459: case '&':
1460: case '$':
1461: case '.':
1462: return true;
1463: }
1464: return false;
1465: }
1466:
1467: private static synchronized CharClass getCategoryCharClass(int ci) {
1468: if (categoryCharClasses[ci] == null)
1469: categoryCharClasses[ci] = computeCategoryCharClass(categories
1470: .charAt(ci));
1471: return categoryCharClasses[ci];
1472: }
1473:
1474: private static synchronized CharClass getSubCategoryCharClass(
1475: int sci) {
1476: if (subCategoryCharClasses[sci] == null)
1477: subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories
1478: .substring(sci * 2, (sci + 1) * 2));
1479: return subCategoryCharClasses[sci];
1480: }
1481:
1482: private static final char UNICODE_3_1_ADD_Lu = '\u03F4'; // added in 3.1
1483: private static final char UNICODE_3_1_ADD_Ll = '\u03F5'; // added in 3.1
1484: // 3 characters changed from No to Nl between 3.0 and 3.1
1485: private static final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
1486: private static final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
1487: private static final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi
1488: private static final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf
1489:
1490: private static CharClass computeCategoryCharClass(char code) {
1491: List classes = new Vector();
1492: classes.add(new Property(new String(new char[] { code })));
1493: for (int ci = CATEGORY_NAMES.indexOf(code); ci >= 0; ci = CATEGORY_NAMES
1494: .indexOf(code, ci + 1)) {
1495: int[] addRanges = CATEGORY_RANGES[ci / 2];
1496: for (int i = 0; i < addRanges.length; i += 2)
1497: classes.add(new CharRange(addRanges[i],
1498: addRanges[i + 1]));
1499: }
1500: if (code == 'P')
1501: classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
1502: if (code == 'L') {
1503: classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1504: classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1505: }
1506: if (code == 'C') {
1507: // JDK 1.4 leaves Cn out of C?
1508: classes.add(new Subtraction(new Property("Cn"), new Union(
1509: new CharClass[] {
1510: new SingleChar(UNICODE_3_1_ADD_Lu),
1511: new SingleChar(UNICODE_3_1_ADD_Ll) })));
1512: List assignedRanges = new Vector();
1513: for (int i = 0; i < CATEGORY_RANGES.length; i++)
1514: for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2)
1515: assignedRanges.add(new CharRange(
1516: CATEGORY_RANGES[i][j],
1517: CATEGORY_RANGES[i][j + 1]));
1518: classes.add(new Subtraction(new CharRange(NONBMP_MIN,
1519: NONBMP_MAX), new Union(assignedRanges)));
1520: }
1521: if (classes.size() == 1)
1522: return (CharClass) classes.get(0);
1523: return new Union(classes);
1524: }
1525:
1526: private static CharClass computeSubCategoryCharClass(String name) {
1527: CharClass base = new Property(name);
1528: int sci = CATEGORY_NAMES.indexOf(name);
1529: if (sci < 0) {
1530: if (name.equals("Cn")) {
1531: // Unassigned
1532: List assignedRanges = new Vector();
1533: assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1534: assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1535: for (int i = 0; i < CATEGORY_RANGES.length; i++)
1536: for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2)
1537: assignedRanges.add(new CharRange(
1538: CATEGORY_RANGES[i][j],
1539: CATEGORY_RANGES[i][j + 1]));
1540: return new Subtraction(new Union(new CharClass[] {
1541: base, new CharRange(NONBMP_MIN, NONBMP_MAX) }),
1542: new Union(assignedRanges));
1543: }
1544: if (name.equals("Pi"))
1545: return makeCharClass(CATEGORY_Pi);
1546: if (name.equals("Pf"))
1547: return makeCharClass(CATEGORY_Pf);
1548: return base;
1549: }
1550: List classes = new Vector();
1551: classes.add(base);
1552: int[] addRanges = CATEGORY_RANGES[sci / 2];
1553: for (int i = 0; i < addRanges.length; i += 2)
1554: classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
1555: if (name.equals("Lu"))
1556: classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1557: else if (name.equals("Ll"))
1558: classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1559: else if (name.equals("Nl"))
1560: classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
1561: UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1562: else if (name.equals("No"))
1563: return new Subtraction(new Union(classes), new CharRange(
1564: UNICODE_3_1_CHANGE_No_to_Nl_MIN,
1565: UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1566: return new Union(classes);
1567: }
1568:
1569: private static CharClass makeCharClass(String members) {
1570: List list = new Vector();
1571: for (int i = 0, len = members.length(); i < len; i++)
1572: list.add(new SingleChar(members.charAt(i)));
1573: return new Union(list);
1574: }
1575:
1576: public static void main(String[] args) throws RegexSyntaxException {
1577: String s = translate(args[0], args[1].equals("xpath"));
1578: for (int i = 0, len = s.length(); i < len; i++) {
1579: char c = s.charAt(i);
1580: if (c >= 0x20 && c <= 0x7e)
1581: System.err.print(c);
1582: else {
1583: System.err.print("\\u");
1584: for (int shift = 12; shift >= 0; shift -= 4)
1585: System.err.print("0123456789ABCDEF"
1586: .charAt((c >> shift) & 0xF));
1587: }
1588: }
1589: System.err.println();
1590: }
1591:
1592: //}
1593:
1594: }
1595:
1596: //
1597: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
1598: // you may not use this file except in compliance with the License. You may obtain a copy of the
1599: // License at http://www.mozilla.org/MPL/
1600: //
1601: // Software distributed under the License is distributed on an "AS IS" basis,
1602: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
1603: // See the License for the specific language governing rights and limitations under the License.
1604: //
1605: // The Original Code is: all this file except changes marked.
1606: //
1607: // The Initial Developer of the Original Code is James Clark
1608: //
1609: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
1610: //
1611: // Contributor(s): Michael Kay
1612: //
|