001: /**
002: *
003: */package org.jruby;
004:
005: import jregex.MatchResult;
006: import jregex.Pattern;
007: import jregex.REFlags;
008: import jregex.Replacer;
009: import jregex.Substitution;
010: import jregex.TextBuffer;
011:
012: import org.jruby.parser.ReOptions;
013: import org.jruby.util.ByteList;
014:
015: public class RegexpTranslator {
016:
017: private static final Pattern SHARP_IN_CHARACTER_CLASS_PATTERN = new Pattern(
018: "(\\[[^]]*)#(.*?])");
019: private static final Pattern SPACE_IN_CHARACTER_CLASS_PATTERN = new Pattern(
020: "(\\[[^]]*) (.*?])");
021: private static final Pattern COMMENT_PATTERN = new Pattern(
022: "\\(\\?#[^)]*\\)");
023: private static final Pattern COMMENT2_PATTERN = new Pattern(
024: "(?<!\\\\)#.*");
025: private static final Pattern HEX_SINGLE_DIGIT_PATTERN = new Pattern(
026: "\\\\x(\\p{XDigit})(?!\\p{XDigit})");
027: private static final Pattern OCTAL_SINGLE_ZERO_PATTERN = new Pattern(
028: "\\\\(0)(?![0-7])");
029: private static final Pattern OCTAL_MISSING_ZERO_PATTERN = new Pattern(
030: "\\\\([1-7][0-7]{1,2})");
031: private static final Pattern POSIX_NAME = new Pattern(
032: "\\[:(\\w+):\\]");
033:
034: public Pattern translate(String regex, int options,
035: int javaRegexFlags) {
036: javaRegexFlags |= translateFlags(options);
037: regex = translatePattern(regex,
038: (javaRegexFlags & REFlags.IGNORE_SPACES) != 0);
039: return new Pattern(regex, javaRegexFlags);
040: }
041:
042: public Pattern translate(ByteList regex, int options,
043: int javaRegexFlags) {
044: javaRegexFlags |= translateFlags(options);
045: String regexString;
046: if ((options & ReOptions.RE_UNICODE) == 0) {
047: regexString = regex.toString();
048: } else {
049: regexString = regex.toUtf8String();
050: }
051: String newRegex = translatePattern(regexString,
052: (javaRegexFlags & REFlags.IGNORE_SPACES) != 0);
053: return new Pattern(newRegex, javaRegexFlags);
054: }
055:
056: public int flagsFor(int options, int javaRegexFlags) {
057: return javaRegexFlags | translateFlags(options);
058: }
059:
060: // We do not check for pathological case of [:foo:] outside of [] (bug 1475096).
061: private static String translatePosixPattern(String regex) {
062: Substitution posix = new Substitution() {
063: public void appendSubstitution(MatchResult match,
064: TextBuffer dest) {
065: String value = match.group(1);
066: if ("alnum".equals(value)) {
067: dest.append("\\p{Alnum}");
068: } else if ("alpha".equals(value)) {
069: dest.append("\\p{Alpha}");
070: } else if ("blank".equals(value)) {
071: dest.append("\\p{Blank}");
072: } else if ("cntrl".equals(value)) {
073: dest.append("\\p{Cntrl}");
074: } else if ("digit".equals(value)) {
075: dest.append("\\p{Digit}");
076: } else if ("graph".equals(value)) {
077: dest.append("\\p{Graph}");
078: } else if ("lower".equals(value)) {
079: dest.append("\\p{Lower}");
080: } else if ("print".equals(value)) {
081: dest.append("\\p{Print}");
082: } else if ("punct".equals(value)) {
083: dest.append("\\p{Punct}");
084: } else if ("space".equals(value)) {
085: dest.append("\\p{Space}");
086: } else if ("upper".equals(value)) {
087: dest.append("\\p{Upper}");
088: } else if ("xdigit".equals(value)) {
089: dest.append("\\p{XDigit}");
090: } else {
091: dest.append("\\[:" + value + ":\\]");
092: }
093: }
094: };
095: Replacer r = POSIX_NAME.replacer(posix);
096: return r.replace(regex);
097: }
098:
099: public static String translatePattern(String regex,
100: boolean commentsAllowed) {
101: regex = COMMENT_PATTERN.replacer("").replace(regex);
102: regex = translatePosixPattern(regex);
103: regex = HEX_SINGLE_DIGIT_PATTERN.replacer("\\\\" + "x0$1")
104: .replace(regex);
105: regex = OCTAL_SINGLE_ZERO_PATTERN.replacer("\\\\" + "0$1")
106: .replace(regex);
107: regex = OCTAL_MISSING_ZERO_PATTERN.replacer("\\\\" + "0$1")
108: .replace(regex);
109: if (commentsAllowed) {
110: regex = SPACE_IN_CHARACTER_CLASS_PATTERN.replacer(
111: "$1\\\\x20$2").replace(regex);
112: regex = SHARP_IN_CHARACTER_CLASS_PATTERN.replacer(
113: "$1\\\\x23$2").replace(regex);
114: regex = COMMENT2_PATTERN.replacer("").replace(regex);
115: }
116: return regex;
117: }
118:
119: public static int translateFlags(int options) {
120: int flags = REFlags.MULTILINE;
121: if ((options & ReOptions.RE_OPTION_IGNORECASE) > 0) {
122: flags |= REFlags.IGNORE_CASE;
123: }
124: if ((options & ReOptions.RE_OPTION_EXTENDED) > 0) {
125: flags |= REFlags.IGNORE_SPACES;
126: }
127: if ((options & ReOptions.RE_OPTION_MULTILINE) > 0) {
128: flags |= REFlags.DOTALL;
129: }
130: // FIXME: This may be useful for something, but doesn't appear to be right
131: // for Ruby. It turns \w, \s, etc into Unicode forms, but that appears to
132: // break some test cases for us
133: //if ((options & ReOptions.RE_UNICODE) > 0) {
134: // flags |= REFlags.UNICODE;
135: //}
136: return flags;
137: }
138: }
|