0001: /*
0002: * Licensed to the Apache Software Foundation (ASF) under one or more
0003: * contributor license agreements. See the NOTICE file distributed with
0004: * this work for additional information regarding copyright ownership.
0005: * The ASF licenses this file to You under the Apache License, Version 2.0
0006: * (the "License"); you may not use this file except in compliance with
0007: * the License. You may obtain a copy of the License at
0008: *
0009: * http://www.apache.org/licenses/LICENSE-2.0
0010: *
0011: * Unless required by applicable law or agreed to in writing, software
0012: * distributed under the License is distributed on an "AS IS" BASIS,
0013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014: * See the License for the specific language governing permissions and
0015: * limitations under the License.
0016: */
0017:
0018: package org.apache.xerces.impl.xpath.regex;
0019:
0020: import java.text.CharacterIterator;
0021:
0022: /**
0023: * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
0024: * This engine does not conform to the POSIX regular expression.
0025: *
0026: * <hr width="50%">
0027: * <h3>How to use</h3>
0028: *
0029: * <dl>
0030: * <dt>A. Standard way
0031: * <dd>
0032: * <pre>
0033: * RegularExpression re = new RegularExpression(<var>regex</var>);
0034: * if (re.matches(text)) { ... }
0035: * </pre>
0036: *
0037: * <dt>B. Capturing groups
0038: * <dd>
0039: * <pre>
0040: * RegularExpression re = new RegularExpression(<var>regex</var>);
0041: * Match match = new Match();
0042: * if (re.matches(text, match)) {
0043: * ... // You can refer captured texts with methods of the <code>Match</code> class.
0044: * }
0045: * </pre>
0046: *
0047: * </dl>
0048: *
0049: * <h4>Case-insensitive matching</h4>
0050: * <pre>
0051: * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
0052: * if (re.matches(text)) { ... }
0053: * </pre>
0054: *
0055: * <h4>Options</h4>
0056: * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
0057: * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
0058: * This <var>options</var> parameter consists of the following characters.
0059: * </p>
0060: * <dl>
0061: * <dt><a name="I_OPTION"><code>"i"</code></a>
0062: * <dd>This option indicates case-insensitive matching.
0063: * <dt><a name="M_OPTION"><code>"m"</code></a>
0064: * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
0065: * <dt><a name="S_OPTION"><code>"s"</code></a>
0066: * <dd class="REGEX"><kbd>.</kbd> matches any one character.
0067: * <dt><a name="U_OPTION"><code>"u"</code></a>
0068: * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode.
0069: * <dt><a name="W_OPTION"><code>"w"</code></a>
0070: * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
0071: * 'Unicode Regular Expression Guidelines' Revision 4.
0072: * When "w" and "u" are specified at the same time,
0073: * <kbd>\b \B \< \></kbd> are processed for the "w" option.
0074: * <dt><a name="COMMA_OPTION"><code>","</code></a>
0075: * <dd>The parser treats a comma in a character class as a range separator.
0076: * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
0077: * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
0078: *
0079: * <dt><a name="X_OPTION"><code>"X"</code></a>
0080: * <dd class="REGEX">
0081: * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
0082: * The <code>matches()</code> method does not do substring matching
0083: * but entire string matching.
0084: *
0085: * </dl>
0086: *
0087: * <hr width="50%">
0088: * <h3>Syntax</h3>
0089: * <table border="1" bgcolor="#ddeeff">
0090: * <tr>
0091: * <td>
0092: * <h4>Differences from the Perl 5 regular expression</h4>
0093: * <ul>
0094: * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
0095: * <li>Supports subtraction, union, and intersection operations for character classes.
0096: * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
0097: * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
0098: * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
0099: * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
0100: * <Kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{</kbd><var>code</var><kbd>})</kbd>
0101: * </ul>
0102: * </td>
0103: * </tr>
0104: * </table>
0105: *
0106: * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
0107: * <ul>
0108: * <li>Character
0109: * <dl>
0110: * <dt class="REGEX"><kbd>.</kbd> (A period)
0111: * <dd>Matches any one character except the following characters.
0112: * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
0113: * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
0114: * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
0115: * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
0116: * it matches any character including the above four characters.
0117: *
0118: * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
0119: * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
0120: * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
0121: *
0122: * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
0123: * <dd>Matches a control character.
0124: * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
0125: * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
0126: * It matches a control character of which the character code is less than
0127: * the character code of the <var>C</var> by 0x0040.
0128: * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
0129: * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
0130: *
0131: * <dt class="REGEX">a non-meta character
0132: * <dd>Matches the character.
0133: *
0134: * <dt class="REGEX"><KBD>\</KBD> + a meta character
0135: * <dd>Matches the meta character.
0136: *
0137: * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
0138: * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
0139: * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
0140: * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
0141: *
0142: * <!--
0143: * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
0144: * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
0145: * -->
0146: *
0147: * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
0148: * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
0149: *
0150: * <dt class="REGEX"><kbd>\g</kbd>
0151: * <dd>Matches a grapheme.
0152: * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
0153: *
0154: * <dt class="REGEX"><kbd>\X</kbd>
0155: * <dd class="REGEX">Matches a combining character sequence.
0156: * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
0157: * </dl>
0158: * </li>
0159: *
0160: * <li>Character class
0161: * <dl>
0162: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
0163: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
0164: * <dd>Positive character class. It matches a character in ranges.
0165: * <dd><var>R<sub>n</sub></var>:
0166: * <ul>
0167: * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
0168: * <p>This range matches the character.
0169: * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
0170: * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
0171: * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
0172: * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
0173: * <p>...
0174: * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
0175: * <p>These expressions specify the same ranges as the following expressions.
0176: * </ul>
0177: * <p class="REGEX">Enumerated ranges are merged (union operation).
0178: * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
0179: *
0180: * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
0181: * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
0182: * <dd>Negative character class. It matches a character not in ranges.
0183: *
0184: * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
0185: * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
0186: * <dd>Subtraction or union or intersection for character classes.
0187: * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
0188: * <dd>The result of these operations is a <u>positive character class</u>
0189: * even if an expression includes any negative character classes.
0190: * You have to take care on this in case-insensitive matching.
0191: * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
0192: * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
0193: * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
0194: * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
0195: * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
0196: *
0197: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
0198: * <dd>Character class subtraction for the XML Schema.
0199: * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
0200: *
0201: * <dt class="REGEX"><kbd>\d</kbd>
0202: * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
0203: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0204: * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
0205: *
0206: * <dt class="REGEX"><kbd>\D</kbd>
0207: * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
0208: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0209: * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
0210: *
0211: * <dt class="REGEX"><kbd>\s</kbd>
0212: * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
0213: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0214: * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
0215: *
0216: * <dt class="REGEX"><kbd>\S</kbd>
0217: * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
0218: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0219: * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
0220: *
0221: * <dt class="REGEX"><kbd>\w</kbd>
0222: * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
0223: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0224: * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
0225: *
0226: * <dt class="REGEX"><kbd>\W</kbd>
0227: * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
0228: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0229: * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
0230: *
0231: * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
0232: * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
0233: * The following names are available:
0234: * <dl>
0235: * <dt>Unicode General Categories:
0236: * <dd><kbd>
0237: * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
0238: * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
0239: * </kbd>
0240: * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
0241: * <dt>Unicode Blocks:
0242: * <dd><kbd>
0243: * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
0244: * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
0245: * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
0246: * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
0247: * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
0248: * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
0249: * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
0250: * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
0251: * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
0252: * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
0253: * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
0254: * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
0255: * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
0256: * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
0257: * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
0258: * Small Form Variants, Arabic Presentation Forms-B, Specials,
0259: * Halfwidth and Fullwidth Forms
0260: * </kbd>
0261: * <dt>Others:
0262: * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
0263: * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
0264: * <dd><kbd>UNASSIGNED</kbd>
0265: * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
0266: * </dl>
0267: *
0268: * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
0269: * <dd>Matches one character not in the specified General Category or the specified Block.
0270: * </dl>
0271: * </li>
0272: *
0273: * <li>Selection and Quantifier
0274: * <dl>
0275: * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
0276: * <dd>...
0277: *
0278: * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
0279: * <dd>Matches 0 or more <var>X</var>.
0280: *
0281: * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
0282: * <dd>Matches 1 or more <var>X</var>.
0283: *
0284: * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
0285: * <dd>Matches 0 or 1 <var>X</var>.
0286: *
0287: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
0288: * <dd>Matches <var>number</var> times.
0289: *
0290: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
0291: * <dd>...
0292: *
0293: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
0294: * <dd>...
0295: *
0296: * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
0297: * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
0298: * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
0299: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
0300: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
0301: * <dd>Non-greedy matching.
0302: * </dl>
0303: * </li>
0304: *
0305: * <li>Grouping, Capturing, and Back-reference
0306: * <dl>
0307: * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
0308: * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
0309: * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
0310: * you have to write "<KBD>(?:foo)+</KBD>".
0311: *
0312: * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
0313: * <dd>Grouping with capturing.
0314: * It makes a group, and applications can know
0315: * where in target text a group matched with methods of a <code>Match</code> instance
0316: * after <code><a href="#matches(java.lang.String, org.apache.xerces.impl.xpath.regex.Match)">matches(String,Match)</a></code>.
0317: * The 0th group means whole of this regular expression.
0318: * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
0319: *
0320: * <p>For instance, a regular expression is
0321: * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
0322: * and target text is
0323: * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
0324: * <ul>
0325: * <li><code>Match.getCapturedText(0)</code>:
0326: * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
0327: * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
0328: * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
0329: * </ul>
0330: *
0331: * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
0332: * <dd>
0333: *
0334: * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
0335: * <dd>Independent expression group. ................
0336: *
0337: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
0338: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
0339: * <dd>............................
0340: * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
0341: * Note that it can not contain 'u'.
0342: *
0343: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
0344: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
0345: * <dd>......
0346: * <dd>These expressions must be at the beginning of a group.
0347: * </dl>
0348: * </li>
0349: *
0350: * <li>Anchor
0351: * <dl>
0352: * <dt class="REGEX"><kbd>\A</kbd>
0353: * <dd>Matches the beginning of the text.
0354: *
0355: * <dt class="REGEX"><kbd>\Z</kbd>
0356: * <dd>Matches the end of the text, or before an EOL character at the end of the text,
0357: * or CARRIAGE RETURN + LINE FEED at the end of the text.
0358: *
0359: * <dt class="REGEX"><kbd>\z</kbd>
0360: * <dd>Matches the end of the text.
0361: *
0362: * <dt class="REGEX"><kbd>^</kbd>
0363: * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
0364: * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
0365: * it matches the beginning of the text, or after one of EOL characters (
0366: * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
0367: * PARAGRAPH SEPARATOR (U+2029).)
0368: *
0369: * <dt class="REGEX"><kbd>$</kbd>
0370: * <dd>Matches the end of the text, or before an EOL character at the end of the text,
0371: * or CARRIAGE RETURN + LINE FEED at the end of the text.
0372: * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
0373: * it matches the end of the text, or before an EOL character.
0374: *
0375: * <dt class="REGEX"><kbd>\b</kbd>
0376: * <dd>Matches word boundary.
0377: * (See <a href="#W_OPTION">a "w" option</a>)
0378: *
0379: * <dt class="REGEX"><kbd>\B</kbd>
0380: * <dd>Matches non word boundary.
0381: * (See <a href="#W_OPTION">a "w" option</a>)
0382: *
0383: * <dt class="REGEX"><kbd>\<</kbd>
0384: * <dd>Matches the beginning of a word.
0385: * (See <a href="#W_OPTION">a "w" option</a>)
0386: *
0387: * <dt class="REGEX"><kbd>\></kbd>
0388: * <dd>Matches the end of a word.
0389: * (See <a href="#W_OPTION">a "w" option</a>)
0390: * </dl>
0391: * </li>
0392: * <li>Lookahead and lookbehind
0393: * <dl>
0394: * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
0395: * <dd>Lookahead.
0396: *
0397: * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
0398: * <dd>Negative lookahead.
0399: *
0400: * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
0401: * <dd>Lookbehind.
0402: * <dd>(Note for text capturing......)
0403: *
0404: * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
0405: * <dd>Negative lookbehind.
0406: * </dl>
0407: * </li>
0408: *
0409: * <li>Misc.
0410: * <dl>
0411: * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
0412: * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
0413: * <dd>......
0414: * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
0415: * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
0416: * You can not write comments in character classes and before quantifiers.
0417: * </dl>
0418: * </li>
0419: * </ul>
0420: *
0421: *
0422: * <hr width="50%">
0423: * <h3>BNF for the regular expression</h3>
0424: * <pre>
0425: * regex ::= ('(?' options ')')? term ('|' term)*
0426: * term ::= factor+
0427: * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
0428: * | '(?#' [^)]* ')'
0429: * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
0430: * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0431: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
0432: * | '(?>' regex ')' | '(?' options ':' regex ')'
0433: * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
0434: * options ::= [imsw]* ('-' [imsw]+)?
0435: * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0436: * looks ::= '(?=' regex ')' | '(?!' regex ')'
0437: * | '(?<=' regex ')' | '(?<!' regex ')'
0438: * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
0439: * category-block ::= '\' [pP] category-symbol-1
0440: * | ('\p{' | '\P{') (category-symbol | block-name
0441: * | other-properties) '}'
0442: * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
0443: * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
0444: * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
0445: * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
0446: * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
0447: * | 'Sm' | 'Sc' | 'Sk' | 'So'
0448: * block-name ::= (See above)
0449: * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
0450: * character-1 ::= (any character except meta-characters)
0451: *
0452: * char-class ::= '[' ranges ']'
0453: * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
0454: * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
0455: * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
0456: * | range-char | range-char '-' range-char
0457: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
0458: * code-point ::= '\x' hex-char hex-char
0459: * | '\x{' hex-char+ '}'
0460: * <!-- | '\u005c u' hex-char hex-char hex-char hex-char
0461: * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
0462: * hex-char ::= [0-9a-fA-F]
0463: * character-2 ::= (any character except \[]-,)
0464: * </pre>
0465: *
0466: * <hr width="50%">
0467: * <h3>TODO</h3>
0468: * <ul>
0469: * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
0470: * <ul>
0471: * <li>2.4 Canonical Equivalents
0472: * <li>Level 3
0473: * </ul>
0474: * <li>Parsing performance
0475: * </ul>
0476: *
0477: * <hr width="50%">
0478: *
0479: * @xerces.internal
0480: *
0481: * @author TAMURA Kent <kent@trl.ibm.co.jp>
0482: * @version $Id: RegularExpression.java 446721 2006-09-15 20:35:34Z mrglavas $
0483: */
0484: public class RegularExpression implements java.io.Serializable {
0485:
0486: private static final long serialVersionUID = 6242499334195006401L;
0487:
0488: static final boolean DEBUG = false;
0489:
0490: /**
0491: * Compiles a token tree into an operation flow.
0492: */
0493: private synchronized void compile(Token tok) {
0494: if (this .operations != null)
0495: return;
0496: this .numberOfClosures = 0;
0497: this .operations = this .compile(tok, null, false);
0498: }
0499:
0500: /**
0501: * Converts a token to an operation.
0502: */
0503: private Op compile(Token tok, Op next, boolean reverse) {
0504: Op ret;
0505: switch (tok.type) {
0506: case Token.DOT:
0507: ret = Op.createDot();
0508: ret.next = next;
0509: break;
0510:
0511: case Token.CHAR:
0512: ret = Op.createChar(tok.getChar());
0513: ret.next = next;
0514: break;
0515:
0516: case Token.ANCHOR:
0517: ret = Op.createAnchor(tok.getChar());
0518: ret.next = next;
0519: break;
0520:
0521: case Token.RANGE:
0522: case Token.NRANGE:
0523: ret = Op.createRange(tok);
0524: ret.next = next;
0525: break;
0526:
0527: case Token.CONCAT:
0528: ret = next;
0529: if (!reverse) {
0530: for (int i = tok.size() - 1; i >= 0; i--) {
0531: ret = compile(tok.getChild(i), ret, false);
0532: }
0533: } else {
0534: for (int i = 0; i < tok.size(); i++) {
0535: ret = compile(tok.getChild(i), ret, true);
0536: }
0537: }
0538: break;
0539:
0540: case Token.UNION:
0541: Op.UnionOp uni = Op.createUnion(tok.size());
0542: for (int i = 0; i < tok.size(); i++) {
0543: uni.addElement(compile(tok.getChild(i), next, reverse));
0544: }
0545: ret = uni; // ret.next is null.
0546: break;
0547:
0548: case Token.CLOSURE:
0549: case Token.NONGREEDYCLOSURE:
0550: Token child = tok.getChild(0);
0551: int min = tok.getMin();
0552: int max = tok.getMax();
0553: if (min >= 0 && min == max) { // {n}
0554: ret = next;
0555: for (int i = 0; i < min; i++) {
0556: ret = compile(child, ret, reverse);
0557: }
0558: break;
0559: }
0560: if (min > 0 && max > 0)
0561: max -= min;
0562: if (max > 0) {
0563: // X{2,6} -> XX(X(X(XX?)?)?)?
0564: ret = next;
0565: for (int i = 0; i < max; i++) {
0566: Op.ChildOp q = Op
0567: .createQuestion(tok.type == Token.NONGREEDYCLOSURE);
0568: q.next = next;
0569: q.setChild(compile(child, ret, reverse));
0570: ret = q;
0571: }
0572: } else {
0573: Op.ChildOp op;
0574: if (tok.type == Token.NONGREEDYCLOSURE) {
0575: op = Op.createNonGreedyClosure();
0576: } else { // Token.CLOSURE
0577: if (child.getMinLength() == 0)
0578: op = Op.createClosure(this .numberOfClosures++);
0579: else
0580: op = Op.createClosure(-1);
0581: }
0582: op.next = next;
0583: op.setChild(compile(child, op, reverse));
0584: ret = op;
0585: }
0586: if (min > 0) {
0587: for (int i = 0; i < min; i++) {
0588: ret = compile(child, ret, reverse);
0589: }
0590: }
0591: break;
0592:
0593: case Token.EMPTY:
0594: ret = next;
0595: break;
0596:
0597: case Token.STRING:
0598: ret = Op.createString(tok.getString());
0599: ret.next = next;
0600: break;
0601:
0602: case Token.BACKREFERENCE:
0603: ret = Op.createBackReference(tok.getReferenceNumber());
0604: ret.next = next;
0605: break;
0606:
0607: case Token.PAREN:
0608: if (tok.getParenNumber() == 0) {
0609: ret = compile(tok.getChild(0), next, reverse);
0610: } else if (reverse) {
0611: next = Op.createCapture(tok.getParenNumber(), next);
0612: next = compile(tok.getChild(0), next, reverse);
0613: ret = Op.createCapture(-tok.getParenNumber(), next);
0614: } else {
0615: next = Op.createCapture(-tok.getParenNumber(), next);
0616: next = compile(tok.getChild(0), next, reverse);
0617: ret = Op.createCapture(tok.getParenNumber(), next);
0618: }
0619: break;
0620:
0621: case Token.LOOKAHEAD:
0622: ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok
0623: .getChild(0), null, false));
0624: break;
0625: case Token.NEGATIVELOOKAHEAD:
0626: ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok
0627: .getChild(0), null, false));
0628: break;
0629: case Token.LOOKBEHIND:
0630: ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok
0631: .getChild(0), null, true));
0632: break;
0633: case Token.NEGATIVELOOKBEHIND:
0634: ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(
0635: tok.getChild(0), null, true));
0636: break;
0637:
0638: case Token.INDEPENDENT:
0639: ret = Op.createIndependent(next, compile(tok.getChild(0),
0640: null, reverse));
0641: break;
0642:
0643: case Token.MODIFIERGROUP:
0644: ret = Op.createModifier(next, compile(tok.getChild(0),
0645: null, reverse), ((Token.ModifierToken) tok)
0646: .getOptions(), ((Token.ModifierToken) tok)
0647: .getOptionsMask());
0648: break;
0649:
0650: case Token.CONDITION:
0651: Token.ConditionToken ctok = (Token.ConditionToken) tok;
0652: int ref = ctok.refNumber;
0653: Op condition = ctok.condition == null ? null : compile(
0654: ctok.condition, null, reverse);
0655: Op yes = compile(ctok.yes, next, reverse);
0656: Op no = ctok.no == null ? null : compile(ctok.no, next,
0657: reverse);
0658: ret = Op.createCondition(next, ref, condition, yes, no);
0659: break;
0660:
0661: default:
0662: throw new RuntimeException("Unknown token type: "
0663: + tok.type);
0664: } // switch (tok.type)
0665: return ret;
0666: }
0667:
0668: //Public
0669:
0670: /**
0671: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
0672: *
0673: * @return true if the target is matched to this regular expression.
0674: */
0675: public boolean matches(char[] target) {
0676: return this .matches(target, 0, target.length, (Match) null);
0677: }
0678:
0679: /**
0680: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
0681: * in specified range or not.
0682: *
0683: * @param start Start offset of the range.
0684: * @param end End offset +1 of the range.
0685: * @return true if the target is matched to this regular expression.
0686: */
0687: public boolean matches(char[] target, int start, int end) {
0688: return this .matches(target, start, end, (Match) null);
0689: }
0690:
0691: /**
0692: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
0693: *
0694: * @param match A Match instance for storing matching result.
0695: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
0696: */
0697: public boolean matches(char[] target, Match match) {
0698: return this .matches(target, 0, target.length, match);
0699: }
0700:
0701: /**
0702: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
0703: * in specified range or not.
0704: *
0705: * @param start Start offset of the range.
0706: * @param end End offset +1 of the range.
0707: * @param match A Match instance for storing matching result.
0708: * @return true if the target is matched to this regular expression.
0709: */
0710: public boolean matches(char[] target, int start, int end,
0711: Match match) {
0712:
0713: synchronized (this ) {
0714: if (this .operations == null)
0715: this .prepare();
0716: if (this .context == null)
0717: this .context = new Context();
0718: }
0719: Context con = null;
0720: synchronized (this .context) {
0721: con = this .context.inuse ? new Context() : this .context;
0722: con.reset(target, start, end, this .numberOfClosures);
0723: }
0724: if (match != null) {
0725: match.setNumberOfGroups(this .nofparen);
0726: match.setSource(target);
0727: } else if (this .hasBackReferences) {
0728: match = new Match();
0729: match.setNumberOfGroups(this .nofparen);
0730: // Need not to call setSource() because
0731: // a caller can not access this match instance.
0732: }
0733: con.match = match;
0734:
0735: if (RegularExpression.isSet(this .options, XMLSCHEMA_MODE)) {
0736: int matchEnd = this .matchCharArray(con, this .operations,
0737: con.start, 1, this .options);
0738: //System.err.println("DEBUG: matchEnd="+matchEnd);
0739: if (matchEnd == con.limit) {
0740: if (con.match != null) {
0741: con.match.setBeginning(0, con.start);
0742: con.match.setEnd(0, matchEnd);
0743: }
0744: con.inuse = false;
0745: return true;
0746: }
0747: return false;
0748: }
0749:
0750: /*
0751: * The pattern has only fixed string.
0752: * The engine uses Boyer-Moore.
0753: */
0754: if (this .fixedStringOnly) {
0755: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
0756: int o = this .fixedStringTable.matches(target, con.start,
0757: con.limit);
0758: if (o >= 0) {
0759: if (con.match != null) {
0760: con.match.setBeginning(0, o);
0761: con.match.setEnd(0, o + this .fixedString.length());
0762: }
0763: con.inuse = false;
0764: return true;
0765: }
0766: con.inuse = false;
0767: return false;
0768: }
0769:
0770: /*
0771: * The pattern contains a fixed string.
0772: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
0773: * If not, it return with false.
0774: */
0775: if (this .fixedString != null) {
0776: int o = this .fixedStringTable.matches(target, con.start,
0777: con.limit);
0778: if (o < 0) {
0779: //System.err.println("Non-match in fixed-string search.");
0780: con.inuse = false;
0781: return false;
0782: }
0783: }
0784:
0785: int limit = con.limit - this .minlength;
0786: int matchStart;
0787: int matchEnd = -1;
0788:
0789: /*
0790: * Checks whether the expression starts with ".*".
0791: */
0792: if (this .operations != null
0793: && this .operations.type == Op.CLOSURE
0794: && this .operations.getChild().type == Op.DOT) {
0795: if (isSet(this .options, SINGLE_LINE)) {
0796: matchStart = con.start;
0797: matchEnd = this .matchCharArray(con, this .operations,
0798: con.start, 1, this .options);
0799: } else {
0800: boolean previousIsEOL = true;
0801: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0802: int ch = target[matchStart];
0803: if (isEOLChar(ch)) {
0804: previousIsEOL = true;
0805: } else {
0806: if (previousIsEOL) {
0807: if (0 <= (matchEnd = this .matchCharArray(
0808: con, this .operations, matchStart,
0809: 1, this .options)))
0810: break;
0811: }
0812: previousIsEOL = false;
0813: }
0814: }
0815: }
0816: }
0817:
0818: /*
0819: * Optimization against the first character.
0820: */
0821: else if (this .firstChar != null) {
0822: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
0823: RangeToken range = this .firstChar;
0824: if (RegularExpression.isSet(this .options, IGNORE_CASE)) {
0825: range = this .firstChar.getCaseInsensitiveToken();
0826: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0827: int ch = target[matchStart];
0828: if (REUtil.isHighSurrogate(ch)
0829: && matchStart + 1 < con.limit) {
0830: ch = REUtil.composeFromSurrogates(ch,
0831: target[matchStart + 1]);
0832: if (!range.match(ch))
0833: continue;
0834: } else {
0835: if (!range.match(ch)) {
0836: char ch1 = Character.toUpperCase((char) ch);
0837: if (!range.match(ch1))
0838: if (!range.match(Character
0839: .toLowerCase(ch1)))
0840: continue;
0841: }
0842: }
0843: if (0 <= (matchEnd = this .matchCharArray(con,
0844: this .operations, matchStart, 1,
0845: this .options)))
0846: break;
0847: }
0848: } else {
0849: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0850: int ch = target[matchStart];
0851: if (REUtil.isHighSurrogate(ch)
0852: && matchStart + 1 < con.limit)
0853: ch = REUtil.composeFromSurrogates(ch,
0854: target[matchStart + 1]);
0855: if (!range.match(ch))
0856: continue;
0857: if (0 <= (matchEnd = this .matchCharArray(con,
0858: this .operations, matchStart, 1,
0859: this .options)))
0860: break;
0861: }
0862: }
0863: }
0864:
0865: /*
0866: * Straightforward matching.
0867: */
0868: else {
0869: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0870: if (0 <= (matchEnd = this .matchCharArray(con,
0871: this .operations, matchStart, 1, this .options)))
0872: break;
0873: }
0874: }
0875:
0876: if (matchEnd >= 0) {
0877: if (con.match != null) {
0878: con.match.setBeginning(0, matchStart);
0879: con.match.setEnd(0, matchEnd);
0880: }
0881: con.inuse = false;
0882: return true;
0883: } else {
0884: con.inuse = false;
0885: return false;
0886: }
0887: }
0888:
0889: /**
0890: * @return -1 when not match; offset of the end of matched string when match.
0891: */
0892: private int matchCharArray(Context con, Op op, int offset, int dx,
0893: int opts) {
0894:
0895: char[] target = con.charTarget;
0896:
0897: while (true) {
0898: if (op == null)
0899: return isSet(opts, XMLSCHEMA_MODE)
0900: && offset != con.limit ? -1 : offset;
0901: if (offset > con.limit || offset < con.start)
0902: return -1;
0903: switch (op.type) {
0904: case Op.CHAR:
0905: if (isSet(opts, IGNORE_CASE)) {
0906: int ch = op.getData();
0907: if (dx > 0) {
0908: if (offset >= con.limit
0909: || !matchIgnoreCase(ch, target[offset]))
0910: return -1;
0911: offset++;
0912: } else {
0913: int o1 = offset - 1;
0914: if (o1 >= con.limit || o1 < 0
0915: || !matchIgnoreCase(ch, target[o1]))
0916: return -1;
0917: offset = o1;
0918: }
0919: } else {
0920: int ch = op.getData();
0921: if (dx > 0) {
0922: if (offset >= con.limit || ch != target[offset])
0923: return -1;
0924: offset++;
0925: } else {
0926: int o1 = offset - 1;
0927: if (o1 >= con.limit || o1 < 0
0928: || ch != target[o1])
0929: return -1;
0930: offset = o1;
0931: }
0932: }
0933: op = op.next;
0934: break;
0935:
0936: case Op.DOT:
0937: if (dx > 0) {
0938: if (offset >= con.limit)
0939: return -1;
0940: int ch = target[offset];
0941: if (isSet(opts, SINGLE_LINE)) {
0942: if (REUtil.isHighSurrogate(ch)
0943: && offset + 1 < con.limit)
0944: offset++;
0945: } else {
0946: if (REUtil.isHighSurrogate(ch)
0947: && offset + 1 < con.limit)
0948: ch = REUtil.composeFromSurrogates(ch,
0949: target[++offset]);
0950: if (isEOLChar(ch))
0951: return -1;
0952: }
0953: offset++;
0954: } else {
0955: int o1 = offset - 1;
0956: if (o1 >= con.limit || o1 < 0)
0957: return -1;
0958: int ch = target[o1];
0959: if (isSet(opts, SINGLE_LINE)) {
0960: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
0961: o1--;
0962: } else {
0963: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
0964: ch = REUtil.composeFromSurrogates(
0965: target[--o1], ch);
0966: if (!isEOLChar(ch))
0967: return -1;
0968: }
0969: offset = o1;
0970: }
0971: op = op.next;
0972: break;
0973:
0974: case Op.RANGE:
0975: case Op.NRANGE:
0976: if (dx > 0) {
0977: if (offset >= con.limit)
0978: return -1;
0979: int ch = target[offset];
0980: if (REUtil.isHighSurrogate(ch)
0981: && offset + 1 < con.limit)
0982: ch = REUtil.composeFromSurrogates(ch,
0983: target[++offset]);
0984: RangeToken tok = op.getToken();
0985: if (isSet(opts, IGNORE_CASE)) {
0986: tok = tok.getCaseInsensitiveToken();
0987: if (!tok.match(ch)) {
0988: if (ch >= 0x10000)
0989: return -1;
0990: char uch;
0991: if (!tok.match(uch = Character
0992: .toUpperCase((char) ch))
0993: && !tok.match(Character
0994: .toLowerCase(uch)))
0995: return -1;
0996: }
0997: } else {
0998: if (!tok.match(ch))
0999: return -1;
1000: }
1001: offset++;
1002: } else {
1003: int o1 = offset - 1;
1004: if (o1 >= con.limit || o1 < 0)
1005: return -1;
1006: int ch = target[o1];
1007: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1008: ch = REUtil.composeFromSurrogates(target[--o1],
1009: ch);
1010: RangeToken tok = op.getToken();
1011: if (isSet(opts, IGNORE_CASE)) {
1012: tok = tok.getCaseInsensitiveToken();
1013: if (!tok.match(ch)) {
1014: if (ch >= 0x10000)
1015: return -1;
1016: char uch;
1017: if (!tok.match(uch = Character
1018: .toUpperCase((char) ch))
1019: && !tok.match(Character
1020: .toLowerCase(uch)))
1021: return -1;
1022: }
1023: } else {
1024: if (!tok.match(ch))
1025: return -1;
1026: }
1027: offset = o1;
1028: }
1029: op = op.next;
1030: break;
1031:
1032: case Op.ANCHOR:
1033: boolean go = false;
1034: switch (op.getData()) {
1035: case '^':
1036: if (isSet(opts, MULTIPLE_LINES)) {
1037: if (!(offset == con.start || offset > con.start
1038: && isEOLChar(target[offset - 1])))
1039: return -1;
1040: } else {
1041: if (offset != con.start)
1042: return -1;
1043: }
1044: break;
1045:
1046: case '@': // Internal use only.
1047: // The @ always matches line beginnings.
1048: if (!(offset == con.start || offset > con.start
1049: && isEOLChar(target[offset - 1])))
1050: return -1;
1051: break;
1052:
1053: case '$':
1054: if (isSet(opts, MULTIPLE_LINES)) {
1055: if (!(offset == con.limit || offset < con.limit
1056: && isEOLChar(target[offset])))
1057: return -1;
1058: } else {
1059: if (!(offset == con.limit
1060: || offset + 1 == con.limit
1061: && isEOLChar(target[offset]) || offset + 2 == con.limit
1062: && target[offset] == CARRIAGE_RETURN
1063: && target[offset + 1] == LINE_FEED))
1064: return -1;
1065: }
1066: break;
1067:
1068: case 'A':
1069: if (offset != con.start)
1070: return -1;
1071: break;
1072:
1073: case 'Z':
1074: if (!(offset == con.limit
1075: || offset + 1 == con.limit
1076: && isEOLChar(target[offset]) || offset + 2 == con.limit
1077: && target[offset] == CARRIAGE_RETURN
1078: && target[offset + 1] == LINE_FEED))
1079: return -1;
1080: break;
1081:
1082: case 'z':
1083: if (offset != con.limit)
1084: return -1;
1085: break;
1086:
1087: case 'b':
1088: if (con.length == 0)
1089: return -1;
1090: {
1091: int after = getWordType(target, con.start,
1092: con.limit, offset, opts);
1093: if (after == WT_IGNORE)
1094: return -1;
1095: int before = getPreviousWordType(target,
1096: con.start, con.limit, offset, opts);
1097: if (after == before)
1098: return -1;
1099: }
1100: break;
1101:
1102: case 'B':
1103: if (con.length == 0)
1104: go = true;
1105: else {
1106: int after = getWordType(target, con.start,
1107: con.limit, offset, opts);
1108: go = after == WT_IGNORE
1109: || after == getPreviousWordType(target,
1110: con.start, con.limit, offset,
1111: opts);
1112: }
1113: if (!go)
1114: return -1;
1115: break;
1116:
1117: case '<':
1118: if (con.length == 0 || offset == con.limit)
1119: return -1;
1120: if (getWordType(target, con.start, con.limit,
1121: offset, opts) != WT_LETTER
1122: || getPreviousWordType(target, con.start,
1123: con.limit, offset, opts) != WT_OTHER)
1124: return -1;
1125: break;
1126:
1127: case '>':
1128: if (con.length == 0 || offset == con.start)
1129: return -1;
1130: if (getWordType(target, con.start, con.limit,
1131: offset, opts) != WT_OTHER
1132: || getPreviousWordType(target, con.start,
1133: con.limit, offset, opts) != WT_LETTER)
1134: return -1;
1135: break;
1136: } // switch anchor type
1137: op = op.next;
1138: break;
1139:
1140: case Op.BACKREFERENCE: {
1141: int refno = op.getData();
1142: if (refno <= 0 || refno >= this .nofparen)
1143: throw new RuntimeException(
1144: "Internal Error: Reference number must be more than zero: "
1145: + refno);
1146: if (con.match.getBeginning(refno) < 0
1147: || con.match.getEnd(refno) < 0)
1148: return -1; // ********
1149: int o2 = con.match.getBeginning(refno);
1150: int literallen = con.match.getEnd(refno) - o2;
1151: if (!isSet(opts, IGNORE_CASE)) {
1152: if (dx > 0) {
1153: if (!regionMatches(target, offset, con.limit,
1154: o2, literallen))
1155: return -1;
1156: offset += literallen;
1157: } else {
1158: if (!regionMatches(target, offset - literallen,
1159: con.limit, o2, literallen))
1160: return -1;
1161: offset -= literallen;
1162: }
1163: } else {
1164: if (dx > 0) {
1165: if (!regionMatchesIgnoreCase(target, offset,
1166: con.limit, o2, literallen))
1167: return -1;
1168: offset += literallen;
1169: } else {
1170: if (!regionMatchesIgnoreCase(target, offset
1171: - literallen, con.limit, o2, literallen))
1172: return -1;
1173: offset -= literallen;
1174: }
1175: }
1176: }
1177: op = op.next;
1178: break;
1179: case Op.STRING: {
1180: String literal = op.getString();
1181: int literallen = literal.length();
1182: if (!isSet(opts, IGNORE_CASE)) {
1183: if (dx > 0) {
1184: if (!regionMatches(target, offset, con.limit,
1185: literal, literallen))
1186: return -1;
1187: offset += literallen;
1188: } else {
1189: if (!regionMatches(target, offset - literallen,
1190: con.limit, literal, literallen))
1191: return -1;
1192: offset -= literallen;
1193: }
1194: } else {
1195: if (dx > 0) {
1196: if (!regionMatchesIgnoreCase(target, offset,
1197: con.limit, literal, literallen))
1198: return -1;
1199: offset += literallen;
1200: } else {
1201: if (!regionMatchesIgnoreCase(target, offset
1202: - literallen, con.limit, literal,
1203: literallen))
1204: return -1;
1205: offset -= literallen;
1206: }
1207: }
1208: }
1209: op = op.next;
1210: break;
1211:
1212: case Op.CLOSURE: {
1213: /*
1214: * Saves current position to avoid
1215: * zero-width repeats.
1216: */
1217: int id = op.getData();
1218: if (id >= 0) {
1219: int previousOffset = con.offsets[id];
1220: if (previousOffset < 0 || previousOffset != offset) {
1221: con.offsets[id] = offset;
1222: } else {
1223: con.offsets[id] = -1;
1224: op = op.next;
1225: break;
1226: }
1227: }
1228:
1229: int ret = this .matchCharArray(con, op.getChild(),
1230: offset, dx, opts);
1231: if (id >= 0)
1232: con.offsets[id] = -1;
1233: if (ret >= 0)
1234: return ret;
1235: op = op.next;
1236: }
1237: break;
1238:
1239: case Op.QUESTION: {
1240: int ret = this .matchCharArray(con, op.getChild(),
1241: offset, dx, opts);
1242: if (ret >= 0)
1243: return ret;
1244: op = op.next;
1245: }
1246: break;
1247:
1248: case Op.NONGREEDYCLOSURE:
1249: case Op.NONGREEDYQUESTION: {
1250: int ret = this .matchCharArray(con, op.next, offset, dx,
1251: opts);
1252: if (ret >= 0)
1253: return ret;
1254: op = op.getChild();
1255: }
1256: break;
1257:
1258: case Op.UNION:
1259: for (int i = 0; i < op.size(); i++) {
1260: int ret = this .matchCharArray(con, op.elementAt(i),
1261: offset, dx, opts);
1262: if (DEBUG) {
1263: System.err.println("UNION: " + i + ", ret="
1264: + ret);
1265: }
1266: if (ret >= 0)
1267: return ret;
1268: }
1269: return -1;
1270:
1271: case Op.CAPTURE:
1272: int refno = op.getData();
1273: if (con.match != null && refno > 0) {
1274: int save = con.match.getBeginning(refno);
1275: con.match.setBeginning(refno, offset);
1276: int ret = this .matchCharArray(con, op.next, offset,
1277: dx, opts);
1278: if (ret < 0)
1279: con.match.setBeginning(refno, save);
1280: return ret;
1281: } else if (con.match != null && refno < 0) {
1282: int index = -refno;
1283: int save = con.match.getEnd(index);
1284: con.match.setEnd(index, offset);
1285: int ret = this .matchCharArray(con, op.next, offset,
1286: dx, opts);
1287: if (ret < 0)
1288: con.match.setEnd(index, save);
1289: return ret;
1290: }
1291: op = op.next;
1292: break;
1293:
1294: case Op.LOOKAHEAD:
1295: if (0 > this .matchCharArray(con, op.getChild(), offset,
1296: 1, opts))
1297: return -1;
1298: op = op.next;
1299: break;
1300: case Op.NEGATIVELOOKAHEAD:
1301: if (0 <= this .matchCharArray(con, op.getChild(),
1302: offset, 1, opts))
1303: return -1;
1304: op = op.next;
1305: break;
1306: case Op.LOOKBEHIND:
1307: if (0 > this .matchCharArray(con, op.getChild(), offset,
1308: -1, opts))
1309: return -1;
1310: op = op.next;
1311: break;
1312: case Op.NEGATIVELOOKBEHIND:
1313: if (0 <= this .matchCharArray(con, op.getChild(),
1314: offset, -1, opts))
1315: return -1;
1316: op = op.next;
1317: break;
1318:
1319: case Op.INDEPENDENT: {
1320: int ret = this .matchCharArray(con, op.getChild(),
1321: offset, dx, opts);
1322: if (ret < 0)
1323: return ret;
1324: offset = ret;
1325: op = op.next;
1326: }
1327: break;
1328:
1329: case Op.MODIFIER: {
1330: int localopts = opts;
1331: localopts |= op.getData();
1332: localopts &= ~op.getData2();
1333: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1334: int ret = this .matchCharArray(con, op.getChild(),
1335: offset, dx, localopts);
1336: if (ret < 0)
1337: return ret;
1338: offset = ret;
1339: op = op.next;
1340: }
1341: break;
1342:
1343: case Op.CONDITION: {
1344: Op.ConditionOp cop = (Op.ConditionOp) op;
1345: boolean matchp = false;
1346: if (cop.refNumber > 0) {
1347: if (cop.refNumber >= this .nofparen)
1348: throw new RuntimeException(
1349: "Internal Error: Reference number must be more than zero: "
1350: + cop.refNumber);
1351: matchp = con.match.getBeginning(cop.refNumber) >= 0
1352: && con.match.getEnd(cop.refNumber) >= 0;
1353: } else {
1354: matchp = 0 <= this .matchCharArray(con,
1355: cop.condition, offset, dx, opts);
1356: }
1357:
1358: if (matchp) {
1359: op = cop.yes;
1360: } else if (cop.no != null) {
1361: op = cop.no;
1362: } else {
1363: op = cop.next;
1364: }
1365: }
1366: break;
1367:
1368: default:
1369: throw new RuntimeException("Unknown operation type: "
1370: + op.type);
1371: } // switch (op.type)
1372: } // while
1373: }
1374:
1375: private static final int getPreviousWordType(char[] target,
1376: int begin, int end, int offset, int opts) {
1377: int ret = getWordType(target, begin, end, --offset, opts);
1378: while (ret == WT_IGNORE)
1379: ret = getWordType(target, begin, end, --offset, opts);
1380: return ret;
1381: }
1382:
1383: private static final int getWordType(char[] target, int begin,
1384: int end, int offset, int opts) {
1385: if (offset < begin || offset >= end)
1386: return WT_OTHER;
1387: return getWordType0(target[offset], opts);
1388: }
1389:
1390: private static final boolean regionMatches(char[] target,
1391: int offset, int limit, String part, int partlen) {
1392: if (offset < 0)
1393: return false;
1394: if (limit - offset < partlen)
1395: return false;
1396: int i = 0;
1397: while (partlen-- > 0) {
1398: if (target[offset++] != part.charAt(i++))
1399: return false;
1400: }
1401: return true;
1402: }
1403:
1404: private static final boolean regionMatches(char[] target,
1405: int offset, int limit, int offset2, int partlen) {
1406: if (offset < 0)
1407: return false;
1408: if (limit - offset < partlen)
1409: return false;
1410: int i = offset2;
1411: while (partlen-- > 0) {
1412: if (target[offset++] != target[i++])
1413: return false;
1414: }
1415: return true;
1416: }
1417:
1418: /**
1419: * @see java.lang.String#regionMatches
1420: */
1421: private static final boolean regionMatchesIgnoreCase(char[] target,
1422: int offset, int limit, String part, int partlen) {
1423: if (offset < 0)
1424: return false;
1425: if (limit - offset < partlen)
1426: return false;
1427: int i = 0;
1428: while (partlen-- > 0) {
1429: char ch1 = target[offset++];
1430: char ch2 = part.charAt(i++);
1431: if (ch1 == ch2)
1432: continue;
1433: char uch1 = Character.toUpperCase(ch1);
1434: char uch2 = Character.toUpperCase(ch2);
1435: if (uch1 == uch2)
1436: continue;
1437: if (Character.toLowerCase(uch1) != Character
1438: .toLowerCase(uch2))
1439: return false;
1440: }
1441: return true;
1442: }
1443:
1444: private static final boolean regionMatchesIgnoreCase(char[] target,
1445: int offset, int limit, int offset2, int partlen) {
1446: if (offset < 0)
1447: return false;
1448: if (limit - offset < partlen)
1449: return false;
1450: int i = offset2;
1451: while (partlen-- > 0) {
1452: char ch1 = target[offset++];
1453: char ch2 = target[i++];
1454: if (ch1 == ch2)
1455: continue;
1456: char uch1 = Character.toUpperCase(ch1);
1457: char uch2 = Character.toUpperCase(ch2);
1458: if (uch1 == uch2)
1459: continue;
1460: if (Character.toLowerCase(uch1) != Character
1461: .toLowerCase(uch2))
1462: return false;
1463: }
1464: return true;
1465: }
1466:
1467: /**
1468: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1469: *
1470: * @return true if the target is matched to this regular expression.
1471: */
1472: public boolean matches(String target) {
1473: return this .matches(target, 0, target.length(), (Match) null);
1474: }
1475:
1476: /**
1477: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1478: * in specified range or not.
1479: *
1480: * @param start Start offset of the range.
1481: * @param end End offset +1 of the range.
1482: * @return true if the target is matched to this regular expression.
1483: */
1484: public boolean matches(String target, int start, int end) {
1485: return this .matches(target, start, end, (Match) null);
1486: }
1487:
1488: /**
1489: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1490: *
1491: * @param match A Match instance for storing matching result.
1492: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1493: */
1494: public boolean matches(String target, Match match) {
1495: return this .matches(target, 0, target.length(), match);
1496: }
1497:
1498: /**
1499: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1500: * in specified range or not.
1501: *
1502: * @param start Start offset of the range.
1503: * @param end End offset +1 of the range.
1504: * @param match A Match instance for storing matching result.
1505: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1506: */
1507: public boolean matches(String target, int start, int end,
1508: Match match) {
1509:
1510: synchronized (this ) {
1511: if (this .operations == null)
1512: this .prepare();
1513: if (this .context == null)
1514: this .context = new Context();
1515: }
1516: Context con = null;
1517: synchronized (this .context) {
1518: con = this .context.inuse ? new Context() : this .context;
1519: con.reset(target, start, end, this .numberOfClosures);
1520: }
1521: if (match != null) {
1522: match.setNumberOfGroups(this .nofparen);
1523: match.setSource(target);
1524: } else if (this .hasBackReferences) {
1525: match = new Match();
1526: match.setNumberOfGroups(this .nofparen);
1527: // Need not to call setSource() because
1528: // a caller can not access this match instance.
1529: }
1530: con.match = match;
1531:
1532: if (RegularExpression.isSet(this .options, XMLSCHEMA_MODE)) {
1533: if (DEBUG) {
1534: System.err.println("target string=" + target);
1535: }
1536: int matchEnd = this .matchString(con, this .operations,
1537: con.start, 1, this .options);
1538: if (DEBUG) {
1539: System.err.println("matchEnd=" + matchEnd);
1540: System.err.println("con.limit=" + con.limit);
1541: }
1542: if (matchEnd == con.limit) {
1543: if (con.match != null) {
1544: con.match.setBeginning(0, con.start);
1545: con.match.setEnd(0, matchEnd);
1546: }
1547: con.inuse = false;
1548: return true;
1549: }
1550: return false;
1551: }
1552:
1553: /*
1554: * The pattern has only fixed string.
1555: * The engine uses Boyer-Moore.
1556: */
1557: if (this .fixedStringOnly) {
1558: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1559: int o = this .fixedStringTable.matches(target, con.start,
1560: con.limit);
1561: if (o >= 0) {
1562: if (con.match != null) {
1563: con.match.setBeginning(0, o);
1564: con.match.setEnd(0, o + this .fixedString.length());
1565: }
1566: con.inuse = false;
1567: return true;
1568: }
1569: con.inuse = false;
1570: return false;
1571: }
1572:
1573: /*
1574: * The pattern contains a fixed string.
1575: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1576: * If not, it return with false.
1577: */
1578: if (this .fixedString != null) {
1579: int o = this .fixedStringTable.matches(target, con.start,
1580: con.limit);
1581: if (o < 0) {
1582: //System.err.println("Non-match in fixed-string search.");
1583: con.inuse = false;
1584: return false;
1585: }
1586: }
1587:
1588: int limit = con.limit - this .minlength;
1589: int matchStart;
1590: int matchEnd = -1;
1591:
1592: /*
1593: * Checks whether the expression starts with ".*".
1594: */
1595: if (this .operations != null
1596: && this .operations.type == Op.CLOSURE
1597: && this .operations.getChild().type == Op.DOT) {
1598: if (isSet(this .options, SINGLE_LINE)) {
1599: matchStart = con.start;
1600: matchEnd = this .matchString(con, this .operations,
1601: con.start, 1, this .options);
1602: } else {
1603: boolean previousIsEOL = true;
1604: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1605: int ch = target.charAt(matchStart);
1606: if (isEOLChar(ch)) {
1607: previousIsEOL = true;
1608: } else {
1609: if (previousIsEOL) {
1610: if (0 <= (matchEnd = this .matchString(con,
1611: this .operations, matchStart, 1,
1612: this .options)))
1613: break;
1614: }
1615: previousIsEOL = false;
1616: }
1617: }
1618: }
1619: }
1620:
1621: /*
1622: * Optimization against the first character.
1623: */
1624: else if (this .firstChar != null) {
1625: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1626: RangeToken range = this .firstChar;
1627: if (RegularExpression.isSet(this .options, IGNORE_CASE)) {
1628: range = this .firstChar.getCaseInsensitiveToken();
1629: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1630: int ch = target.charAt(matchStart);
1631: if (REUtil.isHighSurrogate(ch)
1632: && matchStart + 1 < con.limit) {
1633: ch = REUtil.composeFromSurrogates(ch, target
1634: .charAt(matchStart + 1));
1635: if (!range.match(ch))
1636: continue;
1637: } else {
1638: if (!range.match(ch)) {
1639: char ch1 = Character.toUpperCase((char) ch);
1640: if (!range.match(ch1))
1641: if (!range.match(Character
1642: .toLowerCase(ch1)))
1643: continue;
1644: }
1645: }
1646: if (0 <= (matchEnd = this .matchString(con,
1647: this .operations, matchStart, 1,
1648: this .options)))
1649: break;
1650: }
1651: } else {
1652: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1653: int ch = target.charAt(matchStart);
1654: if (REUtil.isHighSurrogate(ch)
1655: && matchStart + 1 < con.limit)
1656: ch = REUtil.composeFromSurrogates(ch, target
1657: .charAt(matchStart + 1));
1658: if (!range.match(ch))
1659: continue;
1660: if (0 <= (matchEnd = this .matchString(con,
1661: this .operations, matchStart, 1,
1662: this .options)))
1663: break;
1664: }
1665: }
1666: }
1667:
1668: /*
1669: * Straightforward matching.
1670: */
1671: else {
1672: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1673: if (0 <= (matchEnd = this .matchString(con,
1674: this .operations, matchStart, 1, this .options)))
1675: break;
1676: }
1677: }
1678:
1679: if (matchEnd >= 0) {
1680: if (con.match != null) {
1681: con.match.setBeginning(0, matchStart);
1682: con.match.setEnd(0, matchEnd);
1683: }
1684: con.inuse = false;
1685: return true;
1686: } else {
1687: con.inuse = false;
1688: return false;
1689: }
1690: }
1691:
1692: /**
1693: * @return -1 when not match; offset of the end of matched string when match.
1694: */
1695: private int matchString(Context con, Op op, int offset, int dx,
1696: int opts) {
1697:
1698: String target = con.strTarget;
1699:
1700: while (true) {
1701: if (op == null)
1702: return isSet(opts, XMLSCHEMA_MODE)
1703: && offset != con.limit ? -1 : offset;
1704: if (offset > con.limit || offset < con.start)
1705: return -1;
1706: switch (op.type) {
1707: case Op.CHAR:
1708: if (isSet(opts, IGNORE_CASE)) {
1709: int ch = op.getData();
1710: if (dx > 0) {
1711: if (offset >= con.limit
1712: || !matchIgnoreCase(ch, target
1713: .charAt(offset)))
1714: return -1;
1715: offset++;
1716: } else {
1717: int o1 = offset - 1;
1718: if (o1 >= con.limit
1719: || o1 < 0
1720: || !matchIgnoreCase(ch, target
1721: .charAt(o1)))
1722: return -1;
1723: offset = o1;
1724: }
1725: } else {
1726: int ch = op.getData();
1727: if (dx > 0) {
1728: if (offset >= con.limit
1729: || ch != target.charAt(offset))
1730: return -1;
1731: offset++;
1732: } else {
1733: int o1 = offset - 1;
1734: if (o1 >= con.limit || o1 < 0
1735: || ch != target.charAt(o1))
1736: return -1;
1737: offset = o1;
1738: }
1739: }
1740: op = op.next;
1741: break;
1742:
1743: case Op.DOT:
1744: if (dx > 0) {
1745: if (offset >= con.limit)
1746: return -1;
1747: int ch = target.charAt(offset);
1748: if (isSet(opts, SINGLE_LINE)) {
1749: if (REUtil.isHighSurrogate(ch)
1750: && offset + 1 < con.limit)
1751: offset++;
1752: } else {
1753: if (REUtil.isHighSurrogate(ch)
1754: && offset + 1 < con.limit)
1755: ch = REUtil.composeFromSurrogates(ch,
1756: target.charAt(++offset));
1757: if (isEOLChar(ch))
1758: return -1;
1759: }
1760: offset++;
1761: } else {
1762: int o1 = offset - 1;
1763: if (o1 >= con.limit || o1 < 0)
1764: return -1;
1765: int ch = target.charAt(o1);
1766: if (isSet(opts, SINGLE_LINE)) {
1767: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1768: o1--;
1769: } else {
1770: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1771: ch = REUtil.composeFromSurrogates(target
1772: .charAt(--o1), ch);
1773: if (!isEOLChar(ch))
1774: return -1;
1775: }
1776: offset = o1;
1777: }
1778: op = op.next;
1779: break;
1780:
1781: case Op.RANGE:
1782: case Op.NRANGE:
1783: if (dx > 0) {
1784: if (offset >= con.limit)
1785: return -1;
1786: int ch = target.charAt(offset);
1787: if (REUtil.isHighSurrogate(ch)
1788: && offset + 1 < con.limit)
1789: ch = REUtil.composeFromSurrogates(ch, target
1790: .charAt(++offset));
1791: RangeToken tok = op.getToken();
1792: if (isSet(opts, IGNORE_CASE)) {
1793: tok = tok.getCaseInsensitiveToken();
1794: if (!tok.match(ch)) {
1795: if (ch >= 0x10000)
1796: return -1;
1797: char uch;
1798: if (!tok.match(uch = Character
1799: .toUpperCase((char) ch))
1800: && !tok.match(Character
1801: .toLowerCase(uch)))
1802: return -1;
1803: }
1804: } else {
1805: if (!tok.match(ch))
1806: return -1;
1807: }
1808: offset++;
1809: } else {
1810: int o1 = offset - 1;
1811: if (o1 >= con.limit || o1 < 0)
1812: return -1;
1813: int ch = target.charAt(o1);
1814: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1815: ch = REUtil.composeFromSurrogates(target
1816: .charAt(--o1), ch);
1817: RangeToken tok = op.getToken();
1818: if (isSet(opts, IGNORE_CASE)) {
1819: tok = tok.getCaseInsensitiveToken();
1820: if (!tok.match(ch)) {
1821: if (ch >= 0x10000)
1822: return -1;
1823: char uch;
1824: if (!tok.match(uch = Character
1825: .toUpperCase((char) ch))
1826: && !tok.match(Character
1827: .toLowerCase(uch)))
1828: return -1;
1829: }
1830: } else {
1831: if (!tok.match(ch))
1832: return -1;
1833: }
1834: offset = o1;
1835: }
1836: op = op.next;
1837: break;
1838:
1839: case Op.ANCHOR:
1840: boolean go = false;
1841: switch (op.getData()) {
1842: case '^':
1843: if (isSet(opts, MULTIPLE_LINES)) {
1844: if (!(offset == con.start || offset > con.start
1845: && isEOLChar(target.charAt(offset - 1))))
1846: return -1;
1847: } else {
1848: if (offset != con.start)
1849: return -1;
1850: }
1851: break;
1852:
1853: case '@': // Internal use only.
1854: // The @ always matches line beginnings.
1855: if (!(offset == con.start || offset > con.start
1856: && isEOLChar(target.charAt(offset - 1))))
1857: return -1;
1858: break;
1859:
1860: case '$':
1861: if (isSet(opts, MULTIPLE_LINES)) {
1862: if (!(offset == con.limit || offset < con.limit
1863: && isEOLChar(target.charAt(offset))))
1864: return -1;
1865: } else {
1866: if (!(offset == con.limit
1867: || offset + 1 == con.limit
1868: && isEOLChar(target.charAt(offset)) || offset + 2 == con.limit
1869: && target.charAt(offset) == CARRIAGE_RETURN
1870: && target.charAt(offset + 1) == LINE_FEED))
1871: return -1;
1872: }
1873: break;
1874:
1875: case 'A':
1876: if (offset != con.start)
1877: return -1;
1878: break;
1879:
1880: case 'Z':
1881: if (!(offset == con.limit
1882: || offset + 1 == con.limit
1883: && isEOLChar(target.charAt(offset)) || offset + 2 == con.limit
1884: && target.charAt(offset) == CARRIAGE_RETURN
1885: && target.charAt(offset + 1) == LINE_FEED))
1886: return -1;
1887: break;
1888:
1889: case 'z':
1890: if (offset != con.limit)
1891: return -1;
1892: break;
1893:
1894: case 'b':
1895: if (con.length == 0)
1896: return -1;
1897: {
1898: int after = getWordType(target, con.start,
1899: con.limit, offset, opts);
1900: if (after == WT_IGNORE)
1901: return -1;
1902: int before = getPreviousWordType(target,
1903: con.start, con.limit, offset, opts);
1904: if (after == before)
1905: return -1;
1906: }
1907: break;
1908:
1909: case 'B':
1910: if (con.length == 0)
1911: go = true;
1912: else {
1913: int after = getWordType(target, con.start,
1914: con.limit, offset, opts);
1915: go = after == WT_IGNORE
1916: || after == getPreviousWordType(target,
1917: con.start, con.limit, offset,
1918: opts);
1919: }
1920: if (!go)
1921: return -1;
1922: break;
1923:
1924: case '<':
1925: if (con.length == 0 || offset == con.limit)
1926: return -1;
1927: if (getWordType(target, con.start, con.limit,
1928: offset, opts) != WT_LETTER
1929: || getPreviousWordType(target, con.start,
1930: con.limit, offset, opts) != WT_OTHER)
1931: return -1;
1932: break;
1933:
1934: case '>':
1935: if (con.length == 0 || offset == con.start)
1936: return -1;
1937: if (getWordType(target, con.start, con.limit,
1938: offset, opts) != WT_OTHER
1939: || getPreviousWordType(target, con.start,
1940: con.limit, offset, opts) != WT_LETTER)
1941: return -1;
1942: break;
1943: } // switch anchor type
1944: op = op.next;
1945: break;
1946:
1947: case Op.BACKREFERENCE: {
1948: int refno = op.getData();
1949: if (refno <= 0 || refno >= this .nofparen)
1950: throw new RuntimeException(
1951: "Internal Error: Reference number must be more than zero: "
1952: + refno);
1953: if (con.match.getBeginning(refno) < 0
1954: || con.match.getEnd(refno) < 0)
1955: return -1; // ********
1956: int o2 = con.match.getBeginning(refno);
1957: int literallen = con.match.getEnd(refno) - o2;
1958: if (!isSet(opts, IGNORE_CASE)) {
1959: if (dx > 0) {
1960: if (!regionMatches(target, offset, con.limit,
1961: o2, literallen))
1962: return -1;
1963: offset += literallen;
1964: } else {
1965: if (!regionMatches(target, offset - literallen,
1966: con.limit, o2, literallen))
1967: return -1;
1968: offset -= literallen;
1969: }
1970: } else {
1971: if (dx > 0) {
1972: if (!regionMatchesIgnoreCase(target, offset,
1973: con.limit, o2, literallen))
1974: return -1;
1975: offset += literallen;
1976: } else {
1977: if (!regionMatchesIgnoreCase(target, offset
1978: - literallen, con.limit, o2, literallen))
1979: return -1;
1980: offset -= literallen;
1981: }
1982: }
1983: }
1984: op = op.next;
1985: break;
1986: case Op.STRING: {
1987: String literal = op.getString();
1988: int literallen = literal.length();
1989: if (!isSet(opts, IGNORE_CASE)) {
1990: if (dx > 0) {
1991: if (!regionMatches(target, offset, con.limit,
1992: literal, literallen))
1993: return -1;
1994: offset += literallen;
1995: } else {
1996: if (!regionMatches(target, offset - literallen,
1997: con.limit, literal, literallen))
1998: return -1;
1999: offset -= literallen;
2000: }
2001: } else {
2002: if (dx > 0) {
2003: if (!regionMatchesIgnoreCase(target, offset,
2004: con.limit, literal, literallen))
2005: return -1;
2006: offset += literallen;
2007: } else {
2008: if (!regionMatchesIgnoreCase(target, offset
2009: - literallen, con.limit, literal,
2010: literallen))
2011: return -1;
2012: offset -= literallen;
2013: }
2014: }
2015: }
2016: op = op.next;
2017: break;
2018:
2019: case Op.CLOSURE: {
2020: /*
2021: * Saves current position to avoid
2022: * zero-width repeats.
2023: */
2024: int id = op.getData();
2025: if (id >= 0) {
2026: int previousOffset = con.offsets[id];
2027: if (previousOffset < 0 || previousOffset != offset) {
2028: con.offsets[id] = offset;
2029: } else {
2030: con.offsets[id] = -1;
2031: op = op.next;
2032: break;
2033: }
2034: }
2035: int ret = this .matchString(con, op.getChild(), offset,
2036: dx, opts);
2037: if (id >= 0)
2038: con.offsets[id] = -1;
2039: if (ret >= 0)
2040: return ret;
2041: op = op.next;
2042: }
2043: break;
2044:
2045: case Op.QUESTION: {
2046: int ret = this .matchString(con, op.getChild(), offset,
2047: dx, opts);
2048: if (ret >= 0)
2049: return ret;
2050: op = op.next;
2051: }
2052: break;
2053:
2054: case Op.NONGREEDYCLOSURE:
2055: case Op.NONGREEDYQUESTION: {
2056: int ret = this .matchString(con, op.next, offset, dx,
2057: opts);
2058: if (ret >= 0)
2059: return ret;
2060: op = op.getChild();
2061: }
2062: break;
2063:
2064: case Op.UNION:
2065: for (int i = 0; i < op.size(); i++) {
2066: int ret = this .matchString(con, op.elementAt(i),
2067: offset, dx, opts);
2068: if (DEBUG) {
2069: System.err.println("UNION: " + i + ", ret="
2070: + ret);
2071: }
2072: if (ret >= 0)
2073: return ret;
2074: }
2075: return -1;
2076:
2077: case Op.CAPTURE:
2078: int refno = op.getData();
2079: if (con.match != null && refno > 0) {
2080: int save = con.match.getBeginning(refno);
2081: con.match.setBeginning(refno, offset);
2082: int ret = this .matchString(con, op.next, offset,
2083: dx, opts);
2084: if (ret < 0)
2085: con.match.setBeginning(refno, save);
2086: return ret;
2087: } else if (con.match != null && refno < 0) {
2088: int index = -refno;
2089: int save = con.match.getEnd(index);
2090: con.match.setEnd(index, offset);
2091: int ret = this .matchString(con, op.next, offset,
2092: dx, opts);
2093: if (ret < 0)
2094: con.match.setEnd(index, save);
2095: return ret;
2096: }
2097: op = op.next;
2098: break;
2099:
2100: case Op.LOOKAHEAD:
2101: if (0 > this .matchString(con, op.getChild(), offset, 1,
2102: opts))
2103: return -1;
2104: op = op.next;
2105: break;
2106: case Op.NEGATIVELOOKAHEAD:
2107: if (0 <= this .matchString(con, op.getChild(), offset,
2108: 1, opts))
2109: return -1;
2110: op = op.next;
2111: break;
2112: case Op.LOOKBEHIND:
2113: if (0 > this .matchString(con, op.getChild(), offset,
2114: -1, opts))
2115: return -1;
2116: op = op.next;
2117: break;
2118: case Op.NEGATIVELOOKBEHIND:
2119: if (0 <= this .matchString(con, op.getChild(), offset,
2120: -1, opts))
2121: return -1;
2122: op = op.next;
2123: break;
2124:
2125: case Op.INDEPENDENT: {
2126: int ret = this .matchString(con, op.getChild(), offset,
2127: dx, opts);
2128: if (ret < 0)
2129: return ret;
2130: offset = ret;
2131: op = op.next;
2132: }
2133: break;
2134:
2135: case Op.MODIFIER: {
2136: int localopts = opts;
2137: localopts |= op.getData();
2138: localopts &= ~op.getData2();
2139: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2140: int ret = this .matchString(con, op.getChild(), offset,
2141: dx, localopts);
2142: if (ret < 0)
2143: return ret;
2144: offset = ret;
2145: op = op.next;
2146: }
2147: break;
2148:
2149: case Op.CONDITION: {
2150: Op.ConditionOp cop = (Op.ConditionOp) op;
2151: boolean matchp = false;
2152: if (cop.refNumber > 0) {
2153: if (cop.refNumber >= this .nofparen)
2154: throw new RuntimeException(
2155: "Internal Error: Reference number must be more than zero: "
2156: + cop.refNumber);
2157: matchp = con.match.getBeginning(cop.refNumber) >= 0
2158: && con.match.getEnd(cop.refNumber) >= 0;
2159: } else {
2160: matchp = 0 <= this .matchString(con, cop.condition,
2161: offset, dx, opts);
2162: }
2163:
2164: if (matchp) {
2165: op = cop.yes;
2166: } else if (cop.no != null) {
2167: op = cop.no;
2168: } else {
2169: op = cop.next;
2170: }
2171: }
2172: break;
2173:
2174: default:
2175: throw new RuntimeException("Unknown operation type: "
2176: + op.type);
2177: } // switch (op.type)
2178: } // while
2179: }
2180:
2181: private static final int getPreviousWordType(String target,
2182: int begin, int end, int offset, int opts) {
2183: int ret = getWordType(target, begin, end, --offset, opts);
2184: while (ret == WT_IGNORE)
2185: ret = getWordType(target, begin, end, --offset, opts);
2186: return ret;
2187: }
2188:
2189: private static final int getWordType(String target, int begin,
2190: int end, int offset, int opts) {
2191: if (offset < begin || offset >= end)
2192: return WT_OTHER;
2193: return getWordType0(target.charAt(offset), opts);
2194: }
2195:
2196: private static final boolean regionMatches(String text, int offset,
2197: int limit, String part, int partlen) {
2198: if (limit - offset < partlen)
2199: return false;
2200: return text.regionMatches(offset, part, 0, partlen);
2201: }
2202:
2203: private static final boolean regionMatches(String text, int offset,
2204: int limit, int offset2, int partlen) {
2205: if (limit - offset < partlen)
2206: return false;
2207: return text.regionMatches(offset, text, offset2, partlen);
2208: }
2209:
2210: private static final boolean regionMatchesIgnoreCase(String text,
2211: int offset, int limit, String part, int partlen) {
2212: return text.regionMatches(true, offset, part, 0, partlen);
2213: }
2214:
2215: private static final boolean regionMatchesIgnoreCase(String text,
2216: int offset, int limit, int offset2, int partlen) {
2217: if (limit - offset < partlen)
2218: return false;
2219: return text.regionMatches(true, offset, text, offset2, partlen);
2220: }
2221:
2222: /**
2223: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2224: *
2225: * @return true if the target is matched to this regular expression.
2226: */
2227: public boolean matches(CharacterIterator target) {
2228: return this .matches(target, (Match) null);
2229: }
2230:
2231: /**
2232: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2233: *
2234: * @param match A Match instance for storing matching result.
2235: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2236: */
2237: public boolean matches(CharacterIterator target, Match match) {
2238: int start = target.getBeginIndex();
2239: int end = target.getEndIndex();
2240:
2241: synchronized (this ) {
2242: if (this .operations == null)
2243: this .prepare();
2244: if (this .context == null)
2245: this .context = new Context();
2246: }
2247: Context con = null;
2248: synchronized (this .context) {
2249: con = this .context.inuse ? new Context() : this .context;
2250: con.reset(target, start, end, this .numberOfClosures);
2251: }
2252: if (match != null) {
2253: match.setNumberOfGroups(this .nofparen);
2254: match.setSource(target);
2255: } else if (this .hasBackReferences) {
2256: match = new Match();
2257: match.setNumberOfGroups(this .nofparen);
2258: // Need not to call setSource() because
2259: // a caller can not access this match instance.
2260: }
2261: con.match = match;
2262:
2263: if (RegularExpression.isSet(this .options, XMLSCHEMA_MODE)) {
2264: int matchEnd = this .matchCharacterIterator(con,
2265: this .operations, con.start, 1, this .options);
2266: //System.err.println("DEBUG: matchEnd="+matchEnd);
2267: if (matchEnd == con.limit) {
2268: if (con.match != null) {
2269: con.match.setBeginning(0, con.start);
2270: con.match.setEnd(0, matchEnd);
2271: }
2272: con.inuse = false;
2273: return true;
2274: }
2275: return false;
2276: }
2277:
2278: /*
2279: * The pattern has only fixed string.
2280: * The engine uses Boyer-Moore.
2281: */
2282: if (this .fixedStringOnly) {
2283: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2284: int o = this .fixedStringTable.matches(target, con.start,
2285: con.limit);
2286: if (o >= 0) {
2287: if (con.match != null) {
2288: con.match.setBeginning(0, o);
2289: con.match.setEnd(0, o + this .fixedString.length());
2290: }
2291: con.inuse = false;
2292: return true;
2293: }
2294: con.inuse = false;
2295: return false;
2296: }
2297:
2298: /*
2299: * The pattern contains a fixed string.
2300: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2301: * If not, it return with false.
2302: */
2303: if (this .fixedString != null) {
2304: int o = this .fixedStringTable.matches(target, con.start,
2305: con.limit);
2306: if (o < 0) {
2307: //System.err.println("Non-match in fixed-string search.");
2308: con.inuse = false;
2309: return false;
2310: }
2311: }
2312:
2313: int limit = con.limit - this .minlength;
2314: int matchStart;
2315: int matchEnd = -1;
2316:
2317: /*
2318: * Checks whether the expression starts with ".*".
2319: */
2320: if (this .operations != null
2321: && this .operations.type == Op.CLOSURE
2322: && this .operations.getChild().type == Op.DOT) {
2323: if (isSet(this .options, SINGLE_LINE)) {
2324: matchStart = con.start;
2325: matchEnd = this .matchCharacterIterator(con,
2326: this .operations, con.start, 1, this .options);
2327: } else {
2328: boolean previousIsEOL = true;
2329: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2330: int ch = target.setIndex(matchStart);
2331: if (isEOLChar(ch)) {
2332: previousIsEOL = true;
2333: } else {
2334: if (previousIsEOL) {
2335: if (0 <= (matchEnd = this
2336: .matchCharacterIterator(con,
2337: this .operations,
2338: matchStart, 1, this .options)))
2339: break;
2340: }
2341: previousIsEOL = false;
2342: }
2343: }
2344: }
2345: }
2346:
2347: /*
2348: * Optimization against the first character.
2349: */
2350: else if (this .firstChar != null) {
2351: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2352: RangeToken range = this .firstChar;
2353: if (RegularExpression.isSet(this .options, IGNORE_CASE)) {
2354: range = this .firstChar.getCaseInsensitiveToken();
2355: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2356: int ch = target.setIndex(matchStart);
2357: if (REUtil.isHighSurrogate(ch)
2358: && matchStart + 1 < con.limit) {
2359: ch = REUtil.composeFromSurrogates(ch, target
2360: .setIndex(matchStart + 1));
2361: if (!range.match(ch))
2362: continue;
2363: } else {
2364: if (!range.match(ch)) {
2365: char ch1 = Character.toUpperCase((char) ch);
2366: if (!range.match(ch1))
2367: if (!range.match(Character
2368: .toLowerCase(ch1)))
2369: continue;
2370: }
2371: }
2372: if (0 <= (matchEnd = this .matchCharacterIterator(
2373: con, this .operations, matchStart, 1,
2374: this .options)))
2375: break;
2376: }
2377: } else {
2378: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2379: int ch = target.setIndex(matchStart);
2380: if (REUtil.isHighSurrogate(ch)
2381: && matchStart + 1 < con.limit)
2382: ch = REUtil.composeFromSurrogates(ch, target
2383: .setIndex(matchStart + 1));
2384: if (!range.match(ch))
2385: continue;
2386: if (0 <= (matchEnd = this .matchCharacterIterator(
2387: con, this .operations, matchStart, 1,
2388: this .options)))
2389: break;
2390: }
2391: }
2392: }
2393:
2394: /*
2395: * Straightforward matching.
2396: */
2397: else {
2398: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2399: if (0 <= (matchEnd = this .matchCharacterIterator(con,
2400: this .operations, matchStart, 1, this .options)))
2401: break;
2402: }
2403: }
2404:
2405: if (matchEnd >= 0) {
2406: if (con.match != null) {
2407: con.match.setBeginning(0, matchStart);
2408: con.match.setEnd(0, matchEnd);
2409: }
2410: con.inuse = false;
2411: return true;
2412: } else {
2413: con.inuse = false;
2414: return false;
2415: }
2416: }
2417:
2418: /**
2419: * @return -1 when not match; offset of the end of matched string when match.
2420: */
2421: private int matchCharacterIterator(Context con, Op op, int offset,
2422: int dx, int opts) {
2423:
2424: CharacterIterator target = con.ciTarget;
2425:
2426: while (true) {
2427: if (op == null)
2428: return isSet(opts, XMLSCHEMA_MODE)
2429: && offset != con.limit ? -1 : offset;
2430: if (offset > con.limit || offset < con.start)
2431: return -1;
2432: switch (op.type) {
2433: case Op.CHAR:
2434: if (isSet(opts, IGNORE_CASE)) {
2435: int ch = op.getData();
2436: if (dx > 0) {
2437: if (offset >= con.limit
2438: || !matchIgnoreCase(ch, target
2439: .setIndex(offset)))
2440: return -1;
2441: offset++;
2442: } else {
2443: int o1 = offset - 1;
2444: if (o1 >= con.limit
2445: || o1 < 0
2446: || !matchIgnoreCase(ch, target
2447: .setIndex(o1)))
2448: return -1;
2449: offset = o1;
2450: }
2451: } else {
2452: int ch = op.getData();
2453: if (dx > 0) {
2454: if (offset >= con.limit
2455: || ch != target.setIndex(offset))
2456: return -1;
2457: offset++;
2458: } else {
2459: int o1 = offset - 1;
2460: if (o1 >= con.limit || o1 < 0
2461: || ch != target.setIndex(o1))
2462: return -1;
2463: offset = o1;
2464: }
2465: }
2466: op = op.next;
2467: break;
2468:
2469: case Op.DOT:
2470: if (dx > 0) {
2471: if (offset >= con.limit)
2472: return -1;
2473: int ch = target.setIndex(offset);
2474: if (isSet(opts, SINGLE_LINE)) {
2475: if (REUtil.isHighSurrogate(ch)
2476: && offset + 1 < con.limit)
2477: offset++;
2478: } else {
2479: if (REUtil.isHighSurrogate(ch)
2480: && offset + 1 < con.limit)
2481: ch = REUtil.composeFromSurrogates(ch,
2482: target.setIndex(++offset));
2483: if (isEOLChar(ch))
2484: return -1;
2485: }
2486: offset++;
2487: } else {
2488: int o1 = offset - 1;
2489: if (o1 >= con.limit || o1 < 0)
2490: return -1;
2491: int ch = target.setIndex(o1);
2492: if (isSet(opts, SINGLE_LINE)) {
2493: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2494: o1--;
2495: } else {
2496: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2497: ch = REUtil.composeFromSurrogates(target
2498: .setIndex(--o1), ch);
2499: if (!isEOLChar(ch))
2500: return -1;
2501: }
2502: offset = o1;
2503: }
2504: op = op.next;
2505: break;
2506:
2507: case Op.RANGE:
2508: case Op.NRANGE:
2509: if (dx > 0) {
2510: if (offset >= con.limit)
2511: return -1;
2512: int ch = target.setIndex(offset);
2513: if (REUtil.isHighSurrogate(ch)
2514: && offset + 1 < con.limit)
2515: ch = REUtil.composeFromSurrogates(ch, target
2516: .setIndex(++offset));
2517: RangeToken tok = op.getToken();
2518: if (isSet(opts, IGNORE_CASE)) {
2519: tok = tok.getCaseInsensitiveToken();
2520: if (!tok.match(ch)) {
2521: if (ch >= 0x10000)
2522: return -1;
2523: char uch;
2524: if (!tok.match(uch = Character
2525: .toUpperCase((char) ch))
2526: && !tok.match(Character
2527: .toLowerCase(uch)))
2528: return -1;
2529: }
2530: } else {
2531: if (!tok.match(ch))
2532: return -1;
2533: }
2534: offset++;
2535: } else {
2536: int o1 = offset - 1;
2537: if (o1 >= con.limit || o1 < 0)
2538: return -1;
2539: int ch = target.setIndex(o1);
2540: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2541: ch = REUtil.composeFromSurrogates(target
2542: .setIndex(--o1), ch);
2543: RangeToken tok = op.getToken();
2544: if (isSet(opts, IGNORE_CASE)) {
2545: tok = tok.getCaseInsensitiveToken();
2546: if (!tok.match(ch)) {
2547: if (ch >= 0x10000)
2548: return -1;
2549: char uch;
2550: if (!tok.match(uch = Character
2551: .toUpperCase((char) ch))
2552: && !tok.match(Character
2553: .toLowerCase(uch)))
2554: return -1;
2555: }
2556: } else {
2557: if (!tok.match(ch))
2558: return -1;
2559: }
2560: offset = o1;
2561: }
2562: op = op.next;
2563: break;
2564:
2565: case Op.ANCHOR:
2566: boolean go = false;
2567: switch (op.getData()) {
2568: case '^':
2569: if (isSet(opts, MULTIPLE_LINES)) {
2570: if (!(offset == con.start || offset > con.start
2571: && isEOLChar(target
2572: .setIndex(offset - 1))))
2573: return -1;
2574: } else {
2575: if (offset != con.start)
2576: return -1;
2577: }
2578: break;
2579:
2580: case '@': // Internal use only.
2581: // The @ always matches line beginnings.
2582: if (!(offset == con.start || offset > con.start
2583: && isEOLChar(target.setIndex(offset - 1))))
2584: return -1;
2585: break;
2586:
2587: case '$':
2588: if (isSet(opts, MULTIPLE_LINES)) {
2589: if (!(offset == con.limit || offset < con.limit
2590: && isEOLChar(target.setIndex(offset))))
2591: return -1;
2592: } else {
2593: if (!(offset == con.limit
2594: || offset + 1 == con.limit
2595: && isEOLChar(target.setIndex(offset)) || offset + 2 == con.limit
2596: && target.setIndex(offset) == CARRIAGE_RETURN
2597: && target.setIndex(offset + 1) == LINE_FEED))
2598: return -1;
2599: }
2600: break;
2601:
2602: case 'A':
2603: if (offset != con.start)
2604: return -1;
2605: break;
2606:
2607: case 'Z':
2608: if (!(offset == con.limit
2609: || offset + 1 == con.limit
2610: && isEOLChar(target.setIndex(offset)) || offset + 2 == con.limit
2611: && target.setIndex(offset) == CARRIAGE_RETURN
2612: && target.setIndex(offset + 1) == LINE_FEED))
2613: return -1;
2614: break;
2615:
2616: case 'z':
2617: if (offset != con.limit)
2618: return -1;
2619: break;
2620:
2621: case 'b':
2622: if (con.length == 0)
2623: return -1;
2624: {
2625: int after = getWordType(target, con.start,
2626: con.limit, offset, opts);
2627: if (after == WT_IGNORE)
2628: return -1;
2629: int before = getPreviousWordType(target,
2630: con.start, con.limit, offset, opts);
2631: if (after == before)
2632: return -1;
2633: }
2634: break;
2635:
2636: case 'B':
2637: if (con.length == 0)
2638: go = true;
2639: else {
2640: int after = getWordType(target, con.start,
2641: con.limit, offset, opts);
2642: go = after == WT_IGNORE
2643: || after == getPreviousWordType(target,
2644: con.start, con.limit, offset,
2645: opts);
2646: }
2647: if (!go)
2648: return -1;
2649: break;
2650:
2651: case '<':
2652: if (con.length == 0 || offset == con.limit)
2653: return -1;
2654: if (getWordType(target, con.start, con.limit,
2655: offset, opts) != WT_LETTER
2656: || getPreviousWordType(target, con.start,
2657: con.limit, offset, opts) != WT_OTHER)
2658: return -1;
2659: break;
2660:
2661: case '>':
2662: if (con.length == 0 || offset == con.start)
2663: return -1;
2664: if (getWordType(target, con.start, con.limit,
2665: offset, opts) != WT_OTHER
2666: || getPreviousWordType(target, con.start,
2667: con.limit, offset, opts) != WT_LETTER)
2668: return -1;
2669: break;
2670: } // switch anchor type
2671: op = op.next;
2672: break;
2673:
2674: case Op.BACKREFERENCE: {
2675: int refno = op.getData();
2676: if (refno <= 0 || refno >= this .nofparen)
2677: throw new RuntimeException(
2678: "Internal Error: Reference number must be more than zero: "
2679: + refno);
2680: if (con.match.getBeginning(refno) < 0
2681: || con.match.getEnd(refno) < 0)
2682: return -1; // ********
2683: int o2 = con.match.getBeginning(refno);
2684: int literallen = con.match.getEnd(refno) - o2;
2685: if (!isSet(opts, IGNORE_CASE)) {
2686: if (dx > 0) {
2687: if (!regionMatches(target, offset, con.limit,
2688: o2, literallen))
2689: return -1;
2690: offset += literallen;
2691: } else {
2692: if (!regionMatches(target, offset - literallen,
2693: con.limit, o2, literallen))
2694: return -1;
2695: offset -= literallen;
2696: }
2697: } else {
2698: if (dx > 0) {
2699: if (!regionMatchesIgnoreCase(target, offset,
2700: con.limit, o2, literallen))
2701: return -1;
2702: offset += literallen;
2703: } else {
2704: if (!regionMatchesIgnoreCase(target, offset
2705: - literallen, con.limit, o2, literallen))
2706: return -1;
2707: offset -= literallen;
2708: }
2709: }
2710: }
2711: op = op.next;
2712: break;
2713: case Op.STRING: {
2714: String literal = op.getString();
2715: int literallen = literal.length();
2716: if (!isSet(opts, IGNORE_CASE)) {
2717: if (dx > 0) {
2718: if (!regionMatches(target, offset, con.limit,
2719: literal, literallen))
2720: return -1;
2721: offset += literallen;
2722: } else {
2723: if (!regionMatches(target, offset - literallen,
2724: con.limit, literal, literallen))
2725: return -1;
2726: offset -= literallen;
2727: }
2728: } else {
2729: if (dx > 0) {
2730: if (!regionMatchesIgnoreCase(target, offset,
2731: con.limit, literal, literallen))
2732: return -1;
2733: offset += literallen;
2734: } else {
2735: if (!regionMatchesIgnoreCase(target, offset
2736: - literallen, con.limit, literal,
2737: literallen))
2738: return -1;
2739: offset -= literallen;
2740: }
2741: }
2742: }
2743: op = op.next;
2744: break;
2745:
2746: case Op.CLOSURE: {
2747: /*
2748: * Saves current position to avoid
2749: * zero-width repeats.
2750: */
2751: int id = op.getData();
2752: if (id >= 0) {
2753: int previousOffset = con.offsets[id];
2754: if (previousOffset < 0 || previousOffset != offset) {
2755: con.offsets[id] = offset;
2756: } else {
2757: con.offsets[id] = -1;
2758: op = op.next;
2759: break;
2760: }
2761: }
2762:
2763: int ret = this .matchCharacterIterator(con, op
2764: .getChild(), offset, dx, opts);
2765: if (id >= 0)
2766: con.offsets[id] = -1;
2767: if (ret >= 0)
2768: return ret;
2769: op = op.next;
2770: }
2771: break;
2772:
2773: case Op.QUESTION: {
2774: int ret = this .matchCharacterIterator(con, op
2775: .getChild(), offset, dx, opts);
2776: if (ret >= 0)
2777: return ret;
2778: op = op.next;
2779: }
2780: break;
2781:
2782: case Op.NONGREEDYCLOSURE:
2783: case Op.NONGREEDYQUESTION: {
2784: int ret = this .matchCharacterIterator(con, op.next,
2785: offset, dx, opts);
2786: if (ret >= 0)
2787: return ret;
2788: op = op.getChild();
2789: }
2790: break;
2791:
2792: case Op.UNION:
2793: for (int i = 0; i < op.size(); i++) {
2794: int ret = this .matchCharacterIterator(con, op
2795: .elementAt(i), offset, dx, opts);
2796: if (DEBUG) {
2797: System.err.println("UNION: " + i + ", ret="
2798: + ret);
2799: }
2800: if (ret >= 0)
2801: return ret;
2802: }
2803: return -1;
2804:
2805: case Op.CAPTURE:
2806: int refno = op.getData();
2807: if (con.match != null && refno > 0) {
2808: int save = con.match.getBeginning(refno);
2809: con.match.setBeginning(refno, offset);
2810: int ret = this .matchCharacterIterator(con, op.next,
2811: offset, dx, opts);
2812: if (ret < 0)
2813: con.match.setBeginning(refno, save);
2814: return ret;
2815: } else if (con.match != null && refno < 0) {
2816: int index = -refno;
2817: int save = con.match.getEnd(index);
2818: con.match.setEnd(index, offset);
2819: int ret = this .matchCharacterIterator(con, op.next,
2820: offset, dx, opts);
2821: if (ret < 0)
2822: con.match.setEnd(index, save);
2823: return ret;
2824: }
2825: op = op.next;
2826: break;
2827:
2828: case Op.LOOKAHEAD:
2829: if (0 > this .matchCharacterIterator(con, op.getChild(),
2830: offset, 1, opts))
2831: return -1;
2832: op = op.next;
2833: break;
2834: case Op.NEGATIVELOOKAHEAD:
2835: if (0 <= this .matchCharacterIterator(con,
2836: op.getChild(), offset, 1, opts))
2837: return -1;
2838: op = op.next;
2839: break;
2840: case Op.LOOKBEHIND:
2841: if (0 > this .matchCharacterIterator(con, op.getChild(),
2842: offset, -1, opts))
2843: return -1;
2844: op = op.next;
2845: break;
2846: case Op.NEGATIVELOOKBEHIND:
2847: if (0 <= this .matchCharacterIterator(con,
2848: op.getChild(), offset, -1, opts))
2849: return -1;
2850: op = op.next;
2851: break;
2852:
2853: case Op.INDEPENDENT: {
2854: int ret = this .matchCharacterIterator(con, op
2855: .getChild(), offset, dx, opts);
2856: if (ret < 0)
2857: return ret;
2858: offset = ret;
2859: op = op.next;
2860: }
2861: break;
2862:
2863: case Op.MODIFIER: {
2864: int localopts = opts;
2865: localopts |= op.getData();
2866: localopts &= ~op.getData2();
2867: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2868: int ret = this .matchCharacterIterator(con, op
2869: .getChild(), offset, dx, localopts);
2870: if (ret < 0)
2871: return ret;
2872: offset = ret;
2873: op = op.next;
2874: }
2875: break;
2876:
2877: case Op.CONDITION: {
2878: Op.ConditionOp cop = (Op.ConditionOp) op;
2879: boolean matchp = false;
2880: if (cop.refNumber > 0) {
2881: if (cop.refNumber >= this .nofparen)
2882: throw new RuntimeException(
2883: "Internal Error: Reference number must be more than zero: "
2884: + cop.refNumber);
2885: matchp = con.match.getBeginning(cop.refNumber) >= 0
2886: && con.match.getEnd(cop.refNumber) >= 0;
2887: } else {
2888: matchp = 0 <= this .matchCharacterIterator(con,
2889: cop.condition, offset, dx, opts);
2890: }
2891:
2892: if (matchp) {
2893: op = cop.yes;
2894: } else if (cop.no != null) {
2895: op = cop.no;
2896: } else {
2897: op = cop.next;
2898: }
2899: }
2900: break;
2901:
2902: default:
2903: throw new RuntimeException("Unknown operation type: "
2904: + op.type);
2905: } // switch (op.type)
2906: } // while
2907: }
2908:
2909: private static final int getPreviousWordType(
2910: CharacterIterator target, int begin, int end, int offset,
2911: int opts) {
2912: int ret = getWordType(target, begin, end, --offset, opts);
2913: while (ret == WT_IGNORE)
2914: ret = getWordType(target, begin, end, --offset, opts);
2915: return ret;
2916: }
2917:
2918: private static final int getWordType(CharacterIterator target,
2919: int begin, int end, int offset, int opts) {
2920: if (offset < begin || offset >= end)
2921: return WT_OTHER;
2922: return getWordType0(target.setIndex(offset), opts);
2923: }
2924:
2925: private static final boolean regionMatches(
2926: CharacterIterator target, int offset, int limit,
2927: String part, int partlen) {
2928: if (offset < 0)
2929: return false;
2930: if (limit - offset < partlen)
2931: return false;
2932: int i = 0;
2933: while (partlen-- > 0) {
2934: if (target.setIndex(offset++) != part.charAt(i++))
2935: return false;
2936: }
2937: return true;
2938: }
2939:
2940: private static final boolean regionMatches(
2941: CharacterIterator target, int offset, int limit,
2942: int offset2, int partlen) {
2943: if (offset < 0)
2944: return false;
2945: if (limit - offset < partlen)
2946: return false;
2947: int i = offset2;
2948: while (partlen-- > 0) {
2949: if (target.setIndex(offset++) != target.setIndex(i++))
2950: return false;
2951: }
2952: return true;
2953: }
2954:
2955: /**
2956: * @see java.lang.String#regionMatches
2957: */
2958: private static final boolean regionMatchesIgnoreCase(
2959: CharacterIterator target, int offset, int limit,
2960: String part, int partlen) {
2961: if (offset < 0)
2962: return false;
2963: if (limit - offset < partlen)
2964: return false;
2965: int i = 0;
2966: while (partlen-- > 0) {
2967: char ch1 = target.setIndex(offset++);
2968: char ch2 = part.charAt(i++);
2969: if (ch1 == ch2)
2970: continue;
2971: char uch1 = Character.toUpperCase(ch1);
2972: char uch2 = Character.toUpperCase(ch2);
2973: if (uch1 == uch2)
2974: continue;
2975: if (Character.toLowerCase(uch1) != Character
2976: .toLowerCase(uch2))
2977: return false;
2978: }
2979: return true;
2980: }
2981:
2982: private static final boolean regionMatchesIgnoreCase(
2983: CharacterIterator target, int offset, int limit,
2984: int offset2, int partlen) {
2985: if (offset < 0)
2986: return false;
2987: if (limit - offset < partlen)
2988: return false;
2989: int i = offset2;
2990: while (partlen-- > 0) {
2991: char ch1 = target.setIndex(offset++);
2992: char ch2 = target.setIndex(i++);
2993: if (ch1 == ch2)
2994: continue;
2995: char uch1 = Character.toUpperCase(ch1);
2996: char uch2 = Character.toUpperCase(ch2);
2997: if (uch1 == uch2)
2998: continue;
2999: if (Character.toLowerCase(uch1) != Character
3000: .toLowerCase(uch2))
3001: return false;
3002: }
3003: return true;
3004: }
3005:
3006: // ================================================================
3007:
/**
 * A regular expression.
 * @serial
 */
String regex;
/**
 * Option bit flags for this expression (tested with isSet, e.g.
 * XMLSCHEMA_MODE, IGNORE_CASE, SINGLE_LINE).
 * @serial
 */
int options;

/**
 * The number of parentheses in the regular expression.
 * Passed to Match.setNumberOfGroups(); back-reference numbers must be
 * smaller than this value.
 * @serial
 */
int nofparen;
/**
 * Internal representation of the regular expression.
 * @serial
 */
Token tokentree;

// When true, matching allocates a private Match even if the caller
// supplied none, so that back references can be resolved.
boolean hasBackReferences = false;

// Set by prepare() from tokentree.getMinLength(); subtracted from the
// search limit to bound the candidate start positions.
transient int minlength;
// Operation list executed by the matcher; null until prepare() has run.
// NOTE(review): presumably assigned by compile() - confirm.
transient Op operations = null;
// Passed to Context.reset() to size the per-closure offsets array.
transient int numberOfClosures;
// Lazily created Context, reused across calls when it is not in use.
transient Context context = null;
// When non-null, used to skip start positions whose first character
// cannot begin a match.
transient RangeToken firstChar = null;

// Literal that must occur in any match; when non-null the target is
// pre-checked with fixedStringTable before the full match is attempted.
transient String fixedString = null;
// NOTE(review): not used in the visible code; presumably the options
// the fixed string was extracted under - confirm.
transient int fixedStringOptions;
// Search table for fixedString (the matcher's comments call it Boyer-Moore).
transient BMPattern fixedStringTable = null;
// True when the pattern consists only of the fixed string, so the
// table search alone decides the match.
transient boolean fixedStringOnly = false;
3041:
3042: static final class Context {
3043: CharacterIterator ciTarget;
3044: String strTarget;
3045: char[] charTarget;
3046: int start;
3047: int limit;
3048: int length;
3049: Match match;
3050: boolean inuse = false;
3051: int[] offsets;
3052:
3053: Context() {
3054: }
3055:
3056: private void resetCommon(int nofclosures) {
3057: this .length = this .limit - this .start;
3058: this .inuse = true;
3059: this .match = null;
3060: if (this .offsets == null
3061: || this .offsets.length != nofclosures)
3062: this .offsets = new int[nofclosures];
3063: for (int i = 0; i < nofclosures; i++)
3064: this .offsets[i] = -1;
3065: }
3066:
3067: void reset(CharacterIterator target, int start, int limit,
3068: int nofclosures) {
3069: this .ciTarget = target;
3070: this .start = start;
3071: this .limit = limit;
3072: this .resetCommon(nofclosures);
3073: }
3074:
3075: void reset(String target, int start, int limit, int nofclosures) {
3076: this .strTarget = target;
3077: this .start = start;
3078: this .limit = limit;
3079: this .resetCommon(nofclosures);
3080: }
3081:
3082: void reset(char[] target, int start, int limit, int nofclosures) {
3083: this .charTarget = target;
3084: this .start = start;
3085: this .limit = limit;
3086: this .resetCommon(nofclosures);
3087: }
3088: }
3089:
3090: /**
3091: * Prepares for matching. This method is called just before starting matching.
3092: */
3093: void prepare() {
3094: if (Op.COUNT)
3095: Op.nofinstances = 0;
3096: this .compile(this .tokentree);
3097: /*
3098: if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
3099: Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
3100: anchor.next = this.operations;
3101: this.operations = anchor;
3102: }
3103: */
3104: if (Op.COUNT)
3105: System.err.println("DEBUG: The number of operations: "
3106: + Op.nofinstances);
3107:
3108: this .minlength = this .tokentree.getMinLength();
3109:
3110: this .firstChar = null;
3111: if (!isSet(this .options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
3112: && !isSet(this .options, XMLSCHEMA_MODE)) {
3113: RangeToken firstChar = Token.createRange();
3114: int fresult = this .tokentree.analyzeFirstCharacter(
3115: firstChar, this .options);
3116: if (fresult == Token.FC_TERMINAL) {
3117: firstChar.compactRanges();
3118: this .firstChar = firstChar;
3119: if (DEBUG)
3120: System.err
3121: .println("DEBUG: Use the first character optimization: "
3122: + firstChar);
3123: }
3124: }
3125:
3126: if (this .operations != null
3127: && (this .operations.type == Op.STRING || this .operations.type == Op.CHAR)
3128: && this .operations.next == null) {
3129: if (DEBUG)
3130: System.err.print(" *** Only fixed string! *** ");
3131: this .fixedStringOnly = true;
3132: if (this .operations.type == Op.STRING)
3133: this .fixedString = this .operations.getString();
3134: else if (this .operations.getData() >= 0x10000) { // Op.CHAR
3135: this .fixedString = REUtil
3136: .decomposeToSurrogates(this .operations
3137: .getData());
3138: } else {
3139: char[] ac = new char[1];
3140: ac[0] = (char) this .operations.getData();
3141: this .fixedString = new String(ac);
3142: }
3143: this .fixedStringOptions = this .options;
3144: this .fixedStringTable = new BMPattern(this .fixedString,
3145: 256, isSet(this .fixedStringOptions, IGNORE_CASE));
3146: } else if (!isSet(this .options,
3147: PROHIBIT_FIXED_STRING_OPTIMIZATION)
3148: && !isSet(this .options, XMLSCHEMA_MODE)) {
3149: Token.FixedStringContainer container = new Token.FixedStringContainer();
3150: this .tokentree.findFixedString(container, this .options);
3151: this .fixedString = container.token == null ? null
3152: : container.token.getString();
3153: this .fixedStringOptions = container.options;
3154: if (this .fixedString != null
3155: && this .fixedString.length() < 2)
3156: this .fixedString = null;
3157: // This pattern has a fixed string of which length is more than one.
3158: if (this .fixedString != null) {
3159: this .fixedStringTable = new BMPattern(this .fixedString,
3160: 256,
3161: isSet(this .fixedStringOptions, IGNORE_CASE));
3162: if (DEBUG) {
3163: System.err
3164: .println("DEBUG: The longest fixed string: "
3165: + this .fixedString.length()
3166: + "/" //+this.fixedString
3167: + "/"
3168: + REUtil
3169: .createOptionString(this .fixedStringOptions));
3170: System.err.print("String: ");
3171: REUtil.dumpString(this .fixedString);
3172: }
3173: }
3174: }
3175: }
3176:
/*
 * DISABLED OPTION - the declaration below is intentionally kept inside
 * this comment, so bit 0 is currently unassigned.
 *
 * An option.
 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
 * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
 * does not capture.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
static final int MARK_PARENS = 1<<0;
 */

/**
 * "i". Case-insensitive matching; <code>prepare()</code> passes this flag
 * to the fixed-string search table.
 */
static final int IGNORE_CASE = 1 << 1;

/**
 * "s". Single-line option flag.
 */
static final int SINGLE_LINE = 1 << 2;

/**
 * "m". Multiple-lines option flag.
 */
static final int MULTIPLE_LINES = 1 << 3;

/**
 * "x". Extended-comment option flag.
 */
static final int EXTENDED_COMMENT = 1 << 4;

/**
 * "u". This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
 * @see #UNICODE_WORD_BOUNDARY
 */
static final int USE_UNICODE_CATEGORY = 1 << 5; // "u"

/**
 * "w". An option.
 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>.
 * <p>By default, the engine considers a position between a word character
 * (<span class="REGEX"><kbd>\w</kbd></span>) and a non word character
 * is a word boundary.
 * <p>By this option, the engine checks word boundaries with the method of
 * 'Unicode Regular Expression Guidelines' Revision 4.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
 */
static final int UNICODE_WORD_BOUNDARY = 1 << 6; // "w"

/**
 * "H". Turns off the first-character optimization computed by <code>prepare()</code>.
 */
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1 << 7;
/**
 * "F". Turns off the fixed-string search that <code>prepare()</code> sets up for
 * patterns that are not a single literal.
 */
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1 << 8;
/**
 * "X". XML Schema mode. Selects <code>ParserForXMLSchema</code> in
 * <code>setPattern()</code> and disables both optimizations above.
 */
static final int XMLSCHEMA_MODE = 1 << 9;
/**
 * ",". Special-comma option flag.
 */
static final int SPECIAL_COMMA = 1 << 10;
3247:
3248: private static final boolean isSet(int options, int flag) {
3249: return (options & flag) == flag;
3250: }
3251:
/**
 * Creates a new RegularExpression instance with no option string.
 *
 * @param regex A regular expression
 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
 */
public RegularExpression(String regex) throws ParseException {
    // The cast pins the call to the (String, String) overload.
    setPattern(regex, (String) null);
}
3261:
/**
 * Creates a new RegularExpression instance with options.
 *
 * @param regex A regular expression
 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
 */
public RegularExpression(String regex, String options)
        throws ParseException {
    setPattern(regex, options);
}
3273:
/**
 * Creates an instance from an already parsed pattern; no parsing is done here.
 *
 * @param regex             the pattern text
 * @param tok               the token tree for the pattern
 * @param parens            the value stored as the number of parentheses
 * @param hasBackReferences stored as the back-reference indicator
 * @param options           option flags as a bit set
 */
RegularExpression(String regex, Token tok, int parens,
        boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.nofparen = parens;
    this.hasBackReferences = hasBackReferences;
}
3282:
3283: /**
3284: *
3285: */
3286: public void setPattern(String newPattern) throws ParseException {
3287: this .setPattern(newPattern, this .options);
3288: }
3289:
3290: private void setPattern(String newPattern, int options)
3291: throws ParseException {
3292: this .regex = newPattern;
3293: this .options = options;
3294: RegexParser rp = RegularExpression.isSet(this .options,
3295: RegularExpression.XMLSCHEMA_MODE) ? new ParserForXMLSchema()
3296: : new RegexParser();
3297: this .tokentree = rp.parse(this .regex, this .options);
3298: this .nofparen = rp.parennumber;
3299: this .hasBackReferences = rp.hasBackReferences;
3300:
3301: this .operations = null;
3302: this .context = null;
3303: }
3304:
3305: /**
3306: *
3307: */
3308: public void setPattern(String newPattern, String options)
3309: throws ParseException {
3310: this .setPattern(newPattern, REUtil.parseOptions(options));
3311: }
3312:
3313: /**
3314: *
3315: */
3316: public String getPattern() {
3317: return this .regex;
3318: }
3319:
3320: /**
3321: * Represents this instence in String.
3322: */
3323: public String toString() {
3324: return this .tokentree.toString(this .options);
3325: }
3326:
3327: /**
3328: * Returns a option string.
3329: * The order of letters in it may be different from a string specified
3330: * in a constructor or <code>setPattern()</code>.
3331: *
3332: * @see #RegularExpression(java.lang.String,java.lang.String)
3333: * @see #setPattern(java.lang.String,java.lang.String)
3334: */
3335: public String getOptions() {
3336: return REUtil.createOptionString(this .options);
3337: }
3338:
3339: /**
3340: * Return true if patterns are the same and the options are equivalent.
3341: */
3342: public boolean equals(Object obj) {
3343: if (obj == null)
3344: return false;
3345: if (!(obj instanceof RegularExpression))
3346: return false;
3347: RegularExpression r = (RegularExpression) obj;
3348: return this .regex.equals(r.regex) && this .options == r.options;
3349: }
3350:
3351: boolean equals(String pattern, int options) {
3352: return this .regex.equals(pattern) && this .options == options;
3353: }
3354:
3355: /**
3356: *
3357: */
3358: public int hashCode() {
3359: return (this .regex + "/" + this .getOptions()).hashCode();
3360: }
3361:
3362: /**
3363: * Return the number of regular expression groups.
3364: * This method returns 1 when the regular expression has no capturing-parenthesis.
3365: *
3366: */
3367: public int getNumberOfGroups() {
3368: return this .nofparen;
3369: }
3370:
3371: // ================================================================
3372:
3373: private static final int WT_IGNORE = 0;
3374: private static final int WT_LETTER = 1;
3375: private static final int WT_OTHER = 2;
3376:
3377: private static final int getWordType0(char ch, int opts) {
3378: if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3379: if (isSet(opts, USE_UNICODE_CATEGORY)) {
3380: return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER
3381: : WT_OTHER;
3382: }
3383: return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3384: }
3385:
3386: switch (Character.getType(ch)) {
3387: case Character.UPPERCASE_LETTER: // L
3388: case Character.LOWERCASE_LETTER: // L
3389: case Character.TITLECASE_LETTER: // L
3390: case Character.MODIFIER_LETTER: // L
3391: case Character.OTHER_LETTER: // L
3392: case Character.LETTER_NUMBER: // N
3393: case Character.DECIMAL_DIGIT_NUMBER: // N
3394: case Character.OTHER_NUMBER: // N
3395: case Character.COMBINING_SPACING_MARK: // Mc
3396: return WT_LETTER;
3397:
3398: case Character.FORMAT: // Cf
3399: case Character.NON_SPACING_MARK: // Mn
3400: case Character.ENCLOSING_MARK: // Mc
3401: return WT_IGNORE;
3402:
3403: case Character.CONTROL: // Cc
3404: switch (ch) {
3405: case '\t':
3406: case '\n':
3407: case '\u000B':
3408: case '\f':
3409: case '\r':
3410: return WT_OTHER;
3411: default:
3412: return WT_IGNORE;
3413: }
3414:
3415: default:
3416: return WT_OTHER;
3417: }
3418: }
3419:
3420: // ================================================================
3421:
3422: static final int LINE_FEED = 0x000A;
3423: static final int CARRIAGE_RETURN = 0x000D;
3424: static final int LINE_SEPARATOR = 0x2028;
3425: static final int PARAGRAPH_SEPARATOR = 0x2029;
3426:
3427: private static final boolean isEOLChar(int ch) {
3428: return ch == LINE_FEED || ch == CARRIAGE_RETURN
3429: || ch == LINE_SEPARATOR || ch == PARAGRAPH_SEPARATOR;
3430: }
3431:
3432: private static final boolean isWordChar(int ch) { // Legacy word characters
3433: if (ch == '_')
3434: return true;
3435: if (ch < '0')
3436: return false;
3437: if (ch > 'z')
3438: return false;
3439: if (ch <= '9')
3440: return true;
3441: if (ch < 'A')
3442: return false;
3443: if (ch <= 'Z')
3444: return true;
3445: if (ch < 'a')
3446: return false;
3447: return true;
3448: }
3449:
3450: private static final boolean matchIgnoreCase(int chardata, int ch) {
3451: if (chardata == ch)
3452: return true;
3453: if (chardata > 0xffff || ch > 0xffff)
3454: return false;
3455: char uch1 = Character.toUpperCase((char) chardata);
3456: char uch2 = Character.toUpperCase((char) ch);
3457: if (uch1 == uch2)
3458: return true;
3459: return Character.toLowerCase(uch1) == Character
3460: .toLowerCase(uch2);
3461: }
3462: }
|