0001: /*
0002: * The Apache Software License, Version 1.1
0003: *
0004: *
0005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
0006: * reserved.
0007: *
0008: * Redistribution and use in source and binary forms, with or without
0009: * modification, are permitted provided that the following conditions
0010: * are met:
0011: *
0012: * 1. Redistributions of source code must retain the above copyright
0013: * notice, this list of conditions and the following disclaimer.
0014: *
0015: * 2. Redistributions in binary form must reproduce the above copyright
0016: * notice, this list of conditions and the following disclaimer in
0017: * the documentation and/or other materials provided with the
0018: * distribution.
0019: *
0020: * 3. The end-user documentation included with the redistribution,
0021: * if any, must include the following acknowledgment:
0022: * "This product includes software developed by the
0023: * Apache Software Foundation (http://www.apache.org/)."
0024: * Alternately, this acknowledgment may appear in the software itself,
0025: * if and wherever such third-party acknowledgments normally appear.
0026: *
0027: * 4. The names "Xerces" and "Apache Software Foundation" must
0028: * not be used to endorse or promote products derived from this
0029: * software without prior written permission. For written
0030: * permission, please contact apache@apache.org.
0031: *
0032: * 5. Products derived from this software may not be called "Apache",
0033: * nor may "Apache" appear in their name, without prior written
0034: * permission of the Apache Software Foundation.
0035: *
0036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0047: * SUCH DAMAGE.
0048: * ====================================================================
0049: *
0050: * This software consists of voluntary contributions made by many
0051: * individuals on behalf of the Apache Software Foundation and was
0052: * originally based on software copyright (c) 1999, International
0053: * Business Machines, Inc., http://www.apache.org. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.xerces.utils.regex;
0059:
0060: import java.text.CharacterIterator;
0061:
0062: /**
0063: * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
0064: * This engine does not conform to the POSIX regular expression.
0065: *
0066: * <hr width="50%">
0067: * <h3>How to use</h3>
0068: *
0069: * <dl>
0070: * <dt>A. Standard way
0071: * <dd>
0072: * <pre>
0073: * RegularExpression re = new RegularExpression(<var>regex</var>);
0074: * if (re.matches(text)) { ... }
0075: * </pre>
0076: *
0077: * <dt>B. Capturing groups
0078: * <dd>
0079: * <pre>
0080: * RegularExpression re = new RegularExpression(<var>regex</var>);
0081: * Match match = new Match();
0082: * if (re.matches(text, match)) {
0083: * ... // You can refer captured texts with methods of the <code>Match</code> class.
0084: * }
0085: * </pre>
0086: *
0087: * </dl>
0088: *
0089: * <h4>Case-insensitive matching</h4>
0090: * <pre>
0091: * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
0092: * if (re.matches(text)) { ... }
0093: * </pre>
0094: *
0095: * <h4>Options</h4>
0096: * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
0097: * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
0098: * This <var>options</var> parameter consists of the following characters.
0099: * </p>
0100: * <dl>
0101: * <dt><a name="I_OPTION"><code>"i"</code></a>
0102: * <dd>This option indicates case-insensitive matching.
0103: * <dt><a name="M_OPTION"><code>"m"</code></a>
0104: * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
0105: * <dt><a name="S_OPTION"><code>"s"</code></a>
0106: * <dd class="REGEX"><kbd>.</kbd> matches any one character.
0107: * <dt><a name="U_OPTION"><code>"u"</code></a>
0108: * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode.
0109: * <dt><a name="W_OPTION"><code>"w"</code></a>
0110: * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
0111: * 'Unicode Regular Expression Guidelines' Revision 4.
0112: * When "w" and "u" are specified at the same time,
0113: * <kbd>\b \B \< \></kbd> are processed for the "w" option.
0114: * <dt><a name="COMMA_OPTION"><code>","</code></a>
0115: * <dd>The parser treats a comma in a character class as a range separator.
0116: * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
0117: * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
0118: *
0119: * <dt><a name="X_OPTION"><code>"X"</code></a>
0120: * <dd class="REGEX">
0121: * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
0122: * The <code>match()</code> method does not do substring matching
0123: * but entire string matching.
0124: *
0125: * </dl>
0126: *
0127: * <hr width="50%">
0128: * <h3>Syntax</h3>
0129: * <table border="1" bgcolor="#ddeeff">
0130: * <tr>
0131: * <td>
0132: * <h4>Differences from the Perl 5 regular expression</h4>
0133: * <ul>
0134: * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
0135: * <li>Supports subtraction, union, and intersection operations for character classes.
0136: * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
0137: * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
0138: * <kbd>\u005cu</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
0139: * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
0140: * <kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <kbd>(??{</kbd><var>code</var><kbd>})</kbd>
0141: * </ul>
0142: * </td>
0143: * </tr>
0144: * </table>
0145: *
0146: * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
0147: * <ul>
0148: * <li>Character
0149: * <dl>
0150: * <dt class="REGEX"><kbd>.</kbd> (A period)
0151: * <dd>Matches any one character except the following characters.
0152: * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
0153: * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
0154: * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
0155: * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
0156: * it matches any character including the above four characters.
0157: *
0158: * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
0159: * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
0160: * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
0161: *
0162: * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
0163: * <dd>Matches a control character.
0164: * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
0165: * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
0166: * It matches a control character of which the character code is less than
0167: * the character code of the <var>C</var> by 0x0040.
0168: * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
0169: * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
0170: *
0171: * <dt class="REGEX">a non-meta character
0172: * <dd>Matches the character.
0173: *
0174: * <dt class="REGEX"><KBD>\</KBD> + a meta character
0175: * <dd>Matches the meta character.
0176: *
0177: * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
0178: * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
0179: * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
0180: * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
0181: *
0182: * <!--
0183: * <dt class="REGEX"><kbd>\u005cu</kbd><var>HHHH</var>
0184: * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
0185: * -->
0186: *
0187: * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
0188: * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
0189: *
0190: * <dt class="REGEX"><kbd>\g</kbd>
0191: * <dd>Matches a grapheme.
0192: * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
0193: *
0194: * <dt class="REGEX"><kbd>\X</kbd>
0195: * <dd class="REGEX">Matches a combining character sequence.
0196: * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
0197: * </dl>
0198: * </li>
0199: *
0200: * <li>Character class
0201: * <dl>
0202: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
0203: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
0204: * <dd>Positive character class. It matches a character in ranges.
0205: * <dd><var>R<sub>n</sub></var>:
0206: * <ul>
0207: * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005cu</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
0208: * <p>This range matches the character.
0209: * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
0210: * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
0211: * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
0212: * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
0213: * <p>...
0214: * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
0215: * <p>These expressions specify the same ranges as the following expressions.
0216: * </ul>
0217: * <p class="REGEX">Enumerated ranges are merged (union operation).
0218: * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
0219: *
0220: * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
0221: * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
0222: * <dd>Negative character class. It matches a character not in ranges.
0223: *
0224: * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
0225: * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
0226: * <dd>Subtraction or union or intersection for character classes.
0227: * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
0228: * <dd>The result of these operations is a <u>positive character class</u>
0229: * even if an expression includes any negative character classes.
0230: * You have to take care on this in case-insensitive matching.
0231: * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
0232: * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
0233: * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
0234: * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
0235: * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
0236: *
0237: * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
0238: * <dd>Character class subtraction for the XML Schema.
0239: * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
0240: *
0241: * <dt class="REGEX"><kbd>\d</kbd>
0242: * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
0243: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0244: * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
0245: *
0246: * <dt class="REGEX"><kbd>\D</kbd>
0247: * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
0248: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0249: * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
0250: *
0251: * <dt class="REGEX"><kbd>\s</kbd>
0252: * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
0253: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0254: * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
0255: *
0256: * <dt class="REGEX"><kbd>\S</kbd>
0257: * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
0258: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0259: * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
0260: *
0261: * <dt class="REGEX"><kbd>\w</kbd>
0262: * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
0263: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0264: * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
0265: *
0266: * <dt class="REGEX"><kbd>\W</kbd>
0267: * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
0268: * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
0269: * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
0270: *
0271: * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
0272: * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
0273: * The following names are available:
0274: * <dl>
0275: * <dt>Unicode General Categories:
0276: * <dd><kbd>
0277: * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
0278: * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
0279: * </kbd>
0280: * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
0281: * <dt>Unicode Blocks:
0282: * <dd><kbd>
0283: * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
0284: * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
0285: * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
0286: * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
0287: * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
0288: * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
0289: * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
0290: * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
0291: * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
0292: * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
0293: * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
0294: * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
0295: * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
0296: * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
0297: * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
0298: * Small Form Variants, Arabic Presentation Forms-B, Specials,
0299: * Halfwidth and Fullwidth Forms
0300: * </kbd>
0301: * <dt>Others:
0302: * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
0303: * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
0304: * <dd><kbd>UNASSIGNED</kbd>
0305: * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
0306: * </dl>
0307: *
0308: * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
0309: * <dd>Matches one character not in the specified General Category or the specified Block.
0310: * </dl>
0311: * </li>
0312: *
0313: * <li>Selection and Quantifier
0314: * <dl>
0315: * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
0316: * <dd>...
0317: *
0318: * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
0319: * <dd>Matches 0 or more <var>X</var>.
0320: *
0321: * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
0322: * <dd>Matches 1 or more <var>X</var>.
0323: *
0324: * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
0325: * <dd>Matches 0 or 1 <var>X</var>.
0326: *
0327: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
0328: * <dd>Matches <var>number</var> times.
0329: *
0330: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
0331: * <dd>...
0332: *
0333: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
0334: * <dd>...
0335: *
0336: * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
0337: * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
0338: * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
0339: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
0340: * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
0341: * <dd>Non-greedy matching.
0342: * </dl>
0343: * </li>
0344: *
0345: * <li>Grouping, Capturing, and Back-reference
0346: * <dl>
0347: * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
0348: * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
0349: * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
0350: * you have to write "<KBD>(?:foo)+</KBD>".
0351: *
0352: * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
0353: * <dd>Grouping with capturing.
0354: * It makes a group and applications can know
0355: * where in target text a group matched with methods of a <code>Match</code> instance
0356: * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
0357: * The 0th group means whole of this regular expression.
0358: * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
0359: *
0360: * <p>For instance, a regular expression is
0361: * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
0362: * and target text is
0363: * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
0364: * <ul>
0365: * <li><code>Match.getCapturedText(0)</code>:
0366: * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
0367: * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
0368: * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
0369: * </ul>
0370: *
0371: * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
0372: * <dd>
0373: *
0374: * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
0375: * <dd>Independent expression group. ................
0376: *
0377: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
0378: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
0379: * <dd>............................
0380: * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
0381: * Note that it can not contain 'u'.
0382: *
0383: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
0384: * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
0385: * <dd>......
0386: * <dd>These expressions must be at the beginning of a group.
0387: * </dl>
0388: * </li>
0389: *
0390: * <li>Anchor
0391: * <dl>
0392: * <dt class="REGEX"><kbd>\A</kbd>
0393: * <dd>Matches the beginning of the text.
0394: *
0395: * <dt class="REGEX"><kbd>\Z</kbd>
0396: * <dd>Matches the end of the text, or before an EOL character at the end of the text,
0397: * or CARRIAGE RETURN + LINE FEED at the end of the text.
0398: *
0399: * <dt class="REGEX"><kbd>\z</kbd>
0400: * <dd>Matches the end of the text.
0401: *
0402: * <dt class="REGEX"><kbd>^</kbd>
0403: * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
0404: * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
0405: * it matches the beginning of the text, or after one of EOL characters (
0406: * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
0407: * PARAGRAPH SEPARATOR (U+2029).)
0408: *
0409: * <dt class="REGEX"><kbd>$</kbd>
0410: * <dd>Matches the end of the text, or before an EOL character at the end of the text,
0411: * or CARRIAGE RETURN + LINE FEED at the end of the text.
0412: * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
0413: * it matches the end of the text, or before an EOL character.
0414: *
0415: * <dt class="REGEX"><kbd>\b</kbd>
0416: * <dd>Matches word boundary.
0417: * (See <a href="#W_OPTION">a "w" option</a>)
0418: *
0419: * <dt class="REGEX"><kbd>\B</kbd>
0420: * <dd>Matches non word boundary.
0421: * (See <a href="#W_OPTION">a "w" option</a>)
0422: *
0423: * <dt class="REGEX"><kbd>\<</kbd>
0424: * <dd>Matches the beginning of a word.
0425: * (See <a href="#W_OPTION">a "w" option</a>)
0426: *
0427: * <dt class="REGEX"><kbd>\></kbd>
0428: * <dd>Matches the end of a word.
0429: * (See <a href="#W_OPTION">a "w" option</a>)
0430: * </dl>
0431: * </li>
0432: * <li>Lookahead and lookbehind
0433: * <dl>
0434: * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
0435: * <dd>Lookahead.
0436: *
0437: * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
0438: * <dd>Negative lookahead.
0439: *
0440: * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
0441: * <dd>Lookbehind.
0442: * <dd>(Note for text capturing......)
0443: *
0444: * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
0445: * <dd>Negative lookbehind.
0446: * </dl>
0447: * </li>
0448: *
0449: * <li>Misc.
0450: * <dl>
0451: * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
0452: * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
0453: * <dd>......
0454: * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
0455: * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
0456: * You can not write comments in character classes and before quantifiers.
0457: * </dl>
0458: * </li>
0459: * </ul>
0460: *
0461: *
0462: * <hr width="50%">
0463: * <h3>BNF for the regular expression</h3>
0464: * <pre>
0465: * regex ::= ('(?' options ')')? term ('|' term)*
0466: * term ::= factor+
0467: * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
0468: * | '(?#' [^)]* ')'
0469: * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
0470: * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
0471: * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
0472: * | '(?>' regex ')' | '(?' options ':' regex ')'
0473: * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
0474: * options ::= [imsw]* ('-' [imsw]+)?
0475: * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
0476: * looks ::= '(?=' regex ')' | '(?!' regex ')'
0477: * | '(?<=' regex ')' | '(?<!' regex ')'
0478: * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
0479: * category-block ::= '\' [pP] category-symbol-1
0480: * | ('\p{' | '\P{') (category-symbol | block-name
0481: * | other-properties) '}'
0482: * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
0483: * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
0484: * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
0485: * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
0486: * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
0487: * | 'Sm' | 'Sc' | 'Sk' | 'So'
0488: * block-name ::= (See above)
0489: * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
0490: * character-1 ::= (any character except meta-characters)
0491: *
0492: * char-class ::= '[' ranges ']'
0493: * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
0494: * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
0495: * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
0496: * | range-char | range-char '-' range-char
0497: * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
0498: * code-point ::= '\x' hex-char hex-char
0499: * | '\x{' hex-char+ '}'
0500: * <!-- | '\u005cu' hex-char hex-char hex-char hex-char
0501: * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
0502: * hex-char ::= [0-9a-fA-F]
0503: * character-2 ::= (any character except \[]-,)
0504: * </pre>
0505: *
0506: * <hr width="50%">
0507: * <h3>TODO</h3>
0508: * <ul>
0509: * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
0510: * <ul>
0511: * <li>2.4 Canonical Equivalents
0512: * <li>Level 3
0513: * </ul>
0514: * <li>Parsing performance
0515: * </ul>
0516: *
0517: * <hr width="50%">
0518: *
0519: * @author TAMURA Kent <kent@trl.ibm.co.jp>
0520: */
0521: public class RegularExpression implements java.io.Serializable {
0522: static final boolean DEBUG = false;
0523:
0524: /**
0525: * Compiles a token tree into an operation flow.
0526: */
0527: private synchronized void compile(Token tok) {
0528: if (this .operations != null)
0529: return;
0530: this .numberOfClosures = 0;
0531: this .operations = this .compile(tok, null, false);
0532: }
0533:
0534: /**
0535: * Converts a token to an operation.
0536: */
0537: private Op compile(Token tok, Op next, boolean reverse) {
0538: Op ret;
0539: switch (tok.type) {
0540: case Token.DOT:
0541: ret = Op.createDot();
0542: ret.next = next;
0543: break;
0544:
0545: case Token.CHAR:
0546: ret = Op.createChar(tok.getChar());
0547: ret.next = next;
0548: break;
0549:
0550: case Token.ANCHOR:
0551: ret = Op.createAnchor(tok.getChar());
0552: ret.next = next;
0553: break;
0554:
0555: case Token.RANGE:
0556: case Token.NRANGE:
0557: ret = Op.createRange(tok);
0558: ret.next = next;
0559: break;
0560:
0561: case Token.CONCAT:
0562: ret = next;
0563: if (!reverse) {
0564: for (int i = tok.size() - 1; i >= 0; i--) {
0565: ret = compile(tok.getChild(i), ret, false);
0566: }
0567: } else {
0568: for (int i = 0; i < tok.size(); i++) {
0569: ret = compile(tok.getChild(i), ret, true);
0570: }
0571: }
0572: break;
0573:
0574: case Token.UNION:
0575: Op.UnionOp uni = Op.createUnion(tok.size());
0576: for (int i = 0; i < tok.size(); i++) {
0577: uni.addElement(compile(tok.getChild(i), next, reverse));
0578: }
0579: ret = uni; // ret.next is null.
0580: break;
0581:
0582: case Token.CLOSURE:
0583: case Token.NONGREEDYCLOSURE:
0584: Token child = tok.getChild(0);
0585: int min = tok.getMin();
0586: int max = tok.getMax();
0587: if (min >= 0 && min == max) { // {n}
0588: ret = next;
0589: for (int i = 0; i < min; i++) {
0590: ret = compile(child, ret, reverse);
0591: }
0592: break;
0593: }
0594: if (min > 0 && max > 0)
0595: max -= min;
0596: if (max > 0) {
0597: // X{2,6} -> XX(X(X(XX?)?)?)?
0598: ret = next;
0599: for (int i = 0; i < max; i++) {
0600: Op.ChildOp q = Op
0601: .createQuestion(tok.type == Token.NONGREEDYCLOSURE);
0602: q.next = next;
0603: q.setChild(compile(child, ret, reverse));
0604: ret = q;
0605: }
0606: } else {
0607: Op.ChildOp op;
0608: if (tok.type == Token.NONGREEDYCLOSURE) {
0609: op = Op.createNonGreedyClosure();
0610: } else { // Token.CLOSURE
0611: if (child.getMinLength() == 0)
0612: op = Op.createClosure(this .numberOfClosures++);
0613: else
0614: op = Op.createClosure(-1);
0615: }
0616: op.next = next;
0617: op.setChild(compile(child, op, reverse));
0618: ret = op;
0619: }
0620: if (min > 0) {
0621: for (int i = 0; i < min; i++) {
0622: ret = compile(child, ret, reverse);
0623: }
0624: }
0625: break;
0626:
0627: case Token.EMPTY:
0628: ret = next;
0629: break;
0630:
0631: case Token.STRING:
0632: ret = Op.createString(tok.getString());
0633: ret.next = next;
0634: break;
0635:
0636: case Token.BACKREFERENCE:
0637: ret = Op.createBackReference(tok.getReferenceNumber());
0638: ret.next = next;
0639: break;
0640:
0641: case Token.PAREN:
0642: if (tok.getParenNumber() == 0) {
0643: ret = compile(tok.getChild(0), next, reverse);
0644: } else if (reverse) {
0645: next = Op.createCapture(tok.getParenNumber(), next);
0646: next = compile(tok.getChild(0), next, reverse);
0647: ret = Op.createCapture(-tok.getParenNumber(), next);
0648: } else {
0649: next = Op.createCapture(-tok.getParenNumber(), next);
0650: next = compile(tok.getChild(0), next, reverse);
0651: ret = Op.createCapture(tok.getParenNumber(), next);
0652: }
0653: break;
0654:
0655: case Token.LOOKAHEAD:
0656: ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok
0657: .getChild(0), null, false));
0658: break;
0659: case Token.NEGATIVELOOKAHEAD:
0660: ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok
0661: .getChild(0), null, false));
0662: break;
0663: case Token.LOOKBEHIND:
0664: ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok
0665: .getChild(0), null, true));
0666: break;
0667: case Token.NEGATIVELOOKBEHIND:
0668: ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(
0669: tok.getChild(0), null, true));
0670: break;
0671:
0672: case Token.INDEPENDENT:
0673: ret = Op.createIndependent(next, compile(tok.getChild(0),
0674: null, reverse));
0675: break;
0676:
0677: case Token.MODIFIERGROUP:
0678: ret = Op.createModifier(next, compile(tok.getChild(0),
0679: null, reverse), ((Token.ModifierToken) tok)
0680: .getOptions(), ((Token.ModifierToken) tok)
0681: .getOptionsMask());
0682: break;
0683:
0684: case Token.CONDITION:
0685: Token.ConditionToken ctok = (Token.ConditionToken) tok;
0686: int ref = ctok.refNumber;
0687: Op condition = ctok.condition == null ? null : compile(
0688: ctok.condition, null, reverse);
0689: Op yes = compile(ctok.yes, next, reverse);
0690: Op no = ctok.no == null ? null : compile(ctok.no, next,
0691: reverse);
0692: ret = Op.createCondition(next, ref, condition, yes, no);
0693: break;
0694:
0695: default:
0696: throw new RuntimeException("Unknown token type: "
0697: + tok.type);
0698: } // switch (tok.type)
0699: return ret;
0700: }
0701:
0702: //Public
0703:
0704: /**
0705: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
0706: *
0707: * @return true if the target is matched to this regular expression.
0708: */
0709: public boolean matches(char[] target) {
0710: return this .matches(target, 0, target.length, (Match) null);
0711: }
0712:
0713: /**
0714: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
0715: * in specified range or not.
0716: *
0717: * @param start Start offset of the range.
0718: * @param end End offset +1 of the range.
0719: * @return true if the target is matched to this regular expression.
0720: */
0721: public boolean matches(char[] target, int start, int end) {
0722: return this .matches(target, start, end, (Match) null);
0723: }
0724:
0725: /**
0726: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
0727: *
0728: * @param match A Match instance for storing matching result.
0729: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
0730: */
0731: public boolean matches(char[] target, Match match) {
0732: return this .matches(target, 0, target.length, match);
0733: }
0734:
0735: /**
0736: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
0737: * in specified range or not.
0738: *
0739: * @param start Start offset of the range.
0740: * @param end End offset +1 of the range.
0741: * @param match A Match instance for storing matching result.
0742: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
0743: */
0744: public boolean matches(char[] target, int start, int end,
0745: Match match) {
0746:
0747: synchronized (this ) {
0748: if (this .operations == null)
0749: this .prepare();
0750: if (this .context == null)
0751: this .context = new Context();
0752: }
0753: Context con = null;
0754: synchronized (this .context) {
0755: con = this .context.inuse ? new Context() : this .context;
0756: con.reset(target, start, end, this .numberOfClosures);
0757: }
0758: if (match != null) {
0759: match.setNumberOfGroups(this .nofparen);
0760: match.setSource(target);
0761: } else if (this .hasBackReferences) {
0762: match = new Match();
0763: match.setNumberOfGroups(this .nofparen);
0764: // Need not to call setSource() because
0765: // a caller can not access this match instance.
0766: }
0767: con.match = match;
0768:
0769: if (this .isSet(this .options, XMLSCHEMA_MODE)) {
0770: int matchEnd = this .matchCharArray(con, this .operations,
0771: con.start, 1, this .options);
0772: //System.err.println("DEBUG: matchEnd="+matchEnd);
0773: if (matchEnd == con.limit) {
0774: if (con.match != null) {
0775: con.match.setBeginning(0, con.start);
0776: con.match.setEnd(0, matchEnd);
0777: }
0778: con.inuse = false;
0779: return true;
0780: }
0781: return false;
0782: }
0783:
0784: /*
0785: * The pattern has only fixed string.
0786: * The engine uses Boyer-Moore.
0787: */
0788: if (this .fixedStringOnly) {
0789: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
0790: int o = this .fixedStringTable.matches(target, con.start,
0791: con.limit);
0792: if (o >= 0) {
0793: if (con.match != null) {
0794: con.match.setBeginning(0, o);
0795: con.match.setEnd(0, o + this .fixedString.length());
0796: }
0797: con.inuse = false;
0798: return true;
0799: }
0800: con.inuse = false;
0801: return false;
0802: }
0803:
0804: /*
0805: * The pattern contains a fixed string.
0806: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
0807: * If not, it return with false.
0808: */
0809: if (this .fixedString != null) {
0810: int o = this .fixedStringTable.matches(target, con.start,
0811: con.limit);
0812: if (o < 0) {
0813: //System.err.println("Non-match in fixed-string search.");
0814: con.inuse = false;
0815: return false;
0816: }
0817: }
0818:
0819: int limit = con.limit - this .minlength;
0820: int matchStart;
0821: int matchEnd = -1;
0822:
0823: /*
0824: * Checks whether the expression starts with ".*".
0825: */
0826: if (this .operations != null
0827: && this .operations.type == Op.CLOSURE
0828: && this .operations.getChild().type == Op.DOT) {
0829: if (isSet(this .options, SINGLE_LINE)) {
0830: matchStart = con.start;
0831: matchEnd = this .matchCharArray(con, this .operations,
0832: con.start, 1, this .options);
0833: } else {
0834: boolean previousIsEOL = true;
0835: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0836: int ch = target[matchStart];
0837: if (isEOLChar(ch)) {
0838: previousIsEOL = true;
0839: } else {
0840: if (previousIsEOL) {
0841: if (0 <= (matchEnd = this .matchCharArray(
0842: con, this .operations, matchStart,
0843: 1, this .options)))
0844: break;
0845: }
0846: previousIsEOL = false;
0847: }
0848: }
0849: }
0850: }
0851:
0852: /*
0853: * Optimization against the first character.
0854: */
0855: else if (this .firstChar != null) {
0856: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
0857: RangeToken range = this .firstChar;
0858: if (this .isSet(this .options, IGNORE_CASE)) {
0859: range = this .firstChar.getCaseInsensitiveToken();
0860: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0861: int ch = target[matchStart];
0862: if (REUtil.isHighSurrogate(ch)
0863: && matchStart + 1 < con.limit) {
0864: ch = REUtil.composeFromSurrogates(ch,
0865: target[matchStart + 1]);
0866: if (!range.match(ch))
0867: continue;
0868: } else {
0869: if (!range.match(ch)) {
0870: char ch1 = Character.toUpperCase((char) ch);
0871: if (!range.match(ch1))
0872: if (!range.match(Character
0873: .toLowerCase(ch1)))
0874: continue;
0875: }
0876: }
0877: if (0 <= (matchEnd = this .matchCharArray(con,
0878: this .operations, matchStart, 1,
0879: this .options)))
0880: break;
0881: }
0882: } else {
0883: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0884: int ch = target[matchStart];
0885: if (REUtil.isHighSurrogate(ch)
0886: && matchStart + 1 < con.limit)
0887: ch = REUtil.composeFromSurrogates(ch,
0888: target[matchStart + 1]);
0889: if (!range.match(ch))
0890: continue;
0891: if (0 <= (matchEnd = this .matchCharArray(con,
0892: this .operations, matchStart, 1,
0893: this .options)))
0894: break;
0895: }
0896: }
0897: }
0898:
0899: /*
0900: * Straightforward matching.
0901: */
0902: else {
0903: for (matchStart = con.start; matchStart <= limit; matchStart++) {
0904: if (0 <= (matchEnd = this .matchCharArray(con,
0905: this .operations, matchStart, 1, this .options)))
0906: break;
0907: }
0908: }
0909:
0910: if (matchEnd >= 0) {
0911: if (con.match != null) {
0912: con.match.setBeginning(0, matchStart);
0913: con.match.setEnd(0, matchEnd);
0914: }
0915: con.inuse = false;
0916: return true;
0917: } else {
0918: con.inuse = false;
0919: return false;
0920: }
0921: }
0922:
0923: /**
0924: * @return -1 when not match; offset of the end of matched string when match.
0925: */
0926: private int matchCharArray(Context con, Op op, int offset, int dx,
0927: int opts) {
0928:
0929: char[] target = con.charTarget;
0930:
0931: while (true) {
0932: if (op == null)
0933: return offset;
0934: if (offset > con.limit || offset < con.start)
0935: return -1;
0936: switch (op.type) {
0937: case Op.CHAR:
0938: if (isSet(opts, IGNORE_CASE)) {
0939: int ch = op.getData();
0940: if (dx > 0) {
0941: if (offset >= con.limit
0942: || !matchIgnoreCase(ch, target[offset]))
0943: return -1;
0944: offset++;
0945: } else {
0946: int o1 = offset - 1;
0947: if (o1 >= con.limit || o1 < 0
0948: || !matchIgnoreCase(ch, target[o1]))
0949: return -1;
0950: offset = o1;
0951: }
0952: } else {
0953: int ch = op.getData();
0954: if (dx > 0) {
0955: if (offset >= con.limit || ch != target[offset])
0956: return -1;
0957: offset++;
0958: } else {
0959: int o1 = offset - 1;
0960: if (o1 >= con.limit || o1 < 0
0961: || ch != target[o1])
0962: return -1;
0963: offset = o1;
0964: }
0965: }
0966: op = op.next;
0967: break;
0968:
0969: case Op.DOT:
0970: if (dx > 0) {
0971: if (offset >= con.limit)
0972: return -1;
0973: int ch = target[offset];
0974: if (isSet(opts, SINGLE_LINE)) {
0975: if (REUtil.isHighSurrogate(ch)
0976: && offset + 1 < con.limit)
0977: offset++;
0978: } else {
0979: if (REUtil.isHighSurrogate(ch)
0980: && offset + 1 < con.limit)
0981: ch = REUtil.composeFromSurrogates(ch,
0982: target[++offset]);
0983: if (isEOLChar(ch))
0984: return -1;
0985: }
0986: offset++;
0987: } else {
0988: int o1 = offset - 1;
0989: if (o1 >= con.limit || o1 < 0)
0990: return -1;
0991: int ch = target[o1];
0992: if (isSet(opts, SINGLE_LINE)) {
0993: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
0994: o1--;
0995: } else {
0996: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
0997: ch = REUtil.composeFromSurrogates(
0998: target[--o1], ch);
0999: if (!isEOLChar(ch))
1000: return -1;
1001: }
1002: offset = o1;
1003: }
1004: op = op.next;
1005: break;
1006:
1007: case Op.RANGE:
1008: case Op.NRANGE:
1009: if (dx > 0) {
1010: if (offset >= con.limit)
1011: return -1;
1012: int ch = target[offset];
1013: if (REUtil.isHighSurrogate(ch)
1014: && offset + 1 < con.limit)
1015: ch = REUtil.composeFromSurrogates(ch,
1016: target[++offset]);
1017: RangeToken tok = op.getToken();
1018: if (isSet(opts, IGNORE_CASE)) {
1019: tok = tok.getCaseInsensitiveToken();
1020: if (!tok.match(ch)) {
1021: if (ch >= 0x10000)
1022: return -1;
1023: char uch;
1024: if (!tok.match(uch = Character
1025: .toUpperCase((char) ch))
1026: && !tok.match(Character
1027: .toLowerCase(uch)))
1028: return -1;
1029: }
1030: } else {
1031: if (!tok.match(ch))
1032: return -1;
1033: }
1034: offset++;
1035: } else {
1036: int o1 = offset - 1;
1037: if (o1 >= con.limit || o1 < 0)
1038: return -1;
1039: int ch = target[o1];
1040: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1041: ch = REUtil.composeFromSurrogates(target[--o1],
1042: ch);
1043: RangeToken tok = op.getToken();
1044: if (isSet(opts, IGNORE_CASE)) {
1045: tok = tok.getCaseInsensitiveToken();
1046: if (!tok.match(ch)) {
1047: if (ch >= 0x10000)
1048: return -1;
1049: char uch;
1050: if (!tok.match(uch = Character
1051: .toUpperCase((char) ch))
1052: && !tok.match(Character
1053: .toLowerCase(uch)))
1054: return -1;
1055: }
1056: } else {
1057: if (!tok.match(ch))
1058: return -1;
1059: }
1060: offset = o1;
1061: }
1062: op = op.next;
1063: break;
1064:
1065: case Op.ANCHOR:
1066: boolean go = false;
1067: switch (op.getData()) {
1068: case '^':
1069: if (isSet(opts, MULTIPLE_LINES)) {
1070: if (!(offset == con.start || offset > con.start
1071: && isEOLChar(target[offset - 1])))
1072: return -1;
1073: } else {
1074: if (offset != con.start)
1075: return -1;
1076: }
1077: break;
1078:
1079: case '@': // Internal use only.
1080: // The @ always matches line beginnings.
1081: if (!(offset == con.start || offset > con.start
1082: && isEOLChar(target[offset - 1])))
1083: return -1;
1084: break;
1085:
1086: case '$':
1087: if (isSet(opts, MULTIPLE_LINES)) {
1088: if (!(offset == con.limit || offset < con.limit
1089: && isEOLChar(target[offset])))
1090: return -1;
1091: } else {
1092: if (!(offset == con.limit
1093: || offset + 1 == con.limit
1094: && isEOLChar(target[offset]) || offset + 2 == con.limit
1095: && target[offset] == CARRIAGE_RETURN
1096: && target[offset + 1] == LINE_FEED))
1097: return -1;
1098: }
1099: break;
1100:
1101: case 'A':
1102: if (offset != con.start)
1103: return -1;
1104: break;
1105:
1106: case 'Z':
1107: if (!(offset == con.limit
1108: || offset + 1 == con.limit
1109: && isEOLChar(target[offset]) || offset + 2 == con.limit
1110: && target[offset] == CARRIAGE_RETURN
1111: && target[offset + 1] == LINE_FEED))
1112: return -1;
1113: break;
1114:
1115: case 'z':
1116: if (offset != con.limit)
1117: return -1;
1118: break;
1119:
1120: case 'b':
1121: if (con.length == 0)
1122: return -1;
1123: {
1124: int after = getWordType(target, con.start,
1125: con.limit, offset, opts);
1126: if (after == WT_IGNORE)
1127: return -1;
1128: int before = getPreviousWordType(target,
1129: con.start, con.limit, offset, opts);
1130: if (after == before)
1131: return -1;
1132: }
1133: break;
1134:
1135: case 'B':
1136: if (con.length == 0)
1137: go = true;
1138: else {
1139: int after = getWordType(target, con.start,
1140: con.limit, offset, opts);
1141: go = after == WT_IGNORE
1142: || after == getPreviousWordType(target,
1143: con.start, con.limit, offset,
1144: opts);
1145: }
1146: if (!go)
1147: return -1;
1148: break;
1149:
1150: case '<':
1151: if (con.length == 0 || offset == con.limit)
1152: return -1;
1153: if (getWordType(target, con.start, con.limit,
1154: offset, opts) != WT_LETTER
1155: || getPreviousWordType(target, con.start,
1156: con.limit, offset, opts) != WT_OTHER)
1157: return -1;
1158: break;
1159:
1160: case '>':
1161: if (con.length == 0 || offset == con.start)
1162: return -1;
1163: if (getWordType(target, con.start, con.limit,
1164: offset, opts) != WT_OTHER
1165: || getPreviousWordType(target, con.start,
1166: con.limit, offset, opts) != WT_LETTER)
1167: return -1;
1168: break;
1169: } // switch anchor type
1170: op = op.next;
1171: break;
1172:
1173: case Op.BACKREFERENCE: {
1174: int refno = op.getData();
1175: if (refno <= 0 || refno >= this .nofparen)
1176: throw new RuntimeException(
1177: "Internal Error: Reference number must be more than zero: "
1178: + refno);
1179: if (con.match.getBeginning(refno) < 0
1180: || con.match.getEnd(refno) < 0)
1181: return -1; // ********
1182: int o2 = con.match.getBeginning(refno);
1183: int literallen = con.match.getEnd(refno) - o2;
1184: if (!isSet(opts, IGNORE_CASE)) {
1185: if (dx > 0) {
1186: if (!regionMatches(target, offset, con.limit,
1187: o2, literallen))
1188: return -1;
1189: offset += literallen;
1190: } else {
1191: if (!regionMatches(target, offset - literallen,
1192: con.limit, o2, literallen))
1193: return -1;
1194: offset -= literallen;
1195: }
1196: } else {
1197: if (dx > 0) {
1198: if (!regionMatchesIgnoreCase(target, offset,
1199: con.limit, o2, literallen))
1200: return -1;
1201: offset += literallen;
1202: } else {
1203: if (!regionMatchesIgnoreCase(target, offset
1204: - literallen, con.limit, o2, literallen))
1205: return -1;
1206: offset -= literallen;
1207: }
1208: }
1209: }
1210: op = op.next;
1211: break;
1212: case Op.STRING: {
1213: String literal = op.getString();
1214: int literallen = literal.length();
1215: if (!isSet(opts, IGNORE_CASE)) {
1216: if (dx > 0) {
1217: if (!regionMatches(target, offset, con.limit,
1218: literal, literallen))
1219: return -1;
1220: offset += literallen;
1221: } else {
1222: if (!regionMatches(target, offset - literallen,
1223: con.limit, literal, literallen))
1224: return -1;
1225: offset -= literallen;
1226: }
1227: } else {
1228: if (dx > 0) {
1229: if (!regionMatchesIgnoreCase(target, offset,
1230: con.limit, literal, literallen))
1231: return -1;
1232: offset += literallen;
1233: } else {
1234: if (!regionMatchesIgnoreCase(target, offset
1235: - literallen, con.limit, literal,
1236: literallen))
1237: return -1;
1238: offset -= literallen;
1239: }
1240: }
1241: }
1242: op = op.next;
1243: break;
1244:
1245: case Op.CLOSURE: {
1246: /*
1247: * Saves current position to avoid
1248: * zero-width repeats.
1249: */
1250: int id = op.getData();
1251: if (id >= 0) {
1252: int previousOffset = con.offsets[id];
1253: if (previousOffset < 0 || previousOffset != offset) {
1254: con.offsets[id] = offset;
1255: } else {
1256: con.offsets[id] = -1;
1257: op = op.next;
1258: break;
1259: }
1260: }
1261:
1262: int ret = this .matchCharArray(con, op.getChild(),
1263: offset, dx, opts);
1264: if (id >= 0)
1265: con.offsets[id] = -1;
1266: if (ret >= 0)
1267: return ret;
1268: op = op.next;
1269: }
1270: break;
1271:
1272: case Op.QUESTION: {
1273: int ret = this .matchCharArray(con, op.getChild(),
1274: offset, dx, opts);
1275: if (ret >= 0)
1276: return ret;
1277: op = op.next;
1278: }
1279: break;
1280:
1281: case Op.NONGREEDYCLOSURE:
1282: case Op.NONGREEDYQUESTION: {
1283: int ret = this .matchCharArray(con, op.next, offset, dx,
1284: opts);
1285: if (ret >= 0)
1286: return ret;
1287: op = op.getChild();
1288: }
1289: break;
1290:
1291: case Op.UNION:
1292: for (int i = 0; i < op.size(); i++) {
1293: int ret = this .matchCharArray(con, op.elementAt(i),
1294: offset, dx, opts);
1295: if (DEBUG) {
1296: System.err.println("UNION: " + i + ", ret="
1297: + ret);
1298: }
1299: if (ret == con.length)
1300: return ret;
1301: }
1302: return -1;
1303:
1304: case Op.CAPTURE:
1305: int refno = op.getData();
1306: if (con.match != null && refno > 0) {
1307: int save = con.match.getBeginning(refno);
1308: con.match.setBeginning(refno, offset);
1309: int ret = this .matchCharArray(con, op.next, offset,
1310: dx, opts);
1311: if (ret < 0)
1312: con.match.setBeginning(refno, save);
1313: return ret;
1314: } else if (con.match != null && refno < 0) {
1315: int index = -refno;
1316: int save = con.match.getEnd(index);
1317: con.match.setEnd(index, offset);
1318: int ret = this .matchCharArray(con, op.next, offset,
1319: dx, opts);
1320: if (ret < 0)
1321: con.match.setEnd(index, save);
1322: return ret;
1323: }
1324: op = op.next;
1325: break;
1326:
1327: case Op.LOOKAHEAD:
1328: if (0 > this .matchCharArray(con, op.getChild(), offset,
1329: 1, opts))
1330: return -1;
1331: op = op.next;
1332: break;
1333: case Op.NEGATIVELOOKAHEAD:
1334: if (0 <= this .matchCharArray(con, op.getChild(),
1335: offset, 1, opts))
1336: return -1;
1337: op = op.next;
1338: break;
1339: case Op.LOOKBEHIND:
1340: if (0 > this .matchCharArray(con, op.getChild(), offset,
1341: -1, opts))
1342: return -1;
1343: op = op.next;
1344: break;
1345: case Op.NEGATIVELOOKBEHIND:
1346: if (0 <= this .matchCharArray(con, op.getChild(),
1347: offset, -1, opts))
1348: return -1;
1349: op = op.next;
1350: break;
1351:
1352: case Op.INDEPENDENT: {
1353: int ret = this .matchCharArray(con, op.getChild(),
1354: offset, dx, opts);
1355: if (ret < 0)
1356: return ret;
1357: offset = ret;
1358: op = op.next;
1359: }
1360: break;
1361:
1362: case Op.MODIFIER: {
1363: int localopts = opts;
1364: localopts |= op.getData();
1365: localopts &= ~op.getData2();
1366: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1367: int ret = this .matchCharArray(con, op.getChild(),
1368: offset, dx, localopts);
1369: if (ret < 0)
1370: return ret;
1371: offset = ret;
1372: op = op.next;
1373: }
1374: break;
1375:
1376: case Op.CONDITION: {
1377: Op.ConditionOp cop = (Op.ConditionOp) op;
1378: boolean matchp = false;
1379: if (cop.refNumber > 0) {
1380: if (cop.refNumber >= this .nofparen)
1381: throw new RuntimeException(
1382: "Internal Error: Reference number must be more than zero: "
1383: + cop.refNumber);
1384: matchp = con.match.getBeginning(cop.refNumber) >= 0
1385: && con.match.getEnd(cop.refNumber) >= 0;
1386: } else {
1387: matchp = 0 <= this .matchCharArray(con,
1388: cop.condition, offset, dx, opts);
1389: }
1390:
1391: if (matchp) {
1392: op = cop.yes;
1393: } else if (cop.no != null) {
1394: op = cop.no;
1395: } else {
1396: op = cop.next;
1397: }
1398: }
1399: break;
1400:
1401: default:
1402: throw new RuntimeException("Unknown operation type: "
1403: + op.type);
1404: } // switch (op.type)
1405: } // while
1406: }
1407:
1408: private static final int getPreviousWordType(char[] target,
1409: int begin, int end, int offset, int opts) {
1410: int ret = getWordType(target, begin, end, --offset, opts);
1411: while (ret == WT_IGNORE)
1412: ret = getWordType(target, begin, end, --offset, opts);
1413: return ret;
1414: }
1415:
1416: private static final int getWordType(char[] target, int begin,
1417: int end, int offset, int opts) {
1418: if (offset < begin || offset >= end)
1419: return WT_OTHER;
1420: return getWordType0(target[offset], opts);
1421: }
1422:
1423: private static final boolean regionMatches(char[] target,
1424: int offset, int limit, String part, int partlen) {
1425: if (offset < 0)
1426: return false;
1427: if (limit - offset < partlen)
1428: return false;
1429: int i = 0;
1430: while (partlen-- > 0) {
1431: if (target[offset++] != part.charAt(i++))
1432: return false;
1433: }
1434: return true;
1435: }
1436:
1437: private static final boolean regionMatches(char[] target,
1438: int offset, int limit, int offset2, int partlen) {
1439: if (offset < 0)
1440: return false;
1441: if (limit - offset < partlen)
1442: return false;
1443: int i = offset2;
1444: while (partlen-- > 0) {
1445: if (target[offset++] != target[i++])
1446: return false;
1447: }
1448: return true;
1449: }
1450:
1451: /**
1452: * @see java.lang.String#regionMatches
1453: */
1454: private static final boolean regionMatchesIgnoreCase(char[] target,
1455: int offset, int limit, String part, int partlen) {
1456: if (offset < 0)
1457: return false;
1458: if (limit - offset < partlen)
1459: return false;
1460: int i = 0;
1461: while (partlen-- > 0) {
1462: char ch1 = target[offset++];
1463: char ch2 = part.charAt(i++);
1464: if (ch1 == ch2)
1465: continue;
1466: char uch1 = Character.toUpperCase(ch1);
1467: char uch2 = Character.toUpperCase(ch2);
1468: if (uch1 == uch2)
1469: continue;
1470: if (Character.toLowerCase(uch1) != Character
1471: .toLowerCase(uch2))
1472: return false;
1473: }
1474: return true;
1475: }
1476:
1477: private static final boolean regionMatchesIgnoreCase(char[] target,
1478: int offset, int limit, int offset2, int partlen) {
1479: if (offset < 0)
1480: return false;
1481: if (limit - offset < partlen)
1482: return false;
1483: int i = offset2;
1484: while (partlen-- > 0) {
1485: char ch1 = target[offset++];
1486: char ch2 = target[i++];
1487: if (ch1 == ch2)
1488: continue;
1489: char uch1 = Character.toUpperCase(ch1);
1490: char uch2 = Character.toUpperCase(ch2);
1491: if (uch1 == uch2)
1492: continue;
1493: if (Character.toLowerCase(uch1) != Character
1494: .toLowerCase(uch2))
1495: return false;
1496: }
1497: return true;
1498: }
1499:
1500: /**
1501: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1502: *
1503: * @return true if the target is matched to this regular expression.
1504: */
1505: public boolean matches(String target) {
1506: return this .matches(target, 0, target.length(), (Match) null);
1507: }
1508:
1509: /**
1510: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1511: * in specified range or not.
1512: *
1513: * @param start Start offset of the range.
1514: * @param end End offset +1 of the range.
1515: * @return true if the target is matched to this regular expression.
1516: */
1517: public boolean matches(String target, int start, int end) {
1518: return this .matches(target, start, end, (Match) null);
1519: }
1520:
1521: /**
1522: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1523: *
1524: * @param match A Match instance for storing matching result.
1525: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1526: */
1527: public boolean matches(String target, Match match) {
1528: return this .matches(target, 0, target.length(), match);
1529: }
1530:
1531: /**
1532: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1533: * in specified range or not.
1534: *
1535: * @param start Start offset of the range.
1536: * @param end End offset +1 of the range.
1537: * @param match A Match instance for storing matching result.
1538: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1539: */
1540: public boolean matches(String target, int start, int end,
1541: Match match) {
1542:
1543: synchronized (this ) {
1544: if (this .operations == null)
1545: this .prepare();
1546: if (this .context == null)
1547: this .context = new Context();
1548: }
1549: Context con = null;
1550: synchronized (this .context) {
1551: con = this .context.inuse ? new Context() : this .context;
1552: con.reset(target, start, end, this .numberOfClosures);
1553: }
1554: if (match != null) {
1555: match.setNumberOfGroups(this .nofparen);
1556: match.setSource(target);
1557: } else if (this .hasBackReferences) {
1558: match = new Match();
1559: match.setNumberOfGroups(this .nofparen);
1560: // Need not to call setSource() because
1561: // a caller can not access this match instance.
1562: }
1563: con.match = match;
1564:
1565: if (this .isSet(this .options, XMLSCHEMA_MODE)) {
1566: if (DEBUG) {
1567: System.err.println("target string=" + target);
1568: }
1569: int matchEnd = this .matchString(con, this .operations,
1570: con.start, 1, this .options);
1571: if (DEBUG) {
1572: System.err.println("matchEnd=" + matchEnd);
1573: System.err.println("con.limit=" + con.limit);
1574: }
1575: if (matchEnd == con.limit) {
1576: if (con.match != null) {
1577: con.match.setBeginning(0, con.start);
1578: con.match.setEnd(0, matchEnd);
1579: }
1580: con.inuse = false;
1581: return true;
1582: }
1583: return false;
1584: }
1585:
1586: /*
1587: * The pattern has only fixed string.
1588: * The engine uses Boyer-Moore.
1589: */
1590: if (this .fixedStringOnly) {
1591: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1592: int o = this .fixedStringTable.matches(target, con.start,
1593: con.limit);
1594: if (o >= 0) {
1595: if (con.match != null) {
1596: con.match.setBeginning(0, o);
1597: con.match.setEnd(0, o + this .fixedString.length());
1598: }
1599: con.inuse = false;
1600: return true;
1601: }
1602: con.inuse = false;
1603: return false;
1604: }
1605:
1606: /*
1607: * The pattern contains a fixed string.
1608: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1609: * If not, it return with false.
1610: */
1611: if (this .fixedString != null) {
1612: int o = this .fixedStringTable.matches(target, con.start,
1613: con.limit);
1614: if (o < 0) {
1615: //System.err.println("Non-match in fixed-string search.");
1616: con.inuse = false;
1617: return false;
1618: }
1619: }
1620:
1621: int limit = con.limit - this .minlength;
1622: int matchStart;
1623: int matchEnd = -1;
1624:
1625: /*
1626: * Checks whether the expression starts with ".*".
1627: */
1628: if (this .operations != null
1629: && this .operations.type == Op.CLOSURE
1630: && this .operations.getChild().type == Op.DOT) {
1631: if (isSet(this .options, SINGLE_LINE)) {
1632: matchStart = con.start;
1633: matchEnd = this .matchString(con, this .operations,
1634: con.start, 1, this .options);
1635: } else {
1636: boolean previousIsEOL = true;
1637: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1638: int ch = target.charAt(matchStart);
1639: if (isEOLChar(ch)) {
1640: previousIsEOL = true;
1641: } else {
1642: if (previousIsEOL) {
1643: if (0 <= (matchEnd = this .matchString(con,
1644: this .operations, matchStart, 1,
1645: this .options)))
1646: break;
1647: }
1648: previousIsEOL = false;
1649: }
1650: }
1651: }
1652: }
1653:
1654: /*
1655: * Optimization against the first character.
1656: */
1657: else if (this .firstChar != null) {
1658: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1659: RangeToken range = this .firstChar;
1660: if (this .isSet(this .options, IGNORE_CASE)) {
1661: range = this .firstChar.getCaseInsensitiveToken();
1662: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1663: int ch = target.charAt(matchStart);
1664: if (REUtil.isHighSurrogate(ch)
1665: && matchStart + 1 < con.limit) {
1666: ch = REUtil.composeFromSurrogates(ch, target
1667: .charAt(matchStart + 1));
1668: if (!range.match(ch))
1669: continue;
1670: } else {
1671: if (!range.match(ch)) {
1672: char ch1 = Character.toUpperCase((char) ch);
1673: if (!range.match(ch1))
1674: if (!range.match(Character
1675: .toLowerCase(ch1)))
1676: continue;
1677: }
1678: }
1679: if (0 <= (matchEnd = this .matchString(con,
1680: this .operations, matchStart, 1,
1681: this .options)))
1682: break;
1683: }
1684: } else {
1685: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1686: int ch = target.charAt(matchStart);
1687: if (REUtil.isHighSurrogate(ch)
1688: && matchStart + 1 < con.limit)
1689: ch = REUtil.composeFromSurrogates(ch, target
1690: .charAt(matchStart + 1));
1691: if (!range.match(ch))
1692: continue;
1693: if (0 <= (matchEnd = this .matchString(con,
1694: this .operations, matchStart, 1,
1695: this .options)))
1696: break;
1697: }
1698: }
1699: }
1700:
1701: /*
1702: * Straightforward matching.
1703: */
1704: else {
1705: for (matchStart = con.start; matchStart <= limit; matchStart++) {
1706: if (0 <= (matchEnd = this .matchString(con,
1707: this .operations, matchStart, 1, this .options)))
1708: break;
1709: }
1710: }
1711:
1712: if (matchEnd >= 0) {
1713: if (con.match != null) {
1714: con.match.setBeginning(0, matchStart);
1715: con.match.setEnd(0, matchEnd);
1716: }
1717: con.inuse = false;
1718: return true;
1719: } else {
1720: con.inuse = false;
1721: return false;
1722: }
1723: }
1724:
1725: /**
1726: * @return -1 when not match; offset of the end of matched string when match.
1727: */
1728: private int matchString(Context con, Op op, int offset, int dx,
1729: int opts) {
1730:
1731: String target = con.strTarget;
1732:
1733: while (true) {
1734: if (op == null)
1735: return offset;
1736: if (offset > con.limit || offset < con.start)
1737: return -1;
1738: switch (op.type) {
1739: case Op.CHAR:
1740: if (isSet(opts, IGNORE_CASE)) {
1741: int ch = op.getData();
1742: if (dx > 0) {
1743: if (offset >= con.limit
1744: || !matchIgnoreCase(ch, target
1745: .charAt(offset)))
1746: return -1;
1747: offset++;
1748: } else {
1749: int o1 = offset - 1;
1750: if (o1 >= con.limit
1751: || o1 < 0
1752: || !matchIgnoreCase(ch, target
1753: .charAt(o1)))
1754: return -1;
1755: offset = o1;
1756: }
1757: } else {
1758: int ch = op.getData();
1759: if (dx > 0) {
1760: if (offset >= con.limit
1761: || ch != target.charAt(offset))
1762: return -1;
1763: offset++;
1764: } else {
1765: int o1 = offset - 1;
1766: if (o1 >= con.limit || o1 < 0
1767: || ch != target.charAt(o1))
1768: return -1;
1769: offset = o1;
1770: }
1771: }
1772: op = op.next;
1773: break;
1774:
1775: case Op.DOT:
1776: if (dx > 0) {
1777: if (offset >= con.limit)
1778: return -1;
1779: int ch = target.charAt(offset);
1780: if (isSet(opts, SINGLE_LINE)) {
1781: if (REUtil.isHighSurrogate(ch)
1782: && offset + 1 < con.limit)
1783: offset++;
1784: } else {
1785: if (REUtil.isHighSurrogate(ch)
1786: && offset + 1 < con.limit)
1787: ch = REUtil.composeFromSurrogates(ch,
1788: target.charAt(++offset));
1789: if (isEOLChar(ch))
1790: return -1;
1791: }
1792: offset++;
1793: } else {
1794: int o1 = offset - 1;
1795: if (o1 >= con.limit || o1 < 0)
1796: return -1;
1797: int ch = target.charAt(o1);
1798: if (isSet(opts, SINGLE_LINE)) {
1799: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1800: o1--;
1801: } else {
1802: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1803: ch = REUtil.composeFromSurrogates(target
1804: .charAt(--o1), ch);
1805: if (!isEOLChar(ch))
1806: return -1;
1807: }
1808: offset = o1;
1809: }
1810: op = op.next;
1811: break;
1812:
1813: case Op.RANGE:
1814: case Op.NRANGE:
1815: if (dx > 0) {
1816: if (offset >= con.limit)
1817: return -1;
1818: int ch = target.charAt(offset);
1819: if (REUtil.isHighSurrogate(ch)
1820: && offset + 1 < con.limit)
1821: ch = REUtil.composeFromSurrogates(ch, target
1822: .charAt(++offset));
1823: RangeToken tok = op.getToken();
1824: if (isSet(opts, IGNORE_CASE)) {
1825: tok = tok.getCaseInsensitiveToken();
1826: if (!tok.match(ch)) {
1827: if (ch >= 0x10000)
1828: return -1;
1829: char uch;
1830: if (!tok.match(uch = Character
1831: .toUpperCase((char) ch))
1832: && !tok.match(Character
1833: .toLowerCase(uch)))
1834: return -1;
1835: }
1836: } else {
1837: if (!tok.match(ch))
1838: return -1;
1839: }
1840: offset++;
1841: } else {
1842: int o1 = offset - 1;
1843: if (o1 >= con.limit || o1 < 0)
1844: return -1;
1845: int ch = target.charAt(o1);
1846: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
1847: ch = REUtil.composeFromSurrogates(target
1848: .charAt(--o1), ch);
1849: RangeToken tok = op.getToken();
1850: if (isSet(opts, IGNORE_CASE)) {
1851: tok = tok.getCaseInsensitiveToken();
1852: if (!tok.match(ch)) {
1853: if (ch >= 0x10000)
1854: return -1;
1855: char uch;
1856: if (!tok.match(uch = Character
1857: .toUpperCase((char) ch))
1858: && !tok.match(Character
1859: .toLowerCase(uch)))
1860: return -1;
1861: }
1862: } else {
1863: if (!tok.match(ch))
1864: return -1;
1865: }
1866: offset = o1;
1867: }
1868: op = op.next;
1869: break;
1870:
1871: case Op.ANCHOR:
1872: boolean go = false;
1873: switch (op.getData()) {
1874: case '^':
1875: if (isSet(opts, MULTIPLE_LINES)) {
1876: if (!(offset == con.start || offset > con.start
1877: && isEOLChar(target.charAt(offset - 1))))
1878: return -1;
1879: } else {
1880: if (offset != con.start)
1881: return -1;
1882: }
1883: break;
1884:
1885: case '@': // Internal use only.
1886: // The @ always matches line beginnings.
1887: if (!(offset == con.start || offset > con.start
1888: && isEOLChar(target.charAt(offset - 1))))
1889: return -1;
1890: break;
1891:
1892: case '$':
1893: if (isSet(opts, MULTIPLE_LINES)) {
1894: if (!(offset == con.limit || offset < con.limit
1895: && isEOLChar(target.charAt(offset))))
1896: return -1;
1897: } else {
1898: if (!(offset == con.limit
1899: || offset + 1 == con.limit
1900: && isEOLChar(target.charAt(offset)) || offset + 2 == con.limit
1901: && target.charAt(offset) == CARRIAGE_RETURN
1902: && target.charAt(offset + 1) == LINE_FEED))
1903: return -1;
1904: }
1905: break;
1906:
1907: case 'A':
1908: if (offset != con.start)
1909: return -1;
1910: break;
1911:
1912: case 'Z':
1913: if (!(offset == con.limit
1914: || offset + 1 == con.limit
1915: && isEOLChar(target.charAt(offset)) || offset + 2 == con.limit
1916: && target.charAt(offset) == CARRIAGE_RETURN
1917: && target.charAt(offset + 1) == LINE_FEED))
1918: return -1;
1919: break;
1920:
1921: case 'z':
1922: if (offset != con.limit)
1923: return -1;
1924: break;
1925:
1926: case 'b':
1927: if (con.length == 0)
1928: return -1;
1929: {
1930: int after = getWordType(target, con.start,
1931: con.limit, offset, opts);
1932: if (after == WT_IGNORE)
1933: return -1;
1934: int before = getPreviousWordType(target,
1935: con.start, con.limit, offset, opts);
1936: if (after == before)
1937: return -1;
1938: }
1939: break;
1940:
1941: case 'B':
1942: if (con.length == 0)
1943: go = true;
1944: else {
1945: int after = getWordType(target, con.start,
1946: con.limit, offset, opts);
1947: go = after == WT_IGNORE
1948: || after == getPreviousWordType(target,
1949: con.start, con.limit, offset,
1950: opts);
1951: }
1952: if (!go)
1953: return -1;
1954: break;
1955:
1956: case '<':
1957: if (con.length == 0 || offset == con.limit)
1958: return -1;
1959: if (getWordType(target, con.start, con.limit,
1960: offset, opts) != WT_LETTER
1961: || getPreviousWordType(target, con.start,
1962: con.limit, offset, opts) != WT_OTHER)
1963: return -1;
1964: break;
1965:
1966: case '>':
1967: if (con.length == 0 || offset == con.start)
1968: return -1;
1969: if (getWordType(target, con.start, con.limit,
1970: offset, opts) != WT_OTHER
1971: || getPreviousWordType(target, con.start,
1972: con.limit, offset, opts) != WT_LETTER)
1973: return -1;
1974: break;
1975: } // switch anchor type
1976: op = op.next;
1977: break;
1978:
1979: case Op.BACKREFERENCE: {
1980: int refno = op.getData();
1981: if (refno <= 0 || refno >= this .nofparen)
1982: throw new RuntimeException(
1983: "Internal Error: Reference number must be more than zero: "
1984: + refno);
1985: if (con.match.getBeginning(refno) < 0
1986: || con.match.getEnd(refno) < 0)
1987: return -1; // ********
1988: int o2 = con.match.getBeginning(refno);
1989: int literallen = con.match.getEnd(refno) - o2;
1990: if (!isSet(opts, IGNORE_CASE)) {
1991: if (dx > 0) {
1992: if (!regionMatches(target, offset, con.limit,
1993: o2, literallen))
1994: return -1;
1995: offset += literallen;
1996: } else {
1997: if (!regionMatches(target, offset - literallen,
1998: con.limit, o2, literallen))
1999: return -1;
2000: offset -= literallen;
2001: }
2002: } else {
2003: if (dx > 0) {
2004: if (!regionMatchesIgnoreCase(target, offset,
2005: con.limit, o2, literallen))
2006: return -1;
2007: offset += literallen;
2008: } else {
2009: if (!regionMatchesIgnoreCase(target, offset
2010: - literallen, con.limit, o2, literallen))
2011: return -1;
2012: offset -= literallen;
2013: }
2014: }
2015: }
2016: op = op.next;
2017: break;
2018: case Op.STRING: {
2019: String literal = op.getString();
2020: int literallen = literal.length();
2021: if (!isSet(opts, IGNORE_CASE)) {
2022: if (dx > 0) {
2023: if (!regionMatches(target, offset, con.limit,
2024: literal, literallen))
2025: return -1;
2026: offset += literallen;
2027: } else {
2028: if (!regionMatches(target, offset - literallen,
2029: con.limit, literal, literallen))
2030: return -1;
2031: offset -= literallen;
2032: }
2033: } else {
2034: if (dx > 0) {
2035: if (!regionMatchesIgnoreCase(target, offset,
2036: con.limit, literal, literallen))
2037: return -1;
2038: offset += literallen;
2039: } else {
2040: if (!regionMatchesIgnoreCase(target, offset
2041: - literallen, con.limit, literal,
2042: literallen))
2043: return -1;
2044: offset -= literallen;
2045: }
2046: }
2047: }
2048: op = op.next;
2049: break;
2050:
2051: case Op.CLOSURE: {
2052: /*
2053: * Saves current position to avoid
2054: * zero-width repeats.
2055: */
2056: int id = op.getData();
2057: if (id >= 0) {
2058: int previousOffset = con.offsets[id];
2059: if (previousOffset < 0 || previousOffset != offset) {
2060: con.offsets[id] = offset;
2061: } else {
2062: con.offsets[id] = -1;
2063: op = op.next;
2064: break;
2065: }
2066: }
2067: int ret = this .matchString(con, op.getChild(), offset,
2068: dx, opts);
2069: if (id >= 0)
2070: con.offsets[id] = -1;
2071: if (ret >= 0)
2072: return ret;
2073: op = op.next;
2074: }
2075: break;
2076:
2077: case Op.QUESTION: {
2078: int ret = this .matchString(con, op.getChild(), offset,
2079: dx, opts);
2080: if (ret >= 0)
2081: return ret;
2082: op = op.next;
2083: }
2084: break;
2085:
2086: case Op.NONGREEDYCLOSURE:
2087: case Op.NONGREEDYQUESTION: {
2088: int ret = this .matchString(con, op.next, offset, dx,
2089: opts);
2090: if (ret >= 0)
2091: return ret;
2092: op = op.getChild();
2093: }
2094: break;
2095:
2096: case Op.UNION:
2097: for (int i = 0; i < op.size(); i++) {
2098: int ret = this .matchString(con, op.elementAt(i),
2099: offset, dx, opts);
2100: if (DEBUG) {
2101: System.err.println("UNION: " + i + ", ret="
2102: + ret);
2103: }
2104: if (ret == con.length)
2105: return ret;
2106: }
2107: return -1;
2108:
2109: case Op.CAPTURE:
2110: int refno = op.getData();
2111: if (con.match != null && refno > 0) {
2112: int save = con.match.getBeginning(refno);
2113: con.match.setBeginning(refno, offset);
2114: int ret = this .matchString(con, op.next, offset,
2115: dx, opts);
2116: if (ret < 0)
2117: con.match.setBeginning(refno, save);
2118: return ret;
2119: } else if (con.match != null && refno < 0) {
2120: int index = -refno;
2121: int save = con.match.getEnd(index);
2122: con.match.setEnd(index, offset);
2123: int ret = this .matchString(con, op.next, offset,
2124: dx, opts);
2125: if (ret < 0)
2126: con.match.setEnd(index, save);
2127: return ret;
2128: }
2129: op = op.next;
2130: break;
2131:
2132: case Op.LOOKAHEAD:
2133: if (0 > this .matchString(con, op.getChild(), offset, 1,
2134: opts))
2135: return -1;
2136: op = op.next;
2137: break;
2138: case Op.NEGATIVELOOKAHEAD:
2139: if (0 <= this .matchString(con, op.getChild(), offset,
2140: 1, opts))
2141: return -1;
2142: op = op.next;
2143: break;
2144: case Op.LOOKBEHIND:
2145: if (0 > this .matchString(con, op.getChild(), offset,
2146: -1, opts))
2147: return -1;
2148: op = op.next;
2149: break;
2150: case Op.NEGATIVELOOKBEHIND:
2151: if (0 <= this .matchString(con, op.getChild(), offset,
2152: -1, opts))
2153: return -1;
2154: op = op.next;
2155: break;
2156:
2157: case Op.INDEPENDENT: {
2158: int ret = this .matchString(con, op.getChild(), offset,
2159: dx, opts);
2160: if (ret < 0)
2161: return ret;
2162: offset = ret;
2163: op = op.next;
2164: }
2165: break;
2166:
2167: case Op.MODIFIER: {
2168: int localopts = opts;
2169: localopts |= op.getData();
2170: localopts &= ~op.getData2();
2171: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2172: int ret = this .matchString(con, op.getChild(), offset,
2173: dx, localopts);
2174: if (ret < 0)
2175: return ret;
2176: offset = ret;
2177: op = op.next;
2178: }
2179: break;
2180:
2181: case Op.CONDITION: {
2182: Op.ConditionOp cop = (Op.ConditionOp) op;
2183: boolean matchp = false;
2184: if (cop.refNumber > 0) {
2185: if (cop.refNumber >= this .nofparen)
2186: throw new RuntimeException(
2187: "Internal Error: Reference number must be more than zero: "
2188: + cop.refNumber);
2189: matchp = con.match.getBeginning(cop.refNumber) >= 0
2190: && con.match.getEnd(cop.refNumber) >= 0;
2191: } else {
2192: matchp = 0 <= this .matchString(con, cop.condition,
2193: offset, dx, opts);
2194: }
2195:
2196: if (matchp) {
2197: op = cop.yes;
2198: } else if (cop.no != null) {
2199: op = cop.no;
2200: } else {
2201: op = cop.next;
2202: }
2203: }
2204: break;
2205:
2206: default:
2207: throw new RuntimeException("Unknown operation type: "
2208: + op.type);
2209: } // switch (op.type)
2210: } // while
2211: }
2212:
2213: private static final int getPreviousWordType(String target,
2214: int begin, int end, int offset, int opts) {
2215: int ret = getWordType(target, begin, end, --offset, opts);
2216: while (ret == WT_IGNORE)
2217: ret = getWordType(target, begin, end, --offset, opts);
2218: return ret;
2219: }
2220:
2221: private static final int getWordType(String target, int begin,
2222: int end, int offset, int opts) {
2223: if (offset < begin || offset >= end)
2224: return WT_OTHER;
2225: return getWordType0(target.charAt(offset), opts);
2226: }
2227:
2228: private static final boolean regionMatches(String text, int offset,
2229: int limit, String part, int partlen) {
2230: if (limit - offset < partlen)
2231: return false;
2232: return text.regionMatches(offset, part, 0, partlen);
2233: }
2234:
2235: private static final boolean regionMatches(String text, int offset,
2236: int limit, int offset2, int partlen) {
2237: if (limit - offset < partlen)
2238: return false;
2239: return text.regionMatches(offset, text, offset2, partlen);
2240: }
2241:
2242: private static final boolean regionMatchesIgnoreCase(String text,
2243: int offset, int limit, String part, int partlen) {
2244: return text.regionMatches(true, offset, part, 0, partlen);
2245: }
2246:
2247: private static final boolean regionMatchesIgnoreCase(String text,
2248: int offset, int limit, int offset2, int partlen) {
2249: if (limit - offset < partlen)
2250: return false;
2251: return text.regionMatches(true, offset, text, offset2, partlen);
2252: }
2253:
2254: /**
2255: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2256: *
2257: * @return true if the target is matched to this regular expression.
2258: */
2259: public boolean matches(CharacterIterator target) {
2260: return this .matches(target, (Match) null);
2261: }
2262:
2263: /**
2264: * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2265: *
2266: * @param match A Match instance for storing matching result.
2267: * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2268: */
2269: public boolean matches(CharacterIterator target, Match match) {
2270: int start = target.getBeginIndex();
2271: int end = target.getEndIndex();
2272:
2273: synchronized (this ) {
2274: if (this .operations == null)
2275: this .prepare();
2276: if (this .context == null)
2277: this .context = new Context();
2278: }
2279: Context con = null;
2280: synchronized (this .context) {
2281: con = this .context.inuse ? new Context() : this .context;
2282: con.reset(target, start, end, this .numberOfClosures);
2283: }
2284: if (match != null) {
2285: match.setNumberOfGroups(this .nofparen);
2286: match.setSource(target);
2287: } else if (this .hasBackReferences) {
2288: match = new Match();
2289: match.setNumberOfGroups(this .nofparen);
2290: // Need not to call setSource() because
2291: // a caller can not access this match instance.
2292: }
2293: con.match = match;
2294:
2295: if (this .isSet(this .options, XMLSCHEMA_MODE)) {
2296: int matchEnd = this .matchCharacterIterator(con,
2297: this .operations, con.start, 1, this .options);
2298: //System.err.println("DEBUG: matchEnd="+matchEnd);
2299: if (matchEnd == con.limit) {
2300: if (con.match != null) {
2301: con.match.setBeginning(0, con.start);
2302: con.match.setEnd(0, matchEnd);
2303: }
2304: con.inuse = false;
2305: return true;
2306: }
2307: return false;
2308: }
2309:
2310: /*
2311: * The pattern has only fixed string.
2312: * The engine uses Boyer-Moore.
2313: */
2314: if (this .fixedStringOnly) {
2315: //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2316: int o = this .fixedStringTable.matches(target, con.start,
2317: con.limit);
2318: if (o >= 0) {
2319: if (con.match != null) {
2320: con.match.setBeginning(0, o);
2321: con.match.setEnd(0, o + this .fixedString.length());
2322: }
2323: con.inuse = false;
2324: return true;
2325: }
2326: con.inuse = false;
2327: return false;
2328: }
2329:
2330: /*
2331: * The pattern contains a fixed string.
2332: * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2333: * If not, it return with false.
2334: */
2335: if (this .fixedString != null) {
2336: int o = this .fixedStringTable.matches(target, con.start,
2337: con.limit);
2338: if (o < 0) {
2339: //System.err.println("Non-match in fixed-string search.");
2340: con.inuse = false;
2341: return false;
2342: }
2343: }
2344:
2345: int limit = con.limit - this .minlength;
2346: int matchStart;
2347: int matchEnd = -1;
2348:
2349: /*
2350: * Checks whether the expression starts with ".*".
2351: */
2352: if (this .operations != null
2353: && this .operations.type == Op.CLOSURE
2354: && this .operations.getChild().type == Op.DOT) {
2355: if (isSet(this .options, SINGLE_LINE)) {
2356: matchStart = con.start;
2357: matchEnd = this .matchCharacterIterator(con,
2358: this .operations, con.start, 1, this .options);
2359: } else {
2360: boolean previousIsEOL = true;
2361: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2362: int ch = target.setIndex(matchStart);
2363: if (isEOLChar(ch)) {
2364: previousIsEOL = true;
2365: } else {
2366: if (previousIsEOL) {
2367: if (0 <= (matchEnd = this
2368: .matchCharacterIterator(con,
2369: this .operations,
2370: matchStart, 1, this .options)))
2371: break;
2372: }
2373: previousIsEOL = false;
2374: }
2375: }
2376: }
2377: }
2378:
2379: /*
2380: * Optimization against the first character.
2381: */
2382: else if (this .firstChar != null) {
2383: //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2384: RangeToken range = this .firstChar;
2385: if (this .isSet(this .options, IGNORE_CASE)) {
2386: range = this .firstChar.getCaseInsensitiveToken();
2387: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2388: int ch = target.setIndex(matchStart);
2389: if (REUtil.isHighSurrogate(ch)
2390: && matchStart + 1 < con.limit) {
2391: ch = REUtil.composeFromSurrogates(ch, target
2392: .setIndex(matchStart + 1));
2393: if (!range.match(ch))
2394: continue;
2395: } else {
2396: if (!range.match(ch)) {
2397: char ch1 = Character.toUpperCase((char) ch);
2398: if (!range.match(ch1))
2399: if (!range.match(Character
2400: .toLowerCase(ch1)))
2401: continue;
2402: }
2403: }
2404: if (0 <= (matchEnd = this .matchCharacterIterator(
2405: con, this .operations, matchStart, 1,
2406: this .options)))
2407: break;
2408: }
2409: } else {
2410: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2411: int ch = target.setIndex(matchStart);
2412: if (REUtil.isHighSurrogate(ch)
2413: && matchStart + 1 < con.limit)
2414: ch = REUtil.composeFromSurrogates(ch, target
2415: .setIndex(matchStart + 1));
2416: if (!range.match(ch))
2417: continue;
2418: if (0 <= (matchEnd = this .matchCharacterIterator(
2419: con, this .operations, matchStart, 1,
2420: this .options)))
2421: break;
2422: }
2423: }
2424: }
2425:
2426: /*
2427: * Straightforward matching.
2428: */
2429: else {
2430: for (matchStart = con.start; matchStart <= limit; matchStart++) {
2431: if (0 <= (matchEnd = this .matchCharacterIterator(con,
2432: this .operations, matchStart, 1, this .options)))
2433: break;
2434: }
2435: }
2436:
2437: if (matchEnd >= 0) {
2438: if (con.match != null) {
2439: con.match.setBeginning(0, matchStart);
2440: con.match.setEnd(0, matchEnd);
2441: }
2442: con.inuse = false;
2443: return true;
2444: } else {
2445: con.inuse = false;
2446: return false;
2447: }
2448: }
2449:
2450: /**
2451: * @return -1 when not match; offset of the end of matched string when match.
2452: */
2453: private int matchCharacterIterator(Context con, Op op, int offset,
2454: int dx, int opts) {
2455:
2456: CharacterIterator target = con.ciTarget;
2457:
2458: while (true) {
2459: if (op == null)
2460: return offset;
2461: if (offset > con.limit || offset < con.start)
2462: return -1;
2463: switch (op.type) {
2464: case Op.CHAR:
2465: if (isSet(opts, IGNORE_CASE)) {
2466: int ch = op.getData();
2467: if (dx > 0) {
2468: if (offset >= con.limit
2469: || !matchIgnoreCase(ch, target
2470: .setIndex(offset)))
2471: return -1;
2472: offset++;
2473: } else {
2474: int o1 = offset - 1;
2475: if (o1 >= con.limit
2476: || o1 < 0
2477: || !matchIgnoreCase(ch, target
2478: .setIndex(o1)))
2479: return -1;
2480: offset = o1;
2481: }
2482: } else {
2483: int ch = op.getData();
2484: if (dx > 0) {
2485: if (offset >= con.limit
2486: || ch != target.setIndex(offset))
2487: return -1;
2488: offset++;
2489: } else {
2490: int o1 = offset - 1;
2491: if (o1 >= con.limit || o1 < 0
2492: || ch != target.setIndex(o1))
2493: return -1;
2494: offset = o1;
2495: }
2496: }
2497: op = op.next;
2498: break;
2499:
2500: case Op.DOT:
2501: if (dx > 0) {
2502: if (offset >= con.limit)
2503: return -1;
2504: int ch = target.setIndex(offset);
2505: if (isSet(opts, SINGLE_LINE)) {
2506: if (REUtil.isHighSurrogate(ch)
2507: && offset + 1 < con.limit)
2508: offset++;
2509: } else {
2510: if (REUtil.isHighSurrogate(ch)
2511: && offset + 1 < con.limit)
2512: ch = REUtil.composeFromSurrogates(ch,
2513: target.setIndex(++offset));
2514: if (isEOLChar(ch))
2515: return -1;
2516: }
2517: offset++;
2518: } else {
2519: int o1 = offset - 1;
2520: if (o1 >= con.limit || o1 < 0)
2521: return -1;
2522: int ch = target.setIndex(o1);
2523: if (isSet(opts, SINGLE_LINE)) {
2524: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2525: o1--;
2526: } else {
2527: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2528: ch = REUtil.composeFromSurrogates(target
2529: .setIndex(--o1), ch);
2530: if (!isEOLChar(ch))
2531: return -1;
2532: }
2533: offset = o1;
2534: }
2535: op = op.next;
2536: break;
2537:
2538: case Op.RANGE:
2539: case Op.NRANGE:
2540: if (dx > 0) {
2541: if (offset >= con.limit)
2542: return -1;
2543: int ch = target.setIndex(offset);
2544: if (REUtil.isHighSurrogate(ch)
2545: && offset + 1 < con.limit)
2546: ch = REUtil.composeFromSurrogates(ch, target
2547: .setIndex(++offset));
2548: RangeToken tok = op.getToken();
2549: if (isSet(opts, IGNORE_CASE)) {
2550: tok = tok.getCaseInsensitiveToken();
2551: if (!tok.match(ch)) {
2552: if (ch >= 0x10000)
2553: return -1;
2554: char uch;
2555: if (!tok.match(uch = Character
2556: .toUpperCase((char) ch))
2557: && !tok.match(Character
2558: .toLowerCase(uch)))
2559: return -1;
2560: }
2561: } else {
2562: if (!tok.match(ch))
2563: return -1;
2564: }
2565: offset++;
2566: } else {
2567: int o1 = offset - 1;
2568: if (o1 >= con.limit || o1 < 0)
2569: return -1;
2570: int ch = target.setIndex(o1);
2571: if (REUtil.isLowSurrogate(ch) && o1 - 1 >= 0)
2572: ch = REUtil.composeFromSurrogates(target
2573: .setIndex(--o1), ch);
2574: RangeToken tok = op.getToken();
2575: if (isSet(opts, IGNORE_CASE)) {
2576: tok = tok.getCaseInsensitiveToken();
2577: if (!tok.match(ch)) {
2578: if (ch >= 0x10000)
2579: return -1;
2580: char uch;
2581: if (!tok.match(uch = Character
2582: .toUpperCase((char) ch))
2583: && !tok.match(Character
2584: .toLowerCase(uch)))
2585: return -1;
2586: }
2587: } else {
2588: if (!tok.match(ch))
2589: return -1;
2590: }
2591: offset = o1;
2592: }
2593: op = op.next;
2594: break;
2595:
2596: case Op.ANCHOR:
2597: boolean go = false;
2598: switch (op.getData()) {
2599: case '^':
2600: if (isSet(opts, MULTIPLE_LINES)) {
2601: if (!(offset == con.start || offset > con.start
2602: && isEOLChar(target
2603: .setIndex(offset - 1))))
2604: return -1;
2605: } else {
2606: if (offset != con.start)
2607: return -1;
2608: }
2609: break;
2610:
2611: case '@': // Internal use only.
2612: // The @ always matches line beginnings.
2613: if (!(offset == con.start || offset > con.start
2614: && isEOLChar(target.setIndex(offset - 1))))
2615: return -1;
2616: break;
2617:
2618: case '$':
2619: if (isSet(opts, MULTIPLE_LINES)) {
2620: if (!(offset == con.limit || offset < con.limit
2621: && isEOLChar(target.setIndex(offset))))
2622: return -1;
2623: } else {
2624: if (!(offset == con.limit
2625: || offset + 1 == con.limit
2626: && isEOLChar(target.setIndex(offset)) || offset + 2 == con.limit
2627: && target.setIndex(offset) == CARRIAGE_RETURN
2628: && target.setIndex(offset + 1) == LINE_FEED))
2629: return -1;
2630: }
2631: break;
2632:
2633: case 'A':
2634: if (offset != con.start)
2635: return -1;
2636: break;
2637:
2638: case 'Z':
2639: if (!(offset == con.limit
2640: || offset + 1 == con.limit
2641: && isEOLChar(target.setIndex(offset)) || offset + 2 == con.limit
2642: && target.setIndex(offset) == CARRIAGE_RETURN
2643: && target.setIndex(offset + 1) == LINE_FEED))
2644: return -1;
2645: break;
2646:
2647: case 'z':
2648: if (offset != con.limit)
2649: return -1;
2650: break;
2651:
2652: case 'b':
2653: if (con.length == 0)
2654: return -1;
2655: {
2656: int after = getWordType(target, con.start,
2657: con.limit, offset, opts);
2658: if (after == WT_IGNORE)
2659: return -1;
2660: int before = getPreviousWordType(target,
2661: con.start, con.limit, offset, opts);
2662: if (after == before)
2663: return -1;
2664: }
2665: break;
2666:
2667: case 'B':
2668: if (con.length == 0)
2669: go = true;
2670: else {
2671: int after = getWordType(target, con.start,
2672: con.limit, offset, opts);
2673: go = after == WT_IGNORE
2674: || after == getPreviousWordType(target,
2675: con.start, con.limit, offset,
2676: opts);
2677: }
2678: if (!go)
2679: return -1;
2680: break;
2681:
2682: case '<':
2683: if (con.length == 0 || offset == con.limit)
2684: return -1;
2685: if (getWordType(target, con.start, con.limit,
2686: offset, opts) != WT_LETTER
2687: || getPreviousWordType(target, con.start,
2688: con.limit, offset, opts) != WT_OTHER)
2689: return -1;
2690: break;
2691:
2692: case '>':
2693: if (con.length == 0 || offset == con.start)
2694: return -1;
2695: if (getWordType(target, con.start, con.limit,
2696: offset, opts) != WT_OTHER
2697: || getPreviousWordType(target, con.start,
2698: con.limit, offset, opts) != WT_LETTER)
2699: return -1;
2700: break;
2701: } // switch anchor type
2702: op = op.next;
2703: break;
2704:
2705: case Op.BACKREFERENCE: {
2706: int refno = op.getData();
2707: if (refno <= 0 || refno >= this .nofparen)
2708: throw new RuntimeException(
2709: "Internal Error: Reference number must be more than zero: "
2710: + refno);
2711: if (con.match.getBeginning(refno) < 0
2712: || con.match.getEnd(refno) < 0)
2713: return -1; // ********
2714: int o2 = con.match.getBeginning(refno);
2715: int literallen = con.match.getEnd(refno) - o2;
2716: if (!isSet(opts, IGNORE_CASE)) {
2717: if (dx > 0) {
2718: if (!regionMatches(target, offset, con.limit,
2719: o2, literallen))
2720: return -1;
2721: offset += literallen;
2722: } else {
2723: if (!regionMatches(target, offset - literallen,
2724: con.limit, o2, literallen))
2725: return -1;
2726: offset -= literallen;
2727: }
2728: } else {
2729: if (dx > 0) {
2730: if (!regionMatchesIgnoreCase(target, offset,
2731: con.limit, o2, literallen))
2732: return -1;
2733: offset += literallen;
2734: } else {
2735: if (!regionMatchesIgnoreCase(target, offset
2736: - literallen, con.limit, o2, literallen))
2737: return -1;
2738: offset -= literallen;
2739: }
2740: }
2741: }
2742: op = op.next;
2743: break;
2744: case Op.STRING: {
2745: String literal = op.getString();
2746: int literallen = literal.length();
2747: if (!isSet(opts, IGNORE_CASE)) {
2748: if (dx > 0) {
2749: if (!regionMatches(target, offset, con.limit,
2750: literal, literallen))
2751: return -1;
2752: offset += literallen;
2753: } else {
2754: if (!regionMatches(target, offset - literallen,
2755: con.limit, literal, literallen))
2756: return -1;
2757: offset -= literallen;
2758: }
2759: } else {
2760: if (dx > 0) {
2761: if (!regionMatchesIgnoreCase(target, offset,
2762: con.limit, literal, literallen))
2763: return -1;
2764: offset += literallen;
2765: } else {
2766: if (!regionMatchesIgnoreCase(target, offset
2767: - literallen, con.limit, literal,
2768: literallen))
2769: return -1;
2770: offset -= literallen;
2771: }
2772: }
2773: }
2774: op = op.next;
2775: break;
2776:
2777: case Op.CLOSURE: {
2778: /*
2779: * Saves current position to avoid
2780: * zero-width repeats.
2781: */
2782: int id = op.getData();
2783: if (id >= 0) {
2784: int previousOffset = con.offsets[id];
2785: if (previousOffset < 0 || previousOffset != offset) {
2786: con.offsets[id] = offset;
2787: } else {
2788: con.offsets[id] = -1;
2789: op = op.next;
2790: break;
2791: }
2792: }
2793:
2794: int ret = this .matchCharacterIterator(con, op
2795: .getChild(), offset, dx, opts);
2796: if (id >= 0)
2797: con.offsets[id] = -1;
2798: if (ret >= 0)
2799: return ret;
2800: op = op.next;
2801: }
2802: break;
2803:
2804: case Op.QUESTION: {
2805: int ret = this .matchCharacterIterator(con, op
2806: .getChild(), offset, dx, opts);
2807: if (ret >= 0)
2808: return ret;
2809: op = op.next;
2810: }
2811: break;
2812:
2813: case Op.NONGREEDYCLOSURE:
2814: case Op.NONGREEDYQUESTION: {
2815: int ret = this .matchCharacterIterator(con, op.next,
2816: offset, dx, opts);
2817: if (ret >= 0)
2818: return ret;
2819: op = op.getChild();
2820: }
2821: break;
2822:
2823: case Op.UNION:
2824: for (int i = 0; i < op.size(); i++) {
2825: int ret = this .matchCharacterIterator(con, op
2826: .elementAt(i), offset, dx, opts);
2827: if (DEBUG) {
2828: System.err.println("UNION: " + i + ", ret="
2829: + ret);
2830: }
2831: if (ret == con.length)
2832: return ret;
2833: }
2834: return -1;
2835:
2836: case Op.CAPTURE:
2837: int refno = op.getData();
2838: if (con.match != null && refno > 0) {
2839: int save = con.match.getBeginning(refno);
2840: con.match.setBeginning(refno, offset);
2841: int ret = this .matchCharacterIterator(con, op.next,
2842: offset, dx, opts);
2843: if (ret < 0)
2844: con.match.setBeginning(refno, save);
2845: return ret;
2846: } else if (con.match != null && refno < 0) {
2847: int index = -refno;
2848: int save = con.match.getEnd(index);
2849: con.match.setEnd(index, offset);
2850: int ret = this .matchCharacterIterator(con, op.next,
2851: offset, dx, opts);
2852: if (ret < 0)
2853: con.match.setEnd(index, save);
2854: return ret;
2855: }
2856: op = op.next;
2857: break;
2858:
2859: case Op.LOOKAHEAD:
2860: if (0 > this .matchCharacterIterator(con, op.getChild(),
2861: offset, 1, opts))
2862: return -1;
2863: op = op.next;
2864: break;
2865: case Op.NEGATIVELOOKAHEAD:
2866: if (0 <= this .matchCharacterIterator(con,
2867: op.getChild(), offset, 1, opts))
2868: return -1;
2869: op = op.next;
2870: break;
2871: case Op.LOOKBEHIND:
2872: if (0 > this .matchCharacterIterator(con, op.getChild(),
2873: offset, -1, opts))
2874: return -1;
2875: op = op.next;
2876: break;
2877: case Op.NEGATIVELOOKBEHIND:
2878: if (0 <= this .matchCharacterIterator(con,
2879: op.getChild(), offset, -1, opts))
2880: return -1;
2881: op = op.next;
2882: break;
2883:
2884: case Op.INDEPENDENT: {
2885: int ret = this .matchCharacterIterator(con, op
2886: .getChild(), offset, dx, opts);
2887: if (ret < 0)
2888: return ret;
2889: offset = ret;
2890: op = op.next;
2891: }
2892: break;
2893:
2894: case Op.MODIFIER: {
2895: int localopts = opts;
2896: localopts |= op.getData();
2897: localopts &= ~op.getData2();
2898: //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2899: int ret = this .matchCharacterIterator(con, op
2900: .getChild(), offset, dx, localopts);
2901: if (ret < 0)
2902: return ret;
2903: offset = ret;
2904: op = op.next;
2905: }
2906: break;
2907:
2908: case Op.CONDITION: {
2909: Op.ConditionOp cop = (Op.ConditionOp) op;
2910: boolean matchp = false;
2911: if (cop.refNumber > 0) {
2912: if (cop.refNumber >= this .nofparen)
2913: throw new RuntimeException(
2914: "Internal Error: Reference number must be more than zero: "
2915: + cop.refNumber);
2916: matchp = con.match.getBeginning(cop.refNumber) >= 0
2917: && con.match.getEnd(cop.refNumber) >= 0;
2918: } else {
2919: matchp = 0 <= this .matchCharacterIterator(con,
2920: cop.condition, offset, dx, opts);
2921: }
2922:
2923: if (matchp) {
2924: op = cop.yes;
2925: } else if (cop.no != null) {
2926: op = cop.no;
2927: } else {
2928: op = cop.next;
2929: }
2930: }
2931: break;
2932:
2933: default:
2934: throw new RuntimeException("Unknown operation type: "
2935: + op.type);
2936: } // switch (op.type)
2937: } // while
2938: }
2939:
2940: private static final int getPreviousWordType(
2941: CharacterIterator target, int begin, int end, int offset,
2942: int opts) {
2943: int ret = getWordType(target, begin, end, --offset, opts);
2944: while (ret == WT_IGNORE)
2945: ret = getWordType(target, begin, end, --offset, opts);
2946: return ret;
2947: }
2948:
2949: private static final int getWordType(CharacterIterator target,
2950: int begin, int end, int offset, int opts) {
2951: if (offset < begin || offset >= end)
2952: return WT_OTHER;
2953: return getWordType0(target.setIndex(offset), opts);
2954: }
2955:
2956: private static final boolean regionMatches(
2957: CharacterIterator target, int offset, int limit,
2958: String part, int partlen) {
2959: if (offset < 0)
2960: return false;
2961: if (limit - offset < partlen)
2962: return false;
2963: int i = 0;
2964: while (partlen-- > 0) {
2965: if (target.setIndex(offset++) != part.charAt(i++))
2966: return false;
2967: }
2968: return true;
2969: }
2970:
2971: private static final boolean regionMatches(
2972: CharacterIterator target, int offset, int limit,
2973: int offset2, int partlen) {
2974: if (offset < 0)
2975: return false;
2976: if (limit - offset < partlen)
2977: return false;
2978: int i = offset2;
2979: while (partlen-- > 0) {
2980: if (target.setIndex(offset++) != target.setIndex(i++))
2981: return false;
2982: }
2983: return true;
2984: }
2985:
2986: /**
2987: * @see java.lang.String#regionMatches
2988: */
2989: private static final boolean regionMatchesIgnoreCase(
2990: CharacterIterator target, int offset, int limit,
2991: String part, int partlen) {
2992: if (offset < 0)
2993: return false;
2994: if (limit - offset < partlen)
2995: return false;
2996: int i = 0;
2997: while (partlen-- > 0) {
2998: char ch1 = target.setIndex(offset++);
2999: char ch2 = part.charAt(i++);
3000: if (ch1 == ch2)
3001: continue;
3002: char uch1 = Character.toUpperCase(ch1);
3003: char uch2 = Character.toUpperCase(ch2);
3004: if (uch1 == uch2)
3005: continue;
3006: if (Character.toLowerCase(uch1) != Character
3007: .toLowerCase(uch2))
3008: return false;
3009: }
3010: return true;
3011: }
3012:
3013: private static final boolean regionMatchesIgnoreCase(
3014: CharacterIterator target, int offset, int limit,
3015: int offset2, int partlen) {
3016: if (offset < 0)
3017: return false;
3018: if (limit - offset < partlen)
3019: return false;
3020: int i = offset2;
3021: while (partlen-- > 0) {
3022: char ch1 = target.setIndex(offset++);
3023: char ch2 = target.setIndex(i++);
3024: if (ch1 == ch2)
3025: continue;
3026: char uch1 = Character.toUpperCase(ch1);
3027: char uch2 = Character.toUpperCase(ch2);
3028: if (uch1 == uch2)
3029: continue;
3030: if (Character.toLowerCase(uch1) != Character
3031: .toLowerCase(uch2))
3032: return false;
3033: }
3034: return true;
3035: }
3036:
3037: // ================================================================
3038:
/**
 * The regular expression source text, as last passed to
 * <code>setPattern</code> or a constructor.
 * @serial
 */
String regex;
/**
 * Option flags for this expression, stored as a bit set that is tested
 * with <code>isSet</code> against the option constants of this class.
 * @serial
 */
int options;

/**
 * The number of parentheses in the regular expression, as reported by
 * the parser (<code>parennumber</code>).
 * @serial
 */
int nofparen;
/**
 * Internal representation of the regular expression: the token tree
 * returned by the parser.
 * @serial
 */
Token tokentree;

// Whether the parser reported back references in the pattern.
boolean hasBackReferences = false;

// ---- Derived, non-serialized state ----

// Value of tokentree.getMinLength(), computed by prepare().
transient int minlength;
// Cleared by setPattern(); presumably filled in by compile() — TODO confirm.
transient Op operations = null;
// NOTE(review): not assigned in this part of the file; presumably maintained by compile().
transient int numberOfClosures;
// Reusable matching context; cleared by setPattern().
transient Context context = null;
// Set by prepare() when first-character analysis returns Token.FC_TERMINAL; otherwise null.
transient RangeToken firstChar = null;

// Fixed string found by prepare(), or null when none was selected.
transient String fixedString = null;
// Option flags that apply to fixedString.
transient int fixedStringOptions;
// BMPattern built by prepare() from fixedString.
transient BMPattern fixedStringTable = null;
// Set to true by prepare() when the whole program is a single STRING or CHAR operation.
transient boolean fixedStringOnly = false;
3072:
3073: static final class Context {
3074: CharacterIterator ciTarget;
3075: String strTarget;
3076: char[] charTarget;
3077: int start;
3078: int limit;
3079: int length;
3080: Match match;
3081: boolean inuse = false;
3082: int[] offsets;
3083:
3084: Context() {
3085: }
3086:
3087: private void resetCommon(int nofclosures) {
3088: this .length = this .limit - this .start;
3089: this .inuse = true;
3090: this .match = null;
3091: if (this .offsets == null
3092: || this .offsets.length != nofclosures)
3093: this .offsets = new int[nofclosures];
3094: for (int i = 0; i < nofclosures; i++)
3095: this .offsets[i] = -1;
3096: }
3097:
3098: void reset(CharacterIterator target, int start, int limit,
3099: int nofclosures) {
3100: this .ciTarget = target;
3101: this .start = start;
3102: this .limit = limit;
3103: this .resetCommon(nofclosures);
3104: }
3105:
3106: void reset(String target, int start, int limit, int nofclosures) {
3107: this .strTarget = target;
3108: this .start = start;
3109: this .limit = limit;
3110: this .resetCommon(nofclosures);
3111: }
3112:
3113: void reset(char[] target, int start, int limit, int nofclosures) {
3114: this .charTarget = target;
3115: this .start = start;
3116: this .limit = limit;
3117: this .resetCommon(nofclosures);
3118: }
3119: }
3120:
3121: /**
3122: * Prepares for matching. This method is called just before starting matching.
3123: */
3124: void prepare() {
3125: if (Op.COUNT)
3126: Op.nofinstances = 0;
3127: this .compile(this .tokentree);
3128: /*
3129: if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
3130: Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
3131: anchor.next = this.operations;
3132: this.operations = anchor;
3133: }
3134: */
3135: if (Op.COUNT)
3136: System.err.println("DEBUG: The number of operations: "
3137: + Op.nofinstances);
3138:
3139: this .minlength = this .tokentree.getMinLength();
3140:
3141: this .firstChar = null;
3142: if (!isSet(this .options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
3143: && !isSet(this .options, XMLSCHEMA_MODE)) {
3144: RangeToken firstChar = Token.createRange();
3145: int fresult = this .tokentree.analyzeFirstCharacter(
3146: firstChar, this .options);
3147: if (fresult == Token.FC_TERMINAL) {
3148: firstChar.compactRanges();
3149: this .firstChar = firstChar;
3150: if (DEBUG)
3151: System.err
3152: .println("DEBUG: Use the first character optimization: "
3153: + firstChar);
3154: }
3155: }
3156:
3157: if (this .operations != null
3158: && (this .operations.type == Op.STRING || this .operations.type == Op.CHAR)
3159: && this .operations.next == null) {
3160: if (DEBUG)
3161: System.err.print(" *** Only fixed string! *** ");
3162: this .fixedStringOnly = true;
3163: if (this .operations.type == Op.STRING)
3164: this .fixedString = this .operations.getString();
3165: else if (this .operations.getData() >= 0x10000) { // Op.CHAR
3166: this .fixedString = REUtil
3167: .decomposeToSurrogates(this .operations
3168: .getData());
3169: } else {
3170: char[] ac = new char[1];
3171: ac[0] = (char) this .operations.getData();
3172: this .fixedString = new String(ac);
3173: }
3174: this .fixedStringOptions = this .options;
3175: this .fixedStringTable = new BMPattern(this .fixedString,
3176: 256, isSet(this .fixedStringOptions, IGNORE_CASE));
3177: } else if (!isSet(this .options,
3178: PROHIBIT_FIXED_STRING_OPTIMIZATION)
3179: && !isSet(this .options, XMLSCHEMA_MODE)) {
3180: Token.FixedStringContainer container = new Token.FixedStringContainer();
3181: this .tokentree.findFixedString(container, this .options);
3182: this .fixedString = container.token == null ? null
3183: : container.token.getString();
3184: this .fixedStringOptions = container.options;
3185: if (this .fixedString != null
3186: && this .fixedString.length() < 2)
3187: this .fixedString = null;
3188: // This pattern has a fixed string of which length is more than one.
3189: if (this .fixedString != null) {
3190: this .fixedStringTable = new BMPattern(this .fixedString,
3191: 256,
3192: isSet(this .fixedStringOptions, IGNORE_CASE));
3193: if (DEBUG) {
3194: System.err
3195: .println("DEBUG: The longest fixed string: "
3196: + this .fixedString.length()
3197: + "/" //+this.fixedString
3198: + "/"
3199: + REUtil
3200: .createOptionString(this .fixedStringOptions));
3201: System.err.print("String: ");
3202: REUtil.dumpString(this .fixedString);
3203: }
3204: }
3205: }
3206: }
3207:
3208: /**
3209: * An option.
3210: * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
* captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
3212: * does not capture.
3213: *
3214: * @see #RegularExpression(java.lang.String,int)
3215: * @see #setPattern(java.lang.String,int)
3216: static final int MARK_PARENS = 1<<0;
3217: */
3218:
/**
 * "i"
 * <p>In <code>prepare()</code>, whether this bit is set is passed as the
 * last argument of the <code>BMPattern</code> built for the fixed string.
 */
static final int IGNORE_CASE = 1 << 1;

/**
 * "s"
 */
static final int SINGLE_LINE = 1 << 2;

/**
 * "m"
 */
static final int MULTIPLE_LINES = 1 << 3;

/**
 * "x"
 */
static final int EXTENDED_COMMENT = 1 << 4;

/**
 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
 * <p>When this bit is set and {@link #UNICODE_WORD_BOUNDARY} is not,
 * <code>getWordType0</code> classifies word characters with the
 * "IsWord" range instead of the legacy <code>isWordChar</code> test.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
 * @see #UNICODE_WORD_BOUNDARY
 */
static final int USE_UNICODE_CATEGORY = 1 << 5; // "u"

/**
 * An option.
 * This enables locale-independent word-boundary processing for <span class="REGEX"><kbd>\b \B \< \></kbd></span>.
 * <p>By default, the engine considers a position between a word character
 * (<span class="REGEX"><kbd>\w</kbd></span>) and a non-word character
 * to be a word boundary.
 * <p>With this option, the engine checks word boundaries with the method of
 * 'Unicode Regular Expression Guidelines' Revision 4.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
 */
static final int UNICODE_WORD_BOUNDARY = 1 << 6; // "w"

/**
 * "H"
 * <p>When set, <code>prepare()</code> skips the first-character analysis.
 */
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1 << 7;
/**
 * "F"
 * <p>When set, <code>prepare()</code> does not search the token tree for
 * a fixed string.
 */
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1 << 8;
/**
 * "X". XML Schema mode.
 * <p>When set, <code>setPattern</code> parses with
 * <code>ParserForXMLSchema</code>, and <code>prepare()</code> skips both
 * the first-character analysis and the fixed-string search.
 */
static final int XMLSCHEMA_MODE = 1 << 9;
/**
 * ",".
 */
static final int SPECIAL_COMMA = 1 << 10;
3278:
3279: private static final boolean isSet(int options, int flag) {
3280: return (options & flag) == flag;
3281: }
3282:
/**
 * Creates a new RegularExpression instance from a pattern, passing
 * <code>null</code> as the option string.
 *
 * @param regex A regular expression
 * @exception ParseException <VAR>regex</VAR> does not conform to the syntax.
 */
public RegularExpression(String regex) throws ParseException {
    setPattern(regex, null);
}
3292:
/**
 * Creates a new RegularExpression instance from a pattern and an
 * option string.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException <VAR>regex</VAR> does not conform to the syntax.
 */
public RegularExpression(String regex, String options)
        throws ParseException {
    setPattern(regex, options);
}
3304:
/**
 * Creates an instance from already-parsed parts. No parsing happens here:
 * the arguments are stored as given.
 *
 * @param regex the pattern source text
 * @param tok the token tree for <code>regex</code>
 * @param parens the parenthesis count, stored in <code>nofparen</code>
 * @param hasBackReferences whether the pattern contains back references
 * @param options option flags as a bit set
 */
RegularExpression(String regex, Token tok, int parens,
        boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.nofparen = parens;
    this.hasBackReferences = hasBackReferences;
}
3313:
3314: /**
3315: *
3316: */
3317: public void setPattern(String newPattern) throws ParseException {
3318: this .setPattern(newPattern, this .options);
3319: }
3320:
3321: private void setPattern(String newPattern, int options)
3322: throws ParseException {
3323: this .regex = newPattern;
3324: this .options = options;
3325: RegexParser rp = this .isSet(this .options,
3326: RegularExpression.XMLSCHEMA_MODE) ? new ParserForXMLSchema()
3327: : new RegexParser();
3328: this .tokentree = rp.parse(this .regex, this .options);
3329: this .nofparen = rp.parennumber;
3330: this .hasBackReferences = rp.hasBackReferences;
3331:
3332: this .operations = null;
3333: this .context = null;
3334: }
3335:
3336: /**
3337: *
3338: */
3339: public void setPattern(String newPattern, String options)
3340: throws ParseException {
3341: this .setPattern(newPattern, REUtil.parseOptions(options));
3342: }
3343:
3344: /**
3345: *
3346: */
3347: public String getPattern() {
3348: return this .regex;
3349: }
3350:
3351: /**
3352: * Represents this instence in String.
3353: */
3354: public String toString() {
3355: return this .tokentree.toString(this .options);
3356: }
3357:
3358: /**
3359: * Returns a option string.
3360: * The order of letters in it may be different from a string specified
3361: * in a constructor or <code>setPattern()</code>.
3362: *
3363: * @see #RegularExpression(java.lang.String,java.lang.String)
3364: * @see #setPattern(java.lang.String,java.lang.String)
3365: */
3366: public String getOptions() {
3367: return REUtil.createOptionString(this .options);
3368: }
3369:
3370: /**
3371: * Return true if patterns are the same and the options are equivalent.
3372: */
3373: public boolean equals(Object obj) {
3374: if (obj == null)
3375: return false;
3376: if (!(obj instanceof RegularExpression))
3377: return false;
3378: RegularExpression r = (RegularExpression) obj;
3379: return this .regex.equals(r.regex) && this .options == r.options;
3380: }
3381:
3382: boolean equals(String pattern, int options) {
3383: return this .regex.equals(pattern) && this .options == options;
3384: }
3385:
3386: /**
3387: *
3388: */
3389: public int hashCode() {
3390: return (this .regex + "/" + this .getOptions()).hashCode();
3391: }
3392:
3393: /**
3394: * Return the number of regular expression groups.
3395: * This method returns 1 when the regular expression has no capturing-parenthesis.
3396: *
3397: */
3398: public int getNumberOfGroups() {
3399: return this .nofparen;
3400: }
3401:
3402: // ================================================================
3403:
3404: private static final int WT_IGNORE = 0;
3405: private static final int WT_LETTER = 1;
3406: private static final int WT_OTHER = 2;
3407: transient static Token wordchar = null;
3408:
3409: private static final int getWordType0(char ch, int opts) {
3410: if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3411: if (isSet(opts, USE_UNICODE_CATEGORY)) {
3412: if (RegularExpression.wordchar == null)
3413: RegularExpression.wordchar = Token.getRange(
3414: "IsWord", true);
3415: return RegularExpression.wordchar.match(ch) ? WT_LETTER
3416: : WT_OTHER;
3417: }
3418: return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3419: }
3420:
3421: switch (Character.getType(ch)) {
3422: case Character.UPPERCASE_LETTER: // L
3423: case Character.LOWERCASE_LETTER: // L
3424: case Character.TITLECASE_LETTER: // L
3425: case Character.MODIFIER_LETTER: // L
3426: case Character.OTHER_LETTER: // L
3427: case Character.LETTER_NUMBER: // N
3428: case Character.DECIMAL_DIGIT_NUMBER: // N
3429: case Character.OTHER_NUMBER: // N
3430: case Character.COMBINING_SPACING_MARK: // Mc
3431: return WT_LETTER;
3432:
3433: case Character.FORMAT: // Cf
3434: case Character.NON_SPACING_MARK: // Mn
3435: case Character.ENCLOSING_MARK: // Mc
3436: return WT_IGNORE;
3437:
3438: case Character.CONTROL: // Cc
3439: switch (ch) {
3440: case '\t':
3441: case '\n':
3442: case '\u000B':
3443: case '\f':
3444: case '\r':
3445: return WT_OTHER;
3446: default:
3447: return WT_IGNORE;
3448: }
3449:
3450: default:
3451: return WT_OTHER;
3452: }
3453: }
3454:
3455: // ================================================================
3456:
3457: static final int LINE_FEED = 0x000A;
3458: static final int CARRIAGE_RETURN = 0x000D;
3459: static final int LINE_SEPARATOR = 0x2028;
3460: static final int PARAGRAPH_SEPARATOR = 0x2029;
3461:
3462: private static final boolean isEOLChar(int ch) {
3463: return ch == LINE_FEED || ch == CARRIAGE_RETURN
3464: || ch == LINE_SEPARATOR || ch == PARAGRAPH_SEPARATOR;
3465: }
3466:
3467: private static final boolean isWordChar(int ch) { // Legacy word characters
3468: if (ch == '_')
3469: return true;
3470: if (ch < '0')
3471: return false;
3472: if (ch > 'z')
3473: return false;
3474: if (ch <= '9')
3475: return true;
3476: if (ch < 'A')
3477: return false;
3478: if (ch <= 'Z')
3479: return true;
3480: if (ch < 'a')
3481: return false;
3482: return true;
3483: }
3484:
3485: private static final boolean matchIgnoreCase(int chardata, int ch) {
3486: if (chardata == ch)
3487: return true;
3488: if (chardata > 0xffff || ch > 0xffff)
3489: return false;
3490: char uch1 = Character.toUpperCase((char) chardata);
3491: char uch2 = Character.toUpperCase((char) ch);
3492: if (uch1 == uch2)
3493: return true;
3494: return Character.toLowerCase(uch1) == Character
3495: .toLowerCase(uch2);
3496: }
3497: }
|