0001 /*
0002 * Copyright 1999-2007 Sun Microsystems, Inc. All Rights Reserved.
0003 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0004 *
0005 * This code is free software; you can redistribute it and/or modify it
0006 * under the terms of the GNU General Public License version 2 only, as
0007 * published by the Free Software Foundation. Sun designates this
0008 * particular file as subject to the "Classpath" exception as provided
0009 * by Sun in the LICENSE file that accompanied this code.
0010 *
0011 * This code is distributed in the hope that it will be useful, but WITHOUT
0012 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0014 * version 2 for more details (a copy is included in the LICENSE file that
0015 * accompanied this code).
0016 *
0017 * You should have received a copy of the GNU General Public License version
0018 * 2 along with this work; if not, write to the Free Software Foundation,
0019 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0020 *
0021 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
0022 * CA 95054 USA or visit www.sun.com if you need additional information or
0023 * have any questions.
0024 */
0025
0026 package java.util.regex;
0027
0028 import java.security.AccessController;
0029 import java.security.PrivilegedAction;
0030 import java.text.CharacterIterator;
0031 import java.text.Normalizer;
0032 import java.util.ArrayList;
0033 import java.util.HashMap;
0034 import java.util.Arrays;
0035
0036 /**
0037 * A compiled representation of a regular expression.
0038 *
0039 * <p> A regular expression, specified as a string, must first be compiled into
0040 * an instance of this class. The resulting pattern can then be used to create
0041 * a {@link Matcher} object that can match arbitrary {@link
0042 * java.lang.CharSequence </code>character sequences<code>} against the regular
0043 * expression. All of the state involved in performing a match resides in the
0044 * matcher, so many matchers can share the same pattern.
0045 *
0046 * <p> A typical invocation sequence is thus
0047 *
0048 * <blockquote><pre>
0049 * Pattern p = Pattern.{@link #compile compile}("a*b");
0050 * Matcher m = p.{@link #matcher matcher}("aaaaab");
0051 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
0052 *
0053 * <p> A {@link #matches matches} method is defined by this class as a
0054 * convenience for when a regular expression is used just once. This method
0055 * compiles an expression and matches an input sequence against it in a single
0056 * invocation. The statement
0057 *
0058 * <blockquote><pre>
0059 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
0060 *
0061 * is equivalent to the three statements above, though for repeated matches it
0062 * is less efficient since it does not allow the compiled pattern to be reused.
0063 *
0064 * <p> Instances of this class are immutable and are safe for use by multiple
0065 * concurrent threads. Instances of the {@link Matcher} class are not safe for
0066 * such use.
0067 *
0068 *
0069 * <a name="sum">
0070 * <h4> Summary of regular-expression constructs </h4>
0071 *
0072 * <table border="0" cellpadding="1" cellspacing="0"
0073 * summary="Regular expression constructs, and what they match">
0074 *
0075 * <tr align="left">
0076 * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>
0077 * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
0078 * </tr>
0079 *
0080 * <tr><th> </th></tr>
0081 * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
0082 *
0083 * <tr><td valign="top" headers="construct characters"><i>x</i></td>
0084 * <td headers="matches">The character <i>x</i></td></tr>
0085 * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>
0086 * <td headers="matches">The backslash character</td></tr>
0087 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>
0088 * <td headers="matches">The character with octal value <tt>0</tt><i>n</i>
0089 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
0090 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>
0091 * <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>
0092 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
0093 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>
0094 * <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>
0095 * (0 <tt><=</tt> <i>m</i> <tt><=</tt> 3,
0096 * 0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
0097 * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>
0098 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hh</i></td></tr>
0099 * <tr><td valign="top" headers="construct characters"><tt>\u</tt><i>hhhh</i></td>
0100 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hhhh</i></td></tr>
0101 * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
0102 * <td headers="matches">The tab character (<tt>'\u0009'</tt>)</td></tr>
0103 * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
0104 * <td headers="matches">The newline (line feed) character (<tt>'\u000A'</tt>)</td></tr>
0105 * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>
0106 * <td headers="matches">The carriage-return character (<tt>'\u000D'</tt>)</td></tr>
0107 * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>
0108 * <td headers="matches">The form-feed character (<tt>'\u000C'</tt>)</td></tr>
0109 * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>
0110 * <td headers="matches">The alert (bell) character (<tt>'\u0007'</tt>)</td></tr>
0111 * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>
0112 * <td headers="matches">The escape character (<tt>'\u001B'</tt>)</td></tr>
0113 * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>
0114 * <td headers="matches">The control character corresponding to <i>x</i></td></tr>
0115 *
0116 * <tr><th> </th></tr>
0117 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
0118 *
0119 * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>
0120 * <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>
0121 * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>
0122 * <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>
0123 * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>
0124 * <td headers="matches"><tt>a</tt> through <tt>z</tt>
0125 * or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>
0126 * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>
0127 * <td headers="matches"><tt>a</tt> through <tt>d</tt>,
0128 * or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>
0129 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td>
0130 * <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr>
0131 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td>
0132 * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
0133 * except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>
0134 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td>
0135 * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
0136 * and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr>
0137 * <tr><th> </th></tr>
0138 *
0139 * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
0140 *
0141 * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>
0142 * <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
0143 * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>
0144 * <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
0145 * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
0146 * <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
0147 * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
0148 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
0149 * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
0150 * <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
0151 * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
0152 * <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
0153 * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
0154 * <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
0155 *
0156 * <tr><th> </th></tr>
0157 * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
0158 *
0159 * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>
0160 * <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>
0161 * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>
0162 * <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr>
0163 * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>
0164 * <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
0165 * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>
0166 * <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr>
0167 * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>
0168 * <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>
0169 * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>
0170 * <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr>
0171 * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>
0172 * <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr>
0173 * <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>
0174 * <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> -->
0175 * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>
0176 * <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>
0177 * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>
0178 * <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>
0179 * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>
0180 * <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>
0181 * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
0182 * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
0183 * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
0184 * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
0185 * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
0186 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
0187 *
0188 * <tr><th> </th></tr>
0189 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
0190 *
0191 * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
0192 * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
0193 * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
0194 * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
0195 * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
0196 * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
0197 * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
0198 * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
0199 *
0200 * <tr><th> </th></tr>
0201 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr>
0202 *
0203 * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
0204 * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
0205 * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
0206 * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
0207 * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
0208 * <td headers="matches">A currency symbol</td></tr>
0209 * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
0210 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
0211 * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
0212 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
0213 *
0214 * <tr><th> </th></tr>
0215 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
0216 *
0217 * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
0218 * <td headers="matches">The beginning of a line</td></tr>
0219 * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
0220 * <td headers="matches">The end of a line</td></tr>
0221 * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
0222 * <td headers="matches">A word boundary</td></tr>
0223 * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
0224 * <td headers="matches">A non-word boundary</td></tr>
0225 * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
0226 * <td headers="matches">The beginning of the input</td></tr>
0227 * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>
0228 * <td headers="matches">The end of the previous match</td></tr>
0229 * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>
0230 * <td headers="matches">The end of the input but for the final
0231 * <a href="#lt">terminator</a>, if any</td></tr>
0232 * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>
0233 * <td headers="matches">The end of the input</td></tr>
0234 *
0235 * <tr><th> </th></tr>
0236 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
0237 *
0238 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
0239 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
0240 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>
0241 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
0242 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>
0243 * <td headers="matches"><i>X</i>, one or more times</td></tr>
0244 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>
0245 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
0246 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>
0247 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
0248 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>
0249 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
0250 *
0251 * <tr><th> </th></tr>
0252 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
0253 *
0254 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>
0255 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
0256 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>
0257 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
0258 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>
0259 * <td headers="matches"><i>X</i>, one or more times</td></tr>
0260 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>
0261 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
0262 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>
0263 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
0264 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>
0265 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
0266 *
0267 * <tr><th> </th></tr>
0268 * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
0269 *
0270 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>
0271 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
0272 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>
0273 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
0274 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>
0275 * <td headers="matches"><i>X</i>, one or more times</td></tr>
0276 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>
0277 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
0278 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>
0279 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
0280 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
0281 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
0282 *
0283 * <tr><th> </th></tr>
0284 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
0285 *
0286 * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
0287 * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
0288 * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
0289 * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
0290 * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
0291 * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
0292 *
0293 * <tr><th> </th></tr>
0294 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
0295 *
0296 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
0297 * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
0298 * <a href="#cg">capturing group</a> matched</td></tr>
0299 *
0300 * <tr><th> </th></tr>
0301 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
0302 *
0303 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
0304 * <td headers="matches">Nothing, but quotes the following character</td></tr>
0305 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
0306 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
0307 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
0308 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
0309 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
0310 *
0311 * <tr><th> </th></tr>
0312 * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
0313 *
0314 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
0315 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
0316 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
0317 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
0318 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
0319 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
0320 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
0321 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
0322 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
0323 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
0324 * <a href="#COMMENTS">x</a> on - off</td></tr>
0325 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
0326 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
0327 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
0328 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
0329 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
0330 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
0331 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
0332 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
0333 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
0334 * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
0335 *
0336 * </table>
0337 *
0338 * <hr>
0339 *
0340 *
0341 * <a name="bs">
0342 * <h4> Backslashes, escapes, and quoting </h4>
0343 *
0344 * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
0345 * constructs, as defined in the table above, as well as to quote characters
0346 * that otherwise would be interpreted as unescaped constructs. Thus the
0347 * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
0348 * left brace.
0349 *
0350 * <p> It is an error to use a backslash prior to any alphabetic character that
0351 * does not denote an escaped construct; these are reserved for future
0352 * extensions to the regular-expression language. A backslash may be used
0353 * prior to a non-alphabetic character regardless of whether that character is
0354 * part of an unescaped construct.
0355 *
0356 * <p> Backslashes within string literals in Java source code are interpreted
0357 * as required by the <a
0358 * href="http://java.sun.com/docs/books/jls">Java Language
0359 * Specification</a> as either <a
0360 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">Unicode
0361 * escapes</a> or other <a
0362 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#101089">character
0363 * escapes</a>. It is therefore necessary to double backslashes in string
0364 * literals that represent regular expressions to protect them from
0365 * interpretation by the Java bytecode compiler. The string literal
0366 * <tt>"\b"</tt>, for example, matches a single backspace character when
0367 * interpreted as a regular expression, while <tt>"\\b"</tt> matches a
0368 * word boundary. The string literal <tt>"\(hello\)"</tt> is illegal
0369 * and leads to a compile-time error; in order to match the string
0370 * <tt>(hello)</tt> the string literal <tt>"\\(hello\\)"</tt>
0371 * must be used.
0372 *
0373 * <a name="cc">
0374 * <h4> Character Classes </h4>
0375 *
0376 * <p> Character classes may appear within other character classes, and
0377 * may be composed by the union operator (implicit) and the intersection
0378 * operator (<tt>&&</tt>).
0379 * The union operator denotes a class that contains every character that is
0380 * in at least one of its operand classes. The intersection operator
0381 * denotes a class that contains every character that is in both of its
0382 * operand classes.
0383 *
0384 * <p> The precedence of character-class operators is as follows, from
0385 * highest to lowest:
0386 *
0387 * <blockquote><table border="0" cellpadding="1" cellspacing="0"
0388 * summary="Precedence of character class operators.">
0389 * <tr><th>1 </th>
0390 * <td>Literal escape </td>
0391 * <td><tt>\x</tt></td></tr>
0392 * <tr><th>2 </th>
0393 * <td>Grouping</td>
0394 * <td><tt>[...]</tt></td></tr>
0395 * <tr><th>3 </th>
0396 * <td>Range</td>
0397 * <td><tt>a-z</tt></td></tr>
0398 * <tr><th>4 </th>
0399 * <td>Union</td>
0400 * <td><tt>[a-e][i-u]</tt></td></tr>
0401 * <tr><th>5 </th>
0402 * <td>Intersection</td>
0403 * <td><tt>[a-z&&[aeiou]]</tt></td></tr>
0404 * </table></blockquote>
0405 *
0406 * <p> Note that a different set of metacharacters are in effect inside
0407 * a character class than outside a character class. For instance, the
0408 * regular expression <tt>.</tt> loses its special meaning inside a
0409 * character class, while the expression <tt>-</tt> becomes a range
0410 * forming metacharacter.
0411 *
0412 * <a name="lt">
0413 * <h4> Line terminators </h4>
0414 *
0415 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
0416 * the end of a line of the input character sequence. The following are
0417 * recognized as line terminators:
0418 *
0419 * <ul>
0420 *
0421 * <li> A newline (line feed) character (<tt>'\n'</tt>),
0422 *
0423 * <li> A carriage-return character followed immediately by a newline
0424 * character (<tt>"\r\n"</tt>),
0425 *
0426 * <li> A standalone carriage-return character (<tt>'\r'</tt>),
0427 *
0428 * <li> A next-line character (<tt>'\u0085'</tt>),
0429 *
0430 * <li> A line-separator character (<tt>'\u2028'</tt>), or
0431 *
0432 * <li> A paragraph-separator character (<tt>'\u2029</tt>).
0433 *
0434 * </ul>
0435 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
0436 * recognized are newline characters.
0437 *
0438 * <p> The regular expression <tt>.</tt> matches any character except a line
0439 * terminator unless the {@link #DOTALL} flag is specified.
0440 *
0441 * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
0442 * line terminators and only match at the beginning and the end, respectively,
0443 * of the entire input sequence. If {@link #MULTILINE} mode is activated then
0444 * <tt>^</tt> matches at the beginning of input and after any line terminator
0445 * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
0446 * matches just before a line terminator or the end of the input sequence.
0447 *
0448 * <a name="cg">
0449 * <h4> Groups and capturing </h4>
0450 *
0451 * <p> Capturing groups are numbered by counting their opening parentheses from
0452 * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
0453 * are four such groups: </p>
0454 *
0455 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
0456 * <tr><th>1 </th>
0457 * <td><tt>((A)(B(C)))</tt></td></tr>
0458 * <tr><th>2 </th>
0459 * <td><tt>(A)</tt></td></tr>
0460 * <tr><th>3 </th>
0461 * <td><tt>(B(C))</tt></td></tr>
0462 * <tr><th>4 </th>
0463 * <td><tt>(C)</tt></td></tr>
0464 * </table></blockquote>
0465 *
0466 * <p> Group zero always stands for the entire expression.
0467 *
0468 * <p> Capturing groups are so named because, during a match, each subsequence
0469 * of the input sequence that matches such a group is saved. The captured
0470 * subsequence may be used later in the expression, via a back reference, and
0471 * may also be retrieved from the matcher once the match operation is complete.
0472 *
0473 * <p> The captured input associated with a group is always the subsequence
0474 * that the group most recently matched. If a group is evaluated a second time
0475 * because of quantification then its previously-captured value, if any, will
0476 * be retained if the second evaluation fails. Matching the string
0477 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
0478 * group two set to <tt>"b"</tt>. All captured input is discarded at the
0479 * beginning of each match.
0480 *
0481 * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
0482 * that do not capture text and do not count towards the group total.
0483 *
0484 *
0485 * <h4> Unicode support </h4>
0486 *
0487 * <p> This class is in conformance with Level 1 of <a
0488 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
0489 * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
0490 * Canonical Equivalents.
0491 *
0492 * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
0493 * are processed as described in <a
0494 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
0495 * of the Java Language Specification. Such escape sequences are also
0496 * implemented directly by the regular-expression parser so that Unicode
0497 * escapes can be used in expressions that are read from files or from the
0498 * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
0499 * while not equal, compile into the same pattern, which matches the character
0500 * with hexadecimal value <tt>0x2014</tt>.
0501 *
0502 * <a name="ubc"> <p>Unicode blocks and categories are written with the
0503 * <tt>\p</tt> and <tt>\P</tt> constructs as in
0504 * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
0505 * property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> does not match if
0506 * the input has that property. Blocks are specified with the prefix
0507 * <tt>In</tt>, as in <tt>InMongolian</tt>. Categories may be specified with
0508 * the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt>
0509 * denote the category of Unicode letters. Blocks and categories can be used
0510 * both inside and outside of a character class.
0511 *
0512 * <p> The supported categories are those of
0513 * <a href="http://www.unicode.org/unicode/standard/standard.html">
0514 * <i>The Unicode Standard</i></a> in the version specified by the
0515 * {@link java.lang.Character Character} class. The category names are those
0516 * defined in the Standard, both normative and informative.
0517 * The block names supported by <code>Pattern</code> are the valid block names
0518 * accepted and defined by
0519 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
0520 *
0521 * <a name="jcc"> <p>Categories that behave like the java.lang.Character
0522 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
0523 * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
0524 * the specified property has the name <tt>java<i>methodname</i></tt>.
0525 *
0526 * <h4> Comparison to Perl 5 </h4>
0527 *
0528 * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
0529 * with ordered alternation as occurs in Perl 5.
0530 *
0531 * <p> Perl constructs not supported by this class: </p>
0532 *
0533 * <ul>
0534 *
0535 * <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
0536 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
0537 * </p></li>
0538 *
0539 * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
0540 * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
0541 *
0542 * <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
0543 *
0544 * <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
0545 * <tt>\L</tt>, and <tt>\U</tt>. </p></li>
0546 *
0547 * </ul>
0548 *
0549 * <p> Constructs supported by this class but not by Perl: </p>
0550 *
0551 * <ul>
0552 *
0553 * <li><p> Possessive quantifiers, which greedily match as much as they can
0554 * and do not back off, even when doing so would allow the overall match to
0555 * succeed. </p></li>
0556 *
0557 * <li><p> Character-class union and intersection as described
0558 * <a href="#cc">above</a>.</p></li>
0559 *
0560 * </ul>
0561 *
0562 * <p> Notable differences from Perl: </p>
0563 *
0564 * <ul>
0565 *
0566 * <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
0567 * as back references; a backslash-escaped number greater than <tt>9</tt> is
0568 * treated as a back reference if at least that many subexpressions exist,
0569 * otherwise it is interpreted, if possible, as an octal escape. In this
0570 * class octal escapes must always begin with a zero. In this class,
0571 * <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
0572 * references, and a larger number is accepted as a back reference if at
0573 * least that many subexpressions exist at that point in the regular
0574 * expression, otherwise the parser will drop digits until the number is
0575 * smaller or equal to the existing number of groups or it is one digit.
0576 * </p></li>
0577 *
0578 * <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
0579 * where the last match left off. This functionality is provided implicitly
0580 * by the {@link Matcher} class: Repeated invocations of the {@link
0581 * Matcher#find find} method will resume where the last match left off,
0582 * unless the matcher is reset. </p></li>
0583 *
0584 * <li><p> In Perl, embedded flags at the top level of an expression affect
0585 * the whole expression. In this class, embedded flags always take effect
0586 * at the point at which they appear, whether they are at the top level or
0587 * within a group; in the latter case, flags are restored at the end of the
0588 * group just as in Perl. </p></li>
0589 *
0590 * <li><p> Perl is forgiving about malformed matching constructs, as in the
0591 * expression <tt>*a</tt>, as well as dangling brackets, as in the
0592 * expression <tt>abc]</tt>, and treats them as literals. This
0593 * class also accepts dangling brackets but is strict about dangling
0594 * metacharacters like +, ? and *, and will throw a
0595 * {@link PatternSyntaxException} if it encounters them. </p></li>
0596 *
0597 * </ul>
0598 *
0599 *
0600 * <p> For a more precise description of the behavior of regular expression
0601 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">
0602 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl,
0603 * O'Reilly and Associates, 2006.</a>
0604 * </p>
0605 *
0606 * @see java.lang.String#split(String, int)
0607 * @see java.lang.String#split(String)
0608 *
0609 * @author Mike McCloskey
0610 * @author Mark Reinhold
0611 * @author JSR-51 Expert Group
0612 * @version 1.133, 07/05/05
0613 * @since 1.4
0614 * @spec JSR-51
0615 */
0616
0617 public final class Pattern implements java.io.Serializable {
0618
0619 /**
0620 * Regular expression modifier values. Instead of being passed as
0621 * arguments, they can also be passed as inline modifiers.
0622 * For example, the following statements have the same effect.
0623 * <pre>
0624 * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
0625 * RegExp r2 = RegExp.compile("(?im)abc", 0);
0626 * </pre>
0627 *
0628 * The flags are duplicated so that the familiar Perl match flag
0629 * names are available.
0630 */
0631
0632 /**
0633 * Enables Unix lines mode.
0634 *
0635 * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
0636 * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
0637 *
0638 * <p> Unix lines mode can also be enabled via the embedded flag
0639 * expression <tt>(?d)</tt>.
0640 */
0641 public static final int UNIX_LINES = 0x01;
0642
0643 /**
0644 * Enables case-insensitive matching.
0645 *
0646 * <p> By default, case-insensitive matching assumes that only characters
0647 * in the US-ASCII charset are being matched. Unicode-aware
0648 * case-insensitive matching can be enabled by specifying the {@link
0649 * #UNICODE_CASE} flag in conjunction with this flag.
0650 *
0651 * <p> Case-insensitive matching can also be enabled via the embedded flag
0652 * expression <tt>(?i)</tt>.
0653 *
0654 * <p> Specifying this flag may impose a slight performance penalty. </p>
0655 */
0656 public static final int CASE_INSENSITIVE = 0x02;
0657
0658 /**
0659 * Permits whitespace and comments in pattern.
0660 *
0661 * <p> In this mode, whitespace is ignored, and embedded comments starting
0662 * with <tt>#</tt> are ignored until the end of a line.
0663 *
0664 * <p> Comments mode can also be enabled via the embedded flag
0665 * expression <tt>(?x)</tt>.
0666 */
0667 public static final int COMMENTS = 0x04;
0668
0669 /**
0670 * Enables multiline mode.
0671 *
0672 * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
0673 * just after or just before, respectively, a line terminator or the end of
0674 * the input sequence. By default these expressions only match at the
0675 * beginning and the end of the entire input sequence.
0676 *
0677 * <p> Multiline mode can also be enabled via the embedded flag
0678 * expression <tt>(?m)</tt>. </p>
0679 */
0680 public static final int MULTILINE = 0x08;
0681
0682 /**
0683 * Enables literal parsing of the pattern.
0684 *
0685 * <p> When this flag is specified then the input string that specifies
0686 * the pattern is treated as a sequence of literal characters.
0687 * Metacharacters or escape sequences in the input sequence will be
0688 * given no special meaning.
0689 *
0690 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
0691 * matching when used in conjunction with this flag. The other flags
0692 * become superfluous.
0693 *
0694 * <p> There is no embedded flag character for enabling literal parsing.
0695 * @since 1.5
0696 */
0697 public static final int LITERAL = 0x10;
0698
0699 /**
0700 * Enables dotall mode.
0701 *
0702 * <p> In dotall mode, the expression <tt>.</tt> matches any character,
0703 * including a line terminator. By default this expression does not match
0704 * line terminators.
0705 *
0706 * <p> Dotall mode can also be enabled via the embedded flag
0707 * expression <tt>(?s)</tt>. (The <tt>s</tt> is a mnemonic for
0708 * "single-line" mode, which is what this is called in Perl.) </p>
0709 */
0710 public static final int DOTALL = 0x20;
0711
0712 /**
0713 * Enables Unicode-aware case folding.
0714 *
0715 * <p> When this flag is specified then case-insensitive matching, when
0716 * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
0717 * consistent with the Unicode Standard. By default, case-insensitive
0718 * matching assumes that only characters in the US-ASCII charset are being
0719 * matched.
0720 *
0721 * <p> Unicode-aware case folding can also be enabled via the embedded flag
0722 * expression <tt>(?u)</tt>.
0723 *
0724 * <p> Specifying this flag may impose a performance penalty. </p>
0725 */
0726 public static final int UNICODE_CASE = 0x40;
0727
0728 /**
0729 * Enables canonical equivalence.
0730 *
0731 * <p> When this flag is specified then two characters will be considered
0732 * to match if, and only if, their full canonical decompositions match.
0733 * The expression <tt>"a\u030A"</tt>, for example, will match the
0734 * string <tt>"\u00E5"</tt> when this flag is specified. By default,
0735 * matching does not take canonical equivalence into account.
0736 *
0737 * <p> There is no embedded flag character for enabling canonical
0738 * equivalence.
0739 *
0740 * <p> Specifying this flag may impose a performance penalty. </p>
0741 */
0742 public static final int CANON_EQ = 0x80;
0743
0744 /* Pattern has only two serialized components: The pattern string
0745 * and the flags, which are all that is needed to recompile the pattern
0746 * when it is deserialized.
0747 */
0748
0749 /** use serialVersionUID from Merlin b59 for interoperability */
0750 private static final long serialVersionUID = 5073258162644648461L;
0751
0752 /**
0753 * The original regular-expression pattern string.
0754 *
0755 * @serial
0756 */
0757 private String pattern;
0758
0759 /**
0760 * The original pattern flags.
0761 *
0762 * @serial
0763 */
0764 private int flags;
0765
0766 /**
0767 * Boolean indicating this Pattern is compiled; this is necessary in order
0768 * to lazily compile deserialized Patterns.
0769 */
0770 private transient volatile boolean compiled = false;
0771
0772 /**
0773 * The normalized pattern string.
0774 */
0775 private transient String normalizedPattern;
0776
0777 /**
0778 * The starting point of state machine for the find operation. This allows
0779 * a match to start anywhere in the input.
0780 */
0781 transient Node root;
0782
0783 /**
0784 * The root of object tree for a match operation. The pattern is matched
0785 * at the beginning. This may include a find that uses BnM or a First
0786 * node.
0787 */
0788 transient Node matchRoot;
0789
0790 /**
0791 * Temporary storage used by parsing pattern slice.
0792 */
0793 transient int[] buffer;
0794
0795 /**
0796 * Temporary storage used while parsing group references.
0797 */
0798 transient GroupHead[] groupNodes;
0799
0800 /**
0801 * Temporary null terminated code point array used by pattern compiling.
0802 */
0803 private transient int[] temp;
0804
0805 /**
0806 * The number of capturing groups in this Pattern. Used by matchers to
0807 * allocate storage needed to perform a match.
0808 */
0809 transient int capturingGroupCount;
0810
0811 /**
0812 * The local variable count used by parsing tree. Used by matchers to
0813 * allocate storage needed to perform a match.
0814 */
0815 transient int localCount;
0816
0817 /**
0818 * Index into the pattern string that keeps track of how much has been
0819 * parsed.
0820 */
0821 private transient int cursor;
0822
0823 /**
0824 * Holds the length of the pattern string.
0825 */
0826 private transient int patternLength;
0827
0828 /**
0829 * Compiles the given regular expression into a pattern. </p>
0830 *
0831 * @param regex
0832 * The expression to be compiled
0833 *
0834 * @throws PatternSyntaxException
0835 * If the expression's syntax is invalid
0836 */
0837 public static Pattern compile(String regex) {
0838 return new Pattern(regex, 0);
0839 }
0840
0841 /**
0842 * Compiles the given regular expression into a pattern with the given
0843 * flags. </p>
0844 *
0845 * @param regex
0846 * The expression to be compiled
0847 *
0848 * @param flags
0849 * Match flags, a bit mask that may include
0850 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
0851 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
0852 * {@link #LITERAL} and {@link #COMMENTS}
0853 *
0854 * @throws IllegalArgumentException
0855 * If bit values other than those corresponding to the defined
0856 * match flags are set in <tt>flags</tt>
0857 *
0858 * @throws PatternSyntaxException
0859 * If the expression's syntax is invalid
0860 */
0861 public static Pattern compile(String regex, int flags) {
0862 return new Pattern(regex, flags);
0863 }
0864
0865 /**
0866 * Returns the regular expression from which this pattern was compiled.
0867 * </p>
0868 *
0869 * @return The source of this pattern
0870 */
0871 public String pattern() {
0872 return pattern;
0873 }
0874
0875 /**
0876 * <p>Returns the string representation of this pattern. This
0877 * is the regular expression from which this pattern was
0878 * compiled.</p>
0879 *
0880 * @return The string representation of this pattern
0881 * @since 1.5
0882 */
0883 public String toString() {
0884 return pattern;
0885 }
0886
0887 /**
0888 * Creates a matcher that will match the given input against this pattern.
0889 * </p>
0890 *
0891 * @param input
0892 * The character sequence to be matched
0893 *
0894 * @return A new matcher for this pattern
0895 */
0896 public Matcher matcher(CharSequence input) {
0897 if (!compiled) {
0898 synchronized (this ) {
0899 if (!compiled)
0900 compile();
0901 }
0902 }
0903 Matcher m = new Matcher(this , input);
0904 return m;
0905 }
0906
0907 /**
0908 * Returns this pattern's match flags. </p>
0909 *
0910 * @return The match flags specified when this pattern was compiled
0911 */
0912 public int flags() {
0913 return flags;
0914 }
0915
0916 /**
0917 * Compiles the given regular expression and attempts to match the given
0918 * input against it.
0919 *
0920 * <p> An invocation of this convenience method of the form
0921 *
0922 * <blockquote><pre>
0923 * Pattern.matches(regex, input);</pre></blockquote>
0924 *
0925 * behaves in exactly the same way as the expression
0926 *
0927 * <blockquote><pre>
0928 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
0929 *
0930 * <p> If a pattern is to be used multiple times, compiling it once and reusing
0931 * it will be more efficient than invoking this method each time. </p>
0932 *
0933 * @param regex
0934 * The expression to be compiled
0935 *
0936 * @param input
0937 * The character sequence to be matched
0938 *
0939 * @throws PatternSyntaxException
0940 * If the expression's syntax is invalid
0941 */
0942 public static boolean matches(String regex, CharSequence input) {
0943 Pattern p = Pattern.compile(regex);
0944 Matcher m = p.matcher(input);
0945 return m.matches();
0946 }
0947
0948 /**
0949 * Splits the given input sequence around matches of this pattern.
0950 *
0951 * <p> The array returned by this method contains each substring of the
0952 * input sequence that is terminated by another subsequence that matches
0953 * this pattern or is terminated by the end of the input sequence. The
0954 * substrings in the array are in the order in which they occur in the
0955 * input. If this pattern does not match any subsequence of the input then
0956 * the resulting array has just one element, namely the input sequence in
0957 * string form.
0958 *
0959 * <p> The <tt>limit</tt> parameter controls the number of times the
0960 * pattern is applied and therefore affects the length of the resulting
0961 * array. If the limit <i>n</i> is greater than zero then the pattern
0962 * will be applied at most <i>n</i> - 1 times, the array's
0963 * length will be no greater than <i>n</i>, and the array's last entry
0964 * will contain all input beyond the last matched delimiter. If <i>n</i>
0965 * is non-positive then the pattern will be applied as many times as
0966 * possible and the array can have any length. If <i>n</i> is zero then
0967 * the pattern will be applied as many times as possible, the array can
0968 * have any length, and trailing empty strings will be discarded.
0969 *
0970 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
0971 * results with these parameters:
0972 *
0973 * <blockquote><table cellpadding=1 cellspacing=0
0974 * summary="Split examples showing regex, limit, and result">
0975 * <tr><th><P align="left"><i>Regex </i></th>
0976 * <th><P align="left"><i>Limit </i></th>
0977 * <th><P align="left"><i>Result </i></th></tr>
0978 * <tr><td align=center>:</td>
0979 * <td align=center>2</td>
0980 * <td><tt>{ "boo", "and:foo" }</tt></td></tr>
0981 * <tr><td align=center>:</td>
0982 * <td align=center>5</td>
0983 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
0984 * <tr><td align=center>:</td>
0985 * <td align=center>-2</td>
0986 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
0987 * <tr><td align=center>o</td>
0988 * <td align=center>5</td>
0989 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
0990 * <tr><td align=center>o</td>
0991 * <td align=center>-2</td>
0992 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
0993 * <tr><td align=center>o</td>
0994 * <td align=center>0</td>
0995 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
0996 * </table></blockquote>
0997 *
0998 *
0999 * @param input
1000 * The character sequence to be split
1001 *
1002 * @param limit
1003 * The result threshold, as described above
1004 *
1005 * @return The array of strings computed by splitting the input
1006 * around matches of this pattern
1007 */
1008 public String[] split(CharSequence input, int limit) {
1009 int index = 0;
1010 boolean matchLimited = limit > 0;
1011 ArrayList<String> matchList = new ArrayList<String>();
1012 Matcher m = matcher(input);
1013
1014 // Add segments before each match found
1015 while (m.find()) {
1016 if (!matchLimited || matchList.size() < limit - 1) {
1017 String match = input.subSequence(index, m.start())
1018 .toString();
1019 matchList.add(match);
1020 index = m.end();
1021 } else if (matchList.size() == limit - 1) { // last one
1022 String match = input.subSequence(index, input.length())
1023 .toString();
1024 matchList.add(match);
1025 index = m.end();
1026 }
1027 }
1028
1029 // If no match was found, return this
1030 if (index == 0)
1031 return new String[] { input.toString() };
1032
1033 // Add remaining segment
1034 if (!matchLimited || matchList.size() < limit)
1035 matchList.add(input.subSequence(index, input.length())
1036 .toString());
1037
1038 // Construct result
1039 int resultSize = matchList.size();
1040 if (limit == 0)
1041 while (resultSize > 0
1042 && matchList.get(resultSize - 1).equals(""))
1043 resultSize--;
1044 String[] result = new String[resultSize];
1045 return matchList.subList(0, resultSize).toArray(result);
1046 }
1047
1048 /**
1049 * Splits the given input sequence around matches of this pattern.
1050 *
1051 * <p> This method works as if by invoking the two-argument {@link
1052 * #split(java.lang.CharSequence, int) split} method with the given input
1053 * sequence and a limit argument of zero. Trailing empty strings are
1054 * therefore not included in the resulting array. </p>
1055 *
1056 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
1057 * results with these expressions:
1058 *
1059 * <blockquote><table cellpadding=1 cellspacing=0
1060 * summary="Split examples showing regex and result">
1061 * <tr><th><P align="left"><i>Regex </i></th>
1062 * <th><P align="left"><i>Result</i></th></tr>
1063 * <tr><td align=center>:</td>
1064 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
1065 * <tr><td align=center>o</td>
1066 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
1067 * </table></blockquote>
1068 *
1069 *
1070 * @param input
1071 * The character sequence to be split
1072 *
1073 * @return The array of strings computed by splitting the input
1074 * around matches of this pattern
1075 */
1076 public String[] split(CharSequence input) {
1077 return split(input, 0);
1078 }
1079
1080 /**
1081 * Returns a literal pattern <code>String</code> for the specified
1082 * <code>String</code>.
1083 *
1084 * <p>This method produces a <code>String</code> that can be used to
1085 * create a <code>Pattern</code> that would match the string
1086 * <code>s</code> as if it were a literal pattern.</p> Metacharacters
1087 * or escape sequences in the input sequence will be given no special
1088 * meaning.
1089 *
1090 * @param s The string to be literalized
1091 * @return A literal string replacement
1092 * @since 1.5
1093 */
1094 public static String quote(String s) {
1095 int slashEIndex = s.indexOf("\\E");
1096 if (slashEIndex == -1)
1097 return "\\Q" + s + "\\E";
1098
1099 StringBuilder sb = new StringBuilder(s.length() * 2);
1100 sb.append("\\Q");
1101 slashEIndex = 0;
1102 int current = 0;
1103 while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
1104 sb.append(s.substring(current, slashEIndex));
1105 current = slashEIndex + 2;
1106 sb.append("\\E\\\\E\\Q");
1107 }
1108 sb.append(s.substring(current, s.length()));
1109 sb.append("\\E");
1110 return sb.toString();
1111 }
1112
1113 /**
1114 * Recompile the Pattern instance from a stream. The original pattern
1115 * string is read in and the object tree is recompiled from it.
1116 */
1117 private void readObject(java.io.ObjectInputStream s)
1118 throws java.io.IOException, ClassNotFoundException {
1119
1120 // Read in all fields
1121 s.defaultReadObject();
1122
1123 // Initialize counts
1124 capturingGroupCount = 1;
1125 localCount = 0;
1126
1127 // if length > 0, the Pattern is lazily compiled
1128 compiled = false;
1129 if (pattern.length() == 0) {
1130 root = new Start(lastAccept);
1131 matchRoot = lastAccept;
1132 compiled = true;
1133 }
1134 }
1135
1136 /**
1137 * This private constructor is used to create all Patterns. The pattern
1138 * string and match flags are all that is needed to completely describe
1139 * a Pattern. An empty pattern string results in an object tree with
1140 * only a Start node and a LastNode node.
1141 */
1142 private Pattern(String p, int f) {
1143 pattern = p;
1144 flags = f;
1145
1146 // Reset group index count
1147 capturingGroupCount = 1;
1148 localCount = 0;
1149
1150 if (pattern.length() > 0) {
1151 compile();
1152 } else {
1153 root = new Start(lastAccept);
1154 matchRoot = lastAccept;
1155 }
1156 }
1157
1158 /**
1159 * The pattern is converted to normalizedD form and then a pure group
1160 * is constructed to match canonical equivalences of the characters.
1161 */
1162 private void normalize() {
1163 boolean inCharClass = false;
1164 int lastCodePoint = -1;
1165
1166 // Convert pattern into normalizedD form
1167 normalizedPattern = Normalizer.normalize(pattern,
1168 Normalizer.Form.NFD);
1169 patternLength = normalizedPattern.length();
1170
1171 // Modify pattern to match canonical equivalences
1172 StringBuilder newPattern = new StringBuilder(patternLength);
1173 for (int i = 0; i < patternLength;) {
1174 int c = normalizedPattern.codePointAt(i);
1175 StringBuilder sequenceBuffer;
1176 if ((Character.getType(c) == Character.NON_SPACING_MARK)
1177 && (lastCodePoint != -1)) {
1178 sequenceBuffer = new StringBuilder();
1179 sequenceBuffer.appendCodePoint(lastCodePoint);
1180 sequenceBuffer.appendCodePoint(c);
1181 while (Character.getType(c) == Character.NON_SPACING_MARK) {
1182 i += Character.charCount(c);
1183 if (i >= patternLength)
1184 break;
1185 c = normalizedPattern.codePointAt(i);
1186 sequenceBuffer.appendCodePoint(c);
1187 }
1188 String ea = produceEquivalentAlternation(sequenceBuffer
1189 .toString());
1190 newPattern.setLength(newPattern.length()
1191 - Character.charCount(lastCodePoint));
1192 newPattern.append("(?:").append(ea).append(")");
1193 } else if (c == '[' && lastCodePoint != '\\') {
1194 i = normalizeCharClass(newPattern, i);
1195 } else {
1196 newPattern.appendCodePoint(c);
1197 }
1198 lastCodePoint = c;
1199 i += Character.charCount(c);
1200 }
1201 normalizedPattern = newPattern.toString();
1202 }
1203
1204 /**
1205 * Complete the character class being parsed and add a set
1206 * of alternations to it that will match the canonical equivalences
1207 * of the characters within the class.
1208 */
1209 private int normalizeCharClass(StringBuilder newPattern, int i) {
1210 StringBuilder charClass = new StringBuilder();
1211 StringBuilder eq = null;
1212 int lastCodePoint = -1;
1213 String result;
1214
1215 i++;
1216 charClass.append("[");
1217 while (true) {
1218 int c = normalizedPattern.codePointAt(i);
1219 StringBuilder sequenceBuffer;
1220
1221 if (c == ']' && lastCodePoint != '\\') {
1222 charClass.append((char) c);
1223 break;
1224 } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
1225 sequenceBuffer = new StringBuilder();
1226 sequenceBuffer.appendCodePoint(lastCodePoint);
1227 while (Character.getType(c) == Character.NON_SPACING_MARK) {
1228 sequenceBuffer.appendCodePoint(c);
1229 i += Character.charCount(c);
1230 if (i >= normalizedPattern.length())
1231 break;
1232 c = normalizedPattern.codePointAt(i);
1233 }
1234 String ea = produceEquivalentAlternation(sequenceBuffer
1235 .toString());
1236
1237 charClass.setLength(charClass.length()
1238 - Character.charCount(lastCodePoint));
1239 if (eq == null)
1240 eq = new StringBuilder();
1241 eq.append('|');
1242 eq.append(ea);
1243 } else {
1244 charClass.appendCodePoint(c);
1245 i++;
1246 }
1247 if (i == normalizedPattern.length())
1248 throw error("Unclosed character class");
1249 lastCodePoint = c;
1250 }
1251
1252 if (eq != null) {
1253 result = "(?:" + charClass.toString() + eq.toString() + ")";
1254 } else {
1255 result = charClass.toString();
1256 }
1257
1258 newPattern.append(result);
1259 return i;
1260 }
1261
1262 /**
1263 * Given a specific sequence composed of a regular character and
1264 * combining marks that follow it, produce the alternation that will
1265 * match all canonical equivalences of that sequence.
1266 */
1267 private String produceEquivalentAlternation(String source) {
1268 int len = countChars(source, 0, 1);
1269 if (source.length() == len)
1270 // source has one character.
1271 return source;
1272
1273 String base = source.substring(0, len);
1274 String combiningMarks = source.substring(len);
1275
1276 String[] perms = producePermutations(combiningMarks);
1277 StringBuilder result = new StringBuilder(source);
1278
1279 // Add combined permutations
1280 for (int x = 0; x < perms.length; x++) {
1281 String next = base + perms[x];
1282 if (x > 0)
1283 result.append("|" + next);
1284 next = composeOneStep(next);
1285 if (next != null)
1286 result.append("|" + produceEquivalentAlternation(next));
1287 }
1288 return result.toString();
1289 }
1290
1291 /**
1292 * Returns an array of strings that have all the possible
1293 * permutations of the characters in the input string.
1294 * This is used to get a list of all possible orderings
1295 * of a set of combining marks. Note that some of the permutations
1296 * are invalid because of combining class collisions, and these
1297 * possibilities must be removed because they are not canonically
1298 * equivalent.
1299 */
1300 private String[] producePermutations(String input) {
1301 if (input.length() == countChars(input, 0, 1))
1302 return new String[] { input };
1303
1304 if (input.length() == countChars(input, 0, 2)) {
1305 int c0 = Character.codePointAt(input, 0);
1306 int c1 = Character.codePointAt(input, Character
1307 .charCount(c0));
1308 if (getClass(c1) == getClass(c0)) {
1309 return new String[] { input };
1310 }
1311 String[] result = new String[2];
1312 result[0] = input;
1313 StringBuilder sb = new StringBuilder(2);
1314 sb.appendCodePoint(c1);
1315 sb.appendCodePoint(c0);
1316 result[1] = sb.toString();
1317 return result;
1318 }
1319
1320 int length = 1;
1321 int nCodePoints = countCodePoints(input);
1322 for (int x = 1; x < nCodePoints; x++)
1323 length = length * (x + 1);
1324
1325 String[] temp = new String[length];
1326
1327 int combClass[] = new int[nCodePoints];
1328 for (int x = 0, i = 0; x < nCodePoints; x++) {
1329 int c = Character.codePointAt(input, i);
1330 combClass[x] = getClass(c);
1331 i += Character.charCount(c);
1332 }
1333
1334 // For each char, take it out and add the permutations
1335 // of the remaining chars
1336 int index = 0;
1337 int len;
1338 // offset maintains the index in code units.
1339 loop: for (int x = 0, offset = 0; x < nCodePoints; x++, offset += len) {
1340 len = countChars(input, offset, 1);
1341 boolean skip = false;
1342 for (int y = x - 1; y >= 0; y--) {
1343 if (combClass[y] == combClass[x]) {
1344 continue loop;
1345 }
1346 }
1347 StringBuilder sb = new StringBuilder(input);
1348 String otherChars = sb.delete(offset, offset + len)
1349 .toString();
1350 String[] subResult = producePermutations(otherChars);
1351
1352 String prefix = input.substring(offset, offset + len);
1353 for (int y = 0; y < subResult.length; y++)
1354 temp[index++] = prefix + subResult[y];
1355 }
1356 String[] result = new String[index];
1357 for (int x = 0; x < index; x++)
1358 result[x] = temp[x];
1359 return result;
1360 }
1361
1362 private int getClass(int c) {
1363 return sun.text.Normalizer.getCombiningClass(c);
1364 }
1365
1366 /**
1367 * Attempts to compose input by combining the first character
1368 * with the first combining mark following it. Returns a String
1369 * that is the composition of the leading character with its first
1370 * combining mark followed by the remaining combining marks. Returns
1371 * null if the first two characters cannot be further composed.
1372 */
1373 private String composeOneStep(String input) {
1374 int len = countChars(input, 0, 2);
1375 String firstTwoCharacters = input.substring(0, len);
1376 String result = Normalizer.normalize(firstTwoCharacters,
1377 Normalizer.Form.NFC);
1378
1379 if (result.equals(firstTwoCharacters))
1380 return null;
1381 else {
1382 String remainder = input.substring(len);
1383 return result + remainder;
1384 }
1385 }
1386
1387 /**
1388 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1389 * See the description of `quotemeta' in perlfunc(1).
1390 */
1391 private void RemoveQEQuoting() {
1392 final int pLen = patternLength;
1393 int i = 0;
1394 while (i < pLen - 1) {
1395 if (temp[i] != '\\')
1396 i += 1;
1397 else if (temp[i + 1] != 'Q')
1398 i += 2;
1399 else
1400 break;
1401 }
1402 if (i >= pLen - 1) // No \Q sequence found
1403 return;
1404 int j = i;
1405 i += 2;
1406 int[] newtemp = new int[j + 2 * (pLen - i) + 2];
1407 System.arraycopy(temp, 0, newtemp, 0, j);
1408
1409 boolean inQuote = true;
1410 while (i < pLen) {
1411 int c = temp[i++];
1412 if (!ASCII.isAscii(c) || ASCII.isAlnum(c)) {
1413 newtemp[j++] = c;
1414 } else if (c != '\\') {
1415 if (inQuote)
1416 newtemp[j++] = '\\';
1417 newtemp[j++] = c;
1418 } else if (inQuote) {
1419 if (temp[i] == 'E') {
1420 i++;
1421 inQuote = false;
1422 } else {
1423 newtemp[j++] = '\\';
1424 newtemp[j++] = '\\';
1425 }
1426 } else {
1427 if (temp[i] == 'Q') {
1428 i++;
1429 inQuote = true;
1430 } else {
1431 newtemp[j++] = c;
1432 if (i != pLen)
1433 newtemp[j++] = temp[i++];
1434 }
1435 }
1436 }
1437
1438 patternLength = j;
1439 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1440 }
1441
1442 /**
1443 * Copies regular expression to an int array and invokes the parsing
1444 * of the expression which will create the object tree.
1445 */
1446 private void compile() {
1447 // Handle canonical equivalences
1448 if (has(CANON_EQ) && !has(LITERAL)) {
1449 normalize();
1450 } else {
1451 normalizedPattern = pattern;
1452 }
1453 patternLength = normalizedPattern.length();
1454
1455 // Copy pattern to int array for convenience
1456 // Use double zero to terminate pattern
1457 temp = new int[patternLength + 2];
1458
1459 boolean hasSupplementary = false;
1460 int c, count = 0;
1461 // Convert all chars into code points
1462 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1463 c = normalizedPattern.codePointAt(x);
1464 if (isSupplementary(c)) {
1465 hasSupplementary = true;
1466 }
1467 temp[count++] = c;
1468 }
1469
1470 patternLength = count; // patternLength now in code points
1471
1472 if (!has(LITERAL))
1473 RemoveQEQuoting();
1474
1475 // Allocate all temporary objects here.
1476 buffer = new int[32];
1477 groupNodes = new GroupHead[10];
1478
1479 if (has(LITERAL)) {
1480 // Literal pattern handling
1481 matchRoot = newSlice(temp, patternLength, hasSupplementary);
1482 matchRoot.next = lastAccept;
1483 } else {
1484 // Start recursive descent parsing
1485 matchRoot = expr(lastAccept);
1486 // Check extra pattern characters
1487 if (patternLength != cursor) {
1488 if (peek() == ')') {
1489 throw error("Unmatched closing ')'");
1490 } else {
1491 throw error("Unexpected internal error");
1492 }
1493 }
1494 }
1495
1496 // Peephole optimization
1497 if (matchRoot instanceof Slice) {
1498 root = BnM.optimize(matchRoot);
1499 if (root == matchRoot) {
1500 root = hasSupplementary ? new StartS(matchRoot)
1501 : new Start(matchRoot);
1502 }
1503 } else if (matchRoot instanceof Begin
1504 || matchRoot instanceof First) {
1505 root = matchRoot;
1506 } else {
1507 root = hasSupplementary ? new StartS(matchRoot)
1508 : new Start(matchRoot);
1509 }
1510
1511 // Release temporary storage
1512 temp = null;
1513 buffer = null;
1514 groupNodes = null;
1515 patternLength = 0;
1516 compiled = true;
1517 }
1518
1519 /**
1520 * Used to print out a subtree of the Pattern to help with debugging.
1521 */
1522 private static void printObjectTree(Node node) {
1523 while (node != null) {
1524 if (node instanceof Prolog) {
1525 System.out.println(node);
1526 printObjectTree(((Prolog) node).loop);
1527 System.out.println("**** end contents prolog loop");
1528 } else if (node instanceof Loop) {
1529 System.out.println(node);
1530 printObjectTree(((Loop) node).body);
1531 System.out.println("**** end contents Loop body");
1532 } else if (node instanceof Curly) {
1533 System.out.println(node);
1534 printObjectTree(((Curly) node).atom);
1535 System.out.println("**** end contents Curly body");
1536 } else if (node instanceof GroupCurly) {
1537 System.out.println(node);
1538 printObjectTree(((GroupCurly) node).atom);
1539 System.out.println("**** end contents GroupCurly body");
1540 } else if (node instanceof GroupTail) {
1541 System.out.println(node);
1542 System.out.println("Tail next is " + node.next);
1543 return;
1544 } else {
1545 System.out.println(node);
1546 }
1547 node = node.next;
1548 if (node != null)
1549 System.out.println("->next:");
1550 if (node == Pattern.accept) {
1551 System.out.println("Accept Node");
1552 node = null;
1553 }
1554 }
1555 }
1556
1557 /**
1558 * Used to accumulate information about a subtree of the object graph
1559 * so that optimizations can be applied to the subtree.
1560 */
1561 static final class TreeInfo {
1562 int minLength;
1563 int maxLength;
1564 boolean maxValid;
1565 boolean deterministic;
1566
1567 TreeInfo() {
1568 reset();
1569 }
1570
1571 void reset() {
1572 minLength = 0;
1573 maxLength = 0;
1574 maxValid = true;
1575 deterministic = true;
1576 }
1577 }
1578
1579 /*
1580 * The following private methods are mainly used to improve the
1581 * readability of the code. In order to let the Java compiler easily
1582 * inline them, we should not put many assertions or error checks in them.
1583 */
1584
1585 /**
1586 * Indicates whether a particular flag is set or not.
1587 */
1588 private boolean has(int f) {
1589 return (flags & f) != 0;
1590 }
1591
1592 /**
1593 * Match next character, signal error if failed.
1594 */
1595 private void accept(int ch, String s) {
1596 int testChar = temp[cursor++];
1597 if (has(COMMENTS))
1598 testChar = parsePastWhitespace(testChar);
1599 if (ch != testChar) {
1600 throw error(s);
1601 }
1602 }
1603
1604 /**
1605 * Mark the end of pattern with a specific character.
1606 */
1607 private void mark(int c) {
1608 temp[patternLength] = c;
1609 }
1610
1611 /**
1612 * Peek the next character, and do not advance the cursor.
1613 */
1614 private int peek() {
1615 int ch = temp[cursor];
1616 if (has(COMMENTS))
1617 ch = peekPastWhitespace(ch);
1618 return ch;
1619 }
1620
1621 /**
1622 * Read the next character, and advance the cursor by one.
1623 */
1624 private int read() {
1625 int ch = temp[cursor++];
1626 if (has(COMMENTS))
1627 ch = parsePastWhitespace(ch);
1628 return ch;
1629 }
1630
1631 /**
1632 * Read the next character, and advance the cursor by one,
1633 * ignoring the COMMENTS setting
1634 */
1635 private int readEscaped() {
1636 int ch = temp[cursor++];
1637 return ch;
1638 }
1639
1640 /**
1641 * Advance the cursor by one, and peek the next character.
1642 */
1643 private int next() {
1644 int ch = temp[++cursor];
1645 if (has(COMMENTS))
1646 ch = peekPastWhitespace(ch);
1647 return ch;
1648 }
1649
1650 /**
1651 * Advance the cursor by one, and peek the next character,
1652 * ignoring the COMMENTS setting
1653 */
1654 private int nextEscaped() {
1655 int ch = temp[++cursor];
1656 return ch;
1657 }
1658
1659 /**
1660 * If in xmode peek past whitespace and comments.
1661 */
1662 private int peekPastWhitespace(int ch) {
1663 while (ASCII.isSpace(ch) || ch == '#') {
1664 while (ASCII.isSpace(ch))
1665 ch = temp[++cursor];
1666 if (ch == '#') {
1667 ch = peekPastLine();
1668 }
1669 }
1670 return ch;
1671 }
1672
1673 /**
1674 * If in xmode parse past whitespace and comments.
1675 */
1676 private int parsePastWhitespace(int ch) {
1677 while (ASCII.isSpace(ch) || ch == '#') {
1678 while (ASCII.isSpace(ch))
1679 ch = temp[cursor++];
1680 if (ch == '#')
1681 ch = parsePastLine();
1682 }
1683 return ch;
1684 }
1685
1686 /**
1687 * xmode parse past comment to end of line.
1688 */
1689 private int parsePastLine() {
1690 int ch = temp[cursor++];
1691 while (ch != 0 && !isLineSeparator(ch))
1692 ch = temp[cursor++];
1693 return ch;
1694 }
1695
1696 /**
1697 * xmode peek past comment to end of line.
1698 */
1699 private int peekPastLine() {
1700 int ch = temp[++cursor];
1701 while (ch != 0 && !isLineSeparator(ch))
1702 ch = temp[++cursor];
1703 return ch;
1704 }
1705
1706 /**
1707 * Determines if character is a line separator in the current mode
1708 */
1709 private boolean isLineSeparator(int ch) {
1710 if (has(UNIX_LINES)) {
1711 return ch == '\n';
1712 } else {
1713 return (ch == '\n' || ch == '\r' || (ch | 1) == '\u2029' || ch == '\u0085');
1714 }
1715 }
1716
1717 /**
1718 * Read the character after the next one, and advance the cursor by two.
1719 */
1720 private int skip() {
1721 int i = cursor;
1722 int ch = temp[i + 1];
1723 cursor = i + 2;
1724 return ch;
1725 }
1726
1727 /**
1728 * Unread one next character, and retreat cursor by one.
1729 */
1730 private void unread() {
1731 cursor--;
1732 }
1733
1734 /**
1735 * Internal method used for handling all syntax errors. The pattern is
1736 * displayed with a pointer to aid in locating the syntax error.
1737 */
1738 private PatternSyntaxException error(String s) {
1739 return new PatternSyntaxException(s, normalizedPattern,
1740 cursor - 1);
1741 }
1742
1743 /**
1744 * Determines if there is any supplementary character or unpaired
1745 * surrogate in the specified range.
1746 */
1747 private boolean findSupplementary(int start, int end) {
1748 for (int i = start; i < end; i++) {
1749 if (isSupplementary(temp[i]))
1750 return true;
1751 }
1752 return false;
1753 }
1754
1755 /**
1756 * Determines if the specified code point is a supplementary
1757 * character or unpaired surrogate.
1758 */
1759 private static final boolean isSupplementary(int ch) {
1760 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT
1761 || isSurrogate(ch);
1762 }
1763
1764 /**
1765 * The following methods handle the main parsing. They are sorted
1766 * according to their precedence order, the lowest one first.
1767 */
1768
1769 /**
1770 * The expression is parsed with branch nodes added for alternations.
1771 * This may be called recursively to parse sub expressions that may
1772 * contain alternations.
1773 */
1774 private Node expr(Node end) {
1775 Node prev = null;
1776 Node firstTail = null;
1777 Node branchConn = null;
1778
1779 for (;;) {
1780 Node node = sequence(end);
1781 Node nodeTail = root; //double return
1782 if (prev == null) {
1783 prev = node;
1784 firstTail = nodeTail;
1785 } else {
1786 // Branch
1787 if (branchConn == null) {
1788 branchConn = new BranchConn();
1789 branchConn.next = end;
1790 }
1791 if (node == end) {
1792 // if the node returned from sequence() is "end"
1793 // we have an empty expr, set a null atom into
1794 // the branch to indicate to go "next" directly.
1795 node = null;
1796 } else {
1797 // the "tail.next" of each atom goes to branchConn
1798 nodeTail.next = branchConn;
1799 }
1800 if (prev instanceof Branch) {
1801 ((Branch) prev).add(node);
1802 } else {
1803 if (prev == end) {
1804 prev = null;
1805 } else {
1806 // replace the "end" with "branchConn" at its tail.next
1807 // when put the "prev" into the branch as the first atom.
1808 firstTail.next = branchConn;
1809 }
1810 prev = new Branch(prev, node, branchConn);
1811 }
1812 }
1813 if (peek() != '|') {
1814 return prev;
1815 }
1816 next();
1817 }
1818 }
1819
1820 /**
1821 * Parsing of sequences between alternations.
1822 */
1823 private Node sequence(Node end) {
1824 Node head = null;
1825 Node tail = null;
1826 Node node = null;
1827 LOOP: for (;;) {
1828 int ch = peek();
1829 switch (ch) {
1830 case '(':
1831 // Because group handles its own closure,
1832 // we need to treat it differently
1833 node = group0();
1834 // Check for comment or flag group
1835 if (node == null)
1836 continue;
1837 if (head == null)
1838 head = node;
1839 else
1840 tail.next = node;
1841 // Double return: Tail was returned in root
1842 tail = root;
1843 continue;
1844 case '[':
1845 node = clazz(true);
1846 break;
1847 case '\\':
1848 ch = nextEscaped();
1849 if (ch == 'p' || ch == 'P') {
1850 boolean oneLetter = true;
1851 boolean comp = (ch == 'P');
1852 ch = next(); // Consume { if present
1853 if (ch != '{') {
1854 unread();
1855 } else {
1856 oneLetter = false;
1857 }
1858 node = family(oneLetter).maybeComplement(comp);
1859 } else {
1860 unread();
1861 node = atom();
1862 }
1863 break;
1864 case '^':
1865 next();
1866 if (has(MULTILINE)) {
1867 if (has(UNIX_LINES))
1868 node = new UnixCaret();
1869 else
1870 node = new Caret();
1871 } else {
1872 node = new Begin();
1873 }
1874 break;
1875 case '$':
1876 next();
1877 if (has(UNIX_LINES))
1878 node = new UnixDollar(has(MULTILINE));
1879 else
1880 node = new Dollar(has(MULTILINE));
1881 break;
1882 case '.':
1883 next();
1884 if (has(DOTALL)) {
1885 node = new All();
1886 } else {
1887 if (has(UNIX_LINES))
1888 node = new UnixDot();
1889 else {
1890 node = new Dot();
1891 }
1892 }
1893 break;
1894 case '|':
1895 case ')':
1896 break LOOP;
1897 case ']': // Now interpreting dangling ] and } as literals
1898 case '}':
1899 node = atom();
1900 break;
1901 case '?':
1902 case '*':
1903 case '+':
1904 next();
1905 throw error("Dangling meta character '" + ((char) ch)
1906 + "'");
1907 case 0:
1908 if (cursor >= patternLength) {
1909 break LOOP;
1910 }
1911 // Fall through
1912 default:
1913 node = atom();
1914 break;
1915 }
1916
1917 node = closure(node);
1918
1919 if (head == null) {
1920 head = tail = node;
1921 } else {
1922 tail.next = node;
1923 tail = node;
1924 }
1925 }
1926 if (head == null) {
1927 return end;
1928 }
1929 tail.next = end;
1930 root = tail; //double return
1931 return head;
1932 }
1933
1934 /**
1935 * Parse and add a new Single or Slice.
1936 */
1937 private Node atom() {
1938 int first = 0;
1939 int prev = -1;
1940 boolean hasSupplementary = false;
1941 int ch = peek();
1942 for (;;) {
1943 switch (ch) {
1944 case '*':
1945 case '+':
1946 case '?':
1947 case '{':
1948 if (first > 1) {
1949 cursor = prev; // Unwind one character
1950 first--;
1951 }
1952 break;
1953 case '$':
1954 case '.':
1955 case '^':
1956 case '(':
1957 case '[':
1958 case '|':
1959 case ')':
1960 break;
1961 case '\\':
1962 ch = nextEscaped();
1963 if (ch == 'p' || ch == 'P') { // Property
1964 if (first > 0) { // Slice is waiting; handle it first
1965 unread();
1966 break;
1967 } else { // No slice; just return the family node
1968 boolean comp = (ch == 'P');
1969 boolean oneLetter = true;
1970 ch = next(); // Consume { if present
1971 if (ch != '{')
1972 unread();
1973 else
1974 oneLetter = false;
1975 return family(oneLetter).maybeComplement(comp);
1976 }
1977 }
1978 unread();
1979 prev = cursor;
1980 ch = escape(false, first == 0);
1981 if (ch >= 0) {
1982 append(ch, first);
1983 first++;
1984 if (isSupplementary(ch)) {
1985 hasSupplementary = true;
1986 }
1987 ch = peek();
1988 continue;
1989 } else if (first == 0) {
1990 return root;
1991 }
1992 // Unwind meta escape sequence
1993 cursor = prev;
1994 break;
1995 case 0:
1996 if (cursor >= patternLength) {
1997 break;
1998 }
1999 // Fall through
2000 default:
2001 prev = cursor;
2002 append(ch, first);
2003 first++;
2004 if (isSupplementary(ch)) {
2005 hasSupplementary = true;
2006 }
2007 ch = next();
2008 continue;
2009 }
2010 break;
2011 }
2012 if (first == 1) {
2013 return newSingle(buffer[0]);
2014 } else {
2015 return newSlice(buffer, first, hasSupplementary);
2016 }
2017 }
2018
2019 private void append(int ch, int len) {
2020 if (len >= buffer.length) {
2021 int[] tmp = new int[len + len];
2022 System.arraycopy(buffer, 0, tmp, 0, len);
2023 buffer = tmp;
2024 }
2025 buffer[len] = ch;
2026 }
2027
2028 /**
2029 * Parses a backref greedily, taking as many numbers as it
2030 * can. The first digit is always treated as a backref, but
2031 * multi digit numbers are only treated as a backref if at
2032 * least that many backrefs exist at this point in the regex.
2033 */
2034 private Node ref(int refNum) {
2035 boolean done = false;
2036 while (!done) {
2037 int ch = peek();
2038 switch (ch) {
2039 case '0':
2040 case '1':
2041 case '2':
2042 case '3':
2043 case '4':
2044 case '5':
2045 case '6':
2046 case '7':
2047 case '8':
2048 case '9':
2049 int newRefNum = (refNum * 10) + (ch - '0');
2050 // Add another number if it doesn't make a group
2051 // that doesn't exist
2052 if (capturingGroupCount - 1 < newRefNum) {
2053 done = true;
2054 break;
2055 }
2056 refNum = newRefNum;
2057 read();
2058 break;
2059 default:
2060 done = true;
2061 break;
2062 }
2063 }
2064 if (has(CASE_INSENSITIVE))
2065 return new CIBackRef(refNum, has(UNICODE_CASE));
2066 else
2067 return new BackRef(refNum);
2068 }
2069
2070 /**
2071 * Parses an escape sequence to determine the actual value that needs
2072 * to be matched.
2073 * If -1 is returned and create was true a new object was added to the tree
2074 * to handle the escape sequence.
2075 * If the returned value is greater than zero, it is the value that
2076 * matches the escape sequence.
2077 */
2078 private int escape(boolean inclass, boolean create) {
2079 int ch = skip();
2080 switch (ch) {
2081 case '0':
2082 return o();
2083 case '1':
2084 case '2':
2085 case '3':
2086 case '4':
2087 case '5':
2088 case '6':
2089 case '7':
2090 case '8':
2091 case '9':
2092 if (inclass)
2093 break;
2094 if (create) {
2095 root = ref((ch - '0'));
2096 }
2097 return -1;
2098 case 'A':
2099 if (inclass)
2100 break;
2101 if (create)
2102 root = new Begin();
2103 return -1;
2104 case 'B':
2105 if (inclass)
2106 break;
2107 if (create)
2108 root = new Bound(Bound.NONE);
2109 return -1;
2110 case 'C':
2111 break;
2112 case 'D':
2113 if (create)
2114 root = new Ctype(ASCII.DIGIT).complement();
2115 return -1;
2116 case 'E':
2117 case 'F':
2118 break;
2119 case 'G':
2120 if (inclass)
2121 break;
2122 if (create)
2123 root = new LastMatch();
2124 return -1;
2125 case 'H':
2126 case 'I':
2127 case 'J':
2128 case 'K':
2129 case 'L':
2130 case 'M':
2131 case 'N':
2132 case 'O':
2133 case 'P':
2134 case 'Q':
2135 case 'R':
2136 break;
2137 case 'S':
2138 if (create)
2139 root = new Ctype(ASCII.SPACE).complement();
2140 return -1;
2141 case 'T':
2142 case 'U':
2143 case 'V':
2144 break;
2145 case 'W':
2146 if (create)
2147 root = new Ctype(ASCII.WORD).complement();
2148 return -1;
2149 case 'X':
2150 case 'Y':
2151 break;
2152 case 'Z':
2153 if (inclass)
2154 break;
2155 if (create) {
2156 if (has(UNIX_LINES))
2157 root = new UnixDollar(false);
2158 else
2159 root = new Dollar(false);
2160 }
2161 return -1;
2162 case 'a':
2163 return '\007';
2164 case 'b':
2165 if (inclass)
2166 break;
2167 if (create)
2168 root = new Bound(Bound.BOTH);
2169 return -1;
2170 case 'c':
2171 return c();
2172 case 'd':
2173 if (create)
2174 root = new Ctype(ASCII.DIGIT);
2175 return -1;
2176 case 'e':
2177 return '\033';
2178 case 'f':
2179 return '\f';
2180 case 'g':
2181 case 'h':
2182 case 'i':
2183 case 'j':
2184 case 'k':
2185 case 'l':
2186 case 'm':
2187 break;
2188 case 'n':
2189 return '\n';
2190 case 'o':
2191 case 'p':
2192 case 'q':
2193 break;
2194 case 'r':
2195 return '\r';
2196 case 's':
2197 if (create)
2198 root = new Ctype(ASCII.SPACE);
2199 return -1;
2200 case 't':
2201 return '\t';
2202 case 'u':
2203 return u();
2204 case 'v':
2205 return '\013';
2206 case 'w':
2207 if (create)
2208 root = new Ctype(ASCII.WORD);
2209 return -1;
2210 case 'x':
2211 return x();
2212 case 'y':
2213 break;
2214 case 'z':
2215 if (inclass)
2216 break;
2217 if (create)
2218 root = new End();
2219 return -1;
2220 default:
2221 return ch;
2222 }
2223 throw error("Illegal/unsupported escape sequence");
2224 }
2225
2226 /**
2227 * Parse a character class, and return the node that matches it.
2228 *
2229 * Consumes a ] on the way out if consume is true. Usually consume
2230 * is true except for the case of [abc&&def] where def is a separate
2231 * right hand node with "understood" brackets.
2232 */
2233 private CharProperty clazz(boolean consume) {
2234 CharProperty prev = null;
2235 CharProperty node = null;
2236 BitClass bits = new BitClass();
2237 boolean include = true;
2238 boolean firstInClass = true;
2239 int ch = next();
2240 for (;;) {
2241 switch (ch) {
2242 case '^':
2243 // Negates if first char in a class, otherwise literal
2244 if (firstInClass) {
2245 if (temp[cursor - 1] != '[')
2246 break;
2247 ch = next();
2248 include = !include;
2249 continue;
2250 } else {
2251 // ^ not first in class, treat as literal
2252 break;
2253 }
2254 case '[':
2255 firstInClass = false;
2256 node = clazz(true);
2257 if (prev == null)
2258 prev = node;
2259 else
2260 prev = union(prev, node);
2261 ch = peek();
2262 continue;
2263 case '&':
2264 firstInClass = false;
2265 ch = next();
2266 if (ch == '&') {
2267 ch = next();
2268 CharProperty rightNode = null;
2269 while (ch != ']' && ch != '&') {
2270 if (ch == '[') {
2271 if (rightNode == null)
2272 rightNode = clazz(true);
2273 else
2274 rightNode = union(rightNode,
2275 clazz(true));
2276 } else { // abc&&def
2277 unread();
2278 rightNode = clazz(false);
2279 }
2280 ch = peek();
2281 }
2282 if (rightNode != null)
2283 node = rightNode;
2284 if (prev == null) {
2285 if (rightNode == null)
2286 throw error("Bad class syntax");
2287 else
2288 prev = rightNode;
2289 } else {
2290 prev = intersection(prev, node);
2291 }
2292 } else {
2293 // treat as a literal &
2294 unread();
2295 break;
2296 }
2297 continue;
2298 case 0:
2299 firstInClass = false;
2300 if (cursor >= patternLength)
2301 throw error("Unclosed character class");
2302 break;
2303 case ']':
2304 firstInClass = false;
2305 if (prev != null) {
2306 if (consume)
2307 next();
2308 return prev;
2309 }
2310 break;
2311 default:
2312 firstInClass = false;
2313 break;
2314 }
2315 node = range(bits);
2316 if (include) {
2317 if (prev == null) {
2318 prev = node;
2319 } else {
2320 if (prev != node)
2321 prev = union(prev, node);
2322 }
2323 } else {
2324 if (prev == null) {
2325 prev = node.complement();
2326 } else {
2327 if (prev != node)
2328 prev = setDifference(prev, node);
2329 }
2330 }
2331 ch = peek();
2332 }
2333 }
2334
2335 private CharProperty bitsOrSingle(BitClass bits, int ch) {
2336 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2337 Use "single" node instead of bits when dealing with unicode
2338 case folding for codepoints listed below.
2339 (1)Uppercase out of range: u+00ff, u+00b5
2340 toUpperCase(u+00ff) -> u+0178
2341 toUpperCase(u+00b5) -> u+039c
2342 (2)LatinSmallLetterLongS u+17f
2343 toUpperCase(u+017f) -> u+0053
2344 (3)LatinSmallLetterDotlessI u+131
2345 toUpperCase(u+0131) -> u+0049
2346 (4)LatinCapitalLetterIWithDotAbove u+0130
2347 toLowerCase(u+0130) -> u+0069
2348 (5)KelvinSign u+212a
2349 toLowerCase(u+212a) ==> u+006B
2350 (6)AngstromSign u+212b
2351 toLowerCase(u+212b) ==> u+00e5
2352 */
2353 int d;
2354 if (ch < 256
2355 && !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && (ch == 0xff
2356 || ch == 0xb5 || ch == 0x49 || ch == 0x69 || //I and i
2357 ch == 0x53 || ch == 0x73 || //S and s
2358 ch == 0x4b || ch == 0x6b || //K and k
2359 ch == 0xc5 || ch == 0xe5))) //A+ring
2360 return bits.add(ch, flags());
2361 return newSingle(ch);
2362 }
2363
2364 /**
2365 * Parse a single character or a character range in a character class
2366 * and return its representative node.
2367 */
2368 private CharProperty range(BitClass bits) {
2369 int ch = peek();
2370 if (ch == '\\') {
2371 ch = nextEscaped();
2372 if (ch == 'p' || ch == 'P') { // A property
2373 boolean comp = (ch == 'P');
2374 boolean oneLetter = true;
2375 // Consume { if present
2376 ch = next();
2377 if (ch != '{')
2378 unread();
2379 else
2380 oneLetter = false;
2381 return family(oneLetter).maybeComplement(comp);
2382 } else { // ordinary escape
2383 unread();
2384 ch = escape(true, true);
2385 if (ch == -1)
2386 return (CharProperty) root;
2387 }
2388 } else {
2389 ch = single();
2390 }
2391 if (ch >= 0) {
2392 if (peek() == '-') {
2393 int endRange = temp[cursor + 1];
2394 if (endRange == '[') {
2395 return bitsOrSingle(bits, ch);
2396 }
2397 if (endRange != ']') {
2398 next();
2399 int m = single();
2400 if (m < ch)
2401 throw error("Illegal character range");
2402 if (has(CASE_INSENSITIVE))
2403 return caseInsensitiveRangeFor(ch, m);
2404 else
2405 return rangeFor(ch, m);
2406 }
2407 }
2408 return bitsOrSingle(bits, ch);
2409 }
2410 throw error("Unexpected character '" + ((char) ch) + "'");
2411 }
2412
2413 private int single() {
2414 int ch = peek();
2415 switch (ch) {
2416 case '\\':
2417 return escape(true, false);
2418 default:
2419 next();
2420 return ch;
2421 }
2422 }
2423
2424 /**
2425 * Parses a Unicode character family and returns its representative node.
2426 */
2427 private CharProperty family(boolean singleLetter) {
2428 next();
2429 String name;
2430
2431 if (singleLetter) {
2432 int c = temp[cursor];
2433 if (!Character.isSupplementaryCodePoint(c)) {
2434 name = String.valueOf((char) c);
2435 } else {
2436 name = new String(temp, cursor, 1);
2437 }
2438 read();
2439 } else {
2440 int i = cursor;
2441 mark('}');
2442 while (read() != '}') {
2443 }
2444 mark('\000');
2445 int j = cursor;
2446 if (j > patternLength)
2447 throw error("Unclosed character family");
2448 if (i + 1 >= j)
2449 throw error("Empty character family");
2450 name = new String(temp, i, j - i - 1);
2451 }
2452
2453 if (name.startsWith("In")) {
2454 return unicodeBlockPropertyFor(name.substring(2));
2455 } else {
2456 if (name.startsWith("Is"))
2457 name = name.substring(2);
2458 return charPropertyNodeFor(name);
2459 }
2460 }
2461
2462 /**
2463 * Returns a CharProperty matching all characters in a UnicodeBlock.
2464 */
2465 private CharProperty unicodeBlockPropertyFor(String name) {
2466 final Character.UnicodeBlock block;
2467 try {
2468 block = Character.UnicodeBlock.forName(name);
2469 } catch (IllegalArgumentException iae) {
2470 throw error("Unknown character block name {" + name + "}");
2471 }
2472 return new CharProperty() {
2473 boolean isSatisfiedBy(int ch) {
2474 return block == Character.UnicodeBlock.of(ch);
2475 }
2476 };
2477 }
2478
2479 /**
2480 * Returns a CharProperty matching all characters in a named property.
2481 */
2482 private CharProperty charPropertyNodeFor(String name) {
2483 CharProperty p = CharPropertyNames.charPropertyFor(name);
2484 if (p == null)
2485 throw error("Unknown character property name {" + name
2486 + "}");
2487 return p;
2488 }
2489
2490 /**
2491 * Parses a group and returns the head node of a set of nodes that process
2492 * the group. Sometimes a double return system is used where the tail is
2493 * returned in root.
2494 */
2495 private Node group0() {
2496 boolean capturingGroup = false;
2497 Node head = null;
2498 Node tail = null;
2499 int save = flags;
2500 root = null;
2501 int ch = next();
2502 if (ch == '?') {
2503 ch = skip();
2504 switch (ch) {
2505 case ':': // (?:xxx) pure group
2506 head = createGroup(true);
2507 tail = root;
2508 head.next = expr(tail);
2509 break;
2510 case '=': // (?=xxx) and (?!xxx) lookahead
2511 case '!':
2512 head = createGroup(true);
2513 tail = root;
2514 head.next = expr(tail);
2515 if (ch == '=') {
2516 head = tail = new Pos(head);
2517 } else {
2518 head = tail = new Neg(head);
2519 }
2520 break;
2521 case '>': // (?>xxx) independent group
2522 head = createGroup(true);
2523 tail = root;
2524 head.next = expr(tail);
2525 head = tail = new Ques(head, INDEPENDENT);
2526 break;
2527 case '<': // (?<xxx) look behind
2528 ch = read();
2529 int start = cursor;
2530 head = createGroup(true);
2531 tail = root;
2532 head.next = expr(tail);
2533 tail.next = lookbehindEnd;
2534 TreeInfo info = new TreeInfo();
2535 head.study(info);
2536 if (info.maxValid == false) {
2537 throw error("Look-behind group does not have "
2538 + "an obvious maximum length");
2539 }
2540 boolean hasSupplementary = findSupplementary(start,
2541 patternLength);
2542 if (ch == '=') {
2543 head = tail = (hasSupplementary ? new BehindS(head,
2544 info.maxLength, info.minLength)
2545 : new Behind(head, info.maxLength,
2546 info.minLength));
2547 } else if (ch == '!') {
2548 head = tail = (hasSupplementary ? new NotBehindS(
2549 head, info.maxLength, info.minLength)
2550 : new NotBehind(head, info.maxLength,
2551 info.minLength));
2552 } else {
2553 throw error("Unknown look-behind group");
2554 }
2555 break;
2556 case '$':
2557 case '@':
2558 throw error("Unknown group type");
2559 default: // (?xxx:) inlined match flags
2560 unread();
2561 addFlag();
2562 ch = read();
2563 if (ch == ')') {
2564 return null; // Inline modifier only
2565 }
2566 if (ch != ':') {
2567 throw error("Unknown inline modifier");
2568 }
2569 head = createGroup(true);
2570 tail = root;
2571 head.next = expr(tail);
2572 break;
2573 }
2574 } else { // (xxx) a regular group
2575 capturingGroup = true;
2576 head = createGroup(false);
2577 tail = root;
2578 head.next = expr(tail);
2579 }
2580
2581 accept(')', "Unclosed group");
2582 flags = save;
2583
2584 // Check for quantifiers
2585 Node node = closure(head);
2586 if (node == head) { // No closure
2587 root = tail;
2588 return node; // Dual return
2589 }
2590 if (head == tail) { // Zero length assertion
2591 root = node;
2592 return node; // Dual return
2593 }
2594
2595 if (node instanceof Ques) {
2596 Ques ques = (Ques) node;
2597 if (ques.type == POSSESSIVE) {
2598 root = node;
2599 return node;
2600 }
2601 tail.next = new BranchConn();
2602 tail = tail.next;
2603 if (ques.type == GREEDY) {
2604 head = new Branch(head, null, tail);
2605 } else { // Reluctant quantifier
2606 head = new Branch(null, head, tail);
2607 }
2608 root = tail;
2609 return head;
2610 } else if (node instanceof Curly) {
2611 Curly curly = (Curly) node;
2612 if (curly.type == POSSESSIVE) {
2613 root = node;
2614 return node;
2615 }
2616 // Discover if the group is deterministic
2617 TreeInfo info = new TreeInfo();
2618 if (head.study(info)) { // Deterministic
2619 GroupTail temp = (GroupTail) tail;
2620 head = root = new GroupCurly(head.next, curly.cmin,
2621 curly.cmax, curly.type,
2622 ((GroupTail) tail).localIndex,
2623 ((GroupTail) tail).groupIndex, capturingGroup);
2624 return head;
2625 } else { // Non-deterministic
2626 int temp = ((GroupHead) head).localIndex;
2627 Loop loop;
2628 if (curly.type == GREEDY)
2629 loop = new Loop(this .localCount, temp);
2630 else
2631 // Reluctant Curly
2632 loop = new LazyLoop(this .localCount, temp);
2633 Prolog prolog = new Prolog(loop);
2634 this .localCount += 1;
2635 loop.cmin = curly.cmin;
2636 loop.cmax = curly.cmax;
2637 loop.body = head;
2638 tail.next = loop;
2639 root = loop;
2640 return prolog; // Dual return
2641 }
2642 }
2643 throw error("Internal logic error");
2644 }
2645
2646 /**
2647 * Create group head and tail nodes using double return. If the group is
2648 * created with anonymous true then it is a pure group and should not
2649 * affect group counting.
2650 */
2651 private Node createGroup(boolean anonymous) {
2652 int localIndex = localCount++;
2653 int groupIndex = 0;
2654 if (!anonymous)
2655 groupIndex = capturingGroupCount++;
2656 GroupHead head = new GroupHead(localIndex);
2657 root = new GroupTail(localIndex, groupIndex);
2658 if (!anonymous && groupIndex < 10)
2659 groupNodes[groupIndex] = head;
2660 return head;
2661 }
2662
2663 /**
2664 * Parses inlined match flags and set them appropriately.
2665 */
2666 private void addFlag() {
2667 int ch = peek();
2668 for (;;) {
2669 switch (ch) {
2670 case 'i':
2671 flags |= CASE_INSENSITIVE;
2672 break;
2673 case 'm':
2674 flags |= MULTILINE;
2675 break;
2676 case 's':
2677 flags |= DOTALL;
2678 break;
2679 case 'd':
2680 flags |= UNIX_LINES;
2681 break;
2682 case 'u':
2683 flags |= UNICODE_CASE;
2684 break;
2685 case 'c':
2686 flags |= CANON_EQ;
2687 break;
2688 case 'x':
2689 flags |= COMMENTS;
2690 break;
2691 case '-': // subFlag then fall through
2692 ch = next();
2693 subFlag();
2694 default:
2695 return;
2696 }
2697 ch = next();
2698 }
2699 }
2700
2701 /**
2702 * Parses the second part of inlined match flags and turns off
2703 * flags appropriately.
2704 */
2705 private void subFlag() {
2706 int ch = peek();
2707 for (;;) {
2708 switch (ch) {
2709 case 'i':
2710 flags &= ~CASE_INSENSITIVE;
2711 break;
2712 case 'm':
2713 flags &= ~MULTILINE;
2714 break;
2715 case 's':
2716 flags &= ~DOTALL;
2717 break;
2718 case 'd':
2719 flags &= ~UNIX_LINES;
2720 break;
2721 case 'u':
2722 flags &= ~UNICODE_CASE;
2723 break;
2724 case 'c':
2725 flags &= ~CANON_EQ;
2726 break;
2727 case 'x':
2728 flags &= ~COMMENTS;
2729 break;
2730 default:
2731 return;
2732 }
2733 ch = next();
2734 }
2735 }
2736
2737 static final int MAX_REPS = 0x7FFFFFFF;
2738
2739 static final int GREEDY = 0;
2740
2741 static final int LAZY = 1;
2742
2743 static final int POSSESSIVE = 2;
2744
2745 static final int INDEPENDENT = 3;
2746
2747 /**
2748 * Processes repetition. If the next character peeked is a quantifier
2749 * then new nodes must be appended to handle the repetition.
2750 * Prev could be a single or a group, so it could be a chain of nodes.
2751 */
2752 private Node closure(Node prev) {
2753 Node atom;
2754 int ch = peek();
2755 switch (ch) {
2756 case '?':
2757 ch = next();
2758 if (ch == '?') {
2759 next();
2760 return new Ques(prev, LAZY);
2761 } else if (ch == '+') {
2762 next();
2763 return new Ques(prev, POSSESSIVE);
2764 }
2765 return new Ques(prev, GREEDY);
2766 case '*':
2767 ch = next();
2768 if (ch == '?') {
2769 next();
2770 return new Curly(prev, 0, MAX_REPS, LAZY);
2771 } else if (ch == '+') {
2772 next();
2773 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
2774 }
2775 return new Curly(prev, 0, MAX_REPS, GREEDY);
2776 case '+':
2777 ch = next();
2778 if (ch == '?') {
2779 next();
2780 return new Curly(prev, 1, MAX_REPS, LAZY);
2781 } else if (ch == '+') {
2782 next();
2783 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
2784 }
2785 return new Curly(prev, 1, MAX_REPS, GREEDY);
2786 case '{':
2787 ch = temp[cursor + 1];
2788 if (ASCII.isDigit(ch)) {
2789 skip();
2790 int cmin = 0;
2791 do {
2792 cmin = cmin * 10 + (ch - '0');
2793 } while (ASCII.isDigit(ch = read()));
2794 int cmax = cmin;
2795 if (ch == ',') {
2796 ch = read();
2797 cmax = MAX_REPS;
2798 if (ch != '}') {
2799 cmax = 0;
2800 while (ASCII.isDigit(ch)) {
2801 cmax = cmax * 10 + (ch - '0');
2802 ch = read();
2803 }
2804 }
2805 }
2806 if (ch != '}')
2807 throw error("Unclosed counted closure");
2808 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
2809 throw error("Illegal repetition range");
2810 Curly curly;
2811 ch = peek();
2812 if (ch == '?') {
2813 next();
2814 curly = new Curly(prev, cmin, cmax, LAZY);
2815 } else if (ch == '+') {
2816 next();
2817 curly = new Curly(prev, cmin, cmax, POSSESSIVE);
2818 } else {
2819 curly = new Curly(prev, cmin, cmax, GREEDY);
2820 }
2821 return curly;
2822 } else {
2823 throw error("Illegal repetition");
2824 }
2825 default:
2826 return prev;
2827 }
2828 }
2829
2830 /**
2831 * Utility method for parsing control escape sequences.
2832 */
2833 private int c() {
2834 if (cursor < patternLength) {
2835 return read() ^ 64;
2836 }
2837 throw error("Illegal control escape sequence");
2838 }
2839
2840 /**
2841 * Utility method for parsing octal escape sequences.
2842 */
2843 private int o() {
2844 int n = read();
2845 if (((n - '0') | ('7' - n)) >= 0) {
2846 int m = read();
2847 if (((m - '0') | ('7' - m)) >= 0) {
2848 int o = read();
2849 if ((((o - '0') | ('7' - o)) >= 0)
2850 && (((n - '0') | ('3' - n)) >= 0)) {
2851 return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
2852 }
2853 unread();
2854 return (n - '0') * 8 + (m - '0');
2855 }
2856 unread();
2857 return (n - '0');
2858 }
2859 throw error("Illegal octal escape sequence");
2860 }
2861
2862 /**
2863 * Utility method for parsing hexadecimal escape sequences.
2864 */
2865 private int x() {
2866 int n = read();
2867 if (ASCII.isHexDigit(n)) {
2868 int m = read();
2869 if (ASCII.isHexDigit(m)) {
2870 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
2871 }
2872 }
2873 throw error("Illegal hexadecimal escape sequence");
2874 }
2875
2876 /**
2877 * Utility method for parsing unicode escape sequences.
2878 */
2879 private int u() {
2880 int n = 0;
2881 for (int i = 0; i < 4; i++) {
2882 int ch = read();
2883 if (!ASCII.isHexDigit(ch)) {
2884 throw error("Illegal Unicode escape sequence");
2885 }
2886 n = n * 16 + ASCII.toDigit(ch);
2887 }
2888 return n;
2889 }
2890
2891 //
2892 // Utility methods for code point support
2893 //
2894
2895 /**
2896 * Tests a surrogate value.
2897 */
2898 private static final boolean isSurrogate(int c) {
2899 return c >= Character.MIN_HIGH_SURROGATE
2900 && c <= Character.MAX_LOW_SURROGATE;
2901 }
2902
2903 private static final int countChars(CharSequence seq, int index,
2904 int lengthInCodePoints) {
2905 // optimization
2906 if (lengthInCodePoints == 1
2907 && !Character.isHighSurrogate(seq.charAt(index))) {
2908 assert (index >= 0 && index < seq.length());
2909 return 1;
2910 }
2911 int length = seq.length();
2912 int x = index;
2913 if (lengthInCodePoints >= 0) {
2914 assert (index >= 0 && index < length);
2915 for (int i = 0; x < length && i < lengthInCodePoints; i++) {
2916 if (Character.isHighSurrogate(seq.charAt(x++))) {
2917 if (x < length
2918 && Character.isLowSurrogate(seq.charAt(x))) {
2919 x++;
2920 }
2921 }
2922 }
2923 return x - index;
2924 }
2925
2926 assert (index >= 0 && index <= length);
2927 if (index == 0) {
2928 return 0;
2929 }
2930 int len = -lengthInCodePoints;
2931 for (int i = 0; x > 0 && i < len; i++) {
2932 if (Character.isLowSurrogate(seq.charAt(--x))) {
2933 if (x > 0
2934 && Character.isHighSurrogate(seq.charAt(x - 1))) {
2935 x--;
2936 }
2937 }
2938 }
2939 return index - x;
2940 }
2941
2942 private static final int countCodePoints(CharSequence seq) {
2943 int length = seq.length();
2944 int n = 0;
2945 for (int i = 0; i < length;) {
2946 n++;
2947 if (Character.isHighSurrogate(seq.charAt(i++))) {
2948 if (i < length
2949 && Character.isLowSurrogate(seq.charAt(i))) {
2950 i++;
2951 }
2952 }
2953 }
2954 return n;
2955 }
2956
2957 /**
2958 * Creates a bit vector for matching Latin-1 values. A normal BitClass
2959 * never matches values above Latin-1, and a complemented BitClass always
2960 * matches values above Latin-1.
2961 */
2962 private static final class BitClass extends BmpCharProperty {
2963 final boolean[] bits;
2964
2965 BitClass() {
2966 bits = new boolean[256];
2967 }
2968
2969 private BitClass(boolean[] bits) {
2970 this .bits = bits;
2971 }
2972
2973 BitClass add(int c, int flags) {
2974 assert c >= 0 && c <= 255;
2975 if ((flags & CASE_INSENSITIVE) != 0) {
2976 if (ASCII.isAscii(c)) {
2977 bits[ASCII.toUpper(c)] = true;
2978 bits[ASCII.toLower(c)] = true;
2979 } else if ((flags & UNICODE_CASE) != 0) {
2980 bits[Character.toLowerCase(c)] = true;
2981 bits[Character.toUpperCase(c)] = true;
2982 }
2983 }
2984 bits[c] = true;
2985 return this ;
2986 }
2987
2988 boolean isSatisfiedBy(int ch) {
2989 return ch < 256 && bits[ch];
2990 }
2991 }
2992
2993 /**
2994 * Returns a suitably optimized, single character matcher.
2995 */
2996 private CharProperty newSingle(final int ch) {
2997 if (has(CASE_INSENSITIVE)) {
2998 int lower, upper;
2999 if (has(UNICODE_CASE)) {
3000 upper = Character.toUpperCase(ch);
3001 lower = Character.toLowerCase(upper);
3002 if (upper != lower)
3003 return new SingleU(lower);
3004 } else if (ASCII.isAscii(ch)) {
3005 lower = ASCII.toLower(ch);
3006 upper = ASCII.toUpper(ch);
3007 if (lower != upper)
3008 return new SingleI(lower, upper);
3009 }
3010 }
3011 if (isSupplementary(ch))
3012 return new SingleS(ch); // Match a given Unicode character
3013 return new Single(ch); // Match a given BMP character
3014 }
3015
3016 /**
3017 * Utility method for creating a string slice matcher.
3018 */
3019 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3020 int[] tmp = new int[count];
3021 if (has(CASE_INSENSITIVE)) {
3022 if (has(UNICODE_CASE)) {
3023 for (int i = 0; i < count; i++) {
3024 tmp[i] = Character.toLowerCase(Character
3025 .toUpperCase(buf[i]));
3026 }
3027 return hasSupplementary ? new SliceUS(tmp)
3028 : new SliceU(tmp);
3029 }
3030 for (int i = 0; i < count; i++) {
3031 tmp[i] = ASCII.toLower(buf[i]);
3032 }
3033 return hasSupplementary ? new SliceIS(tmp)
3034 : new SliceI(tmp);
3035 }
3036 for (int i = 0; i < count; i++) {
3037 tmp[i] = buf[i];
3038 }
3039 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
3040 }
3041
3042 /**
3043 * The following classes are the building components of the object
3044 * tree that represents a compiled regular expression. The object tree
3045 * is made of individual elements that handle constructs in the Pattern.
3046 * Each type of object knows how to match its equivalent construct with
3047 * the match() method.
3048 */
3049
3050 /**
3051 * Base class for all node classes. Subclasses should override the match()
3052 * method as appropriate. This class is an accepting node, so its match()
3053 * always returns true.
3054 */
3055 static class Node extends Object {
3056 Node next;
3057
3058 Node() {
3059 next = Pattern.accept;
3060 }
3061
3062 /**
3063 * This method implements the classic accept node.
3064 */
3065 boolean match(Matcher matcher, int i, CharSequence seq) {
3066 matcher.last = i;
3067 matcher.groups[0] = matcher.first;
3068 matcher.groups[1] = matcher.last;
3069 return true;
3070 }
3071
3072 /**
3073 * This method is good for all zero length assertions.
3074 */
3075 boolean study(TreeInfo info) {
3076 if (next != null) {
3077 return next.study(info);
3078 } else {
3079 return info.deterministic;
3080 }
3081 }
3082 }
3083
3084 static class LastNode extends Node {
3085 /**
3086 * This method implements the classic accept node with
3087 * the addition of a check to see if the match occurred
3088 * using all of the input.
3089 */
3090 boolean match(Matcher matcher, int i, CharSequence seq) {
3091 if (matcher.acceptMode == Matcher.ENDANCHOR
3092 && i != matcher.to)
3093 return false;
3094 matcher.last = i;
3095 matcher.groups[0] = matcher.first;
3096 matcher.groups[1] = matcher.last;
3097 return true;
3098 }
3099 }
3100
3101 /**
3102 * Used for REs that can start anywhere within the input string.
3103 * This basically tries to match repeatedly at each spot in the
3104 * input string, moving forward after each try. An anchored search
3105 * or a BnM will bypass this node completely.
3106 */
3107 static class Start extends Node {
3108 int minLength;
3109
3110 Start(Node node) {
3111 this .next = node;
3112 TreeInfo info = new TreeInfo();
3113 next.study(info);
3114 minLength = info.minLength;
3115 }
3116
3117 boolean match(Matcher matcher, int i, CharSequence seq) {
3118 if (i > matcher.to - minLength) {
3119 matcher.hitEnd = true;
3120 return false;
3121 }
3122 boolean ret = false;
3123 int guard = matcher.to - minLength;
3124 for (; i <= guard; i++) {
3125 if (ret = next.match(matcher, i, seq))
3126 break;
3127 if (i == guard)
3128 matcher.hitEnd = true;
3129 }
3130 if (ret) {
3131 matcher.first = i;
3132 matcher.groups[0] = matcher.first;
3133 matcher.groups[1] = matcher.last;
3134 }
3135 return ret;
3136 }
3137
3138 boolean study(TreeInfo info) {
3139 next.study(info);
3140 info.maxValid = false;
3141 info.deterministic = false;
3142 return false;
3143 }
3144 }
3145
3146 /*
3147 * StartS supports supplementary characters, including unpaired surrogates.
3148 */
3149 static final class StartS extends Start {
3150 StartS(Node node) {
3151 super (node);
3152 }
3153
3154 boolean match(Matcher matcher, int i, CharSequence seq) {
3155 if (i > matcher.to - minLength) {
3156 matcher.hitEnd = true;
3157 return false;
3158 }
3159 boolean ret = false;
3160 int guard = matcher.to - minLength;
3161 while (i <= guard) {
3162 if ((ret = next.match(matcher, i, seq)) || i == guard)
3163 break;
3164 // Optimization to move to the next character. This is
3165 // faster than countChars(seq, i, 1).
3166 if (Character.isHighSurrogate(seq.charAt(i++))) {
3167 if (i < seq.length()
3168 && Character.isLowSurrogate(seq.charAt(i))) {
3169 i++;
3170 }
3171 }
3172 if (i == guard)
3173 matcher.hitEnd = true;
3174 }
3175 if (ret) {
3176 matcher.first = i;
3177 matcher.groups[0] = matcher.first;
3178 matcher.groups[1] = matcher.last;
3179 }
3180 return ret;
3181 }
3182 }
3183
3184 /**
3185 * Node to anchor at the beginning of input. This object implements the
3186 * match for a \A sequence, and the caret anchor will use this if not in
3187 * multiline mode.
3188 */
3189 static final class Begin extends Node {
3190 boolean match(Matcher matcher, int i, CharSequence seq) {
3191 int fromIndex = (matcher.anchoringBounds) ? matcher.from
3192 : 0;
3193 if (i == fromIndex && next.match(matcher, i, seq)) {
3194 matcher.first = i;
3195 matcher.groups[0] = i;
3196 matcher.groups[1] = matcher.last;
3197 return true;
3198 } else {
3199 return false;
3200 }
3201 }
3202 }
3203
3204 /**
3205 * Node to anchor at the end of input. This is the absolute end, so this
3206 * should not match at the last newline before the end as $ will.
3207 */
3208 static final class End extends Node {
3209 boolean match(Matcher matcher, int i, CharSequence seq) {
3210 int endIndex = (matcher.anchoringBounds) ? matcher.to
3211 : matcher.getTextLength();
3212 if (i == endIndex) {
3213 matcher.hitEnd = true;
3214 return next.match(matcher, i, seq);
3215 }
3216 return false;
3217 }
3218 }
3219
3220 /**
3221 * Node to anchor at the beginning of a line. This is essentially the
3222 * object to match for the multiline ^.
3223 */
3224 static final class Caret extends Node {
3225 boolean match(Matcher matcher, int i, CharSequence seq) {
3226 int startIndex = matcher.from;
3227 int endIndex = matcher.to;
3228 if (!matcher.anchoringBounds) {
3229 startIndex = 0;
3230 endIndex = matcher.getTextLength();
3231 }
3232 // Perl does not match ^ at end of input even after newline
3233 if (i == endIndex) {
3234 matcher.hitEnd = true;
3235 return false;
3236 }
3237 if (i > startIndex) {
3238 char ch = seq.charAt(i - 1);
3239 if (ch != '\n' && ch != '\r' && (ch | 1) != '\u2029'
3240 && ch != '\u0085') {
3241 return false;
3242 }
3243 // Should treat /r/n as one newline
3244 if (ch == '\r' && seq.charAt(i) == '\n')
3245 return false;
3246 }
3247 return next.match(matcher, i, seq);
3248 }
3249 }
3250
3251 /**
3252 * Node to anchor at the beginning of a line when in unixdot mode.
3253 */
3254 static final class UnixCaret extends Node {
3255 boolean match(Matcher matcher, int i, CharSequence seq) {
3256 int startIndex = matcher.from;
3257 int endIndex = matcher.to;
3258 if (!matcher.anchoringBounds) {
3259 startIndex = 0;
3260 endIndex = matcher.getTextLength();
3261 }
3262 // Perl does not match ^ at end of input even after newline
3263 if (i == endIndex) {
3264 matcher.hitEnd = true;
3265 return false;
3266 }
3267 if (i > startIndex) {
3268 char ch = seq.charAt(i - 1);
3269 if (ch != '\n') {
3270 return false;
3271 }
3272 }
3273 return next.match(matcher, i, seq);
3274 }
3275 }
3276
3277 /**
3278 * Node to match the location where the last match ended.
3279 * This is used for the \G construct.
3280 */
3281 static final class LastMatch extends Node {
3282 boolean match(Matcher matcher, int i, CharSequence seq) {
3283 if (i != matcher.oldLast)
3284 return false;
3285 return next.match(matcher, i, seq);
3286 }
3287 }
3288
3289 /**
3290 * Node to anchor at the end of a line or the end of input based on the
3291 * multiline mode.
3292 *
3293 * When not in multiline mode, the $ can only match at the very end
3294 * of the input, unless the input ends in a line terminator in which
3295 * it matches right before the last line terminator.
3296 *
3297 * Note that \r\n is considered an atomic line terminator.
3298 *
3299 * Like ^ the $ operator matches at a position, it does not match the
3300 * line terminators themselves.
3301 */
3302 static final class Dollar extends Node {
3303 boolean multiline;
3304
3305 Dollar(boolean mul) {
3306 multiline = mul;
3307 }
3308
3309 boolean match(Matcher matcher, int i, CharSequence seq) {
3310 int endIndex = (matcher.anchoringBounds) ? matcher.to
3311 : matcher.getTextLength();
3312 if (!multiline) {
3313 if (i < endIndex - 2)
3314 return false;
3315 if (i == endIndex - 2) {
3316 char ch = seq.charAt(i);
3317 if (ch != '\r')
3318 return false;
3319 ch = seq.charAt(i + 1);
3320 if (ch != '\n')
3321 return false;
3322 }
3323 }
3324 // Matches before any line terminator; also matches at the
3325 // end of input
3326 // Before line terminator:
3327 // If multiline, we match here no matter what
3328 // If not multiline, fall through so that the end
3329 // is marked as hit; this must be a /r/n or a /n
3330 // at the very end so the end was hit; more input
3331 // could make this not match here
3332 if (i < endIndex) {
3333 char ch = seq.charAt(i);
3334 if (ch == '\n') {
3335 // No match between \r\n
3336 if (i > 0 && seq.charAt(i - 1) == '\r')
3337 return false;
3338 if (multiline)
3339 return next.match(matcher, i, seq);
3340 } else if (ch == '\r' || ch == '\u0085'
3341 || (ch | 1) == '\u2029') {
3342 if (multiline)
3343 return next.match(matcher, i, seq);
3344 } else { // No line terminator, no match
3345 return false;
3346 }
3347 }
3348 // Matched at current end so hit end
3349 matcher.hitEnd = true;
3350 // If a $ matches because of end of input, then more input
3351 // could cause it to fail!
3352 matcher.requireEnd = true;
3353 return next.match(matcher, i, seq);
3354 }
3355
3356 boolean study(TreeInfo info) {
3357 next.study(info);
3358 return info.deterministic;
3359 }
3360 }
3361
3362 /**
3363 * Node to anchor at the end of a line or the end of input based on the
3364 * multiline mode when in unix lines mode.
3365 */
3366 static final class UnixDollar extends Node {
3367 boolean multiline;
3368
3369 UnixDollar(boolean mul) {
3370 multiline = mul;
3371 }
3372
3373 boolean match(Matcher matcher, int i, CharSequence seq) {
3374 int endIndex = (matcher.anchoringBounds) ? matcher.to
3375 : matcher.getTextLength();
3376 if (i < endIndex) {
3377 char ch = seq.charAt(i);
3378 if (ch == '\n') {
3379 // If not multiline, then only possible to
3380 // match at very end or one before end
3381 if (multiline == false && i != endIndex - 1)
3382 return false;
3383 // If multiline return next.match without setting
3384 // matcher.hitEnd
3385 if (multiline)
3386 return next.match(matcher, i, seq);
3387 } else {
3388 return false;
3389 }
3390 }
3391 // Matching because at the end or 1 before the end;
3392 // more input could change this so set hitEnd
3393 matcher.hitEnd = true;
3394 // If a $ matches because of end of input, then more input
3395 // could cause it to fail!
3396 matcher.requireEnd = true;
3397 return next.match(matcher, i, seq);
3398 }
3399
3400 boolean study(TreeInfo info) {
3401 next.study(info);
3402 return info.deterministic;
3403 }
3404 }
3405
3406 /**
3407 * Abstract node class to match one character satisfying some
3408 * boolean property.
3409 */
3410 private static abstract class CharProperty extends Node {
3411 abstract boolean isSatisfiedBy(int ch);
3412
3413 CharProperty complement() {
3414 return new CharProperty() {
3415 boolean isSatisfiedBy(int ch) {
3416 return !CharProperty.this .isSatisfiedBy(ch);
3417 }
3418 };
3419 }
3420
3421 CharProperty maybeComplement(boolean complement) {
3422 return complement ? complement() : this ;
3423 }
3424
3425 boolean match(Matcher matcher, int i, CharSequence seq) {
3426 if (i < matcher.to) {
3427 int ch = Character.codePointAt(seq, i);
3428 return isSatisfiedBy(ch)
3429 && next.match(matcher, i
3430 + Character.charCount(ch), seq);
3431 } else {
3432 matcher.hitEnd = true;
3433 return false;
3434 }
3435 }
3436
3437 boolean study(TreeInfo info) {
3438 info.minLength++;
3439 info.maxLength++;
3440 return next.study(info);
3441 }
3442 }
3443
3444 /**
3445 * Optimized version of CharProperty that works only for
3446 * properties never satisfied by Supplementary characters.
3447 */
3448 private static abstract class BmpCharProperty extends CharProperty {
3449 boolean match(Matcher matcher, int i, CharSequence seq) {
3450 if (i < matcher.to) {
3451 return isSatisfiedBy(seq.charAt(i))
3452 && next.match(matcher, i + 1, seq);
3453 } else {
3454 matcher.hitEnd = true;
3455 return false;
3456 }
3457 }
3458 }
3459
3460 /**
3461 * Node class that matches a Supplementary Unicode character
3462 */
3463 static final class SingleS extends CharProperty {
3464 final int c;
3465
3466 SingleS(int c) {
3467 this .c = c;
3468 }
3469
3470 boolean isSatisfiedBy(int ch) {
3471 return ch == c;
3472 }
3473 }
3474
3475 /**
3476 * Optimization -- matches a given BMP character
3477 */
3478 static final class Single extends BmpCharProperty {
3479 final int c;
3480
3481 Single(int c) {
3482 this .c = c;
3483 }
3484
3485 boolean isSatisfiedBy(int ch) {
3486 return ch == c;
3487 }
3488 }
3489
3490 /**
3491 * Case insensitive matches a given BMP character
3492 */
3493 static final class SingleI extends BmpCharProperty {
3494 final int lower;
3495 final int upper;
3496
3497 SingleI(int lower, int upper) {
3498 this .lower = lower;
3499 this .upper = upper;
3500 }
3501
3502 boolean isSatisfiedBy(int ch) {
3503 return ch == lower || ch == upper;
3504 }
3505 }
3506
3507 /**
3508 * Unicode case insensitive matches a given Unicode character
3509 */
3510 static final class SingleU extends CharProperty {
3511 final int lower;
3512
3513 SingleU(int lower) {
3514 this .lower = lower;
3515 }
3516
3517 boolean isSatisfiedBy(int ch) {
3518 return lower == ch
3519 || lower == Character.toLowerCase(Character
3520 .toUpperCase(ch));
3521 }
3522 }
3523
3524 /**
3525 * Node class that matches a Unicode category.
3526 */
3527 static final class Category extends CharProperty {
3528 final int typeMask;
3529
3530 Category(int typeMask) {
3531 this .typeMask = typeMask;
3532 }
3533
3534 boolean isSatisfiedBy(int ch) {
3535 return (typeMask & (1 << Character.getType(ch))) != 0;
3536 }
3537 }
3538
3539 /**
3540 * Node class that matches a POSIX type.
3541 */
3542 static final class Ctype extends BmpCharProperty {
3543 final int ctype;
3544
3545 Ctype(int ctype) {
3546 this .ctype = ctype;
3547 }
3548
3549 boolean isSatisfiedBy(int ch) {
3550 return ch < 128 && ASCII.isType(ch, ctype);
3551 }
3552 }
3553
3554 /**
3555 * Base class for all Slice nodes
3556 */
3557 static class SliceNode extends Node {
3558 int[] buffer;
3559
3560 SliceNode(int[] buf) {
3561 buffer = buf;
3562 }
3563
3564 boolean study(TreeInfo info) {
3565 info.minLength += buffer.length;
3566 info.maxLength += buffer.length;
3567 return next.study(info);
3568 }
3569 }
3570
3571 /**
3572 * Node class for a case sensitive/BMP-only sequence of literal
3573 * characters.
3574 */
3575 static final class Slice extends SliceNode {
3576 Slice(int[] buf) {
3577 super (buf);
3578 }
3579
3580 boolean match(Matcher matcher, int i, CharSequence seq) {
3581 int[] buf = buffer;
3582 int len = buf.length;
3583 for (int j = 0; j < len; j++) {
3584 if ((i + j) >= matcher.to) {
3585 matcher.hitEnd = true;
3586 return false;
3587 }
3588 if (buf[j] != seq.charAt(i + j))
3589 return false;
3590 }
3591 return next.match(matcher, i + len, seq);
3592 }
3593 }
3594
3595 /**
3596 * Node class for a case_insensitive/BMP-only sequence of literal
3597 * characters.
3598 */
3599 static class SliceI extends SliceNode {
3600 SliceI(int[] buf) {
3601 super (buf);
3602 }
3603
3604 boolean match(Matcher matcher, int i, CharSequence seq) {
3605 int[] buf = buffer;
3606 int len = buf.length;
3607 for (int j = 0; j < len; j++) {
3608 if ((i + j) >= matcher.to) {
3609 matcher.hitEnd = true;
3610 return false;
3611 }
3612 int c = seq.charAt(i + j);
3613 if (buf[j] != c && buf[j] != ASCII.toLower(c))
3614 return false;
3615 }
3616 return next.match(matcher, i + len, seq);
3617 }
3618 }
3619
3620 /**
3621 * Node class for a unicode_case_insensitive/BMP-only sequence of
3622 * literal characters. Uses unicode case folding.
3623 */
3624 static final class SliceU extends SliceNode {
3625 SliceU(int[] buf) {
3626 super (buf);
3627 }
3628
3629 boolean match(Matcher matcher, int i, CharSequence seq) {
3630 int[] buf = buffer;
3631 int len = buf.length;
3632 for (int j = 0; j < len; j++) {
3633 if ((i + j) >= matcher.to) {
3634 matcher.hitEnd = true;
3635 return false;
3636 }
3637 int c = seq.charAt(i + j);
3638 if (buf[j] != c
3639 && buf[j] != Character.toLowerCase(Character
3640 .toUpperCase(c)))
3641 return false;
3642 }
3643 return next.match(matcher, i + len, seq);
3644 }
3645 }
3646
3647 /**
3648 * Node class for a case sensitive sequence of literal characters
3649 * including supplementary characters.
3650 */
3651 static final class SliceS extends SliceNode {
3652 SliceS(int[] buf) {
3653 super (buf);
3654 }
3655
3656 boolean match(Matcher matcher, int i, CharSequence seq) {
3657 int[] buf = buffer;
3658 int x = i;
3659 for (int j = 0; j < buf.length; j++) {
3660 if (x >= matcher.to) {
3661 matcher.hitEnd = true;
3662 return false;
3663 }
3664 int c = Character.codePointAt(seq, x);
3665 if (buf[j] != c)
3666 return false;
3667 x += Character.charCount(c);
3668 if (x > matcher.to) {
3669 matcher.hitEnd = true;
3670 return false;
3671 }
3672 }
3673 return next.match(matcher, x, seq);
3674 }
3675 }
3676
3677 /**
3678 * Node class for a case insensitive sequence of literal characters
3679 * including supplementary characters.
3680 */
3681 static class SliceIS extends SliceNode {
3682 SliceIS(int[] buf) {
3683 super (buf);
3684 }
3685
3686 int toLower(int c) {
3687 return ASCII.toLower(c);
3688 }
3689
3690 boolean match(Matcher matcher, int i, CharSequence seq) {
3691 int[] buf = buffer;
3692 int x = i;
3693 for (int j = 0; j < buf.length; j++) {
3694 if (x >= matcher.to) {
3695 matcher.hitEnd = true;
3696 return false;
3697 }
3698 int c = Character.codePointAt(seq, x);
3699 if (buf[j] != c && buf[j] != toLower(c))
3700 return false;
3701 x += Character.charCount(c);
3702 if (x > matcher.to) {
3703 matcher.hitEnd = true;
3704 return false;
3705 }
3706 }
3707 return next.match(matcher, x, seq);
3708 }
3709 }
3710
3711 /**
3712 * Node class for a case insensitive sequence of literal characters.
3713 * Uses unicode case folding.
3714 */
3715 static final class SliceUS extends SliceIS {
3716 SliceUS(int[] buf) {
3717 super (buf);
3718 }
3719
3720 int toLower(int c) {
3721 return Character.toLowerCase(Character.toUpperCase(c));
3722 }
3723 }
3724
3725 private static boolean inRange(int lower, int ch, int upper) {
3726 return lower <= ch && ch <= upper;
3727 }
3728
3729 /**
3730 * Returns node for matching characters within an explicit value range.
3731 */
3732 private static CharProperty rangeFor(final int lower,
3733 final int upper) {
3734 return new CharProperty() {
3735 boolean isSatisfiedBy(int ch) {
3736 return inRange(lower, ch, upper);
3737 }
3738 };
3739 }
3740
3741 /**
3742 * Returns node for matching characters within an explicit value
3743 * range in a case insensitive manner.
3744 */
3745 private CharProperty caseInsensitiveRangeFor(final int lower,
3746 final int upper) {
3747 if (has(UNICODE_CASE))
3748 return new CharProperty() {
3749 boolean isSatisfiedBy(int ch) {
3750 if (inRange(lower, ch, upper))
3751 return true;
3752 int up = Character.toUpperCase(ch);
3753 return inRange(lower, up, upper)
3754 || inRange(lower,
3755 Character.toLowerCase(up), upper);
3756 }
3757 };
3758 return new CharProperty() {
3759 boolean isSatisfiedBy(int ch) {
3760 return inRange(lower, ch, upper)
3761 || ASCII.isAscii(ch)
3762 && (inRange(lower, ASCII.toUpper(ch), upper) || inRange(
3763 lower, ASCII.toLower(ch), upper));
3764 }
3765 };
3766 }
3767
3768 /**
3769 * Implements the Unicode category ALL and the dot metacharacter when
3770 * in dotall mode.
3771 */
3772 static final class All extends CharProperty {
3773 boolean isSatisfiedBy(int ch) {
3774 return true;
3775 }
3776 }
3777
3778 /**
3779 * Node class for the dot metacharacter when dotall is not enabled.
3780 */
3781 static final class Dot extends CharProperty {
3782 boolean isSatisfiedBy(int ch) {
3783 return (ch != '\n' && ch != '\r' && (ch | 1) != '\u2029' && ch != '\u0085');
3784 }
3785 }
3786
3787 /**
3788 * Node class for the dot metacharacter when dotall is not enabled
3789 * but UNIX_LINES is enabled.
3790 */
3791 static final class UnixDot extends CharProperty {
3792 boolean isSatisfiedBy(int ch) {
3793 return ch != '\n';
3794 }
3795 }
3796
3797 /**
3798 * The 0 or 1 quantifier. This one class implements all three types.
3799 */
3800 static final class Ques extends Node {
3801 Node atom;
3802 int type;
3803
3804 Ques(Node node, int type) {
3805 this .atom = node;
3806 this .type = type;
3807 }
3808
3809 boolean match(Matcher matcher, int i, CharSequence seq) {
3810 switch (type) {
3811 case GREEDY:
3812 return (atom.match(matcher, i, seq) && next.match(
3813 matcher, matcher.last, seq))
3814 || next.match(matcher, i, seq);
3815 case LAZY:
3816 return next.match(matcher, i, seq)
3817 || (atom.match(matcher, i, seq) && next.match(
3818 matcher, matcher.last, seq));
3819 case POSSESSIVE:
3820 if (atom.match(matcher, i, seq))
3821 i = matcher.last;
3822 return next.match(matcher, i, seq);
3823 default:
3824 return atom.match(matcher, i, seq)
3825 && next.match(matcher, matcher.last, seq);
3826 }
3827 }
3828
3829 boolean study(TreeInfo info) {
3830 if (type != INDEPENDENT) {
3831 int minL = info.minLength;
3832 atom.study(info);
3833 info.minLength = minL;
3834 info.deterministic = false;
3835 return next.study(info);
3836 } else {
3837 atom.study(info);
3838 return next.study(info);
3839 }
3840 }
3841 }
3842
3843 /**
3844 * Handles the curly-brace style repetition with a specified minimum and
3845 * maximum occurrences. The * quantifier is handled as a special case.
3846 * This class handles the three types.
3847 */
3848 static final class Curly extends Node {
3849 Node atom;
3850 int type;
3851 int cmin;
3852 int cmax;
3853
3854 Curly(Node node, int cmin, int cmax, int type) {
3855 this .atom = node;
3856 this .type = type;
3857 this .cmin = cmin;
3858 this .cmax = cmax;
3859 }
3860
3861 boolean match(Matcher matcher, int i, CharSequence seq) {
3862 int j;
3863 for (j = 0; j < cmin; j++) {
3864 if (atom.match(matcher, i, seq)) {
3865 i = matcher.last;
3866 continue;
3867 }
3868 return false;
3869 }
3870 if (type == GREEDY)
3871 return match0(matcher, i, j, seq);
3872 else if (type == LAZY)
3873 return match1(matcher, i, j, seq);
3874 else
3875 return match2(matcher, i, j, seq);
3876 }
3877
3878 // Greedy match.
3879 // i is the index to start matching at
3880 // j is the number of atoms that have matched
3881 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
3882 if (j >= cmax) {
3883 // We have matched the maximum... continue with the rest of
3884 // the regular expression
3885 return next.match(matcher, i, seq);
3886 }
3887 int backLimit = j;
3888 while (atom.match(matcher, i, seq)) {
3889 // k is the length of this match
3890 int k = matcher.last - i;
3891 if (k == 0) // Zero length match
3892 break;
3893 // Move up index and number matched
3894 i = matcher.last;
3895 j++;
3896 // We are greedy so match as many as we can
3897 while (j < cmax) {
3898 if (!atom.match(matcher, i, seq))
3899 break;
3900 if (i + k != matcher.last) {
3901 if (match0(matcher, matcher.last, j + 1, seq))
3902 return true;
3903 break;
3904 }
3905 i += k;
3906 j++;
3907 }
3908 // Handle backing off if match fails
3909 while (j >= backLimit) {
3910 if (next.match(matcher, i, seq))
3911 return true;
3912 i -= k;
3913 j--;
3914 }
3915 return false;
3916 }
3917 return next.match(matcher, i, seq);
3918 }
3919
3920 // Reluctant match. At this point, the minimum has been satisfied.
3921 // i is the index to start matching at
3922 // j is the number of atoms that have matched
3923 boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
3924 for (;;) {
3925 // Try finishing match without consuming any more
3926 if (next.match(matcher, i, seq))
3927 return true;
3928 // At the maximum, no match found
3929 if (j >= cmax)
3930 return false;
3931 // Okay, must try one more atom
3932 if (!atom.match(matcher, i, seq))
3933 return false;
3934 // If we haven't moved forward then must break out
3935 if (i == matcher.last)
3936 return false;
3937 // Move up index and number matched
3938 i = matcher.last;
3939 j++;
3940 }
3941 }
3942
3943 boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
3944 for (; j < cmax; j++) {
3945 if (!atom.match(matcher, i, seq))
3946 break;
3947 if (i == matcher.last)
3948 break;
3949 i = matcher.last;
3950 }
3951 return next.match(matcher, i, seq);
3952 }
3953
3954 boolean study(TreeInfo info) {
3955 // Save original info
3956 int minL = info.minLength;
3957 int maxL = info.maxLength;
3958 boolean maxV = info.maxValid;
3959 boolean detm = info.deterministic;
3960 info.reset();
3961
3962 atom.study(info);
3963
3964 int temp = info.minLength * cmin + minL;
3965 if (temp < minL) {
3966 temp = 0xFFFFFFF; // arbitrary large number
3967 }
3968 info.minLength = temp;
3969
3970 if (maxV & info.maxValid) {
3971 temp = info.maxLength * cmax + maxL;
3972 info.maxLength = temp;
3973 if (temp < maxL) {
3974 info.maxValid = false;
3975 }
3976 } else {
3977 info.maxValid = false;
3978 }
3979
3980 if (info.deterministic && cmin == cmax)
3981 info.deterministic = detm;
3982 else
3983 info.deterministic = false;
3984
3985 return next.study(info);
3986 }
3987 }
3988
3989 /**
3990 * Handles the curly-brace style repetition with a specified minimum and
3991 * maximum occurrences in deterministic cases. This is an iterative
3992 * optimization over the Prolog and Loop system which would handle this
3993 * in a recursive way. The * quantifier is handled as a special case.
3994 * If capture is true then this class saves group settings and ensures
3995 * that groups are unset when backing off of a group match.
3996 */
3997 static final class GroupCurly extends Node {
3998 Node atom;
3999 int type;
4000 int cmin;
4001 int cmax;
4002 int localIndex;
4003 int groupIndex;
4004 boolean capture;
4005
4006 GroupCurly(Node node, int cmin, int cmax, int type, int local,
4007 int group, boolean capture) {
4008 this .atom = node;
4009 this .type = type;
4010 this .cmin = cmin;
4011 this .cmax = cmax;
4012 this .localIndex = local;
4013 this .groupIndex = group;
4014 this .capture = capture;
4015 }
4016
4017 boolean match(Matcher matcher, int i, CharSequence seq) {
4018 int[] groups = matcher.groups;
4019 int[] locals = matcher.locals;
4020 int save0 = locals[localIndex];
4021 int save1 = 0;
4022 int save2 = 0;
4023
4024 if (capture) {
4025 save1 = groups[groupIndex];
4026 save2 = groups[groupIndex + 1];
4027 }
4028
4029 // Notify GroupTail there is no need to setup group info
4030 // because it will be set here
4031 locals[localIndex] = -1;
4032
4033 boolean ret = true;
4034 for (int j = 0; j < cmin; j++) {
4035 if (atom.match(matcher, i, seq)) {
4036 if (capture) {
4037 groups[groupIndex] = i;
4038 groups[groupIndex + 1] = matcher.last;
4039 }
4040 i = matcher.last;
4041 } else {
4042 ret = false;
4043 break;
4044 }
4045 }
4046 if (ret) {
4047 if (type == GREEDY) {
4048 ret = match0(matcher, i, cmin, seq);
4049 } else if (type == LAZY) {
4050 ret = match1(matcher, i, cmin, seq);
4051 } else {
4052 ret = match2(matcher, i, cmin, seq);
4053 }
4054 }
4055 if (!ret) {
4056 locals[localIndex] = save0;
4057 if (capture) {
4058 groups[groupIndex] = save1;
4059 groups[groupIndex + 1] = save2;
4060 }
4061 }
4062 return ret;
4063 }
4064
4065 // Aggressive group match
4066 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4067 int[] groups = matcher.groups;
4068 int save0 = 0;
4069 int save1 = 0;
4070 if (capture) {
4071 save0 = groups[groupIndex];
4072 save1 = groups[groupIndex + 1];
4073 }
4074 for (;;) {
4075 if (j >= cmax)
4076 break;
4077 if (!atom.match(matcher, i, seq))
4078 break;
4079 int k = matcher.last - i;
4080 if (k <= 0) {
4081 if (capture) {
4082 groups[groupIndex] = i;
4083 groups[groupIndex + 1] = i + k;
4084 }
4085 i = i + k;
4086 break;
4087 }
4088 for (;;) {
4089 if (capture) {
4090 groups[groupIndex] = i;
4091 groups[groupIndex + 1] = i + k;
4092 }
4093 i = i + k;
4094 if (++j >= cmax)
4095 break;
4096 if (!atom.match(matcher, i, seq))
4097 break;
4098 if (i + k != matcher.last) {
4099 if (match0(matcher, i, j, seq))
4100 return true;
4101 break;
4102 }
4103 }
4104 while (j > cmin) {
4105 if (next.match(matcher, i, seq)) {
4106 if (capture) {
4107 groups[groupIndex + 1] = i;
4108 groups[groupIndex] = i - k;
4109 }
4110 i = i - k;
4111 return true;
4112 }
4113 // backing off
4114 if (capture) {
4115 groups[groupIndex + 1] = i;
4116 groups[groupIndex] = i - k;
4117 }
4118 i = i - k;
4119 j--;
4120 }
4121 break;
4122 }
4123 if (capture) {
4124 groups[groupIndex] = save0;
4125 groups[groupIndex + 1] = save1;
4126 }
4127 return next.match(matcher, i, seq);
4128 }
4129
4130 // Reluctant matching
4131 boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
4132 for (;;) {
4133 if (next.match(matcher, i, seq))
4134 return true;
4135 if (j >= cmax)
4136 return false;
4137 if (!atom.match(matcher, i, seq))
4138 return false;
4139 if (i == matcher.last)
4140 return false;
4141 if (capture) {
4142 matcher.groups[groupIndex] = i;
4143 matcher.groups[groupIndex + 1] = matcher.last;
4144 }
4145 i = matcher.last;
4146 j++;
4147 }
4148 }
4149
4150 // Possessive matching
4151 boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
4152 for (; j < cmax; j++) {
4153 if (!atom.match(matcher, i, seq)) {
4154 break;
4155 }
4156 if (capture) {
4157 matcher.groups[groupIndex] = i;
4158 matcher.groups[groupIndex + 1] = matcher.last;
4159 }
4160 if (i == matcher.last) {
4161 break;
4162 }
4163 i = matcher.last;
4164 }
4165 return next.match(matcher, i, seq);
4166 }
4167
4168 boolean study(TreeInfo info) {
4169 // Save original info
4170 int minL = info.minLength;
4171 int maxL = info.maxLength;
4172 boolean maxV = info.maxValid;
4173 boolean detm = info.deterministic;
4174 info.reset();
4175
4176 atom.study(info);
4177
4178 int temp = info.minLength * cmin + minL;
4179 if (temp < minL) {
4180 temp = 0xFFFFFFF; // Arbitrary large number
4181 }
4182 info.minLength = temp;
4183
4184 if (maxV & info.maxValid) {
4185 temp = info.maxLength * cmax + maxL;
4186 info.maxLength = temp;
4187 if (temp < maxL) {
4188 info.maxValid = false;
4189 }
4190 } else {
4191 info.maxValid = false;
4192 }
4193
4194 if (info.deterministic && cmin == cmax) {
4195 info.deterministic = detm;
4196 } else {
4197 info.deterministic = false;
4198 }
4199
4200 return next.study(info);
4201 }
4202 }
4203
4204 /**
4205 * A Guard node at the end of each atom node in a Branch. It
4206 * serves the purpose of chaining the "match" operation to
4207 * "next" but not the "study", so we can collect the TreeInfo
4208 * of each atom node without including the TreeInfo of the
4209 * "next".
4210 */
4211 static final class BranchConn extends Node {
4212 BranchConn() {
4213 };
4214
4215 boolean match(Matcher matcher, int i, CharSequence seq) {
4216 return next.match(matcher, i, seq);
4217 }
4218
4219 boolean study(TreeInfo info) {
4220 return info.deterministic;
4221 }
4222 }
4223
4224 /**
4225 * Handles the branching of alternations. Note this is also used for
4226 * the ? quantifier to branch between the case where it matches once
4227 * and where it does not occur.
4228 */
4229 static final class Branch extends Node {
4230 Node[] atoms = new Node[2];
4231 int size = 2;
4232 Node conn;
4233
4234 Branch(Node first, Node second, Node branchConn) {
4235 conn = branchConn;
4236 atoms[0] = first;
4237 atoms[1] = second;
4238 }
4239
4240 void add(Node node) {
4241 if (size >= atoms.length) {
4242 Node[] tmp = new Node[atoms.length * 2];
4243 System.arraycopy(atoms, 0, tmp, 0, atoms.length);
4244 atoms = tmp;
4245 }
4246 atoms[size++] = node;
4247 }
4248
4249 boolean match(Matcher matcher, int i, CharSequence seq) {
4250 for (int n = 0; n < size; n++) {
4251 if (atoms[n] == null) {
4252 if (conn.next.match(matcher, i, seq))
4253 return true;
4254 } else if (atoms[n].match(matcher, i, seq)) {
4255 return true;
4256 }
4257 }
4258 return false;
4259 }
4260
4261 boolean study(TreeInfo info) {
4262 int minL = info.minLength;
4263 int maxL = info.maxLength;
4264 boolean maxV = info.maxValid;
4265
4266 int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
4267 int maxL2 = -1;
4268 for (int n = 0; n < size; n++) {
4269 info.reset();
4270 if (atoms[n] != null)
4271 atoms[n].study(info);
4272 minL2 = Math.min(minL2, info.minLength);
4273 maxL2 = Math.max(maxL2, info.maxLength);
4274 maxV = (maxV & info.maxValid);
4275 }
4276
4277 minL += minL2;
4278 maxL += maxL2;
4279
4280 info.reset();
4281 conn.next.study(info);
4282
4283 info.minLength += minL;
4284 info.maxLength += maxL;
4285 info.maxValid &= maxV;
4286 info.deterministic = false;
4287 return false;
4288 }
4289 }
4290
4291 /**
4292 * The GroupHead saves the location where the group begins in the locals
4293 * and restores them when the match is done.
4294 *
4295 * The matchRef is used when a reference to this group is accessed later
4296 * in the expression. The locals will have a negative value in them to
4297 * indicate that we do not want to unset the group if the reference
4298 * doesn't match.
4299 */
4300 static final class GroupHead extends Node {
4301 int localIndex;
4302
4303 GroupHead(int localCount) {
4304 localIndex = localCount;
4305 }
4306
4307 boolean match(Matcher matcher, int i, CharSequence seq) {
4308 int save = matcher.locals[localIndex];
4309 matcher.locals[localIndex] = i;
4310 boolean ret = next.match(matcher, i, seq);
4311 matcher.locals[localIndex] = save;
4312 return ret;
4313 }
4314
4315 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4316 int save = matcher.locals[localIndex];
4317 matcher.locals[localIndex] = ~i; // HACK
4318 boolean ret = next.match(matcher, i, seq);
4319 matcher.locals[localIndex] = save;
4320 return ret;
4321 }
4322 }
4323
4324 /**
4325 * Recursive reference to a group in the regular expression. It calls
4326 * matchRef because if the reference fails to match we would not unset
4327 * the group.
4328 */
4329 static final class GroupRef extends Node {
4330 GroupHead head;
4331
4332 GroupRef(GroupHead head) {
4333 this .head = head;
4334 }
4335
4336 boolean match(Matcher matcher, int i, CharSequence seq) {
4337 return head.matchRef(matcher, i, seq)
4338 && next.match(matcher, matcher.last, seq);
4339 }
4340
4341 boolean study(TreeInfo info) {
4342 info.maxValid = false;
4343 info.deterministic = false;
4344 return next.study(info);
4345 }
4346 }
4347
4348 /**
4349 * The GroupTail handles the setting of group beginning and ending
4350 * locations when groups are successfully matched. It must also be able to
4351 * unset groups that have to be backed off of.
4352 *
4353 * The GroupTail node is also used when a previous group is referenced,
4354 * and in that case no group information needs to be set.
4355 */
4356 static final class GroupTail extends Node {
4357 int localIndex;
4358 int groupIndex;
4359
4360 GroupTail(int localCount, int groupCount) {
4361 localIndex = localCount;
4362 groupIndex = groupCount + groupCount;
4363 }
4364
4365 boolean match(Matcher matcher, int i, CharSequence seq) {
4366 int tmp = matcher.locals[localIndex];
4367 if (tmp >= 0) { // This is the normal group case.
4368 // Save the group so we can unset it if it
4369 // backs off of a match.
4370 int groupStart = matcher.groups[groupIndex];
4371 int groupEnd = matcher.groups[groupIndex + 1];
4372
4373 matcher.groups[groupIndex] = tmp;
4374 matcher.groups[groupIndex + 1] = i;
4375 if (next.match(matcher, i, seq)) {
4376 return true;
4377 }
4378 matcher.groups[groupIndex] = groupStart;
4379 matcher.groups[groupIndex + 1] = groupEnd;
4380 return false;
4381 } else {
4382 // This is a group reference case. We don't need to save any
4383 // group info because it isn't really a group.
4384 matcher.last = i;
4385 return true;
4386 }
4387 }
4388 }
4389
4390 /**
4391 * This sets up a loop to handle a recursive quantifier structure.
4392 */
4393 static final class Prolog extends Node {
4394 Loop loop;
4395
4396 Prolog(Loop loop) {
4397 this .loop = loop;
4398 }
4399
4400 boolean match(Matcher matcher, int i, CharSequence seq) {
4401 return loop.matchInit(matcher, i, seq);
4402 }
4403
4404 boolean study(TreeInfo info) {
4405 return loop.study(info);
4406 }
4407 }
4408
4409 /**
4410 * Handles the repetition count for a greedy Curly. The matchInit
4411 * is called from the Prolog to save the index of where the group
4412 * beginning is stored. A zero length group check occurs in the
4413 * normal match but is skipped in the matchInit.
4414 */
4415 static class Loop extends Node {
4416 Node body;
4417 int countIndex; // local count index in matcher locals
4418 int beginIndex; // group beginning index
4419 int cmin, cmax;
4420
4421 Loop(int countIndex, int beginIndex) {
4422 this .countIndex = countIndex;
4423 this .beginIndex = beginIndex;
4424 }
4425
4426 boolean match(Matcher matcher, int i, CharSequence seq) {
4427 // Avoid infinite loop in zero-length case.
4428 if (i > matcher.locals[beginIndex]) {
4429 int count = matcher.locals[countIndex];
4430
4431 // This block is for before we reach the minimum
4432 // iterations required for the loop to match
4433 if (count < cmin) {
4434 matcher.locals[countIndex] = count + 1;
4435 boolean b = body.match(matcher, i, seq);
4436 // If match failed we must backtrack, so
4437 // the loop count should NOT be incremented
4438 if (!b)
4439 matcher.locals[countIndex] = count;
4440 // Return success or failure since we are under
4441 // minimum
4442 return b;
4443 }
4444 // This block is for after we have the minimum
4445 // iterations required for the loop to match
4446 if (count < cmax) {
4447 matcher.locals[countIndex] = count + 1;
4448 boolean b = body.match(matcher, i, seq);
4449 // If match failed we must backtrack, so
4450 // the loop count should NOT be incremented
4451 if (!b)
4452 matcher.locals[countIndex] = count;
4453 else
4454 return true;
4455 }
4456 }
4457 return next.match(matcher, i, seq);
4458 }
4459
4460 boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4461 int save = matcher.locals[countIndex];
4462 boolean ret = false;
4463 if (0 < cmin) {
4464 matcher.locals[countIndex] = 1;
4465 ret = body.match(matcher, i, seq);
4466 } else if (0 < cmax) {
4467 matcher.locals[countIndex] = 1;
4468 ret = body.match(matcher, i, seq);
4469 if (ret == false)
4470 ret = next.match(matcher, i, seq);
4471 } else {
4472 ret = next.match(matcher, i, seq);
4473 }
4474 matcher.locals[countIndex] = save;
4475 return ret;
4476 }
4477
4478 boolean study(TreeInfo info) {
4479 info.maxValid = false;
4480 info.deterministic = false;
4481 return false;
4482 }
4483 }
4484
4485 /**
4486 * Handles the repetition count for a reluctant Curly. The matchInit
4487 * is called from the Prolog to save the index of where the group
4488 * beginning is stored. A zero length group check occurs in the
4489 * normal match but is skipped in the matchInit.
4490 */
4491 static final class LazyLoop extends Loop {
4492 LazyLoop(int countIndex, int beginIndex) {
4493 super (countIndex, beginIndex);
4494 }
4495
4496 boolean match(Matcher matcher, int i, CharSequence seq) {
4497 // Check for zero length group
4498 if (i > matcher.locals[beginIndex]) {
4499 int count = matcher.locals[countIndex];
4500 if (count < cmin) {
4501 matcher.locals[countIndex] = count + 1;
4502 boolean result = body.match(matcher, i, seq);
4503 // If match failed we must backtrack, so
4504 // the loop count should NOT be incremented
4505 if (!result)
4506 matcher.locals[countIndex] = count;
4507 return result;
4508 }
4509 if (next.match(matcher, i, seq))
4510 return true;
4511 if (count < cmax) {
4512 matcher.locals[countIndex] = count + 1;
4513 boolean result = body.match(matcher, i, seq);
4514 // If match failed we must backtrack, so
4515 // the loop count should NOT be incremented
4516 if (!result)
4517 matcher.locals[countIndex] = count;
4518 return result;
4519 }
4520 return false;
4521 }
4522 return next.match(matcher, i, seq);
4523 }
4524
4525 boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4526 int save = matcher.locals[countIndex];
4527 boolean ret = false;
4528 if (0 < cmin) {
4529 matcher.locals[countIndex] = 1;
4530 ret = body.match(matcher, i, seq);
4531 } else if (next.match(matcher, i, seq)) {
4532 ret = true;
4533 } else if (0 < cmax) {
4534 matcher.locals[countIndex] = 1;
4535 ret = body.match(matcher, i, seq);
4536 }
4537 matcher.locals[countIndex] = save;
4538 return ret;
4539 }
4540
4541 boolean study(TreeInfo info) {
4542 info.maxValid = false;
4543 info.deterministic = false;
4544 return false;
4545 }
4546 }
4547
4548 /**
4549 * Refers to a group in the regular expression. Attempts to match
4550 * whatever the group referred to last matched.
4551 */
4552 static class BackRef extends Node {
4553 int groupIndex;
4554
4555 BackRef(int groupCount) {
4556 super ();
4557 groupIndex = groupCount + groupCount;
4558 }
4559
4560 boolean match(Matcher matcher, int i, CharSequence seq) {
4561 int j = matcher.groups[groupIndex];
4562 int k = matcher.groups[groupIndex + 1];
4563
4564 int groupSize = k - j;
4565
4566 // If the referenced group didn't match, neither can this
4567 if (j < 0)
4568 return false;
4569
4570 // If there isn't enough input left no match
4571 if (i + groupSize > matcher.to) {
4572 matcher.hitEnd = true;
4573 return false;
4574 }
4575
4576 // Check each new char to make sure it matches what the group
4577 // referenced matched last time around
4578 for (int index = 0; index < groupSize; index++)
4579 if (seq.charAt(i + index) != seq.charAt(j + index))
4580 return false;
4581
4582 return next.match(matcher, i + groupSize, seq);
4583 }
4584
4585 boolean study(TreeInfo info) {
4586 info.maxValid = false;
4587 return next.study(info);
4588 }
4589 }
4590
4591 static class CIBackRef extends Node {
4592 int groupIndex;
4593 boolean doUnicodeCase;
4594
4595 CIBackRef(int groupCount, boolean doUnicodeCase) {
4596 super ();
4597 groupIndex = groupCount + groupCount;
4598 this .doUnicodeCase = doUnicodeCase;
4599 }
4600
4601 boolean match(Matcher matcher, int i, CharSequence seq) {
4602 int j = matcher.groups[groupIndex];
4603 int k = matcher.groups[groupIndex + 1];
4604
4605 int groupSize = k - j;
4606
4607 // If the referenced group didn't match, neither can this
4608 if (j < 0)
4609 return false;
4610
4611 // If there isn't enough input left no match
4612 if (i + groupSize > matcher.to) {
4613 matcher.hitEnd = true;
4614 return false;
4615 }
4616
4617 // Check each new char to make sure it matches what the group
4618 // referenced matched last time around
4619 int x = i;
4620 for (int index = 0; index < groupSize; index++) {
4621 int c1 = Character.codePointAt(seq, x);
4622 int c2 = Character.codePointAt(seq, j);
4623 if (c1 != c2) {
4624 if (doUnicodeCase) {
4625 int cc1 = Character.toUpperCase(c1);
4626 int cc2 = Character.toUpperCase(c2);
4627 if (cc1 != cc2
4628 && Character.toLowerCase(cc1) != Character
4629 .toLowerCase(cc2))
4630 return false;
4631 } else {
4632 if (ASCII.toLower(c1) != ASCII.toLower(c2))
4633 return false;
4634 }
4635 }
4636 x += Character.charCount(c1);
4637 j += Character.charCount(c2);
4638 }
4639
4640 return next.match(matcher, i + groupSize, seq);
4641 }
4642
4643 boolean study(TreeInfo info) {
4644 info.maxValid = false;
4645 return next.study(info);
4646 }
4647 }
4648
4649 /**
4650 * Searches until the next instance of its atom. This is useful for
4651 * finding the atom efficiently without passing an instance of it
4652 * (greedy problem) and without a lot of wasted search time (reluctant
4653 * problem).
4654 */
4655 static final class First extends Node {
4656 Node atom;
4657
4658 First(Node node) {
4659 this .atom = BnM.optimize(node);
4660 }
4661
4662 boolean match(Matcher matcher, int i, CharSequence seq) {
4663 if (atom instanceof BnM) {
4664 return atom.match(matcher, i, seq)
4665 && next.match(matcher, matcher.last, seq);
4666 }
4667 for (;;) {
4668 if (i > matcher.to) {
4669 matcher.hitEnd = true;
4670 return false;
4671 }
4672 if (atom.match(matcher, i, seq)) {
4673 return next.match(matcher, matcher.last, seq);
4674 }
4675 i += countChars(seq, i, 1);
4676 matcher.first++;
4677 }
4678 }
4679
4680 boolean study(TreeInfo info) {
4681 atom.study(info);
4682 info.maxValid = false;
4683 info.deterministic = false;
4684 return next.study(info);
4685 }
4686 }
4687
4688 static final class Conditional extends Node {
4689 Node cond, yes, not;
4690
4691 Conditional(Node cond, Node yes, Node not) {
4692 this .cond = cond;
4693 this .yes = yes;
4694 this .not = not;
4695 }
4696
4697 boolean match(Matcher matcher, int i, CharSequence seq) {
4698 if (cond.match(matcher, i, seq)) {
4699 return yes.match(matcher, i, seq);
4700 } else {
4701 return not.match(matcher, i, seq);
4702 }
4703 }
4704
4705 boolean study(TreeInfo info) {
4706 int minL = info.minLength;
4707 int maxL = info.maxLength;
4708 boolean maxV = info.maxValid;
4709 info.reset();
4710 yes.study(info);
4711
4712 int minL2 = info.minLength;
4713 int maxL2 = info.maxLength;
4714 boolean maxV2 = info.maxValid;
4715 info.reset();
4716 not.study(info);
4717
4718 info.minLength = minL + Math.min(minL2, info.minLength);
4719 info.maxLength = maxL + Math.max(maxL2, info.maxLength);
4720 info.maxValid = (maxV & maxV2 & info.maxValid);
4721 info.deterministic = false;
4722 return next.study(info);
4723 }
4724 }
4725
4726 /**
4727 * Zero width positive lookahead.
4728 */
4729 static final class Pos extends Node {
4730 Node cond;
4731
4732 Pos(Node cond) {
4733 this .cond = cond;
4734 }
4735
4736 boolean match(Matcher matcher, int i, CharSequence seq) {
4737 int savedTo = matcher.to;
4738 boolean conditionMatched = false;
4739
4740 // Relax transparent region boundaries for lookahead
4741 if (matcher.transparentBounds)
4742 matcher.to = matcher.getTextLength();
4743 try {
4744 conditionMatched = cond.match(matcher, i, seq);
4745 } finally {
4746 // Reinstate region boundaries
4747 matcher.to = savedTo;
4748 }
4749 return conditionMatched && next.match(matcher, i, seq);
4750 }
4751 }
4752
4753 /**
4754 * Zero width negative lookahead.
4755 */
4756 static final class Neg extends Node {
4757 Node cond;
4758
4759 Neg(Node cond) {
4760 this .cond = cond;
4761 }
4762
4763 boolean match(Matcher matcher, int i, CharSequence seq) {
4764 int savedTo = matcher.to;
4765 boolean conditionMatched = false;
4766
4767 // Relax transparent region boundaries for lookahead
4768 if (matcher.transparentBounds)
4769 matcher.to = matcher.getTextLength();
4770 try {
4771 if (i < matcher.to) {
4772 conditionMatched = !cond.match(matcher, i, seq);
4773 } else {
4774 // If a negative lookahead succeeds then more input
4775 // could cause it to fail!
4776 matcher.requireEnd = true;
4777 conditionMatched = !cond.match(matcher, i, seq);
4778 }
4779 } finally {
4780 // Reinstate region boundaries
4781 matcher.to = savedTo;
4782 }
4783 return conditionMatched && next.match(matcher, i, seq);
4784 }
4785 }
4786
4787 /**
4788 * For use with lookbehinds; matches the position where the lookbehind
4789 * was encountered.
4790 */
4791 static Node lookbehindEnd = new Node() {
4792 boolean match(Matcher matcher, int i, CharSequence seq) {
4793 return i == matcher.lookbehindTo;
4794 }
4795 };
4796
4797 /**
4798 * Zero width positive lookbehind.
4799 */
4800 static class Behind extends Node {
4801 Node cond;
4802 int rmax, rmin;
4803
4804 Behind(Node cond, int rmax, int rmin) {
4805 this .cond = cond;
4806 this .rmax = rmax;
4807 this .rmin = rmin;
4808 }
4809
4810 boolean match(Matcher matcher, int i, CharSequence seq) {
4811 int savedFrom = matcher.from;
4812 boolean conditionMatched = false;
4813 int startIndex = (!matcher.transparentBounds) ? matcher.from
4814 : 0;
4815 int from = Math.max(i - rmax, startIndex);
4816 // Set end boundary
4817 int savedLBT = matcher.lookbehindTo;
4818 matcher.lookbehindTo = i;
4819 // Relax transparent region boundaries for lookbehind
4820 if (matcher.transparentBounds)
4821 matcher.from = 0;
4822 for (int j = i - rmin; !conditionMatched && j >= from; j--) {
4823 conditionMatched = cond.match(matcher, j, seq);
4824 }
4825 matcher.from = savedFrom;
4826 matcher.lookbehindTo = savedLBT;
4827 return conditionMatched && next.match(matcher, i, seq);
4828 }
4829 }
4830
4831 /**
4832 * Zero width positive lookbehind, including supplementary
4833 * characters or unpaired surrogates.
4834 */
4835 static final class BehindS extends Behind {
4836 BehindS(Node cond, int rmax, int rmin) {
4837 super (cond, rmax, rmin);
4838 }
4839
4840 boolean match(Matcher matcher, int i, CharSequence seq) {
4841 int rmaxChars = countChars(seq, i, -rmax);
4842 int rminChars = countChars(seq, i, -rmin);
4843 int savedFrom = matcher.from;
4844 int startIndex = (!matcher.transparentBounds) ? matcher.from
4845 : 0;
4846 boolean conditionMatched = false;
4847 int from = Math.max(i - rmaxChars, startIndex);
4848 // Set end boundary
4849 int savedLBT = matcher.lookbehindTo;
4850 matcher.lookbehindTo = i;
4851 // Relax transparent region boundaries for lookbehind
4852 if (matcher.transparentBounds)
4853 matcher.from = 0;
4854
4855 for (int j = i - rminChars; !conditionMatched && j >= from; j -= j > from ? countChars(
4856 seq, j, -1)
4857 : 1) {
4858 conditionMatched = cond.match(matcher, j, seq);
4859 }
4860 matcher.from = savedFrom;
4861 matcher.lookbehindTo = savedLBT;
4862 return conditionMatched && next.match(matcher, i, seq);
4863 }
4864 }
4865
4866 /**
4867 * Zero width negative lookbehind.
4868 */
4869 static class NotBehind extends Node {
4870 Node cond;
4871 int rmax, rmin;
4872
4873 NotBehind(Node cond, int rmax, int rmin) {
4874 this .cond = cond;
4875 this .rmax = rmax;
4876 this .rmin = rmin;
4877 }
4878
4879 boolean match(Matcher matcher, int i, CharSequence seq) {
4880 int savedLBT = matcher.lookbehindTo;
4881 int savedFrom = matcher.from;
4882 boolean conditionMatched = false;
4883 int startIndex = (!matcher.transparentBounds) ? matcher.from
4884 : 0;
4885 int from = Math.max(i - rmax, startIndex);
4886 matcher.lookbehindTo = i;
4887 // Relax transparent region boundaries for lookbehind
4888 if (matcher.transparentBounds)
4889 matcher.from = 0;
4890 for (int j = i - rmin; !conditionMatched && j >= from; j--) {
4891 conditionMatched = cond.match(matcher, j, seq);
4892 }
4893 // Reinstate region boundaries
4894 matcher.from = savedFrom;
4895 matcher.lookbehindTo = savedLBT;
4896 return !conditionMatched && next.match(matcher, i, seq);
4897 }
4898 }
4899
4900 /**
4901 * Zero width negative lookbehind, including supplementary
4902 * characters or unpaired surrogates.
4903 */
4904 static final class NotBehindS extends NotBehind {
4905 NotBehindS(Node cond, int rmax, int rmin) {
4906 super (cond, rmax, rmin);
4907 }
4908
4909 boolean match(Matcher matcher, int i, CharSequence seq) {
4910 int rmaxChars = countChars(seq, i, -rmax);
4911 int rminChars = countChars(seq, i, -rmin);
4912 int savedFrom = matcher.from;
4913 int savedLBT = matcher.lookbehindTo;
4914 boolean conditionMatched = false;
4915 int startIndex = (!matcher.transparentBounds) ? matcher.from
4916 : 0;
4917 int from = Math.max(i - rmaxChars, startIndex);
4918 matcher.lookbehindTo = i;
4919 // Relax transparent region boundaries for lookbehind
4920 if (matcher.transparentBounds)
4921 matcher.from = 0;
4922 for (int j = i - rminChars; !conditionMatched && j >= from; j -= j > from ? countChars(
4923 seq, j, -1)
4924 : 1) {
4925 conditionMatched = cond.match(matcher, j, seq);
4926 }
4927 //Reinstate region boundaries
4928 matcher.from = savedFrom;
4929 matcher.lookbehindTo = savedLBT;
4930 return !conditionMatched && next.match(matcher, i, seq);
4931 }
4932 }
4933
4934 /**
4935 * Returns the set union of two CharProperty nodes.
4936 */
4937 private static CharProperty union(final CharProperty lhs,
4938 final CharProperty rhs) {
4939 return new CharProperty() {
4940 boolean isSatisfiedBy(int ch) {
4941 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);
4942 }
4943 };
4944 }
4945
4946 /**
4947 * Returns the set intersection of two CharProperty nodes.
4948 */
4949 private static CharProperty intersection(final CharProperty lhs,
4950 final CharProperty rhs) {
4951 return new CharProperty() {
4952 boolean isSatisfiedBy(int ch) {
4953 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);
4954 }
4955 };
4956 }
4957
4958 /**
4959 * Returns the set difference of two CharProperty nodes.
4960 */
4961 private static CharProperty setDifference(final CharProperty lhs,
4962 final CharProperty rhs) {
4963 return new CharProperty() {
4964 boolean isSatisfiedBy(int ch) {
4965 return !rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);
4966 }
4967 };
4968 }
4969
4970 /**
4971 * Handles word boundaries. Includes a field to allow this one class to
4972 * deal with the different types of word boundaries we can match. The word
4973 * characters include underscores, letters, and digits. Non spacing marks
4974 * can are also part of a word if they have a base character, otherwise
4975 * they are ignored for purposes of finding word boundaries.
4976 */
4977 static final class Bound extends Node {
4978 static int LEFT = 0x1;
4979 static int RIGHT = 0x2;
4980 static int BOTH = 0x3;
4981 static int NONE = 0x4;
4982 int type;
4983
4984 Bound(int n) {
4985 type = n;
4986 }
4987
4988 int check(Matcher matcher, int i, CharSequence seq) {
4989 int ch;
4990 boolean left = false;
4991 int startIndex = matcher.from;
4992 int endIndex = matcher.to;
4993 if (matcher.transparentBounds) {
4994 startIndex = 0;
4995 endIndex = matcher.getTextLength();
4996 }
4997 if (i > startIndex) {
4998 ch = Character.codePointBefore(seq, i);
4999 left = (ch == '_' || Character.isLetterOrDigit(ch) || ((Character
5000 .getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(
5001 matcher, i - 1, seq)));
5002 }
5003 boolean right = false;
5004 if (i < endIndex) {
5005 ch = Character.codePointAt(seq, i);
5006 right = (ch == '_' || Character.isLetterOrDigit(ch) || ((Character
5007 .getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(
5008 matcher, i, seq)));
5009 } else {
5010 // Tried to access char past the end
5011 matcher.hitEnd = true;
5012 // The addition of another char could wreck a boundary
5013 matcher.requireEnd = true;
5014 }
5015 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
5016 }
5017
5018 boolean match(Matcher matcher, int i, CharSequence seq) {
5019 return (check(matcher, i, seq) & type) > 0
5020 && next.match(matcher, i, seq);
5021 }
5022 }
5023
5024 /**
5025 * Non spacing marks only count as word characters in bounds calculations
5026 * if they have a base character.
5027 */
5028 private static boolean hasBaseCharacter(Matcher matcher, int i,
5029 CharSequence seq) {
5030 int start = (!matcher.transparentBounds) ? matcher.from : 0;
5031 for (int x = i; x >= start; x--) {
5032 int ch = Character.codePointAt(seq, x);
5033 if (Character.isLetterOrDigit(ch))
5034 return true;
5035 if (Character.getType(ch) == Character.NON_SPACING_MARK)
5036 continue;
5037 return false;
5038 }
5039 return false;
5040 }
5041
5042 /**
5043 * Attempts to match a slice in the input using the Boyer-Moore string
5044 * matching algorithm. The algorithm is based on the idea that the
5045 * pattern can be shifted farther ahead in the search text if it is
5046 * matched right to left.
5047 * <p>
5048 * The pattern is compared to the input one character at a time, from
5049 * the rightmost character in the pattern to the left. If the characters
5050 * all match the pattern has been found. If a character does not match,
5051 * the pattern is shifted right a distance that is the maximum of two
5052 * functions, the bad character shift and the good suffix shift. This
5053 * shift moves the attempted match position through the input more
5054 * quickly than a naive one position at a time check.
5055 * <p>
5056 * The bad character shift is based on the character from the text that
5057 * did not match. If the character does not appear in the pattern, the
5058 * pattern can be shifted completely beyond the bad character. If the
5059 * character does occur in the pattern, the pattern can be shifted to
5060 * line the pattern up with the next occurrence of that character.
5061 * <p>
5062 * The good suffix shift is based on the idea that some subset on the right
5063 * side of the pattern has matched. When a bad character is found, the
5064 * pattern can be shifted right by the pattern length if the subset does
5065 * not occur again in pattern, or by the amount of distance to the
5066 * next occurrence of the subset in the pattern.
5067 *
5068 * Boyer-Moore search methods adapted from code by Amy Yu.
5069 */
5070 static class BnM extends Node {
5071 int[] buffer;
5072 int[] lastOcc;
5073 int[] optoSft;
5074
5075 /**
5076 * Pre calculates arrays needed to generate the bad character
5077 * shift and the good suffix shift. Only the last seven bits
5078 * are used to see if chars match; This keeps the tables small
5079 * and covers the heavily used ASCII range, but occasionally
5080 * results in an aliased match for the bad character shift.
5081 */
5082 static Node optimize(Node node) {
5083 if (!(node instanceof Slice)) {
5084 return node;
5085 }
5086
5087 int[] src = ((Slice) node).buffer;
5088 int patternLength = src.length;
5089 // The BM algorithm requires a bit of overhead;
5090 // If the pattern is short don't use it, since
5091 // a shift larger than the pattern length cannot
5092 // be used anyway.
5093 if (patternLength < 4) {
5094 return node;
5095 }
5096 int i, j, k;
5097 int[] lastOcc = new int[128];
5098 int[] optoSft = new int[patternLength];
5099 // Precalculate part of the bad character shift
5100 // It is a table for where in the pattern each
5101 // lower 7-bit value occurs
5102 for (i = 0; i < patternLength; i++) {
5103 lastOcc[src[i] & 0x7F] = i + 1;
5104 }
5105 // Precalculate the good suffix shift
5106 // i is the shift amount being considered
5107 NEXT: for (i = patternLength; i > 0; i--) {
5108 // j is the beginning index of suffix being considered
5109 for (j = patternLength - 1; j >= i; j--) {
5110 // Testing for good suffix
5111 if (src[j] == src[j - i]) {
5112 // src[j..len] is a good suffix
5113 optoSft[j - 1] = i;
5114 } else {
5115 // No match. The array has already been
5116 // filled up with correct values before.
5117 continue NEXT;
5118 }
5119 }
5120 // This fills up the remaining of optoSft
5121 // any suffix can not have larger shift amount
5122 // then its sub-suffix. Why???
5123 while (j > 0) {
5124 optoSft[--j] = i;
5125 }
5126 }
5127 // Set the guard value because of unicode compression
5128 optoSft[patternLength - 1] = 1;
5129 if (node instanceof SliceS)
5130 return new BnMS(src, lastOcc, optoSft, node.next);
5131 return new BnM(src, lastOcc, optoSft, node.next);
5132 }
5133
5134 BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
5135 this .buffer = src;
5136 this .lastOcc = lastOcc;
5137 this .optoSft = optoSft;
5138 this .next = next;
5139 }
5140
5141 boolean match(Matcher matcher, int i, CharSequence seq) {
5142 int[] src = buffer;
5143 int patternLength = src.length;
5144 int last = matcher.to - patternLength;
5145
5146 // Loop over all possible match positions in text
5147 NEXT: while (i <= last) {
5148 // Loop over pattern from right to left
5149 for (int j = patternLength - 1; j >= 0; j--) {
5150 int ch = seq.charAt(i + j);
5151 if (ch != src[j]) {
5152 // Shift search to the right by the maximum of the
5153 // bad character shift and the good suffix shift
5154 i += Math.max(j + 1 - lastOcc[ch & 0x7F],
5155 optoSft[j]);
5156 continue NEXT;
5157 }
5158 }
5159 // Entire pattern matched starting at i
5160 matcher.first = i;
5161 boolean ret = next.match(matcher, i + patternLength,
5162 seq);
5163 if (ret) {
5164 matcher.first = i;
5165 matcher.groups[0] = matcher.first;
5166 matcher.groups[1] = matcher.last;
5167 return true;
5168 }
5169 i++;
5170 }
5171 // BnM is only used as the leading node in the unanchored case,
5172 // and it replaced its Start() which always searches to the end
5173 // if it doesn't find what it's looking for, so hitEnd is true.
5174 matcher.hitEnd = true;
5175 return false;
5176 }
5177
5178 boolean study(TreeInfo info) {
5179 info.minLength += buffer.length;
5180 info.maxValid = false;
5181 return next.study(info);
5182 }
5183 }
5184
5185 /**
5186 * Supplementary support version of BnM(). Unpaired surrogates are
5187 * also handled by this class.
5188 */
5189 static final class BnMS extends BnM {
5190 int lengthInChars;
5191
5192 BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
5193 super (src, lastOcc, optoSft, next);
5194 for (int x = 0; x < buffer.length; x++) {
5195 lengthInChars += Character.charCount(buffer[x]);
5196 }
5197 }
5198
5199 boolean match(Matcher matcher, int i, CharSequence seq) {
5200 int[] src = buffer;
5201 int patternLength = src.length;
5202 int last = matcher.to - lengthInChars;
5203
5204 // Loop over all possible match positions in text
5205 NEXT: while (i <= last) {
5206 // Loop over pattern from right to left
5207 int ch;
5208 for (int j = countChars(seq, i, patternLength), x = patternLength - 1; j > 0; j -= Character
5209 .charCount(ch), x--) {
5210 ch = Character.codePointBefore(seq, i + j);
5211 if (ch != src[x]) {
5212 // Shift search to the right by the maximum of the
5213 // bad character shift and the good suffix shift
5214 int n = Math.max(x + 1 - lastOcc[ch & 0x7F],
5215 optoSft[x]);
5216 i += countChars(seq, i, n);
5217 continue NEXT;
5218 }
5219 }
5220 // Entire pattern matched starting at i
5221 matcher.first = i;
5222 boolean ret = next.match(matcher, i + lengthInChars,
5223 seq);
5224 if (ret) {
5225 matcher.first = i;
5226 matcher.groups[0] = matcher.first;
5227 matcher.groups[1] = matcher.last;
5228 return true;
5229 }
5230 i += countChars(seq, i, 1);
5231 }
5232 matcher.hitEnd = true;
5233 return false;
5234 }
5235 }
5236
5237 ///////////////////////////////////////////////////////////////////////////////
5238 ///////////////////////////////////////////////////////////////////////////////
5239
5240 /**
5241 * This must be the very first initializer.
5242 */
5243 static Node accept = new Node();
5244
5245 static Node lastAccept = new LastNode();
5246
5247 private static class CharPropertyNames {
5248
5249 static CharProperty charPropertyFor(String name) {
5250 CharPropertyFactory m = map.get(name);
5251 return m == null ? null : m.make();
5252 }
5253
5254 private static abstract class CharPropertyFactory {
5255 abstract CharProperty make();
5256 }
5257
5258 private static void defCategory(String name, final int typeMask) {
5259 map.put(name, new CharPropertyFactory() {
5260 CharProperty make() {
5261 return new Category(typeMask);
5262 }
5263 });
5264 }
5265
5266 private static void defRange(String name, final int lower,
5267 final int upper) {
5268 map.put(name, new CharPropertyFactory() {
5269 CharProperty make() {
5270 return rangeFor(lower, upper);
5271 }
5272 });
5273 }
5274
5275 private static void defCtype(String name, final int ctype) {
5276 map.put(name, new CharPropertyFactory() {
5277 CharProperty make() {
5278 return new Ctype(ctype);
5279 }
5280 });
5281 }
5282
5283 private static abstract class CloneableProperty extends
5284 CharProperty implements Cloneable {
5285 public CloneableProperty clone() {
5286 try {
5287 return (CloneableProperty) super .clone();
5288 } catch (CloneNotSupportedException e) {
5289 throw new AssertionError(e);
5290 }
5291 }
5292 }
5293
5294 private static void defClone(String name,
5295 final CloneableProperty p) {
5296 map.put(name, new CharPropertyFactory() {
5297 CharProperty make() {
5298 return p.clone();
5299 }
5300 });
5301 }
5302
5303 private static final HashMap<String, CharPropertyFactory> map = new HashMap<String, CharPropertyFactory>();
5304
5305 static {
5306 // Unicode character property aliases, defined in
5307 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5308 defCategory("Cn", 1 << Character.UNASSIGNED);
5309 defCategory("Lu", 1 << Character.UPPERCASE_LETTER);
5310 defCategory("Ll", 1 << Character.LOWERCASE_LETTER);
5311 defCategory("Lt", 1 << Character.TITLECASE_LETTER);
5312 defCategory("Lm", 1 << Character.MODIFIER_LETTER);
5313 defCategory("Lo", 1 << Character.OTHER_LETTER);
5314 defCategory("Mn", 1 << Character.NON_SPACING_MARK);
5315 defCategory("Me", 1 << Character.ENCLOSING_MARK);
5316 defCategory("Mc", 1 << Character.COMBINING_SPACING_MARK);
5317 defCategory("Nd", 1 << Character.DECIMAL_DIGIT_NUMBER);
5318 defCategory("Nl", 1 << Character.LETTER_NUMBER);
5319 defCategory("No", 1 << Character.OTHER_NUMBER);
5320 defCategory("Zs", 1 << Character.SPACE_SEPARATOR);
5321 defCategory("Zl", 1 << Character.LINE_SEPARATOR);
5322 defCategory("Zp", 1 << Character.PARAGRAPH_SEPARATOR);
5323 defCategory("Cc", 1 << Character.CONTROL);
5324 defCategory("Cf", 1 << Character.FORMAT);
5325 defCategory("Co", 1 << Character.PRIVATE_USE);
5326 defCategory("Cs", 1 << Character.SURROGATE);
5327 defCategory("Pd", 1 << Character.DASH_PUNCTUATION);
5328 defCategory("Ps", 1 << Character.START_PUNCTUATION);
5329 defCategory("Pe", 1 << Character.END_PUNCTUATION);
5330 defCategory("Pc", 1 << Character.CONNECTOR_PUNCTUATION);
5331 defCategory("Po", 1 << Character.OTHER_PUNCTUATION);
5332 defCategory("Sm", 1 << Character.MATH_SYMBOL);
5333 defCategory("Sc", 1 << Character.CURRENCY_SYMBOL);
5334 defCategory("Sk", 1 << Character.MODIFIER_SYMBOL);
5335 defCategory("So", 1 << Character.OTHER_SYMBOL);
5336 defCategory("Pi", 1 << Character.INITIAL_QUOTE_PUNCTUATION);
5337 defCategory("Pf", 1 << Character.FINAL_QUOTE_PUNCTUATION);
5338 defCategory(
5339 "L",
5340 ((1 << Character.UPPERCASE_LETTER)
5341 | (1 << Character.LOWERCASE_LETTER)
5342 | (1 << Character.TITLECASE_LETTER)
5343 | (1 << Character.MODIFIER_LETTER) | (1 << Character.OTHER_LETTER)));
5344 defCategory(
5345 "M",
5346 ((1 << Character.NON_SPACING_MARK)
5347 | (1 << Character.ENCLOSING_MARK) | (1 << Character.COMBINING_SPACING_MARK)));
5348 defCategory(
5349 "N",
5350 ((1 << Character.DECIMAL_DIGIT_NUMBER)
5351 | (1 << Character.LETTER_NUMBER) | (1 << Character.OTHER_NUMBER)));
5352 defCategory(
5353 "Z",
5354 ((1 << Character.SPACE_SEPARATOR)
5355 | (1 << Character.LINE_SEPARATOR) | (1 << Character.PARAGRAPH_SEPARATOR)));
5356 defCategory(
5357 "C",
5358 ((1 << Character.CONTROL) | (1 << Character.FORMAT)
5359 | (1 << Character.PRIVATE_USE) | (1 << Character.SURROGATE))); // Other
5360 defCategory(
5361 "P",
5362 ((1 << Character.DASH_PUNCTUATION)
5363 | (1 << Character.START_PUNCTUATION)
5364 | (1 << Character.END_PUNCTUATION)
5365 | (1 << Character.CONNECTOR_PUNCTUATION)
5366 | (1 << Character.OTHER_PUNCTUATION)
5367 | (1 << Character.INITIAL_QUOTE_PUNCTUATION) | (1 << Character.FINAL_QUOTE_PUNCTUATION)));
5368 defCategory(
5369 "S",
5370 ((1 << Character.MATH_SYMBOL)
5371 | (1 << Character.CURRENCY_SYMBOL)
5372 | (1 << Character.MODIFIER_SYMBOL) | (1 << Character.OTHER_SYMBOL)));
5373 defCategory(
5374 "LC",
5375 ((1 << Character.UPPERCASE_LETTER)
5376 | (1 << Character.LOWERCASE_LETTER) | (1 << Character.TITLECASE_LETTER)));
5377 defCategory(
5378 "LD",
5379 ((1 << Character.UPPERCASE_LETTER)
5380 | (1 << Character.LOWERCASE_LETTER)
5381 | (1 << Character.TITLECASE_LETTER)
5382 | (1 << Character.MODIFIER_LETTER)
5383 | (1 << Character.OTHER_LETTER) | (1 << Character.DECIMAL_DIGIT_NUMBER)));
5384 defRange("L1", 0x00, 0xFF); // Latin-1
5385 map.put("all", new CharPropertyFactory() {
5386 CharProperty make() {
5387 return new All();
5388 }
5389 });
5390
5391 // Posix regular expression character classes, defined in
5392 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5393 defRange("ASCII", 0x00, 0x7F); // ASCII
5394 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5395 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5396 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5397 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5398 defRange("Digit", '0', '9'); // Numeric characters
5399 defCtype("Graph", ASCII.GRAPH); // printable and visible
5400 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5401 defRange("Print", 0x20, 0x7E); // Printable characters
5402 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5403 defCtype("Space", ASCII.SPACE); // Space characters
5404 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5405 defCtype("XDigit", ASCII.XDIGIT); // hexadecimal digits
5406
5407 // Java character properties, defined by methods in Character.java
5408 defClone("javaLowerCase", new CloneableProperty() {
5409 boolean isSatisfiedBy(int ch) {
5410 return Character.isLowerCase(ch);
5411 }
5412 });
5413 defClone("javaUpperCase", new CloneableProperty() {
5414 boolean isSatisfiedBy(int ch) {
5415 return Character.isUpperCase(ch);
5416 }
5417 });
5418 defClone("javaTitleCase", new CloneableProperty() {
5419 boolean isSatisfiedBy(int ch) {
5420 return Character.isTitleCase(ch);
5421 }
5422 });
5423 defClone("javaDigit", new CloneableProperty() {
5424 boolean isSatisfiedBy(int ch) {
5425 return Character.isDigit(ch);
5426 }
5427 });
5428 defClone("javaDefined", new CloneableProperty() {
5429 boolean isSatisfiedBy(int ch) {
5430 return Character.isDefined(ch);
5431 }
5432 });
5433 defClone("javaLetter", new CloneableProperty() {
5434 boolean isSatisfiedBy(int ch) {
5435 return Character.isLetter(ch);
5436 }
5437 });
5438 defClone("javaLetterOrDigit", new CloneableProperty() {
5439 boolean isSatisfiedBy(int ch) {
5440 return Character.isLetterOrDigit(ch);
5441 }
5442 });
5443 defClone("javaJavaIdentifierStart",
5444 new CloneableProperty() {
5445 boolean isSatisfiedBy(int ch) {
5446 return Character.isJavaIdentifierStart(ch);
5447 }
5448 });
5449 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5450 boolean isSatisfiedBy(int ch) {
5451 return Character.isJavaIdentifierPart(ch);
5452 }
5453 });
5454 defClone("javaUnicodeIdentifierStart",
5455 new CloneableProperty() {
5456 boolean isSatisfiedBy(int ch) {
5457 return Character
5458 .isUnicodeIdentifierStart(ch);
5459 }
5460 });
5461 defClone("javaUnicodeIdentifierPart",
5462 new CloneableProperty() {
5463 boolean isSatisfiedBy(int ch) {
5464 return Character
5465 .isUnicodeIdentifierPart(ch);
5466 }
5467 });
5468 defClone("javaIdentifierIgnorable",
5469 new CloneableProperty() {
5470 boolean isSatisfiedBy(int ch) {
5471 return Character.isIdentifierIgnorable(ch);
5472 }
5473 });
5474 defClone("javaSpaceChar", new CloneableProperty() {
5475 boolean isSatisfiedBy(int ch) {
5476 return Character.isSpaceChar(ch);
5477 }
5478 });
5479 defClone("javaWhitespace", new CloneableProperty() {
5480 boolean isSatisfiedBy(int ch) {
5481 return Character.isWhitespace(ch);
5482 }
5483 });
5484 defClone("javaISOControl", new CloneableProperty() {
5485 boolean isSatisfiedBy(int ch) {
5486 return Character.isISOControl(ch);
5487 }
5488 });
5489 defClone("javaMirrored", new CloneableProperty() {
5490 boolean isSatisfiedBy(int ch) {
5491 return Character.isMirrored(ch);
5492 }
5493 });
5494 }
5495 }
5496 }
|