0001 /*
0002 * Copyright 1999-2006 Sun Microsystems, Inc. All Rights Reserved.
0003 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0004 *
0005 * This code is free software; you can redistribute it and/or modify it
0006 * under the terms of the GNU General Public License version 2 only, as
0007 * published by the Free Software Foundation. Sun designates this
0008 * particular file as subject to the "Classpath" exception as provided
0009 * by Sun in the LICENSE file that accompanied this code.
0010 *
0011 * This code is distributed in the hope that it will be useful, but WITHOUT
0012 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0013 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0014 * version 2 for more details (a copy is included in the LICENSE file that
0015 * accompanied this code).
0016 *
0017 * You should have received a copy of the GNU General Public License version
0018 * 2 along with this work; if not, write to the Free Software Foundation,
0019 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0020 *
0021 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
0022 * CA 95054 USA or visit www.sun.com if you need additional information or
0023 * have any questions.
0024 */
0025
0026 package java.util.regex;
0027
0028 /**
0029 * An engine that performs match operations on a {@link java.lang.CharSequence
0030 * </code>character sequence<code>} by interpreting a {@link Pattern}.
0031 *
0032 * <p> A matcher is created from a pattern by invoking the pattern's {@link
0033 * Pattern#matcher matcher} method. Once created, a matcher can be used to
0034 * perform three different kinds of match operations:
0035 *
0036 * <ul>
0037 *
0038 * <li><p> The {@link #matches matches} method attempts to match the entire
0039 * input sequence against the pattern. </p></li>
0040 *
0041 * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
0042 * input sequence, starting at the beginning, against the pattern. </p></li>
0043 *
0044 * <li><p> The {@link #find find} method scans the input sequence looking for
0045 * the next subsequence that matches the pattern. </p></li>
0046 *
0047 * </ul>
0048 *
0049 * <p> Each of these methods returns a boolean indicating success or failure.
0050 * More information about a successful match can be obtained by querying the
0051 * state of the matcher.
0052 *
0053 * <p> A matcher finds matches in a subset of its input called the
0054 * <i>region</i>. By default, the region contains all of the matcher's input.
0055 * The region can be modified via the {@link #region region} method and queried
0056 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
0057 * methods. The way that the region boundaries interact with some pattern
0058 * constructs can be changed. See {@link #useAnchoringBounds
0059 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
0060 * for more details.
0061 *
0062 * <p> This class also defines methods for replacing matched subsequences with
0063 * new strings whose contents can, if desired, be computed from the match
0064 * result. The {@link #appendReplacement appendReplacement} and {@link
0065 * #appendTail appendTail} methods can be used in tandem in order to collect
0066 * the result into an existing string buffer, or the more convenient {@link
0067 * #replaceAll replaceAll} method can be used to create a string in which every
0068 * matching subsequence in the input sequence is replaced.
0069 *
0070 * <p> The explicit state of a matcher includes the start and end indices of
0071 * the most recent successful match. It also includes the start and end
0072 * indices of the input subsequence captured by each <a
0073 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
0074 * count of such subsequences. As a convenience, methods are also provided for
0075 * returning these captured subsequences in string form.
0076 *
0077 * <p> The explicit state of a matcher is initially undefined; attempting to
0078 * query any part of it before a successful match will cause an {@link
0079 * IllegalStateException} to be thrown. The explicit state of a matcher is
0080 * recomputed by every match operation.
0081 *
0082 * <p> The implicit state of a matcher includes the input character sequence as
0083 * well as the <i>append position</i>, which is initially zero and is updated
0084 * by the {@link #appendReplacement appendReplacement} method.
0085 *
0086 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
0087 * method or, if a new input sequence is desired, its {@link
0088 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
0089 * matcher discards its explicit state information and sets the append position
0090 * to zero.
0091 *
0092 * <p> Instances of this class are not safe for use by multiple concurrent
0093 * threads. </p>
0094 *
0095 *
0096 * @author Mike McCloskey
0097 * @author Mark Reinhold
0098 * @author JSR-51 Expert Group
0099 * @version 1.73, 07/05/05
0100 * @since 1.4
0101 * @spec JSR-51
0102 */
0103
0104 public final class Matcher implements MatchResult {
0105
0106 /**
0107 * The Pattern object that created this Matcher.
0108 */
0109 Pattern parentPattern;
0110
0111 /**
0112 * The storage used by groups. They may contain invalid values if
0113 * a group was skipped during the matching.
0114 */
0115 int[] groups;
0116
0117 /**
0118 * The range within the sequence that is to be matched. Anchors
0119 * will match at these "hard" boundaries. Changing the region
0120 * changes these values.
0121 */
0122 int from, to;
0123
0124 /**
0125 * Lookbehind uses this value to ensure that the subexpression
0126 * match ends at the point where the lookbehind was encountered.
0127 */
0128 int lookbehindTo;
0129
0130 /**
0131 * The original string being matched.
0132 */
0133 CharSequence text;
0134
0135 /**
0136 * Matcher state used by the last node. NOANCHOR is used when a
0137 * match does not have to consume all of the input. ENDANCHOR is
0138 * the mode used for matching all the input.
0139 */
0140 static final int ENDANCHOR = 1;
0141 static final int NOANCHOR = 0;
0142 int acceptMode = NOANCHOR;
0143
0144 /**
0145 * The range of string that last matched the pattern. If the last
0146 * match failed then first is -1; last initially holds 0 then it
0147 * holds the index of the end of the last match (which is where the
0148 * next search starts).
0149 */
0150 int first = -1, last = 0;
0151
0152 /**
0153 * The end index of what matched in the last match operation.
0154 */
0155 int oldLast = -1;
0156
0157 /**
0158 * The index of the last position appended in a substitution.
0159 */
0160 int lastAppendPosition = 0;
0161
0162 /**
0163 * Storage used by nodes to tell what repetition they are on in
0164 * a pattern, and where groups begin. The nodes themselves are stateless,
0165 * so they rely on this field to hold state during a match.
0166 */
0167 int[] locals;
0168
0169 /**
0170 * Boolean indicating whether or not more input could change
0171 * the results of the last match.
0172 *
0173 * If hitEnd is true, and a match was found, then more input
0174 * might cause a different match to be found.
0175 * If hitEnd is true and a match was not found, then more
0176 * input could cause a match to be found.
0177 * If hitEnd is false and a match was found, then more input
0178 * will not change the match.
0179 * If hitEnd is false and a match was not found, then more
0180 * input will not cause a match to be found.
0181 */
0182 boolean hitEnd;
0183
0184 /**
0185 * Boolean indicating whether or not more input could change
0186 * a positive match into a negative one.
0187 *
0188 * If requireEnd is true, and a match was found, then more
0189 * input could cause the match to be lost.
0190 * If requireEnd is false and a match was found, then more
0191 * input might change the match but the match won't be lost.
0192 * If a match was not found, then requireEnd has no meaning.
0193 */
0194 boolean requireEnd;
0195
0196 /**
0197 * If transparentBounds is true then the boundaries of this
0198 * matcher's region are transparent to lookahead, lookbehind,
0199 * and boundary matching constructs that try to see beyond them.
0200 */
0201 boolean transparentBounds = false;
0202
0203 /**
0204 * If anchoringBounds is true then the boundaries of this
0205 * matcher's region match anchors such as ^ and $.
0206 */
0207 boolean anchoringBounds = true;
0208
0209 /**
0210 * No default constructor.
0211 */
Matcher() {
    // Intentionally empty: every field keeps the initial value given at its declaration.
}
0214
0215 /**
0216 * All matchers have the state used by Pattern during a match.
0217 */
0218 Matcher(Pattern parent, CharSequence text) {
0219 this .parentPattern = parent;
0220 this .text = text;
0221
0222 // Allocate state storage
0223 int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
0224 groups = new int[parentGroupCount * 2];
0225 locals = new int[parent.localCount];
0226
0227 // Put fields into initial states
0228 reset();
0229 }
0230
0231 /**
0232 * Returns the pattern that is interpreted by this matcher.
0233 *
0234 * @return The pattern for which this matcher was created
0235 */
0236 public Pattern pattern() {
0237 return parentPattern;
0238 }
0239
0240 /**
0241 * Returns the match state of this matcher as a {@link MatchResult}.
0242 * The result is unaffected by subsequent operations performed upon this
0243 * matcher.
0244 *
0245 * @return a <code>MatchResult</code> with the state of this matcher
0246 * @since 1.5
0247 */
0248 public MatchResult toMatchResult() {
0249 Matcher result = new Matcher(this .parentPattern, text
0250 .toString());
0251 result.first = this .first;
0252 result.last = this .last;
0253 result.groups = (int[]) (this .groups.clone());
0254 return result;
0255 }
0256
0257 /**
0258 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
0259 * find matches with.
0260 *
0261 * <p> This method causes this matcher to lose information
0262 * about the groups of the last match that occurred. The
0263 * matcher's position in the input is maintained and its
0264 * last append position is unaffected.</p>
0265 *
0266 * @param newPattern
0267 * The new pattern used by this matcher
0268 * @return This matcher
0269 * @throws IllegalArgumentException
0270 * If newPattern is <tt>null</tt>
0271 * @since 1.5
0272 */
0273 public Matcher usePattern(Pattern newPattern) {
0274 if (newPattern == null)
0275 throw new IllegalArgumentException("Pattern cannot be null");
0276 parentPattern = newPattern;
0277
0278 // Reallocate state storage
0279 int parentGroupCount = Math.max(newPattern.capturingGroupCount,
0280 10);
0281 groups = new int[parentGroupCount * 2];
0282 locals = new int[newPattern.localCount];
0283 for (int i = 0; i < groups.length; i++)
0284 groups[i] = -1;
0285 for (int i = 0; i < locals.length; i++)
0286 locals[i] = -1;
0287 return this ;
0288 }
0289
0290 /**
0291 * Resets this matcher.
0292 *
0293 * <p> Resetting a matcher discards all of its explicit state information
0294 * and sets its append position to zero. The matcher's region is set to the
0295 * default region, which is its entire character sequence. The anchoring
0296 * and transparency of this matcher's region boundaries are unaffected.
0297 *
0298 * @return This matcher
0299 */
0300 public Matcher reset() {
0301 first = -1;
0302 last = 0;
0303 oldLast = -1;
0304 for (int i = 0; i < groups.length; i++)
0305 groups[i] = -1;
0306 for (int i = 0; i < locals.length; i++)
0307 locals[i] = -1;
0308 lastAppendPosition = 0;
0309 from = 0;
0310 to = getTextLength();
0311 return this ;
0312 }
0313
0314 /**
0315 * Resets this matcher with a new input sequence.
0316 *
0317 * <p> Resetting a matcher discards all of its explicit state information
0318 * and sets its append position to zero. The matcher's region is set to
0319 * the default region, which is its entire character sequence. The
0320 * anchoring and transparency of this matcher's region boundaries are
0321 * unaffected.
0322 *
0323 * @param input
0324 * The new input character sequence
0325 *
0326 * @return This matcher
0327 */
0328 public Matcher reset(CharSequence input) {
0329 text = input;
0330 return reset();
0331 }
0332
0333 /**
0334 * Returns the start index of the previous match.
0335 *
0336 * @return The index of the first character matched
0337 *
0338 * @throws IllegalStateException
0339 * If no match has yet been attempted,
0340 * or if the previous match operation failed
0341 */
0342 public int start() {
0343 if (first < 0)
0344 throw new IllegalStateException("No match available");
0345 return first;
0346 }
0347
0348 /**
0349 * Returns the start index of the subsequence captured by the given group
0350 * during the previous match operation.
0351 *
0352 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
0353 * to right, starting at one. Group zero denotes the entire pattern, so
0354 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
0355 * <i>m.</i><tt>start()</tt>. </p>
0356 *
0357 * @param group
0358 * The index of a capturing group in this matcher's pattern
0359 *
0360 * @return The index of the first character captured by the group,
0361 * or <tt>-1</tt> if the match was successful but the group
0362 * itself did not match anything
0363 *
0364 * @throws IllegalStateException
0365 * If no match has yet been attempted,
0366 * or if the previous match operation failed
0367 *
0368 * @throws IndexOutOfBoundsException
0369 * If there is no capturing group in the pattern
0370 * with the given index
0371 */
0372 public int start(int group) {
0373 if (first < 0)
0374 throw new IllegalStateException("No match available");
0375 if (group > groupCount())
0376 throw new IndexOutOfBoundsException("No group " + group);
0377 return groups[group * 2];
0378 }
0379
0380 /**
0381 * Returns the offset after the last character matched.
0382 *
0383 * @return The offset after the last character matched
0384 *
0385 * @throws IllegalStateException
0386 * If no match has yet been attempted,
0387 * or if the previous match operation failed
0388 */
0389 public int end() {
0390 if (first < 0)
0391 throw new IllegalStateException("No match available");
0392 return last;
0393 }
0394
0395 /**
0396 * Returns the offset after the last character of the subsequence
0397 * captured by the given group during the previous match operation.
0398 *
0399 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
0400 * to right, starting at one. Group zero denotes the entire pattern, so
0401 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
0402 * <i>m.</i><tt>end()</tt>. </p>
0403 *
0404 * @param group
0405 * The index of a capturing group in this matcher's pattern
0406 *
0407 * @return The offset after the last character captured by the group,
0408 * or <tt>-1</tt> if the match was successful
0409 * but the group itself did not match anything
0410 *
0411 * @throws IllegalStateException
0412 * If no match has yet been attempted,
0413 * or if the previous match operation failed
0414 *
0415 * @throws IndexOutOfBoundsException
0416 * If there is no capturing group in the pattern
0417 * with the given index
0418 */
0419 public int end(int group) {
0420 if (first < 0)
0421 throw new IllegalStateException("No match available");
0422 if (group > groupCount())
0423 throw new IndexOutOfBoundsException("No group " + group);
0424 return groups[group * 2 + 1];
0425 }
0426
0427 /**
0428 * Returns the input subsequence matched by the previous match.
0429 *
0430 * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
0431 * the expressions <i>m.</i><tt>group()</tt> and
0432 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt>
0433 * are equivalent. </p>
0434 *
0435 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
0436 * string. This method will return the empty string when the pattern
0437 * successfully matches the empty string in the input. </p>
0438 *
0439 * @return The (possibly empty) subsequence matched by the previous match,
0440 * in string form
0441 *
0442 * @throws IllegalStateException
0443 * If no match has yet been attempted,
0444 * or if the previous match operation failed
0445 */
0446 public String group() {
0447 return group(0);
0448 }
0449
0450 /**
0451 * Returns the input subsequence captured by the given group during the
0452 * previous match operation.
0453 *
0454 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
0455 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
0456 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
0457 * are equivalent. </p>
0458 *
0459 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
0460 * to right, starting at one. Group zero denotes the entire pattern, so
0461 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
0462 * </p>
0463 *
0464 * <p> If the match was successful but the group specified failed to match
0465 * any part of the input sequence, then <tt>null</tt> is returned. Note
0466 * that some groups, for example <tt>(a*)</tt>, match the empty string.
0467 * This method will return the empty string when such a group successfully
0468 * matches the empty string in the input. </p>
0469 *
0470 * @param group
0471 * The index of a capturing group in this matcher's pattern
0472 *
0473 * @return The (possibly empty) subsequence captured by the group
0474 * during the previous match, or <tt>null</tt> if the group
0475 * failed to match part of the input
0476 *
0477 * @throws IllegalStateException
0478 * If no match has yet been attempted,
0479 * or if the previous match operation failed
0480 *
0481 * @throws IndexOutOfBoundsException
0482 * If there is no capturing group in the pattern
0483 * with the given index
0484 */
0485 public String group(int group) {
0486 if (first < 0)
0487 throw new IllegalStateException("No match found");
0488 if (group < 0 || group > groupCount())
0489 throw new IndexOutOfBoundsException("No group " + group);
0490 if ((groups[group * 2] == -1) || (groups[group * 2 + 1] == -1))
0491 return null;
0492 return getSubSequence(groups[group * 2], groups[group * 2 + 1])
0493 .toString();
0494 }
0495
0496 /**
0497 * Returns the number of capturing groups in this matcher's pattern.
0498 *
0499 * <p> Group zero denotes the entire pattern by convention. It is not
0500 * included in this count.
0501 *
0502 * <p> Any non-negative integer smaller than or equal to the value
0503 * returned by this method is guaranteed to be a valid group index for
0504 * this matcher. </p>
0505 *
0506 * @return The number of capturing groups in this matcher's pattern
0507 */
0508 public int groupCount() {
0509 return parentPattern.capturingGroupCount - 1;
0510 }
0511
0512 /**
0513 * Attempts to match the entire region against the pattern.
0514 *
0515 * <p> If the match succeeds then more information can be obtained via the
0516 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
0517 *
0518 * @return <tt>true</tt> if, and only if, the entire region sequence
0519 * matches this matcher's pattern
0520 */
0521 public boolean matches() {
0522 return match(from, ENDANCHOR);
0523 }
0524
0525 /**
0526 * Attempts to find the next subsequence of the input sequence that matches
0527 * the pattern.
0528 *
0529 * <p> This method starts at the beginning of this matcher's region, or, if
0530 * a previous invocation of the method was successful and the matcher has
0531 * not since been reset, at the first character not matched by the previous
0532 * match.
0533 *
0534 * <p> If the match succeeds then more information can be obtained via the
0535 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
0536 *
0537 * @return <tt>true</tt> if, and only if, a subsequence of the input
0538 * sequence matches this matcher's pattern
0539 */
0540 public boolean find() {
0541 int nextSearchIndex = last;
0542 if (nextSearchIndex == first)
0543 nextSearchIndex++;
0544
0545 // If next search starts before region, start it at region
0546 if (nextSearchIndex < from)
0547 nextSearchIndex = from;
0548
0549 // If next search starts beyond region then it fails
0550 if (nextSearchIndex > to) {
0551 for (int i = 0; i < groups.length; i++)
0552 groups[i] = -1;
0553 return false;
0554 }
0555 return search(nextSearchIndex);
0556 }
0557
0558 /**
0559 * Resets this matcher and then attempts to find the next subsequence of
0560 * the input sequence that matches the pattern, starting at the specified
0561 * index.
0562 *
0563 * <p> If the match succeeds then more information can be obtained via the
0564 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
0565 * invocations of the {@link #find()} method will start at the first
0566 * character not matched by this match. </p>
0567 *
0568 * @throws IndexOutOfBoundsException
0569 * If start is less than zero or if start is greater than the
0570 * length of the input sequence.
0571 *
0572 * @return <tt>true</tt> if, and only if, a subsequence of the input
0573 * sequence starting at the given index matches this matcher's
0574 * pattern
0575 */
0576 public boolean find(int start) {
0577 int limit = getTextLength();
0578 if ((start < 0) || (start > limit))
0579 throw new IndexOutOfBoundsException("Illegal start index");
0580 reset();
0581 return search(start);
0582 }
0583
0584 /**
0585 * Attempts to match the input sequence, starting at the beginning of the
0586 * region, against the pattern.
0587 *
0588 * <p> Like the {@link #matches matches} method, this method always starts
0589 * at the beginning of the region; unlike that method, it does not
0590 * require that the entire region be matched.
0591 *
0592 * <p> If the match succeeds then more information can be obtained via the
0593 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
0594 *
0595 * @return <tt>true</tt> if, and only if, a prefix of the input
0596 * sequence matches this matcher's pattern
0597 */
0598 public boolean lookingAt() {
0599 return match(from, NOANCHOR);
0600 }
0601
0602 /**
0603 * Returns a literal replacement <code>String</code> for the specified
0604 * <code>String</code>.
0605 *
0606 * This method produces a <code>String</code> that will work
0607 * as a literal replacement <code>s</code> in the
0608 * <code>appendReplacement</code> method of the {@link Matcher} class.
0609 * The <code>String</code> produced will match the sequence of characters
0610 * in <code>s</code> treated as a literal sequence. Slashes ('\') and
0611 * dollar signs ('$') will be given no special meaning.
0612 *
0613 * @param s The string to be literalized
0614 * @return A literal string replacement
0615 * @since 1.5
0616 */
0617 public static String quoteReplacement(String s) {
0618 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
0619 return s;
0620 StringBuilder sb = new StringBuilder();
0621 for (int i = 0; i < s.length(); i++) {
0622 char c = s.charAt(i);
0623 if (c == '\\' || c == '$') {
0624 sb.append('\\');
0625 }
0626 sb.append(c);
0627 }
0628 return sb.toString();
0629 }
0630
0631 /**
0632 * Implements a non-terminal append-and-replace step.
0633 *
0634 * <p> This method performs the following actions: </p>
0635 *
0636 * <ol>
0637 *
0638 * <li><p> It reads characters from the input sequence, starting at the
0639 * append position, and appends them to the given string buffer. It
0640 * stops after reading the last character preceding the previous match,
0641 * that is, the character at index {@link
0642 * #start()} <tt>-</tt> <tt>1</tt>. </p></li>
0643 *
0644 * <li><p> It appends the given replacement string to the string buffer.
0645 * </p></li>
0646 *
0647 * <li><p> It sets the append position of this matcher to the index of
0648 * the last character matched, plus one, that is, to {@link #end()}.
0649 * </p></li>
0650 *
0651 * </ol>
0652 *
0653 * <p> The replacement string may contain references to subsequences
0654 * captured during the previous match: Each occurrence of
0655 * <tt>$</tt><i>g</i><tt></tt> will be replaced by the result of
0656 * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
0657 * The first number after the <tt>$</tt> is always treated as part of
0658 * the group reference. Subsequent numbers are incorporated into g if
0659 * they would form a legal group reference. Only the numerals '0'
0660 * through '9' are considered as potential components of the group
0661 * reference. If the second group matched the string <tt>"foo"</tt>, for
0662 * example, then passing the replacement string <tt>"$2bar"</tt> would
0663 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
0664 * sign (<tt>$</tt>) may be included as a literal in the replacement
0665 * string by preceding it with a backslash (<tt>\$</tt>).
0666 *
0667 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
0668 * the replacement string may cause the results to be different than if it
0669 * were being treated as a literal replacement string. Dollar signs may be
0670 * treated as references to captured subsequences as described above, and
0671 * backslashes are used to escape literal characters in the replacement
0672 * string.
0673 *
0674 * <p> This method is intended to be used in a loop together with the
0675 * {@link #appendTail appendTail} and {@link #find find} methods. The
0676 * following code, for example, writes <tt>one dog two dogs in the
0677 * yard</tt> to the standard-output stream: </p>
0678 *
0679 * <blockquote><pre>
0680 * Pattern p = Pattern.compile("cat");
0681 * Matcher m = p.matcher("one cat two cats in the yard");
0682 * StringBuffer sb = new StringBuffer();
0683 * while (m.find()) {
0684 * m.appendReplacement(sb, "dog");
0685 * }
0686 * m.appendTail(sb);
0687 * System.out.println(sb.toString());</pre></blockquote>
0688 *
0689 * @param sb
0690 * The target string buffer
0691 *
0692 * @param replacement
0693 * The replacement string
0694 *
0695 * @return This matcher
0696 *
0697 * @throws IllegalStateException
0698 * If no match has yet been attempted,
0699 * or if the previous match operation failed
0700 *
0701 * @throws IndexOutOfBoundsException
0702 * If the replacement string refers to a capturing group
0703 * that does not exist in the pattern
0704 */
0705 public Matcher appendReplacement(StringBuffer sb, String replacement) {
0706
0707 // If no match, return error
0708 if (first < 0)
0709 throw new IllegalStateException("No match available");
0710
0711 // Process substitution string to replace group references with groups
0712 int cursor = 0;
0713 StringBuilder result = new StringBuilder();
0714
0715 while (cursor < replacement.length()) {
0716 char nextChar = replacement.charAt(cursor);
0717 if (nextChar == '\\') {
0718 cursor++;
0719 nextChar = replacement.charAt(cursor);
0720 result.append(nextChar);
0721 cursor++;
0722 } else if (nextChar == '$') {
0723 // Skip past $
0724 cursor++;
0725 // The first number is always a group
0726 int refNum = (int) replacement.charAt(cursor) - '0';
0727 if ((refNum < 0) || (refNum > 9))
0728 throw new IllegalArgumentException(
0729 "Illegal group reference");
0730 cursor++;
0731
0732 // Capture the largest legal group string
0733 boolean done = false;
0734 while (!done) {
0735 if (cursor >= replacement.length()) {
0736 break;
0737 }
0738 int nextDigit = replacement.charAt(cursor) - '0';
0739 if ((nextDigit < 0) || (nextDigit > 9)) { // not a number
0740 break;
0741 }
0742 int newRefNum = (refNum * 10) + nextDigit;
0743 if (groupCount() < newRefNum) {
0744 done = true;
0745 } else {
0746 refNum = newRefNum;
0747 cursor++;
0748 }
0749 }
0750 // Append group
0751 if (start(refNum) != -1 && end(refNum) != -1)
0752 result.append(text, start(refNum), end(refNum));
0753 } else {
0754 result.append(nextChar);
0755 cursor++;
0756 }
0757 }
0758 // Append the intervening text
0759 sb.append(text, lastAppendPosition, first);
0760 // Append the match substitution
0761 sb.append(result);
0762
0763 lastAppendPosition = last;
0764 return this ;
0765 }
0766
0767 /**
0768 * Implements a terminal append-and-replace step.
0769 *
0770 * <p> This method reads characters from the input sequence, starting at
0771 * the append position, and appends them to the given string buffer. It is
0772 * intended to be invoked after one or more invocations of the {@link
0773 * #appendReplacement appendReplacement} method in order to copy the
0774 * remainder of the input sequence. </p>
0775 *
0776 * @param sb
0777 * The target string buffer
0778 *
0779 * @return The target string buffer
0780 */
0781 public StringBuffer appendTail(StringBuffer sb) {
0782 sb.append(text, lastAppendPosition, getTextLength());
0783 return sb;
0784 }
0785
0786 /**
0787 * Replaces every subsequence of the input sequence that matches the
0788 * pattern with the given replacement string.
0789 *
0790 * <p> This method first resets this matcher. It then scans the input
0791 * sequence looking for matches of the pattern. Characters that are not
0792 * part of any match are appended directly to the result string; each match
0793 * is replaced in the result by the replacement string. The replacement
0794 * string may contain references to captured subsequences as in the {@link
0795 * #appendReplacement appendReplacement} method.
0796 *
0797 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
0798 * the replacement string may cause the results to be different than if it
0799 * were being treated as a literal replacement string. Dollar signs may be
0800 * treated as references to captured subsequences as described above, and
0801 * backslashes are used to escape literal characters in the replacement
0802 * string.
0803 *
0804 * <p> Given the regular expression <tt>a*b</tt>, the input
0805 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
0806 * <tt>"-"</tt>, an invocation of this method on a matcher for that
0807 * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
0808 *
0809 * <p> Invoking this method changes this matcher's state. If the matcher
0810 * is to be used in further matching operations then it should first be
0811 * reset. </p>
0812 *
0813 * @param replacement
0814 * The replacement string
0815 *
0816 * @return The string constructed by replacing each matching subsequence
0817 * by the replacement string, substituting captured subsequences
0818 * as needed
0819 */
0820 public String replaceAll(String replacement) {
0821 reset();
0822 boolean result = find();
0823 if (result) {
0824 StringBuffer sb = new StringBuffer();
0825 do {
0826 appendReplacement(sb, replacement);
0827 result = find();
0828 } while (result);
0829 appendTail(sb);
0830 return sb.toString();
0831 }
0832 return text.toString();
0833 }
0834
0835 /**
0836 * Replaces the first subsequence of the input sequence that matches the
0837 * pattern with the given replacement string.
0838 *
0839 * <p> This method first resets this matcher. It then scans the input
0840 * sequence looking for a match of the pattern. Characters that are not
0841 * part of the match are appended directly to the result string; the match
0842 * is replaced in the result by the replacement string. The replacement
0843 * string may contain references to captured subsequences as in the {@link
0844 * #appendReplacement appendReplacement} method.
0845 *
0846 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
0847 * the replacement string may cause the results to be different than if it
0848 * were being treated as a literal replacement string. Dollar signs may be
0849 * treated as references to captured subsequences as described above, and
0850 * backslashes are used to escape literal characters in the replacement
0851 * string.
0852 *
0853 * <p> Given the regular expression <tt>dog</tt>, the input
0854 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
0855 * <tt>"cat"</tt>, an invocation of this method on a matcher for that
0856 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
0857 *
0858 * <p> Invoking this method changes this matcher's state. If the matcher
0859 * is to be used in further matching operations then it should first be
0860 * reset. </p>
0861 *
0862 * @param replacement
0863 * The replacement string
0864 * @return The string constructed by replacing the first matching
0865 * subsequence by the replacement string, substituting captured
0866 * subsequences as needed
0867 */
0868 public String replaceFirst(String replacement) {
0869 if (replacement == null)
0870 throw new NullPointerException("replacement");
0871 reset();
0872 if (!find())
0873 return text.toString();
0874 StringBuffer sb = new StringBuffer();
0875 appendReplacement(sb, replacement);
0876 appendTail(sb);
0877 return sb.toString();
0878 }
0879
0880 /**
0881 * Sets the limits of this matcher's region. The region is the part of the
0882 * input sequence that will be searched to find a match. Invoking this
0883 * method resets the matcher, and then sets the region to start at the
0884 * index specified by the <code>start</code> parameter and end at the
0885 * index specified by the <code>end</code> parameter.
0886 *
0887 * <p>Depending on the transparency and anchoring being used (see
0888 * {@link #useTransparentBounds useTransparentBounds} and
0889 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
0890 * as anchors may behave differently at or around the boundaries of the
0891 * region.
0892 *
0893 * @param start
0894 * The index to start searching at (inclusive)
0895 * @param end
0896 * The index to end searching at (exclusive)
0897 * @throws IndexOutOfBoundsException
0898 * If start or end is less than zero, if
0899 * start is greater than the length of the input sequence, if
0900 * end is greater than the length of the input sequence, or if
0901 * start is greater than end.
0902 * @return this matcher
0903 * @since 1.5
0904 */
0905 public Matcher region(int start, int end) {
0906 if ((start < 0) || (start > getTextLength()))
0907 throw new IndexOutOfBoundsException("start");
0908 if ((end < 0) || (end > getTextLength()))
0909 throw new IndexOutOfBoundsException("end");
0910 if (start > end)
0911 throw new IndexOutOfBoundsException("start > end");
0912 reset();
0913 from = start;
0914 to = end;
0915 return this ;
0916 }
0917
0918 /**
0919 * Reports the start index of this matcher's region. The
0920 * searches this matcher conducts are limited to finding matches
0921 * within {@link #regionStart regionStart} (inclusive) and
0922 * {@link #regionEnd regionEnd} (exclusive).
0923 *
0924 * @return The starting point of this matcher's region
0925 * @since 1.5
0926 */
0927 public int regionStart() {
0928 return from;
0929 }
0930
0931 /**
0932 * Reports the end index (exclusive) of this matcher's region.
0933 * The searches this matcher conducts are limited to finding matches
0934 * within {@link #regionStart regionStart} (inclusive) and
0935 * {@link #regionEnd regionEnd} (exclusive).
0936 *
0937 * @return the ending point of this matcher's region
0938 * @since 1.5
0939 */
0940 public int regionEnd() {
0941 return to;
0942 }
0943
0944 /**
0945 * Queries the transparency of region bounds for this matcher.
0946 *
0947 * <p> This method returns <tt>true</tt> if this matcher uses
0948 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
0949 * bounds.
0950 *
0951 * <p> See {@link #useTransparentBounds useTransparentBounds} for a
0952 * description of transparent and opaque bounds.
0953 *
0954 * <p> By default, a matcher uses opaque region boundaries.
0955 *
0956 * @return <tt>true</tt> iff this matcher is using transparent bounds,
0957 * <tt>false</tt> otherwise.
0958 * @see java.util.regex.Matcher#useTransparentBounds(boolean)
0959 * @since 1.5
0960 */
0961 public boolean hasTransparentBounds() {
0962 return transparentBounds;
0963 }
0964
0965 /**
0966 * Sets the transparency of region bounds for this matcher.
0967 *
0968 * <p> Invoking this method with an argument of <tt>true</tt> will set this
0969 * matcher to use <i>transparent</i> bounds. If the boolean
0970 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
0971 *
0972 * <p> Using transparent bounds, the boundaries of this
0973 * matcher's region are transparent to lookahead, lookbehind,
0974 * and boundary matching constructs. Those constructs can see beyond the
0975 * boundaries of the region to see if a match is appropriate.
0976 *
0977 * <p> Using opaque bounds, the boundaries of this matcher's
0978 * region are opaque to lookahead, lookbehind, and boundary matching
0979 * constructs that may try to see beyond them. Those constructs cannot
0980 * look past the boundaries so they will fail to match anything outside
0981 * of the region.
0982 *
0983 * <p> By default, a matcher uses opaque bounds.
0984 *
0985 * @param b a boolean indicating whether to use opaque or transparent
0986 * regions
0987 * @return this matcher
0988 * @see java.util.regex.Matcher#hasTransparentBounds
0989 * @since 1.5
0990 */
0991 public Matcher useTransparentBounds(boolean b) {
0992 transparentBounds = b;
0993 return this ;
0994 }
0995
0996 /**
0997 * Queries the anchoring of region bounds for this matcher.
0998 *
0999 * <p> This method returns <tt>true</tt> if this matcher uses
1000 * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
1001 *
1002 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
1003 * description of anchoring bounds.
1004 *
1005 * <p> By default, a matcher uses anchoring region boundaries.
1006 *
1007 * @return <tt>true</tt> iff this matcher is using anchoring bounds,
1008 * <tt>false</tt> otherwise.
1009 * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
1010 * @since 1.5
1011 */
1012 public boolean hasAnchoringBounds() {
1013 return anchoringBounds;
1014 }
1015
1016 /**
1017 * Sets the anchoring of region bounds for this matcher.
1018 *
1019 * <p> Invoking this method with an argument of <tt>true</tt> will set this
1020 * matcher to use <i>anchoring</i> bounds. If the boolean
1021 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
1022 * used.
1023 *
1024 * <p> Using anchoring bounds, the boundaries of this
1025 * matcher's region match anchors such as ^ and $.
1026 *
1027 * <p> Without anchoring bounds, the boundaries of this
1028 * matcher's region will not match anchors such as ^ and $.
1029 *
1030 * <p> By default, a matcher uses anchoring region boundaries.
1031 *
1032 * @param b a boolean indicating whether or not to use anchoring bounds.
1033 * @return this matcher
1034 * @see java.util.regex.Matcher#hasAnchoringBounds
1035 * @since 1.5
1036 */
1037 public Matcher useAnchoringBounds(boolean b) {
1038 anchoringBounds = b;
1039 return this ;
1040 }
1041
1042 /**
1043 * <p>Returns the string representation of this matcher. The
1044 * string representation of a <code>Matcher</code> contains information
1045 * that may be useful for debugging. The exact format is unspecified.
1046 *
1047 * @return The string representation of this matcher
1048 * @since 1.5
1049 */
1050 public String toString() {
1051 StringBuilder sb = new StringBuilder();
1052 sb.append("java.util.regex.Matcher");
1053 sb.append("[pattern=" + pattern());
1054 sb.append(" region=");
1055 sb.append(regionStart() + "," + regionEnd());
1056 sb.append(" lastmatch=");
1057 if ((first >= 0) && (group() != null)) {
1058 sb.append(group());
1059 }
1060 sb.append("]");
1061 return sb.toString();
1062 }
1063
1064 /**
1065 * <p>Returns true if the end of input was hit by the search engine in
1066 * the last match operation performed by this matcher.
1067 *
1068 * <p>When this method returns true, then it is possible that more input
1069 * would have changed the result of the last search.
1070 *
1071 * @return true iff the end of input was hit in the last match; false
1072 * otherwise
1073 * @since 1.5
1074 */
1075 public boolean hitEnd() {
1076 return hitEnd;
1077 }
1078
1079 /**
1080 * <p>Returns true if more input could change a positive match into a
1081 * negative one.
1082 *
1083 * <p>If this method returns true, and a match was found, then more
1084 * input could cause the match to be lost. If this method returns false
1085 * and a match was found, then more input might change the match but the
1086 * match won't be lost. If a match was not found, then requireEnd has no
1087 * meaning.
1088 *
1089 * @return true iff more input could change a positive match into a
1090 * negative one.
1091 * @since 1.5
1092 */
1093 public boolean requireEnd() {
1094 return requireEnd;
1095 }
1096
1097 /**
1098 * Initiates a search to find a Pattern within the given bounds.
1099 * The groups are filled with default values and the match of the root
1100 * of the state machine is called. The state machine will hold the state
1101 * of the match as it proceeds in this matcher.
1102 *
1103 * Matcher.from is not set here, because it is the "hard" boundary
1104 * of the start of the search which anchors will set to. The from param
1105 * is the "soft" boundary of the start of the search, meaning that the
1106 * regex tries to match at that index but ^ won't match there. Subsequent
1107 * calls to the search methods start at a new "soft" boundary which is
1108 * the end of the previous match.
1109 */
1110 boolean search(int from) {
1111 this .hitEnd = false;
1112 this .requireEnd = false;
1113 from = from < 0 ? 0 : from;
1114 this .first = from;
1115 this .oldLast = oldLast < 0 ? from : oldLast;
1116 for (int i = 0; i < groups.length; i++)
1117 groups[i] = -1;
1118 acceptMode = NOANCHOR;
1119 boolean result = parentPattern.root.match(this , from, text);
1120 if (!result)
1121 this .first = -1;
1122 this .oldLast = this .last;
1123 return result;
1124 }
1125
1126 /**
1127 * Initiates a search for an anchored match to a Pattern within the given
1128 * bounds. The groups are filled with default values and the match of the
1129 * root of the state machine is called. The state machine will hold the
1130 * state of the match as it proceeds in this matcher.
1131 */
1132 boolean match(int from, int anchor) {
1133 this .hitEnd = false;
1134 this .requireEnd = false;
1135 from = from < 0 ? 0 : from;
1136 this .first = from;
1137 this .oldLast = oldLast < 0 ? from : oldLast;
1138 for (int i = 0; i < groups.length; i++)
1139 groups[i] = -1;
1140 acceptMode = anchor;
1141 boolean result = parentPattern.matchRoot
1142 .match(this , from, text);
1143 if (!result)
1144 this .first = -1;
1145 this .oldLast = this .last;
1146 return result;
1147 }
1148
1149 /**
1150 * Returns the end index of the text.
1151 *
1152 * @return the index after the last character in the text
1153 */
1154 int getTextLength() {
1155 return text.length();
1156 }
1157
1158 /**
1159 * Generates a String from this Matcher's input in the specified range.
1160 *
1161 * @param beginIndex the beginning index, inclusive
1162 * @param endIndex the ending index, exclusive
1163 * @return A String generated from this Matcher's input
1164 */
1165 CharSequence getSubSequence(int beginIndex, int endIndex) {
1166 return text.subSequence(beginIndex, endIndex);
1167 }
1168
1169 /**
1170 * Returns this Matcher's input character at index i.
1171 *
1172 * @return A char from the specified index
1173 */
1174 char charAt(int i) {
1175 return text.charAt(i);
1176 }
1177
1178 }
|