001: /*
002: * Regsub.java
003: *
004: * See the file "license.terms" for information on usage and
005: * redistribution of this file, and for a DISCLAIMER OF ALL
006: * WARRANTIES.
007: *
008: * SCCS: %Z% %M% %I% %E% %U%
009: */
010:
011: package sunlabs.brazil.util.regexp;
012:
013: /**
014: * The <code>Regsub</code> class provides an iterator-like object to
015: * extract the matched and unmatched portions of a string with respect to
016: * a given regular expression.
017: * <p>
018: * After each match is found, the portions of the string already
019: * checked are not searched again -- searching for the next match will
020: * begin at the character just after where the last match ended.
021: * <p>
022: * Here is an example of using Regsub to replace all "%XX" sequences in
023: * a string with the ASCII character represented by the hex digits "XX":
024: * <pre>
025: * public static void
026: * main(String[] args)
027: * throws Exception
028: * {
029: * Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
030: * Regsub rs = new Regsub(re, args[0]);
031: *
032: * StringBuffer sb = new StringBuffer();
033: *
034: * while (rs.nextMatch()) {
035: * sb.append(rs.skipped());
036: *
037: * String match = rs.matched();
038: *
039: * int hi = Character.digit(match.charAt(1), 16);
040: * int lo = Character.digit(match.charAt(2), 16);
041: * sb.append((char) ((hi << 4) | lo));
042: * }
043: * sb.append(rs.rest());
044: *
045: * System.out.println(sb);
046: * }
047: * </pre>
048: *
049: * @author Colin Stevens (colin.stevens@sun.com)
050: * @version 1.4, 99/10/14
051: * @see Regexp
052: */
053: public class Regsub {
054: Regexp r;
055: String str;
056: int ustart;
057: int mstart;
058: int end;
059: Regexp.Match m;
060:
061: /**
062: * Construct a new <code>Regsub</code> that can be used to step
063: * through the given string, finding each substring that matches
064: * the given regular expression.
065: * <p>
066: * <code>Regexp</code> contains two substitution methods,
067: * <code>sub</code> and <code>subAll</code>, that can be used instead
068: * of <code>Regsub</code> if just simple substitutions are being done.
069: *
070: * @param r
071: * The compiled regular expression.
072: *
073: * @param str
074: * The string to search.
075: *
076: * @see Regexp#sub
077: * @see Regexp#subAll
078: */
079: public Regsub(Regexp r, String str) {
080: this .r = r;
081: this .str = str;
082: this .ustart = 0;
083: this .mstart = -1;
084: this .end = 0;
085: }
086:
087: /**
088: * Searches for the next substring that matches the regular expression.
089: * After calling this method, the caller would call methods like
090: * <code>skipped</code>, <code>matched</code>, etc. to query attributes
091: * of the matched region.
092: * <p>
093: * Calling this function again will search for the next match, beginning
094: * at the character just after where the last match ended.
095: *
096: * @return <code>true</code> if a match was found, <code>false</code>
097: * if there are no more matches.
098: */
099: public boolean nextMatch() {
100: ustart = end;
101:
102: /*
103: * Consume one character if the last match didn't consume any
104: * characters, to avoid an infinite loop.
105: */
106:
107: int off = ustart;
108: if (off == mstart) {
109: off++;
110: if (off >= str.length()) {
111: return false;
112: }
113: }
114:
115: m = r.exec(str, 0, off);
116: if (m == null) {
117: return false;
118: }
119:
120: mstart = m.indices[0];
121: end = m.indices[1];
122:
123: return true;
124: }
125:
126: /**
127: * Returns a substring consisting of all the characters skipped
128: * between the end of the last match (or the start of the original
129: * search string) and the start of this match.
130: * <p>
131: * This method can be used extract all the portions of string that
132: * <b>didn't</b> match the regular expression.
133: *
134: * @return The characters that didn't match.
135: */
136: public String skipped() {
137: return str.substring(ustart, mstart);
138: }
139:
140: /**
141: * Returns a substring consisting of the characters that matched
142: * the entire regular expression during the last call to
143: * <code>nextMatch</code>.
144: *
145: * @return The characters that did match.
146: *
147: * @see #submatch
148: */
149: public String matched() {
150: return str.substring(mstart, end);
151: }
152:
153: /**
154: * Returns a substring consisting of the characters that matched
155: * the given parenthesized subexpression during the last call to
156: * <code>nextMatch</code>.
157: *
158: * @param i
159: * The index of the parenthesized subexpression.
160: *
161: * @return The characters that matched the subexpression, or
162: * <code>null</code> if the given subexpression did not
163: * exist or did not match.
164: */
165: public String submatch(int i) {
166: if (i * 2 + 1 >= m.indices.length) {
167: return null;
168: }
169: int start = m.indices[i * 2];
170: int end = m.indices[i * 2 + 1];
171: if ((start < 0) || (end < 0)) {
172: return null;
173: }
174: return str.substring(start, end);
175: }
176:
177: /**
178: * Returns a substring consisting of all the characters that come
179: * after the last match. As the matches progress, the <code>rest</code>
180: * gets shorter. When <code>nextMatch</code> returns <code>false</code>,
181: * then this method will return the rest of the string that can't be
182: * matched.
183: *
184: * @return The rest of the characters after the last match.
185: */
186: public String rest() {
187: return str.substring(end);
188: }
189: }
|