001: /*
002: * $Id: Util.java,v 1.15 2003/11/07 20:16:25 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2000-2002 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.oro.text.regex;
059:
060: import java.util.*;
061:
062: /**
063: * The Util class is a holder for useful static utility methods that can
064: * be generically applied to Pattern and PatternMatcher instances.
065: * This class cannot and is not meant to be instantiated.
066: * The Util class currently contains versions of the split() and substitute()
067: * methods inspired by Perl's split function and <b>s</b> operation
068: * respectively, although they are implemented in such a way as not to
069: * rely on the Perl5 implementations of the OROMatcher packages regular
070: * expression interfaces. They may operate on any interface implementations
071: * conforming to the OROMatcher API specification for the PatternMatcher,
072: * Pattern, and MatchResult interfaces. Future versions of the class may
073: * include additional utility methods.
074: * <p>
075: * A grep method is not included for two reasons:
076: * <ol>
077: * <li> The details of reading a line at a time from an input stream
078: * differ in JDK 1.0.2 and JDK 1.1, making it difficult to
079: * retain compatibility across both Java releases.
080: * <li> Grep style processing is trivial for the programmer to implement
081: * in a while loop. Rarely does anyone want to retrieve all
082: * occurences of a pattern and then process them. More often a
083: * programmer will retrieve pattern matches and process them as they
084: * are retrieved, which is more efficient than storing them all in a
085: * Vector and then accessing them.
086: * </ol>
087: *
088: * @version @version@
089: * @since 1.0
090: * @see Pattern
091: * @see PatternMatcher
092: */
093: public final class Util {
094: /**
095: * A constant passed to the {@link #substitute substitute()}
096: * methods indicating that all occurrences of a pattern should be
097: * substituted.
098: */
099: public static final int SUBSTITUTE_ALL = -1;
100:
101: /**
102: * A constant passed to the {@link #split split()} methods
103: * indicating that all occurrences of a pattern should be used to
104: * split a string.
105: */
106: public static final int SPLIT_ALL = 0;
107:
108: /**
109: * The default destructor for the Util class. It is made private
110: * to prevent the instantiation of the class.
111: */
112: private Util() {
113: }
114:
115: /**
116: * Splits up a <code>String</code> instance and stores results as a
117: * <code>List</code> of substrings numbering no more than a specified
118: * limit. The string is split with a regular expression as the delimiter.
119: * The <b>limit</b> parameter essentially says to split the
120: * string only on at most the first <b>limit - 1</b> number of pattern
121: * occurences.
122: * <p>
123: * This method is inspired by the Perl split() function and behaves
124: * identically to it when used in conjunction with the Perl5Matcher and
125: * Perl5Pattern classes except for the following difference:
126: * <ul><p>
127: * In Perl, if the split expression contains parentheses, the split()
128: * method creates additional list elements from each of the matching
129: * subgroups in the pattern. In other words:
130: * <ul><p>
131: * <code>split(list, "/([,-])/", "8-12,15,18", Util.SPLIT_ALL)</code></ul>
132: * <p> produces the list containing:
133: * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
134: * <p> The OROMatcher split method does not follow this behavior. The
135: * following list would be produced by OROMatcher:
136: * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
137: * <p> To obtain the Perl behavior, use
138: * {@link org.apache.oro.text.perl.Perl5Util#split}.
139: * </ul>
140: * <p>
141: * @param results A Collection to which the split results are appended.
142: * After the method returns, it contains the substrings of the input
143: * that occur between the regular expression delimiter occurences.
144: * The input will not be split into any more substrings than the
145: * specified <code>limit</code>. A way of thinking of this is that
146: * only the first <code>limit - 1</code> matches of the delimiting
147: * regular expression will be used to split the input.
148: * @param matcher The regular expression matcher to execute the split.
149: * @param pattern The regular expression to use as a split delimiter.
150: * @param input The <code>String</code> to split.
151: * @param limit The limit on the number of resulting split elements.
152: * Values <= 0 produce the same behavior as using the
153: * <b>SPLIT_ALL</b> constant which causes the limit to be
154: * ignored and splits to be performed on all occurrences of
155: * the pattern. You should use the <b>SPLIT_ALL</b> constant
156: * to achieve this behavior instead of relying on the default
157: * behavior associated with non-positive limit values.
158: * @since 2.0
159: */
160: public static void split(Collection results,
161: PatternMatcher matcher, Pattern pattern, String input,
162: int limit) {
163: int beginOffset;
164: MatchResult currentResult;
165: PatternMatcherInput pinput;
166:
167: pinput = new PatternMatcherInput(input);
168: beginOffset = 0;
169:
170: while (--limit != 0 && matcher.contains(pinput, pattern)) {
171: currentResult = matcher.getMatch();
172: results.add(input.substring(beginOffset, currentResult
173: .beginOffset(0)));
174: beginOffset = currentResult.endOffset(0);
175: }
176:
177: results.add(input.substring(beginOffset, input.length()));
178: }
179:
180: /**
181: * Splits up a <code>String</code> instance and stores results as a
182: * <code>Collection</code> of all its substrings using a regular expression
183: * as the delimiter.
184: * This method is inspired by the Perl split() function and behaves
185: * identically to it when used in conjunction with the Perl5Matcher and
186: * Perl5Pattern classes except for the following difference:
187: * <p>
188: * <ul>
189: * In Perl, if the split expression contains parentheses, the split()
190: * method creates additional list elements from each of the matching
191: * subgroups in the pattern. In other words:
192: * <ul><p><code>split(list, "/([,-])/", "8-12,15,18")</code></ul>
193: * <p> produces the list containing:
194: * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
195: * <p> The OROMatcher split method does not follow this behavior. The
196: * following list would be produced by OROMatcher:
197: * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
198: * <p> To obtain the Perl behavior, use
199: * {@link org.apache.oro.text.perl.Perl5Util#split}.
200: * </ul>
201: * <p>
202: * This method is identical to calling:
203: * <blockquote><pre>
204: * split(matcher, pattern, input, Util.SPLIT_ALL);
205: * </pre></blockquote>
206: * <p>
207: * @param results A <code>Collection</code> to which all the substrings of
208: * the input that occur between the regular expression delimiter
209: * occurences are appended.
210: * @param matcher The regular expression matcher to execute the split.
211: * @param pattern The regular expression to use as a split delimiter.
212: * @param input The <code>String</code> to split.
213: * @since 2.0
214: */
215: public static void split(Collection results,
216: PatternMatcher matcher, Pattern pattern, String input) {
217: split(results, matcher, pattern, input, SPLIT_ALL);
218: }
219:
220: /**
221: * Splits up a <code>String</code> instance into strings contained in a
222: * <code>Vector</code> of size not greater than a specified limit. The
223: * string is split with a regular expression as the delimiter.
224: * The <b>limit</b> parameter essentially says to split the
225: * string only on at most the first <b>limit - 1</b> number of pattern
226: * occurences.
227: * <p>
228: * This method is inspired by the Perl split() function and behaves
229: * identically to it when used in conjunction with the Perl5Matcher and
230: * Perl5Pattern classes except for the following difference:
231: * <ul><p>
232: * In Perl, if the split expression contains parentheses, the split()
233: * method creates additional list elements from each of the matching
234: * subgroups in the pattern. In other words:
235: * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul>
236: * <p> produces the Vector containing:
237: * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
238: * <p> The OROMatcher split method does not follow this behavior. The
239: * following Vector would be produced by OROMatcher:
240: * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
241: * <p> To obtain the Perl behavior, use
242: * {@link org.apache.oro.text.perl.Perl5Util#split}.
243: * </ul>
244: * <p>
245: * @deprecated Use
246: * {@link #split(Collection, PatternMatcher, Pattern, String, int)} instead.
247: * @param matcher The regular expression matcher to execute the split.
248: * @param pattern The regular expression to use as a split delimiter.
249: * @param input The <code>String</code> to split.
250: * @param limit The limit on the size of the returned <code>Vector</code>.
251: * Values <= 0 produce the same behavior as using the
252: * <b>SPLIT_ALL</b> constant which causes the limit to be
253: * ignored and splits to be performed on all occurrences of
254: * the pattern. You should use the <b>SPLIT_ALL</b> constant
255: * to achieve this behavior instead of relying on the default
256: * behavior associated with non-positive limit values.
257: * @return A <code>Vector</code> containing the substrings of the input
258: * that occur between the regular expression delimiter occurences.
259: * The input will not be split into any more substrings than the
260: * specified <code>limit</code>. A way of thinking of this is that
261: * only the first <code>limit - 1</code> matches of the delimiting
262: * regular expression will be used to split the input.
263: * @since 1.0
264: */
265: public static Vector split(PatternMatcher matcher, Pattern pattern,
266: String input, int limit) {
267: Vector results = new Vector(20);
268:
269: split(results, matcher, pattern, input, limit);
270:
271: return results;
272: }
273:
274: /**
275: * Splits up a <code>String</code> instance into a <code>Vector</code>
276: * of all its substrings using a regular expression as the delimiter.
277: * This method is inspired by the Perl split() function and behaves
278: * identically to it when used in conjunction with the Perl5Matcher and
279: * Perl5Pattern classes except for the following difference:
280: * <p>
281: * <ul>
282: * In Perl, if the split expression contains parentheses, the split()
283: * method creates additional list elements from each of the matching
284: * subgroups in the pattern. In other words:
285: * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul>
286: * <p> produces the Vector containing:
287: * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
288: * <p> The OROMatcher split method does not follow this behavior. The
289: * following Vector would be produced by OROMatcher:
290: * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
291: * <p> To obtain the Perl behavior, use
292: * {@link org.apache.oro.text.perl.Perl5Util#split}.
293: * </ul>
294: * <p>
295: * This method is identical to calling:
296: * <blockquote><pre>
297: * split(matcher, pattern, input, Util.SPLIT_ALL);
298: * </pre></blockquote>
299: * <p>
300: * @deprecated Use
301: * {@link #split(Collection, PatternMatcher, Pattern, String)} instead.
302: * @param matcher The regular expression matcher to execute the split.
303: * @param pattern The regular expression to use as a split delimiter.
304: * @param input The <code>String</code> to split.
305: * @return A <code>Vector</code> containing all the substrings of the input
306: * that occur between the regular expression delimiter occurences.
307: * @since 1.0
308: */
309: public static Vector split(PatternMatcher matcher, Pattern pattern,
310: String input) {
311: return split(matcher, pattern, input, SPLIT_ALL);
312: }
313:
314: /**
315: * Searches a string for a pattern and replaces the first occurrences
316: * of the pattern with a Substitution up to the number of
317: * substitutions specified by the <b>numSubs</b> parameter. A
318: * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
319: * of the pattern to be replaced.
320: * <p>
321: * @param matcher The regular expression matcher to execute the pattern
322: * search.
323: * @param pattern The regular expression to search for and substitute
324: * occurrences of.
325: * @param sub The Substitution used to substitute pattern occurences.
326: * @param input The <code>String</code> on which to perform substitutions.
327: * @param numSubs The number of substitutions to perform. Only the
328: * first <b> numSubs </b> patterns encountered are
329: * substituted. If you want to substitute all occurences
330: * set this parameter to <b> SUBSTITUTE_ALL </b>.
331: * @return A String comprising the input string with the substitutions,
332: * if any, made. If no substitutions are made, the returned String
333: * is the original input String.
334: * @since 1.0
335: */
336: public static String substitute(PatternMatcher matcher,
337: Pattern pattern, Substitution sub, String input, int numSubs) {
338: StringBuffer buffer = new StringBuffer(input.length());
339: PatternMatcherInput pinput = new PatternMatcherInput(input);
340:
341: // Users have indicated that they expect the result to be the
342: // original input string, rather than a copy, if no substitutions
343: // are performed,
344: if (substitute(buffer, matcher, pattern, sub, pinput, numSubs) != 0)
345: return buffer.toString();
346: return input;
347: }
348:
349: /**
350: * Searches a string for a pattern and substitutes only the first
351: * occurence of the pattern.
352: * <p>
353: * This method is identical to calling:
354: * <blockquote><pre>
355: * substitute(matcher, pattern, sub, input, 1);
356: * </pre></blockquote>
357: * <p>
358: * @param matcher The regular expression matcher to execute the pattern
359: * search.
360: * @param pattern The regular expression to search for and substitute
361: * occurrences of.
362: * @param sub The Substitution used to substitute pattern occurences.
363: * @param input The <code>String</code> on which to perform substitutions.
364: * @return A String comprising the input string with the substitutions,
365: * if any, made. If no substitutions are made, the returned String
366: * is the original input String.
367: * @since 1.0
368: */
369: public static String substitute(PatternMatcher matcher,
370: Pattern pattern, Substitution sub, String input) {
371: return substitute(matcher, pattern, sub, input, 1);
372: }
373:
374: /**
375: * Searches a string for a pattern and replaces the first occurrences
376: * of the pattern with a Substitution up to the number of
377: * substitutions specified by the <b>numSubs</b> parameter. A
378: * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
379: * of the pattern to be replaced. The number of substitutions made
380: * is returned.
381: * <p>
382: * @param result The StringBuffer in which to store the result of the
383: * substitutions. The buffer is only appended to.
384: * @param matcher The regular expression matcher to execute the pattern
385: * search.
386: * @param pattern The regular expression to search for and substitute
387: * occurrences of.
388: * @param sub The Substitution used to substitute pattern occurences.
389: * @param input The input on which to perform substitutions.
390: * @param numSubs The number of substitutions to perform. Only the
391: * first <b> numSubs </b> patterns encountered are
392: * substituted. If you want to substitute all occurences
393: * set this parameter to <b> SUBSTITUTE_ALL </b>.
394: * @return The number of substitutions made.
395: * @since 2.0.6
396: */
397: public static int substitute(StringBuffer result,
398: PatternMatcher matcher, Pattern pattern, Substitution sub,
399: String input, int numSubs) {
400: PatternMatcherInput pinput = new PatternMatcherInput(input);
401: return substitute(result, matcher, pattern, sub, pinput,
402: numSubs);
403: }
404:
405: /**
406: * Searches a string for a pattern and replaces the first occurrences
407: * of the pattern with a Substitution up to the number of
408: * substitutions specified by the <b>numSubs</b> parameter. A
409: * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
410: * of the pattern to be replaced. The number of substitutions made
411: * is returned.
412: * <p>
413: * @param result The StringBuffer in which to store the result of the
414: * substitutions. The buffer is only appended to.
415: * @param matcher The regular expression matcher to execute the pattern
416: * search.
417: * @param pattern The regular expression to search for and substitute
418: * occurrences of.
419: * @param sub The Substitution used to substitute pattern occurences.
420: * @param input The input on which to perform substitutions.
421: * @param numSubs The number of substitutions to perform. Only the
422: * first <b> numSubs </b> patterns encountered are
423: * substituted. If you want to substitute all occurences
424: * set this parameter to <b> SUBSTITUTE_ALL </b>.
425: * @return The number of substitutions made.
426: * @since 2.0.3
427: */
428: public static int substitute(StringBuffer result,
429: PatternMatcher matcher, Pattern pattern, Substitution sub,
430: PatternMatcherInput input, int numSubs) {
431: int beginOffset, subCount;
432: char[] inputBuffer;
433:
434: subCount = 0;
435: beginOffset = input.getBeginOffset();
436: inputBuffer = input.getBuffer();
437:
438: // Must be != 0 because SUBSTITUTE_ALL is represented by -1.
439: // Do NOT change to numSubs > 0.
440: while (numSubs != 0 && matcher.contains(input, pattern)) {
441: --numSubs;
442: ++subCount;
443: result.append(inputBuffer, beginOffset, input
444: .getMatchBeginOffset()
445: - beginOffset);
446: sub.appendSubstitution(result, matcher.getMatch(),
447: subCount, input, matcher, pattern);
448: beginOffset = input.getMatchEndOffset();
449: }
450:
451: result.append(inputBuffer, beginOffset, input.length()
452: - beginOffset);
453: return subCount;
454: }
455: }
|