001: /*
002: * $Id: PatternMatcherInput.java,v 1.7 2003/11/07 20:16:25 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2000 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.oro.text.regex;
059:
060: /**
061: * The PatternMatcherInput class is used to preserve state across
062: * calls to the <code>contains()</code> methods of PatternMatcher instances.
063: * It is also used to specify that only a subregion of a string
064: * should be used as input when looking for a pattern match. All that
065: * is meant by preserving state is that the end offset of the last match
066: * is remembered, so that the next match is performed from that point
067: * where the last match left off. This offset can be accessed from
068: * the {@link #getCurrentOffset()} method and can be set with the
069: * {@link #setCurrentOffset(int)} method.
070: * <p>
071: * You would use a PatternMatcherInput object when you want to search for
072: * more than just the first occurrence of a pattern in a string, or when
073: * you only want to search a subregion of the string for a match. An
074: * example of its most common use is:
075: * <blockquote><pre>
076: * PatternMatcher matcher;
077: * PatternCompiler compiler;
078: * Pattern pattern;
079: * PatternMatcherInput input;
080: * MatchResult result;
081: *
082: * compiler = new Perl5Compiler();
083: * matcher = new Perl5Matcher();
084: *
085: * try {
086: * pattern = compiler.compile(somePatternString);
087: * } catch(MalformedPatternException e) {
088: * System.out.println("Bad pattern.");
089: * System.out.println(e.getMessage());
090: * return;
091: * }
092: *
093: * input = new PatternMatcherInput(someStringInput);
094: *
095: * while(matcher.contains(input, pattern)) {
096: * result = matcher.getMatch();
097: * // Perform whatever processing on the result you want.
098: * }
099: * // Suppose we want to start searching from the beginning again with
100: * // a different pattern.
101: * // Just set the current offset to the begin offset.
102: * input.setCurrentOffset(input.getBeginOffset());
103: *
104: * // Second search omitted
105: *
106: * // Suppose we're done with this input, but want to search another string.
107: * // There's no need to create another PatternMatcherInput instance.
108: * // We can just use the setInput() method.
109: * input.setInput(aNewInputString);
110: *
111: * </pre></blockquote>
112: *
113: * @version @version@
114: * @since 1.0
115: * @see PatternMatcher
116: */
public final class PatternMatcherInput {
  // NOTE(review): the fields below are package-visible rather than private.
  // Presumably matcher implementations in this package read and write them
  // directly -- confirm before narrowing their visibility or renaming them.

  /** The String supplied as input, or null when a char[] was supplied. */
  String _originalStringInput;

  /** The char[] supplied as input, or null when a String was supplied. */
  char[] _originalCharInput;

  /**
   * The buffer that is actually searched.  For String input this is the
   * array returned by <code>toCharArray()</code>; for char[] input it is the
   * caller's array itself.
   */
  char[] _originalBuffer;

  /**
   * Never read by this class; it is only reset to null whenever new input
   * is associated.  NOTE(review): presumably a lower-cased copy of the
   * buffer maintained by a matcher -- confirm against the matcher code.
   */
  char[] _toLowerBuffer;

  /** Absolute offsets into the buffer delimiting the input region. */
  int _beginOffset, _endOffset, _currentOffset;

  /** Absolute offsets of the last recorded match; -1 means none recorded. */
  int _matchBeginOffset = -1, _matchEndOffset = -1;

  /**
   * Creates a PatternMatcherInput object, associating a region of a String
   * as input to be used for pattern matching by PatternMatcher objects.
   * The characters of the string are copied into an internal char[] buffer
   * (see {@link #getBuffer()}); the String instance itself is retained only
   * so that it can be handed back by {@link #getInput()}.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the String to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public PatternMatcherInput(String input, int begin, int length) {
    setInput(input, begin, length);
  }

  /**
   * Like calling
   * <blockquote><pre>
   * PatternMatcherInput(input, 0, input.length());
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public PatternMatcherInput(String input) {
    this(input, 0, input.length());
  }

  /**
   * Creates a PatternMatcherInput object, associating a region of a string
   * (represented as a char[]) as input
   * to be used for pattern matching by PatternMatcher objects.
   * A copy of the array is not made, therefore you should not modify
   * the array unless you know what you are doing.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the char[] to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   */
  public PatternMatcherInput(char[] input, int begin, int length) {
    setInput(input, begin, length);
  }

  /**
   * Like calling:
   * <blockquote><pre>
   * PatternMatcherInput(input, 0, input.length);
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public PatternMatcherInput(char[] input) {
    this(input, 0, input.length);
  }

  /**
   * @return The length of the region to be considered input for pattern
   *         matching purposes.  This is the end offset minus the begin
   *         offset, which may be less than the length of the buffer.
   */
  public int length() {
    return _endOffset - _beginOffset;
  }

  /**
   * Associates a region of a String as input
   * to be used for pattern matching by PatternMatcher objects.
   * The characters of the string are copied into a fresh char[] buffer.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region, which also resets the match offsets to -1.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the String to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public void setInput(String input, int begin, int length) {
    _originalStringInput = input;
    _originalCharInput = null;
    _toLowerBuffer = null;
    _originalBuffer = input.toCharArray();
    setCurrentOffset(begin);
    setBeginOffset(begin);
    setEndOffset(_beginOffset + length);
  }

  /**
   * This method is identical to calling:
   * <blockquote><pre>
   * setInput(input, 0, input.length());
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public void setInput(String input) {
    setInput(input, 0, input.length());
  }

  /**
   * Associates a region of a string (represented as a char[]) as input
   * to be used for pattern matching by PatternMatcher objects.
   * A copy of the array is not made, therefore you should not modify
   * the array unless you know what you are doing.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region, which also resets the match offsets to -1.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the char[] to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   */
  public void setInput(char[] input, int begin, int length) {
    _originalStringInput = null;
    _toLowerBuffer = null;
    _originalBuffer = _originalCharInput = input;
    setCurrentOffset(begin);
    setBeginOffset(begin);
    setEndOffset(_beginOffset + length);
  }

  /**
   * This method is identical to calling:
   * <blockquote><pre>
   * setInput(input, 0, input.length);
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If <code>input</code> is null.
   */
  public void setInput(char[] input) {
    setInput(input, 0, input.length);
  }

  /**
   * Returns the character at a particular offset relative to the begin
   * offset of the input.  The offset is only checked against the bounds
   * of the underlying buffer, not against the end offset of the region.
   * <p>
   * @param offset The offset at which to fetch a character (relative to
   *               the begin offset).
   * @return The character at a particular offset.
   * @exception ArrayIndexOutOfBoundsException If the resulting index does
   *            not occur within the bounds of the buffer.
   */
  public char charAt(int offset) {
    return _originalBuffer[_beginOffset + offset];
  }

  /**
   * Returns a new string that is a substring of the PatternMatcherInput
   * instance.  The substring begins at the specified beginOffset relative
   * to the begin offset and extends to the specified endOffset - 1
   * relative to the begin offset of the PatternMatcherInput instance.
   * <p>
   * @param beginOffset The offset relative to the begin offset of the
   *        PatternMatcherInput at which to start the substring (inclusive).
   * @param endOffset The offset relative to the begin offset of the
   *        PatternMatcherInput at which to end the substring (exclusive).
   * @return The specified substring.
   * @exception IndexOutOfBoundsException If the requested range does not
   *            lie within the bounds of the buffer.
   */
  public String substring(int beginOffset, int endOffset) {
    return new String(_originalBuffer, _beginOffset + beginOffset,
                      endOffset - beginOffset);
  }

  /**
   * Returns a new string that is a substring of the PatternMatcherInput
   * instance.  The substring begins at the specified beginOffset relative
   * to the begin offset and extends to the end offset of the
   * PatternMatcherInput.
   * <p>
   * @param beginOffset The offset relative to the begin offset of the
   *        PatternMatcherInput at which to start the substring.
   * @return The specified substring.
   * @exception IndexOutOfBoundsException If the requested range does not
   *            lie within the bounds of the buffer.
   */
  public String substring(int beginOffset) {
    // Convert the region-relative offset into an absolute buffer index.
    beginOffset += _beginOffset;
    return new String(_originalBuffer, beginOffset, _endOffset - beginOffset);
  }

  /**
   * Retrieves the original input used to initialize the PatternMatcherInput
   * instance.  If a String was used, the String instance will be returned.
   * If a char[] was used, that char[] instance will be returned.  This
   * violates data encapsulation and hiding principles, but it is a great
   * convenience for the programmer.
   * <p>
   * @return The String or char[] input used to initialize the
   *         PatternMatcherInput instance.
   */
  public Object getInput() {
    if (_originalStringInput == null)
      return _originalCharInput;
    return _originalStringInput;
  }

  /**
   * Retrieves the char[] buffer to be used as input by PatternMatcher
   * implementations to look for matches.  The internal array itself is
   * returned, not a copy, so it should be treated as read only by the
   * programmer.
   * <p>
   * @return The char[] buffer to be used as input by PatternMatcher
   *         implementations.
   */
  public char[] getBuffer() {
    return _originalBuffer;
  }

  /**
   * Returns whether or not the end of the input has been reached.
   * <p>
   * @return True if the current offset is greater than or equal to the
   *         end offset.
   */
  public boolean endOfInput() {
    return _currentOffset >= _endOffset;
  }

  /**
   * @return The offset of the input that should be considered the start
   *         of the region to be considered as input by PatternMatcher
   *         methods.
   */
  public int getBeginOffset() {
    return _beginOffset;
  }

  /**
   * @return The offset of the input that should be considered the end
   *         of the region to be considered as input by PatternMatcher
   *         methods.  This offset is actually 1 plus the last offset
   *         that is part of the input region.
   */
  public int getEndOffset() {
    return _endOffset;
  }

  /**
   * @return The offset of the input that should be considered the current
   *         offset where PatternMatcher methods should start looking for
   *         matches.
   */
  public int getCurrentOffset() {
    return _currentOffset;
  }

  /**
   * Sets the offset of the input that should be considered the start
   * of the region to be considered as input by PatternMatcher
   * methods.  In other words, everything before this offset is ignored
   * by a PatternMatcher.  The current offset is not adjusted.
   * <p>
   * @param offset  The offset to use as the beginning of the input.
   */
  public void setBeginOffset(int offset) {
    _beginOffset = offset;
  }

  /**
   * Sets the offset of the input that should be considered the end
   * of the region to be considered as input by PatternMatcher
   * methods.  This offset is actually 1 plus the last offset
   * that is part of the input region.
   * <p>
   * @param offset  The offset to use as the end of the input.
   */
  public void setEndOffset(int offset) {
    _endOffset = offset;
  }

  /**
   * Sets the offset of the input that should be considered the current
   * offset where PatternMatcher methods should start looking for
   * matches.  Also resets all match offset information to -1.  By calling
   * this method, you invalidate all previous match information.  Therefore
   * a PatternMatcher implementation must call this method before setting
   * match offset information.
   * <p>
   * @param offset  The offset to use as the current offset.
   */
  public void setCurrentOffset(int offset) {
    _currentOffset = offset;
    setMatchOffsets(-1, -1);
  }

  /**
   * Returns the string representation of the input, where the input is
   * considered to start from the begin offset and end at the end offset.
   * <p>
   * @return The string representation of the input.
   */
  public String toString() {
    return new String(_originalBuffer, _beginOffset, length());
  }

  /**
   * A convenience method returning the part of the input occurring before
   * the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * <p>
   * @return The input preceding a match.
   * @exception StringIndexOutOfBoundsException If no match begin offset
   *            is currently recorded, or the recorded offset does not lie
   *            within the bounds of the buffer.
   */
  public String preMatch() {
    // A negative begin offset always made the String constructor fail;
    // fail with the same exception type but a message that says why.
    if (_matchBeginOffset < 0)
      throw noMatchRecorded("preMatch", _matchBeginOffset);
    return new String(_originalBuffer, _beginOffset,
                      _matchBeginOffset - _beginOffset);
  }

  /**
   * A convenience method returning the part of the input occurring after
   * the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * <p>
   * @return The input succeeding a contains() match.
   * @exception StringIndexOutOfBoundsException If no match end offset
   *            is currently recorded, or the recorded offset does not lie
   *            within the bounds of the buffer.
   */
  public String postMatch() {
    // Same exception type as the String constructor would raise, with a
    // message that explains the cause.
    if (_matchEndOffset < 0)
      throw noMatchRecorded("postMatch", _matchEndOffset);
    return new String(_originalBuffer, _matchEndOffset,
                      _endOffset - _matchEndOffset);
  }

  /**
   * A convenience method returning the part of the input corresponding
   * to the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * The method is not called getMatch() so as not to confuse it
   * with Perl5Matcher's getMatch() which returns a MatchResult instance
   * and also for consistency with preMatch() and postMatch().
   * <p>
   * @return The input consisting of the match found by contains().
   * @exception StringIndexOutOfBoundsException If no match begin offset
   *            is currently recorded, or the recorded offsets do not
   *            describe a range within the bounds of the buffer.
   */
  public String match() {
    // Only the begin offset is guarded here: a negative end offset paired
    // with a valid begin offset is still rejected by the String constructor.
    if (_matchBeginOffset < 0)
      throw noMatchRecorded("match", _matchBeginOffset);
    return new String(_originalBuffer, _matchBeginOffset,
                      _matchEndOffset - _matchBeginOffset);
  }

  /**
   * This method is intended for use by PatternMatcher implementations.
   * It is necessary to record the location of the previous match so that
   * consecutive contains() matches involving null string matches are
   * properly handled.  If you are not implementing a PatternMatcher, forget
   * this method exists.  If you use it outside of its intended context, you
   * will only disrupt the stored state.
   * <p>
   * As a note, the preMatch(), postMatch(), and match() methods are provided
   * as conveniences because PatternMatcherInput must store match offset
   * information to completely preserve state for consecutive PatternMatcher
   * contains() matches.
   * <p>
   * @param matchBeginOffset  The begin offset of a match found by contains().
   * @param matchEndOffset    The end offset of a match found by contains().
   */
  public void setMatchOffsets(int matchBeginOffset, int matchEndOffset) {
    _matchBeginOffset = matchBeginOffset;
    _matchEndOffset = matchEndOffset;
  }

  /**
   * Returns the offset marking the beginning of the match found by
   * contains().
   * <p>
   * @return The begin offset of a contains() match, or -1 if the match
   *         offsets have been reset.
   */
  public int getMatchBeginOffset() {
    return _matchBeginOffset;
  }

  /**
   * Returns the offset marking the end of the match found by contains().
   * <p>
   * @return The end offset of a contains() match, or -1 if the match
   *         offsets have been reset.
   */
  public int getMatchEndOffset() {
    return _matchEndOffset;
  }

  /**
   * Builds the exception thrown by the match accessors when the match
   * offset they depend on is negative, i.e. no match is recorded.
   */
  private static StringIndexOutOfBoundsException noMatchRecorded(String method,
                                                                 int offset) {
    return new StringIndexOutOfBoundsException(
        "PatternMatcherInput." + method + "(): no match recorded (match offset is "
        + offset + ")");
  }
}
|