001: package net.sf.saxon.instruct;
002:
003: import net.sf.saxon.om.ArrayIterator;
004: import net.sf.saxon.om.Item;
005: import net.sf.saxon.om.SequenceIterator;
006: import net.sf.saxon.om.EmptyIterator;
007: import net.sf.saxon.value.StringValue;
008:
009: import java.util.regex.Matcher;
010: import java.util.regex.Pattern;
011:
012: /**
013: * Class RegexIterator - provides an iterator over matched and unmatched substrings
014: */
015:
016: public class RegexIterator implements SequenceIterator {
017:
018: private String theString; // the input string being matched
019: private Pattern pattern; // the regex against which the string is matched
020: private Matcher matcher; // the Matcher object that does the matching, and holds the state
021: private String current; // the string most recently returned by the iterator
022: private String next; // if the last string was a matching string, null; otherwise the next substring
023: // matched by the regex
024: private int position = 0; // the value of XPath position()
025: private int prevEnd = 0; // the position in the input string of the end of the last match or non-match
026:
027: /**
028: * Construct a RegexIterator. Note that the underlying matcher.find() method is called once
029: * to obtain each matching substring. But the iterator also returns non-matching substrings
030: * if these appear between the matching substrings.
031: * @param string the string to be analysed
032: * @param pattern the regular expression
033: */
034:
035: public RegexIterator(String string, Pattern pattern) {
036: theString = string;
037: this .pattern = pattern;
038: matcher = pattern.matcher(string);
039: next = null;
040: }
041:
042: /**
043: * Get the next item in the sequence
044: * @return the next item in the sequence
045: */
046:
047: public Item next() {
048: if (next == null && prevEnd >= 0) {
049: // we've returned a match (or we're at the start), so find the next match
050: if (matcher.find()) {
051: int start = matcher.start();
052: int end = matcher.end();
053: if (prevEnd == start) {
054: // there's no intervening non-matching string to return
055: next = null;
056: current = theString.substring(start, end);
057: prevEnd = end;
058: } else {
059: // return the non-matching substring first
060: current = theString.substring(prevEnd, start);
061: next = theString.substring(start, end);
062: }
063: } else {
064: // there are no more regex matches, we must return the final non-matching text if any
065: if (prevEnd < theString.length()) {
066: current = theString.substring(prevEnd);
067: next = null;
068: } else {
069: // this really is the end...
070: current = null;
071: position = -1;
072: prevEnd = -1;
073: return null;
074: }
075: prevEnd = -1;
076: }
077: } else {
078: // we've returned a non-match, so now return the match that follows it, if there is one
079: if (prevEnd >= 0) {
080: current = next;
081: next = null;
082: prevEnd = matcher.end();
083: } else {
084: current = null;
085: position = -1;
086: return null;
087: }
088: }
089: position++;
090: return StringValue.makeStringValue(current);
091: }
092:
093: /**
094: * Get the current item in the sequence
095: * @return the item most recently returned by next()
096: */
097:
098: public Item current() {
099: return StringValue.makeStringValue(current);
100: }
101:
102: /**
103: * Get the position of the current item in the sequence
104: * @return the position of the item most recently returned by next(), starting at 1
105: */
106:
107: public int position() {
108: return position;
109: }
110:
111: /**
112: * Get another iterator over the same items
113: * @return a new iterator, positioned before the first item
114: */
115:
116: public SequenceIterator getAnother() {
117: return new RegexIterator(theString, pattern);
118: }
119:
120: /**
121: * Get properties of this iterator, as a bit-significant integer.
122: *
123: * @return the properties of this iterator. This will be some combination of
124: * properties such as {@link GROUNDED}, {@link LAST_POSITION_FINDER},
125: * and {@link LOOKAHEAD}. It is always
126: * acceptable to return the value zero, indicating that there are no known special properties.
127: * It is acceptable for the properties of the iterator to change depending on its state.
128: */
129:
130: public int getProperties() {
131: return 0;
132: }
133:
134: /**
135: * Determine whether the current item is a matching item or a non-matching item
136: * @return true if the current item (the one most recently returned by next()) is
137: * an item that matches the regular expression, or false if it is an item that
138: * does not match
139: */
140:
141: public boolean isMatching() {
142: return next == null && prevEnd >= 0;
143: }
144:
145: /**
146: * Get a substring that matches a parenthesised group within the regular expression
147: * @param number the number of the group to be obtained
148: * @return the substring of the current item that matches the n'th parenthesized group
149: * within the regular expression
150: */
151:
152: public String getRegexGroup(int number) {
153: if (!isMatching())
154: return null;
155: if (number > matcher.groupCount() || number < 0)
156: return "";
157: String s = matcher.group(number);
158: if (s == null)
159: return "";
160: return s;
161: }
162:
163: /**
164: * Get a sequence containing all the regex groups (except group 0, because we want to use indexing from 1).
165: * This is used by the saxon:analyze-string() higher-order extension function.
166: */
167:
168: public SequenceIterator getRegexGroupIterator() {
169: int c = matcher.groupCount();
170: if (c == 0) {
171: return EmptyIterator.getInstance();
172: } else {
173: StringValue[] groups = new StringValue[c];
174: for (int i = 1; i <= groups.length; i++) {
175: groups[i - 1] = StringValue.makeStringValue(matcher
176: .group(i));
177: }
178: return new ArrayIterator(groups);
179: }
180: }
181:
182: }
183:
184: //
185: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
186: // you may not use this file except in compliance with the License. You may obtain a copy of the
187: // License at http://www.mozilla.org/MPL/
188: //
189: // Software distributed under the License is distributed on an "AS IS" basis,
190: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
191: // See the License for the specific language governing rights and limitations under the License.
192: //
193: // The Original Code is: all this file.
194: //
195: // The Initial Developer of the Original Code is Michael H. Kay
196: //
197: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
198: //
199: // Contributor(s):
200: // Portions marked "e.g." are from Edwin Glaser (edwin@pannenleiter.de)
201: //
|