001: package net.sf.saxon.functions;
002:
003: import net.sf.saxon.expr.Expression;
004: import net.sf.saxon.expr.StaticContext;
005: import net.sf.saxon.expr.XPathContext;
006: import net.sf.saxon.om.Item;
007: import net.sf.saxon.om.SequenceIterator;
008: import net.sf.saxon.om.EmptyIterator;
009: import net.sf.saxon.trans.DynamicError;
010: import net.sf.saxon.trans.StaticError;
011: import net.sf.saxon.trans.XPathException;
012: import net.sf.saxon.type.RegexTranslator;
013: import net.sf.saxon.value.AtomicValue;
014: import net.sf.saxon.value.StringValue;
015: import net.sf.saxon.value.Value;
016:
017: import java.util.regex.Matcher;
018: import java.util.regex.Pattern;
019: import java.util.regex.PatternSyntaxException;
020:
021: /**
022: * This class implements the tokenize() function for regular expression matching. This returns a
023: * sequence of strings representing the unmatched substrings: the separators which match the
024: * regular expression are not returned.
025: */
026:
027: public class Tokenize extends SystemFunction {
028:
029: private Pattern regexp;
030:
031: /**
032: * Simplify and validate.
033: * This is a pure function so it can be simplified in advance if the arguments are known
034: */
035:
036: public Expression simplify(StaticContext env) throws XPathException {
037: Expression e = simplifyArguments(env);
038:
039: // compile the regular expression once if possible
040: if (!(e instanceof Value)) {
041: try {
042: regexp = Matches.tryToCompile(argument, 1, 2);
043: } catch (StaticError err) {
044: err.setLocator(this );
045: throw err;
046: }
047: // check that it's not a pattern that matches ""
048: if (regexp != null && regexp.matcher("").matches()) {
049: StaticError err = new StaticError(
050: "The regular expression in tokenize() must not be one that matches a zero-length string");
051: err.setErrorCode("FORX0003");
052: err.setLocator(this );
053: throw err;
054: }
055: }
056:
057: return e;
058: }
059:
060: /**
061: * Iterate over the results of the function
062: */
063:
064: public SequenceIterator iterate(XPathContext c)
065: throws XPathException {
066: AtomicValue sv = (AtomicValue) argument[0].evaluateItem(c);
067: if (sv == null) {
068: return EmptyIterator.getInstance();
069: }
070: ;
071: CharSequence input = sv.getStringValueCS();
072: if (input.length() == 0) {
073: return EmptyIterator.getInstance();
074: }
075:
076: Pattern re = regexp;
077: if (re == null) {
078:
079: sv = (AtomicValue) argument[1].evaluateItem(c);
080: CharSequence pattern = sv.getStringValueCS();
081:
082: CharSequence flags;
083: if (argument.length == 2) {
084: flags = "";
085: } else {
086: sv = (AtomicValue) argument[2].evaluateItem(c);
087: flags = sv.getStringValueCS();
088: }
089:
090: try {
091: String javaRegex = RegexTranslator.translate(pattern,
092: true);
093: re = Pattern
094: .compile(javaRegex, Matches.setFlags(flags));
095: } catch (RegexTranslator.RegexSyntaxException err) {
096: throw new DynamicError(err);
097: } catch (PatternSyntaxException err) {
098: throw new DynamicError(err);
099: }
100:
101: // check that it's not a pattern that matches ""
102: if (re.matcher("").matches()) {
103: StaticError err = new StaticError(
104: "The regular expression in tokenize() must not be one that matches a zero-length string");
105: err.setErrorCode("FORX0003");
106: err.setLocator(this );
107: throw err;
108: }
109:
110: }
111: return new TokenIterator(input, re);
112: }
113:
114: /**
115: * Inner class TokenIterator
116: */
117:
118: public static class TokenIterator implements SequenceIterator {
119:
120: private CharSequence input;
121: private Pattern pattern;
122: private Matcher matcher;
123: private CharSequence current;
124: private int position = 0;
125: private int prevEnd = 0;
126:
127: /**
128: * Construct a TokenIterator.
129: */
130:
131: public TokenIterator(CharSequence input, Pattern pattern) {
132: this .input = input;
133: this .pattern = pattern;
134: matcher = pattern.matcher(input);
135: prevEnd = 0;
136: }
137:
138: public Item next() {
139: if (prevEnd < 0) {
140: current = null;
141: position = -1;
142: return null;
143: }
144:
145: if (matcher.find()) {
146: current = input.subSequence(prevEnd, matcher.start());
147: prevEnd = matcher.end();
148: } else {
149: current = input.subSequence(prevEnd, input.length());
150: prevEnd = -1;
151: }
152: position++;
153: return StringValue.makeStringValue(current);
154: }
155:
156: public Item current() {
157: return (current == null ? null : StringValue
158: .makeStringValue(current));
159: }
160:
161: public int position() {
162: return position;
163: }
164:
165: public SequenceIterator getAnother() {
166: return new TokenIterator(input, pattern);
167: }
168:
169: /**
170: * Get properties of this iterator, as a bit-significant integer.
171: *
172: * @return the properties of this iterator. This will be some combination of
173: * properties such as {@link GROUNDED}, {@link LAST_POSITION_FINDER},
174: * and {@link LOOKAHEAD}. It is always
175: * acceptable to return the value zero, indicating that there are no known special properties.
176: * It is acceptable for the properties of the iterator to change depending on its state.
177: */
178:
179: public int getProperties() {
180: return 0;
181: }
182:
183: }
184:
185: /**
186: * Simple command-line interface for testing.
187: * @param args (1) the string to be tokenized (2) the regular expression
188: * @throws Exception
189: */
190:
191: public static void main(String[] args) throws Exception {
192: String in = args[0];
193: String[] out = Pattern.compile(args[1]).split(in, 0);
194: System.out.println("results");
195: for (int i = 0; i < out.length; i++) {
196: System.out.println('[' + out[i] + ']');
197: }
198: System.out.println("end results");
199: }
200:
201: }
202:
203: //
204: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
205: // you may not use this file except in compliance with the License. You may obtain a copy of the
206: // License at http://www.mozilla.org/MPL/
207: //
208: // Software distributed under the License is distributed on an "AS IS" basis,
209: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
210: // See the License for the specific language governing rights and limitations under the License.
211: //
212: // The Original Code is: all this file.
213: //
214: // The Initial Developer of the Original Code is Michael H. Kay
215: //
216: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
217: //
218: // Contributor(s): none.
219: //
|