001: /**
002: * Copyright (c) 2001, Sergey A. Samokhodkin
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without modification,
006: * are permitted provided that the following conditions are met:
007: *
008: * - Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * - Redistributions in binary form
011: * must reproduce the above copyright notice, this list of conditions and the following
012: * disclaimer in the documentation and/or other materials provided with the distribution.
013: * - Neither the name of jregex nor the names of its contributors may be used
014: * to endorse or promote products derived from this software without specific prior
015: * written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
018: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020: * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021: * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
022: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
023: * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
024: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
025: * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026: *
027: * @version 1.2_01
028: */package jregex;
029:
030: import java.io.*;
031: import java.util.*;
032:
033: /**
034: * The Tokenizer class suggests a methods to break a text into tokens using
035: * occurences of a pattern as delimiters.
036: * There are two ways to obtain a text tokenizer for some pattern:<pre>
037: * Pattern p=new Pattern("\\s+"); //any number of space characters
038: * String text="blah blah blah";
039: * //by factory method
040: * RETokenizer tok1=p.tokenizer(text);
041: * //or by constructor
042: * RETokenizer tok2=new RETokenizer(p,text);
043: * </pre>
044: * Now the one way is to use the tokenizer as a token enumeration/iterator:<pre>
045: * while(tok1.hasMore()) System.out.println(tok1.nextToken());
046: * </pre>
047: * and another way is to split it into a String array:<pre>
048: * String[] arr=tok2.split();
049: * for(int i=0;i<tok2.length;i++) System.out.println(arr[i]);
050: * </pre>
051: * @see Pattern#tokenizer(java.lang.String)
052: */
053:
054: public class RETokenizer implements Enumeration {
055: private Matcher matcher;
056: private boolean checked;
057: private boolean hasToken;
058: private String token;
059: private int pos = 0;
060: private boolean endReached = false;
061: private boolean emptyTokensEnabnled = false;
062:
063: public RETokenizer(Pattern pattern, String text) {
064: this (pattern.matcher(text), false);
065: }
066:
067: public RETokenizer(Pattern pattern, char[] chars, int off, int len) {
068: this (pattern.matcher(chars, off, len), false);
069: }
070:
071: public RETokenizer(Pattern pattern, Reader r, int len)
072: throws IOException {
073: this (pattern.matcher(r, len), false);
074: }
075:
076: public RETokenizer(Matcher m, boolean emptyEnabled) {
077: matcher = m;
078: emptyTokensEnabnled = emptyEnabled;
079: }
080:
081: public void setEmptyEnabled(boolean b) {
082: emptyTokensEnabnled = b;
083: }
084:
085: public boolean isEmptyEnabled() {
086: return emptyTokensEnabnled;
087: }
088:
089: public boolean hasMore() {
090: if (!checked)
091: check();
092: return hasToken;
093: }
094:
095: public String nextToken() {
096: if (!checked)
097: check();
098: if (!hasToken)
099: throw new NoSuchElementException();
100: checked = false;
101: return token;
102: }
103:
104: public String[] split() {
105: return collect(this , null, 0);
106: }
107:
108: public void reset() {
109: matcher.setPosition(0);
110: }
111:
112: private static final String[] collect(RETokenizer tok,
113: String[] arr, int count) {
114: if (tok.hasMore()) {
115: String s = tok.nextToken();
116: //System.out.println("collect(,,"+count+"): token="+s);
117: arr = collect(tok, arr, count + 1);
118: arr[count] = s;
119: } else {
120: arr = new String[count];
121: }
122: return arr;
123: }
124:
125: private void check() {
126: final boolean emptyOk = this .emptyTokensEnabnled;
127: checked = true;
128: if (endReached) {
129: hasToken = false;
130: return;
131: }
132: Matcher m = matcher;
133: boolean hasMatch = false;
134: while (m.find()) {
135: if (m.start() > 0) {
136: hasMatch = true;
137: break;
138: } else if (m.end() > 0) {
139: if (emptyOk) {
140: hasMatch = true;
141: break;
142: } else
143: m.setTarget(m, MatchResult.SUFFIX);
144: }
145: }
146: if (!hasMatch) {
147: endReached = true;
148: if (m.length(m.TARGET) == 0 && !emptyOk) {
149: hasToken = false;
150: } else {
151: hasToken = true;
152: token = m.target();
153: }
154: return;
155: }
156: //System.out.println(m.target()+": "+m.groupv());
157: //System.out.println("prefix: "+m.prefix());
158: //System.out.println("suffix: "+m.suffix());
159: hasToken = true;
160: token = m.prefix();
161: m.setTarget(m, MatchResult.SUFFIX);
162: //m.setTarget(m.suffix());
163: }
164:
165: public boolean hasMoreElements() {
166: return hasMore();
167: }
168:
169: /**
170: * @return a next token as a String
171: */
172: public Object nextElement() {
173: return nextToken();
174: }
175:
176: /*
177: public static void main(String[] args){
178: RETokenizer rt=new RETokenizer(new Pattern("/").matcher("/a//b/c/"),false);
179: while(rt.hasMore()){
180: System.out.println("<"+rt.nextToken()+">");
181: }
182: }
183: */
184: }
|