001: /******************************************************************
002: * File: Tokenizer.java
003: * Created by: Dave Reynolds
004: * Created on: 24-Jun-2003
005: *
006: * (c) Copyright 2003, 2004, 2005, 2006, 2007, 2008 Hewlett-Packard Development Company, LP
007: * [See end of file]
008: * $Id: Tokenizer.java,v 1.10 2008/01/02 12:07:43 andy_seaborne Exp $
009: *****************************************************************/package com.hp.hpl.jena.util;
010:
011: import java.util.NoSuchElementException;
012:
013: /**
014: * A tokenizer, similar to java's StringTokenizer but allows for quoted
015: * character strings which can include other separators.
016: *
017: * @author <a href="mailto:der@hplb.hpl.hp.com">Dave Reynolds</a>
018: * @version $Revision: 1.10 $ on $Date: 2008/01/02 12:07:43 $
019: */
public class Tokenizer {

    /** The string being parsed. Must not be null: it is dereferenced on every scan. */
    protected String source;

    /** The index of the first unreturned char in source */
    protected int p;

    /** The set of delimiter characters. Must not be null. */
    protected String delim;

    /** If true then delimiters should be returned as tokens */
    protected boolean returnDelims;

    /** Literal string delimiters (quote characters). Must not be null. */
    protected String literalDelim;

    /** The lex state, one of {@link #NORMAL}, {@link #LITERAL_START}, {@link #LITERAL_END} */
    protected int state;

    /** A lookahead for tokens; null when no token is buffered */
    protected String lookahead;

    /** State flag: normal parse */
    protected static final int NORMAL = 1;

    /** State flag: start of literal (the opening quote has just been consumed) */
    protected static final int LITERAL_START = 2;

    /** State flag: end of literal (the closing quote is the next character) */
    protected static final int LITERAL_END = 3;

    /**
     * True if the token most recently produced by {@link #getNextToken()} was a
     * delimiter or quote character taken from the source, as opposed to a plain
     * token or the body of a quoted literal. Recording this at lex time lets a
     * literal such as <code>' '</code> be told apart from a real separator.
     */
    private boolean lastTokenWasDelimiter;

    /**
     * Constructor.
     * @param str the source string to be parsed
     * @param delim The set of delimiter characters
     * @param literalDelim Literal string delimiters
     * @param returnDelims If true then delimiters should be returned as tokens
     */
    public Tokenizer(String str, String delim, String literalDelim,
            boolean returnDelims) {
        this.source = str;
        this.delim = delim;
        this.literalDelim = literalDelim;
        this.returnDelims = returnDelims;
        p = 0;
        state = NORMAL;
    }

    /**
     * Return the next token. If <code>returnDelims</code> is false then
     * delimiter and quote characters are skipped, but the contents of a quoted
     * literal are always returned, even when empty or equal to a single
     * delimiter character.
     * @return the next token
     * @throws java.util.NoSuchElementException if there are no more tokens available
     */
    public String nextToken() {
        if (!hasMoreTokens()) {
            throw new NoSuchElementException(
                    "No more elements in tokenized string");
        }
        String result = lookahead;
        lookahead = null;
        return result;
    }

    /**
     * Test if there are more tokens which can be returned. A true result
     * guarantees that the next call to {@link #nextToken()} will not throw:
     * when delimiters are not being returned, trailing delimiters are skipped
     * here rather than being counted as pending tokens.
     * @return true if a further token is available
     */
    public boolean hasMoreTokens() {
        if (lookahead == null) {
            lookahead = nextReportableToken();
        }
        return lookahead != null;
    }

    /**
     * Scan forward to the next token that should be visible to the caller,
     * discarding delimiter tokens when <code>returnDelims</code> is false.
     * Iterative, so a long run of delimiters does not deepen the call stack.
     * @return the token, or null if the source is exhausted
     */
    private String nextReportableToken() {
        String token = getNextToken();
        while (token != null && !returnDelims && lastTokenWasDelimiter) {
            token = getNextToken();
        }
        return token;
    }

    /**
     * Find the next token which can either be a delimiter or a real token.
     * Updates {@link #lastTokenWasDelimiter} to describe the token returned.
     * @return the raw token, or null at end of input or in an unknown state
     */
    private String getNextToken() {
        if (p >= source.length()) {
            return null;
        }
        switch (state) {
        case NORMAL: {
            if (is(literalDelim)) {
                // Opening quote: the next call will collect the literal body.
                state = LITERAL_START;
                return consumeDelimiter();
            }
            if (is(delim)) {
                return consumeDelimiter();
            }
            // Plain token: runs up to the next delimiter. Quote characters
            // inside it do not end the token.
            lastTokenWasDelimiter = false;
            int start = p;
            p++;
            while (p < source.length() && !is(delim)) {
                p++;
            }
            return source.substring(start, p);
        }
        case LITERAL_START:
            lastTokenWasDelimiter = false;
            return readLiteralBody();
        case LITERAL_END:
            // p is on the closing quote that stopped the literal scan.
            state = NORMAL;
            return consumeDelimiter();
        default:
            return null;
        }
    }

    /**
     * Consume the single character at the current position and return it as a
     * delimiter token.
     */
    private String consumeDelimiter() {
        lastTokenWasDelimiter = true;
        p++;
        return source.substring(p - 1, p);
    }

    /**
     * Collect the body of a quoted literal. On entry p is just past the opening
     * quote. A backslash causes the following character to be taken literally
     * (the backslash itself is dropped). The scan stops at the matching
     * unescaped quote, leaving p on it, or at the end of the source if the
     * literal is unterminated.
     * @return the literal contents, possibly empty
     */
    private String readLiteralBody() {
        char quote = source.charAt(p - 1);
        StringBuffer literal = new StringBuffer();
        while (p < source.length()) {
            char c = source.charAt(p);
            if (c == '\\') {
                p++;
                if (p >= source.length()) {
                    break; // dangling backslash at end of input
                }
                c = source.charAt(p);
            } else if (c == quote) {
                break;
            }
            literal.append(c);
            p++;
        }
        state = LITERAL_END;
        return literal.toString();
    }

    /**
     * Returns true if the current character is contained in the given classification.
     */
    private boolean is(String classification) {
        return classification.indexOf(source.charAt(p)) != -1;
    }

    /**
     * Simple manual check: tokenizes a sample string, returning delimiters,
     * and prints each token to standard output.
     */
    public static void main(String[] args) {
        System.out.println("Starting");
        Tokenizer tokenizer = new Tokenizer(
                "foo '' 'a literal' \"a double literal\" 'literal with \\\" in it' 'literal with unquoted\"in it'",
                "()[], \t\n\r", "'\"", true);
        while (tokenizer.hasMoreTokens()) {
            String t = tokenizer.nextToken();
            System.out.println("Token: [" + t + "]");
        }
    }
}
174:
175: /*
176: (c) Copyright 2003, 2004, 2005, 2006, 2007, 2008 Hewlett-Packard Development Company, LP
177: All rights reserved.
178:
179: Redistribution and use in source and binary forms, with or without
180: modification, are permitted provided that the following conditions
181: are met:
182:
183: 1. Redistributions of source code must retain the above copyright
184: notice, this list of conditions and the following disclaimer.
185:
186: 2. Redistributions in binary form must reproduce the above copyright
187: notice, this list of conditions and the following disclaimer in the
188: documentation and/or other materials provided with the distribution.
189:
190: 3. The name of the author may not be used to endorse or promote products
191: derived from this software without specific prior written permission.
192:
193: THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
194: IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
195: OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
196: IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
197: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
198: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
199: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
200: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
201: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
202: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
203: */
|