001: package org.ofbiz.rules.parse.tokens;
002:
003: import java.io.*;
004:
005: /**
006: * <p><b>Title:</b> Tokenizer
007: * <p><b>Description:</b> None
008: * <p>Copyright (c) 1999 Steven J. Metsker.
009: * <p>Copyright (c) 2001 The Open For Business Project - www.ofbiz.org
010: *
011: * <p>Permission is hereby granted, free of charge, to any person obtaining a
012: * copy of this software and associated documentation files (the "Software"),
013: * to deal in the Software without restriction, including without limitation
014: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
015: * and/or sell copies of the Software, and to permit persons to whom the
016: * Software is furnished to do so, subject to the following conditions:
017: *
018: * <p>The above copyright notice and this permission notice shall be included
019: * in all copies or substantial portions of the Software.
020: *
021: * <p>THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
022: * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
023: * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
024: * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
025: * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
026: * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
027: * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
028: *
029: * <br>
030: * A tokenizer divides a string into tokens. This class is
031: * highly customizable with regard to exactly how this division
032: * occurs, but it also has defaults that are suitable for many
033: * languages. This class assumes that the character values read
034: * from the string lie in the range 0-255. For example, the
035: * Unicode value of a capital A is 65, so
036: * <code> System.out.println((char)65); </code> prints out a
037: * capital A.
038: *
039: * <p>
040: * The behavior of a tokenizer depends on its character state
041: * table. This table is an array of 256 <code>TokenizerState
042: * </code> states. The state table decides which state to
043: * enter upon reading a character from the input
044: * string.
045: *
046: * <p>
047: * For example, by default, upon reading an 'A', a tokenizer
048: * will enter a "word" state. This means the tokenizer will
049: * ask a <code>WordState</code> object to consume the 'A',
050: * along with the characters after the 'A' that form a word.
051: * The state's responsibility is to consume characters and
052: * return a complete token.
053: *
054: * <p>
055: * The default table sets a SymbolState for every character
056: * from 0 to 255, and then overrides this with:
057: *
058: * <blockquote><pre>
059: * From To State
060: * 0 ' ' whitespaceState
061: * 'a' 'z' wordState
062: * 'A' 'Z' wordState
063: * 192 255 wordState
064: * '0' '9' numberState
065: * '-' '-' numberState
066: * '.' '.' numberState
067: * '"' '"' quoteState
068: * '\'' '\'' quoteState
069: * '/' '/' slashState
070: * </pre></blockquote>
071: *
072: * In addition to allowing modification of the state table,
073: * this class makes each of the states above available. Some
074: * of these states are customizable. For example, wordState
075: * allows customization of what characters can be part of a
076: * word, after the first character.
077: *
078: * @author Steven J. Metsker
079: * @version 1.0
080: */
081: public class Tokenizer {
082:
083: /**
084: * The reader to read characters from
085: */
086: protected PushbackReader reader;
087:
088: /**
089: * The number of characters that might be in a symbol;
090: */
091: protected static final int DEFAULT_SYMBOL_MAX = 4;
092:
093: /**
094: * The state lookup table
095: */
096: protected TokenizerState[] characterState = new TokenizerState[256];
097:
098: /**
099: * The default states that actually consume text and
100: * produce a token
101: */
102: protected NumberState numberState = new NumberState();
103: protected QuoteState quoteState = new QuoteState();
104: protected SlashState slashState = new SlashState();
105: protected SymbolState symbolState = new SymbolState();
106: protected WhitespaceState whitespaceState = new WhitespaceState();
107: protected WordState wordState = new WordState();
108:
109: /**
110: * Constructs a tokenizer with a default state table (as
111: * described in the class comment).
112: *
113: * @return a tokenizer
114: */
115: public Tokenizer() {
116:
117: setCharacterState(0, 255, symbolState()); // the default
118:
119: setCharacterState(0, ' ', whitespaceState());
120: setCharacterState('a', 'z', wordState());
121: setCharacterState('A', 'Z', wordState());
122: setCharacterState(0xc0, 0xff, wordState());
123: setCharacterState('0', '9', numberState());
124: setCharacterState('-', '-', numberState());
125: setCharacterState('.', '.', numberState());
126: setCharacterState('"', '"', quoteState());
127: setCharacterState('\'', '\'', quoteState());
128: setCharacterState('/', '/', slashState());
129: }
130:
131: /**
132: * Constructs a tokenizer to read from the supplied string.
133: *
134: * @param String the string to read from
135: */
136: public Tokenizer(String s) {
137: this ();
138: setString(s);
139: }
140:
141: /**
142: * Return the reader this tokenizer will read from.
143: *
144: * @return the reader this tokenizer will read from
145: */
146: public PushbackReader getReader() {
147: return reader;
148: }
149:
150: /**
151: * Return the next token.
152: *
153: * @return the next token.
154: *
155: * @exception IOException if there is any problem reading
156: */
157: public Token nextToken() throws IOException {
158: int c = reader.read();
159:
160: /* There was a defect here, that resulted from the fact
161: * that unreading a -1 results in the next read having a
162: * value of (int)(char)-1, which is 65535. This may be
163: * a defect in PushbackReader. */
164:
165: if (c >= 0 && c < characterState.length) {
166: return characterState[c].nextToken(reader, c, this );
167: }
168: return Token.EOF;
169: }
170:
171: /**
172: * Return the state this tokenizer uses to build numbers.
173: *
174: * @return the state this tokenizer uses to build numbers
175: */
176: public NumberState numberState() {
177: return numberState;
178: }
179:
180: /**
181: * Return the state this tokenizer uses to build quoted
182: * strings.
183: *
184: * @return the state this tokenizer uses to build quoted
185: * strings
186: */
187: public QuoteState quoteState() {
188: return quoteState;
189: }
190:
191: /**
192: * Change the state the tokenizer will enter upon reading
193: * any character between "from" and "to".
194: *
195: * @param from the "from" character
196: *
197: * @param to the "to" character
198: *
199: * @param TokenizerState the state to enter upon reading a
200: * character between "from" and "to"
201: */
202: public void setCharacterState(int from, int to, TokenizerState state) {
203:
204: for (int i = from; i <= to; i++) {
205: if (i >= 0 && i < characterState.length) {
206: characterState[i] = state;
207: }
208: }
209: }
210:
211: /**
212: * Set the reader to read from.
213: *
214: * @param PushbackReader the reader to read from
215: */
216: public void setReader(PushbackReader r) {
217: this .reader = r;
218: }
219:
220: /**
221: * Set the string to read from.
222: *
223: * @param String the string to read from
224: */
225: public void setString(String s) {
226: setString(s, DEFAULT_SYMBOL_MAX);
227: }
228:
229: /**
230: * Set the string to read from.
231: *
232: * @param String the string to read from
233: *
234: * @param int the maximum length of a symbol, which
235: * establishes the size of pushback buffer
236: * we need
237: */
238: public void setString(String s, int symbolMax) {
239: setReader(new PushbackReader(new StringReader(s), symbolMax));
240: }
241:
242: /**
243: * Return the state this tokenizer uses to recognize
244: * (and ignore) comments.
245: *
246: * @return the state this tokenizer uses to recognize
247: * (and ignore) comments
248: *
249: */
250: public SlashState slashState() {
251: return slashState;
252: }
253:
254: /**
255: * Return the state this tokenizer uses to recognize
256: * symbols.
257: *
258: * @return the state this tokenizer uses to recognize
259: * symbols
260: */
261: public SymbolState symbolState() {
262: return symbolState;
263: }
264:
265: /**
266: * Return the state this tokenizer uses to recognize (and
267: * ignore) whitespace.
268: *
269: * @return the state this tokenizer uses to recognize
270: * whitespace
271: */
272: public WhitespaceState whitespaceState() {
273: return whitespaceState;
274: }
275:
276: /**
277: * Return the state this tokenizer uses to build words.
278: *
279: * @return the state this tokenizer uses to build words
280: */
281: public WordState wordState() {
282: return wordState;
283: }
284: }
|