001: /*************************************************************************
002: * *
003: * 1) This source code file, in unmodified form, and compiled classes *
004: * derived from it can be used and distributed without restriction, *
005: * including for commercial use. (Attribution is not required *
006: * but is appreciated.) *
007: * *
008: * 2) Modified versions of this file can be made and distributed *
009: * provided: the modified versions are put into a Java package *
010: * different from the original package, edu.hws; modified *
011: * versions are distributed under the same terms as the original; *
012: * and the modifications are documented in comments. (Modification *
013: * here does not include simply making subclasses that belong to *
014: * a package other than edu.hws, which can be done without any *
015: * restriction.) *
016: * *
017: * David J. Eck *
018: * Department of Mathematics and Computer Science *
019: * Hobart and William Smith Colleges *
020: * Geneva, New York 14456, USA *
021: * Email: eck@hws.edu WWW: http://math.hws.edu/eck/ *
022: * *
023: *************************************************************************/package edu.hws.jcm.data;
024:
025: /**
026: * A ParserContext holds all the state data for a parsing operation, including the
027: * string that is being parsed, a pointer to the current position in that string,
028: * and the most recently parsed token from the string. The ParserContext object
029: * does the tokenization. Token types are retrieved by calling look() and
030: * next(). Attributes of the token are then available in the member variables
031: * tokenString, tokenObject, and tokenValue. You will probably only use this
032: * if you write a ParserExtension.
033: */
034: public class ParserContext implements java.io.Serializable {
035:
036: /**
037: * One of the possible token types returned by look() and next().
038: * Represents the end of the string
039: * that is being parsed.
040: */
041: public static final int END_OF_STRING = 1;
042:
043: /**
044: * One of the possible token types returned by look() and next().
045: * Indicates aht the token is a number. The numerical value
046: * of the token is in the tokenValue member variable.
047: */
048: public static final int NUMBER = 2;
049:
050: /**
051: * One of the possible token types returned by look() and next().
052: * The token is a word. If there is a
053: * MathObject in the symbol table associated
054: * with this word, then that object is in the
055: * tokenObject member variable. If not, tokenObject is null.
056: */
057: public static final int IDENTIFIER = 3;
058:
059: /**
060: * One of the possible token types returned by look() and next().
061: * Any other token besides end-of-string, number, or word.
062: * The only information about
063: * the token is the tokenString member variable. For some special operators
064: * (<> <= <=), the tokenString has two characters, but
065: * generally it has only one. Note that ** is translated
066: * to ^. Also, the special tokens "and", "or", and "not"
067: * are translated to type OPCHARS with tokenString
068: * equal to "&", "|", or "~" (but only if options & BOOLEANS is != 0).
069: */
070: public static final int OPCHARS = 4;
071:
072: private static final int NONE = 0; // A special value for token that is used internally to
073: // mean that the current token has been consumed by next()
074: // so that when look() or next() is called again, a new
075: // token has to be read. (Note that this is the initial value
076: // of token.)
077: /**
078: * The string that is being parsed.
079: */
080: public String data;
081:
082: /**
083: * Current position in that string, indicating how many
084: * characters have been consumed.
085: */
086: public int pos;
087:
088: /**
089: * The ExpressionProgram that is being generated as the string
090: * is parsed. Note that while parsing a ConditionalExpression, the
091: * value of prog is temporarily changed. ParserExtensions might
092: * want to do something similar.
093: */
094: public ExpressionProgram prog;
095:
096: /**
097: * The most recently read token type, or NONE if that token
098: * has been consumed by a call to next(). The value NONE is never
099: * returned by look() or next().
100: */
101: public int token;
102:
103: /**
104: * The substring of the parse string that corresponds to the most recently
105: * read token. This can change when look() or next() is called.
106: */
107: public String tokenString;
108:
109: /**
110: * If the most recently read token was of type IDENTIFIER, then
111: * this is the corresponding MathObject from the symbol table,
112: * or null if the identifier is not in the symbol table.
113: */
114: public MathObject tokenObject;
115:
116: /**
117: * If the most recently read token was of type NUMBER, then
118: * this is its numerical value.
119: */
120: public double tokenValue;
121:
122: /**
123: * The options from the Parser. Some of these options
124: * affect tokenization, such as whether BOOLEANS is enabled.
125: */
126: public int options;
127:
128: /**
129: * The Parser's symbol table, which is used for looking up
130: * tokens of type IDENTIFIER.
131: */
132: protected SymbolTable symbols;
133:
134: private StringBuffer tokBuf = new StringBuffer(); // Used in the readToken method. (Created
135:
136: // once for efficiency.)
137:
138: /**
139: * Create a ParserContext for parsing the data String, using the
140: * specified options and symbol table. A new ExpressionProgram
141: * is created to hold the program that will be generated from
142: * the string.
143: */
144: public ParserContext(String data, int options, SymbolTable symbols) {
145: this .data = data;
146: this .options = options;
147: this .symbols = symbols;
148: prog = new ExpressionProgram();
149: }
150:
151: //---------- Wrapper functions for accessing the symbol table --------------
152:
153: /** MathObjects added to the symbol table after a call to mark() will
154: * be removed by a later, matching call to revert(). In the meantime,
155: * older symbols of the same name will only be hidden, not replaced,
156: * so they will still be there after the revert. It is important that
157: * a call to this routine is followed by a later call to revert! No
158: * error checking is done to make sure that this is true.
159: */
160: public void mark() { // ADDED SEPTEMBER 23, 2000
161: symbols = new SymbolTable(symbols);
162: }
163:
164: /* After a call to mark(), revert() must be called to restore the
165: * state of the symbol table.
166: */
167: public void revert() { // ADDED SEPTEMBER 23, 2000
168: symbols = symbols.getParent();
169: }
170:
171: /**
172: * Get the MathObject associated with name in the symbol table.
173: */
174: public MathObject get(String name) {
175: if ((options & Parser.CASE_SENSITIVE) != 0)
176: return symbols.get(name);
177: else
178: return symbols.get(name.toLowerCase());
179: }
180:
181: /**
182: * Add a new MathObject to the symbol table.
183: */
184: public void add(MathObject sym) {
185: if ((options & Parser.CASE_SENSITIVE) != 0)
186: symbols.add(sym);
187: else
188: symbols.add(sym.getName().toLowerCase(), sym);
189: }
190:
191: //---------------------------- Tokenization ---------------------------------
192:
193: /**
194: * Consume one token from the string. The token type is returned.
195: * After this is called, attributes of the token can be obtained
196: * from the public member variables tokenString, tokenObject,
197: * and tokenValue. Note that the END_OF_STRING token is never
198: * really consumed and can be returned multiple times. Can
199: * throw a ParseError in the case of an illegal numeric token.
200: */
201: public int next() {
202: int tok = look();
203: if (token != END_OF_STRING)
204: token = NONE;
205: return tok;
206: }
207:
208: /**
209: * Look ahead at the next token in the data string, without consuming it.
210: * Successive calls to look() will return the same token. (The token
211: * must be consumed by a call to next().) The token type is returned.
212: * After a call to look(), attributes of the token can be obtained
213: * from the public member variables tokenString, tokenObject,
214: * and tokenValue. Can throw a ParseError in the case of an illegal
215: * numeric token.
216: */
217: public int look() {
218: if (token == NONE) {
219: // Token has been consumed. Read a new token.
220: while (pos < data.length()
221: && (data.charAt(pos) == ' ' || data.charAt(pos) == '\t'))
222: pos++;
223: if (pos >= data.length()) {
224: token = END_OF_STRING;
225: tokenString = null;
226: } else
227: readToken();
228: }
229: return token;
230: }
231:
232: /**
233: * Read the next token from the data string, and set the values of token, tokenString,
234: * tokenNumber, and tokenObject appropriately.
235: * When this is called, we know that pos < data.length and data.charAt(pos)
236: * is not blank or tab.
237: */
238: private void readToken() {
239: char ch = data.charAt(pos); // The first character of the token. This determines the token type.
240: int savePosition = pos; // The starting position in data string.
241: tokBuf.setLength(0);
242: if (Character.isLetter(ch)
243: || (ch == '_' && ((options & Parser.NO_UNDERSCORE_IN_IDENTIFIERS) == 0))) {
244: token = IDENTIFIER;
245: while (Character.isLetter(ch)
246: || (ch == '_' && ((options & Parser.NO_UNDERSCORE_IN_IDENTIFIERS) == 0))
247: || (Character.isDigit(ch) && ((options & Parser.NO_DIGITS_IN_IDENTIFIERS) == 0))) {
248: tokBuf.append(ch);
249: pos++;
250: if (pos >= data.length())
251: break;
252: ch = data.charAt(pos);
253: }
254: tokenString = tokBuf.toString();
255: tokenObject = null;
256: for (int i = tokenString.length(); i > 0; i--) {
257: // Tricky programming: If the OPTIONAL_SPACES option is not set,
258: // then this for loop is executed only once, because of the break
259: // at the end. Therefor, only the complete string is tested as
260: // being a known identifier. If the option is off, then
261: // each prefix of the string is tested, the process ending with
262: // the longest prefix that is a known identifier. However, if
263: // no prefix is a known word, then the entire string is reported
264: // as an unknown identifier.
265: String str = tokenString.substring(0, i);
266: if (((options & Parser.BOOLEANS) != 0)) {
267: // If BOOLEANS is enabled, the special words "and", "or", "not" are
268: // converted to operators. Case is ignored. Note that when BOOLEANS
269: // is enabled, it is impossible to have MathObjects named "and", "or",
270: // "And", "AND", etc. (That is, such MathObjects are hidden if they exist.)
271: if (str.equalsIgnoreCase("and")) {
272: token = OPCHARS;
273: tokenString = "&";
274: pos = savePosition + 3;
275: return;
276: } else if (str.equalsIgnoreCase("or")) {
277: token = OPCHARS;
278: tokenString = "|";
279: pos = savePosition + 2;
280: return;
281: } else if (str.equalsIgnoreCase("not")) {
282: token = OPCHARS;
283: tokenString = "~";
284: pos = savePosition + 3;
285: return;
286: }
287: }
288: if (get(str) != null) {
289: tokenString = str;
290: tokenObject = get(tokenString);
291: pos = savePosition + i;
292: break;
293: }
294: if (((options & Parser.OPTIONAL_SPACES) == 0))
295: break;
296: }
297: } else if (Character.isDigit(ch)
298: || (ch == '.' && pos < data.length() - 1 && Character
299: .isDigit(data.charAt(pos + 1)))) {
300: token = NUMBER;
301: while (pos < data.length()
302: && Character.isDigit(data.charAt(pos)))
303: tokBuf.append(data.charAt(pos++));
304: if (pos < data.length() && data.charAt(pos) == '.') {
305: tokBuf.append(data.charAt(pos++));
306: while (pos < data.length()
307: && Character.isDigit(data.charAt(pos)))
308: tokBuf.append(data.charAt(pos++));
309: }
310: if (pos < data.length()
311: && (data.charAt(pos) == 'e' || data.charAt(pos) == 'E')) {
312: savePosition = pos;
313: tokBuf.append(data.charAt(pos++));
314: if (pos < data.length()
315: && (data.charAt(pos) == '+' || data.charAt(pos) == '-'))
316: tokBuf.append(data.charAt(pos++));
317: if (pos >= data.length()
318: || (!Character.isDigit(data.charAt(pos)))) {
319: if ((options & Parser.OPTIONAL_STARS) == 0)
320: throw new ParseError("Illegal number, '"
321: + tokBuf.toString()
322: + "'. No digits in exponential part.",
323: this );
324: else
325: pos = savePosition;
326: } else {
327: while (pos < data.length()
328: && Character.isDigit(data.charAt(pos)))
329: tokBuf.append(data.charAt(pos++));
330: }
331: }
332: tokenString = tokBuf.toString();
333: double d = NumUtils.stringToReal(tokenString);
334: if (Double.isInfinite(d))
335: throw new ParseError("The number '" + tokBuf.toString()
336: + "' is outside the range of legal numbers.",
337: this );
338: if (Double.isNaN(d))
339: throw new ParseError("The string '" + tokBuf.toString()
340: + "' is not a legal number.", this );
341: tokenValue = d;
342: } else {
343: token = OPCHARS;
344: tokenString = "" + ch;
345: pos++;
346: if (pos < data.length()) {
347: // Check for two-character operators.
348: char nextch = data.charAt(pos);
349: switch (ch) {
350: case '*':
351: if (nextch == '*') { // "**" is an alternative to "^".
352: tokenString = "^";
353: pos++;
354: }
355: break;
356: case '=':
357: if (nextch == '<' || nextch == '>')
358: tokenString = data.charAt(pos++) + tokenString;
359: break;
360: case '<':
361: if (nextch == '=' || nextch == '>')
362: tokenString += data.charAt(pos++);
363: break;
364: case '>':
365: if (nextch == '=')
366: tokenString += data.charAt(pos++);
367: else if (nextch == '<')
368: tokenString = data.charAt(pos++) + tokenString;
369: break;
370: }
371: }
372: }
373: }
374:
375: } // end class ParserContext
|