001: /*
002: [The "BSD licence"]
003: Copyright (c) 2005-2006 Terence Parr
004: All rights reserved.
005:
006: Redistribution and use in source and binary forms, with or without
007: modification, are permitted provided that the following conditions
008: are met:
009: 1. Redistributions of source code must retain the above copyright
010: notice, this list of conditions and the following disclaimer.
011: 2. Redistributions in binary form must reproduce the above copyright
012: notice, this list of conditions and the following disclaimer in the
013: documentation and/or other materials provided with the distribution.
014: 3. The name of the author may not be used to endorse or promote products
015: derived from this software without specific prior written permission.
016:
017: THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
018: IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019: OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020: IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
021: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
022: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
023: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
024: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
026: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: */
028: package org.antlr.runtime;
029:
030: /** A lexer is recognizer that draws input symbols from a character stream.
031: * lexer grammars result in a subclass of this object. A Lexer object
032: * uses simplified match() and error recovery mechanisms in the interest
033: * of speed.
034: */
035: public abstract class Lexer extends BaseRecognizer implements
036: TokenSource {
037: /** Where is the lexer drawing characters from? */
038: protected CharStream input;
039:
040: /** The goal of all lexer rules/methods is to create a token object.
041: * This is an instance variable as multiple rules may collaborate to
042: * create a single token. nextToken will return this object after
043: * matching lexer rule(s). If you subclass to allow multiple token
044: * emissions, then set this to the last token to be matched or
045: * something nonnull so that the auto token emit mechanism will not
046: * emit another token.
047: */
048: protected Token token;
049:
050: /** What character index in the stream did the current token start at?
051: * Needed, for example, to get the text for current token. Set at
052: * the start of nextToken.
053: */
054: protected int tokenStartCharIndex = -1;
055:
056: /** The line on which the first character of the token resides */
057: protected int tokenStartLine;
058:
059: /** The character position of first character within the line */
060: protected int tokenStartCharPositionInLine;
061:
062: /** The channel number for the current token */
063: protected int channel;
064:
065: /** The token type for the current token */
066: protected int type;
067:
068: /** You can set the text for the current token to override what is in
069: * the input char buffer. Use setText() or can set this instance var.
070: */
071: protected String text;
072:
073: public Lexer() {
074: }
075:
076: public Lexer(CharStream input) {
077: this .input = input;
078: }
079:
080: public void reset() {
081: super .reset(); // reset all recognizer state variables
082: // wack Lexer state variables
083: token = null;
084: type = Token.INVALID_TOKEN_TYPE;
085: channel = Token.DEFAULT_CHANNEL;
086: tokenStartCharIndex = -1;
087: tokenStartCharPositionInLine = -1;
088: tokenStartLine = -1;
089: text = null;
090: if (input != null) {
091: input.seek(0); // rewind the input
092: }
093: }
094:
095: /** Return a token from this source; i.e., match a token on the char
096: * stream.
097: */
098: public Token nextToken() {
099: while (true) {
100: token = null;
101: channel = Token.DEFAULT_CHANNEL;
102: tokenStartCharIndex = input.index();
103: tokenStartCharPositionInLine = input
104: .getCharPositionInLine();
105: tokenStartLine = input.getLine();
106: text = null;
107: if (input.LA(1) == CharStream.EOF) {
108: return Token.EOF_TOKEN;
109: }
110: try {
111: mTokens();
112: if (token == null) {
113: emit();
114: } else if (token == Token.SKIP_TOKEN) {
115: continue;
116: }
117: return token;
118: } catch (RecognitionException re) {
119: reportError(re);
120: recover(re);
121: }
122: }
123: }
124:
125: /** Instruct the lexer to skip creating a token for current lexer rule
126: * and look for another token. nextToken() knows to keep looking when
127: * a lexer rule finishes with token set to SKIP_TOKEN. Recall that
128: * if token==null at end of any token rule, it creates one for you
129: * and emits it.
130: */
131: public void skip() {
132: token = Token.SKIP_TOKEN;
133: }
134:
135: /** This is the lexer entry point that sets instance var 'token' */
136: public abstract void mTokens() throws RecognitionException;
137:
138: /** Set the char stream and reset the lexer */
139: public void setCharStream(CharStream input) {
140: this .input = null;
141: reset();
142: this .input = input;
143: }
144:
145: /** Currently does not support multiple emits per nextToken invocation
146: * for efficiency reasons. Subclass and override this method and
147: * nextToken (to push tokens into a list and pull from that list rather
148: * than a single variable as this implementation does).
149: */
150: public void emit(Token token) {
151: this .token = token;
152: }
153:
154: /** The standard method called to automatically emit a token at the
155: * outermost lexical rule. The token object should point into the
156: * char buffer start..stop. If there is a text override in 'text',
157: * use that to set the token's text. Override this method to emit
158: * custom Token objects.
159: */
160: public Token emit() {
161: Token t = new CommonToken(input, type, channel,
162: tokenStartCharIndex, getCharIndex() - 1);
163: t.setLine(tokenStartLine);
164: t.setText(text);
165: t.setCharPositionInLine(tokenStartCharPositionInLine);
166: emit(t);
167: return t;
168: }
169:
170: public void match(String s) throws MismatchedTokenException {
171: int i = 0;
172: while (i < s.length()) {
173: if (input.LA(1) != s.charAt(i)) {
174: if (backtracking > 0) {
175: failed = true;
176: return;
177: }
178: MismatchedTokenException mte = new MismatchedTokenException(
179: s.charAt(i), input);
180: recover(mte);
181: throw mte;
182: }
183: i++;
184: input.consume();
185: failed = false;
186: }
187: }
188:
189: public void matchAny() {
190: input.consume();
191: }
192:
193: public void match(int c) throws MismatchedTokenException {
194: if (input.LA(1) != c) {
195: if (backtracking > 0) {
196: failed = true;
197: return;
198: }
199: MismatchedTokenException mte = new MismatchedTokenException(
200: c, input);
201: recover(mte);
202: throw mte;
203: }
204: input.consume();
205: failed = false;
206: }
207:
208: public void matchRange(int a, int b)
209: throws MismatchedRangeException {
210: if (input.LA(1) < a || input.LA(1) > b) {
211: if (backtracking > 0) {
212: failed = true;
213: return;
214: }
215: MismatchedRangeException mre = new MismatchedRangeException(
216: a, b, input);
217: recover(mre);
218: throw mre;
219: }
220: input.consume();
221: failed = false;
222: }
223:
224: public int getLine() {
225: return input.getLine();
226: }
227:
228: public int getCharPositionInLine() {
229: return input.getCharPositionInLine();
230: }
231:
232: /** What is the index of the current character of lookahead? */
233: public int getCharIndex() {
234: return input.index();
235: }
236:
237: /** Return the text matched so far for the current token or any
238: * text override.
239: */
240: public String getText() {
241: if (text != null) {
242: return text;
243: }
244: return input.substring(tokenStartCharIndex, getCharIndex() - 1);
245: }
246:
247: /** Set the complete text of this token; it wipes any previous
248: * changes to the text.
249: */
250: public void setText(String text) {
251: this .text = text;
252: }
253:
254: public void reportError(RecognitionException e) {
255: /** TODO: not thought about recovery in lexer yet.
256: *
257: // if we've already reported an error and have not matched a token
258: // yet successfully, don't report any errors.
259: if ( errorRecovery ) {
260: //System.err.print("[SPURIOUS] ");
261: return;
262: }
263: errorRecovery = true;
264: */
265:
266: displayRecognitionError(this .getTokenNames(), e);
267: }
268:
269: public String getErrorMessage(RecognitionException e,
270: String[] tokenNames) {
271: String msg = null;
272: if (e instanceof MismatchedTokenException) {
273: MismatchedTokenException mte = (MismatchedTokenException) e;
274: msg = "mismatched character " + getCharErrorDisplay(e.c)
275: + " expecting "
276: + getCharErrorDisplay(mte.expecting);
277: } else if (e instanceof NoViableAltException) {
278: NoViableAltException nvae = (NoViableAltException) e;
279: // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
280: // and "(decision="+nvae.decisionNumber+") and
281: // "state "+nvae.stateNumber
282: msg = "no viable alternative at character "
283: + getCharErrorDisplay(e.c);
284: } else if (e instanceof EarlyExitException) {
285: EarlyExitException eee = (EarlyExitException) e;
286: // for development, can add "(decision="+eee.decisionNumber+")"
287: msg = "required (...)+ loop did not match anything at character "
288: + getCharErrorDisplay(e.c);
289: } else if (e instanceof MismatchedNotSetException) {
290: MismatchedNotSetException mse = (MismatchedNotSetException) e;
291: msg = "mismatched character " + getCharErrorDisplay(e.c)
292: + " expecting set " + mse.expecting;
293: } else if (e instanceof MismatchedSetException) {
294: MismatchedSetException mse = (MismatchedSetException) e;
295: msg = "mismatched character " + getCharErrorDisplay(e.c)
296: + " expecting set " + mse.expecting;
297: } else if (e instanceof MismatchedRangeException) {
298: MismatchedRangeException mre = (MismatchedRangeException) e;
299: msg = "mismatched character " + getCharErrorDisplay(e.c)
300: + " expecting set " + getCharErrorDisplay(mre.a)
301: + ".." + getCharErrorDisplay(mre.b);
302: } else {
303: msg = super .getErrorMessage(e, tokenNames);
304: }
305: return msg;
306: }
307:
308: public String getCharErrorDisplay(int c) {
309: String s = String.valueOf((char) c);
310: switch (c) {
311: case Token.EOF:
312: s = "<EOF>";
313: break;
314: case '\n':
315: s = "\\n";
316: break;
317: case '\t':
318: s = "\\t";
319: break;
320: case '\r':
321: s = "\\r";
322: break;
323: }
324: return "'" + s + "'";
325: }
326:
327: /** Lexers can normally match any char in it's vocabulary after matching
328: * a token, so do the easy thing and just kill a character and hope
329: * it all works out. You can instead use the rule invocation stack
330: * to do sophisticated error recovery if you are in a fragment rule.
331: */
332: public void recover(RecognitionException re) {
333: //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
334: //re.printStackTrace();
335: input.consume();
336: }
337:
338: public void traceIn(String ruleName, int ruleIndex) {
339: String inputSymbol = ((char) input.LT(1)) + " line="
340: + getLine() + ":" + getCharPositionInLine();
341: super .traceIn(ruleName, ruleIndex, inputSymbol);
342: }
343:
344: public void traceOut(String ruleName, int ruleIndex) {
345: String inputSymbol = ((char) input.LT(1)) + " line="
346: + getLine() + ":" + getCharPositionInLine();
347: super.traceOut(ruleName, ruleIndex, inputSymbol);
348: }
349: }
|