package fri.patterns.interpreter.parsergenerator.lexer;

import java.util.*;
import java.io.*;
import fri.patterns.interpreter.parsergenerator.Lexer;
import fri.patterns.interpreter.parsergenerator.Token;

/**
    This Lexer must be created using LexerBuilder. It knows token and ignored terminals.
    To get this Lexer working, <i>setTerminals()</i> must be called at least once.
    When using the Lexer standalone, the client must make that call itself; otherwise
    the Parser will call it.
    <p>
    This Lexer can be reused, but it can not be loaded with another syntax once it
    has been built for one.

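    <p>
    A minimal standalone usage sketch. The Lexer instance and the token symbol list are
    assumed to come from LexerBuilder and its syntax separation (not shown here), and the
    input text is made up:
    <pre>
        Lexer lexer = ...;    // built by LexerBuilder for some syntax
        lexer.setTerminals(tokenSymbols);    // token symbols taken from syntax separation
        lexer.setInput("text to scan");
        Token token = lexer.getNextToken(null);    // null: no Parser drives this Lexer
        while (token.symbol != null && token.symbol.equals(Token.EPSILON) == false) {
            // process token.symbol and token.text here
            token = lexer.getNextToken(null);
        }
    </pre>
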
    @author (c) 2002, Fritz Ritzberger
*/

public class LexerImpl implements Lexer, StrategyFactoryMethod, Serializable {
    protected Strategy strategy;
    private List ignoredSymbols;
    private Map charConsumers;
    private transient InputText input;
    private List listeners;
    private transient boolean debug;

    /**
        Creates a Lexer from a list of ignored symbols and a map of character consumers (both built by LexerBuilder).
        @param ignoredSymbols list of Strings containing the symbols to scan but ignore. These are NOT enclosed in `backquotes` like tokens.
        @param charConsumers map with key = nonterminal (String) and value = Consumer.
    */
    public LexerImpl(List ignoredSymbols, Map charConsumers) {
        setConsumers(ignoredSymbols, charConsumers);
    }

    /** Do-nothing constructor for subclasses (currently unused). */
    protected LexerImpl() {
    }

    /** Implements Lexer. Adds the passed token listener to the listener list. */
    public void addTokenListener(Lexer.TokenListener tokenListener) {
        if (listeners == null)
            listeners = new ArrayList(1);
        listeners.add(tokenListener);
    }

    /** Implements Lexer. Removes the passed token listener from the listener list. */
    public void removeTokenListener(Lexer.TokenListener tokenListener) {
        if (listeners != null)
            listeners.remove(tokenListener);
    }

    private void setConsumers(List ignoredSymbols, Map charConsumers) {
        this.charConsumers = charConsumers;    // stored for the check in setTerminals()
        this.ignoredSymbols = ignoredSymbols;    // need to know which tokens should be ignored

        // ignored symbols will not be passed to the parser
        for (int i = 0; ignoredSymbols != null && i < ignoredSymbols.size(); i++) {
            String sym = (String) ignoredSymbols.get(i);
            Consumer cc = (Consumer) charConsumers.get(sym);
            ensureStrategy().addIgnoringConsumer(sym, cc);
        }

        // propagate this LexerImpl as StrategyFactoryMethod to ConsumerAlternatives
        for (Iterator it = charConsumers.entrySet().iterator(); it.hasNext(); ) {
            Consumer c = (Consumer) ((Map.Entry) it.next()).getValue();
            if (c instanceof ConsumerAlternatives) {
                ((ConsumerAlternatives) c).setStrategyFactoryMethod(this);
            }
        }
    }

    private Strategy ensureStrategy() {
        if (strategy == null)
            strategy = newStrategy();
        return strategy;
    }

    /** Implements StrategyFactoryMethod. Can be overridden to create a derived Strategy implementation. */
    public Strategy newStrategy() {
        return new Strategy();
    }

    /**
        When false, the sort order (significance) of scan items without a fixed start character decides which token is returned.
        When true (default), the scan item (without a fixed start character) that scans longest wins.
    */
    public void setCompeteForLongestInput(boolean competeForLongestInput) {
        ensureStrategy().setCompeteForLongestInput(competeForLongestInput);
    }

    // implementing Lexer

    /**
        Implements Lexer: sets the input to be scanned. If text is an InputStream, no Reader
        will be used (characters will not be converted).
        @param text text to scan, as String, StringBuffer, File, InputStream or Reader.
    */
    public void setInput(Object text) throws IOException {
        input = new InputText(text);
    }

    /**
        Implements Lexer: called by the Parser to pass in all token symbols (enclosed in `backquotes`) and literals (enclosed in "quotes").
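        <p>
        A format sketch (the Lexer instance <i>lexer</i>, the literal "xyz" and the lexer token
        `identifier` are made-up examples; a lexer token must correspond to a character consumer
        built by LexerBuilder):
        <pre>
            List terminals = new ArrayList();
            terminals.add("\"xyz\"");         // a literal, enclosed in quotes
            terminals.add("`identifier`");    // a lexer token, enclosed in backquotes
            lexer.setTerminals(terminals);
        </pre>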
        @param terminals List of Strings containing "literals" and `lexertokens`.
    */
    public void setTerminals(List terminals) {
        for (int i = 0; i < terminals.size(); i++) {
            String symbol = (String) terminals.get(i);

            // check that it really is a terminal, as this is a public call
            if (symbol.length() <= 2 || Token.isTerminal(symbol) == false)
                throw new IllegalArgumentException("Terminals must be enclosed within quotes: " + symbol);

            String text = symbol.substring(1, symbol.length() - 1);    // remove enclosing quotes

            if (ensureStrategy().hasTerminal(symbol) == false) {    // could have been called a second time
                if (symbol.charAt(0) == Token.COMMAND_QUOTE) {    // a scan terminal covered by a Consumer
                    Consumer cc = (Consumer) charConsumers.get(text);
                    if (cc == null)
                        throw new IllegalArgumentException("Lexer token is not among character consumers: " + text);
                    else
                        ensureStrategy().addTokenConsumer(symbol, cc);
                }
                else {
                    ensureStrategy().addTokenConsumer(symbol, new Consumer(text));
                }
            }
        }    // end for

        if (debug)
            System.err.println("StrategyList is:\n" + strategy);
    }

    /** Implements Lexer: Does nothing as no states are stored. This Lexer can not be loaded with new syntaxes. */
    public void clear() {
    }

    /**
        This is an optional functionality of Lexer. It is <b>NOT</b> called by the Parser.
        It can be used for heuristic reading from an input (not knowing if there is more input
        after the token was read).
        <p />
        The passed LexerSemantic will receive every matched rule (top-down) together with
        its ResultTree. See <i>lex()</i> for details.

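        <p />
        A call sketch (the Lexer construction, the Reader and the LexerSemantic implementation
        are assumed and not shown here):
        <pre>
            lexer.setInput(someReader);
            Token token = lexer.getNextToken(myLexerSemantic);    // the semantic receives matched rules top-down
        </pre>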
        @param lexerSemantic the LexerSemantic to be called with every evaluated Rule and its lexing ResultTree.
        @return a Token with a terminal symbol and its instance text, or a Token with null symbol for error.
    */
    public Token getNextToken(LexerSemantic lexerSemantic) throws IOException {
        return getNextToken(lexerSemantic, null);
    }

    /**
        Implements Lexer: returns the next token from input, or EPSILON when there is no more input.
        This is called by the Parser to get the next syntax token from input.
        When the returned <i>token.symbol</i> is null, no input could be recognized (ERROR).
        @param expectedTokenSymbols contains the expected String token symbols (as keys);
            can be null when no Parser drives this Lexer.
        @return a Token with a terminal symbol and its instance text, or a Token with null symbol for error.
    */
    public Token getNextToken(Map expectedTokenSymbols) throws IOException {
        return getNextToken(null, expectedTokenSymbols);
    }

    private Token getNextToken(LexerSemantic lexerSemantic, Map expectedTokenSymbols) throws IOException {
        if (input == null)
            throw new IllegalStateException("Lexer has no input, call setInput(...).");

        Token.Address start = new Token.Address(input.getScanLine(), input.getScanColumn(), input.getScanOffset());
        int c = input.peek();    // read lookahead
        if (c == Input.EOF)
            return createToken(Token.EPSILON, null, new Token.Range(start, start));

        // not EOF, there must be a lexer item or error
        Strategy.Item item = getNextLexerItem(expectedTokenSymbols, c);

        if (item != null) {    // successful scan
            if (ignoredSymbols != null && ignoredSymbols.indexOf(item.getSymbol()) >= 0) {
                if (listeners != null && listeners.size() > 0)    // creating a token takes time, do it only when listeners are present
                    fireTokenReceived(createToken(item.getTokenIdentifier(), item.getResultTree(), lexerSemantic), true);
                return getNextToken(expectedTokenSymbols);
            }
            else {
                Token token = createToken(item.getTokenIdentifier(), item.getResultTree(), lexerSemantic);
                fireTokenReceived(token, false);
                return token;
            }
        }

        // error state, return an error Token with null symbol
        Token.Address end = new Token.Address(input.getReadLine(), input.getReadColumn(), input.getScanOffset());
        return createToken(null, input.getUnreadText(), new Token.Range(start, end));
    }

    // strategic scan of the next item
    private Strategy.Item getNextLexerItem(Map expectedTokenSymbols, int lookahead) throws IOException {
        if (strategy == null)
            throw new IllegalStateException("Lexer has no terminals, call setTerminals(syntaxSeparation.getTokenSymbols()).");

        Strategy.Item item = strategy.consume(input, lookahead, expectedTokenSymbols);

        if (item != null)
            input.resolveBuffer();    // forget old contents

        return item;
    }

    // calls the token listeners with the scanned token
    private void fireTokenReceived(Token token, boolean ignored) {
        for (int i = 0; listeners != null && i < listeners.size(); i++)
            ((Lexer.TokenListener) listeners.get(i)).tokenReceived(token, ignored);
    }

    /** Token factory method. Can be overridden to access the lexing ResultTree. Delegates to createToken(tokenIdentifier, text, range). */
    protected Token createToken(String tokenIdentifier, ResultTree result, LexerSemantic lexerSemantic) {
        if (lexerSemantic != null)
            loopResultTree(result, lexerSemantic);
        return createToken(tokenIdentifier, result.toString(), result.getRange());    // toString() takes time as it builds the token text
    }

    /** Token factory method. Can be overridden to convert token.text to some Java object. */
    protected Token createToken(String tokenIdentifier, String text, Token.Range range) {
        return new Token(tokenIdentifier, text, range);
    }

    /**
        This is an optional functionality of Lexer. It is <b>NOT</b> called by the Parser.
        It can be used to run a standalone Lexer with a LexerSemantic, processing a ready-scanned
        syntax tree. Unlike with a Parser Semantic, no value stack is available to the LexerSemantic,
        and all input will have been read by the time the LexerSemantic is called with the built syntax tree.
        <p />
        The passed LexerSemantic will receive every matched rule (top-down) together with
        its resulting ResultTree, containing the range within the input.
        A ResultTree can be converted to text by calling <i>resultTree.toString()</i>.
        <p />
        This method evaluates the input up to end-of-input, like a parser; that means it returns
        false if the input was either syntactically incorrect or EOF was not reached when all rules
        had been evaluated.
        <p />
        <b>MIND:</b> This method does not call any TokenListener, as the LexerSemantic is expected to
        dispatch the results itself!

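        <p />
        A usage sketch (the Lexer construction, the token symbol list, the input file name and the
        LexerSemantic implementation are assumed and not shown here):
        <pre>
            lexer.setTerminals(tokenSymbols);    // token symbols from syntax separation
            lexer.setInput(new File("input.txt"));
            boolean ok = lexer.lex(myLexerSemantic);    // the semantic receives every evaluated rule
        </pre>
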
        @param lexerSemantic the LexerSemantic to be called with every evaluated Rule and its lexing ResultTree.
        @return true when the lexer succeeded (input was syntactically ok), else false.
    */
    public boolean lex(LexerSemantic lexerSemantic) throws IOException {
        int c = input.peek();
        boolean eof = (c == Input.EOF);
        boolean error = eof;

        if (error == false) {
            Strategy.Item item = getNextLexerItem(null, c);
            error = (item == null || item.getTokenIdentifier() == null);

            if (error == false && lexerSemantic != null)
                loopResultTree(item.getResultTree(), lexerSemantic);

            c = input.peek();
            eof = (c == Input.EOF);
            error = (eof == false);
        }

        if (error) {
            dump(System.err);
            System.err.println("Could not process character '" + (char) c + "' (int " + c + "), at line/column "
                    + input.getScanLine() + "/" + input.getScanColumn() + ", at offset " + input.getScanOffset());
        }

        return error == false;
    }

    /**
        After top-down lexing, this method is called to dispatch all results. Can be overridden to change the dispatch logic.
        This method calls itself recursively with all result tree children. Nonterminals starting with "_" are ignored
        by default, as this marks artificial rules.
        @param result lexer result, returns text on getText().
        @param lexerSemantic semantic that dispatches the lexer results.
    */
    protected void loopResultTree(ResultTree result, LexerSemantic lexerSemantic) {
        Set wantedNonterminals = lexerSemantic.getWantedNonterminals();
        Set ignoredNonterminals = lexerSemantic.getIgnoredNonterminals();
        String nonterminal = result.getRule().getNonterminal();

        if (nonterminal.startsWith(Token.ARTIFICIAL_NONTERMINAL_START_CHARACTER) == false
                && (wantedNonterminals == null || wantedNonterminals.contains(nonterminal))
                && (ignoredNonterminals == null || ignoredNonterminals.contains(nonterminal) == false)) {
            lexerSemantic.ruleEvaluated(result.getRule(), result);
        }

        for (int i = 0; i < result.getChildCount(); i++) {
            Object child = result.getChild(i);
            if (child instanceof ResultTree)
                loopResultTree((ResultTree) child, lexerSemantic);
        }
    }

    // debug methods

    /** Implements Lexer: Set debug on to output information about scanned tokens. */
    public void setDebug(boolean debug) {
        this.debug = debug;
    }

    /** Returns the current line, as far as read. */
    public String getLineText() {
        return input.getLine();
    }

    /** Returns the number of the current line, 1-n. */
    public int getLine() {
        return input.getReadLine();
    }

    /** Returns the position within the current line, 0-n. */
    public int getColumn() {
        return input.getReadColumn();
    }

    /** Returns the offset read so far from input. This is an absolute offset, including newlines. */
    public int getOffset() {
        return input.getScanOffset();
    }

    /** Outputs current and previous line, with line numbers. Call this on ERROR. */
    public void dump(PrintStream out) {
        int lineNr = input.getReadLine();
        String line = getLineText();

        if (lineNr > 1) {
            String prevLine = input.getPreviousLine();
            out.print((lineNr - 1) + ":\t");
            out.println(prevLine);
        }

        out.print(lineNr + ":\t");
        out.println(line);

        int nrLen = Integer.toString(lineNr).length();
        for (int i = 0; i < nrLen; i++)
            out.print(" ");

        out.print("\t");

        int errPos = input.getReadColumn();

        for (int i = 0; i < errPos && i < line.length(); i++)
            if (line.charAt(i) == '\t')
                out.print("\t");
            else
                out.print(" ");

        out.println("^");
    }

}
|