001: package fri.patterns.interpreter.parsergenerator.lexer;
002:
003: import java.util.*;
004: import java.io.IOException;
005: import fri.patterns.interpreter.parsergenerator.Lexer;
006: import fri.patterns.interpreter.parsergenerator.Token;
007: import fri.patterns.interpreter.parsergenerator.syntax.*;
008: import fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation;
009:
010: /**
011: Generates a Lexer from a Syntax. The Syntax can contain also parser rules.
012: These will be retrievable (without the removed lexer rules) after build by
013: calling <i>lexer.getParserSyntax()</i>.
014: <p>
015: The syntax rules may not contain '(', ')', '*', '+' or '?', but they may
016: contain character set symbols like ".." (set definitions) and "-" (intersections).
017: <p>
018: The syntax may contain identifiers enclosed within `backquotes`.
019: This marks predefined lexer rules defined in <i>StandardLexerRules</i>.
020: That class contains default rules for numbers, identifiers, stringdefinitions,
021: characterdefinitions and many other (e.g. for XML), which can be used to build
022: lexers.<br>
023: <b>CAUTION:</b> Lexer and parser rules have the same namespace, you can not define
024: <pre>
025: identifier ::= `identifier`; // wrong!
026: </pre>
027: Nevertheless you need not to care about the names silently imported from
028: <i>StandardLexerRules</i>, they will not reduce the parser syntax namespace,
029: only the toplevel rules will.
030: <p>
031: The syntax may contain (case-sensitive) these nonterminals:
032: <ul>
033: <li>token</li>
034: <li>ignored</li>
035: </ul>
036: These are lexer-reserved identifiers and can be used to mark top level lexer
037: rules (tokens). When <b>token</b> is used, the builder does not try to recognize any
038: rule as lexer rule, so this must be good modeled. Be careful: you can read away
039: comments only by using <b>ignored</b>. But you can define <b>ignored</b> without <b>token</b>,
040: then nevertheless the builder tries to recognize lexer rules.<br>
041: When the <b>token</b> marker is not used, the builder tries to separate lexer from
042: parser rules.
043: <p>
044: Example:
045: <pre>
046: token ::= `identifier` ; // using StandardLexerRules
047: ignored ::= `spaces` ;
048: ignored ::= `newline` ;
049: ignored ::= comment ;
050: comment ::= "//" char_minus_newline_list_opt ;
051: char_minus_newline ::= chars - newline;
052: char_minus_newline_list ::= char_minus_newline_list char_minus_newline;
053: char_minus_newline_list ::= char_minus_newline ;
054: char_minus_newline_list_opt ::= char_minus_newline_list;
055: char_minus_newline_list_opt ::= ; // nothing
056: </pre>
057: Mind that the builder input can not be a text file, it must be wrapped into <i>Syntax</i>.
058: Use syntax builder to convert a text into a <i>Syntax</i> object.
059: <p>
060: Java code fragment:
061: <pre>
062: SyntaxSeparation separation = new SyntaxSeparation(new Syntax(myRules));
063: LexerBuilder builder = new LexerBuilder(separation.getLexerSyntax(), separation.getIgnoredSymbols());
064: Lexer lexer = builder.getLexer();
065: // when using the lexer standalone (without Parser), you must put the token terminal symbols into it now:
066: lexer.setTerminals(separation.getTokenSymbols());
067: </pre>
068:
069: @see fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation
070: @see fri.patterns.interpreter.parsergenerator.lexer.StandardLexerRules
071: @author (c) 2002, Fritz Ritzberger
072: */
073:
074: public class LexerBuilder {
075: protected Map charConsumers;
076: protected List ignoredSymbols;
077: public static boolean DEBUG; // defaults to false
078:
079: /**
080: Creates a LexerBuilder (from lexer rules) that provides a Lexer.
081: @param lexerSyntax lexer rule (without token and ignored, use SyntaxSeparation for that)
082: @param ignoredSymbols list of ignored symbols, NOT enclosed in backquotes!
083: */
084: public LexerBuilder(Syntax lexerSyntax, List ignoredSymbols)
085: throws LexerException, SyntaxException {
086: this .ignoredSymbols = ignoredSymbols;
087: build(lexerSyntax);
088: }
089:
090: /** Returns the built Lexer. */
091: public Lexer getLexer() {
092: return new LexerImpl(ignoredSymbols, charConsumers);
093: }
094:
095: /** Returns the built Lexer, loaded with passed input (file, stream, string, ...). */
096: public Lexer getLexer(Object input) throws IOException {
097: Lexer lexer = getLexer();
098: lexer.setInput(input);
099: return lexer;
100: }
101:
102: private void build(Syntax lexerSyntax) throws LexerException,
103: SyntaxException {
104: SyntaxSeparation.IntArray deleteIndexes = new SyntaxSeparation.IntArray(
105: lexerSyntax.size());
106: if (DEBUG)
107: System.err.println("Processing lexer rules: \n"
108: + lexerSyntax);
109:
110: // resolve scanner rules to Consumers and put it into a hashtable
111: this .charConsumers = new Hashtable(lexerSyntax.size());
112: for (int i = 0; i < lexerSyntax.size(); i++)
113: translateLexerRule(lexerSyntax.getRule(i), i, deleteIndexes);
114: deleteIndexes.removeIndexesFrom(lexerSyntax);
115:
116: // check for unresolved repeatable and nullable rules and delete them from lexer syntax
117: for (int i = 0; i < lexerSyntax.size(); i++) {
118: Rule rule = lexerSyntax.getRule(i);
119: String nonterm = rule.getNonterminal();
120: if (checkNullableRule(nonterm, rule, i, deleteIndexes) == false)
121: if (checkRepeatableRule(nonterm, rule, i, deleteIndexes) == false)
122: throw new LexerException(
123: "Found no character consumer for nullable or repeatable rule "
124: + rule);
125: }
126: deleteIndexes.removeIndexesFrom(lexerSyntax);
127:
128: if (lexerSyntax.size() > 0) { // not all rules have been resolved to character consumers
129: throw new LexerException(
130: "Could not process rules in lexer syntax: "
131: + lexerSyntax);
132: }
133:
134: // resolve all symbolic consumer references after all consumers have been created
135: Map done = new Hashtable(); // beware of recursion
136: for (Iterator it = charConsumers.entrySet().iterator(); it
137: .hasNext();) {
138: Consumer cc = (Consumer) ((Map.Entry) it.next()).getValue();
139: cc.resolveConsumerReferences(charConsumers, done);
140: }
141: }
142:
143: private void translateLexerRule(Rule rule, int index,
144: SyntaxSeparation.IntArray deleteIndexes)
145: throws LexerException {
146: String nonterm = rule.getNonterminal();
147: if (rule.rightSize() <= 0
148: || rule.getRightSymbol(0).equals(nonterm)) // nullable rules and left recursive rules will be resolved later
149: return;
150:
151: //System.err.println("translating lexer rule: "+rule);
152:
153: // ExtendedGrammar should have resolved all parenthesis expressions and wildcards.
154: // We take away rules that are:
155: // - single character position definitions like
156: // nonterm ::= '0' .. '9'
157: // nonterm ::= 'a' .. 'z' - 'm' .. 'n'
158: // nonterm ::= something - "string" // "something" must be among scanner rules
159: // - single string terminal definitions like
160: // nonterm ::= "string"
161: // nonterm ::= 'c' 'd' 'e' "fgh" // do concatenation
162: // - scanner nonterminals concatenations like
163: // nonterm ::= something1 something2 // "somethingN" must be among scanner rules
164: // - scanner nonterminals concatenations like
165: // nonterm ::= "string"
166:
167: int CONCATENATION = 0, SET = 1, SUBTRACTION = 2;
168: int state = CONCATENATION;
169: boolean intersectionHappened = false;
170: Consumer consumer = new Consumer(rule); // master consumer
171: Consumer currentConsumer = new Consumer();
172: Consumer setConsumer = currentConsumer; // will host set definitions
173: consumer.append(currentConsumer); // will be resolved when trivial
174:
175: for (int i = 0; i < rule.rightSize(); i++) { // loop all symbols on right side
176: String sym = rule.getRightSymbol(i);
177:
178: if (sym.equals(Token.BUTNOT)) {
179: if (i == 0 || state != CONCATENATION)
180: throw new LexerException(
181: "Missing symbol to subtract from: " + rule);
182: state = SUBTRACTION;
183: } else if (sym.equals(Token.UPTO)) {
184: if (i == 0 || state != CONCATENATION)
185: throw new LexerException(
186: "Missing lower limit of set: " + rule);
187: state = SET;
188: } else {
189: String convertedSym = convertSymbol(sym); // remove quotes or convert number to char
190: boolean isNonterm = convertedSym.equals(sym);
191: if (isNonterm && state == SET)
192: throw new LexerException(
193: "Can not append nonterminal to set: "
194: + rule);
195:
196: boolean setWillHappen = rule.rightSize() > i + 1
197: && rule.getRightSymbol(i + 1)
198: .equals(Token.UPTO); // next symbol will be ".."
199:
200: if (state == SET) {
201: setConsumer.appendSet(convertedSym);
202: setConsumer = currentConsumer; // reset if intersection happened
203: } else if (state == SUBTRACTION) {
204: intersectionHappened = true;
205: if (isNonterm)
206: if (setWillHappen)
207: throw new LexerException(
208: "Nonterminal can not open set after subtraction: "
209: + rule);
210: else
211: currentConsumer
212: .subtract(new Consumer.Reference(
213: sym));
214: else if (setWillHappen)
215: currentConsumer
216: .subtract(setConsumer = new Consumer(
217: convertedSym));
218: else
219: currentConsumer.subtract(new Consumer(
220: convertedSym));
221: } else if (state == CONCATENATION) {
222: if (intersectionHappened) { // start new consumer
223: intersectionHappened = false;
224: currentConsumer = new Consumer();
225: consumer.append(currentConsumer);
226: }
227:
228: if (isNonterm)
229: if (setWillHappen)
230: throw new LexerException(
231: "Nonterminal can not open set in concatenation: "
232: + rule);
233: else
234: currentConsumer
235: .append(new Consumer.Reference(sym));
236: else
237: currentConsumer.append(convertedSym); // a following set will be recognized by consumer
238: }
239:
240: state = CONCATENATION; // reset to normal state
241:
242: } // end switch current symbol
243: } // end for right side of rule
244:
245: putCharConsumer(nonterm, consumer.optimize());
246: deleteIndexes.add(index);
247: }
248:
249: private void putCharConsumer(String key, Consumer consumer) {
250: //System.err.println("putting character consumer for "+key);
251: Object o = charConsumers.get(key); // test if existing
252:
253: if (o == null) { // not in list
254: charConsumers.put(key, consumer);
255: } else {
256: ConsumerAlternatives ca;
257:
258: if (o instanceof ConsumerAlternatives == false) {
259: ca = new ConsumerAlternatives((Consumer) o);
260: charConsumers.put(key, ca); // replace consumer
261: } else {
262: ca = (ConsumerAlternatives) o;
263: }
264:
265: ca.addAlternate(consumer); // add a new alternative
266: }
267: }
268:
269: private boolean checkNullableRule(String nonterm, Rule rule,
270: int index, SyntaxSeparation.IntArray deleteIndexes) {
271: // We take away rules that are optional nonterminals like
272: // nonterm ::= something // "nonterm" is already among scanner rules
273: // nonterm ::= /*nothing*/ // this is the rule to remove now
274:
275: if (rule.rightSize() <= 0) {
276: Object o = charConsumers.get(nonterm);
277: ((Consumer) o).setNullable();
278: deleteIndexes.add(index);
279: return true; // do not explore empty rule, return "found nullable"
280: }
281: return false;
282: }
283:
284: private boolean checkRepeatableRule(String nonterm, Rule rule,
285: int index, SyntaxSeparation.IntArray deleteIndexes) {
286: // We take away rules that are left recursive like
287: // nonterm ::= nonterm something // this is the rule to remove now
288: // nonterm ::= something // "nonterm" must be already among scanner rules
289:
290: // check for nonterm in hashtable, set it repeatable if found
291: if (rule.rightSize() >= 2
292: && rule.getRightSymbol(0).equals(nonterm)) { // left recursive
293: Consumer cc = (Consumer) charConsumers.get(nonterm);
294: if (cc.matchesRepeatableRule(rule)) { // check if rest on the right is same
295: cc.setRepeatable();
296: deleteIndexes.add(index);
297: return true;
298: }
299: }
300: return false;
301: }
302:
303: /**
304: Converts a character or string definition to its processable form.
305: This implementation must be according to "bnf_chardef" in <i>StandardLexerRules</i>.
306: <ul>
307: <li>0xFFFF hexadecimal, convert to character</li>
308: <li>0777 octal, convert to character</li>
309: <li>12345 decimal, convert to character</li>
310: <li>'c' character, remove quotes</li>
311: <li>'\n', "\n" escaped character, decode</li>
312: <li>"string" string, remove quotes</li>
313: <li>nonterminal - will stay unchanged</li>
314: </ul>
315: */
316: private String convertSymbol(String sym) {
317: if (sym.charAt(0) == '\'' || sym.charAt(0) == '"') {
318: String s = sym.substring(1, sym.length() - 1);
319: if (s.length() <= 0)
320: throw new IllegalArgumentException(
321: "Empty character or string definition: " + sym);
322:
323: StringBuffer sb = new StringBuffer(s.length()); // convert escape sequences to their real meaning
324: for (int i = 0; i < s.length(); i++) {
325: char c = s.charAt(i);
326: if (c == '\\') {
327: char c1 = s.length() > i + 1 ? s.charAt(i + 1) : 0;
328: switch (c1) {
329: case 'n':
330: sb.append('\n');
331: i++;
332: break;
333: case 'r':
334: sb.append('\r');
335: i++;
336: break;
337: case 't':
338: sb.append('\t');
339: i++;
340: break;
341: case 'f':
342: sb.append('\f');
343: i++;
344: break;
345: case 'b':
346: sb.append('\b');
347: i++;
348: break;
349: case '\'':
350: sb.append('\'');
351: i++;
352: break;
353: case '"':
354: sb.append('"');
355: i++;
356: break;
357: case '\\':
358: sb.append('\\');
359: i++;
360: break;
361: default:
362: sb.append(c);
363: break;
364: }
365: } else {
366: sb.append(c);
367: }
368: }
369: return sb.toString();
370: } else { // must be starting with digit or be a nonterminal
371: char c;
372: if (sym.startsWith("0x") || sym.startsWith("0X")) // hexadecimal number
373: c = (char) Integer.valueOf(sym.substring(2), 16)
374: .intValue();
375: else if (sym.startsWith("0")) // octal number
376: c = (char) Integer.valueOf(sym.substring(1), 8)
377: .intValue();
378: else if (Character.isDigit(sym.charAt(0)))
379: c = (char) Integer.valueOf(sym).intValue(); // will throw NumberFormatException when not number
380: else
381: return sym; // is a nonterminal
382:
383: return new String(new char[] { c });
384: }
385: }
386:
387: }
|