001: package fri.patterns.interpreter.parsergenerator.syntax.builder;
002:
003: import java.util.*;
004: import fri.patterns.interpreter.parsergenerator.Token;
005: import fri.patterns.interpreter.parsergenerator.syntax.*;
006: import fri.patterns.interpreter.parsergenerator.lexer.StandardLexerRules;
007:
008: /**
009: Separates lexer from parser rules. Provides token and ignored symbols
010: after construction, which can be used for LexerBuilder construction.
011: <p>
012: Example:
013: <pre>
014: SyntaxSeparation separation = new SyntaxSeparation(new Syntax(myRules));
015: LexerBuilder builder = new LexerBuilder(separation.getLexerSyntax(), separation.getIgnoredSymbols());
016: Lexer lexer = builder.getLexer();
017: // when using the lexer standalone, you must put the token terminal symbols into it now:
018: lexer.setTerminals(separation.getTokenSymbols());
019: // when using a Parser the Parser will do this:
020: </pre>
021:
022: @see fri.patterns.interpreter.parsergenerator.lexer.LexerBuilder
023: @author (c) 2002, Fritz Ritzberger
024: */
025:
public class SyntaxSeparation {
    // Top-level token symbols for the lexer; after separate() ran these ARE
    // enclosed in `backquotes` (see getTokenSymbols()).
    private List tokenSymbols;
    // Top-level ignored symbols for the lexer; NOT enclosed in backquotes
    // (see getIgnoredSymbols()).
    private List ignoredSymbols;
    // Rules remaining for the parser after all lexer rules were taken out.
    private Syntax parserSyntax;
    // Rules moved to the lexer: everything reachable from token/ignored symbols.
    private Syntax lexerSyntax;
    // When true, separation statistics are printed to stderr.
    // NOTE(review): public mutable static — callers can toggle it globally.
    public static boolean DEBUG = true;

    /** Splits a syntax into two syntaxes containing parser and lexer rules, tokens and ignored get collected. */
    public SyntaxSeparation(Syntax syntax) throws SyntaxException {
        // The IntArray is a reusable scratch buffer sized to the rule count;
        // separate() fills it with indexes of rules to delete from the syntax.
        separate(syntax, new IntArray(syntax.size()));
    }
037:
038: /** Returns the lexer syntax that remains after removing all parser rules and token/ignored directives. */
039: public Syntax getLexerSyntax() {
040: return lexerSyntax;
041: }
042:
043: /** Returns the parser syntax that remains after removing all lexer rules. All lexer tokens are marked with `backquotes`. */
044: public Syntax getParserSyntax() {
045: return parserSyntax;
046: }
047:
048: /**
049: Returns the top level token symbols for the lexer. These symbols ARE enclosed within `backquotes`.
050: This can be used at <i>setTerminals</i> call for a standalone Lexer.
051: */
052: public List getTokenSymbols() {
053: return tokenSymbols;
054: }
055:
056: /**
057: Returns the top level ignored token symbols for the lexer. These symbols are NOT enclosed within `backquotes`.
058: This will be used by the LexerBuilder for feeding the LexerImpl with ignored symbols.
059: */
060: public List getIgnoredSymbols() {
061: return ignoredSymbols;
062: }
063:
/**
 * Core worker called from the constructor: splits the passed syntax into
 * lexer and parser parts, in place. Consumes the "token"/"ignored" directive
 * rules, collects `backquoted` command symbols, moves every rule reachable
 * from a token or ignored symbol into the lexer syntax, then delegates the
 * final parser-syntax preparation to provideParserSyntax().
 * @param syntax the mixed input syntax; rules are removed from it in place
 * @param deleteIndexes scratch buffer receiving indexes of rules to delete
 * @throws SyntaxException on malformed or contradictory token/ignored definitions
 */
private void separate(Syntax syntax, IntArray deleteIndexes)
        throws SyntaxException {
    // NOTE(review): these locals shadow the member lists of the same names;
    // the members are (re)assigned near the end of this method.
    // The Hashtables are used as sets (key == value).
    Hashtable tokenSymbols = new Hashtable();
    Hashtable ignoredSymbols = new Hashtable();
    List commandsDefinedAsTOKEN = new ArrayList();

    // take away all rules defined by "token" and "ignored"
    for (int i = 0; i < syntax.size(); i++) {
        Rule rule = syntax.getRule(i);
        String nonterm = rule.getNonterminal();

        boolean token = nonterm.equals(Token.TOKEN);
        boolean ignored = token == false
                && nonterm.equals(Token.IGNORED);

        if (token || ignored) { // special token definition rule
            if (rule.rightSize() != 1) // token/ignored must not contain more than one item on right side (alternatives being resolved)
                throw new SyntaxException(
                        "\"token\" and \"ignored\" are predefined lexer keywords and must contain exactly one nonterminal symbol after resolve: "
                                + rule);

            deleteIndexes.add(i); // the directive rule itself is removed from the syntax

            String sym = rule.getRightSymbol(0); // the token identifier
            if (sym.charAt(0) == Token.COMMAND_QUOTE) {
                sym = sym.substring(1, sym.length() - 1); // take off quotes
                if (token)
                    // remember explicitly declared commands: they must survive
                    // the lexer-only cleanup in provideParserSyntax()
                    commandsDefinedAsTOKEN.add(sym);
            }

            if (token)
                tokenSymbols.put(sym, sym);
            else
                ignoredSymbols.put(sym, sym);
        }
    }
    deleteIndexes.removeIndexesFrom(syntax);

    // check if tokens are defined as ignored or vice versa
    for (Iterator it = tokenSymbols.keySet().iterator(); it
            .hasNext();) {
        Object o = it.next();
        if (ignoredSymbols.get(o) != null)
            throw new SyntaxException(
                    "Can not define token as ignored: " + o);
    }
    for (Iterator it = ignoredSymbols.keySet().iterator(); it
            .hasNext();) {
        Object o = it.next();
        if (tokenSymbols.get(o) != null)
            throw new SyntaxException(
                    "Can not define ignored as token: " + o);
    }

    boolean tokensWereDeclared = tokenSymbols.size() > 0;
    List commandTokens = new ArrayList();

    // collect all `lexertokens`.
    for (int i = 0; i < syntax.size(); i++) {
        Rule rule = syntax.getRule(i);

        for (int j = 0; j < rule.rightSize(); j++) { // loop right side of rule
            String sym = rule.getRightSymbol(j);

            if (sym.charAt(0) == Token.COMMAND_QUOTE) {
                sym = sym.substring(1, sym.length() - 1); // remove command quotes
                tokenSymbols.put(sym, sym);
                commandTokens.add(sym);
                rule.setRightSymbol(sym, j); // remove the quotes for being able to recognize the symbol later
            } else
            // get out all rules containing ".." and "-" when no "token" was defined explicitely, as these must be scanner rules
            if (tokensWereDeclared == false
                    && (sym.equals(Token.BUTNOT) || sym
                            .equals(Token.UPTO))) {
                String s = rule.getNonterminal(); // take this rule to lexer rules
                tokenSymbols.put(s, s);
            }
        }
    }

    // Get out all scanner rules from parser syntax, recursively
    this.lexerSyntax = new Syntax(tokenSymbols.size()
            + ignoredSymbols.size());
    // process token symbols first, then ignored symbols, with the same logic
    Hashtable[] varr = new Hashtable[] { tokenSymbols,
            ignoredSymbols };

    for (int j = 0; j < varr.length; j++) { // loop token and ignored lists
        Hashtable symbols = varr[j];

        for (Enumeration e = symbols.keys(); e.hasMoreElements();) {
            String nonterm = (String) e.nextElement();

            // moves the rule(s) for nonterm and all their sub-rules into
            // lexerSyntax, recording their indexes in deleteIndexes
            getRulesUnderSymbol(nonterm, syntax, lexerSyntax,
                    deleteIndexes);

            if (deleteIndexes.isEmpty()
                    && lexerSyntax.hasRule(nonterm) == false) { // if there were no rules for symbol
                String[][] predefinedRules = StandardLexerRules
                        .rulesForIdentifier(nonterm); // look for predefined rule
                if (predefinedRules == null
                        || predefinedRules.length <= 0)
                    throw new SyntaxException(
                            "Found nonterminal that has no rule and is no predefined lexer nonterminal: >"
                                    + nonterm + "<");

                lexerSyntax.appendRules(SyntaxUtil
                        .ruleArrayToList(predefinedRules));
            }

            // delete the moved rules from the (parser) syntax
            deleteIndexes.removeIndexesFrom(syntax);
        }
    }

    // add ignoredSymbols to member variable lists
    this.ignoredSymbols = new ArrayList(ignoredSymbols.size());
    for (Enumeration e = ignoredSymbols.keys(); e.hasMoreElements();)
        this.ignoredSymbols.add(e.nextElement());

    // what remains in `syntax` are the parser rules; this also fills
    // the member tokenSymbols list
    this.parserSyntax = provideParserSyntax(syntax, lexerSyntax,
            tokensWereDeclared, tokenSymbols, commandTokens,
            commandsDefinedAsTOKEN);

    //System.err.println("Parser syntax:\n"+syntax);
    //System.err.println("Lexer syntax:\n"+lexerSyntax);
    //System.err.println("token symbols: "+tokenSymbols);
    //System.err.println("ignored symbols: "+ignoredSymbols);
}
191:
/**
 * Prepares the parser syntax after the lexer rules were extracted and fills
 * the member tokenSymbols list. Three cases are handled:
 * mixed grammar (parser rules remain) — every lexer token on a rule's right
 * side gets masked with `backquotes`; lexer-only grammar without declared
 * tokens — the lexer start rules become the token symbols; lexer-only
 * grammar with declared tokens — the declared tokens become the token symbols.
 * @param parserSyntax the rules remaining after lexer extraction; mutated in place
 * @param lexerSyntax the extracted lexer rules (used for lookups only)
 * @param tokensWereDeclared true when at least one "token" directive was present
 * @param tokenSymbols set of all collected token symbol names (no backquotes)
 * @param commandTokens symbols that were collected from inline `backquoted` usage
 * @param commandsDefinedAsTOKEN backquoted symbols explicitly declared via "token"
 * @return the (mutated) parserSyntax argument
 * @throws SyntaxException when a lexer rule or an unresolvable nonterminal is found in parser rules
 */
private Syntax provideParserSyntax(Syntax parserSyntax,
        Syntax lexerSyntax, boolean tokensWereDeclared,
        Map tokenSymbols, List commandTokens,
        List commandsDefinedAsTOKEN) throws SyntaxException {
    boolean lexerOnlyHandling = false;

    // when this was a mixed grammar: mark all tokens and ignored symbols as lexer `terminals` in parserSyntax
    if (parserSyntax.size() > 0) {
        if (DEBUG)
            System.err
                    .println("INFO: Mixed parser and lexer specification, "
                            + lexerSyntax.size()
                            + " lexer rules, "
                            + parserSyntax.size()
                            + " parser rules.");
        this.tokenSymbols = new ArrayList(tokenSymbols.size()); // keep only toplevel token symbols

        for (int i = 0; i < parserSyntax.size(); i++) {
            Rule rule = parserSyntax.getRule(i);

            for (int j = 0; j < rule.rightSize(); j++) {
                String sym = rule.getRightSymbol(j);

                if (tokenSymbols.get(sym) != null) { // this is a lexer token, mask it with `backquotes`
                    String parserSymbol = Token.COMMAND_QUOTE + sym
                            + Token.COMMAND_QUOTE;
                    if (sym.charAt(0) != Token.COMMAND_QUOTE) // mark symbol with backquotes as lexer token
                        rule.setRightSymbol(parserSymbol, j);

                    if (this.tokenSymbols.indexOf(parserSymbol) < 0) // add unique
                        this.tokenSymbols.add(parserSymbol);
                    // add top level token as `symbol` (with backquotes), as Parser will pass terminals like that,
                    // and the Lexer should also be usable without Parser.
                } else if (sym.equals(Token.UPTO)
                        || sym.equals(Token.BUTNOT)) {
                    // ".." and "-" are lexer-only operators, they must not survive here
                    throw new SyntaxException(
                            "Found lexer rule in parser syntax: "
                                    + rule
                                    + ". Please define \"token\" and \"ignored\" better!");
                } else if (Token.isTerminal(sym) == false) { // check if there is a nonterminal without rule on right side and try to find it among lexer rules
                    boolean found = parserSyntax.hasRule(sym);
                    if (found == false) { // try to find it in lexer rules and make it a token when found, else throw error
                        if (lexerSyntax.hasRule(sym)) {
                            String parserSymbol = Token.COMMAND_QUOTE
                                    + sym + Token.COMMAND_QUOTE;
                            rule.setRightSymbol(parserSymbol, j);
                            if (this.tokenSymbols
                                    .indexOf(parserSymbol) < 0)
                                this.tokenSymbols.add(parserSymbol);
                        } else {
                            throw new SyntaxException(
                                    "Parser nonterminal without rule: "
                                            + sym);
                        }
                    }
                }
            }
        }
    } else // no declared tokens and no parser syntax remained
    if (tokensWereDeclared == false) { // find top level lexer rules and put them into token list marked with `backquotes`
        if (DEBUG)
            System.err
                    .println("INFO: No tokens were defined, lexer specification without parser rules, "
                            + lexerSyntax.size() + " lexer rules.");
        List startRules = lexerSyntax.findStartRules();

        if (startRules.size() > 0) {
            this.tokenSymbols = new ArrayList(startRules.size()); // allocate new list only if there were start rules

            for (int i = 0; i < startRules.size(); i++) {
                String symbol = Token.COMMAND_QUOTE
                        + ((Rule) startRules.get(i))
                                .getNonterminal()
                        + Token.COMMAND_QUOTE;
                if (this.tokenSymbols.indexOf(symbol) < 0) // add unique
                    this.tokenSymbols.add(symbol);
            }
        } else {
            // no start rules found, fall back to the collected token set
            lexerOnlyHandling = true;
        }
    } else {
        if (DEBUG)
            System.err
                    .println("INFO: tokens were defined, lexer specification without parser rules, "
                            + lexerSyntax.size() + " lexer rules.");
        lexerOnlyHandling = true;
    }

    if (lexerOnlyHandling) { // no parser syntax remained, and tokens were defined or no start rules were found
        for (int i = 0; i < commandTokens.size(); i++) { // no parser syntax, remove collected `lexercommands` when they were not defined in TOKEN definition
            String sym = (String) commandTokens.get(i);
            if (commandsDefinedAsTOKEN.indexOf(sym) < 0)
                tokenSymbols.remove(sym);
        }

        // add tokenSymbols to member variable lists
        this.tokenSymbols = new ArrayList(tokenSymbols.size());
        for (Iterator it = tokenSymbols.keySet().iterator(); it
                .hasNext();)
            this.tokenSymbols.add(Token.COMMAND_QUOTE
                    + it.next().toString() + Token.COMMAND_QUOTE);
        // must enclose all explicit token symbols in `backquotes`
    }

    return parserSyntax;
}
298:
299: /*
300: Appends all rules in passed syntax deriving passed symbol and all their
301: sub-rules, recursively, to passed result-syntax. Fills the deleteIndex
302: with every appended rule index, but does not delete the rule from source syntax.
303: @param symbol nonterminal to search
304: @param syntax source syntax that will be searched
305: @param resultSyntax target syntax to append the found rules to
306: @param deleteIndexes receives indexes of all appended rules for later deletion
307: */
308: private void getRulesUnderSymbol(String symbol, Syntax syntax,
309: Syntax resultSyntax, IntArray deleteIndexes) {
310: for (int i = 0; i < syntax.size(); i++) {
311: Rule rule = syntax.getRule(i);
312: String nonterm = rule.getNonterminal();
313:
314: // check if rule derives symbol and was not already appended (recursive rules!)
315: if (deleteIndexes.contains(i) == false
316: && nonterm.equals(symbol)) {
317: resultSyntax.addRule(rule);
318: deleteIndexes.add(i);
319:
320: // check for further nonterminals on right side
321: for (int j = 0; j < rule.rightSize(); j++) {
322: String sym = rule.getRightSymbol(j);
323: if (Token.isTerminal(sym) == false
324: && sym.equals(Token.BUTNOT) == false
325: && sym.equals(Token.UPTO) == false)
326: getRulesUnderSymbol(sym, syntax, resultSyntax,
327: deleteIndexes);
328: }
329: }
330: }
331: }
332:
333: // Array implementation used to store indexes for fast and safe deletion of rules from syntaxes
334: public static class IntArray {
335: private int[] array;
336: private int pos;
337:
338: public IntArray(int size) {
339: array = new int[size];
340: }
341:
342: public void add(int i) {
343: if (pos >= array.length) {
344: int[] newArray = new int[array.length * 2];
345: System.arraycopy(array, 0, newArray, 0, array.length);
346: array = newArray;
347: }
348: array[pos] = i;
349: pos++;
350: }
351:
352: public boolean isEmpty() {
353: return pos == 0;
354: }
355:
356: public boolean contains(int j) {
357: for (int i = 0; i < pos; i++)
358: if (array[i] == j)
359: return true;
360: return false;
361: }
362:
363: public void removeIndexesFrom(Syntax syntax) {
364: Arrays.sort(array, 0, pos); // to delete from list indexes must be sorted
365: for (int i = pos - 1; i >= 0; i--)
366: syntax.removeRule(array[i]); // remove rule
367: pos = 0; // reset position
368: }
369:
370: } // end class IntArray
371:
372: }
|