001: package antlr;
002:
003: /* ANTLR Translator Generator
004: * Project led by Terence Parr at http://www.jGuru.com
005: * Software rights: http://www.antlr.org/RIGHTS.html
006: *
007: * $Id: AntlrCodeGenerator.java,v 1.1 2004/01/21 19:18:29 rgrimm Exp $
008: */
009:
010: import java.io.PrintWriter;
011: import java.io.IOException;
012: import java.io.FileWriter;
013:
014: import antlr.collections.impl.Vector;
015: import antlr.collections.impl.BitSet;
016:
/** A generic ANTLR code generator. All code generators
 * derive from this class.
019: *
020: * <p>
021: * A CodeGenerator knows about a Grammar data structure and
022: * a grammar analyzer. The Grammar is walked to generate the
023: * appropriate code for both a parser and lexer (if present).
024: * This interface may change slightly so that the lexer is
025: * itself living inside of a Grammar object (in which case,
026: * this class generates only one recognizer). The main method
027: * to call is <tt>gen()</tt>, which initiates all code gen.
028: *
029: * <p>
030: * The interaction of the code generator with the analyzer is
031: * simple: each subrule block calls deterministic() before generating
032: * code for the block. Method deterministic() sets lookahead caches
033: * in each Alternative object. Technically, a code generator
034: * doesn't need the grammar analyzer if all lookahead analysis
035: * is done at runtime, but this would result in a slower parser.
036: *
037: * <p>
038: * This class provides a set of support utilities to handle argument
039: * list parsing and so on.
040: *
041: * @author Terence Parr, John Lilley
042: * @version 2.00a
043: * @see antlr.JavaCodeGenerator
044: * @see antlr.DiagnosticCodeGenerator
045: * @see antlr.LLkAnalyzer
046: * @see antlr.Grammar
047: * @see antlr.AlternativeElement
048: * @see antlr.Lookahead
049: */
050: public abstract class CodeGenerator {
/** The ANTLR tool instance; this class uses it to open output files and to report warnings and errors. */
protected antlr.Tool antlrTool;

/** Current tab indentation for code output */
protected int tabs = 0;

/** Current output stream; assigned when an output file is opened and set to null again when it is closed or on reset. */
transient protected PrintWriter currentOutput; // SAS: for proper text i/o

/** The grammar for which we generate code */
protected Grammar grammar = null;

/** List of all bitsets that must be dumped. This is a Vector of BitSet. */
protected Vector bitsetsUsed;

/** The grammar behavior */
protected DefineGrammarSymbols behavior;

/** The LLk analyzer */
protected LLkGrammarAnalyzer analyzer;

/** Object used to format characters in the target language.
 * Subclasses must initialize this to the language-specific formatter.
 */
protected CharFormatter charFormatter;

/** Use option "codeGenDebug" to generate debugging output */
protected boolean DEBUG_CODE_GENERATOR = false;

/** Default values for code-generation thresholds */
protected static final int DEFAULT_MAKE_SWITCH_THRESHOLD = 2;
protected static final int DEFAULT_BITSET_TEST_THRESHOLD = 4;

/** If there are more than 8 long words to init in a bitset,
 * try to optimize it; e.g., detect runs of -1L and 0L.
 */
protected static final int BITSET_OPTIMIZE_INIT_THRESHOLD = 8;

/** This is a hint for the language-specific code generator.
 * A switch() or language-specific equivalent will be generated instead
 * of a series of if/else statements for blocks with number of alternates
 * greater than or equal to this number of non-predicated LL(1) alternates.
 * This is modified by the grammar option "codeGenMakeSwitchThreshold"
 */
protected int makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;

/** This is a hint for the language-specific code generator.
 * A bitset membership test will be generated instead of an
 * ORed series of LA(k) comparisons for lookahead sets with
 * degree greater than or equal to this value.
 * This is modified by the grammar option "codeGenBitsetTestThreshold"
 */
protected int bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;

// NOTE(review): not referenced anywhere in the visible part of this class;
// purpose cannot be determined from here -- confirm whether it is still needed.
private static boolean OLD_ACTION_TRANSLATOR = true;

/** Suffix appended to the token manager name when building the token-types file name. */
public static String TokenTypesFileSuffix = "TokenTypes";
/** File extension of the token-types persistence file. */
public static String TokenTypesFileExt = ".txt";
108:
/** Creates the code generator base; every field keeps its declared initial value. */
public CodeGenerator() {
    super();
}
112:
113: /** Output a String to the currentOutput stream.
114: * Ignored if string is null.
115: * @param s The string to output
116: */
117: protected void _print(String s) {
118: if (s != null) {
119: currentOutput.print(s);
120: }
121: }
122:
123: /** Print an action without leading tabs, attempting to
124: * preserve the current indentation level for multi-line actions
125: * Ignored if string is null.
126: * @param s The action string to output
127: */
128: protected void _printAction(String s) {
129: if (s == null) {
130: return;
131: }
132:
133: // Skip leading newlines, tabs and spaces
134: int start = 0;
135: while (start < s.length()
136: && Character.isSpaceChar(s.charAt(start))) {
137: start++;
138: }
139:
140: // Skip leading newlines, tabs and spaces
141: int end = s.length() - 1;
142: while (end > start && Character.isSpaceChar(s.charAt(end))) {
143: end--;
144: }
145:
146: char c = 0;
147: for (int i = start; i <= end;) {
148: c = s.charAt(i);
149: i++;
150: boolean newline = false;
151: switch (c) {
152: case '\n':
153: newline = true;
154: break;
155: case '\r':
156: if (i <= end && s.charAt(i) == '\n') {
157: i++;
158: }
159: newline = true;
160: break;
161: default:
162: currentOutput.print(c);
163: break;
164: }
165: if (newline) {
166: currentOutput.println();
167: printTabs();
168: // Absorb leading whitespace
169: while (i <= end && Character.isSpaceChar(s.charAt(i))) {
170: i++;
171: }
172: newline = false;
173: }
174: }
175: currentOutput.println();
176: }
177:
178: /** Output a String followed by newline, to the currentOutput stream.
179: * Ignored if string is null.
180: * @param s The string to output
181: */
182: protected void _println(String s) {
183: if (s != null) {
184: currentOutput.println(s);
185: }
186: }
187:
188: /** Test if a set element array represents a contiguous range.
189: * @param elems The array of elements representing the set, usually from BitSet.toArray().
190: * @return true if the elements are a contiguous range (with two or more).
191: */
192: public static boolean elementsAreRange(int[] elems) {
193: if (elems.length == 0) {
194: return false;
195: }
196: int begin = elems[0];
197: int end = elems[elems.length - 1];
198: if (elems.length <= 2) {
199: // Not enough elements for a range expression
200: return false;
201: }
202: if (end - begin + 1 > elems.length) {
203: // The set does not represent a contiguous range
204: return false;
205: }
206: int v = begin + 1;
207: for (int i = 1; i < elems.length - 1; i++) {
208: if (v != elems[i]) {
209: // The set does not represent a contiguous range
210: return false;
211: }
212: v++;
213: }
214: return true;
215: }
216:
217: /** Get the identifier portion of an argument-action token.
218: * The ID of an action is assumed to be a trailing identifier.
219: * Specific code-generators may want to override this
220: * if the language has unusual declaration syntax.
221: * @param t The action token
222: * @return A string containing the text of the identifier
223: */
224: protected String extractIdOfAction(Token t) {
225: return extractIdOfAction(t.getText(), t.getLine(), t
226: .getColumn());
227: }
228:
229: /** Get the identifier portion of an argument-action.
230: * The ID of an action is assumed to be a trailing identifier.
231: * Specific code-generators may want to override this
232: * if the language has unusual declaration syntax.
233: * @param s The action text
234: * @param line Line used for error reporting.
235: * @param column Line used for error reporting.
236: * @return A string containing the text of the identifier
237: */
238: protected String extractIdOfAction(String s, int line, int column) {
239: s = removeAssignmentFromDeclaration(s);
240: // Search back from the end for a non alphanumeric. That marks the
241: // beginning of the identifier
242: for (int i = s.length() - 2; i >= 0; i--) {
243: // TODO: make this work for language-independent identifiers?
244: if (!Character.isLetterOrDigit(s.charAt(i))
245: && s.charAt(i) != '_') {
246: // Found end of type part
247: return s.substring(i + 1);
248: }
249: }
250: // Something is bogus, but we cannot parse the language-specific
251: // actions any better. The compiler will have to catch the problem.
252: antlrTool.warning("Ill-formed action", grammar.getFilename(),
253: line, column);
254: return "";
255: }
256:
257: /** Get the type string out of an argument-action token.
258: * The type of an action is assumed to precede a trailing identifier
259: * Specific code-generators may want to override this
260: * if the language has unusual declaration syntax.
261: * @param t The action token
262: * @return A string containing the text of the type
263: */
264: protected String extractTypeOfAction(Token t) {
265: return extractTypeOfAction(t.getText(), t.getLine(), t
266: .getColumn());
267: }
268:
269: /** Get the type portion of an argument-action.
270: * The type of an action is assumed to precede a trailing identifier
271: * Specific code-generators may want to override this
272: * if the language has unusual declaration syntax.
273: * @param s The action text
274: * @param line Line used for error reporting.
275: * @return A string containing the text of the type
276: */
277: protected String extractTypeOfAction(String s, int line, int column) {
278: s = removeAssignmentFromDeclaration(s);
279: // Search back from the end for a non alphanumeric. That marks the
280: // beginning of the identifier
281: for (int i = s.length() - 2; i >= 0; i--) {
282: // TODO: make this work for language-independent identifiers?
283: if (!Character.isLetterOrDigit(s.charAt(i))
284: && s.charAt(i) != '_') {
285: // Found end of type part
286: return s.substring(0, i + 1);
287: }
288: }
289: // Something is bogus, but we cannot parse the language-specific
290: // actions any better. The compiler will have to catch the problem.
291: antlrTool.warning("Ill-formed action", grammar.getFilename(),
292: line, column);
293: return "";
294: }
295:
/** Generate the code for all grammars
 */
public abstract void gen();

/** Generate code for the given grammar element.
 * @param action The {...} action to generate
 */
public abstract void gen(ActionElement action);

/** Generate code for the given grammar element.
 * @param blk The "x|y|z|..." block to generate
 */
public abstract void gen(AlternativeBlock blk);

/** Generate code for the given grammar element.
 * @param end The block-end element to generate. Block-end
 * elements are synthesized by the grammar parser to represent
 * the end of a block.
 */
public abstract void gen(BlockEndElement end);

/** Generate code for the given grammar element.
 * @param atom The character literal reference to generate
 */
public abstract void gen(CharLiteralElement atom);

/** Generate code for the given grammar element.
 * @param r The character-range reference to generate
 */
public abstract void gen(CharRangeElement r);

/** Generate the code for a lexer.
 * @param g The lexer grammar to generate code for
 * @throws IOException declared for implementations that write output files
 */
public abstract void gen(LexerGrammar g) throws IOException;

/** Generate code for the given grammar element.
 * @param blk The (...)+ block to generate
 */
public abstract void gen(OneOrMoreBlock blk);

/** Generate the code for a parser.
 * @param g The parser grammar to generate code for
 * @throws IOException declared for implementations that write output files
 */
public abstract void gen(ParserGrammar g) throws IOException;

/** Generate code for the given grammar element.
 * @param rr The rule-reference to generate
 */
public abstract void gen(RuleRefElement rr);

/** Generate code for the given grammar element.
 * @param atom The string-literal reference to generate
 */
public abstract void gen(StringLiteralElement atom);

/** Generate code for the given grammar element.
 * @param r The token-range reference to generate
 */
public abstract void gen(TokenRangeElement r);

/** Generate code for the given grammar element.
 * @param atom The token-reference to generate
 */
public abstract void gen(TokenRefElement atom);

/** Generate code for the given grammar element.
 * @param t The tree to generate code for.
 */
public abstract void gen(TreeElement t);

/** Generate the code for a tree walker.
 * @param g The tree-walker grammar to generate code for
 * @throws IOException declared for implementations that write output files
 */
public abstract void gen(TreeWalkerGrammar g) throws IOException;

/** Generate code for the given grammar element.
 * @param wc The wildcard element to generate
 */
public abstract void gen(WildcardElement wc);

/** Generate code for the given grammar element.
 * @param blk The (...)* block to generate
 */
public abstract void gen(ZeroOrMoreBlock blk);
375:
376: /** Generate the token types as a text file for persistence across shared lexer/parser */
377: protected void genTokenInterchange(TokenManager tm)
378: throws IOException {
379: // Open the token output Java file and set the currentOutput stream
380: String fName = tm.getName() + TokenTypesFileSuffix
381: + TokenTypesFileExt;
382: currentOutput = antlrTool.openOutputFile(fName);
383:
384: println("// $ANTLR " + antlrTool.version + ": "
385: + antlrTool.fileMinusPath(antlrTool.grammarFile)
386: + " -> " + fName + "$");
387:
388: tabs = 0;
389:
390: // Header
391: println(tm.getName() + " // output token vocab name");
392:
393: // Generate a definition for each token type
394: Vector v = tm.getVocabulary();
395: for (int i = Token.MIN_USER_TYPE; i < v.size(); i++) {
396: String s = (String) v.elementAt(i);
397: if (DEBUG_CODE_GENERATOR) {
398: System.out.println("gen persistence file entry for: "
399: + s);
400: }
401: if (s != null && !s.startsWith("<")) {
402: // if literal, find label
403: if (s.startsWith("\"")) {
404: StringLiteralSymbol sl = (StringLiteralSymbol) tm
405: .getTokenSymbol(s);
406: if (sl != null && sl.label != null) {
407: print(sl.label + "=");
408: }
409: println(s + "=" + i);
410: } else {
411: print(s);
412: // check for a paraphrase
413: TokenSymbol ts = (TokenSymbol) tm.getTokenSymbol(s);
414: if (ts == null) {
415: antlrTool.warning("undefined token symbol: "
416: + s);
417: } else {
418: if (ts.getParaphrase() != null) {
419: print("(" + ts.getParaphrase() + ")");
420: }
421: }
422: println("=" + i);
423: }
424: }
425: }
426:
427: // Close the tokens output file
428: currentOutput.close();
429: currentOutput = null;
430: }
431:
432: /** Process a string for an simple expression for use in xx/action.g
433: * it is used to cast simple tokens/references to the right type for
434: * the generated language.
435: * @param str A String.
436: */
437: public String processStringForASTConstructor(String str) {
438: return str;
439: }
440:
/** Get a string for an expression to generate creation of an AST subtree.
 * @param v A Vector of String, where each element is an expression in the target language yielding an AST node.
 * @return The target-language expression text
 */
public abstract String getASTCreateString(Vector v);

/** Get a string for an expression to generate creating of an AST node
 * @param atom The grammar atom the node is created for
 *        (NOTE(review): exact use is defined by the concrete generator)
 * @param str The text of the arguments to the AST construction
 * @return The target-language expression text
 */
public abstract String getASTCreateString(GrammarAtom atom,
String str);
451:
452: /** Given the index of a bitset in the bitset list, generate a unique name.
453: * Specific code-generators may want to override this
454: * if the language does not allow '_' or numerals in identifiers.
455: * @param index The index of the bitset in the bitset list.
456: */
457: protected String getBitsetName(int index) {
458: return "_tokenSet_" + index;
459: }
460:
461: public static String encodeLexerRuleName(String id) {
462: return "m" + id;
463: }
464:
465: public static String decodeLexerRuleName(String id) {
466: if (id == null) {
467: return null;
468: }
469: return id.substring(1, id.length());
470: }
471:
/** Map an identifier to its corresponding tree-node variable.
 * This is context-sensitive, depending on the rule and alternative
 * being generated
 * @param id The identifier name to map
 * @param tInfo Action-translation info handed to the implementation; how it
 *        is used is defined by the concrete generator.
 *        NOTE(review): earlier documentation described a boolean "forInput"
 *        flag here, which is not part of this signature.
 * @return The mapped id (which may be the same as the input), or null if the mapping is invalid due to duplicates
 */
public abstract String mapTreeId(String id, ActionTransInfo tInfo);
480:
481: /** Add a bitset to the list of bitsets to be generated.
482: * if the bitset is already in the list, ignore the request.
483: * Always adds the bitset to the end of the list, so the
484: * caller can rely on the position of bitsets in the list.
485: * The returned position can be used to format the bitset
486: * name, since it is invariant.
487: * @param p Bit set to mark for code generation
488: * @param forParser true if the bitset is used for the parser, false for the lexer
489: * @return The position of the bitset in the list.
490: */
491: protected int markBitsetForGen(BitSet p) {
492: // Is the bitset (or an identical one) already marked for gen?
493: for (int i = 0; i < bitsetsUsed.size(); i++) {
494: BitSet set = (BitSet) bitsetsUsed.elementAt(i);
495: if (p.equals(set)) {
496: // Use the identical one already stored
497: return i;
498: }
499: }
500:
501: // Add the new bitset
502: bitsetsUsed.appendElement(p.clone());
503: return bitsetsUsed.size() - 1;
504: }
505:
506: /** Output tab indent followed by a String, to the currentOutput stream.
507: * Ignored if string is null.
508: * @param s The string to output.
509: */
510: protected void print(String s) {
511: if (s != null) {
512: printTabs();
513: currentOutput.print(s);
514: }
515: }
516:
517: /** Print an action with leading tabs, attempting to
518: * preserve the current indentation level for multi-line actions
519: * Ignored if string is null.
520: * @param s The action string to output
521: */
522: protected void printAction(String s) {
523: if (s != null) {
524: printTabs();
525: _printAction(s);
526: }
527: }
528:
529: /** Output tab indent followed by a String followed by newline,
530: * to the currentOutput stream. Ignored if string is null.
531: * @param s The string to output
532: */
533: protected void println(String s) {
534: if (s != null) {
535: printTabs();
536: currentOutput.println(s);
537: }
538: }
539:
540: /** Output the current tab indentation. This outputs the number of tabs
541: * indicated by the "tabs" variable to the currentOutput stream.
542: */
543: protected void printTabs() {
544: for (int i = 1; i <= tabs; i++) {
545: currentOutput.print("\t");
546: }
547: }
548:
/** Lexically process $ and # references within the action.
 * This will replace #id and #(...) with the appropriate
 * function calls and/or variables etc...
 * @param actionStr The action text to translate
 * @param line Line of the action (presumably used for error reporting -- TODO confirm)
 * @param currentRule The rule the action appears in (use is defined by the concrete generator)
 * @param tInfo Action-translation info (use is defined by the concrete generator)
 * @return The translated action text
 */
protected abstract String processActionForSpecialSymbols(
String actionStr, int line, RuleBlock currentRule,
ActionTransInfo tInfo);
556:
557: public String getFOLLOWBitSet(String ruleName, int k) {
558: GrammarSymbol rs = grammar.getSymbol(ruleName);
559: if (!(rs instanceof RuleSymbol)) {
560: return null;
561: }
562: RuleBlock blk = ((RuleSymbol) rs).getBlock();
563: Lookahead follow = grammar.theLLkAnalyzer
564: .FOLLOW(k, blk.endNode);
565: String followSetName = getBitsetName(markBitsetForGen(follow.fset));
566: return followSetName;
567: }
568:
569: public String getFIRSTBitSet(String ruleName, int k) {
570: GrammarSymbol rs = grammar.getSymbol(ruleName);
571: if (!(rs instanceof RuleSymbol)) {
572: return null;
573: }
574: RuleBlock blk = ((RuleSymbol) rs).getBlock();
575: Lookahead first = grammar.theLLkAnalyzer.look(k, blk);
576: String firstSetName = getBitsetName(markBitsetForGen(first.fset));
577: return firstSetName;
578: }
579:
580: /**
581: * Remove the assignment portion of a declaration, if any.
582: * @param d the declaration
583: * @return the declaration without any assignment portion
584: */
585: protected String removeAssignmentFromDeclaration(String d) {
586: // If d contains an equal sign, then it's a declaration
587: // with an initialization. Strip off the initialization part.
588: if (d.indexOf('=') >= 0)
589: d = d.substring(0, d.indexOf('=')).trim();
590: return d;
591: }
592:
593: /** Set all fields back like one just created */
594: private void reset() {
595: tabs = 0;
596: // Allocate list of bitsets tagged for code generation
597: bitsetsUsed = new Vector();
598: currentOutput = null;
599: grammar = null;
600: DEBUG_CODE_GENERATOR = false;
601: makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
602: bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
603: }
604:
605: public static String reverseLexerRuleName(String id) {
606: return id.substring(1, id.length());
607: }
608:
609: public void setAnalyzer(LLkGrammarAnalyzer analyzer_) {
610: analyzer = analyzer_;
611: }
612:
613: public void setBehavior(DefineGrammarSymbols behavior_) {
614: behavior = behavior_;
615: }
616:
617: /** Set a grammar for the code generator to use */
618: protected void setGrammar(Grammar g) {
619: reset();
620: grammar = g;
621: // Lookup make-switch threshold in the grammar generic options
622: if (grammar.hasOption("codeGenMakeSwitchThreshold")) {
623: try {
624: makeSwitchThreshold = grammar
625: .getIntegerOption("codeGenMakeSwitchThreshold");
626: //System.out.println("setting codeGenMakeSwitchThreshold to " + makeSwitchThreshold);
627: } catch (NumberFormatException e) {
628: Token tok = grammar
629: .getOption("codeGenMakeSwitchThreshold");
630: antlrTool
631: .error(
632: "option 'codeGenMakeSwitchThreshold' must be an integer",
633: grammar.getClassName(), tok.getLine(),
634: tok.getColumn());
635: }
636: }
637:
638: // Lookup bitset-test threshold in the grammar generic options
639: if (grammar.hasOption("codeGenBitsetTestThreshold")) {
640: try {
641: bitsetTestThreshold = grammar
642: .getIntegerOption("codeGenBitsetTestThreshold");
643: //System.out.println("setting codeGenBitsetTestThreshold to " + bitsetTestThreshold);
644: } catch (NumberFormatException e) {
645: Token tok = grammar
646: .getOption("codeGenBitsetTestThreshold");
647: antlrTool
648: .error(
649: "option 'codeGenBitsetTestThreshold' must be an integer",
650: grammar.getClassName(), tok.getLine(),
651: tok.getColumn());
652: }
653: }
654:
655: // Lookup debug code-gen in the grammar generic options
656: if (grammar.hasOption("codeGenDebug")) {
657: Token t = grammar.getOption("codeGenDebug");
658: if (t.getText().equals("true")) {
659: //System.out.println("setting code-generation debug ON");
660: DEBUG_CODE_GENERATOR = true;
661: } else if (t.getText().equals("false")) {
662: //System.out.println("setting code-generation debug OFF");
663: DEBUG_CODE_GENERATOR = false;
664: } else {
665: antlrTool.error(
666: "option 'codeGenDebug' must be true or false",
667: grammar.getClassName(), t.getLine(), t
668: .getColumn());
669: }
670: }
671: }
672:
673: public void setTool(Tool tool) {
674: antlrTool = tool;
675: }
676: }
|