001: package antlr;
002:
003: /* ANTLR Translator Generator
004: * Project led by Terence Parr at http://www.cs.usfca.edu
005: * Software rights: http://www.antlr.org/license.html
006: */
007:
008: import java.io.PrintWriter;
009: import java.io.IOException;
010: import java.io.FileWriter;
011:
012: import antlr.collections.impl.Vector;
013: import antlr.collections.impl.BitSet;
014:
015: /**A generic ANTLR code generator. All code generators
016: * Derive from this class.
017: *
018: * <p>
019: * A CodeGenerator knows about a Grammar data structure and
020: * a grammar analyzer. The Grammar is walked to generate the
021: * appropriate code for both a parser and lexer (if present).
022: * This interface may change slightly so that the lexer is
023: * itself living inside of a Grammar object (in which case,
024: * this class generates only one recognizer). The main method
025: * to call is <tt>gen()</tt>, which initiates all code gen.
026: *
027: * <p>
028: * The interaction of the code generator with the analyzer is
029: * simple: each subrule block calls deterministic() before generating
030: * code for the block. Method deterministic() sets lookahead caches
031: * in each Alternative object. Technically, a code generator
032: * doesn't need the grammar analyzer if all lookahead analysis
033: * is done at runtime, but this would result in a slower parser.
034: *
035: * <p>
036: * This class provides a set of support utilities to handle argument
037: * list parsing and so on.
038: *
039: * @author Terence Parr, John Lilley
040: * @version 2.00a
041: * @see antlr.JavaCodeGenerator
042: * @see antlr.DiagnosticCodeGenerator
043: * @see antlr.LLkAnalyzer
044: * @see antlr.Grammar
045: * @see antlr.AlternativeElement
046: * @see antlr.Lookahead
047: */
048: public abstract class CodeGenerator {
049: protected antlr.Tool antlrTool;
050:
051: /** Current tab indentation for code output */
052: protected int tabs = 0;
053:
054: /** Current output Stream */
055: transient protected PrintWriter currentOutput; // SAS: for proper text i/o
056:
057: /** The grammar for which we generate code */
058: protected Grammar grammar = null;
059:
060: /** List of all bitsets that must be dumped. These are Vectors of BitSet. */
061: protected Vector bitsetsUsed;
062:
063: /** The grammar behavior */
064: protected DefineGrammarSymbols behavior;
065:
066: /** The LLk analyzer */
067: protected LLkGrammarAnalyzer analyzer;
068:
069: /** Object used to format characters in the target language.
070: * subclass must initialize this to the language-specific formatter
071: */
072: protected CharFormatter charFormatter;
073:
074: /** Use option "codeGenDebug" to generate debugging output */
075: protected boolean DEBUG_CODE_GENERATOR = false;
076:
077: /** Default values for code-generation thresholds */
078: protected static final int DEFAULT_MAKE_SWITCH_THRESHOLD = 2;
079: protected static final int DEFAULT_BITSET_TEST_THRESHOLD = 4;
080:
081: /** If there are more than 8 long words to init in a bitset,
082: * try to optimize it; e.g., detect runs of -1L and 0L.
083: */
084: protected static final int BITSET_OPTIMIZE_INIT_THRESHOLD = 8;
085:
086: /** This is a hint for the language-specific code generator.
087: * A switch() or language-specific equivalent will be generated instead
088: * of a series of if/else statements for blocks with number of alternates
089: * greater than or equal to this number of non-predicated LL(1) alternates.
090: * This is modified by the grammar option "codeGenMakeSwitchThreshold"
091: */
092: protected int makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
093:
094: /** This is a hint for the language-specific code generator.
095: * A bitset membership test will be generated instead of an
096: * ORed series of LA(k) comparisions for lookahead sets with
097: * degree greater than or equal to this value.
098: * This is modified by the grammar option "codeGenBitsetTestThreshold"
099: */
100: protected int bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
101:
102: private static boolean OLD_ACTION_TRANSLATOR = true;
103:
104: public static String TokenTypesFileSuffix = "TokenTypes";
105: public static String TokenTypesFileExt = ".txt";
106:
107: /** Construct code generator base class */
108: public CodeGenerator() {
109: }
110:
111: /** Output a String to the currentOutput stream.
112: * Ignored if string is null.
113: * @param s The string to output
114: */
115: protected void _print(String s) {
116: if (s != null) {
117: currentOutput.print(s);
118: }
119: }
120:
121: /** Print an action without leading tabs, attempting to
122: * preserve the current indentation level for multi-line actions
123: * Ignored if string is null.
124: * @param s The action string to output
125: */
126: protected void _printAction(String s) {
127: if (s == null) {
128: return;
129: }
130:
131: // Skip leading newlines, tabs and spaces
132: int start = 0;
133: while (start < s.length()
134: && Character.isSpaceChar(s.charAt(start))) {
135: start++;
136: }
137:
138: // Skip leading newlines, tabs and spaces
139: int end = s.length() - 1;
140: while (end > start && Character.isSpaceChar(s.charAt(end))) {
141: end--;
142: }
143:
144: char c = 0;
145: for (int i = start; i <= end;) {
146: c = s.charAt(i);
147: i++;
148: boolean newline = false;
149: switch (c) {
150: case '\n':
151: newline = true;
152: break;
153: case '\r':
154: if (i <= end && s.charAt(i) == '\n') {
155: i++;
156: }
157: newline = true;
158: break;
159: default:
160: currentOutput.print(c);
161: break;
162: }
163: if (newline) {
164: currentOutput.println();
165: printTabs();
166: // Absorb leading whitespace
167: while (i <= end && Character.isSpaceChar(s.charAt(i))) {
168: i++;
169: }
170: newline = false;
171: }
172: }
173: currentOutput.println();
174: }
175:
176: /** Output a String followed by newline, to the currentOutput stream.
177: * Ignored if string is null.
178: * @param s The string to output
179: */
180: protected void _println(String s) {
181: if (s != null) {
182: currentOutput.println(s);
183: }
184: }
185:
186: /** Test if a set element array represents a contiguous range.
187: * @param elems The array of elements representing the set, usually from BitSet.toArray().
188: * @return true if the elements are a contiguous range (with two or more).
189: */
190: public static boolean elementsAreRange(int[] elems) {
191: if (elems.length == 0) {
192: return false;
193: }
194: int begin = elems[0];
195: int end = elems[elems.length - 1];
196: if (elems.length <= 2) {
197: // Not enough elements for a range expression
198: return false;
199: }
200: if (end - begin + 1 > elems.length) {
201: // The set does not represent a contiguous range
202: return false;
203: }
204: int v = begin + 1;
205: for (int i = 1; i < elems.length - 1; i++) {
206: if (v != elems[i]) {
207: // The set does not represent a contiguous range
208: return false;
209: }
210: v++;
211: }
212: return true;
213: }
214:
215: /** Get the identifier portion of an argument-action token.
216: * The ID of an action is assumed to be a trailing identifier.
217: * Specific code-generators may want to override this
218: * if the language has unusual declaration syntax.
219: * @param t The action token
220: * @return A string containing the text of the identifier
221: */
222: protected String extractIdOfAction(Token t) {
223: return extractIdOfAction(t.getText(), t.getLine(), t
224: .getColumn());
225: }
226:
227: /** Get the identifier portion of an argument-action.
228: * The ID of an action is assumed to be a trailing identifier.
229: * Specific code-generators may want to override this
230: * if the language has unusual declaration syntax.
231: * @param s The action text
232: * @param line Line used for error reporting.
233: * @param column Line used for error reporting.
234: * @return A string containing the text of the identifier
235: */
236: protected String extractIdOfAction(String s, int line, int column) {
237: s = removeAssignmentFromDeclaration(s);
238: // Search back from the end for a non alphanumeric. That marks the
239: // beginning of the identifier
240: for (int i = s.length() - 2; i >= 0; i--) {
241: // TODO: make this work for language-independent identifiers?
242: if (!Character.isLetterOrDigit(s.charAt(i))
243: && s.charAt(i) != '_') {
244: // Found end of type part
245: return s.substring(i + 1);
246: }
247: }
248: // Something is bogus, but we cannot parse the language-specific
249: // actions any better. The compiler will have to catch the problem.
250: antlrTool.warning("Ill-formed action", grammar.getFilename(),
251: line, column);
252: return "";
253: }
254:
255: /** Get the type string out of an argument-action token.
256: * The type of an action is assumed to precede a trailing identifier
257: * Specific code-generators may want to override this
258: * if the language has unusual declaration syntax.
259: * @param t The action token
260: * @return A string containing the text of the type
261: */
262: protected String extractTypeOfAction(Token t) {
263: return extractTypeOfAction(t.getText(), t.getLine(), t
264: .getColumn());
265: }
266:
267: /** Get the type portion of an argument-action.
268: * The type of an action is assumed to precede a trailing identifier
269: * Specific code-generators may want to override this
270: * if the language has unusual declaration syntax.
271: * @param s The action text
272: * @param line Line used for error reporting.
273: * @return A string containing the text of the type
274: */
275: protected String extractTypeOfAction(String s, int line, int column) {
276: s = removeAssignmentFromDeclaration(s);
277: // Search back from the end for a non alphanumeric. That marks the
278: // beginning of the identifier
279: for (int i = s.length() - 2; i >= 0; i--) {
280: // TODO: make this work for language-independent identifiers?
281: if (!Character.isLetterOrDigit(s.charAt(i))
282: && s.charAt(i) != '_') {
283: // Found end of type part
284: return s.substring(0, i + 1);
285: }
286: }
287: // Something is bogus, but we cannot parse the language-specific
288: // actions any better. The compiler will have to catch the problem.
289: antlrTool.warning("Ill-formed action", grammar.getFilename(),
290: line, column);
291: return "";
292: }
293:
294: /** Generate the code for all grammars
295: */
296: public abstract void gen();
297:
298: /** Generate code for the given grammar element.
299: * @param action The {...} action to generate
300: */
301: public abstract void gen(ActionElement action, Context context);
302:
303: /** Generate code for the given grammar element.
304: * @param blk The "x|y|z|..." block to generate
305: */
306: public abstract void gen(AlternativeBlock blk, Context context);
307:
308: /** Generate code for the given grammar element.
309: * @param end The block-end element to generate. Block-end
310: * elements are synthesized by the grammar parser to represent
311: * the end of a block.
312: */
313: public abstract void gen(BlockEndElement end, Context context);
314:
315: /** Generate code for the given grammar element.
316: * @param atom The character literal reference to generate
317: */
318: public abstract void gen(CharLiteralElement atom, Context context);
319:
320: /** Generate code for the given grammar element.
321: * @param r The character-range reference to generate
322: */
323: public abstract void gen(CharRangeElement r, Context context);
324:
325: /** Generate the code for a parser */
326: public abstract void gen(LexerGrammar g) throws IOException;
327:
328: /** Generate code for the given grammar element.
329: * @param blk The (...)+ block to generate
330: */
331: public abstract void gen(OneOrMoreBlock blk, Context context);
332:
333: /** Generate the code for a parser */
334: public abstract void gen(ParserGrammar g) throws IOException;
335:
336: /** Generate code for the given grammar element.
337: * @param rr The rule-reference to generate
338: */
339: public abstract void gen(RuleRefElement rr, Context context);
340:
341: /** Generate code for the given grammar element.
342: * @param atom The string-literal reference to generate
343: */
344: public abstract void gen(StringLiteralElement atom, Context context);
345:
346: /** Generate code for the given grammar element.
347: * @param r The token-range reference to generate
348: */
349: public abstract void gen(TokenRangeElement r, Context context);
350:
351: /** Generate code for the given grammar element.
352: * @param atom The token-reference to generate
353: */
354: public abstract void gen(TokenRefElement atom, Context context);
355:
356: /** Generate code for the given grammar element.
357: * @param blk The tree to generate code for.
358: */
359: public abstract void gen(TreeElement t, Context context);
360:
361: /** Generate the code for a parser */
362: public abstract void gen(TreeWalkerGrammar g) throws IOException;
363:
364: /** Generate code for the given grammar element.
365: * @param wc The wildcard element to generate
366: */
367: public abstract void gen(WildcardElement wc, Context context);
368:
369: /** Generate code for the given grammar element.
370: * @param blk The (...)* block to generate
371: */
372: public abstract void gen(ZeroOrMoreBlock blk, Context context);
373:
374: /** Generate the token types as a text file for persistence across shared lexer/parser */
375: protected void genTokenInterchange(TokenManager tm)
376: throws IOException {
377: // Open the token output Java file and set the currentOutput stream
378: String fName = tm.getName() + TokenTypesFileSuffix
379: + TokenTypesFileExt;
380: currentOutput = antlrTool.openOutputFile(fName);
381:
382: println("// $ANTLR " + antlrTool.version + ": "
383: + antlrTool.fileMinusPath(antlrTool.grammarFile)
384: + " -> " + fName + "$");
385:
386: tabs = 0;
387:
388: // Header
389: println(tm.getName() + " // output token vocab name");
390:
391: // Generate a definition for each token type
392: Vector v = tm.getVocabulary();
393: for (int i = Token.MIN_USER_TYPE; i < v.size(); i++) {
394: String s = (String) v.elementAt(i);
395: if (DEBUG_CODE_GENERATOR) {
396: System.out.println("gen persistence file entry for: "
397: + s);
398: }
399: if (s != null && !s.startsWith("<")) {
400: // if literal, find label
401: if (s.startsWith("\"")) {
402: StringLiteralSymbol sl = (StringLiteralSymbol) tm
403: .getTokenSymbol(s);
404: if (sl != null && sl.label != null) {
405: print(sl.label + "=");
406: }
407: println(s + "=" + i);
408: } else {
409: print(s);
410: // check for a paraphrase
411: TokenSymbol ts = (TokenSymbol) tm.getTokenSymbol(s);
412: if (ts == null) {
413: antlrTool.warning("undefined token symbol: "
414: + s);
415: } else {
416: if (ts.getParaphrase() != null) {
417: print("(" + ts.getParaphrase() + ")");
418: }
419: }
420: println("=" + i);
421: }
422: }
423: }
424:
425: // Close the tokens output file
426: currentOutput.close();
427: currentOutput = null;
428: }
429:
430: /** Process a string for an simple expression for use in xx/action.g
431: * it is used to cast simple tokens/references to the right type for
432: * the generated language.
433: * @param str A String.
434: */
435: public String processStringForASTConstructor(String str) {
436: return str;
437: }
438:
439: /** Get a string for an expression to generate creation of an AST subtree.
440: * @param v A Vector of String, where each element is an expression in the target language yielding an AST node.
441: */
442: public abstract String getASTCreateString(Vector v);
443:
444: /** Get a string for an expression to generate creating of an AST node
445: * @param str The text of the arguments to the AST construction
446: */
447: public abstract String getASTCreateString(GrammarAtom atom,
448: String str);
449:
450: /** Given the index of a bitset in the bitset list, generate a unique name.
451: * Specific code-generators may want to override this
452: * if the language does not allow '_' or numerals in identifiers.
453: * @param index The index of the bitset in the bitset list.
454: */
455: protected String getBitsetName(int index) {
456: return "_tokenSet_" + index;
457: }
458:
459: public static String encodeLexerRuleName(String id) {
460: return "m" + id;
461: }
462:
463: public static String decodeLexerRuleName(String id) {
464: if (id == null) {
465: return null;
466: }
467: return id.substring(1, id.length());
468: }
469:
470: /** Map an identifier to it's corresponding tree-node variable.
471: * This is context-sensitive, depending on the rule and alternative
472: * being generated
473: * @param id The identifier name to map
474: * @param forInput true if the input tree node variable is to be returned, otherwise the output variable is returned.
475: * @return The mapped id (which may be the same as the input), or null if the mapping is invalid due to duplicates
476: */
477: public abstract String mapTreeId(String id, ActionTransInfo tInfo);
478:
479: /** Add a bitset to the list of bitsets to be generated.
480: * if the bitset is already in the list, ignore the request.
481: * Always adds the bitset to the end of the list, so the
482: * caller can rely on the position of bitsets in the list.
483: * The returned position can be used to format the bitset
484: * name, since it is invariant.
485: * @param p Bit set to mark for code generation
486: * @param forParser true if the bitset is used for the parser, false for the lexer
487: * @return The position of the bitset in the list.
488: */
489: protected int markBitsetForGen(BitSet p) {
490: // Is the bitset (or an identical one) already marked for gen?
491: for (int i = 0; i < bitsetsUsed.size(); i++) {
492: BitSet set = (BitSet) bitsetsUsed.elementAt(i);
493: if (p.equals(set)) {
494: // Use the identical one already stored
495: return i;
496: }
497: }
498:
499: // Add the new bitset
500: bitsetsUsed.appendElement(p.clone());
501: return bitsetsUsed.size() - 1;
502: }
503:
504: /** Output tab indent followed by a String, to the currentOutput stream.
505: * Ignored if string is null.
506: * @param s The string to output.
507: */
508: protected void print(String s) {
509: if (s != null) {
510: printTabs();
511: currentOutput.print(s);
512: }
513: }
514:
515: /** Print an action with leading tabs, attempting to
516: * preserve the current indentation level for multi-line actions
517: * Ignored if string is null.
518: * @param s The action string to output
519: */
520: protected void printAction(String s) {
521: if (s != null) {
522: printTabs();
523: _printAction(s);
524: }
525: }
526:
527: /** Output tab indent followed by a String followed by newline,
528: * to the currentOutput stream. Ignored if string is null.
529: * @param s The string to output
530: */
531: protected void println(String s) {
532: if (s != null) {
533: printTabs();
534: currentOutput.println(s);
535: }
536: }
537:
538: /** Output the current tab indentation. This outputs the number of tabs
539: * indicated by the "tabs" variable to the currentOutput stream.
540: */
541: protected void printTabs() {
542: for (int i = 1; i <= tabs; i++) {
543: currentOutput.print("\t");
544: }
545: }
546:
547: /** Lexically process $ and # references within the action.
548: * This will replace #id and #(...) with the appropriate
549: * function calls and/or variables etc...
550: */
551: protected abstract String processActionForSpecialSymbols(
552: String actionStr, int line, RuleBlock currentRule,
553: ActionTransInfo tInfo);
554:
555: public String getFOLLOWBitSet(String ruleName, int k) {
556: GrammarSymbol rs = grammar.getSymbol(ruleName);
557: if (!(rs instanceof RuleSymbol)) {
558: return null;
559: }
560: RuleBlock blk = ((RuleSymbol) rs).getBlock();
561: Lookahead follow = grammar.theLLkAnalyzer
562: .FOLLOW(k, blk.endNode);
563: String followSetName = getBitsetName(markBitsetForGen(follow.fset));
564: return followSetName;
565: }
566:
567: public String getFIRSTBitSet(String ruleName, int k) {
568: GrammarSymbol rs = grammar.getSymbol(ruleName);
569: if (!(rs instanceof RuleSymbol)) {
570: return null;
571: }
572: RuleBlock blk = ((RuleSymbol) rs).getBlock();
573: Lookahead first = grammar.theLLkAnalyzer.look(k, blk);
574: String firstSetName = getBitsetName(markBitsetForGen(first.fset));
575: return firstSetName;
576: }
577:
578: /**
579: * Remove the assignment portion of a declaration, if any.
580: * @param d the declaration
581: * @return the declaration without any assignment portion
582: */
583: protected String removeAssignmentFromDeclaration(String d) {
584: // If d contains an equal sign, then it's a declaration
585: // with an initialization. Strip off the initialization part.
586: if (d.indexOf('=') >= 0)
587: d = d.substring(0, d.indexOf('=')).trim();
588: return d;
589: }
590:
591: /** Set all fields back like one just created */
592: private void reset() {
593: tabs = 0;
594: // Allocate list of bitsets tagged for code generation
595: bitsetsUsed = new Vector();
596: currentOutput = null;
597: grammar = null;
598: DEBUG_CODE_GENERATOR = false;
599: makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
600: bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
601: }
602:
603: public static String reverseLexerRuleName(String id) {
604: return id.substring(1, id.length());
605: }
606:
607: public void setAnalyzer(LLkGrammarAnalyzer analyzer_) {
608: analyzer = analyzer_;
609: }
610:
611: public void setBehavior(DefineGrammarSymbols behavior_) {
612: behavior = behavior_;
613: }
614:
615: /** Set a grammar for the code generator to use */
616: protected void setGrammar(Grammar g) {
617: reset();
618: grammar = g;
619: // Lookup make-switch threshold in the grammar generic options
620: if (grammar.hasOption("codeGenMakeSwitchThreshold")) {
621: try {
622: makeSwitchThreshold = grammar
623: .getIntegerOption("codeGenMakeSwitchThreshold");
624: //System.out.println("setting codeGenMakeSwitchThreshold to " + makeSwitchThreshold);
625: } catch (NumberFormatException e) {
626: Token tok = grammar
627: .getOption("codeGenMakeSwitchThreshold");
628: antlrTool
629: .error(
630: "option 'codeGenMakeSwitchThreshold' must be an integer",
631: grammar.getClassName(), tok.getLine(),
632: tok.getColumn());
633: }
634: }
635:
636: // Lookup bitset-test threshold in the grammar generic options
637: if (grammar.hasOption("codeGenBitsetTestThreshold")) {
638: try {
639: bitsetTestThreshold = grammar
640: .getIntegerOption("codeGenBitsetTestThreshold");
641: //System.out.println("setting codeGenBitsetTestThreshold to " + bitsetTestThreshold);
642: } catch (NumberFormatException e) {
643: Token tok = grammar
644: .getOption("codeGenBitsetTestThreshold");
645: antlrTool
646: .error(
647: "option 'codeGenBitsetTestThreshold' must be an integer",
648: grammar.getClassName(), tok.getLine(),
649: tok.getColumn());
650: }
651: }
652:
653: // Lookup debug code-gen in the grammar generic options
654: if (grammar.hasOption("codeGenDebug")) {
655: Token t = grammar.getOption("codeGenDebug");
656: if (t.getText().equals("true")) {
657: //System.out.println("setting code-generation debug ON");
658: DEBUG_CODE_GENERATOR = true;
659: } else if (t.getText().equals("false")) {
660: //System.out.println("setting code-generation debug OFF");
661: DEBUG_CODE_GENERATOR = false;
662: } else {
663: antlrTool.error(
664: "option 'codeGenDebug' must be true or false",
665: grammar.getClassName(), t.getLine(), t
666: .getColumn());
667: }
668: }
669: }
670:
671: public void setTool(Tool tool) {
672: antlrTool = tool;
673: }
674: }
|