001: package persistence.antlr;
002:
003: /* ANTLR Translator Generator
004: * Project led by Terence Parr at http://www.jGuru.com
005: * Software rights: http://www.antlr.org/license.html
006: *
007: */
008:
009: import java.io.PrintWriter;
010: import java.io.IOException;
011: import java.io.FileWriter;
012:
013: import persistence.antlr.collections.impl.Vector;
014: import persistence.antlr.collections.impl.BitSet;
015:
016: /**A generic ANTLR code generator. All code generators
017: * Derive from this class.
018: *
019: * <p>
020: * A CodeGenerator knows about a Grammar data structure and
021: * a grammar analyzer. The Grammar is walked to generate the
022: * appropriate code for both a parser and lexer (if present).
023: * This interface may change slightly so that the lexer is
024: * itself living inside of a Grammar object (in which case,
025: * this class generates only one recognizer). The main method
026: * to call is <tt>gen()</tt>, which initiates all code gen.
027: *
028: * <p>
029: * The interaction of the code generator with the analyzer is
030: * simple: each subrule block calls deterministic() before generating
031: * code for the block. Method deterministic() sets lookahead caches
032: * in each Alternative object. Technically, a code generator
033: * doesn't need the grammar analyzer if all lookahead analysis
034: * is done at runtime, but this would result in a slower parser.
035: *
036: * <p>
037: * This class provides a set of support utilities to handle argument
038: * list parsing and so on.
039: *
040: * @author Terence Parr, John Lilley
041: * @version 2.00a
042: * @see persistence.antlr.JavaCodeGenerator
043: * @see persistence.antlr.DiagnosticCodeGenerator
044: * @see persistence.antlr.LLkAnalyzer
045: * @see persistence.antlr.Grammar
046: * @see persistence.antlr.AlternativeElement
047: * @see persistence.antlr.Lookahead
048: */
049: public abstract class CodeGenerator {
050: protected persistence.antlr.Tool antlrTool;
051:
052: /** Current tab indentation for code output */
053: protected int tabs = 0;
054:
055: /** Current output Stream */
056: transient protected PrintWriter currentOutput; // SAS: for proper text i/o
057:
058: /** The grammar for which we generate code */
059: protected Grammar grammar = null;
060:
061: /** List of all bitsets that must be dumped. These are Vectors of BitSet. */
062: protected Vector bitsetsUsed;
063:
064: /** The grammar behavior */
065: protected DefineGrammarSymbols behavior;
066:
067: /** The LLk analyzer */
068: protected LLkGrammarAnalyzer analyzer;
069:
070: /** Object used to format characters in the target language.
071: * subclass must initialize this to the language-specific formatter
072: */
073: protected CharFormatter charFormatter;
074:
075: /** Use option "codeGenDebug" to generate debugging output */
076: protected boolean DEBUG_CODE_GENERATOR = false;
077:
078: /** Default values for code-generation thresholds */
079: protected static final int DEFAULT_MAKE_SWITCH_THRESHOLD = 2;
080: protected static final int DEFAULT_BITSET_TEST_THRESHOLD = 4;
081:
082: /** If there are more than 8 long words to init in a bitset,
083: * try to optimize it; e.g., detect runs of -1L and 0L.
084: */
085: protected static final int BITSET_OPTIMIZE_INIT_THRESHOLD = 8;
086:
087: /** This is a hint for the language-specific code generator.
088: * A switch() or language-specific equivalent will be generated instead
089: * of a series of if/else statements for blocks with number of alternates
090: * greater than or equal to this number of non-predicated LL(1) alternates.
091: * This is modified by the grammar option "codeGenMakeSwitchThreshold"
092: */
093: protected int makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
094:
095: /** This is a hint for the language-specific code generator.
096: * A bitset membership test will be generated instead of an
097: * ORed series of LA(k) comparisions for lookahead sets with
098: * degree greater than or equal to this value.
099: * This is modified by the grammar option "codeGenBitsetTestThreshold"
100: */
101: protected int bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
102:
103: private static boolean OLD_ACTION_TRANSLATOR = true;
104:
105: public static String TokenTypesFileSuffix = "TokenTypes";
106: public static String TokenTypesFileExt = ".txt";
107:
108: /** Construct code generator base class */
109: public CodeGenerator() {
110: }
111:
112: /** Output a String to the currentOutput stream.
113: * Ignored if string is null.
114: * @param s The string to output
115: */
116: protected void _print(String s) {
117: if (s != null) {
118: currentOutput.print(s);
119: }
120: }
121:
122: /** Print an action without leading tabs, attempting to
123: * preserve the current indentation level for multi-line actions
124: * Ignored if string is null.
125: * @param s The action string to output
126: */
127: protected void _printAction(String s) {
128: if (s == null) {
129: return;
130: }
131:
132: // Skip leading newlines, tabs and spaces
133: int start = 0;
134: while (start < s.length()
135: && Character.isSpaceChar(s.charAt(start))) {
136: start++;
137: }
138:
139: // Skip leading newlines, tabs and spaces
140: int end = s.length() - 1;
141: while (end > start && Character.isSpaceChar(s.charAt(end))) {
142: end--;
143: }
144:
145: char c = 0;
146: for (int i = start; i <= end;) {
147: c = s.charAt(i);
148: i++;
149: boolean newline = false;
150: switch (c) {
151: case '\n':
152: newline = true;
153: break;
154: case '\r':
155: if (i <= end && s.charAt(i) == '\n') {
156: i++;
157: }
158: newline = true;
159: break;
160: default:
161: currentOutput.print(c);
162: break;
163: }
164: if (newline) {
165: currentOutput.println();
166: printTabs();
167: // Absorb leading whitespace
168: while (i <= end && Character.isSpaceChar(s.charAt(i))) {
169: i++;
170: }
171: newline = false;
172: }
173: }
174: currentOutput.println();
175: }
176:
177: /** Output a String followed by newline, to the currentOutput stream.
178: * Ignored if string is null.
179: * @param s The string to output
180: */
181: protected void _println(String s) {
182: if (s != null) {
183: currentOutput.println(s);
184: }
185: }
186:
187: /** Test if a set element array represents a contiguous range.
188: * @param elems The array of elements representing the set, usually from BitSet.toArray().
189: * @return true if the elements are a contiguous range (with two or more).
190: */
191: public static boolean elementsAreRange(int[] elems) {
192: if (elems.length == 0) {
193: return false;
194: }
195: int begin = elems[0];
196: int end = elems[elems.length - 1];
197: if (elems.length <= 2) {
198: // Not enough elements for a range expression
199: return false;
200: }
201: if (end - begin + 1 > elems.length) {
202: // The set does not represent a contiguous range
203: return false;
204: }
205: int v = begin + 1;
206: for (int i = 1; i < elems.length - 1; i++) {
207: if (v != elems[i]) {
208: // The set does not represent a contiguous range
209: return false;
210: }
211: v++;
212: }
213: return true;
214: }
215:
216: /** Get the identifier portion of an argument-action token.
217: * The ID of an action is assumed to be a trailing identifier.
218: * Specific code-generators may want to override this
219: * if the language has unusual declaration syntax.
220: * @param t The action token
221: * @return A string containing the text of the identifier
222: */
223: protected String extractIdOfAction(Token t) {
224: return extractIdOfAction(t.getText(), t.getLine(), t
225: .getColumn());
226: }
227:
228: /** Get the identifier portion of an argument-action.
229: * The ID of an action is assumed to be a trailing identifier.
230: * Specific code-generators may want to override this
231: * if the language has unusual declaration syntax.
232: * @param s The action text
233: * @param line Line used for error reporting.
234: * @param column Line used for error reporting.
235: * @return A string containing the text of the identifier
236: */
237: protected String extractIdOfAction(String s, int line, int column) {
238: s = removeAssignmentFromDeclaration(s);
239: // Search back from the end for a non alphanumeric. That marks the
240: // beginning of the identifier
241: for (int i = s.length() - 2; i >= 0; i--) {
242: // TODO: make this work for language-independent identifiers?
243: if (!Character.isLetterOrDigit(s.charAt(i))
244: && s.charAt(i) != '_') {
245: // Found end of type part
246: return s.substring(i + 1);
247: }
248: }
249: // Something is bogus, but we cannot parse the language-specific
250: // actions any better. The compiler will have to catch the problem.
251: antlrTool.warning("Ill-formed action", grammar.getFilename(),
252: line, column);
253: return "";
254: }
255:
256: /** Get the type string out of an argument-action token.
257: * The type of an action is assumed to precede a trailing identifier
258: * Specific code-generators may want to override this
259: * if the language has unusual declaration syntax.
260: * @param t The action token
261: * @return A string containing the text of the type
262: */
263: protected String extractTypeOfAction(Token t) {
264: return extractTypeOfAction(t.getText(), t.getLine(), t
265: .getColumn());
266: }
267:
268: /** Get the type portion of an argument-action.
269: * The type of an action is assumed to precede a trailing identifier
270: * Specific code-generators may want to override this
271: * if the language has unusual declaration syntax.
272: * @param s The action text
273: * @param line Line used for error reporting.
274: * @return A string containing the text of the type
275: */
276: protected String extractTypeOfAction(String s, int line, int column) {
277: s = removeAssignmentFromDeclaration(s);
278: // Search back from the end for a non alphanumeric. That marks the
279: // beginning of the identifier
280: for (int i = s.length() - 2; i >= 0; i--) {
281: // TODO: make this work for language-independent identifiers?
282: if (!Character.isLetterOrDigit(s.charAt(i))
283: && s.charAt(i) != '_') {
284: // Found end of type part
285: return s.substring(0, i + 1);
286: }
287: }
288: // Something is bogus, but we cannot parse the language-specific
289: // actions any better. The compiler will have to catch the problem.
290: antlrTool.warning("Ill-formed action", grammar.getFilename(),
291: line, column);
292: return "";
293: }
294:
295: /** Generate the code for all grammars
296: */
297: public abstract void gen();
298:
299: /** Generate code for the given grammar element.
300: * @param action The {...} action to generate
301: */
302: public abstract void gen(ActionElement action);
303:
304: /** Generate code for the given grammar element.
305: * @param blk The "x|y|z|..." block to generate
306: */
307: public abstract void gen(AlternativeBlock blk);
308:
309: /** Generate code for the given grammar element.
310: * @param end The block-end element to generate. Block-end
311: * elements are synthesized by the grammar parser to represent
312: * the end of a block.
313: */
314: public abstract void gen(BlockEndElement end);
315:
316: /** Generate code for the given grammar element.
317: * @param atom The character literal reference to generate
318: */
319: public abstract void gen(CharLiteralElement atom);
320:
321: /** Generate code for the given grammar element.
322: * @param r The character-range reference to generate
323: */
324: public abstract void gen(CharRangeElement r);
325:
326: /** Generate the code for a parser */
327: public abstract void gen(LexerGrammar g) throws IOException;
328:
329: /** Generate code for the given grammar element.
330: * @param blk The (...)+ block to generate
331: */
332: public abstract void gen(OneOrMoreBlock blk);
333:
334: /** Generate the code for a parser */
335: public abstract void gen(ParserGrammar g) throws IOException;
336:
337: /** Generate code for the given grammar element.
338: * @param rr The rule-reference to generate
339: */
340: public abstract void gen(RuleRefElement rr);
341:
342: /** Generate code for the given grammar element.
343: * @param atom The string-literal reference to generate
344: */
345: public abstract void gen(StringLiteralElement atom);
346:
347: /** Generate code for the given grammar element.
348: * @param r The token-range reference to generate
349: */
350: public abstract void gen(TokenRangeElement r);
351:
352: /** Generate code for the given grammar element.
353: * @param atom The token-reference to generate
354: */
355: public abstract void gen(TokenRefElement atom);
356:
357: /** Generate code for the given grammar element.
358: * @param blk The tree to generate code for.
359: */
360: public abstract void gen(TreeElement t);
361:
362: /** Generate the code for a parser */
363: public abstract void gen(TreeWalkerGrammar g) throws IOException;
364:
365: /** Generate code for the given grammar element.
366: * @param wc The wildcard element to generate
367: */
368: public abstract void gen(WildcardElement wc);
369:
370: /** Generate code for the given grammar element.
371: * @param blk The (...)* block to generate
372: */
373: public abstract void gen(ZeroOrMoreBlock blk);
374:
375: /** Generate the token types as a text file for persistence across shared lexer/parser */
376: protected void genTokenInterchange(TokenManager tm)
377: throws IOException {
378: // Open the token output Java file and set the currentOutput stream
379: String fName = tm.getName() + TokenTypesFileSuffix
380: + TokenTypesFileExt;
381: currentOutput = antlrTool.openOutputFile(fName);
382:
383: println("// $ANTLR " + antlrTool.version + ": "
384: + antlrTool.fileMinusPath(antlrTool.grammarFile)
385: + " -> " + fName + "$");
386:
387: tabs = 0;
388:
389: // Header
390: println(tm.getName() + " // output token vocab name");
391:
392: // Generate a definition for each token type
393: Vector v = tm.getVocabulary();
394: for (int i = Token.MIN_USER_TYPE; i < v.size(); i++) {
395: String s = (String) v.elementAt(i);
396: if (DEBUG_CODE_GENERATOR) {
397: System.out.println("gen persistence file entry for: "
398: + s);
399: }
400: if (s != null && !s.startsWith("<")) {
401: // if literal, find label
402: if (s.startsWith("\"")) {
403: StringLiteralSymbol sl = (StringLiteralSymbol) tm
404: .getTokenSymbol(s);
405: if (sl != null && sl.label != null) {
406: print(sl.label + "=");
407: }
408: println(s + "=" + i);
409: } else {
410: print(s);
411: // check for a paraphrase
412: TokenSymbol ts = (TokenSymbol) tm.getTokenSymbol(s);
413: if (ts == null) {
414: antlrTool.warning("undefined token symbol: "
415: + s);
416: } else {
417: if (ts.getParaphrase() != null) {
418: print("(" + ts.getParaphrase() + ")");
419: }
420: }
421: println("=" + i);
422: }
423: }
424: }
425:
426: // Close the tokens output file
427: currentOutput.close();
428: currentOutput = null;
429: }
430:
431: /** Process a string for an simple expression for use in xx/action.g
432: * it is used to cast simple tokens/references to the right type for
433: * the generated language.
434: * @param str A String.
435: */
436: public String processStringForASTConstructor(String str) {
437: return str;
438: }
439:
440: /** Get a string for an expression to generate creation of an AST subtree.
441: * @param v A Vector of String, where each element is an expression in the target language yielding an AST node.
442: */
443: public abstract String getASTCreateString(Vector v);
444:
445: /** Get a string for an expression to generate creating of an AST node
446: * @param str The text of the arguments to the AST construction
447: */
448: public abstract String getASTCreateString(GrammarAtom atom,
449: String str);
450:
451: /** Given the index of a bitset in the bitset list, generate a unique name.
452: * Specific code-generators may want to override this
453: * if the language does not allow '_' or numerals in identifiers.
454: * @param index The index of the bitset in the bitset list.
455: */
456: protected String getBitsetName(int index) {
457: return "_tokenSet_" + index;
458: }
459:
460: public static String encodeLexerRuleName(String id) {
461: return "m" + id;
462: }
463:
464: public static String decodeLexerRuleName(String id) {
465: if (id == null) {
466: return null;
467: }
468: return id.substring(1, id.length());
469: }
470:
471: /** Map an identifier to it's corresponding tree-node variable.
472: * This is context-sensitive, depending on the rule and alternative
473: * being generated
474: * @param id The identifier name to map
475: * @param forInput true if the input tree node variable is to be returned, otherwise the output variable is returned.
476: * @return The mapped id (which may be the same as the input), or null if the mapping is invalid due to duplicates
477: */
478: public abstract String mapTreeId(String id, ActionTransInfo tInfo);
479:
480: /** Add a bitset to the list of bitsets to be generated.
481: * if the bitset is already in the list, ignore the request.
482: * Always adds the bitset to the end of the list, so the
483: * caller can rely on the position of bitsets in the list.
484: * The returned position can be used to format the bitset
485: * name, since it is invariant.
486: * @param p Bit set to mark for code generation
487: * @param forParser true if the bitset is used for the parser, false for the lexer
488: * @return The position of the bitset in the list.
489: */
490: protected int markBitsetForGen(BitSet p) {
491: // Is the bitset (or an identical one) already marked for gen?
492: for (int i = 0; i < bitsetsUsed.size(); i++) {
493: BitSet set = (BitSet) bitsetsUsed.elementAt(i);
494: if (p.equals(set)) {
495: // Use the identical one already stored
496: return i;
497: }
498: }
499:
500: // Add the new bitset
501: bitsetsUsed.appendElement(p.clone());
502: return bitsetsUsed.size() - 1;
503: }
504:
505: /** Output tab indent followed by a String, to the currentOutput stream.
506: * Ignored if string is null.
507: * @param s The string to output.
508: */
509: protected void print(String s) {
510: if (s != null) {
511: printTabs();
512: currentOutput.print(s);
513: }
514: }
515:
516: /** Print an action with leading tabs, attempting to
517: * preserve the current indentation level for multi-line actions
518: * Ignored if string is null.
519: * @param s The action string to output
520: */
521: protected void printAction(String s) {
522: if (s != null) {
523: printTabs();
524: _printAction(s);
525: }
526: }
527:
528: /** Output tab indent followed by a String followed by newline,
529: * to the currentOutput stream. Ignored if string is null.
530: * @param s The string to output
531: */
532: protected void println(String s) {
533: if (s != null) {
534: printTabs();
535: currentOutput.println(s);
536: }
537: }
538:
539: /** Output the current tab indentation. This outputs the number of tabs
540: * indicated by the "tabs" variable to the currentOutput stream.
541: */
542: protected void printTabs() {
543: for (int i = 1; i <= tabs; i++) {
544: currentOutput.print("\t");
545: }
546: }
547:
548: /** Lexically process $ and # references within the action.
549: * This will replace #id and #(...) with the appropriate
550: * function calls and/or variables etc...
551: */
552: protected abstract String processActionForSpecialSymbols(
553: String actionStr, int line, RuleBlock currentRule,
554: ActionTransInfo tInfo);
555:
556: public String getFOLLOWBitSet(String ruleName, int k) {
557: GrammarSymbol rs = grammar.getSymbol(ruleName);
558: if (!(rs instanceof RuleSymbol)) {
559: return null;
560: }
561: RuleBlock blk = ((RuleSymbol) rs).getBlock();
562: Lookahead follow = grammar.theLLkAnalyzer
563: .FOLLOW(k, blk.endNode);
564: String followSetName = getBitsetName(markBitsetForGen(follow.fset));
565: return followSetName;
566: }
567:
568: public String getFIRSTBitSet(String ruleName, int k) {
569: GrammarSymbol rs = grammar.getSymbol(ruleName);
570: if (!(rs instanceof RuleSymbol)) {
571: return null;
572: }
573: RuleBlock blk = ((RuleSymbol) rs).getBlock();
574: Lookahead first = grammar.theLLkAnalyzer.look(k, blk);
575: String firstSetName = getBitsetName(markBitsetForGen(first.fset));
576: return firstSetName;
577: }
578:
579: /**
580: * Remove the assignment portion of a declaration, if any.
581: * @param d the declaration
582: * @return the declaration without any assignment portion
583: */
584: protected String removeAssignmentFromDeclaration(String d) {
585: // If d contains an equal sign, then it's a declaration
586: // with an initialization. Strip off the initialization part.
587: if (d.indexOf('=') >= 0)
588: d = d.substring(0, d.indexOf('=')).trim();
589: return d;
590: }
591:
592: /** Set all fields back like one just created */
593: private void reset() {
594: tabs = 0;
595: // Allocate list of bitsets tagged for code generation
596: bitsetsUsed = new Vector();
597: currentOutput = null;
598: grammar = null;
599: DEBUG_CODE_GENERATOR = false;
600: makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD;
601: bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD;
602: }
603:
604: public static String reverseLexerRuleName(String id) {
605: return id.substring(1, id.length());
606: }
607:
608: public void setAnalyzer(LLkGrammarAnalyzer analyzer_) {
609: analyzer = analyzer_;
610: }
611:
612: public void setBehavior(DefineGrammarSymbols behavior_) {
613: behavior = behavior_;
614: }
615:
616: /** Set a grammar for the code generator to use */
617: protected void setGrammar(Grammar g) {
618: reset();
619: grammar = g;
620: // Lookup make-switch threshold in the grammar generic options
621: if (grammar.hasOption("codeGenMakeSwitchThreshold")) {
622: try {
623: makeSwitchThreshold = grammar
624: .getIntegerOption("codeGenMakeSwitchThreshold");
625: //System.out.println("setting codeGenMakeSwitchThreshold to " + makeSwitchThreshold);
626: } catch (NumberFormatException e) {
627: Token tok = grammar
628: .getOption("codeGenMakeSwitchThreshold");
629: antlrTool
630: .error(
631: "option 'codeGenMakeSwitchThreshold' must be an integer",
632: grammar.getClassName(), tok.getLine(),
633: tok.getColumn());
634: }
635: }
636:
637: // Lookup bitset-test threshold in the grammar generic options
638: if (grammar.hasOption("codeGenBitsetTestThreshold")) {
639: try {
640: bitsetTestThreshold = grammar
641: .getIntegerOption("codeGenBitsetTestThreshold");
642: //System.out.println("setting codeGenBitsetTestThreshold to " + bitsetTestThreshold);
643: } catch (NumberFormatException e) {
644: Token tok = grammar
645: .getOption("codeGenBitsetTestThreshold");
646: antlrTool
647: .error(
648: "option 'codeGenBitsetTestThreshold' must be an integer",
649: grammar.getClassName(), tok.getLine(),
650: tok.getColumn());
651: }
652: }
653:
654: // Lookup debug code-gen in the grammar generic options
655: if (grammar.hasOption("codeGenDebug")) {
656: Token t = grammar.getOption("codeGenDebug");
657: if (t.getText().equals("true")) {
658: //System.out.println("setting code-generation debug ON");
659: DEBUG_CODE_GENERATOR = true;
660: } else if (t.getText().equals("false")) {
661: //System.out.println("setting code-generation debug OFF");
662: DEBUG_CODE_GENERATOR = false;
663: } else {
664: antlrTool.error(
665: "option 'codeGenDebug' must be true or false",
666: grammar.getClassName(), t.getLine(), t
667: .getColumn());
668: }
669: }
670: }
671:
672: public void setTool(Tool tool) {
673: antlrTool = tool;
674: }
675: }
|