001: /*
002: * xtc - The eXTensible Compiler
003: * Copyright (C) 2007 Robert Grimm
004: *
005: * This program is free software; you can redistribute it and/or
006: * modify it under the terms of the GNU General Public License
007: * version 2 as published by the Free Software Foundation.
008: *
009: * This program is distributed in the hope that it will be useful,
010: * but WITHOUT ANY WARRANTY; without even the implied warranty of
011: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
012: * GNU General Public License for more details.
013: *
014: * You should have received a copy of the GNU General Public License
015: * along with this program; if not, write to the Free Software
016: * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
017: * USA.
018: */
019: package xtc.parser;
020:
021: import java.util.ArrayList;
022: import java.util.List;
023:
024: import xtc.Constants;
025:
026: import xtc.tree.Visitor;
027:
028: import xtc.type.AST;
029:
030: import xtc.util.Runtime;
031:
032: /**
033: * Visitor to recognize token-level productions.
034: *
035: * <p />This visitor recognizes the boundary between hierarchical and
036: * lexical syntax:
037: *
038: * <li>This visitor recognizes all lexical productions. A production
039: * is lexical if it is text-only or if it is void and only references
040: * other lexical productions (if any). As a result, a lexical
041: * production may not contain parser actions, semantic actions that
042: * reference {@link CodeGenerator#VALUE}, or bindings to {@link
043: * CodeGenerator#VALUE}.</li>
044: *
045: * <li>This visitor traverses the grammar, starting with its public
046: * productions. When encountering a lexical production it does not
047: * traverse into the production; rather, if the lexical production
048: * also consumes the input, this visitor marks the production as
049: * token-level.</li>
050: *
051: * <li>This visitor ensures that all void productions are correctly
052: * annotated with the {@link Properties#INPUT} property, indicating
053: * whether they consume the input.</li>
054: *
055: * </ol>
056: *
057: * <p />This visitor does <em>not</em> change a token-level
058: * production's type, so that later parser generator phases can still
059: * distinguish between productions that used to be text-only and that
060: * used to be void. It does however, remove any {@link
061: * Properties#TEXT_ONLY} property.
062: *
063: * <p />This visitor assumes that the entire grammar is contained in a
064: * single module and that text-only productions have been marked as
065: * such. It may perform faster if the grammar has been annotated with
066: * its real root.
067: *
068: * @see TextTester
069: * @see RootFinder
070: *
071: * @author Robert Grimm
072: * @version $Revision: 1.11 $
073: */
074: public class Tokenizer extends GrammarVisitor {
075:
076: /** Visitor to determine which productions are lexical. */
077: public static class Tester extends Visitor {
078:
079: /** The runtime. */
080: protected final Runtime runtime;
081:
082: /** The analyzer utility. */
083: protected final Analyzer analyzer;
084:
085: /** The flag for whether the current production is lexical. */
086: protected boolean isLexical;
087:
088: /**
089: * Create a new lexical tester.
090: *
091: * @param runtime The runtime.
092: * @param analyzer The analyzer utility.
093: */
094: public Tester(Runtime runtime, Analyzer analyzer) {
095: this .runtime = runtime;
096: this .analyzer = analyzer;
097: }
098:
099: /**
100: * Mark the specified production as lexical.
101: *
102: * @param p The production.
103: */
104: protected void mark(Production p) {
105: if (runtime.test("optionVerbose")) {
106: System.err.println("[Recognizing " + p.qName
107: + " as lexical syntax]");
108: }
109: p.setProperty(Properties.LEXICAL, Boolean.TRUE);
110: }
111:
112: /** Visit the specified grammar. */
113: public void visit(Module m) {
114: // Initialize the per-grammar state.
115: analyzer.register(this );
116: analyzer.init(m);
117:
118: // Process the productions.
119: for (Production p : m.productions) {
120: // Make sure that the production has not been processed
121: // already and that it returns a string.
122: if (analyzer.isProcessed(p.qName)) {
123: continue;
124: } else if (p.getBooleanProperty(Properties.TEXT_ONLY)) {
125: mark(p);
126: analyzer.processed(p.qName);
127: continue;
128: } else if (!AST.isVoid(p.type)) {
129: analyzer.processed(p.qName);
130: continue;
131: }
132:
133: // Clear the per-production state.
134: isLexical = true;
135:
136: // Process the production.
137: analyzer.process(p);
138:
139: // Tabulate the results.
140: if (isLexical) {
141: // All visited productions are guaranteed to be lexical.
142: for (NonTerminal nt : analyzer.working()) {
143: // This lookup is guaranteed to work, as the production's
144: // fully qualified name was added by visit(Production).
145: Production p2 = analyzer.lookup(nt);
146: mark(p2);
147: analyzer.processed(p2.qName);
148: }
149:
150: } else {
151: // We only know that the current production is not lexical.
152: analyzer.processed(p.qName);
153: }
154: }
155: }
156:
157: /** Visit the specified production. */
158: public void visit(Production p) {
159: Object closure = analyzer.enter(p);
160: analyzer.workingOn(p.qName);
161: dispatch(p.choice);
162: analyzer.exit(closure);
163: }
164:
165: /** Visit the specified ordered choice. */
166: public void visit(OrderedChoice c) {
167: for (Sequence alt : c.alternatives) {
168: dispatch(alt);
169: if (!isLexical) {
170: // We don't need to look any further.
171: return;
172: }
173: }
174: }
175:
176: /** Visit the specified sequence. */
177: public void visit(Sequence s) {
178: for (Element e : s.elements) {
179: dispatch(e);
180: if (!isLexical) {
181: // We don't need to look any further.
182: return;
183: }
184: }
185: }
186:
187: /** Visit the specified semantic predicate. */
188: public void visit(SemanticPredicate p) {
189: // Ignore the semantic action.
190: }
191:
192: /** Visit the specified binding. */
193: public void visit(Binding b) {
194: // We allow bindings in lexical productions, so that they can
195: // contain semantic predicates. However, we disallow a binding to
196: // CodeGenerator.VALUE.
197: if (CodeGenerator.VALUE.equals(b.name)) {
198: isLexical = false;
199: } else {
200: dispatch(b.element);
201: }
202: }
203:
204: /** Visit the specified nonterminal. */
205: public void visit(NonTerminal nt) {
206: Production p;
207:
208: try {
209: p = analyzer.lookup(nt);
210: } catch (IllegalArgumentException x) {
211: // Too many productions. We assume the worst.
212: isLexical = false;
213: return;
214: }
215:
216: if (null == p) {
217: // No such production. We assume the worst.
218: isLexical = false;
219:
220: } else if (analyzer.isProcessed(p.qName)) {
221: // If the corresponding production has already been processed,
222: // make sure it is lexical.
223: if (!p.getBooleanProperty(Properties.LEXICAL)) {
224: isLexical = false;
225: }
226:
227: } else if (!analyzer.isBeingWorkedOn(p.qName)) {
228: // The production has not been processed and is not yet under
229: // consideration. If is text-only, accept it. If it is void,
230: // check it out.
231: if (p.getBooleanProperty(Properties.TEXT_ONLY)) {
232: // Nothing to do.
233: } else if (AST.isVoid(p.type)) {
234: dispatch(p);
235: } else {
236: isLexical = false;
237: }
238: }
239: }
240:
241: /** Visit the specified character case. */
242: public void visit(CharCase c) {
243: dispatch(c.element);
244: }
245:
246: /** Visit the specified character switch. */
247: public void visit(CharSwitch s) {
248: for (CharCase kase : s.cases) {
249: dispatch(kase);
250: if (!isLexical) {
251: // We don't need to look any further.
252: return;
253: }
254: }
255: dispatch(s.base);
256: }
257:
258: /** Visit the specified terminal. */
259: public void visit(Terminal t) {
260: // Nothing to do. Terminals are lexical.
261: }
262:
263: /**
264: * Visit the specified unary operator. This method provides the
265: * default implementation for repetitions, options, syntactic
266: * predicates, voided elements, and string matches.
267: */
268: public void visit(UnaryOperator op) {
269: dispatch(op.element);
270: }
271:
272: /** Visit the specified null literal. */
273: public void visit(NullLiteral l) {
274: // Nothing to do.
275: }
276:
277: /** Visit the specified node marker. */
278: public void visit(NodeMarker m) {
279: isLexical = false;
280: }
281:
282: /** Visit the specified action. */
283: public void visit(Action a) {
284: if (a.setsValue())
285: isLexical = false;
286: }
287:
288: /** Visit the specified parser action. */
289: public void visit(ParserAction pa) {
290: isLexical = false;
291: }
292:
293: /**
294: * Visit the specified element. This method provides the default
295: * implementation for parse tree nodes and value elements.
296: */
297: public void visit(Element e) {
298: isLexical = false;
299: }
300:
301: }
302:
303: // ==========================================================================
304:
305: /**
306: * Create a new tokenizer.
307: *
308: * @param runtime The runtime.
309: * @param analyzer The analyzer utility.
310: */
311: public Tokenizer(Runtime runtime, Analyzer analyzer) {
312: super (runtime, analyzer);
313: }
314:
315: /** Visit the specified grammar. */
316: public Object visit(Module m) {
317: // Recognize lexical syntax first.
318: new Tester(runtime, analyzer).dispatch(m);
319:
320: // Initialize the per-grammar state.
321: analyzer.register(this );
322: analyzer.init(m);
323:
324: // Make sure that all lexical and void productions are tested for
325: // whether they consume the input.
326: for (Production p : m.productions) {
327: if (p.getBooleanProperty(Properties.LEXICAL)
328: || AST.isVoid(p.type)) {
329: analyzer.notWorkingOnAny();
330: analyzer.consumesInput(p.qName);
331: }
332: }
333:
334: // Determine which productions to process.
335: List<Production> todo;
336: if (m.hasProperty(Properties.ROOT)) {
337: todo = new ArrayList<Production>(1);
338: todo.add(analyzer.lookup((NonTerminal) m
339: .getProperty(Properties.ROOT)));
340: } else {
341: todo = m.productions;
342: }
343:
344: // Process the productions.
345: for (Production p : todo) {
346: // Skip processed or non-public productions.
347: if (analyzer.isProcessed(p.qName)
348: || (!p.hasAttribute(Constants.ATT_PUBLIC))) {
349: continue;
350: }
351:
352: // Mark production as processed to avoid recursive processing.
353: analyzer.processed(p.qName);
354:
355: if (p.getBooleanProperty(Properties.LEXICAL)) {
356: // We have reached a lexical production. If it consumes the
357: // input, we mark it as token-level.
358: analyzer.notWorkingOnAny();
359: if (analyzer.consumesInput(p.qName)) {
360: markToken(p, runtime.test("optionVerbose"));
361: }
362: } else {
363: // Recurse into the production.
364: analyzer.process(p);
365: }
366: }
367:
368: // Done.
369: return null;
370: }
371:
372: /** Visit the specified nonterminal. */
373: public Element visit(NonTerminal nt) {
374: FullProduction p = analyzer.lookup(nt);
375:
376: if (!analyzer.isProcessed(p.qName)) {
377: analyzer.processed(p.qName);
378: if (p.getBooleanProperty(Properties.LEXICAL)) {
379: if (analyzer.consumesInput(nt)) {
380: markToken(p, runtime.test("optionVerbose"));
381: }
382: } else {
383: dispatch(p);
384: }
385: }
386:
387: return nt;
388: }
389:
390: /**
391: * Mark the specified production as token-level. This method sets
392: * the specified production's {@link Properties#TOKEN} property and
393: * removes any {@link Properties#TEXT_ONLY} property. It does,
394: * however, <em>not</em> adjust the production's type to
395: * <code>Token</code>.
396: *
397: * @param p The production.
398: */
399: public static void markToken(Production p, boolean verbose) {
400: if (verbose) {
401: System.err.println("[Recognizing " + p.qName
402: + " as token-level]");
403: }
404: p.setProperty(Properties.TOKEN, Boolean.TRUE);
405: p.removeProperty(Properties.TEXT_ONLY);
406: }
407:
408: }
|