001: /*
002: * Copyright (C) Chaperon. All rights reserved.
003: * -------------------------------------------------------------------------
004: * This software is published under the terms of the Apache Software License
005: * version 1.1, a copy of which has been included with this distribution in
006: * the LICENSE file.
007: */
008:
009: package net.sourceforge.chaperon.process;
010:
011: import net.sourceforge.chaperon.common.Decoder;
012:
013: import org.apache.commons.logging.Log;
014:
015: import org.xml.sax.*;
016: import org.xml.sax.ext.LexicalHandler;
017: import org.xml.sax.helpers.AttributesImpl;
018: import org.xml.sax.helpers.LocatorImpl;
019:
020: /**
021: * The processor convert a stream text into lexical tokens, like a tokenizer.
022: *
023: * @author <a href="mailto:stephan@apache.org">Stephan Michels </a>
024: * @version CVS $Id: LexicalProcessor.java,v 1.22 2004/01/04 16:54:34 benedikta Exp $
025: */
026: public class LexicalProcessor implements ContentHandler, LexicalHandler {
027: public static final String NS = "http://chaperon.sourceforge.net/schema/text/1.0";
028: public static final String TEXT = "text";
029: public static final String NS_OUTPUT = "http://chaperon.sourceforge.net/schema/lexer/2.0";
030: public static final String OUTPUT = "output";
031: public static final String LEXEME = "lexeme";
032: public static final String GROUP = "group";
033: public static final String ERROR = "error";
034: private ContentHandler contentHandler = null;
035: private LexicalHandler lexicalHandler = null;
036: private static final int STATE_OUTSIDE = 0;
037: private static final int STATE_TEXT = 1;
038: private int state = STATE_OUTSIDE;
039: private Locator locator = null;
040: private LocatorImpl locatorImpl = null;
041: private LexicalAutomaton automaton = null;
042: private Log log = null;
043: private boolean grouping = false;
044: private boolean localizable = false;
045: private String source;
046: private int lineNumber;
047: private int columnNumber;
048: private StringBuffer buffer = null;
049: private char[] text = null;
050:
051: /**
052: * Create a new lexical processor.
053: */
054: public LexicalProcessor() {
055: }
056:
057: /**
058: * Create a new lexical processor.
059: *
060: * @param automaton Lexical automaton, which should be used.
061: * @param handler Handler, which should receives the events.
062: */
063: public LexicalProcessor(LexicalAutomaton automaton) {
064: this .automaton = automaton;
065: }
066:
067: /**
068: * Set the lexical automaton, which the processor should use.
069: *
070: * @param automaton Lexical automaton, which should be used.
071: */
072: public void setLexicalAutomaton(LexicalAutomaton automaton) {
073: this .automaton = automaton;
074: }
075:
076: /**
077: * Set the <code>ContentHandler</code> that will receive XML data.
078: */
079: public void setContentHandler(ContentHandler handler) {
080: this .contentHandler = handler;
081: }
082:
083: /**
084: * Set the <code>LexicalHandler</code> that will receive XML data.
085: */
086: public void setLexicalHandler(LexicalHandler handler) {
087: this .lexicalHandler = handler;
088: }
089:
090: /**
091: * Set the log, which should be used.
092: *
093: * @param log Log.
094: */
095: public void setLog(Log log) {
096: this .log = log;
097: }
098:
099: public void setGrouping(boolean grouping) {
100: this .grouping = grouping;
101: }
102:
103: public void setLocalizable(boolean localizable) {
104: this .localizable = localizable;
105: }
106:
107: /**
108: * Receive an object for locating the origin of SAX document events.
109: */
110: public void setDocumentLocator(Locator locator) {
111: this .locator = locator;
112: this .locatorImpl = null;
113: if (locator != null) {
114: this .locatorImpl = new LocatorImpl(locator);
115: contentHandler.setDocumentLocator(locatorImpl);
116: }
117: }
118:
119: /**
120: * Receive notification of the beginning of a document.
121: */
122: public void startDocument() throws SAXException {
123: if (locatorImpl != null) {
124: locatorImpl.setLineNumber(locator.getLineNumber());
125: locatorImpl.setColumnNumber(locator.getColumnNumber());
126: }
127:
128: contentHandler.startDocument();
129: state = STATE_OUTSIDE;
130:
131: buffer = new StringBuffer();
132: }
133:
134: /**
135: * Receive notification of the beginning of an element.
136: */
137: public void startElement(String namespaceURI, String localName,
138: String qName, Attributes atts) throws SAXException {
139: if (state == STATE_OUTSIDE) {
140: if ((namespaceURI != null) && (namespaceURI.equals(NS))
141: && (localName.equals(TEXT))) {
142: state = STATE_TEXT;
143: buffer = new StringBuffer();
144:
145: if (atts.getValue("source") != null)
146: source = atts.getValue("source");
147: else if (locator != null)
148: source = locator.getSystemId();
149: else
150: source = "unknown";
151:
152: if (atts.getValue("column") != null)
153: columnNumber = Integer.parseInt(atts
154: .getValue("column"));
155: else if (locator != null)
156: columnNumber = locator.getColumnNumber();
157: else
158: columnNumber = 1;
159:
160: if (atts.getValue("line") != null)
161: lineNumber = Integer
162: .parseInt(atts.getValue("line"));
163: else if (locator != null)
164: lineNumber = locator.getLineNumber();
165: else
166: lineNumber = 1;
167: } else
168: contentHandler.startElement(namespaceURI, localName,
169: qName, atts);
170: } else if (state == STATE_TEXT)
171: throw new SAXException("Unexpected start element '" + qName
172: + "'.");
173: }
174:
175: /**
176: * Receive notification of character data.
177: */
178: public void characters(char[] ch, int start, int length)
179: throws SAXException {
180: if (state == STATE_OUTSIDE)
181: contentHandler.characters(ch, start, length);
182: else if (state == STATE_TEXT)
183: buffer.append(ch, start, length);
184: }
185:
186: /**
187: * Receive notification of ignorable whitespace in element content.
188: */
189: public void ignorableWhitespace(char[] ch, int start, int length)
190: throws SAXException {
191: if (state == STATE_OUTSIDE)
192: contentHandler.characters(ch, start, length);
193: else if (state == STATE_TEXT)
194: buffer.append(ch, start, length);
195: }
196:
197: /**
198: * Receive notification of the end of an element.
199: */
200: public void endElement(String namespaceURI, String localName,
201: String qName) throws SAXException {
202: if (state == STATE_OUTSIDE)
203: contentHandler.endElement(namespaceURI, localName, qName);
204: else if (state == STATE_TEXT) {
205: if ((namespaceURI != null) && (namespaceURI.equals(NS))
206: && (localName.equals(TEXT))) {
207: state = STATE_OUTSIDE;
208:
209: handleEndDocument();
210: } else
211: throw new SAXException("Unexpected end element '"
212: + qName + "'.");
213: }
214: }
215:
216: /**
217: * Begin the scope of a prefix-URI Namespace mapping.
218: */
219: public void startPrefixMapping(String prefix, String uri)
220: throws SAXException {
221: if (locatorImpl != null) {
222: locatorImpl.setLineNumber(locator.getLineNumber());
223: locatorImpl.setColumnNumber(locator.getColumnNumber());
224: }
225:
226: contentHandler.startPrefixMapping(prefix, uri);
227: }
228:
229: /**
230: * End the scope of a prefix-URI mapping.
231: */
232: public void endPrefixMapping(String prefix) throws SAXException {
233: if (locatorImpl != null) {
234: locatorImpl.setLineNumber(locator.getLineNumber());
235: locatorImpl.setColumnNumber(locator.getColumnNumber());
236: }
237:
238: contentHandler.endPrefixMapping(prefix);
239: }
240:
241: /**
242: * Receive notification of a processing instruction.
243: */
244: public void processingInstruction(String target, String data)
245: throws SAXException {
246: if (locatorImpl != null) {
247: locatorImpl.setLineNumber(locator.getLineNumber());
248: locatorImpl.setColumnNumber(locator.getColumnNumber());
249: }
250:
251: if (state == STATE_OUTSIDE)
252: contentHandler.processingInstruction(target, data);
253: }
254:
255: /**
256: * Receive notification of a skipped entity.
257: */
258: public void skippedEntity(String name) throws SAXException {
259: if (locatorImpl != null) {
260: locatorImpl.setLineNumber(locator.getLineNumber());
261: locatorImpl.setColumnNumber(locator.getColumnNumber());
262: }
263:
264: if (state == STATE_OUTSIDE)
265: contentHandler.skippedEntity(name);
266: }
267:
268: /**
269: * Receive notification of the end of a document.
270: */
271: public void endDocument() throws SAXException {
272: if (locatorImpl != null) {
273: locatorImpl.setLineNumber(locator.getLineNumber());
274: locatorImpl.setColumnNumber(locator.getColumnNumber());
275: }
276:
277: if (state == STATE_OUTSIDE)
278: contentHandler.endDocument();
279: }
280:
281: /**
282: * Report the start of DTD declarations, if any.
283: */
284: public void startDTD(String name, String publicId, String systemId)
285: throws SAXException {
286: if (lexicalHandler != null)
287: lexicalHandler.startDTD(name, publicId, systemId);
288: }
289:
290: /**
291: * Report the end of DTD declarations.
292: */
293: public void endDTD() throws SAXException {
294: if (lexicalHandler != null)
295: lexicalHandler.endDTD();
296: }
297:
298: /**
299: * Report the beginning of an entity.
300: */
301: public void startEntity(String name) throws SAXException {
302: if (lexicalHandler != null)
303: lexicalHandler.startEntity(name);
304: }
305:
306: /**
307: * Report the end of an entity.
308: */
309: public void endEntity(String name) throws SAXException {
310: if (lexicalHandler != null)
311: lexicalHandler.endEntity(name);
312: }
313:
314: /**
315: * Report the start of a CDATA section.
316: */
317: public void startCDATA() throws SAXException {
318: if (lexicalHandler != null)
319: lexicalHandler.startCDATA();
320: }
321:
322: /**
323: * Report the end of a CDATA section.
324: */
325: public void endCDATA() throws SAXException {
326: if (lexicalHandler != null)
327: lexicalHandler.endCDATA();
328: }
329:
330: /**
331: * Report an XML comment anywhere in the document.
332: */
333: public void comment(char[] ch, int start, int len)
334: throws SAXException {
335: if (lexicalHandler != null)
336: lexicalHandler.comment(ch, start, len);
337: }
338:
339: /**
340: * Receives the notification, that the text stream ended.
341: */
342: public void handleEndDocument() throws SAXException {
343: PatternProcessor processor = new PatternProcessor();
344: text = buffer.toString().toCharArray();
345:
346: int position = 0;
347:
348: if (locatorImpl != null) {
349: locatorImpl.setSystemId(source);
350: locatorImpl.setLineNumber(lineNumber);
351: locatorImpl.setColumnNumber(columnNumber);
352: }
353:
354: contentHandler.startPrefixMapping("", NS_OUTPUT);
355:
356: AttributesImpl atts = new AttributesImpl();
357: if (localizable)
358: atts.addAttribute("", "source", "source", "CDATA", source);
359:
360: contentHandler.startElement(NS_OUTPUT, OUTPUT, OUTPUT,
361: new AttributesImpl());
362:
363: StringBuffer unrecognized = new StringBuffer();
364: while (position < text.length) {
365: String tokensymbol = null;
366: String tokentext = null;
367:
368: for (int lexemeindex = automaton.getLexemeCount() - 1; lexemeindex >= 0; lexemeindex--) {
369: processor.setPatternAutomaton(automaton
370: .getLexemeDefinition(lexemeindex));
371:
372: if ((processor.match(text, position))
373: && ((tokentext == null) || (processor
374: .getGroup().length() >= tokentext
375: .length()))) {
376: tokensymbol = automaton
377: .getLexemeSymbol(lexemeindex);
378: tokentext = processor.getGroup();
379: }
380: }
381:
382: if ((tokentext != null) && (tokentext.length() == 0))
383: log.warn("Lexical processor recognized empty lexeme '"
384: + tokensymbol + "'");
385:
386: if ((tokentext != null) && (tokentext.length() > 0)) {
387: if (unrecognized.length() > 0) {
388: if (log != null)
389: log.debug("Text was not recognized "
390: + Decoder.toString(unrecognized
391: .toString()));
392:
393: atts = new AttributesImpl();
394: atts.addAttribute("", "text", "text", "CDATA",
395: unrecognized.toString());
396: if (localizable) {
397: atts.addAttribute("", "line", "line", "CDATA",
398: String.valueOf(lineNumber));
399: atts.addAttribute("", "column", "column",
400: "CDATA", String.valueOf(columnNumber));
401: }
402:
403: contentHandler.startElement(NS_OUTPUT, ERROR,
404: ERROR, atts);
405: contentHandler.endElement(NS_OUTPUT, ERROR, ERROR);
406:
407: increasePosition(position - unrecognized.length(),
408: unrecognized.length());
409:
410: unrecognized = new StringBuffer();
411: }
412:
413: if (tokensymbol != null) {
414: if (log != null)
415: log.debug("Recognize token " + tokensymbol
416: + " with "
417: + Decoder.toString(tokentext));
418:
419: if (locatorImpl != null) {
420: locatorImpl.setLineNumber(locator
421: .getLineNumber());
422: locatorImpl.setColumnNumber(locator
423: .getColumnNumber());
424: }
425:
426: atts = new AttributesImpl();
427:
428: atts.addAttribute("", "symbol", "symbol", "CDATA",
429: tokensymbol);
430: atts.addAttribute("", "text", "text", "CDATA",
431: tokentext);
432: if (localizable) {
433: atts.addAttribute("", "line", "line", "CDATA",
434: String.valueOf(lineNumber));
435: atts.addAttribute("", "column", "column",
436: "CDATA", String.valueOf(columnNumber));
437: }
438:
439: contentHandler.startElement(NS_OUTPUT, LEXEME,
440: LEXEME, atts);
441:
442: if (grouping)
443: for (int i = 1; i < processor.getGroupCount(); i++) {
444: AttributesImpl groupatts = new AttributesImpl();
445: groupatts.addAttribute("", "text", "text",
446: "CDATA", processor.getGroup(i));
447: contentHandler.startElement(NS_OUTPUT,
448: GROUP, GROUP, groupatts);
449: contentHandler.endElement(NS_OUTPUT, GROUP,
450: GROUP);
451: }
452:
453: contentHandler
454: .endElement(NS_OUTPUT, LEXEME, LEXEME);
455: } else if (log != null)
456: log.debug("Ignore lexeme with "
457: + Decoder.toString(tokentext));
458:
459: if (locatorImpl != null) {
460: locatorImpl.setColumnNumber(columnNumber);
461: locatorImpl.setLineNumber(lineNumber);
462: }
463:
464: position += tokentext.length();
465:
466: increasePosition(position - tokentext.length(),
467: tokentext.length());
468: } else {
469: if (locatorImpl != null) {
470: locatorImpl.setColumnNumber(columnNumber);
471: locatorImpl.setLineNumber(lineNumber);
472: }
473:
474: unrecognized.append(text[position]);
475: position++;
476: }
477: }
478:
479: if (unrecognized.length() > 0) {
480: if (log != null)
481: log.debug("Text was not recognized "
482: + Decoder.toString(unrecognized.toString()));
483:
484: atts = new AttributesImpl();
485: atts.addAttribute("", "text", "text", "CDATA", unrecognized
486: .toString());
487: if (localizable) {
488: atts.addAttribute("", "line", "line", "CDATA", String
489: .valueOf(lineNumber));
490: atts.addAttribute("", "column", "column", "CDATA",
491: String.valueOf(columnNumber));
492: }
493:
494: contentHandler.startElement(NS_OUTPUT, ERROR, ERROR, atts);
495: contentHandler.endElement(NS_OUTPUT, ERROR, ERROR);
496:
497: System.out.println("push \"" + unrecognized.toString()
498: + "\"");
499: increasePosition(position - unrecognized.length(),
500: unrecognized.length());
501: }
502:
503: if (locatorImpl != null) {
504: locatorImpl.setLineNumber(locator.getLineNumber());
505: locatorImpl.setColumnNumber(locator.getColumnNumber());
506: }
507:
508: contentHandler.endElement(NS_OUTPUT, OUTPUT, OUTPUT);
509: contentHandler.endPrefixMapping("");
510: }
511:
512: private void increasePosition(int position, int length) {
513: for (int i = position; i < (position + length); i++) {
514: if (text[i] == '\n') {
515: columnNumber = 1;
516: lineNumber++;
517: } else if ((text[i] == '\r')
518: && ((i == (text.length - 1)) || (text[i + 1] != '\n'))) {
519: columnNumber = 1;
520: lineNumber++;
521: } else
522: columnNumber++;
523: }
524: }
525: }
|