001: package com.quadcap.text.sax;
002:
003: /* Copyright 1999 - 2003 Quadcap Software. All rights reserved.
004: *
005: * This software is distributed under the Quadcap Free Software License.
006: * This software may be used or modified for any purpose, personal or
007: * commercial. Open Source redistributions are permitted. Commercial
008: * redistribution of larger works derived from, or works which bundle
009: * this software requires a "Commercial Redistribution License"; see
010: * http://www.quadcap.com/purchase.
011: *
012: * Redistributions qualify as "Open Source" under one of the following terms:
013: *
014: * Redistributions are made at no charge beyond the reasonable cost of
015: * materials and delivery.
016: *
017: * Redistributions are accompanied by a copy of the Source Code or by an
018: * irrevocable offer to provide a copy of the Source Code for up to three
019: * years at the cost of materials and delivery. Such redistributions
020: * must allow further use, modification, and redistribution of the Source
021: * Code under substantially the same terms as this license.
022: *
023: * Redistributions of source code must retain the copyright notices as they
024: * appear in each source code file, these license terms, and the
025: * disclaimer/limitation of liability set forth as paragraph 6 below.
026: *
027: * Redistributions in binary form must reproduce this Copyright Notice,
028: * these license terms, and the disclaimer/limitation of liability set
029: * forth as paragraph 6 below, in the documentation and/or other materials
030: * provided with the distribution.
031: *
032: * The Software is provided on an "AS IS" basis. No warranty is
033: * provided that the Software is free of defects, or fit for a
034: * particular purpose.
035: *
036: * Limitation of Liability. Quadcap Software shall not be liable
037: * for any damages suffered by the Licensee or any third party resulting
038: * from use of the Software.
039: */
040:
041: import java.io.CharArrayWriter;
042: import java.io.IOException;
043: import java.io.InputStreamReader;
044: import java.io.Reader;
045:
046: import org.xml.sax.DocumentHandler;
047: import org.xml.sax.DTDHandler;
048: import org.xml.sax.EntityResolver;
049: import org.xml.sax.ErrorHandler;
050: import org.xml.sax.HandlerBase;
051: import org.xml.sax.InputSource;
052: import org.xml.sax.SAXException;
053:
054: import com.quadcap.text.NoStringPool;
055: import com.quadcap.text.StringPool;
056: import com.quadcap.util.collections.ArrayQueue;
057:
058: import com.quadcap.util.Debug;
059:
060: /**
061: * SAX Parser implementation.
062: *
063: * @author Stan Bailes
064: */
065: public class Parser implements org.xml.sax.Parser {
// True once docHandler.startDocument() has been delivered (set in step() and
// startElement()).
boolean docStarted = false;
// Do-nothing handler used as the initial value of all four handler fields.
HandlerBase defaultHandler = new HandlerBase();
// Used to intern tag names, attribute names and attribute values.
StringPool pool = new NoStringPool();
// Input source currently being read.
InputSource in;
// Reader over the current input source; read() pulls characters from it.
Reader r;
DocumentHandler docHandler = defaultHandler;
DTDHandler dtdHandler = defaultHandler;
EntityResolver entityResolver = defaultHandler;
ErrorHandler errorHandler = defaultHandler;
// Scratch buffer filled by parseEntity() with the characters following '&'.
char[] ebuf = new char[6];
// Accumulates tag names, attribute names and attribute values; addTagChar()
// refuses to write past its end.
char[] tag = new char[1024 * 32];
// Number of valid characters currently held in 'tag'.
int taglen = 0;
// Accumulates character data, processing-instruction text and the keyword of
// a "<![" section.
CharArrayWriter data = new CharArrayWriter();
// Attributes collected for the element being parsed; cleared by startElement().
AttributeList attributes = new AttributeList();
// Name of the attribute whose value is currently being collected.
String attrName = null;
// Name of the most recently parsed tag.
String tagName = null;
// Suspended input sources; created lazily by pushInputSource().
ArrayQueue inStack = null;
// "line:column" positions saved alongside inStack entries.
ArrayQueue locStack = null;
// Position counters maintained by read().
int lineNumber = 1;
int columnNumber = 1;
// Raw characters consumed by the most recent parseEntity() call.
String lastEntityVal = "";
// Not referenced anywhere in this class.
boolean trace = false;
// Nesting depth of the comment currently being skipped by step().
int commentLevel = 0;
089:
/**
 * Creates a parser. All state comes from the field initializers; the
 * constructor itself does nothing.
 */
public Parser() {
}
092:
093: public void parse(InputSource in) throws SAXException, IOException {
094: this .in = in;
095: this .r = getCharacterStream(in);
096: taglen = 0;
097: lineNumber = 1;
098: columnNumber = 1;
099: data.reset();
100: try {
101: parse();
102: } catch (SAXException ex) {
103: if (locStack != null) {
104: for (int i = 0; i < locStack.size(); i++) {
105: com.quadcap.util.Debug.println(" at "
106: + locStack.top(i));
107: }
108: }
109: throw ex;
110: }
111: }
112:
113: final Reader getCharacterStream(InputSource in) {
114: Reader rd = in.getCharacterStream();
115: if (rd == null) {
116: rd = new InputStreamReader(in.getByteStream());
117: }
118: return rd;
119: }
120:
121: public void pushInputSource(InputSource in2) {
122: if (inStack == null) {
123: inStack = new ArrayQueue();
124: locStack = new ArrayQueue();
125: }
126: inStack.push(in);
127: locStack.push("" + lineNumber + ":" + columnNumber);
128: lineNumber = 1;
129: columnNumber = 1;
130: in = in2;
131: r = getCharacterStream(in);
132: }
133:
134: boolean popInputSource() {
135: if (inStack == null || inStack.size() == 0)
136: return false;
137: in = (InputSource) inStack.pop();
138: String s = locStack.pop().toString();
139: int idx = s.indexOf(':');
140: lineNumber = Integer.parseInt(s.substring(0, idx));
141: columnNumber = Integer.parseInt(s.substring(idx + 1));
142: r = getCharacterStream(in);
143: return true;
144: }
145:
146: final void addTagChar(int c) throws SAXException {
147: if (taglen >= tag.length)
148: throw new SAXException("tag too long");
149: tag[taglen++] = (char) c;
150: }
151:
/**
 * Does nothing: the argument is ignored and no parsing takes place.
 *
 * NOTE(review): the SAX Parser interface describes this overload as
 * parsing the document at a system identifier -- confirm whether leaving
 * it empty is intentional.
 */
public void parse(String s) {
}
154:
155: public void setDocumentHandler(DocumentHandler dh) {
156: this .docHandler = dh;
157: }
158:
159: public void setDTDHandler(DTDHandler dh) {
160: this .dtdHandler = dh;
161: }
162:
163: public void setEntityResolver(EntityResolver er) {
164: this .entityResolver = er;
165: }
166:
167: public EntityResolver getEntityResolver() {
168: return entityResolver;
169: }
170:
171: public void setErrorHandler(ErrorHandler er) {
172: errorHandler = er;
173: }
174:
/**
 * Accepts a locale and ignores it; the method body is empty.
 *
 * @param locale unused
 */
public void setLocale(java.util.Locale locale) {
}
177:
178: final int read() throws IOException {
179: int c = r.read();
180: if (c == '\n') {
181: lineNumber++;
182: columnNumber = 1;
183: } else {
184: columnNumber++;
185: }
186: return c;
187: }
188:
189: final char parseEntity() throws SAXException, IOException {
190: int len = 0;
191: int c;
192: int state = 0;
193: while ((c = read()) >= 0) {
194: ebuf[len++] = (char) c;
195: if (!Character.isLetter((char) c) || len >= ebuf.length)
196: break;
197: }
198: lastEntityVal = new String(ebuf, 0, len);
199: if (len == 5 && ebuf[0] == 'q' && ebuf[1] == 'u'
200: && ebuf[2] == 'o' && ebuf[3] == 't') {
201: return '"';
202: }
203: if (len == 4 && ebuf[0] == 'a' && ebuf[1] == 'm'
204: && ebuf[2] == 'p') {
205: return '&';
206: }
207: if (len == 3) {
208: if (ebuf[0] == 'l') {
209: if (ebuf[1] == 't')
210: return '<';
211: } else if (ebuf[0] == 'g') {
212: if (ebuf[1] == 't')
213: return '>';
214: }
215: }
216: throw new SAXException("unknown entity: " + lastEntityVal);
217:
218: }
219:
220: public int step(int state, int c) throws SAXException, IOException {
221: // Debug.println("step[" + state + " " + commentLevel +
222: // "]: " + ((char)c));
223: switch (state) {
224: case 0:
225: if (c == '<') {
226: if (data.size() > 0) {
227: docHandler.characters(data.toCharArray(), 0, data
228: .size());
229: data.reset();
230: }
231: state = 1;
232: } else {
233: if (c == '&') {
234: try {
235: c = parseEntity();
236: } catch (SAXException e) {
237: data.write('&');
238: data.write(lastEntityVal);
239: break;
240: }
241: }
242: data.write(c);
243: }
244: break;
245: case 1: // seen '<'
246: switch (c) {
247: case '!':
248: state = 30;
249: break;
250: case '\\':
251: state = 4;
252: break;
253: case '/':
254: state = 8;
255: break;
256: case '?':
257: data.reset();
258: state = 20;
259: break;
260: default:
261: addTagChar(c);
262: state = 5;
263: break;
264: }
265: break;
266: case 4: // seen <\
267: data.write('<');
268: data.write(c);
269: state = 0;
270: break;
271: case 5: // collect tag name
272: switch (c) {
273: case ' ':
274: case '\r':
275: case '\n':
276: case '\t':
277: tagName = pool.intern(tag, 0, taglen);
278: taglen = 0;
279: state = 6;
280: break;
281: case '/':
282: tagName = pool.intern(tag, 0, taglen);
283: taglen = 0;
284: state = 9;
285: break;
286: case '>':
287: tagName = pool.intern(tag, 0, taglen);
288: taglen = 0;
289: state = 0;
290: startElement(tagName, attributes);
291: break;
292: case '<':
293: tagName = pool.intern(tag, 0, taglen);
294: taglen = 0;
295: if (data.size() > 0) {
296: docHandler.characters(data.toCharArray(), 0, data
297: .size());
298: data.reset();
299: }
300: state = 1;
301: break;
302: default:
303: if (Character.isLetter((char) c)
304: || Character.isDigit((char) c) || c == '.'
305: || c == '-' || c == '_' || c == ':') {
306: addTagChar(c);
307: } else {
308: // this isn't a tag after all (e.g., inside a <script>
309: // section, we've found "if (a < b) ..."
310: for (int i = 0; i < taglen; i++) {
311: data.write(tag[i]);
312: }
313: data.write(c);
314: state = 0;
315: taglen = 0;
316: break;
317: }
318:
319: }
320: break;
321: case 6: // collect attributes
322: switch (c) {
323: case ' ':
324: case '\n':
325: case '\r':
326: case '\t':
327: break;
328: case '/':
329: state = 9;
330: break;
331: case '%':
332: addTagChar(c);
333: break;
334: case '>':
335: state = 0;
336: startElement(tagName, attributes);
337: break;
338: case '=':
339: attrName = pool.intern(tag, 0, taglen);
340: taglen = 0;
341: state = 10;
342: break;
343: case '<':
344: state = 61;
345: break;
346: default:
347: addTagChar(c);
348: }
349: break;
350: case 61:
351: switch (c) {
352: case '?':
353: state = 62;
354: break;
355: default:
356: addTagChar('<');
357: addTagChar(c);
358: state = 6;
359: break;
360: }
361: break;
362: case 62:
363: switch (c) {
364: case '?':
365: state = 63;
366: break;
367: default:
368: addTagChar(c);
369: break;
370: }
371: break;
372: case 63:
373: switch (c) {
374: case '>':
375: addTagChar(c);
376: state = 6;
377: break;
378: default:
379: addTagChar('?');
380: if (c != '?')
381: state = 62;
382: break;
383: }
384: break;
385: case 8: // seen </
386: if (c == '>') {
387: tagName = pool.intern(tag, 0, taglen);
388: taglen = 0;
389: state = 0;
390: docHandler.endElement(tagName);
391: } else {
392: addTagChar(c);
393: }
394: break;
395: case 9: // in <tag, seen /
396: if (c == '>') {
397: startElement(tagName, attributes);
398: state = 0;
399: docHandler.endElement(tagName);
400: } else {
401: addTagChar('/');
402: addTagChar(c);
403: state = 6;
404: }
405: break;
406: case 10: // in attriblist, seen name=
407: if (c == '"') {
408: state = 12;
409: } else if (c == '\'') {
410: state = 121;
411: } else {
412: addTagChar(c);
413: state = 13;
414: }
415: break;
416: case 12: // in attriblist, seen name="
417: if (c == '"') {
418: attributes.addAttribute(attrName, "CDATA", pool.intern(
419: tag, 0, taglen));
420: taglen = 0;
421: state = 6;
422: } else {
423: addTagChar(c);
424: }
425: break;
426: case 121: // in attriblist, seen name='
427: if (c == '\'') {
428: attributes.addAttribute(attrName, "CDATA", pool.intern(
429: tag, 0, taglen));
430: taglen = 0;
431: state = 6;
432: } else {
433: addTagChar(c);
434: }
435: break;
436: case 13: // in attriblist, seen name=c
437: switch (c) {
438: case ' ':
439: attributes.addAttribute(attrName, "CDATA", pool.intern(
440: tag, 0, taglen));
441: taglen = 0;
442: state = 6;
443: break;
444: case '/':
445: state = 14;
446: break;
447: case '>':
448: attributes.addAttribute(attrName, "CDATA", pool.intern(
449: tag, 0, taglen));
450: taglen = 0;
451: state = 0;
452: startElement(tagName, attributes);
453: break;
454: default:
455: addTagChar(c);
456: }
457: break;
458: case 14: // in attriblist, seen name=dfdf/
459: if (c == '>') {
460: attributes.addAttribute(attrName, "CDATA", pool.intern(
461: tag, 0, taglen));
462: taglen = 0;
463: state = 0;
464: startElement(tagName, attributes);
465: docHandler.endElement(tagName);
466: } else {
467: addTagChar('/');
468: if (c != '/') {
469: addTagChar(c);
470: state = 13;
471: }
472: }
473: break;
474: case 15:
475: if (c == '-')
476: state = 16;
477: break;
478: case 16:
479: if (c == '-')
480: state = 17;
481: else
482: state = 15;
483: break;
484: case 17:
485: if (c == '>')
486: state = 0;
487: else if (c != '-')
488: state = 15;
489: break;
490: case 20:
491: if (c == '?')
492: state = 21;
493: else
494: data.write(c);
495: break;
496: case 21:
497: if (c == '>') {
498: String s = data.toString().trim();
499: if (s.startsWith("xml")) {
500: if (inStack == null || inStack.size() == 0) {
501: if (!docStarted) {
502: docStarted = true;
503: docHandler.startDocument();
504: }
505: }
506: } else {
507: int idx = s.indexOf(' ');
508: String dat = "";
509: String target = s;
510: if (idx >= 0) {
511: target = s.substring(0, idx);
512: dat = s.substring(idx + 1).trim();
513: }
514: docHandler.processingInstruction(target, dat);
515: }
516: data.reset();
517: state = 0;
518: } else {
519: data.write('?');
520: if (c != '?') {
521: data.write(c);
522: state = 20;
523: }
524: }
525: break;
526: case 30: // seen <!
527: if (c == '-')
528: state = 31;
529: else if (c == '[')
530: state = 41;
531: else
532: state = 40;
533: break;
534: case 31: // seen <!-
535: if (c == '-') {
536: commentLevel = 1;
537: state = 32;
538: } else
539: state = 40;
540: break;
541: case 32: // in comment, look for '-'
542: if (c == '-')
543: state = 33;
544: else if (c == '<')
545: state = 320;
546: break;
547: case 320: // in comment, seen <
548: if (c == '!')
549: state = 321;
550: else if (c == '-')
551: state = 33;
552: else
553: state = 32;
554: break;
555: case 321: // in comment, seen <!
556: if (c == '-')
557: state = 322;
558: else
559: state = 32;
560: break;
561: case 322: // in comment, seen <!-
562: if (c == '-') {
563: commentLevel++;
564: }
565: state = 32;
566: break;
567: case 33: // in comment, seen -
568: if (c == '-')
569: state = 34;
570: else
571: state = 32;
572: break;
573: case 34: // in comment, seen --
574: if (c == '>') {
575: if (--commentLevel == 0) {
576: state = 0;
577: } else {
578: state = 32;
579: }
580: } else if (c != '-')
581: state = 32;
582: break;
583: case 40: // seen <!, but not comment
584: if (c == '>')
585: state = 0;
586: break;
587: case 41: // seen <![
588: if (c == '[') {
589: if (data.toString().equals("CDATA")) {
590: data.reset();
591: state = 42;
592: } else {
593: state = 40;
594: }
595: } else {
596: data.write(c);
597: }
598: break;
599: case 42: // in CDATA section
600: if (c == ']') {
601: state = 43;
602: } else {
603: data.write(c);
604: }
605: break;
606: case 43: // in CDATA, seen ']'
607: if (c == ']') {
608: state = 44;
609: } else {
610: data.write(']');
611: data.write(c);
612: state = 42;
613: }
614: break;
615: case 44: // in CDATA, seen ']]'
616: if (c == '>') {
617: state = 0;
618: } else if (c == ']') {
619: data.write(']');
620: } else {
621: data.write("]]");
622: data.write(c);
623: state = 42;
624: }
625: break;
626: default:
627: throw new SAXException("Bad parser state: " + state);
628: }
629: return state;
630: }
631:
632: public void parse() throws SAXException, IOException {
633: int state = 0;
634: docHandler.setDocumentLocator(new Locator(this ));
635: while (parseUntilEOF()) {
636: }
637: docHandler.endDocument();
638: }
639:
640: public boolean parseUntilEOF() throws SAXException, IOException {
641: boolean ret = false;
642: int state = 0;
643: while (state >= 0) {
644: int c = read();
645: if (c < 0) {
646: try {
647: r.close();
648: } catch (Exception e) {
649: }
650: ret = popInputSource();
651: state = -1;
652: } else {
653: state = step(state, c);
654: }
655: }
656: return ret;
657: }
658:
659: public int getLineNumber() {
660: return lineNumber;
661: }
662:
663: public int getColumnNumber() {
664: return columnNumber;
665: }
666:
667: void startElement(String name, AttributeList attributes)
668: throws SAXException {
669: if (!docStarted) {
670: docStarted = true;
671: docHandler.startDocument();
672: }
673: docHandler.startElement(tagName, attributes);
674: attributes.clear();
675: }
676:
677: }
|