001: /*
002: * Copyright 2003 Paulo Soares
003: *
004: * The contents of this file are subject to the Mozilla Public License Version 1.1
005: * (the "License"); you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at http://www.mozilla.org/MPL/
007: *
008: * Software distributed under the License is distributed on an "AS IS" basis,
009: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
010: * for the specific language governing rights and limitations under the License.
011: *
012: * The Original Code is 'iText, a free JAVA-PDF library'.
013: *
014: * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
015: * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
016: * All Rights Reserved.
017: * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
018: * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
019: *
020: * Contributor(s): all the names of the contributors are added in the source code
021: * where applicable.
022: *
023: * Alternatively, the contents of this file may be used under the terms of the
024: * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
025: * provisions of LGPL are applicable instead of those above. If you wish to
026: * allow use of your version of this file only under the terms of the LGPL
027: * License and not to allow others to use your version of this file under
028: * the MPL, indicate your decision by deleting the provisions above and
029: * replace them with the notice and other provisions required by the LGPL.
030: * If you do not delete the provisions above, a recipient may use your version
031: * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
032: *
033: * This library is free software; you can redistribute it and/or modify it
034: * under the terms of the MPL as stated above or under the terms of the GNU
035: * Library General Public License as published by the Free Software Foundation;
036: * either version 2 of the License, or any later version.
037: *
038: * This library is distributed in the hope that it will be useful, but WITHOUT
039: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
040: * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
041: * details.
042: *
043: * If you didn't download this code from the following link, you should check if
044: * you aren't using an obsolete version:
045: * http://www.lowagie.com/iText/
046: *
047: * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
048: * Licensed to the Apache Software Foundation (ASF) under one or more
049: * contributor license agreements. See the NOTICE file distributed with
050: * this work for additional information regarding copyright ownership.
051: * The ASF licenses this file to You under the Apache License, Version 2.0
052: * (the "License"); you may not use this file except in compliance with
053: * the License. You may obtain a copy of the License at
054: *
055: * http://www.apache.org/licenses/LICENSE-2.0
056: *
057: * Unless required by applicable law or agreed to in writing, software
058: * distributed under the License is distributed on an "AS IS" BASIS,
059: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
060: * See the License for the specific language governing permissions and
061: * limitations under the License.
062: *
063: * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
064: * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
065: * Steven Brandt and JavaWorld gave permission to use the code for free.
066: * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
067: * conformance with the rest of the code).
068: * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
069: * It was substantially refactored by Bruno Lowagie.
070: *
071: * The method 'private static String getEncodingName(byte[] b4)' was found
072: * in org.apache.xerces.impl.XMLEntityManager, originaly published by the
073: * Apache Software Foundation under the Apache Software License; now being
074: * used in iText under the MPL.
075: */
076: package com.lowagie.text.xml.simpleparser;
077:
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Stack;
086:
/**
 * A simple XML and HTML parser. Like a SAX parser it is event based,
 * but it offers much less functionality.
 * <p>
 * The parser:
 * <ul>
 * <li>recognizes the encoding used
 * <li>recognizes all the elements' start tags and end tags
 * <li>lists attributes, where attribute values can be enclosed in single or double quotes
 * <li>recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
 * <li>recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
 * <li>maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
 * </ul>
 */
103: public class SimpleXMLParser {
104: /** possible states */
105: private final static int UNKNOWN = 0;
106: private final static int TEXT = 1;
107: private final static int TAG_ENCOUNTERED = 2;
108: private final static int EXAMIN_TAG = 3;
109: private final static int TAG_EXAMINED = 4;
110: private final static int IN_CLOSETAG = 5;
111: private final static int SINGLE_TAG = 6;
112: private final static int CDATA = 7;
113: private final static int COMMENT = 8;
114: private final static int PI = 9;
115: private final static int ENTITY = 10;
116: private final static int QUOTE = 11;
117: private final static int ATTRIBUTE_KEY = 12;
118: private final static int ATTRIBUTE_EQUAL = 13;
119: private final static int ATTRIBUTE_VALUE = 14;
120:
121: /** the state stack */
122: protected Stack stack;
123: /** The current character. */
124: protected int character = 0;
125: /** The previous character. */
126: protected int previousCharacter = -1;
127: /** the line we are currently reading */
128: protected int lines = 1;
129: /** the column where the current character occurs */
130: protected int columns = 0;
131: /** was the last character equivalent to a newline? */
132: protected boolean eol = false;
133: /** the current state */
134: protected int state;
135: /** Are we parsing HTML? */
136: protected boolean html;
137: /** current text (whatever is encountered between tags) */
138: protected StringBuffer text = new StringBuffer();
139: /** current entity (whatever is encountered between & and ;) */
140: protected StringBuffer entity = new StringBuffer();
141: /** current tagname */
142: protected String tag = null;
143: /** current attributes */
144: protected HashMap attributes = null;
145: /** The handler to which we are going to forward document content */
146: protected SimpleXMLDocHandler doc;
147: /** The handler to which we are going to forward comments. */
148: protected SimpleXMLDocHandlerComment comment;
149: /** Keeps track of the number of tags that are open. */
150: int nested = 0;
151: /** the quote character that was used to open the quote. */
152: protected int quoteCharacter = '"';
153: /** the attribute key. */
154: String attributekey = null;
155: /** the attribute value. */
156: String attributevalue = null;
157:
158: /**
159: * Creates a Simple XML parser object.
160: * Call go(BufferedReader) immediately after creation.
161: */
162: private SimpleXMLParser(SimpleXMLDocHandler doc,
163: SimpleXMLDocHandlerComment comment, boolean html) {
164: this .doc = doc;
165: this .comment = comment;
166: this .html = html;
167: stack = new Stack();
168: state = html ? TEXT : UNKNOWN;
169: }
170:
171: /**
172: * Does the actual parsing. Perform this immediately
173: * after creating the parser object.
174: */
175: private void go(Reader r) throws IOException {
176: BufferedReader reader;
177: if (r instanceof BufferedReader)
178: reader = (BufferedReader) r;
179: else
180: reader = new BufferedReader(r);
181: doc.startDocument();
182: while (true) {
183: // read a new character
184: if (previousCharacter == -1) {
185: character = reader.read();
186: }
187: // or re-examin the previous character
188: else {
189: character = previousCharacter;
190: previousCharacter = -1;
191: }
192:
193: // the end of the file was reached
194: if (character == -1) {
195: if (html) {
196: if (html && state == TEXT)
197: flush();
198: doc.endDocument();
199: } else {
200: throwException("Missing end tag");
201: }
202: return;
203: }
204:
205: // dealing with \n and \r
206: if (character == '\n' && eol) {
207: eol = false;
208: continue;
209: } else if (eol) {
210: eol = false;
211: } else if (character == '\n') {
212: lines++;
213: columns = 0;
214: } else if (character == '\r') {
215: eol = true;
216: character = '\n';
217: lines++;
218: columns = 0;
219: } else {
220: columns++;
221: }
222:
223: switch (state) {
224: // we are in an unknown state before there's actual content
225: case UNKNOWN:
226: if (character == '<') {
227: saveState(TEXT);
228: state = TAG_ENCOUNTERED;
229: }
230: break;
231: // we can encounter any content
232: case TEXT:
233: if (character == '<') {
234: flush();
235: saveState(state);
236: state = TAG_ENCOUNTERED;
237: } else if (character == '&') {
238: saveState(state);
239: entity.setLength(0);
240: state = ENTITY;
241: } else
242: text.append((char) character);
243: break;
244: // we have just seen a < and are wondering what we are looking at
245: // <foo>, </foo>, <!-- ... --->, etc.
246: case TAG_ENCOUNTERED:
247: initTag();
248: if (character == '/') {
249: state = IN_CLOSETAG;
250: } else if (character == '?') {
251: restoreState();
252: state = PI;
253: } else {
254: text.append((char) character);
255: state = EXAMIN_TAG;
256: }
257: break;
258: // we are processing something like this <foo ... >.
259: // It could still be a <!-- ... --> or something.
260: case EXAMIN_TAG:
261: if (character == '>') {
262: doTag();
263: processTag(true);
264: initTag();
265: state = restoreState();
266: } else if (character == '/') {
267: state = SINGLE_TAG;
268: } else if (character == '-'
269: && text.toString().equals("!-")) {
270: flush();
271: state = COMMENT;
272: } else if (character == '['
273: && text.toString().equals("![CDATA")) {
274: flush();
275: state = CDATA;
276: } else if (character == 'E'
277: && text.toString().equals("!DOCTYP")) {
278: flush();
279: state = PI;
280: } else if (Character.isWhitespace((char) character)) {
281: doTag();
282: state = TAG_EXAMINED;
283: } else {
284: text.append((char) character);
285: }
286: break;
287: // we know the name of the tag now.
288: case TAG_EXAMINED:
289: if (character == '>') {
290: processTag(true);
291: initTag();
292: state = restoreState();
293: } else if (character == '/') {
294: state = SINGLE_TAG;
295: } else if (Character.isWhitespace((char) character)) {
296: // empty
297: } else {
298: text.append((char) character);
299: state = ATTRIBUTE_KEY;
300: }
301: break;
302:
303: // we are processing a closing tag: e.g. </foo>
304: case IN_CLOSETAG:
305: if (character == '>') {
306: doTag();
307: processTag(false);
308: if (!html && nested == 0)
309: return;
310: state = restoreState();
311: } else {
312: if (!Character.isWhitespace((char) character))
313: text.append((char) character);
314: }
315: break;
316:
317: // we have just seen something like this: <foo a="b"/
318: // and are looking for the final >.
319: case SINGLE_TAG:
320: if (character != '>')
321: throwException("Expected > for tag: <" + tag + "/>");
322: doTag();
323: processTag(true);
324: processTag(false);
325: initTag();
326: if (!html && nested == 0) {
327: doc.endDocument();
328: return;
329: }
330: state = restoreState();
331: break;
332:
333: // we are processing CDATA
334: case CDATA:
335: if (character == '>' && text.toString().endsWith("]]")) {
336: text.setLength(text.length() - 2);
337: flush();
338: state = restoreState();
339: } else
340: text.append((char) character);
341: break;
342:
343: // we are processing a comment. We are inside
344: // the <!-- .... --> looking for the -->.
345: case COMMENT:
346: if (character == '>' && text.toString().endsWith("--")) {
347: text.setLength(text.length() - 2);
348: flush();
349: state = restoreState();
350: } else
351: text.append((char) character);
352: break;
353:
354: // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
355: case PI:
356: if (character == '>') {
357: state = restoreState();
358: if (state == TEXT)
359: state = UNKNOWN;
360: }
361: break;
362:
363: // we are processing an entity, e.g. <, », etc.
364: case ENTITY:
365: if (character == ';') {
366: state = restoreState();
367: String cent = entity.toString();
368: entity.setLength(0);
369: char ce = EntitiesToUnicode.decodeEntity(cent);
370: if (ce == '\0')
371: text.append('&').append(cent).append(';');
372: else
373: text.append(ce);
374: } else if ((character != '#'
375: && (character < '0' || character > '9')
376: && (character < 'a' || character > 'z') && (character < 'A' || character > 'Z'))
377: || entity.length() >= 7) {
378: state = restoreState();
379: previousCharacter = character;
380: text.append('&').append(entity.toString());
381: entity.setLength(0);
382: } else {
383: entity.append((char) character);
384: }
385: break;
386: // We are processing the quoted right-hand side of an element's attribute.
387: case QUOTE:
388: if (html && quoteCharacter == ' ' && character == '>') {
389: flush();
390: processTag(true);
391: initTag();
392: state = restoreState();
393: } else if (html && quoteCharacter == ' '
394: && Character.isWhitespace((char) character)) {
395: flush();
396: state = TAG_EXAMINED;
397: } else if (html && quoteCharacter == ' ') {
398: text.append((char) character);
399: } else if (character == quoteCharacter) {
400: flush();
401: state = TAG_EXAMINED;
402: } else if (" \r\n\u0009".indexOf(character) >= 0) {
403: text.append(' ');
404: } else if (character == '&') {
405: saveState(state);
406: state = ENTITY;
407: entity.setLength(0);
408: } else {
409: text.append((char) character);
410: }
411: break;
412:
413: case ATTRIBUTE_KEY:
414: if (Character.isWhitespace((char) character)) {
415: flush();
416: state = ATTRIBUTE_EQUAL;
417: } else if (character == '=') {
418: flush();
419: state = ATTRIBUTE_VALUE;
420: } else if (html && character == '>') {
421: text.setLength(0);
422: processTag(true);
423: initTag();
424: state = restoreState();
425: } else {
426: text.append((char) character);
427: }
428: break;
429:
430: case ATTRIBUTE_EQUAL:
431: if (character == '=') {
432: state = ATTRIBUTE_VALUE;
433: } else if (Character.isWhitespace((char) character)) {
434: // empty
435: } else if (html && character == '>') {
436: text.setLength(0);
437: processTag(true);
438: initTag();
439: state = restoreState();
440: } else if (html && character == '/') {
441: flush();
442: state = SINGLE_TAG;
443: } else if (html) {
444: flush();
445: text.append((char) character);
446: state = ATTRIBUTE_KEY;
447: } else {
448: throwException("Error in attribute processing.");
449: }
450: break;
451:
452: case ATTRIBUTE_VALUE:
453: if (character == '"' || character == '\'') {
454: quoteCharacter = character;
455: state = QUOTE;
456: } else if (Character.isWhitespace((char) character)) {
457: // empty
458: } else if (html && character == '>') {
459: flush();
460: processTag(true);
461: initTag();
462: state = restoreState();
463: } else if (html) {
464: text.append((char) character);
465: quoteCharacter = ' ';
466: state = QUOTE;
467: } else {
468: throwException("Error in attribute processing");
469: }
470: break;
471: }
472: }
473: }
474:
475: /**
476: * Gets a state from the stack
477: * @return the previous state
478: */
479: private int restoreState() {
480: if (!stack.empty())
481: return ((Integer) stack.pop()).intValue();
482: else
483: return UNKNOWN;
484: }
485:
486: /**
487: * Adds a state to the stack.
488: * @param s a state to add to the stack
489: */
490: private void saveState(int s) {
491: stack.push(new Integer(s));
492: }
493:
494: /**
495: * Flushes the text that is currently in the buffer.
496: * The text can be ignored, added to the document
497: * as content or as comment,... depending on the current state.
498: */
499: private void flush() {
500: switch (state) {
501: case TEXT:
502: case CDATA:
503: if (text.length() > 0) {
504: doc.text(text.toString());
505: }
506: break;
507: case COMMENT:
508: if (comment != null) {
509: comment.comment(text.toString());
510: }
511: break;
512: case ATTRIBUTE_KEY:
513: attributekey = text.toString();
514: if (html)
515: attributekey = attributekey.toLowerCase();
516: break;
517: case QUOTE:
518: case ATTRIBUTE_VALUE:
519: attributevalue = text.toString();
520: attributes.put(attributekey, attributevalue);
521: break;
522: default:
523: // do nothing
524: }
525: text.setLength(0);
526: }
527:
528: /**
529: * Initialized the tag name and attributes.
530: */
531: private void initTag() {
532: tag = null;
533: attributes = new HashMap();
534: }
535:
536: /** Sets the name of the tag. */
537: private void doTag() {
538: if (tag == null)
539: tag = text.toString();
540: if (html)
541: tag = tag.toLowerCase();
542: text.setLength(0);
543: }
544:
545: /**
546: * processes the tag.
547: * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag.
548: */
549: private void processTag(boolean start) {
550: if (start) {
551: nested++;
552: doc.startElement(tag, attributes);
553: } else {
554: nested--;
555: doc.endElement(tag);
556: }
557: }
558:
559: /** Throws an exception */
560: private void throwException(String s) throws IOException {
561: throw new IOException(s + " near line " + lines + ", column "
562: + columns);
563: }
564:
565: /**
566: * Parses the XML document firing the events to the handler.
567: * @param doc the document handler
568: * @param r the document. The encoding is already resolved. The reader is not closed
569: * @throws IOException on error
570: */
571: public static void parse(SimpleXMLDocHandler doc,
572: SimpleXMLDocHandlerComment comment, Reader r, boolean html)
573: throws IOException {
574: SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
575: parser.go(r);
576: }
577:
578: /**
579: * Parses the XML document firing the events to the handler.
580: * @param doc the document handler
581: * @param in the document. The encoding is deduced from the stream. The stream is not closed
582: * @throws IOException on error
583: */
584: public static void parse(SimpleXMLDocHandler doc, InputStream in)
585: throws IOException {
586: byte b4[] = new byte[4];
587: int count = in.read(b4);
588: if (count != 4)
589: throw new IOException("Insufficient length.");
590: String encoding = getEncodingName(b4);
591: String decl = null;
592: if (encoding.equals("UTF-8")) {
593: StringBuffer sb = new StringBuffer();
594: int c;
595: while ((c = in.read()) != -1) {
596: if (c == '>')
597: break;
598: sb.append((char) c);
599: }
600: decl = sb.toString();
601: } else if (encoding.equals("CP037")) {
602: ByteArrayOutputStream bi = new ByteArrayOutputStream();
603: int c;
604: while ((c = in.read()) != -1) {
605: if (c == 0x6e) // that's '>' in ebcdic
606: break;
607: bi.write(c);
608: }
609: decl = new String(bi.toByteArray(), "CP037");
610: }
611: if (decl != null) {
612: decl = getDeclaredEncoding(decl);
613: if (decl != null)
614: encoding = decl;
615: }
616: parse(doc, new InputStreamReader(in, IanaEncodings
617: .getJavaEncoding(encoding)));
618: }
619:
620: private static String getDeclaredEncoding(String decl) {
621: if (decl == null)
622: return null;
623: int idx = decl.indexOf("encoding");
624: if (idx < 0)
625: return null;
626: int idx1 = decl.indexOf('"', idx);
627: int idx2 = decl.indexOf('\'', idx);
628: if (idx1 == idx2)
629: return null;
630: if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {
631: int idx3 = decl.indexOf('\'', idx2 + 1);
632: if (idx3 < 0)
633: return null;
634: return decl.substring(idx2 + 1, idx3);
635: }
636: if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {
637: int idx3 = decl.indexOf('"', idx1 + 1);
638: if (idx3 < 0)
639: return null;
640: return decl.substring(idx1 + 1, idx3);
641: }
642: return null;
643: }
644:
645: public static void parse(SimpleXMLDocHandler doc, Reader r)
646: throws IOException {
647: parse(doc, null, r, false);
648: }
649:
650: /**
651: * Escapes a string with the appropriated XML codes.
652: * @param s the string to be escaped
653: * @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE>
654: * @return the escaped string
655: */
656: public static String escapeXML(String s, boolean onlyASCII) {
657: char cc[] = s.toCharArray();
658: int len = cc.length;
659: StringBuffer sb = new StringBuffer();
660: for (int k = 0; k < len; ++k) {
661: int c = cc[k];
662: switch (c) {
663: case '<':
664: sb.append("<");
665: break;
666: case '>':
667: sb.append(">");
668: break;
669: case '&':
670: sb.append("&");
671: break;
672: case '"':
673: sb.append(""");
674: break;
675: case '\'':
676: sb.append("'");
677: break;
678: default:
679: if (onlyASCII && c > 127)
680: sb.append("&#").append(c).append(';');
681: else
682: sb.append((char) c);
683: }
684: }
685: return sb.toString();
686: }
687:
688: /**
689: * Returns the IANA encoding name that is auto-detected from
690: * the bytes specified, with the endian-ness of that encoding where appropriate.
691: * (method found in org.apache.xerces.impl.XMLEntityManager, originaly published
692: * by the Apache Software Foundation under the Apache Software License; now being
693: * used in iText under the MPL)
694: * @param b4 The first four bytes of the input.
695: * @return an IANA-encoding string
696: */
697: private static String getEncodingName(byte[] b4) {
698:
699: // UTF-16, with BOM
700: int b0 = b4[0] & 0xFF;
701: int b1 = b4[1] & 0xFF;
702: if (b0 == 0xFE && b1 == 0xFF) {
703: // UTF-16, big-endian
704: return "UTF-16BE";
705: }
706: if (b0 == 0xFF && b1 == 0xFE) {
707: // UTF-16, little-endian
708: return "UTF-16LE";
709: }
710:
711: // UTF-8 with a BOM
712: int b2 = b4[2] & 0xFF;
713: if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
714: return "UTF-8";
715: }
716:
717: // other encodings
718: int b3 = b4[3] & 0xFF;
719: if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
720: // UCS-4, big endian (1234)
721: return "ISO-10646-UCS-4";
722: }
723: if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
724: // UCS-4, little endian (4321)
725: return "ISO-10646-UCS-4";
726: }
727: if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
728: // UCS-4, unusual octet order (2143)
729: // REVISIT: What should this be?
730: return "ISO-10646-UCS-4";
731: }
732: if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
733: // UCS-4, unusual octect order (3412)
734: // REVISIT: What should this be?
735: return "ISO-10646-UCS-4";
736: }
737: if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
738: // UTF-16, big-endian, no BOM
739: // (or could turn out to be UCS-2...
740: // REVISIT: What should this be?
741: return "UTF-16BE";
742: }
743: if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
744: // UTF-16, little-endian, no BOM
745: // (or could turn out to be UCS-2...
746: return "UTF-16LE";
747: }
748: if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
749: // EBCDIC
750: // a la xerces1, return CP037 instead of EBCDIC here
751: return "CP037";
752: }
753:
754: // default encoding
755: return "UTF-8";
756: }
757: }
|