001: package uk.org.ponder.xml;
002:
003: import java.io.Reader;
004: import java.io.InputStreamReader;
005: import java.io.InputStream;
006: import java.io.PushbackInputStream;
007: import java.io.IOException;
008:
009: import uk.org.ponder.byteutil.ByteWrap;
010: import uk.org.ponder.streamutil.DirectInputStreamReader;
011: import uk.org.ponder.stringutil.EncodingErrorHandler;
012:
013: /** The XMLDecoder is used to provide a more instrumented decoding facility for
014: * parsing XML files. The Sun default UTF-8 decoder supplies inaccurate location
015: * information for invalid characters, provides minimal UTF-8 validation and
016: * minimal description of decoding errors. XMLDecoder does the work of parsing
017: * an XML declaration in a small repertoire of encodings, and hands off the
018: * task of character conversion for the rest of the stream to the
019: * <code>org.ponder.streamutil.DirectInputStreamReader</code> if the declaration
020: * is consistent with the UTF-8 encoding scheme.
021: */
022:
023: public class XMLDecoder {
024: /** Strips off the declaration from an XML file, and returns information suitable
025: * for further processing. The declaration is decoded to infer the encoding scheme
026: * as far as possible, and the return value includes the number of bytes decoded,
027: * the declaration as a String, and a Reader from which the decoded contents of the
028: * rest of the file may be read.
029: * <p> This decoder currently detects UCS16 both little and big-endian using the BOM
030: * (Byte Order Mark) <code>0xffef</code> invented by the Evil Empire, and 8-bit
031: * encodings for which the JVM includes a converter with a name matching the XML
032: * declaration version. Files without an encoding declaration which appear to be
033: * 8-bit are interpreted as UTF-8 (as per XML specification).
034: * @param is An inputstream containing an XML file.
035: * @param handler An EncodingErrorHandler to which errors encountered during UTF-8
036: * decoding may be reported.
037: * @return An XMLDecoderReturn object providing the client with everything required
038: * to continue parsing the XML file.
039: */
040:
041: public static XMLDecoderReturn stripDeclaration(InputStream is,
042: EncodingErrorHandler handler) throws IOException {
043: Reader readertogo = null;
044: ByteWrap first4 = new ByteWrap(4);
045: int totalbytesread = 0;
046: String decstring = "";
047: do {
048: int bytesread = is.read(first4.bytes);
049: if (bytesread == -1) {
050: throw new IOException(
051: "Unexpected EOF found while parsing XML declaration after "
052: + totalbytesread + " bytes");
053: }
054: totalbytesread += bytesread;
055: } while (totalbytesread != 4);
056: PushbackInputStream pushback = new PushbackInputStream(is, 4);
057: pushback.unread(first4.bytes);
058: int magic2 = first4.read_at2(0);
059: int magic4 = first4.read_at4(0);
060: if (magic2 == 0xfeff) {
061: readertogo = new InputStreamReader(pushback, "UnicodeBig");
062: } else if (magic2 == 0xffef) {
063: readertogo = new InputStreamReader(pushback,
064: "UnicodeLittle");
065: }
066: StringBuffer declaration = new StringBuffer();
067: if (readertogo != null) { // fully decoded as some sort of UCS16, scan ahead for ">"
068: while (true) {
069: int nextchar = readertogo.read();
070: if (nextchar == -1) {
071: throw new IOException(
072: "Unexpected end of XML declaration while decoding UCS-16, read so far: "
073: + declaration.toString());
074: } else
075: declaration.append(nextchar);
076: if (nextchar == '>')
077: break;
078: }
079: } else if (magic4 == 0x3c3f786d) { // not fully decoded, but declaration is 8-bit
080: while (true) {
081: int nextchar = is.read();
082: if (nextchar == -1) {
083: throw new IOException(
084: "Unexpected end of XML declaration while scanning 8-bit encoding, read so far: "
085: + declaration.toString());
086: } else {
087: declaration.append(nextchar);
088: ++totalbytesread;
089: if (nextchar == '>')
090: break;
091: }
092: }
093: decstring = declaration.toString();
094: int encodingpos = decstring.indexOf("encoding");
095: if (encodingpos == -1) {
096: System.out
097: .println("Encoding declaration not found, assuming UTF-8");
098: readertogo = new DirectInputStreamReader(is);
099: ((DirectInputStreamReader) readertogo)
100: .setEncodingErrorHandler(handler);
101: } else { // found an encoding declaration
102: int encodequotepos = decstring
103: .indexOf('"', encodingpos);
104: int encodeapospos = decstring
105: .indexOf('\'', encodingpos);
106: if (encodequotepos == -1 && encodeapospos == -1) {
107: throw new IOException(
108: "Invalid XML declaration --- encoding declared but not specified: "
109: + decstring);
110: }
111: int encodestartpos = -1, encodeendpos = -1;
112: if (encodequotepos != -1) {
113: encodestartpos = encodequotepos;
114: encodeendpos = decstring.indexOf('"',
115: encodequotepos);
116: }
117: if (encodeapospos != -1) {
118: encodestartpos = encodeapospos;
119: encodeendpos = decstring.indexOf('\'',
120: encodeapospos);
121: }
122: if (encodeendpos == -1)
123: throw new IOException(
124: "Invalid XML declaration --- unterminated encoding name: "
125: + decstring);
126: String encodingname = decstring.substring(
127: encodestartpos, encodeendpos);
128: readertogo = new InputStreamReader(is, encodingname);
129: }
130: }
131:
132: else {
133: throw new IOException(
134: "Unexpected bytes at start of XML file:"
135: + ByteWrap.intToHex(magic4));
136: }
137: XMLDecoderReturn togo = new XMLDecoderReturn(decstring,
138: totalbytesread, readertogo);
139: return togo;
140: }
141: }
|