001: package com.rimfaxe.xml.xmlreader;
002:
003: import java.io.*;
004:
005: /**
006: * An XML byte stream that has been parsed into a DOM tree.
007: * Just like ParseCharStream, except that it also handles the Unicode encoding of the byte stream.
008: * Use rules in
009: * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing to guess
010: * encoding -- if encoding declaration is different, restart parsing.
011:
012: <blockquote><small> Copyright (C) 2002 Hewlett-Packard Company.
013: This file is part of Sparta, an XML Parser, DOM, and XPath library.
014: This library is free software; you can redistribute it and/or
015: modify it under the terms of the GNU Lesser General Public License
016: as published by the Free Software Foundation; either version 2.1 of
017: the License, or (at your option) any later version. This library
018: is distributed in the hope that it will be useful, but WITHOUT ANY
019: WARRANTY; without even the implied warranty of MERCHANTABILITY or
020: FITNESS FOR A PARTICULAR PURPOSE.</small></blockquote>
021: @see <a href="doc-files/LGPL.txt">GNU Lesser General Public License</a>
022: @version $Date: 2003/01/09 01:05:38 $ $Revision: 1.3 $
023: @author Eamonn O'Brien-Strain
024: */
025:
026: class ParseByteStream implements ParseSource {
027:
028: /** Parse XML document from byte stream, converting to Unicode
029: * characters as specifed by the initial byte-order-mark.
030: * @param istream is the source of bytes and must support mark so that
031: * we can peek ahead at its first two bytes
032: */
033: public ParseByteStream(String systemId, InputStream istream,
034: ParseLog log, String guessedEncoding, ParseHandler handler)
035: throws ParseException, IOException {
036: if (log == null)
037: log = DEFAULT_LOG;
038:
039: //We need to be able to restart the stream if the declared encoding
040: //is different than our guess, os buffer if necessary. We also need
041: //to be able to peek ahead at the first 4 bytes
042: if (!istream.markSupported())
043: istream = new BufferedInputStream(istream, MAXLOOKAHEAD);
044: istream.mark(MAXLOOKAHEAD); //mark at begining
045:
046: byte[] start = new byte[4];
047: int n = istream.read(start);
048:
049: if (guessedEncoding == null)
050: guessedEncoding = guessEncoding(systemId, start, n, log);
051:
052: try {
053:
054: //First try with guessed encoding
055: istream.reset();
056: InputStreamReader reader = new InputStreamReader(istream,
057: fixEncoding(guessedEncoding));
058: try {
059:
060: parseSource_ = new ParseCharStream(systemId, reader,
061: log, guessedEncoding, handler);
062: } catch (CharConversionException e) {
063:
064: //This exception seems to be caused by reading euc-jp as utf-8
065: String secondGuessEncoding = "euc-jp";
066: log.note("Problem reading with assumed encoding of "
067: + guessedEncoding + " so restarting with "
068: + secondGuessEncoding, systemId, 1);
069: istream.reset();
070: try {
071: reader = new InputStreamReader(istream,
072: fixEncoding(secondGuessEncoding));
073: } catch (UnsupportedEncodingException ee) {
074: throw new ParseException(log, systemId, 1, '\0',
075: secondGuessEncoding, "\""
076: + secondGuessEncoding
077: + "\" is not a supported encoding");
078: }
079:
080: parseSource_ = new ParseCharStream(systemId, reader,
081: log, null, handler);
082: }
083: } catch (EncodingMismatchException e) {
084: //if that didn't work try declared encoding
085: String declaredEncoding = e.getDeclaredEncoding();
086: log
087: .note(
088: "Encoding declaration of "
089: + declaredEncoding
090: + " is different that assumed "
091: + guessedEncoding
092: + " so restarting the parsing with the new encoding",
093: systemId, 1);
094: istream.reset();
095: InputStreamReader reader;
096: try {
097: reader = new InputStreamReader(istream,
098: fixEncoding(declaredEncoding));
099: } catch (UnsupportedEncodingException ee) {
100: throw new ParseException(log, systemId, 1, '\0',
101: declaredEncoding, "\"" + declaredEncoding
102: + "\" is not a supported encoding");
103: }
104: parseSource_ = new ParseCharStream(systemId, reader, log,
105: null, handler);
106: }
107: }
108:
109: public String toString() {
110: return parseSource_.toString();
111: }
112:
113: public String getSystemId() {
114: return parseSource_.getSystemId();
115: }
116:
117: /** Last line number read by parser. */
118: public int getLineNumber() {
119: return parseSource_.getLineNumber();
120: }
121:
122: /**
123: * @link aggregationByValue
124: */
125: private ParseCharStream parseSource_;
126:
127: /////////////////////////////////////////////////////////////////////
128:
129: /** Convert byte stream to Unicode character stream according to
130: * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
131: * . */
132: static private String guessEncoding(String systemId, byte[] start,
133: int n, ParseLog log) throws IOException {
134: //Test for UTF-16 byte-order mark
135: String encoding;
136: if (n != 4) {
137: String msg = n <= 0 ? "no characters in input"
138: : "less than 4 characters in input: \""
139: + new String(start, 0, n) + "\"";
140: log.error(msg, systemId, 1);
141: encoding = "UTF-8";
142: } else if (equals(start, 0x0000FEFF)
143: || equals(start, 0xFFFE0000)
144: || equals(start, 0x0000FFFE)
145: || equals(start, 0xFEFF0000)
146: || equals(start, 0x0000003C)
147: || equals(start, 0x3C000000)
148: || equals(start, 0x00003C00)
149: || equals(start, 0x003C0000))
150: encoding = "UCS-4";
151: else if (equals(start, 0x003C003F))
152: encoding = "UTF-16BE"; //or ISO-10646-UCS-2
153: else if (equals(start, 0x3C003F00))
154: encoding = "UTF-16LE"; //or ISO-10646-UCS-2
155: else if (equals(start, 0x3C3F786D))
156: encoding = "UTF-8";//or ISO 646, ASCII, ISO 8859, Shift-JIS, EUC
157: else if (equals(start, 0x4C6FA794))
158: encoding = "EBCDIC";
159: else if (equals(start, (short) 0xFFFE)
160: || equals(start, (short) 0xFEFF))
161: encoding = "UTF-16";
162: else
163: encoding = "UTF-8";
164:
165: if (!encoding.equals("UTF-8"))
166: log.note(
167: "From start " + hex(start[0]) + " " + hex(start[1])
168: + " " + hex(start[2]) + " " + hex(start[3])
169: + " deduced encoding = " + encoding,
170: systemId, 1);
171: return encoding;
172: }
173:
174: static private String hex(byte b) {
175: String s = Integer.toHexString(b);
176: switch (s.length()) {
177: case 1:
178: return "0" + s;
179: case 2:
180: return s;
181: default:
182: return s.substring(s.length() - 2);
183: }
184: }
185:
186: static private boolean equals(byte[] bytes, int integer) {
187: return bytes[0] == (byte) ((integer >>> 24))
188: && bytes[1] == (byte) ((integer >>> 16) & 0xFF)
189: && bytes[2] == (byte) ((integer >>> 8) & 0xFF)
190: && bytes[3] == (byte) ((integer) & 0xFF);
191: }
192:
193: static private boolean equals(byte[] bytes, short integer) {
194: return bytes[0] == (byte) ((integer >>> 8))
195: && bytes[1] == (byte) ((integer) & 0xFF);
196: }
197:
198: // Return a string with all the dashes (-) removed from the input.
199: static private String fixEncoding(String encoding) {
200: int index = encoding.indexOf("-");
201: if (index == -1) {
202: return encoding;
203: }
204:
205: // encoding contains "-"
206: int encodingLength = encoding.length();
207: StringBuffer newEncoding = new StringBuffer(encodingLength - 1);
208: for (int i = 0; i < encodingLength; i++) {
209: char oneChar = encoding.charAt(i);
210: if (oneChar != '-') {
211: newEncoding.append(oneChar);
212: }
213: }
214:
215: return newEncoding.toString();
216: }
217:
218: static private final int MAXLOOKAHEAD = "<?xml version=\"1.0\" encoding=\"\""
219: .length() + 40;
220: //Max charset name is 40 according to
221: //http://www.iana.org/assignments/character-sets
222:
223: }
224:
225: // $Log: ParseByteStream.java,v $
226: // Revision 1.3 2003/01/09 01:05:38 yuhongx
227: // added FixEncoding().
228: //
229: // Revision 1.2 2002/11/06 02:57:59 eobrain
230: // Organize imputs to removed unused imports. Remove some unused local variables.
231: //
232: // Revision 1.1.1.1 2002/08/19 05:04:00 eobrain
233: // import from HP Labs internal CVS
234: //
235: // Revision 1.14 2002/08/18 04:36:25 eob
236: // Make interface package-private so as not to clutter up the javadoc.
237: //
238: // Revision 1.13 2002/08/17 00:54:14 sermarti
239: //
240: // Revision 1.12 2002/08/05 20:04:32 sermarti
241: //
242: // Revision 1.11 2002/07/25 21:10:15 sermarti
243: // Adding files that mysteriously weren't added from Sparta before.
244: //
245: // Revision 1.10 2002/05/23 22:00:19 eob
246: // Add better error handling.
247: //
248: // Revision 1.9 2002/05/09 17:02:26 eob
249: // Fix NullPointerException in error reporting.
250: //
251: // Revision 1.8 2002/05/09 16:49:52 eob
252: // Add history for better error reporting.
253: //
254: // Revision 1.7 2002/03/21 23:50:49 eob
255: // Deprecate functionality moved to Parser facade class.
256: //
257: // Revision 1.6 2002/02/15 21:30:38 eob
258: // Comment changes only.
259: //
260: // Revision 1.5 2002/02/01 21:55:15 eob
261: // Comment change only.
262: //
263: // Revision 1.4 2002/01/09 00:45:58 eob
264: // Formatting change only.
265: //
266: // Revision 1.3 2002/01/09 00:44:57 eob
267: // Handle CharConversionException caused by reading euc-jp characters
268: // before encoding has been established. Restart parsing.
269: //
270: // Revision 1.2 2002/01/08 19:53:43 eob
271: // Comment change only.
272: //
273: // Revision 1.1 2002/01/08 19:31:33 eob
274: // Factored out ParseSource functionality into ParseCharStream and
275: // ParseByteStream.
|