001: /*
002: * $Id: XMLStreamReader.java,v 1.7 2004/07/11 09:37:37 yuvalo Exp $
003: *
004: * (C) Copyright 2002-2004 by Yuval Oren. All rights reserved.
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: package com.bluecast.xml;
020:
021: import java.io.*;
022: import com.bluecast.io.*;
023: import java.util.*;
024:
025: /**
026: * A Reader for XML documents and streams. This class automatically determines
027: * the proper character set to use based on Byte Order Marks and XML
028: * declarations.
029: *
030: * @author Yuval Oren, yuval@bluecast.com
031: * @version $Revision: 1.7 $
032: */
033: final public class XMLStreamReader extends XMLInputReader {
034: private static final int BYTE_BUFFER_SIZE = 8192;
035: private XMLDecoder decoder;
036: private int minBytesPerChar, maxBytesPerChar;
037: private InputStream in;
038: private int[] decodeResult = new int[2];
039: private String encoding;
040: private boolean useDeclaredEncoding;
041: private boolean rewindDeclaration;
042: private char[] cbuf = new char[MAX_XML_DECL_CHARS];
043: private byte[] bbuf = new byte[BYTE_BUFFER_SIZE];
044: private int cbufPos, cbufEnd, bbufPos, bbufEnd;
045: private boolean eofReached;
046: // How many characters we should read to parse the <?xml...?> declaration
047: private static final int MAX_XML_DECL_CHARS = 100;
048:
049: private FastStreamDecoder fastStreamDecoder = new FastStreamDecoder();
050: private JavaStreamDecoder javaStreamDecoder = null;
051:
052: private XMLStreamDecoder activeStreamDecoder;
053:
/**
 * Constructs a reader that is not yet bound to any InputStream.
 * Call {@link #reset(InputStream, String, boolean)} to attach a stream
 * before reading.
 */
public XMLStreamReader() {
    super();
}
060:
/**
 * Constructs a reader over the given stream, leaving the character
 * encoding to be detected from the stream itself.
 *
 * @param in the InputStream to read from
 * @param rewindDeclaration if false, any XML declaration is skipped;
 *        if true, the whole document including the declaration is returned
 * @throws IOException if the stream cannot be read while initializing
 */
public XMLStreamReader(InputStream in, boolean rewindDeclaration)
        throws IOException {
    // Same initialization as the three-argument constructor with no
    // caller-supplied encoding.
    reset(in, null, rewindDeclaration);
}
072:
/**
 * Constructs a reader over the given stream using a caller-supplied
 * character encoding.
 *
 * @param in the InputStream to read from
 * @param encoding the character encoding name, or null to auto-detect
 * @param rewindDeclaration if false, any XML declaration is skipped;
 *        if true, the whole document including the declaration is returned
 * @throws IOException if the stream cannot be read while initializing
 */
public XMLStreamReader(InputStream in, String encoding,
        boolean rewindDeclaration) throws IOException {
    this.reset(in, encoding, rewindDeclaration);
}
080:
081: /**
082: * Reuses this XMLStreamReader for a different InputStream.
083: */
084: public void reset(InputStream in, String encoding,
085: boolean rewindDeclaration) throws IOException {
086: super .resetInput();
087: this .in = in;
088: eofReached = false;
089: this .rewindDeclaration = rewindDeclaration;
090: useDeclaredEncoding = false;
091: bbufPos = bbufEnd = 0;
092: cbufPos = cbufEnd = 0;
093: activeStreamDecoder = fastStreamDecoder;
094: fillByteBuffer();
095: // If we've been given a character set, use it.
096: if (encoding != null) {
097: this .encoding = getJavaCharset(encoding);
098:
099: // If it's Unicode we need to find out which-endian
100: // Per the unicode.org FAQ, default to big-endian
101: if (this .encoding.equals("Unicode")) {
102: this .encoding = guessEncoding();
103: if (this .encoding == null
104: || !(this .encoding.equals("UnicodeLittle")))
105: this .encoding = "UnicodeBig";
106: }
107: } else {
108: this .encoding = guessEncoding();
109: if (this .encoding == null) {
110: useDeclaredEncoding = true;
111: this .encoding = "UTF-8"; // Default to UTF-8
112: }
113: }
114: setEncoding(this .encoding);
115: processXMLDecl();
116: }
117:
118: /** Returns the character set being used by the reader. Note that the
119: * encoding in the XML declaration is ignored if it is not needed to
120: * determine the character set.
121: */
122: public String getEncoding() {
123: return encoding;
124: }
125:
126: public void close() throws IOException {
127: eofReached = true;
128: bbufPos = bbufEnd = cbufPos = cbufEnd = 0;
129: if (in != null)
130: in.close();
131: }
132:
133: public void reset() throws IOException {
134: super .resetInput();
135: in.reset();
136: bbufPos = bbufEnd = cbufPos = cbufEnd = 0;
137: }
138:
139: public void mark(int readAheadLimit) throws IOException {
140: throw new UnsupportedOperationException("mark() not supported");
141: }
142:
143: public boolean markSupported() {
144: return false;
145: }
146:
147: public int read() throws IOException {
148: return activeStreamDecoder.read();
149: }
150:
151: public int read(char[] destbuf) throws IOException {
152: return read(destbuf, 0, destbuf.length);
153: }
154:
155: public int read(char[] destbuf, int off, int len)
156: throws IOException {
157: return activeStreamDecoder.read(destbuf, off, len);
158: }
159:
160: public boolean ready() throws IOException {
161: return activeStreamDecoder.ready();
162: }
163:
164: public long skip(long n) throws IOException {
165: return activeStreamDecoder.skip(n);
166: }
167:
168: private void setEncoding(String encoding) throws IOException {
169: try {
170: this .encoding = encoding;
171: decoder = XMLDecoderFactory.createDecoder(encoding);
172: minBytesPerChar = decoder.minBytesPerChar();
173: maxBytesPerChar = decoder.maxBytesPerChar();
174: } catch (UnsupportedEncodingException e) {
175: // We don't have a decoder; try Java's built-in one
176: if (javaStreamDecoder == null)
177: javaStreamDecoder = new JavaStreamDecoder();
178: activeStreamDecoder = javaStreamDecoder;
179: }
180:
181: activeStreamDecoder.reset();
182: }
183:
184: private int fillByteBuffer() throws IOException {
185: int bytesLeft = bbufEnd - bbufPos;
186: if (bytesLeft > 0)
187: System.arraycopy(bbuf, bbufPos, bbuf, 0, bytesLeft);
188:
189: bbufPos = 0;
190: bbufEnd = bytesLeft;
191:
192: // Don't block I/O; only get as many bytes as are available
193: int bytesAvailable = in.available();
194:
195: if (bytesAvailable == 0 && bytesLeft == 0) {
196: bytesAvailable = 1; // But always get at least 1 byte
197: }
198:
199: int bytesToRead = Math.min(BYTE_BUFFER_SIZE - bbufEnd,
200: bytesAvailable);
201: int bytesRead = 0;
202:
203: if (bbufEnd < BYTE_BUFFER_SIZE
204: && (bytesRead = in.read(bbuf, bbufEnd, bytesToRead)) != -1) {
205: bbufEnd += bytesRead;
206: }
207: if (bytesRead == -1)
208: eofReached = true;
209: return bytesRead;
210: }
211:
212: static private HashMap charsetTable = new HashMap(31);
213: static {
214: charsetTable.put("EBCDIC-CP-US", "Cp037");
215: charsetTable.put("EBCDIC-CP-CA", "Cp037");
216: charsetTable.put("EBCDIC-CP-NL", "Cp037");
217: charsetTable.put("EBCDIC-CP-WT", "Cp037");
218: charsetTable.put("EBCDIC-CP-DK", "Cp277");
219: charsetTable.put("EBCDIC-CP-NO", "Cp277");
220: charsetTable.put("EBCDIC-CP-FI", "Cp278");
221: charsetTable.put("EBCDIC-CP-SE", "Cp278");
222: charsetTable.put("EBCDIC-CP-IT", "Cp280");
223: charsetTable.put("EBCDIC-CP-ES", "Cp284");
224: charsetTable.put("EBCDIC-CP-GB", "Cp285");
225: charsetTable.put("EBCDIC-CP-FR", "Cp297");
226: charsetTable.put("EBCDIC-CP-AR1", "Cp420");
227: charsetTable.put("EBCDIC-CP-GR", "Cp423");
228: charsetTable.put("EBCDIC-CP-HE", "Cp424");
229: charsetTable.put("EBCDIC-CP-BE", "Cp500");
230: charsetTable.put("EBCDIC-CP-CH", "Cp500");
231: charsetTable.put("EBCDIC-CP-ROECE", "Cp870");
232: charsetTable.put("EBCDIC-CP-YU", "Cp870");
233: charsetTable.put("EBCDIC-CP-IS", "Cp871");
234: charsetTable.put("EBCDIC-CP-TR", "Cp905");
235: charsetTable.put("EBCDIC-CP-AR2", "Cp918");
236: charsetTable.put("UTF-16", "Unicode");
237: charsetTable.put("ISO-10646-UCS-2", "Unicode");
238:
239: charsetTable.put("ANSI_X3.4-1986", "ASCII");
240: charsetTable.put("ASCII", "ASCII");
241: charsetTable.put("CP367", "ASCII");
242: charsetTable.put("CSASCII", "ASCII");
243: charsetTable.put("IBM-367", "ASCII");
244: charsetTable.put("IBM367", "ASCII");
245: charsetTable.put("ISO-IR-6", "ASCII");
246: charsetTable.put("ISO646-US", "ASCII");
247: charsetTable.put("ISO_646.IRV:1991", "ASCII");
248: charsetTable.put("US", "ASCII");
249: charsetTable.put("US-ASCII", "ASCII");
250: charsetTable.put("BIG5", "BIG5");
251: charsetTable.put("CSBIG5", "BIG5");
252: charsetTable.put("CP037", "CP037");
253: charsetTable.put("CSIBM037", "CP037");
254: charsetTable.put("IBM-37", "CP037");
255: charsetTable.put("IBM037", "CP037");
256: charsetTable.put("CP1026", "CP1026");
257: charsetTable.put("CSIBM1026", "CP1026");
258: charsetTable.put("IBM-1026", "CP1026");
259: charsetTable.put("IBM1026", "CP1026");
260: charsetTable.put("CP1047", "CP1047");
261: charsetTable.put("IBM-1047", "CP1047");
262: charsetTable.put("IBM1047", "CP1047");
263: charsetTable.put("CCSID01140", "CP1140");
264: charsetTable.put("CP01140", "CP1140");
265: charsetTable.put("IBM-1140", "CP1140");
266: charsetTable.put("IBM01140", "CP1140");
267: charsetTable.put("CCSID01141", "CP1141");
268: charsetTable.put("CP01141", "CP1141");
269: charsetTable.put("IBM-1141", "CP1141");
270: charsetTable.put("IBM01141", "CP1141");
271: charsetTable.put("CCSID01142", "CP1142");
272: charsetTable.put("CP01142", "CP1142");
273: charsetTable.put("IBM-1142", "CP1142");
274: charsetTable.put("IBM01142", "CP1142");
275: charsetTable.put("CCSID01143", "CP1143");
276: charsetTable.put("CP01143", "CP1143");
277: charsetTable.put("IBM-1143", "CP1143");
278: charsetTable.put("IBM01143", "CP1143");
279: charsetTable.put("CCSID01144", "CP1144");
280: charsetTable.put("CP01144", "CP1144");
281: charsetTable.put("IBM-1144", "CP1144");
282: charsetTable.put("IBM01144", "CP1144");
283: charsetTable.put("CCSID01145", "CP1145");
284: charsetTable.put("CP01145", "CP1145");
285: charsetTable.put("IBM-1145", "CP1145");
286: charsetTable.put("IBM01145", "CP1145");
287: charsetTable.put("CCSID01146", "CP1146");
288: charsetTable.put("CP01146", "CP1146");
289: charsetTable.put("IBM-1146", "CP1146");
290: charsetTable.put("IBM01146", "CP1146");
291: charsetTable.put("CCSID01147", "CP1147");
292: charsetTable.put("CP01147", "CP1147");
293: charsetTable.put("IBM-1147", "CP1147");
294: charsetTable.put("IBM01147", "CP1147");
295: charsetTable.put("CCSID01148", "CP1148");
296: charsetTable.put("CP01148", "CP1148");
297: charsetTable.put("IBM-1148", "CP1148");
298: charsetTable.put("IBM01148", "CP1148");
299: charsetTable.put("CCSID01149", "CP1149");
300: charsetTable.put("CP01149", "CP1149");
301: charsetTable.put("IBM-1149", "CP1149");
302: charsetTable.put("IBM01149", "CP1149");
303: charsetTable.put("WINDOWS-1250", "CP1250");
304: charsetTable.put("WINDOWS-1251", "CP1251");
305: charsetTable.put("WINDOWS-1252", "CP1252");
306: charsetTable.put("WINDOWS-1253", "CP1253");
307: charsetTable.put("WINDOWS-1254", "CP1254");
308: charsetTable.put("WINDOWS-1255", "CP1255");
309: charsetTable.put("WINDOWS-1256", "CP1256");
310: charsetTable.put("WINDOWS-1257", "CP1257");
311: charsetTable.put("WINDOWS-1258", "CP1258");
312: charsetTable.put("CP273", "CP273");
313: charsetTable.put("CSIBM273", "CP273");
314: charsetTable.put("IBM-273", "CP273");
315: charsetTable.put("IBM273", "CP273");
316: charsetTable.put("CP277", "CP277");
317: charsetTable.put("CSIBM277", "CP277");
318: charsetTable.put("IBM-277", "CP277");
319: charsetTable.put("IBM277", "CP277");
320: charsetTable.put("CP278", "CP278");
321: charsetTable.put("CSIBM278", "CP278");
322: charsetTable.put("IBM-278", "CP278");
323: charsetTable.put("IBM278", "CP278");
324: charsetTable.put("CP280", "CP280");
325: charsetTable.put("CSIBM280", "CP280");
326: charsetTable.put("IBM-280", "CP280");
327: charsetTable.put("IBM280", "CP280");
328: charsetTable.put("CP284", "CP284");
329: charsetTable.put("CSIBM284", "CP284");
330: charsetTable.put("IBM-284", "CP284");
331: charsetTable.put("IBM284", "CP284");
332: charsetTable.put("CP285", "CP285");
333: charsetTable.put("CSIBM285", "CP285");
334: charsetTable.put("IBM-285", "CP285");
335: charsetTable.put("IBM285", "CP285");
336: charsetTable.put("CP290", "CP290");
337: charsetTable.put("CSIBM290", "CP290");
338: charsetTable.put("EBCDIC-JP-KANA", "CP290");
339: charsetTable.put("IBM-290", "CP290");
340: charsetTable.put("IBM290", "CP290");
341: charsetTable.put("CP297", "CP297");
342: charsetTable.put("CSIBM297", "CP297");
343: charsetTable.put("IBM-297", "CP297");
344: charsetTable.put("IBM297", "CP297");
345: charsetTable.put("CP420", "CP420");
346: charsetTable.put("CSIBM420", "CP420");
347: charsetTable.put("IBM-420", "CP420");
348: charsetTable.put("IBM420", "CP420");
349: charsetTable.put("CP424", "CP424");
350: charsetTable.put("CSIBM424", "CP424");
351: charsetTable.put("IBM-424", "CP424");
352: charsetTable.put("IBM424", "CP424");
353: charsetTable.put("437", "CP437");
354: charsetTable.put("CP437", "CP437");
355: charsetTable.put("CSPC8CODEPAGE437", "CP437");
356: charsetTable.put("IBM-437", "CP437");
357: charsetTable.put("IBM437", "CP437");
358: charsetTable.put("CP500", "CP500");
359: charsetTable.put("CSIBM500", "CP500");
360: charsetTable.put("IBM-500", "CP500");
361: charsetTable.put("IBM500", "CP500");
362: charsetTable.put("CP775", "CP775");
363: charsetTable.put("CSPC775BALTIC", "CP775");
364: charsetTable.put("IBM-775", "CP775");
365: charsetTable.put("IBM775", "CP775");
366: charsetTable.put("850", "CP850");
367: charsetTable.put("CP850", "CP850");
368: charsetTable.put("CSPC850MULTILINGUAL", "CP850");
369: charsetTable.put("IBM-850", "CP850");
370: charsetTable.put("IBM850", "CP850");
371: charsetTable.put("852", "CP852");
372: charsetTable.put("CP852", "CP852");
373: charsetTable.put("CSPCP852", "CP852");
374: charsetTable.put("IBM-852", "CP852");
375: charsetTable.put("IBM852", "CP852");
376: charsetTable.put("855", "CP855");
377: charsetTable.put("CP855", "CP855");
378: charsetTable.put("CSIBM855", "CP855");
379: charsetTable.put("IBM-855", "CP855");
380: charsetTable.put("IBM855", "CP855");
381: charsetTable.put("857", "CP857");
382: charsetTable.put("CP857", "CP857");
383: charsetTable.put("CSIBM857", "CP857");
384: charsetTable.put("IBM-857", "CP857");
385: charsetTable.put("IBM857", "CP857");
386: charsetTable.put("CCSID00858", "CP858");
387: charsetTable.put("CP00858", "CP858");
388: charsetTable.put("IBM-858", "CP858");
389: charsetTable.put("IBM00858", "CP858");
390: charsetTable.put("860", "CP860");
391: charsetTable.put("CP860", "CP860");
392: charsetTable.put("CSIBM860", "CP860");
393: charsetTable.put("IBM-860", "CP860");
394: charsetTable.put("IBM860", "CP860");
395: charsetTable.put("861", "CP861");
396: charsetTable.put("CP-IS", "CP861");
397: charsetTable.put("CP861", "CP861");
398: charsetTable.put("CSIBM861", "CP861");
399: charsetTable.put("IBM-861", "CP861");
400: charsetTable.put("IBM861", "CP861");
401: charsetTable.put("862", "CP862");
402: charsetTable.put("CP862", "CP862");
403: charsetTable.put("CSPC862LATINHEBREW", "CP862");
404: charsetTable.put("IBM-862", "CP862");
405: charsetTable.put("IBM862", "CP862");
406: charsetTable.put("863", "CP863");
407: charsetTable.put("CP863", "CP863");
408: charsetTable.put("CSIBM863", "CP863");
409: charsetTable.put("IBM-863", "CP863");
410: charsetTable.put("IBM863", "CP863");
411: charsetTable.put("CP864", "CP864");
412: charsetTable.put("CSIBM864", "CP864");
413: charsetTable.put("IBM-864", "CP864");
414: charsetTable.put("IBM864", "CP864");
415: charsetTable.put("865", "CP865");
416: charsetTable.put("CP865", "CP865");
417: charsetTable.put("CSIBM865", "CP865");
418: charsetTable.put("IBM-865", "CP865");
419: charsetTable.put("IBM865", "CP865");
420: charsetTable.put("866", "CP866");
421: charsetTable.put("CP866", "CP866");
422: charsetTable.put("CSIBM866", "CP866");
423: charsetTable.put("IBM-866", "CP866");
424: charsetTable.put("IBM866", "CP866");
425: charsetTable.put("CP-AR", "CP868");
426: charsetTable.put("CP868", "CP868");
427: charsetTable.put("CSIBM868", "CP868");
428: charsetTable.put("IBM-868", "CP868");
429: charsetTable.put("IBM868", "CP868");
430: charsetTable.put("CP-GR", "CP869");
431: charsetTable.put("CP869", "CP869");
432: charsetTable.put("CSIBM869", "CP869");
433: charsetTable.put("IBM-869", "CP869");
434: charsetTable.put("IBM869", "CP869");
435: charsetTable.put("CP870", "CP870");
436: charsetTable.put("CSIBM870", "CP870");
437: charsetTable.put("IBM-870", "CP870");
438: charsetTable.put("IBM870", "CP870");
439: charsetTable.put("CP871", "CP871");
440: charsetTable.put("CSIBM871", "CP871");
441: charsetTable.put("IBM-871", "CP871");
442: charsetTable.put("IBM871", "CP871");
443: charsetTable.put("CP918", "CP918");
444: charsetTable.put("CSIBM918", "CP918");
445: charsetTable.put("IBM-918", "CP918");
446: charsetTable.put("IBM918", "CP918");
447: charsetTable.put("CCSID00924", "CP924");
448: charsetTable.put("CP00924", "CP924");
449: charsetTable.put("EBCDIC-LATIN9--EURO", "CP924");
450: charsetTable.put("IBM-924", "CP924");
451: charsetTable.put("IBM00924", "CP924");
452: charsetTable.put("CSEUCPKDFMTJAPANESE", "EUCJIS");
453: charsetTable.put("EUC-JP", "EUCJIS");
454: charsetTable.put(
455: "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE",
456: "EUCJIS");
457: charsetTable.put("GB18030", "GB18030");
458: charsetTable.put("CSGB2312", "GB2312");
459: charsetTable.put("GB2312", "GB2312");
460: charsetTable.put("ISO-2022-CN", "ISO2022CN");
461: charsetTable.put("CSISO2022KR", "ISO2022KR");
462: charsetTable.put("ISO-2022-KR", "ISO2022KR");
463: charsetTable.put("CP819", "ISO8859_1");
464: charsetTable.put("CSISOLATIN1", "ISO8859_1");
465: charsetTable.put("IBM-819", "ISO8859_1");
466: charsetTable.put("IBM819", "ISO8859_1");
467: charsetTable.put("ISO-8859-1", "ISO8859_1");
468: charsetTable.put("ISO-IR-100", "ISO8859_1");
469: charsetTable.put("ISO_8859-1", "ISO8859_1");
470: charsetTable.put("L1", "ISO8859_1");
471: charsetTable.put("LATIN1", "ISO8859_1");
472: charsetTable.put("CSISOLATIN2", "ISO8859_2");
473: charsetTable.put("ISO-8859-2", "ISO8859_2");
474: charsetTable.put("ISO-IR-101", "ISO8859_2");
475: charsetTable.put("ISO_8859-2", "ISO8859_2");
476: charsetTable.put("L2", "ISO8859_2");
477: charsetTable.put("LATIN2", "ISO8859_2");
478: charsetTable.put("CSISOLATIN3", "ISO8859_3");
479: charsetTable.put("ISO-8859-3", "ISO8859_3");
480: charsetTable.put("ISO-IR-109", "ISO8859_3");
481: charsetTable.put("ISO_8859-3", "ISO8859_3");
482: charsetTable.put("L3", "ISO8859_3");
483: charsetTable.put("LATIN3", "ISO8859_3");
484: charsetTable.put("CSISOLATIN4", "ISO8859_4");
485: charsetTable.put("ISO-8859-4", "ISO8859_4");
486: charsetTable.put("ISO-IR-110", "ISO8859_4");
487: charsetTable.put("ISO_8859-4", "ISO8859_4");
488: charsetTable.put("L4", "ISO8859_4");
489: charsetTable.put("LATIN4", "ISO8859_4");
490: charsetTable.put("CSISOLATINCYRILLIC", "ISO8859_5");
491: charsetTable.put("CYRILLIC", "ISO8859_5");
492: charsetTable.put("ISO-8859-5", "ISO8859_5");
493: charsetTable.put("ISO-IR-144", "ISO8859_5");
494: charsetTable.put("ISO_8859-5", "ISO8859_5");
495: charsetTable.put("ARABIC", "ISO8859_6");
496: charsetTable.put("ASMO-708", "ISO8859_6");
497: charsetTable.put("CSISOLATINARABIC", "ISO8859_6");
498: charsetTable.put("ECMA-114", "ISO8859_6");
499: charsetTable.put("ISO-8859-6", "ISO8859_6");
500: charsetTable.put("ISO-IR-127", "ISO8859_6");
501: charsetTable.put("ISO_8859-6", "ISO8859_6");
502: charsetTable.put("CSISOLATINGREEK", "ISO8859_7");
503: charsetTable.put("ECMA-118", "ISO8859_7");
504: charsetTable.put("ELOT_928", "ISO8859_7");
505: charsetTable.put("GREEK", "ISO8859_7");
506: charsetTable.put("GREEK8", "ISO8859_7");
507: charsetTable.put("ISO-8859-7", "ISO8859_7");
508: charsetTable.put("ISO-IR-126", "ISO8859_7");
509: charsetTable.put("ISO_8859-7", "ISO8859_7");
510: charsetTable.put("CSISOLATINHEBREW", "ISO8859_8");
511: charsetTable.put("HEBREW", "ISO8859_8");
512: charsetTable.put("ISO-8859-8", "ISO8859_8");
513: charsetTable.put("ISO-8859-8-I", "ISO8859_8");
514: charsetTable.put("ISO-IR-138", "ISO8859_8");
515: charsetTable.put("ISO_8859-8", "ISO8859_8");
516: charsetTable.put("CSISOLATIN5", "ISO8859_9");
517: charsetTable.put("ISO-8859-9", "ISO8859_9");
518: charsetTable.put("ISO-IR-148", "ISO8859_9");
519: charsetTable.put("ISO_8859-9", "ISO8859_9");
520: charsetTable.put("L5", "ISO8859_9");
521: charsetTable.put("LATIN5", "ISO8859_9");
522: charsetTable.put("CSISO2022JP", "JIS");
523: charsetTable.put("ISO-2022-JP", "JIS");
524: charsetTable.put("CSISO13JISC6220JP", "JIS0201");
525: charsetTable.put("X0201", "JIS0201");
526: charsetTable.put("CSISO87JISX0208", "JIS0208");
527: charsetTable.put("ISO-IR-87", "JIS0208");
528: charsetTable.put("X0208", "JIS0208");
529: charsetTable.put("X0208DBIJIS_X0208-1983", "JIS0208");
530: charsetTable.put("CSISO159JISX02121990", "JIS0212");
531: charsetTable.put("ISO-IR-159", "JIS0212");
532: charsetTable.put("X0212", "JIS0212");
533: charsetTable.put("CSKOI8R", "KOI8_R");
534: charsetTable.put("KOI8-R", "KOI8_R");
535: charsetTable.put("EUC-KR", "KSC5601");
536: charsetTable.put("CSWINDOWS31J", "MS932");
537: charsetTable.put("WINDOWS-31J", "MS932");
538: charsetTable.put("CSSHIFTJIS", "SJIS");
539: charsetTable.put("MS_KANJI", "SJIS");
540: charsetTable.put("SHIFT_JIS", "SJIS");
541: charsetTable.put("TIS-620", "TIS620");
542: charsetTable.put("UTF-16BE", "UNICODEBIG");
543: charsetTable.put("UTF-16LE", "UNICODELITTLE");
544: charsetTable.put("UTF-8", "UTF8");
545: }
546:
547: // Get the java name for a possibly standard character set name.
548: private String getJavaCharset(String charset) {
549: if (charset == null)
550: return null;
551: String xlated = (String) charsetTable
552: .get(charset.toUpperCase());
553: if (xlated != null)
554: return xlated;
555: else
556: return charset;
557: }
558:
559: // Guesses the encoding of a stream from the first 4 bytes
560: // All bytes read will be "unread" back into the stream
561: // Returns an encoding name or null if unknown
562: private String guessEncoding() {
563: if (bbufEnd < 4)
564: return null;
565: switch (bbuf[0]) {
566: case (byte) 0xEF:
567: if (bbuf[1] == (byte) 0xBB && bbuf[2] == (byte) 0xBF) {
568: bbufPos = 3; // Skip the Byte Order Mark
569: return "UTF-8";
570: } else
571: return null;
572: case (byte) '<': // UTF-8/ASCII/etc, UTF-16LE, or UCS-4
573: switch (bbuf[1]) {
574: case (byte) '?':
575: // UTF-8/ASCII/etc, but we're not sure which
576: if (bbuf[2] == (byte) 0x78 && bbuf[3] == (byte) 0x6D) {
577: useDeclaredEncoding = true;
578: return "UTF-8";
579: // there are many other charsets that
580: // would fall into this
581: } else
582: return null;
583: case (byte) 0x00:
584: if (bbuf[2] == (byte) '?' && bbuf[3] == (byte) 0x00)
585: return "UnicodeLittleUnmarked";
586: else if (bbuf[2] == (byte) 0x00
587: && bbuf[3] == (byte) 0x00)
588: return "UCS-4";
589: else
590: return null;
591: default:
592: return null;
593: }
594: case (byte) 0xFE: // UTF-16BE or UCS-4 unusual (3412)
595: if (bbuf[1] == (byte) 0xFF) {
596: if (bbuf[2] == (byte) 0x00 && bbuf[3] == (byte) 0x00) {
597: bbufPos = 4; // Skip the Byte Order Mark
598: return "UCS-4"; // Not supported by Java
599: } else {
600: bbufPos = 2; // Skip the Byte Order Mark
601: return "UnicodeBig";
602: }
603: } else
604: return null;
605: case (byte) 0xFF: // UTF-16LE or UCS-4LE
606: if (bbuf[1] == (byte) 0xFE) {
607: if (bbuf[2] == (byte) 0x00 && bbuf[3] == (byte) 0x00) {
608: bbufPos = 4; // Skip the Byte Order Mark
609: return "UCS-4"; // LE, not supported by Java
610: } else {
611: bbufPos = 2; // Skip the Byte Order Mark
612: return "UnicodeLittle";
613: }
614: } else
615: return null;
616: case (byte) 0x00: // UCS-4BE or UCS-4 unusual (2143),
617: // or if there's no BOM, UTF-16BE or UCS-4
618: switch (bbuf[1]) {
619: case (byte) 0x00:
620: if (bbuf[2] == (byte) 0xFE && bbuf[3] == (byte) 0xFF) {
621: bbufPos = 4; // Skip the Byte Order Mark
622: return "UCS-4"; // BE, unsupported by Java
623: } else if (bbuf[2] == (byte) 0xFF
624: && bbuf[3] == (byte) 0xFE) {
625: bbufPos = 4; // Skip the Byte Order Mark
626: return "UCS-4"; // Unusual (2143)
627: }
628: // UCS-4 without a byte order mark
629: else if ((bbuf[2] == (byte) '<' && bbuf[3] == (byte) 0x00)
630: || (bbuf[2] == (byte) 0x00 && bbuf[3] == (byte) '<'))
631: return "UCS-4";
632: else
633: return null;
634: case (byte) '<': // UCS-4 or UTF-16BE
635: if (bbuf[2] == (byte) 0x00 && bbuf[3] == (byte) '?')
636: return "UnicodeBigUnmarked";
637: else if (bbuf[2] == (byte) 0x00
638: && bbuf[3] == (byte) 0x00)
639: return "UCS-4";
640: else
641: return null;
642: default:
643: return null;
644: }
645: case (byte) 0x4C: // EBCDIC
646: if (bbuf[1] == (byte) 0x6F && bbuf[2] == (byte) 0xA7
647: && bbuf[3] == (byte) 0x94) {
648: useDeclaredEncoding = true;
649: return "Cp037";
650: } else
651: return null;
652: default: // Unknown
653: useDeclaredEncoding = true;
654: return null;
655: }
656: }
657:
658: /* Read [max] characters, parse the <?xml...?> tag
659: * push it back onto the stream. Create a reader. Then, if there was
660: * no error parsing the declaration, eat up the declaration.
661: */
662: private void processXMLDecl() throws IOException {
663: int initialBBufPos = bbufPos;
664: // Convert the byte buffer to characters
665: decoder.decodeXMLDecl(bbuf, bbufPos, bbufEnd - bbufPos, cbuf,
666: cbufPos, cbuf.length, decodeResult);
667: bbufPos += decodeResult[0];
668: cbufEnd = decodeResult[1];
669:
670: int numCharsParsed = parseXMLDeclaration(cbuf, 0, cbufEnd);
671:
672: if (numCharsParsed > 0) {
673: // Declaration found and parsed
674:
675: String declaredEncoding = getJavaCharset(getXMLDeclaredEncoding());
676:
677: // Skip the XML declaration unless told otherwise
678: if (!rewindDeclaration)
679: cbufPos += numCharsParsed;
680:
681: // If another encoding was specified, use it instead of the guess.
682: if (useDeclaredEncoding && (declaredEncoding != null)
683: && !declaredEncoding.equalsIgnoreCase(encoding)) {
684:
685: cbufPos = cbufEnd = 0;
686: decoder.reset();
687:
688: if (rewindDeclaration)
689: bbufPos = initialBBufPos;
690: else
691: bbufPos = numCharsParsed * minBytesPerChar;
692:
693: setEncoding(declaredEncoding);
694: }
695: }
696: }
697:
698: private interface XMLStreamDecoder {
699: public int read() throws IOException;
700:
701: public int read(char[] destbuf, int off, int len)
702: throws IOException;
703:
704: public boolean ready() throws IOException;
705:
706: public long skip(long n) throws IOException;
707:
708: public void reset() throws IOException;
709: }
710:
711: private class FastStreamDecoder implements XMLStreamDecoder {
712: public FastStreamDecoder() {
713: }
714:
715: public void reset() {
716: }
717:
718: public int read() throws IOException {
719: if (cbufEnd - cbufPos > 0)
720: return (int) cbuf[cbufPos++];
721: else {
722: cbufPos = cbufEnd = 0;
723: cbufEnd = read(cbuf, cbufPos, MAX_XML_DECL_CHARS);
724: if (cbufEnd > 0)
725: return (int) cbuf[cbufPos++];
726: else
727: return -1;
728: }
729: }
730:
731: public int read(char[] destbuf, int off, int len)
732: throws IOException {
733: int charsRead = 0;
734: // First copy any characters from the character buffer
735: if (cbufEnd - cbufPos > 0) {
736: int numToRead = Math.min(cbufEnd - cbufPos, len
737: - charsRead);
738: if (numToRead > 0) {
739: System.arraycopy(cbuf, cbufPos, destbuf, off,
740: numToRead);
741: charsRead += numToRead;
742: cbufPos += numToRead;
743: }
744: }
745: if (charsRead < len) {
746: if (bbufEnd - bbufPos < maxBytesPerChar) {
747: fillByteBuffer();
748: if (bbufEnd - bbufPos < minBytesPerChar)
749: return (charsRead <= 0 ? -1 : charsRead);
750: }
751: decoder.decode(bbuf, bbufPos, bbufEnd - bbufPos,
752: destbuf, off + charsRead, len - charsRead,
753: decodeResult);
754: bbufPos += decodeResult[0];
755: charsRead += decodeResult[1];
756: }
757: return ((charsRead == 0 && eofReached) ? -1 : charsRead);
758: }
759:
760: public boolean ready() throws IOException {
761: return ((cbufEnd - cbufPos > 0)
762: || (bbufEnd - bbufPos > maxBytesPerChar) || (in
763: .available() > 0));
764: }
765:
766: public long skip(long n) throws IOException {
767: long skipped = 0;
768: if (cbufEnd - cbufPos > 0) {
769: skipped = Math.min((long) cbufEnd - cbufPos, n);
770: cbufPos += skipped;
771: }
772: while (skipped < n) {
773: cbufPos = 0;
774: cbufEnd = read(cbuf, 0, MAX_XML_DECL_CHARS);
775: if (cbufEnd > 0) {
776: cbufPos = (int) Math.min((long) cbufEnd, n
777: - skipped);
778: skipped += cbufPos;
779: } else {
780: cbufEnd = 0;
781: return skipped;
782: }
783: }
784: return skipped;
785: }
786: }
787:
788: private class JavaStreamDecoder implements XMLStreamDecoder {
789: private Reader reader;
790: char[] oneCharBuffer = new char[1];
791: boolean sawCR;
792:
793: public JavaStreamDecoder() throws IOException {
794: }
795:
796: public void reset() throws IOException {
797: sawCR = false;
798:
799: if (bbufEnd - bbufPos > 0) {
800: PushbackInputStream pbIn = new PushbackInputStream(in,
801: bbufEnd - bbufPos);
802: pbIn.unread(bbuf, bbufPos, bbufEnd - bbufPos);
803: reader = new InputStreamReader(pbIn, encoding);
804: } else {
805: reader = new InputStreamReader(in, encoding);
806: }
807: }
808:
809: public int read() throws IOException {
810: while (true) {
811: int c = read(oneCharBuffer, 0, 1);
812: if (c > 0)
813: return oneCharBuffer[0];
814: else if (c < 0)
815: return c;
816: }
817: }
818:
819: public int read(char[] destbuf, int off, int len)
820: throws IOException {
821: int numChars = reader.read(destbuf, off, len);
822: int outpos = off;
823: int inpos;
824: char c;
825:
826: if (numChars < 0)
827: return numChars;
828:
829: for (int i = 0; i < numChars; i++) {
830: inpos = i + off;
831: c = destbuf[inpos];
832:
833: if (c >= 0x20) {
834: if ((c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
835: || (c >= 0x10000 && c <= 0x10FFFF)) {
836: sawCR = false;
837: if (inpos != outpos)
838: destbuf[outpos] = c;
839:
840: outpos++;
841: } else
842: throw new IllegalCharException(
843: "Illegal XML Character: 0x"
844: + Integer.toHexString(c));
845: } else {
846: switch (c) {
847: case '\n':
848: if (sawCR) {
849: sawCR = false;
850: } else
851: destbuf[outpos++] = '\n';
852: break;
853:
854: case '\r':
855: sawCR = true;
856: destbuf[outpos++] = '\n';
857: break;
858:
859: case '\t':
860: destbuf[outpos++] = '\t';
861: break;
862:
863: default:
864: System.out.println("Char: " + c + " ["
865: + (int) c + "]");
866:
867: throw new IllegalCharException(
868: "Illegal XML character: 0x"
869: + Integer.toHexString(c));
870: }
871: }
872: }
873:
874: return outpos - off;
875: }
876:
877: public boolean ready() throws IOException {
878: return reader.ready();
879: }
880:
881: public long skip(long n) throws IOException {
882: long skipped = 0;
883: while (skipped < n) {
884: cbufEnd = read(cbuf, 0, (int) Math.min(n,
885: MAX_XML_DECL_CHARS));
886: if (cbufEnd > 0)
887: skipped += cbufEnd;
888: else
889: return skipped;
890: }
891: return skipped;
892: }
893: }
894: }
|