001: /*
002: Copyright (c) 2004, Dennis M. Sosnoski.
003: All rights reserved.
004:
005: Redistribution and use in source and binary forms, with or without modification,
006: are permitted provided that the following conditions are met:
007:
008: * Redistributions of source code must retain the above copyright notice, this
009: list of conditions and the following disclaimer.
010: * Redistributions in binary form must reproduce the above copyright notice,
011: this list of conditions and the following disclaimer in the documentation
012: and/or other materials provided with the distribution.
013: * Neither the name of JiBX nor the names of its contributors may be used
014: to endorse or promote products derived from this software without specific
015: prior written permission.
016:
017: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
018: ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
019: WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
021: ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: */
028:
029: package org.jibx.runtime.impl;
030:
031: import java.io.IOException;
032: import java.io.InputStream;
033: import java.io.InputStreamReader;
034: import java.io.Reader;
035:
/**
 * Wrapper for input stream that supports multiple character encodings. This is
 * needed because the XPP3 pull parser does not support detecting the character
 * encoding for a document based on the content of the document. If used with a
 * common encoding (UTF-8, ISO-8859-1 or ASCII) this performs the conversion to
 * characters using an inner reader class; otherwise, this creates the
 * appropriate reader type by wrapping the buffered bytes in an
 * {@link InputStreamReader}.
 * <p>
 * Instances are reusable (see {@link #setInput(InputStream)}) but hold mutable
 * buffer state and are not safe for use from multiple threads.
 *
 * @author Dennis M. Sosnoski
 * @version 1.0
 */
public class InputStreamWrapper {
    /** Default input buffer size. */
    private static final int BUFFER_SIZE = 2048;

    /** Encoding assumed when nothing more specific can be detected. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Message used for all malformed or truncated UTF-8 input. */
    private static final String UTF8_ERROR = "UTF-8 conversion error";

    /** Longest UTF-8 byte sequence handled by the UTF-8 reader. */
    private static final int MAX_UTF8_LENGTH = 4;

    /** Name of encoding to be used for stream. */
    private String m_encodingName;

    /** Stream for byte input. */
    private InputStream m_stream;

    /** Flag for end of stream reached. */
    private boolean m_isEnd;

    /** Buffer for input bytes. */
    private final byte[] m_buffer;

    /** Offset past end of bytes in buffer. */
    private int m_endOffset;

    /** Current offset for generating character from buffer. */
    private int m_emptyOffset;

    /** Scan position offset used for lookahead in buffer. */
    private int m_scanOffset;

    /**
     * Constructor. Allocates the fixed-size input buffer; no stream is
     * attached until one of the <code>setInput</code> methods is called.
     */
    public InputStreamWrapper() {
        m_buffer = new byte[BUFFER_SIZE];
    }

    /**
     * Set input stream with encoding to be defined later. If an input stream is
     * currently open when this is called the existing stream is closed, with
     * any errors ignored.
     *
     * @param ins stream for document data input
     */
    public void setInput(InputStream ins) {
        try {
            close();
        } catch (IOException ignored) {
            // deliberately ignored: the old stream is being discarded anyway
        }
        m_stream = ins;
        reset();
    }

    /**
     * Set input stream with specified encoding. If an input stream is currently
     * open when this is called the existing stream is closed, with any errors
     * ignored.
     *
     * @param ins stream for document data input
     * @param enc character encoding used for input from stream
     * (<code>null</code> if to be determined from XML input)
     * @throws IOException if the encoding cannot be set
     */
    public void setInput(InputStream ins, String enc) throws IOException {
        setInput(ins);
        setEncoding(enc);
    }

    /**
     * Set encoding for stream. This call is only valid if the encoding has not
     * been set previously (either explicitly or by detection in
     * {@link #getReader}). The name itself is not validated here; an
     * unsupported name is reported when the reader is created.
     *
     * @param enc character encoding used for input from stream
     * (<code>null</code> if to be determined from XML input)
     * @throws IOException if encoding already set
     */
    public void setEncoding(String enc) throws IOException {
        if (m_encodingName != null) {
            throw new IOException("Encoding has already been set for stream");
        }
        m_encodingName = enc;
    }

    /**
     * Reads data into the buffer. Any retained data is first copied down to the
     * start of the buffer array. Next, data is read from the wrapped stream
     * into the available space in the buffer. The actual number of bytes
     * read by a call to this method is normally between one and the space
     * available in the buffer array.
     *
     * @return <code>true</code> if the wrapped stream was read without hitting
     * end of input, <code>false</code> if the end of the stream has been
     * reached or the buffer has no free space
     * @throws IOException on error reading from wrapped stream, or if no
     * stream is currently set
     */
    private boolean fillBuffer() throws IOException {
        if (m_isEnd) {
            return false;
        }
        if (m_stream == null) {
            throw new IOException("No input stream available");
        }

        // move remaining data in buffer down to start
        int rem = m_endOffset - m_emptyOffset;
        if (rem > 0 && m_emptyOffset > 0) {
            System.arraycopy(m_buffer, m_emptyOffset, m_buffer, 0, rem);
        }
        m_emptyOffset = 0;
        m_endOffset = rem;

        // a zero-length read would return 0 and look like progress forever
        int max = m_buffer.length - rem;
        if (max <= 0) {
            return false;
        }

        // read to maximum capacity of buffer
        int actual = m_stream.read(m_buffer, rem, max);
        if (actual < 0) {
            m_isEnd = true;
            return false;
        }
        m_endOffset = rem + actual;
        return true;
    }

    /**
     * Reads data into the buffer to at least a minimum number of bytes. Any
     * retained data is first copied down to the start of the buffer array.
     * Next, data is read from the wrapped stream into the available space in
     * the buffer until the end of the input stream is reached or at least the
     * requested number of bytes are present in the buffer.
     *
     * @param min number of bytes required (counted from the current empty
     * offset)
     * @return <code>true</code> if buffer contains at least the required byte
     * count on return, <code>false</code> if not (end of stream, or request
     * larger than the buffer can ever hold)
     * @throws IOException on error reading from wrapped stream
     */
    private boolean require(int min) throws IOException {
        if (min > m_buffer.length) {
            // can never be satisfied; without this check the loop never ends
            return false;
        }
        while (m_endOffset - m_emptyOffset < min) {
            if (!fillBuffer()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if a character is XML whitespace.
     *
     * @param chr character to be checked
     * @return <code>true</code> if whitespace, <code>false</code> if not
     */
    private boolean isWhite(int chr) {
        return chr == ' ' || chr == 0x09 || chr == 0x0A || chr == 0x0D;
    }

    /**
     * Get the number of bytes in the UTF-8 sequence introduced by a lead byte.
     * Lead bytes that are not valid UTF-8 are leniently treated as starting a
     * three-byte sequence.
     *
     * @param lead first byte of sequence (sign-extended byte value)
     * @return number of bytes in sequence, including the lead byte
     */
    private static int utf8SequenceLength(int lead) {
        if (lead >= 0) {
            return 1;
        } else if ((lead & 0xE0) == 0xC0) {
            return 2;
        } else if ((lead & 0xF8) == 0xF0) {
            return 4;
        } else {
            return 3;
        }
    }

    /**
     * Reads a space or equals ('=') delimited token from the scan position in
     * the buffer. This treats bytes in the buffer as equivalent to characters.
     * Besides ending a token on a delimiter, it also ends a token after adding
     * a greater-than ('>') character.
     *
     * @return token read from buffer, or <code>null</code> if the end of the
     * input (or of the buffer) is reached before the token is complete
     * @throws IOException on error reading from wrapped stream
     */
    private String scanToken() throws IOException {
        boolean skipping = true;
        StringBuffer buff = new StringBuffer();
        while (require(m_scanOffset + 1)) {
            char chr = (char) (m_buffer[m_scanOffset++] & 0xFF);
            if (skipping) {
                if (!isWhite(chr)) {
                    skipping = false;
                    buff.append(chr);
                    if (chr == '=') {
                        return buff.toString();
                    }
                }
            } else if (isWhite(chr) || chr == '=') {
                // leave the delimiter in place for the next scan
                m_scanOffset--;
                return buff.toString();
            } else {
                buff.append(chr);
                if (chr == '>') {
                    return buff.toString();
                }
            }
        }
        return null;
    }

    /**
     * Reads a quote delimited token from the scan position in the buffer. This
     * treats bytes in the buffer as equivalent to characters, and skips past
     * any leading whitespace.
     *
     * @return token read from buffer (without the quotes), or
     * <code>null</code> if the next non-whitespace character is not a quote or
     * the closing quote is not found
     * @throws IOException on error reading from wrapped stream
     */
    private String scanQuoted() throws IOException {
        boolean skipping = true;
        int quot = 0;
        StringBuffer buff = new StringBuffer();
        while (require(m_scanOffset + 1)) {
            char chr = (char) (m_buffer[m_scanOffset++] & 0xFF);
            if (skipping) {
                if (!isWhite(chr)) {
                    if (chr == '"' || chr == '\'') {
                        skipping = false;
                        quot = chr;
                    } else {
                        break;
                    }
                }
            } else if (chr == quot) {
                return buff.toString();
            } else {
                buff.append(chr);
            }
        }
        return null;
    }

    /**
     * Scans an XML declaration at the start of the buffer for an
     * <code>encoding</code> pseudo-attribute, reading the bytes as single-byte
     * characters.
     *
     * @return declared encoding name, or <code>null</code> if none found
     * @throws IOException on error reading from wrapped stream
     */
    private String scanDeclaredEncoding() throws IOException {

        // start just past the "<?" so the first token is the target name
        m_scanOffset = 2;
        if (!"xml".equals(scanToken())) {
            return null;
        }
        String token;
        while ((token = scanToken()) != null && !"?>".equals(token)) {
            if ("encoding".equals(token)) {
                if ("=".equals(scanToken())) {
                    String name = scanQuoted();
                    if (name != null) {
                        return name;
                    }
                }
            } else if ("=".equals(token)) {
                // skip over the value of any other pseudo-attribute
                scanQuoted();
            }
        }
        return null;
    }

    /**
     * Examines the first four bytes of the input (and the XML declaration, if
     * one is present) to determine the document encoding.
     *
     * @return detected encoding name, or <code>null</code> if nothing specific
     * was recognized
     * @throws IOException on error reading from wrapped stream
     */
    private String detectEncoding() throws IOException {
        if (!require(4)) {
            return null;
        }

        // get first four bytes for initial determination
        int bom = ((m_buffer[0] & 0xFF) << 24) | ((m_buffer[1] & 0xFF) << 16)
            | ((m_buffer[2] & 0xFF) << 8) | (m_buffer[3] & 0xFF);
        if (bom == 0x3C3F786D) {

            // "<?xm": read encoding declaration with single byte characters
            return scanDeclaredEncoding();

        } else if (bom == 0x0000FEFF || bom == 0xFFFE0000
            || bom == 0x0000FFFE || bom == 0xFEFF0000) {

            // just use generic UCS-4 and let the libraries figure it out
            // NOTE(review): confirm the JVM in use resolves this charset name
            return "UCS-4";

        } else if ((bom & 0xFFFFFF00) == 0xEFBBBF00) {

            // UTF-8 as specified by byte order mark
            return "UTF-8";

        }
        int upper = bom & 0xFFFF0000;
        if (upper == 0xFEFF0000 || bom == 0x003C003F) {

            // assume UTF-16BE for 16-bit BE
            return "UTF-16BE";

        } else if (upper == 0xFFFE0000 || bom == 0x3C003F00) {

            // assume UTF-16LE for 16-bit LE
            return "UTF-16LE";

        } else if (bom == 0x4C6FA794) {

            // just because we can, even though nobody should
            // NOTE(review): confirm the JVM in use resolves this charset name
            return "EBCDIC";
        }
        return null;
    }

    /**
     * Get reader for wrapped input stream. This creates and returns a reader
     * using the appropriate encoding, if necessary reading and examining the
     * first part of the stream (including the XML declaration, if present) to
     * determine the encoding. UTF-8 is used when no encoding was set and none
     * can be detected.
     *
     * @return reader for the document characters
     * @throws IOException if error reading from document or creating a reader
     * for the encoding found
     */
    public Reader getReader() throws IOException {

        // check if we need to determine an encoding
        if (m_encodingName == null) {
            m_encodingName = DEFAULT_ENCODING;
            String detected = detectEncoding();
            if (detected != null) {
                m_encodingName = detected;
            }
        }
        if (m_encodingName.equalsIgnoreCase("UTF-8")) {
            return new WrappedStreamUTF8Reader();
        } else if (m_encodingName.equalsIgnoreCase("ISO-8859-1")
            || m_encodingName.equalsIgnoreCase("ASCII")) {
            return new WrappedStreamISO88591Reader();
        } else {
            return new InputStreamReader(new WrappedStream(), m_encodingName);
        }
    }

    /**
     * Get encoding for input document. This call may not return an accurate
     * result until after {@link #getReader} is called.
     *
     * @return character encoding for input document
     */
    public String getEncoding() {
        return m_encodingName;
    }

    /**
     * Close document input. Completes reading of document input, including
     * closing the input medium. The wrapper state is reset even if closing the
     * underlying stream fails.
     *
     * @throws IOException on error closing document
     */
    public void close() throws IOException {
        InputStream stream = m_stream;
        m_stream = null;
        try {
            if (stream != null) {
                stream.close();
            }
        } finally {
            reset();
        }
    }

    /**
     * Reset to initial state for reuse. Discards any buffered data and the
     * current encoding; the wrapped stream reference is left unchanged.
     */
    public void reset() {
        m_isEnd = false;
        m_endOffset = 0;
        m_emptyOffset = 0;
        m_scanOffset = 0;
        m_encodingName = null;
    }

    /**
     * Stream that just uses the enclosing class to buffer input from the
     * wrapped stream.
     */
    private class WrappedStream extends InputStream {
        /**
         * Get count of bytes buffered plus those the wrapped stream reports
         * as available.
         *
         * @see java.io.InputStream#available()
         */
        public int available() throws IOException {
            int buffered = m_endOffset - m_emptyOffset;
            return m_stream == null ? buffered
                : buffered + m_stream.available();
        }

        /**
         * Close the enclosing wrapper (and with it the wrapped stream).
         *
         * @see java.io.InputStream#close()
         */
        public void close() throws IOException {
            InputStreamWrapper.this.close();
        }

        /**
         * Read bytes, first from the buffer and then by refilling it. This
         * keeps reading until the requested count is supplied or the end of
         * the stream is reached.
         *
         * @see java.io.InputStream#read(byte[], int, int)
         */
        public int read(byte[] b, int off, int len) throws IOException {
            int avail;
            int actual = 0;
            while (len > (avail = m_endOffset - m_emptyOffset)) {

                // hand over everything buffered, then go back for more
                System.arraycopy(m_buffer, m_emptyOffset, b, off, avail);
                off += avail;
                len -= avail;
                actual += avail;
                m_emptyOffset = m_endOffset = 0;
                if (!fillBuffer()) {
                    return actual == 0 ? -1 : actual;
                }
            }
            System.arraycopy(m_buffer, m_emptyOffset, b, off, len);
            m_emptyOffset += len;
            return actual + len;
        }

        /**
         * @see java.io.InputStream#read(byte[])
         */
        public int read(byte[] b) throws IOException {
            return read(b, 0, b.length);
        }

        /**
         * Skip bytes, consuming buffered data before skipping in the wrapped
         * stream.
         *
         * @see java.io.InputStream#skip(long)
         */
        public long skip(long n) throws IOException {
            if (n <= 0) {
                return 0;
            }
            int avail = m_endOffset - m_emptyOffset;
            if (n < (long) avail) {
                m_emptyOffset += (int) n;
                return n;
            }

            // everything buffered is skipped, so it must be discarded
            m_emptyOffset = m_endOffset;
            long skipped = avail;
            if (n > (long) avail && !m_isEnd && m_stream != null) {
                skipped += m_stream.skip(n - avail);
            }
            return skipped;
        }

        /**
         * Read a single byte as an unsigned value.
         *
         * @see java.io.InputStream#read()
         */
        public int read() throws IOException {
            if (m_emptyOffset >= m_endOffset && !require(1)) {
                return -1;
            }

            // mask so that bytes above 0x7F are not mistaken for end of stream
            return m_buffer[m_emptyOffset++] & 0xFF;
        }
    }

    /**
     * Reader for input stream using UTF-8 encoding. This uses the enclosing
     * class to buffer input from the stream, interpreting it as characters on
     * demand. Four-byte sequences are returned as surrogate pairs. Lead and
     * continuation bytes are not strictly validated.
     */
    private class WrappedStreamUTF8Reader extends Reader {
        /** Low surrogate waiting to be returned (zero if none). */
        private int m_pendingLow;

        /** Scratch array used for single character reads. */
        private final char[] m_single = new char[1];

        /**
         * Close the enclosing wrapper (and with it the wrapped stream).
         *
         * @see java.io.Reader#close()
         */
        public void close() throws IOException {
            InputStreamWrapper.this.close();
        }

        /**
         * Convert bytes to characters. This keeps reading until the requested
         * count is supplied or the end of the stream is reached.
         *
         * @throws IOException on read error, or if the input ends inside a
         * multi-byte sequence or encodes a value beyond U+10FFFF
         * @see java.io.Reader#read(char[], int, int)
         */
        public int read(char[] b, int off, int len) throws IOException {

            // load up local variables for conversion loop
            int start = off;
            int end = off + len;
            if (m_pendingLow != 0 && off < end) {
                b[off++] = (char) m_pendingLow;
                m_pendingLow = 0;
            }
            int empty = m_emptyOffset;
            byte[] buff = m_buffer;
            while (off < end) {

                // take care near end of buffer, where a sequence may be split
                if (m_endOffset - empty < MAX_UTF8_LENGTH) {
                    m_emptyOffset = empty;
                    if (!require(1)) {
                        int actual = off - start;
                        return actual > 0 ? actual : -1;
                    }
                    int need = utf8SequenceLength(buff[m_emptyOffset]);
                    if (!require(need)) {
                        throw new IOException(UTF8_ERROR);
                    }

                    // filling the buffer may have moved the data down
                    empty = m_emptyOffset;
                }

                // check for single-byte vs multi-byte character next
                int byt = buff[empty++];
                if (byt >= 0) {

                    // single-byte character, just store to output array
                    b[off++] = (char) byt;

                } else if ((byt & 0xE0) == 0xC0) {

                    // double-byte character
                    b[off++] = (char) (((byt & 0x1F) << 6)
                        | (buff[empty++] & 0x3F));

                } else if ((byt & 0xF8) == 0xF0) {

                    // four-byte character, stored as surrogate pair
                    int value = ((byt & 0x07) << 18)
                        | ((buff[empty] & 0x3F) << 12)
                        | ((buff[empty + 1] & 0x3F) << 6)
                        | (buff[empty + 2] & 0x3F);
                    empty += 3;
                    if (value < 0x10000) {
                        b[off++] = (char) value;
                    } else if (value > 0x10FFFF) {
                        m_emptyOffset = empty;
                        throw new IOException(UTF8_ERROR);
                    } else {
                        value -= 0x10000;
                        b[off++] = (char) (0xD800 + (value >> 10));
                        int low = 0xDC00 + (value & 0x3FF);
                        if (off < end) {
                            b[off++] = (char) low;
                        } else {
                            // no room left, hold it for the next call
                            m_pendingLow = low;
                        }
                    }

                } else {

                    // three-byte character
                    int byt2 = buff[empty++] & 0x3F;
                    b[off++] = (char) (((byt & 0x0F) << 12) | (byt2 << 6)
                        | (buff[empty++] & 0x3F));
                }
            }
            m_emptyOffset = empty;
            return len;
        }

        /**
         * @see java.io.Reader#read(char[])
         */
        public int read(char[] b) throws IOException {
            return read(b, 0, b.length);
        }

        /**
         * Read a single character, using the same conversion as the array
         * read so the two can never disagree.
         *
         * @see java.io.Reader#read()
         */
        public int read() throws IOException {
            return read(m_single, 0, 1) < 0 ? -1 : m_single[0];
        }

        /**
         * Check if a complete character is already buffered.
         *
         * @see java.io.Reader#ready()
         */
        public boolean ready() throws IOException {
            if (m_pendingLow != 0) {
                return true;
            }
            int avail = m_endOffset - m_emptyOffset;
            return avail > 0
                && avail >= utf8SequenceLength(m_buffer[m_emptyOffset]);
        }
    }

    /**
     * Reader for input stream using ISO8859-1 encoding. This uses the enclosing
     * class to buffer input from the stream, interpreting it as characters on
     * demand.
     */
    private class WrappedStreamISO88591Reader extends Reader {
        /**
         * Close the enclosing wrapper (and with it the wrapped stream).
         *
         * @see java.io.Reader#close()
         */
        public void close() throws IOException {
            InputStreamWrapper.this.close();
        }

        /**
         * Convert bytes to characters one for one. This keeps reading until
         * the requested count is supplied or the end of the stream is reached.
         *
         * @see java.io.Reader#read(char[], int, int)
         */
        public int read(char[] b, int off, int len) throws IOException {

            // load up local variables for conversion loop
            int end = off + len;
            int empty = m_emptyOffset;
            byte[] buff = m_buffer;
            while (off < end) {

                // make sure there's data in buffer
                int avail = m_endOffset - empty;
                if (avail == 0) {
                    m_emptyOffset = empty;
                    if (!fillBuffer()) {
                        int actual = len - (end - off);
                        return actual > 0 ? actual : -1;
                    }
                    empty = m_emptyOffset;
                    avail = m_endOffset - empty;
                }

                // convert as many bytes as wanted and available
                int limit = empty + Math.min(end - off, avail);
                while (empty < limit) {
                    b[off++] = (char) (buff[empty++] & 0xFF);
                }
            }
            m_emptyOffset = empty;
            return len;
        }

        /**
         * @see java.io.Reader#read(char[])
         */
        public int read(char[] b) throws IOException {
            return read(b, 0, b.length);
        }

        /**
         * Read a single character.
         *
         * @see java.io.Reader#read()
         */
        public int read() throws IOException {
            if (m_emptyOffset >= m_endOffset && !require(1)) {
                return -1;
            }
            return m_buffer[m_emptyOffset++] & 0xFF;
        }

        /**
         * Check if at least one byte is already buffered.
         *
         * @see java.io.Reader#ready()
         */
        public boolean ready() throws IOException {
            return m_emptyOffset < m_endOffset;
        }
    }
}
|