001: package uk.org.ponder.streamutil;
002:
003: import java.io.InputStream;
004: import java.io.Reader;
005: import java.io.IOException;
006:
007: import uk.org.ponder.stringutil.ByteToCharConverter;
008: import uk.org.ponder.stringutil.ByteToCharUTF8;
009: import uk.org.ponder.stringutil.EncodingErrorHandler;
010:
011: /**
012: * A more efficient and sane rendering of the standard Java <code>InputStreamReader</code>
013: * class.
014: * This version features:
015: * <br> No silly exceptions indicating lack of output space, but instead a proper streaming
016: * architecture
017: * <br> No locking overhead, clearly anyone is silly enough to read from the same reader from two
018: * different threads should do the locking himself.
019: * <br> Correct handling of UTF-16 in the decoder, the Sun version would overflow its buffer
020: * on receiving any surrogate pairs in a UTF-8 stream.
021: * <br> Proper official names for UTF-16 and UTF-8 formats.
022: * <br> Powerful error handling architecture allows users to actually tell where in the input
023: * stream the byte occured that failed conversion, Sun approach of randomly throwing exceptions
024: * and then packing it in was simply not good enough.
025: * <br> Implementation of Markus Kuhn's "erroneous byte in erroneous surrogate" error encapsulation
026: * strategy. This scheme allows proper rendering of erroneous bytes further on down the pipeline.
027: * @author Antranig Basman
028: */
029: public class DirectInputStreamReader extends Reader {
030: /*
031: * Substitution mode flag.
032: */
033: protected boolean subMode = true;
034:
035: InputStream inputstream;
036:
037: private ByteToCharConverter converter;
038:
039: /**
040: * Returns the character set id for the conversion
041: */
042: public String getCharacterEncoding() {
043: return converter.getCharacterEncoding();
044: }
045:
046: public DirectInputStreamReader() {
047: inputstream = null;
048: }
049:
050: public DirectInputStreamReader(InputStream inputstream) {
051: setInputStream(inputstream);
052: }
053:
054: public void setInputStream(InputStream inputstream) {
055: setInputStream(inputstream, "UTF-8",
056: StreamCopyUtil.PROCESS_BUFFER_SIZE);
057: }
058:
059: public void setInputStream(InputStream inputstream, String encoding) {
060: setInputStream(inputstream, encoding,
061: StreamCopyUtil.PROCESS_BUFFER_SIZE);
062: }
063:
064: public void setInputStream(InputStream inputstream,
065: String encoding, int buffersize) {
066: if (converter == null
067: || converter.getCharacterEncoding() != encoding) {
068: if (encoding == "UTF-8") {
069: converter = new ByteToCharUTF8();
070: }
071: }
072: converter.blastState();
073: converter.ensureInputBuffer(buffersize);
074: this .inputstream = inputstream;
075: }
076:
077: public void setEncodingErrorHandler(EncodingErrorHandler handler) {
078: if (converter != null) {
079: converter.setEncodingErrorHandler(handler);
080: }
081: }
082:
083: public void close() throws IOException {
084: if (inputstream != null) {
085: inputstream.close();
086: }
087: converter.blastState();
088: }
089:
090: private int acceptInput() throws IOException {
091: converter.swizzInputBuffer();
092:
093: byte[] inbuffer = converter.getInputBuffer();
094: int inbufferlimit = converter.getInputBufferLimit();
095:
096: int bytesread = inputstream.read(inbuffer, inbufferlimit,
097: inbuffer.length - inbufferlimit);
098: if (bytesread != -1) {
099: converter.increaseInputBufferLimit(bytesread);
100: }
101: // System.out.println("acceptInput read "+ bytesread+" bytes");
102: return bytesread;
103: }
104:
105: private int convertInternal() throws IOException {
106: int outbufferstart = converter.getOutputBufferPos();
107: int bytesread = 0;
108: // System.out.println("Beginning convertInternal at output buffer pos "+outbufferstart);
109: while (true) { // big loop attempts to fill up outbuffer
110: int stop_reason = converter.convert();
111: // when we get to here, break was either because sequence was incomplete,
112: // we have completely come to the end of input or else we are out of output buffer space.
113: if (stop_reason == ByteToCharConverter.STOP_OUTPUT_EXHAUSTED) {
114: break;
115: } else { // otherwise we must be short of input
116: // attempt to get more input. NB 1.2.2 InputStream contract says that this will
117: // return at least 1 byte if not at EOF, and a sequence may be longer than this.
118: bytesread = acceptInput(); // nb all inbuffer parameters blasted by this
119: // System.out.println("missing_bytes:" +converter.missing_bytes());
120: if (bytesread == -1) { // only break if at EOF
121: if (converter.missing_bytes() > 0) {
122: // if there is STILL not enough data, signal error
123: converter
124: .handleEncodingError("Premature end of input stream during "
125: + converter
126: .getCharacterEncoding()
127: + " sequence");
128: }
129: break;
130: }
131: // otherwise, there (may be) now sufficient data, if not now then after a few times
132: // round this loop if inputstream perversely returns the minimum 1 byte each time
133: // restart sequence and continue reading where we left off.
134: } // end if no useable bytes WERE in buffer
135: } // end loop over output buffer
136: // output buffer is full, or no more input remaining. Return number of useful
137: // bytes in output buffer.
138: int outputchars = converter.getOutputBufferPos()
139: - outbufferstart;
140: /*
141: System.out.println("convertInternal converted "+outputchars+" output characters, "
142: + bytesread +" of input read ");
143: */
144: // nb - why may we have output 0 chars and not be at EOF? Perhaps only because perverse
145: // bugger supplied < 2-element output array?
146: return (outputchars == 0 && bytesread == -1) ? bytesread
147: : outputchars;
148: }
149:
150: public int read(char[] outbuffer, int outbufferpos, int length)
151: throws IOException {
152: converter.setOutputBuffer(outbuffer, outbufferpos, outbufferpos
153: + length);
154: return convertInternal();
155: }
156:
157: }
|