001: /*
002: * Javolution - Java(TM) Solution for Real-Time and Embedded Systems
003: * Copyright (C) 2006 - Javolution (http://javolution.org/)
004: * All rights reserved.
005: *
006: * Permission to use, copy, modify, and distribute this software is
007: * freely granted, provided that this notice is preserved.
008: */
009: package javolution.io;
010:
011: import j2me.lang.IllegalStateException;
012: import j2me.io.CharConversionException;
013:
014: import java.io.IOException;
015: import java.io.InputStream;
016: import java.io.Reader;
017:
018: import javolution.lang.Reusable;
019: import javolution.text.Appendable;
020:
021: /**
022: * <p> This class represents a UTF-8 stream reader.</p>
023: *
024: * <p> This reader supports surrogate <code>char</code> pairs (representing
025: * characters in the range [U+10000 .. U+10FFFF]). It can also be used
026: * to read characters unicodes (31 bits) directly
027: * (ref. {@link #read()}).</p>
028: *
029: * <p> Each invocation of one of the <code>read()</code> methods may cause one
030: * or more bytes to be read from the underlying byte-input stream.
031: * To enable the efficient conversion of bytes to characters, more bytes may
032: * be read ahead from the underlying stream than are necessary to satisfy
033: * the current read operation.</p>
034: *
035: * <p> Instances of this class can be reused for different input streams
036: * and can be part of a higher level component (e.g. parser) in order
037: * to avoid dynamic buffer allocation when the input source changes.
038: * Also wrapping using a <code>java.io.BufferedReader</code> is unnescessary
039: * as instances of this class embed their own data buffers.</p>
040: *
041: * <p> Note: This reader is unsynchronized and does not test if the UTF-8
042: * encoding is well-formed (e.g. UTF-8 sequences longer than
043: * necessary to encode a character).</p>
044: *
045: * @author <a href="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
046: * @version 2.0, December 9, 2004
047: * @see UTF8StreamWriter
048: */
049: public final class UTF8StreamReader extends Reader implements Reusable {
050:
051: /**
052: * Holds the current input stream or <code>null</code> if closed.
053: */
054: private InputStream _inputStream;
055:
056: /**
057: * Holds the start index.
058: */
059: private int _start;
060:
061: /**
062: * Holds the end index.
063: */
064: private int _end;
065:
066: /**
067: * Holds the bytes buffer.
068: */
069: private final byte[] _bytes;
070:
071: /**
072: * Creates a UTF-8 reader having a byte buffer of moderate capacity (2048).
073: */
074: public UTF8StreamReader() {
075: _bytes = new byte[2048];
076: }
077:
078: /**
079: * Creates a UTF-8 reader having a byte buffer of specified capacity.
080: *
081: * @param capacity the capacity of the byte buffer.
082: */
083: public UTF8StreamReader(int capacity) {
084: _bytes = new byte[capacity];
085: }
086:
087: /**
088: * Sets the input stream to use for reading until this reader is closed.
089: * For example:[code]
090: * Reader reader = new UTF8StreamReader().setInput(inStream);
091: * [/code] is equivalent but reads twice as fast as [code]
092: * Reader reader = new j2me.io.InputStreamReader(inStream, "UTF-8");
093: * [/code]
094: *
095: * @param inStream the input stream.
096: * @return this UTF-8 reader.
097: * @throws IllegalStateException if this reader is being reused and
098: * it has not been {@link #close closed} or {@link #reset reset}.
099: */
100: public UTF8StreamReader setInput(InputStream inStream) {
101: if (_inputStream != null)
102: throw new IllegalStateException(
103: "Reader not closed or reset");
104: _inputStream = inStream;
105: return this ;
106: }
107:
108: /**
109: * Indicates if this stream is ready to be read.
110: *
111: * @return <code>true</code> if the next read() is guaranteed not to block
112: * for input; <code>false</code> otherwise.
113: * @throws IOException if an I/O error occurs.
114: */
115: public boolean ready() throws IOException {
116: if (_inputStream == null)
117: throw new IOException("Stream closed");
118: return ((_end - _start) > 0) || (_inputStream.available() != 0);
119: }
120:
121: /**
122: * Closes and {@link #reset resets} this reader for reuse.
123: *
124: * @throws IOException if an I/O error occurs.
125: */
126: public void close() throws IOException {
127: if (_inputStream != null) {
128: _inputStream.close();
129: reset();
130: }
131: }
132:
133: /**
134: * Reads a single character. This method will block until a character is
135: * available, an I/O error occurs or the end of the stream is reached.
136: *
137: * @return the 31-bits Unicode of the character read, or -1 if the end of
138: * the stream has been reached.
139: * @throws IOException if an I/O error occurs.
140: */
141: public int read() throws IOException {
142: byte b = _bytes[_start];
143: return ((b >= 0) && (_start++ < _end)) ? b : read2();
144: }
145:
146: // Reads one full character, blocks if necessary.
147: private int read2() throws IOException {
148: if (_start < _end) {
149: byte b = _bytes[_start++];
150:
151: // Decodes UTF-8.
152: if ((b >= 0) && (_moreBytes == 0)) {
153: // 0xxxxxxx
154: return b;
155: } else if (((b & 0xc0) == 0x80) && (_moreBytes != 0)) {
156: // 10xxxxxx (continuation byte)
157: _code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.
158: if (--_moreBytes == 0) {
159: return _code;
160: } else {
161: return read2();
162: }
163: } else if (((b & 0xe0) == 0xc0) && (_moreBytes == 0)) {
164: // 110xxxxx
165: _code = b & 0x1f;
166: _moreBytes = 1;
167: return read2();
168: } else if (((b & 0xf0) == 0xe0) && (_moreBytes == 0)) {
169: // 1110xxxx
170: _code = b & 0x0f;
171: _moreBytes = 2;
172: return read2();
173: } else if (((b & 0xf8) == 0xf0) && (_moreBytes == 0)) {
174: // 11110xxx
175: _code = b & 0x07;
176: _moreBytes = 3;
177: return read2();
178: } else if (((b & 0xfc) == 0xf8) && (_moreBytes == 0)) {
179: // 111110xx
180: _code = b & 0x03;
181: _moreBytes = 4;
182: return read2();
183: } else if (((b & 0xfe) == 0xfc) && (_moreBytes == 0)) {
184: // 1111110x
185: _code = b & 0x01;
186: _moreBytes = 5;
187: return read2();
188: } else {
189: throw new CharConversionException(
190: "Invalid UTF-8 Encoding");
191: }
192: } else { // No more bytes in buffer.
193: if (_inputStream == null)
194: throw new IOException(
195: "No input stream or stream closed");
196: _start = 0;
197: _end = _inputStream.read(_bytes, 0, _bytes.length);
198: if (_end > 0) {
199: return read2(); // Continues.
200: } else { // Done.
201: if (_moreBytes == 0) {
202: return -1;
203: } else { // Incomplete sequence.
204: throw new CharConversionException(
205: "Unexpected end of stream");
206: }
207: }
208: }
209: }
210:
211: private int _code;
212:
213: private int _moreBytes;
214:
215: /**
216: * Reads characters into a portion of an array. This method will block
217: * until some input is available, an I/O error occurs or the end of
218: * the stream is reached.
219: *
220: * <p> Note: Characters between U+10000 and U+10FFFF are represented
221: * by surrogate pairs (two <code>char</code>).</p>
222: *
223: * @param cbuf the destination buffer.
224: * @param off the offset at which to start storing characters.
225: * @param len the maximum number of characters to read
226: * @return the number of characters read, or -1 if the end of the
227: * stream has been reached
228: * @throws IOException if an I/O error occurs.
229: */
230: public int read(char cbuf[], int off, int len) throws IOException {
231: if (_inputStream == null)
232: throw new IOException("No input stream or stream closed");
233: if (_start >= _end) { // Fills buffer.
234: _start = 0;
235: _end = _inputStream.read(_bytes, 0, _bytes.length);
236: if (_end <= 0) { // Done.
237: return _end;
238: }
239: }
240: final int off_plus_len = off + len;
241: for (int i = off; i < off_plus_len;) {
242: // assert(_start < _end)
243: byte b = _bytes[_start];
244: if ((b >= 0) && (++_start < _end)) {
245: cbuf[i++] = (char) b; // Most common case.
246: } else if (b < 0) {
247: if (i < off_plus_len - 1) { // Up to two 'char' can be read.
248: int code = read2();
249: if (code < 0x10000) {
250: cbuf[i++] = (char) code;
251: } else if (code <= 0x10ffff) { // Surrogates.
252: cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);
253: cbuf[i++] = (char) (((code - 0x10000) & 0x3ff) + 0xdc00);
254: } else {
255: throw new CharConversionException(
256: "Cannot convert U+"
257: + Integer.toHexString(code)
258: + " to char (code greater than U+10FFFF)");
259: }
260: if (_start < _end) {
261: continue;
262: }
263: }
264: return i - off;
265: } else { // End of buffer (_start >= _end).
266: cbuf[i++] = (char) b;
267: return i - off;
268: }
269: }
270: return len;
271: }
272:
273: /**
274: * Reads characters into the specified appendable. This method will block
275: * until the end of the stream is reached.
276: *
277: * @param dest the destination buffer.
278: * @throws IOException if an I/O error occurs.
279: */
280: public void read(Appendable dest) throws IOException {
281: if (_inputStream == null)
282: throw new IOException("No input stream or stream closed");
283: while (true) {
284: if (_start >= _end) { // Fills buffer.
285: _start = 0;
286: _end = _inputStream.read(_bytes, 0, _bytes.length);
287: if (_end <= 0) { // Done.
288: break;
289: }
290: }
291: byte b = _bytes[_start];
292: if (b >= 0) {
293: dest.append((char) b); // Most common case.
294: _start++;
295: } else {
296: int code = read2();
297: if (code < 0x10000) {
298: dest.append((char) code);
299: } else if (code <= 0x10ffff) { // Surrogates.
300: dest
301: .append((char) (((code - 0x10000) >> 10) + 0xd800));
302: dest
303: .append((char) (((code - 0x10000) & 0x3ff) + 0xdc00));
304: } else {
305: throw new CharConversionException(
306: "Cannot convert U+"
307: + Integer.toHexString(code)
308: + " to char (code greater than U+10FFFF)");
309: }
310: }
311: }
312: }
313:
314: // Implements Reusable.
315: public void reset() {
316: _code = 0;
317: _end = 0;
318: _inputStream = null;
319: _moreBytes = 0;
320: _start = 0;
321: }
322:
323: /**
324: * @deprecated Replaced by {@link #setInput(InputStream)}
325: */
326: public UTF8StreamReader setInputStream(InputStream inStream) {
327: return this.setInput(inStream);
328: }
329: }
|