001: /**
002: * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
003: * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
004: */package com.sun.xml.stream.xerces.impl.io;
005:
006: import com.sun.xml.stream.util.BufferAllocator;
007: import com.sun.xml.stream.util.ThreadLocalBufferAllocator;
008: import java.io.*;
009: import java.util.Locale;
010:
011: import com.sun.xml.stream.xerces.impl.msg.XMLMessageFormatter;
012: import com.sun.xml.stream.xerces.util.MessageFormatter;
013:
014: /**
015: * @author Andy Clark, IBM
016: *
017: * @version $Id: UTF8Reader.java,v 1.3 2006/11/28 18:50:05 spericas Exp $
018: */
019: public class UTF8Reader extends Reader {
020:
021: //
022: // Constants
023: //
024:
025: /** Default byte buffer size (2048). */
026: public static final int DEFAULT_BUFFER_SIZE = 2048;
027:
028: // debugging
029:
030: /** Debug read. */
031: private static final boolean DEBUG_READ = false;
032:
033: //
034: // Data
035: //
036:
037: /** Input stream. */
038: protected InputStream fInputStream;
039:
040: /** Byte buffer. */
041: protected byte[] fBuffer;
042:
043: /** Offset into buffer. */
044: protected int fOffset;
045:
046: /** Surrogate character. */
047: private int fSurrogate = -1;
048:
049: // message formatter; used to produce localized
050: // exception messages
051: private MessageFormatter fFormatter = null;
052:
053: //Locale to use for messages
054: private Locale fLocale = null;
055:
056: //
057: // Constructors
058: //
059:
060: /**
061: * Constructs a UTF-8 reader from the specified input stream
062: * using the default buffer size. Primarily for testing.
063: *
064: * @param inputStream The input stream.
065: */
066: public UTF8Reader(InputStream inputStream) {
067: this (inputStream, DEFAULT_BUFFER_SIZE,
068: new XMLMessageFormatter(), Locale.getDefault());
069: } // <init>(InputStream, MessageFormatter)
070:
071: /**
072: * Constructs a UTF-8 reader from the specified input stream
073: * using the default buffer size and the given MessageFormatter.
074: *
075: * @param inputStream The input stream.
076: * @param messageFormatter given MessageFormatter
077: * @param locale Locale to use for messages
078: */
079: public UTF8Reader(InputStream inputStream,
080: MessageFormatter messageFormatter, Locale locale) {
081: this (inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
082: } // <init>(InputStream, MessageFormatter)
083:
084: /**
085: * Constructs a UTF-8 reader from the specified input stream,
086: * buffer size and MessageFormatter.
087: *
088: * @param inputStream The input stream.
089: * @param size The initial buffer size.
090: * @param messageFormatter the formatter for localizing/formatting errors.
091: * @param locale the Locale to use for messages
092: */
093: public UTF8Reader(InputStream inputStream, int size,
094: MessageFormatter messageFormatter, Locale locale) {
095: fInputStream = inputStream;
096: fFormatter = messageFormatter;
097: fLocale = locale;
098: BufferAllocator ba = ThreadLocalBufferAllocator
099: .getBufferAllocator();
100: fBuffer = ba.getByteBuffer(size);
101: if (fBuffer == null) {
102: fBuffer = new byte[size];
103: }
104: } // <init>(InputStream,int, MessageFormatter)
105:
106: //
107: // Reader methods
108: //
109:
110: /**
111: * Read a single character. This method will block until a character is
112: * available, an I/O error occurs, or the end of the stream is reached.
113: *
114: * <p> Subclasses that intend to support efficient single-character input
115: * should override this method.
116: *
117: * @return The character read, as an integer in the range 0 to 16383
118: * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
119: * been reached
120: *
121: * @exception IOException If an I/O error occurs
122: */
123: public int read() throws IOException {
124:
125: // decode character
126: int c = fSurrogate;
127: if (fSurrogate == -1) {
128: // NOTE: We use the index into the buffer if there are remaining
129: // bytes from the last block read. -Ac
130: int index = 0;
131:
132: // get first byte
133: int b0 = index == fOffset ? fInputStream.read()
134: : fBuffer[index++] & 0x00FF;
135: if (b0 == -1) {
136: return -1;
137: }
138:
139: // UTF-8: [0xxx xxxx]
140: // Unicode: [0000 0000] [0xxx xxxx]
141: if (b0 < 0x80) {
142: c = (char) b0;
143: }
144:
145: // UTF-8: [110y yyyy] [10xx xxxx]
146: // Unicode: [0000 0yyy] [yyxx xxxx]
147: else if ((b0 & 0xE0) == 0xC0) {
148: int b1 = index == fOffset ? fInputStream.read()
149: : fBuffer[index++] & 0x00FF;
150: if (b1 == -1) {
151: expectedByte(2, 2);
152: }
153: if ((b1 & 0xC0) != 0x80) {
154: invalidByte(2, 2, b1);
155: }
156: c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
157: }
158:
159: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
160: // Unicode: [zzzz yyyy] [yyxx xxxx]
161: else if ((b0 & 0xF0) == 0xE0) {
162: int b1 = index == fOffset ? fInputStream.read()
163: : fBuffer[index++] & 0x00FF;
164: if (b1 == -1) {
165: expectedByte(2, 3);
166: }
167: if ((b1 & 0xC0) != 0x80) {
168: invalidByte(2, 3, b1);
169: }
170: int b2 = index == fOffset ? fInputStream.read()
171: : fBuffer[index++] & 0x00FF;
172: if (b2 == -1) {
173: expectedByte(3, 3);
174: }
175: if ((b2 & 0xC0) != 0x80) {
176: invalidByte(3, 3, b2);
177: }
178: c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
179: | (b2 & 0x003F);
180: }
181:
182: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
183: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
184: // [1101 11yy] [yyxx xxxx] (low surrogate)
185: // * uuuuu = wwww + 1
186: else if ((b0 & 0xF8) == 0xF0) {
187: int b1 = index == fOffset ? fInputStream.read()
188: : fBuffer[index++] & 0x00FF;
189: if (b1 == -1) {
190: expectedByte(2, 4);
191: }
192: if ((b1 & 0xC0) != 0x80) {
193: invalidByte(2, 3, b1);
194: }
195: int b2 = index == fOffset ? fInputStream.read()
196: : fBuffer[index++] & 0x00FF;
197: if (b2 == -1) {
198: expectedByte(3, 4);
199: }
200: if ((b2 & 0xC0) != 0x80) {
201: invalidByte(3, 3, b2);
202: }
203: int b3 = index == fOffset ? fInputStream.read()
204: : fBuffer[index++] & 0x00FF;
205: if (b3 == -1) {
206: expectedByte(4, 4);
207: }
208: if ((b3 & 0xC0) != 0x80) {
209: invalidByte(4, 4, b3);
210: }
211: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
212: if (uuuuu > 0x10) {
213: invalidSurrogate(uuuuu);
214: }
215: int wwww = uuuuu - 1;
216: int hs = 0xD800 | ((wwww << 6) & 0x03C0)
217: | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003);
218: int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
219: c = hs;
220: fSurrogate = ls;
221: }
222:
223: // error
224: else {
225: invalidByte(1, 1, b0);
226: }
227: }
228:
229: // use surrogate
230: else {
231: fSurrogate = -1;
232: }
233:
234: // return character
235: if (DEBUG_READ) {
236: System.out.println("read(): 0x" + Integer.toHexString(c));
237: }
238: return c;
239:
240: } // read():int
241:
242: /**
243: * Read characters into a portion of an array. This method will block
244: * until some input is available, an I/O error occurs, or the end of the
245: * stream is reached.
246: *
247: * @param ch Destination buffer
248: * @param offset Offset at which to start storing characters
249: * @param length Maximum number of characters to read
250: *
251: * @return The number of characters read, or -1 if the end of the
252: * stream has been reached
253: *
254: * @exception IOException If an I/O error occurs
255: */
256: public int read(char ch[], int offset, int length)
257: throws IOException {
258: // handle surrogate
259: int out = offset;
260: if (fSurrogate != -1) {
261: ch[offset + 1] = (char) fSurrogate;
262: fSurrogate = -1;
263: length--;
264: out++;
265: }
266:
267: // read bytes
268: int count = 0;
269: if (fOffset == 0) {
270: // adjust length to read
271: if (length > fBuffer.length) {
272: length = fBuffer.length;
273: }
274:
275: // perform read operation
276: count = fInputStream.read(fBuffer, 0, length);
277: if (count == -1) {
278: return -1;
279: }
280: count += out - offset;
281: }
282:
283: // skip read; last character was in error
284: // NOTE: Having an offset value other than zero means that there was
285: // an error in the last character read. In this case, we have
286: // skipped the read so we don't consume any bytes past the
287: // error. By signalling the error on the next block read we
288: // allow the method to return the most valid characters that
289: // it can on the previous block read. -Ac
290: else {
291: count = fOffset;
292: fOffset = 0;
293: }
294:
295: // convert bytes to characters
296: final int total = count;
297: boolean isAscii = true;
298: int lc = 0;
299:
300: for (lc = 0; lc < total; lc++) {
301: int b0 = fBuffer[lc] & 0x00FF;
302:
303: // UTF-8: [0xxx xxxx]
304: // Unicode: [0000 0000] [0xxx xxxx]
305: if (b0 < 0x80) {
306: ch[out++] = (char) b0;
307: } else {
308: isAscii = false;
309: break;
310: }
311: }
312:
313: if (isAscii) {
314: return count;
315: }
316:
317: for (int in = lc; in < total; in++) {
318: int b0 = fBuffer[in] & 0x00FF;
319:
320: // UTF-8: [0xxx xxxx]
321: // Unicode: [0000 0000] [0xxx xxxx]
322: if (b0 < 0x80) {
323: ch[out++] = (char) b0;
324: continue;
325: }
326:
327: // UTF-8: [110y yyyy] [10xx xxxx]
328: // Unicode: [0000 0yyy] [yyxx xxxx]
329: if ((b0 & 0xE0) == 0xC0) {
330: int b1 = -1;
331: if (++in < total) {
332: b1 = fBuffer[in] & 0x00FF;
333: } else {
334: b1 = fInputStream.read();
335: if (b1 == -1) {
336: if (out > offset) {
337: fBuffer[0] = (byte) b0;
338: fOffset = 1;
339: return out - offset;
340: }
341: expectedByte(2, 2);
342: }
343: count++;
344: }
345: if ((b1 & 0xC0) != 0x80) {
346: if (out > offset) {
347: fBuffer[0] = (byte) b0;
348: fBuffer[1] = (byte) b1;
349: fOffset = 2;
350: return out - offset;
351: }
352: invalidByte(2, 2, b1);
353: }
354: int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
355: ch[out++] = (char) c;
356: count -= 1;
357: continue;
358: }
359:
360: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
361: // Unicode: [zzzz yyyy] [yyxx xxxx]
362: if ((b0 & 0xF0) == 0xE0) {
363: int b1 = -1;
364: if (++in < total) {
365: b1 = fBuffer[in] & 0x00FF;
366: } else {
367: b1 = fInputStream.read();
368: if (b1 == -1) {
369: if (out > offset) {
370: fBuffer[0] = (byte) b0;
371: fOffset = 1;
372: return out - offset;
373: }
374: expectedByte(2, 3);
375: }
376: count++;
377: }
378: if ((b1 & 0xC0) != 0x80) {
379: if (out > offset) {
380: fBuffer[0] = (byte) b0;
381: fBuffer[1] = (byte) b1;
382: fOffset = 2;
383: return out - offset;
384: }
385: invalidByte(2, 3, b1);
386: }
387: int b2 = -1;
388: if (++in < total) {
389: b2 = fBuffer[in] & 0x00FF;
390: } else {
391: b2 = fInputStream.read();
392: if (b2 == -1) {
393: if (out > offset) {
394: fBuffer[0] = (byte) b0;
395: fBuffer[1] = (byte) b1;
396: fOffset = 2;
397: return out - offset;
398: }
399: expectedByte(3, 3);
400: }
401: count++;
402: }
403: if ((b2 & 0xC0) != 0x80) {
404: if (out > offset) {
405: fBuffer[0] = (byte) b0;
406: fBuffer[1] = (byte) b1;
407: fBuffer[2] = (byte) b2;
408: fOffset = 3;
409: return out - offset;
410: }
411: invalidByte(3, 3, b2);
412: }
413: int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
414: | (b2 & 0x003F);
415: ch[out++] = (char) c;
416: count -= 2;
417: continue;
418: }
419:
420: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
421: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
422: // [1101 11yy] [yyxx xxxx] (low surrogate)
423: // * uuuuu = wwww + 1
424: if ((b0 & 0xF8) == 0xF0) {
425: int b1 = -1;
426: if (++in < total) {
427: b1 = fBuffer[in] & 0x00FF;
428: } else {
429: b1 = fInputStream.read();
430: if (b1 == -1) {
431: if (out > offset) {
432: fBuffer[0] = (byte) b0;
433: fOffset = 1;
434: return out - offset;
435: }
436: expectedByte(2, 4);
437: }
438: count++;
439: }
440: if ((b1 & 0xC0) != 0x80) {
441: if (out > offset) {
442: fBuffer[0] = (byte) b0;
443: fBuffer[1] = (byte) b1;
444: fOffset = 2;
445: return out - offset;
446: }
447: invalidByte(2, 4, b1);
448: }
449: int b2 = -1;
450: if (++in < total) {
451: b2 = fBuffer[in] & 0x00FF;
452: } else {
453: b2 = fInputStream.read();
454: if (b2 == -1) {
455: if (out > offset) {
456: fBuffer[0] = (byte) b0;
457: fBuffer[1] = (byte) b1;
458: fOffset = 2;
459: return out - offset;
460: }
461: expectedByte(3, 4);
462: }
463: count++;
464: }
465: if ((b2 & 0xC0) != 0x80) {
466: if (out > offset) {
467: fBuffer[0] = (byte) b0;
468: fBuffer[1] = (byte) b1;
469: fBuffer[2] = (byte) b2;
470: fOffset = 3;
471: return out - offset;
472: }
473: invalidByte(3, 4, b2);
474: }
475: int b3 = -1;
476: if (++in < total) {
477: b3 = fBuffer[in] & 0x00FF;
478: } else {
479: b3 = fInputStream.read();
480: if (b3 == -1) {
481: if (out > offset) {
482: fBuffer[0] = (byte) b0;
483: fBuffer[1] = (byte) b1;
484: fBuffer[2] = (byte) b2;
485: fOffset = 3;
486: return out - offset;
487: }
488: expectedByte(4, 4);
489: }
490: count++;
491: }
492: if ((b3 & 0xC0) != 0x80) {
493: if (out > offset) {
494: fBuffer[0] = (byte) b0;
495: fBuffer[1] = (byte) b1;
496: fBuffer[2] = (byte) b2;
497: fBuffer[3] = (byte) b3;
498: fOffset = 4;
499: return out - offset;
500: }
501: invalidByte(4, 4, b2);
502: }
503:
504: // decode bytes into surrogate characters
505: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
506: if (uuuuu > 0x10) {
507: invalidSurrogate(uuuuu);
508: }
509: int wwww = uuuuu - 1;
510: int zzzz = b1 & 0x000F;
511: int yyyyyy = b2 & 0x003F;
512: int xxxxxx = b3 & 0x003F;
513: int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2)
514: | (yyyyyy >> 4);
515: int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
516:
517: // set characters
518: ch[out++] = (char) hs;
519: ch[out++] = (char) ls;
520: count -= 2;
521: continue;
522: }
523:
524: // error
525: if (out > offset) {
526: fBuffer[0] = (byte) b0;
527: fOffset = 1;
528: return out - offset;
529: }
530: invalidByte(1, 1, b0);
531: }
532:
533: // return number of characters converted
534: if (DEBUG_READ) {
535: System.out.println("read(char[]," + offset + ',' + length
536: + "): count=" + count);
537: }
538:
539: return count;
540:
541: } // read(char[],int,int)
542:
543: /**
544: * Skip characters. This method will block until some characters are
545: * available, an I/O error occurs, or the end of the stream is reached.
546: *
547: * @param n The number of characters to skip
548: *
549: * @return The number of characters actually skipped
550: *
551: * @exception IOException If an I/O error occurs
552: */
553: public long skip(long n) throws IOException {
554:
555: long remaining = n;
556: final char[] ch = new char[fBuffer.length];
557: do {
558: int length = ch.length < remaining ? ch.length
559: : (int) remaining;
560: int count = read(ch, 0, length);
561: if (count > 0) {
562: remaining -= count;
563: } else {
564: break;
565: }
566: } while (remaining > 0);
567:
568: long skipped = n - remaining;
569: return skipped;
570:
571: } // skip(long):long
572:
573: /**
574: * Tell whether this stream is ready to be read.
575: *
576: * @return True if the next read() is guaranteed not to block for input,
577: * false otherwise. Note that returning false does not guarantee that the
578: * next read will block.
579: *
580: * @exception IOException If an I/O error occurs
581: */
582: public boolean ready() throws IOException {
583: return false;
584: } // ready()
585:
586: /**
587: * Tell whether this stream supports the mark() operation.
588: */
589: public boolean markSupported() {
590: return false;
591: } // markSupported()
592:
593: /**
594: * Mark the present position in the stream. Subsequent calls to reset()
595: * will attempt to reposition the stream to this point. Not all
596: * character-input streams support the mark() operation.
597: *
598: * @param readAheadLimit Limit on the number of characters that may be
599: * read while still preserving the mark. After
600: * reading this many characters, attempting to
601: * reset the stream may fail.
602: *
603: * @exception IOException If the stream does not support mark(),
604: * or if some other I/O error occurs
605: */
606: public void mark(int readAheadLimit) throws IOException {
607: throw new IOException(fFormatter.formatMessage(fLocale,
608: "OperationNotSupported", new Object[] { "mark()",
609: "UTF-8" }));
610: } // mark(int)
611:
612: /**
613: * Reset the stream. If the stream has been marked, then attempt to
614: * reposition it at the mark. If the stream has not been marked, then
615: * attempt to reset it in some way appropriate to the particular stream,
616: * for example by repositioning it to its starting point. Not all
617: * character-input streams support the reset() operation, and some support
618: * reset() without supporting mark().
619: *
620: * @exception IOException If the stream has not been marked,
621: * or if the mark has been invalidated,
622: * or if the stream does not support reset(),
623: * or if some other I/O error occurs
624: */
625: public void reset() throws IOException {
626: fOffset = 0;
627: fSurrogate = -1;
628: } // reset()
629:
630: /**
631: * Close the stream. Once a stream has been closed, further read(),
632: * ready(), mark(), or reset() invocations will throw an IOException.
633: * Closing a previously-closed stream, however, has no effect.
634: *
635: * @exception IOException If an I/O error occurs
636: */
637: public void close() throws IOException {
638: BufferAllocator ba = ThreadLocalBufferAllocator
639: .getBufferAllocator();
640: ba.returnByteBuffer(fBuffer);
641: fBuffer = null;
642: fInputStream.close();
643: } // close()
644:
645: //
646: // Private methods
647: //
648:
649: /** Throws an exception for expected byte. */
650: private void expectedByte(int position, int count)
651: throws UTFDataFormatException {
652:
653: String message = fFormatter.formatMessage(fLocale,
654: "ExpectedByte", new Object[] {
655: Integer.toString(position),
656: Integer.toString(count) });
657: throw new UTFDataFormatException(message);
658:
659: } // expectedByte(int,int,int)
660:
661: /** Throws an exception for invalid byte. */
662: private void invalidByte(int position, int count, int c)
663: throws UTFDataFormatException {
664:
665: String message = fFormatter.formatMessage(fLocale,
666: "InvalidByte", new Object[] {
667: Integer.toString(position),
668: Integer.toString(count) });
669: throw new UTFDataFormatException(message);
670:
671: } // invalidByte(int,int,int,int)
672:
673: /** Throws an exception for invalid surrogate bits. */
674: private void invalidSurrogate(int uuuuu)
675: throws UTFDataFormatException {
676:
677: StringBuffer str = new StringBuffer();
678: str
679: .append("high surrogate bits in UTF-8 sequence must not exceed 0x10 but found 0x");
680:
681: String message = fFormatter.formatMessage(fLocale,
682: "InvalidHighSurrogate", new Object[] { Integer
683: .toHexString(uuuuu) });
684: throw new UTFDataFormatException(message);
685:
686: } // invalidSurrogate(int)
687:
688: } // class UTF8Reader
|