001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.impl.io;
019:
020: import java.io.InputStream;
021: import java.io.IOException;
022: import java.io.Reader;
023:
024: import java.util.Locale;
025: import org.apache.xerces.util.MessageFormatter;
026: import org.apache.xerces.impl.msg.XMLMessageFormatter;
027:
028: /**
029: * <p>A UTF-8 reader.</p>
030: *
031: * @xerces.internal
032: *
033: * @author Andy Clark, IBM
034: *
035: * @version $Id: UTF8Reader.java 554069 2007-07-06 21:56:14Z mrglavas $
036: */
037: public class UTF8Reader extends Reader {
038:
039: //
040: // Constants
041: //
042:
043: /** Default byte buffer size (2048). */
044: public static final int DEFAULT_BUFFER_SIZE = 2048;
045:
046: // debugging
047:
048: /** Debug read. */
049: private static final boolean DEBUG_READ = false;
050:
051: //
052: // Data
053: //
054:
055: /** Input stream. */
056: protected final InputStream fInputStream;
057:
058: /** Byte buffer. */
059: protected final byte[] fBuffer;
060:
061: /** Offset into buffer. */
062: protected int fOffset;
063:
064: /** Surrogate character. */
065: private int fSurrogate = -1;
066:
067: // message formatter; used to produce localized
068: // exception messages
069: private final MessageFormatter fFormatter;
070:
071: //Locale to use for messages
072: private final Locale fLocale;
073:
074: //
075: // Constructors
076: //
077:
078: /**
079: * Constructs a UTF-8 reader from the specified input stream
080: * using the default buffer size. Primarily for testing.
081: *
082: * @param inputStream The input stream.
083: */
084: public UTF8Reader(InputStream inputStream) {
085: this (inputStream, DEFAULT_BUFFER_SIZE,
086: new XMLMessageFormatter(), Locale.getDefault());
087: } // <init>(InputStream, MessageFormatter)
088:
089: /**
090: * Constructs a UTF-8 reader from the specified input stream
091: * using the default buffer size and the given MessageFormatter.
092: *
093: * @param inputStream The input stream.
094: * @param messageFormatter given MessageFormatter
095: * @param locale Locale to use for messages
096: */
097: public UTF8Reader(InputStream inputStream,
098: MessageFormatter messageFormatter, Locale locale) {
099: this (inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
100: } // <init>(InputStream, MessageFormatter, Locale)
101:
102: /**
103: * Constructs a UTF-8 reader from the specified input stream,
104: * buffer size and MessageFormatter.
105: *
106: * @param inputStream The input stream.
107: * @param size The initial buffer size.
108: * @param messageFormatter the formatter for localizing/formatting errors.
109: * @param locale the Locale to use for messages
110: */
111: public UTF8Reader(InputStream inputStream, int size,
112: MessageFormatter messageFormatter, Locale locale) {
113: this (inputStream, new byte[size], messageFormatter, locale);
114: } // <init>(InputStream, int, MessageFormatter, Locale)
115:
116: /**
117: * Constructs a UTF-8 reader from the specified input stream,
118: * buffer and MessageFormatter.
119: *
120: * @param inputStream The input stream.
121: * @param buffer The byte buffer.
122: * @param messageFormatter the formatter for localizing/formatting errors.
123: * @param locale the Locale to use for messages
124: */
125: public UTF8Reader(InputStream inputStream, byte[] buffer,
126: MessageFormatter messageFormatter, Locale locale) {
127: fInputStream = inputStream;
128: fBuffer = buffer;
129: fFormatter = messageFormatter;
130: fLocale = locale;
131: } // <init>(InputStream, byte[], MessageFormatter, Locale)
132:
133: //
134: // Reader methods
135: //
136:
137: /**
138: * Read a single character. This method will block until a character is
139: * available, an I/O error occurs, or the end of the stream is reached.
140: *
141: * <p> Subclasses that intend to support efficient single-character input
142: * should override this method.
143: *
144: * @return The character read, as an integer in the range 0 to 16383
145: * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
146: * been reached
147: *
148: * @exception IOException If an I/O error occurs
149: */
150: public int read() throws IOException {
151:
152: // decode character
153: int c = fSurrogate;
154: if (fSurrogate == -1) {
155: // NOTE: We use the index into the buffer if there are remaining
156: // bytes from the last block read. -Ac
157: int index = 0;
158:
159: // get first byte
160: int b0 = index == fOffset ? fInputStream.read()
161: : fBuffer[index++] & 0x00FF;
162: if (b0 == -1) {
163: return -1;
164: }
165:
166: // UTF-8: [0xxx xxxx]
167: // Unicode: [0000 0000] [0xxx xxxx]
168: if (b0 < 0x80) {
169: c = (char) b0;
170: }
171:
172: // UTF-8: [110y yyyy] [10xx xxxx]
173: // Unicode: [0000 0yyy] [yyxx xxxx]
174: else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
175: int b1 = index == fOffset ? fInputStream.read()
176: : fBuffer[index++] & 0x00FF;
177: if (b1 == -1) {
178: expectedByte(2, 2);
179: }
180: if ((b1 & 0xC0) != 0x80) {
181: invalidByte(2, 2, b1);
182: }
183: c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
184: }
185:
186: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
187: // Unicode: [zzzz yyyy] [yyxx xxxx]
188: else if ((b0 & 0xF0) == 0xE0) {
189: int b1 = index == fOffset ? fInputStream.read()
190: : fBuffer[index++] & 0x00FF;
191: if (b1 == -1) {
192: expectedByte(2, 3);
193: }
194: if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)
195: || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
196: invalidByte(2, 3, b1);
197: }
198: int b2 = index == fOffset ? fInputStream.read()
199: : fBuffer[index++] & 0x00FF;
200: if (b2 == -1) {
201: expectedByte(3, 3);
202: }
203: if ((b2 & 0xC0) != 0x80) {
204: invalidByte(3, 3, b2);
205: }
206: c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
207: | (b2 & 0x003F);
208: }
209:
210: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
211: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
212: // [1101 11yy] [yyxx xxxx] (low surrogate)
213: // * uuuuu = wwww + 1
214: else if ((b0 & 0xF8) == 0xF0) {
215: int b1 = index == fOffset ? fInputStream.read()
216: : fBuffer[index++] & 0x00FF;
217: if (b1 == -1) {
218: expectedByte(2, 4);
219: }
220: if ((b1 & 0xC0) != 0x80
221: || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
222: invalidByte(2, 3, b1);
223: }
224: int b2 = index == fOffset ? fInputStream.read()
225: : fBuffer[index++] & 0x00FF;
226: if (b2 == -1) {
227: expectedByte(3, 4);
228: }
229: if ((b2 & 0xC0) != 0x80) {
230: invalidByte(3, 3, b2);
231: }
232: int b3 = index == fOffset ? fInputStream.read()
233: : fBuffer[index++] & 0x00FF;
234: if (b3 == -1) {
235: expectedByte(4, 4);
236: }
237: if ((b3 & 0xC0) != 0x80) {
238: invalidByte(4, 4, b3);
239: }
240: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
241: if (uuuuu > 0x10) {
242: invalidSurrogate(uuuuu);
243: }
244: int wwww = uuuuu - 1;
245: int hs = 0xD800 | ((wwww << 6) & 0x03C0)
246: | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003);
247: int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
248: c = hs;
249: fSurrogate = ls;
250: }
251:
252: // error
253: else {
254: invalidByte(1, 1, b0);
255: }
256: }
257:
258: // use surrogate
259: else {
260: fSurrogate = -1;
261: }
262:
263: // return character
264: if (DEBUG_READ) {
265: System.out.println("read(): 0x" + Integer.toHexString(c));
266: }
267: return c;
268:
269: } // read():int
270:
271: /**
272: * Read characters into a portion of an array. This method will block
273: * until some input is available, an I/O error occurs, or the end of the
274: * stream is reached.
275: *
276: * @param ch Destination buffer
277: * @param offset Offset at which to start storing characters
278: * @param length Maximum number of characters to read
279: *
280: * @return The number of characters read, or -1 if the end of the
281: * stream has been reached
282: *
283: * @exception IOException If an I/O error occurs
284: */
285: public int read(char ch[], int offset, int length)
286: throws IOException {
287:
288: // read bytes
289: int out = offset;
290: int count = 0;
291: if (fOffset == 0) {
292: // adjust length to read
293: if (length > fBuffer.length) {
294: length = fBuffer.length;
295: }
296:
297: // handle surrogate
298: if (fSurrogate != -1) {
299: ch[out++] = (char) fSurrogate;
300: fSurrogate = -1;
301: length--;
302: }
303:
304: // perform read operation
305: count = fInputStream.read(fBuffer, 0, length);
306: if (count == -1) {
307: return -1;
308: }
309: count += out - offset;
310: }
311:
312: // skip read; last character was in error
313: // NOTE: Having an offset value other than zero means that there was
314: // an error in the last character read. In this case, we have
315: // skipped the read so we don't consume any bytes past the
316: // error. By signalling the error on the next block read we
317: // allow the method to return the most valid characters that
318: // it can on the previous block read. -Ac
319: else {
320: count = fOffset;
321: fOffset = 0;
322: }
323:
324: // convert bytes to characters
325: final int total = count;
326: int in;
327: byte byte1;
328: final byte byte0 = 0;
329: for (in = 0; in < total; in++) {
330: byte1 = fBuffer[in];
331: if (byte1 >= byte0) {
332: ch[out++] = (char) byte1;
333: } else {
334: break;
335: }
336: }
337: for (; in < total; in++) {
338: byte1 = fBuffer[in];
339:
340: // UTF-8: [0xxx xxxx]
341: // Unicode: [0000 0000] [0xxx xxxx]
342: if (byte1 >= byte0) {
343: ch[out++] = (char) byte1;
344: continue;
345: }
346:
347: // UTF-8: [110y yyyy] [10xx xxxx]
348: // Unicode: [0000 0yyy] [yyxx xxxx]
349: int b0 = byte1 & 0x0FF;
350: if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
351: int b1 = -1;
352: if (++in < total) {
353: b1 = fBuffer[in] & 0x00FF;
354: } else {
355: b1 = fInputStream.read();
356: if (b1 == -1) {
357: if (out > offset) {
358: fBuffer[0] = (byte) b0;
359: fOffset = 1;
360: return out - offset;
361: }
362: expectedByte(2, 2);
363: }
364: count++;
365: }
366: if ((b1 & 0xC0) != 0x80) {
367: if (out > offset) {
368: fBuffer[0] = (byte) b0;
369: fBuffer[1] = (byte) b1;
370: fOffset = 2;
371: return out - offset;
372: }
373: invalidByte(2, 2, b1);
374: }
375: int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
376: ch[out++] = (char) c;
377: count -= 1;
378: continue;
379: }
380:
381: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
382: // Unicode: [zzzz yyyy] [yyxx xxxx]
383: if ((b0 & 0xF0) == 0xE0) {
384: int b1 = -1;
385: if (++in < total) {
386: b1 = fBuffer[in] & 0x00FF;
387: } else {
388: b1 = fInputStream.read();
389: if (b1 == -1) {
390: if (out > offset) {
391: fBuffer[0] = (byte) b0;
392: fOffset = 1;
393: return out - offset;
394: }
395: expectedByte(2, 3);
396: }
397: count++;
398: }
399: if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0)
400: || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
401: if (out > offset) {
402: fBuffer[0] = (byte) b0;
403: fBuffer[1] = (byte) b1;
404: fOffset = 2;
405: return out - offset;
406: }
407: invalidByte(2, 3, b1);
408: }
409: int b2 = -1;
410: if (++in < total) {
411: b2 = fBuffer[in] & 0x00FF;
412: } else {
413: b2 = fInputStream.read();
414: if (b2 == -1) {
415: if (out > offset) {
416: fBuffer[0] = (byte) b0;
417: fBuffer[1] = (byte) b1;
418: fOffset = 2;
419: return out - offset;
420: }
421: expectedByte(3, 3);
422: }
423: count++;
424: }
425: if ((b2 & 0xC0) != 0x80) {
426: if (out > offset) {
427: fBuffer[0] = (byte) b0;
428: fBuffer[1] = (byte) b1;
429: fBuffer[2] = (byte) b2;
430: fOffset = 3;
431: return out - offset;
432: }
433: invalidByte(3, 3, b2);
434: }
435: int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
436: | (b2 & 0x003F);
437: ch[out++] = (char) c;
438: count -= 2;
439: continue;
440: }
441:
442: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
443: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
444: // [1101 11yy] [yyxx xxxx] (low surrogate)
445: // * uuuuu = wwww + 1
446: if ((b0 & 0xF8) == 0xF0) {
447: int b1 = -1;
448: if (++in < total) {
449: b1 = fBuffer[in] & 0x00FF;
450: } else {
451: b1 = fInputStream.read();
452: if (b1 == -1) {
453: if (out > offset) {
454: fBuffer[0] = (byte) b0;
455: fOffset = 1;
456: return out - offset;
457: }
458: expectedByte(2, 4);
459: }
460: count++;
461: }
462: if ((b1 & 0xC0) != 0x80
463: || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
464: if (out > offset) {
465: fBuffer[0] = (byte) b0;
466: fBuffer[1] = (byte) b1;
467: fOffset = 2;
468: return out - offset;
469: }
470: invalidByte(2, 4, b1);
471: }
472: int b2 = -1;
473: if (++in < total) {
474: b2 = fBuffer[in] & 0x00FF;
475: } else {
476: b2 = fInputStream.read();
477: if (b2 == -1) {
478: if (out > offset) {
479: fBuffer[0] = (byte) b0;
480: fBuffer[1] = (byte) b1;
481: fOffset = 2;
482: return out - offset;
483: }
484: expectedByte(3, 4);
485: }
486: count++;
487: }
488: if ((b2 & 0xC0) != 0x80) {
489: if (out > offset) {
490: fBuffer[0] = (byte) b0;
491: fBuffer[1] = (byte) b1;
492: fBuffer[2] = (byte) b2;
493: fOffset = 3;
494: return out - offset;
495: }
496: invalidByte(3, 4, b2);
497: }
498: int b3 = -1;
499: if (++in < total) {
500: b3 = fBuffer[in] & 0x00FF;
501: } else {
502: b3 = fInputStream.read();
503: if (b3 == -1) {
504: if (out > offset) {
505: fBuffer[0] = (byte) b0;
506: fBuffer[1] = (byte) b1;
507: fBuffer[2] = (byte) b2;
508: fOffset = 3;
509: return out - offset;
510: }
511: expectedByte(4, 4);
512: }
513: count++;
514: }
515: if ((b3 & 0xC0) != 0x80) {
516: if (out > offset) {
517: fBuffer[0] = (byte) b0;
518: fBuffer[1] = (byte) b1;
519: fBuffer[2] = (byte) b2;
520: fBuffer[3] = (byte) b3;
521: fOffset = 4;
522: return out - offset;
523: }
524: invalidByte(4, 4, b2);
525: }
526:
527: // decode bytes into surrogate characters
528: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
529: if (uuuuu > 0x10) {
530: invalidSurrogate(uuuuu);
531: }
532: int wwww = uuuuu - 1;
533: int zzzz = b1 & 0x000F;
534: int yyyyyy = b2 & 0x003F;
535: int xxxxxx = b3 & 0x003F;
536: int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2)
537: | (yyyyyy >> 4);
538: int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
539:
540: // set characters
541: ch[out++] = (char) hs;
542: if ((count -= 2) <= length) {
543: ch[out++] = (char) ls;
544: }
545: // reached the end of the char buffer; save low surrogate for the next read
546: else {
547: fSurrogate = ls;
548: --count;
549: }
550: continue;
551: }
552:
553: // error
554: if (out > offset) {
555: fBuffer[0] = (byte) b0;
556: fOffset = 1;
557: return out - offset;
558: }
559: invalidByte(1, 1, b0);
560: }
561:
562: // return number of characters converted
563: if (DEBUG_READ) {
564: System.out.println("read(char[]," + offset + ',' + length
565: + "): count=" + count);
566: }
567: return count;
568:
569: } // read(char[],int,int)
570:
571: /**
572: * Skip characters. This method will block until some characters are
573: * available, an I/O error occurs, or the end of the stream is reached.
574: *
575: * @param n The number of characters to skip
576: *
577: * @return The number of characters actually skipped
578: *
579: * @exception IOException If an I/O error occurs
580: */
581: public long skip(long n) throws IOException {
582:
583: long remaining = n;
584: final char[] ch = new char[fBuffer.length];
585: do {
586: int length = ch.length < remaining ? ch.length
587: : (int) remaining;
588: int count = read(ch, 0, length);
589: if (count > 0) {
590: remaining -= count;
591: } else {
592: break;
593: }
594: } while (remaining > 0);
595:
596: long skipped = n - remaining;
597: return skipped;
598:
599: } // skip(long):long
600:
601: /**
602: * Tell whether this stream is ready to be read.
603: *
604: * @return True if the next read() is guaranteed not to block for input,
605: * false otherwise. Note that returning false does not guarantee that the
606: * next read will block.
607: *
608: * @exception IOException If an I/O error occurs
609: */
610: public boolean ready() throws IOException {
611: return false;
612: } // ready()
613:
614: /**
615: * Tell whether this stream supports the mark() operation.
616: */
617: public boolean markSupported() {
618: return false;
619: } // markSupported()
620:
621: /**
622: * Mark the present position in the stream. Subsequent calls to reset()
623: * will attempt to reposition the stream to this point. Not all
624: * character-input streams support the mark() operation.
625: *
626: * @param readAheadLimit Limit on the number of characters that may be
627: * read while still preserving the mark. After
628: * reading this many characters, attempting to
629: * reset the stream may fail.
630: *
631: * @exception IOException If the stream does not support mark(),
632: * or if some other I/O error occurs
633: */
634: public void mark(int readAheadLimit) throws IOException {
635: throw new IOException(fFormatter.formatMessage(fLocale,
636: "OperationNotSupported", new Object[] { "mark()",
637: "UTF-8" }));
638: } // mark(int)
639:
640: /**
641: * Reset the stream. If the stream has been marked, then attempt to
642: * reposition it at the mark. If the stream has not been marked, then
643: * attempt to reset it in some way appropriate to the particular stream,
644: * for example by repositioning it to its starting point. Not all
645: * character-input streams support the reset() operation, and some support
646: * reset() without supporting mark().
647: *
648: * @exception IOException If the stream has not been marked,
649: * or if the mark has been invalidated,
650: * or if the stream does not support reset(),
651: * or if some other I/O error occurs
652: */
653: public void reset() throws IOException {
654: fOffset = 0;
655: fSurrogate = -1;
656: } // reset()
657:
658: /**
659: * Close the stream. Once a stream has been closed, further read(),
660: * ready(), mark(), or reset() invocations will throw an IOException.
661: * Closing a previously-closed stream, however, has no effect.
662: *
663: * @exception IOException If an I/O error occurs
664: */
665: public void close() throws IOException {
666: fInputStream.close();
667: } // close()
668:
669: //
670: // Private methods
671: //
672:
673: /** Throws an exception for expected byte. */
674: private void expectedByte(int position, int count)
675: throws MalformedByteSequenceException {
676:
677: throw new MalformedByteSequenceException(fFormatter, fLocale,
678: XMLMessageFormatter.XML_DOMAIN, "ExpectedByte",
679: new Object[] { Integer.toString(position),
680: Integer.toString(count) });
681:
682: } // expectedByte(int,int)
683:
684: /** Throws an exception for invalid byte. */
685: private void invalidByte(int position, int count, int c)
686: throws MalformedByteSequenceException {
687:
688: throw new MalformedByteSequenceException(fFormatter, fLocale,
689: XMLMessageFormatter.XML_DOMAIN, "InvalidByte",
690: new Object[] { Integer.toString(position),
691: Integer.toString(count) });
692:
693: } // invalidByte(int,int,int)
694:
695: /** Throws an exception for invalid surrogate bits. */
696: private void invalidSurrogate(int uuuuu)
697: throws MalformedByteSequenceException {
698:
699: throw new MalformedByteSequenceException(fFormatter, fLocale,
700: XMLMessageFormatter.XML_DOMAIN, "InvalidHighSurrogate",
701: new Object[] { Integer.toHexString(uuuuu) });
702:
703: } // invalidSurrogate(int)
704:
705: } // class UTF8Reader
|