001: /*
002: * Copyright 1999,2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.jasper.xmlparser;
018:
019: import java.io.InputStream;
020: import java.io.IOException;
021: import java.io.Reader;
022: import java.io.UTFDataFormatException;
023: import org.apache.jasper.compiler.Localizer;
024:
025: /**
026: * @author Andy Clark, IBM
027: *
028: * @version $Id: UTF8Reader.java,v 1.3 2004/03/17 19:23:05 luehe Exp $
029: */
030: public class UTF8Reader extends Reader {
031:
032: //
033: // Constants
034: //
035:
036: /** Default byte buffer size (2048). */
037: public static final int DEFAULT_BUFFER_SIZE = 2048;
038:
039: // debugging
040:
041: /** Debug read. */
042: private static final boolean DEBUG_READ = false;
043:
044: //
045: // Data
046: //
047:
048: /** Input stream. */
049: protected InputStream fInputStream;
050:
051: /** Byte buffer. */
052: protected byte[] fBuffer;
053:
054: /** Offset into buffer. */
055: protected int fOffset;
056:
057: /** Surrogate character. */
058: private int fSurrogate = -1;
059:
060: //
061: // Constructors
062: //
063:
064: /**
065: * Constructs a UTF-8 reader from the specified input stream,
066: * buffer size and MessageFormatter.
067: *
068: * @param inputStream The input stream.
069: * @param size The initial buffer size.
070: */
071: public UTF8Reader(InputStream inputStream, int size) {
072: fInputStream = inputStream;
073: fBuffer = new byte[size];
074: }
075:
076: //
077: // Reader methods
078: //
079:
080: /**
081: * Read a single character. This method will block until a character is
082: * available, an I/O error occurs, or the end of the stream is reached.
083: *
084: * <p> Subclasses that intend to support efficient single-character input
085: * should override this method.
086: *
087: * @return The character read, as an integer in the range 0 to 16383
088: * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
089: * been reached
090: *
091: * @exception IOException If an I/O error occurs
092: */
093: public int read() throws IOException {
094:
095: // decode character
096: int c = fSurrogate;
097: if (fSurrogate == -1) {
098: // NOTE: We use the index into the buffer if there are remaining
099: // bytes from the last block read. -Ac
100: int index = 0;
101:
102: // get first byte
103: int b0 = index == fOffset ? fInputStream.read()
104: : fBuffer[index++] & 0x00FF;
105: if (b0 == -1) {
106: return -1;
107: }
108:
109: // UTF-8: [0xxx xxxx]
110: // Unicode: [0000 0000] [0xxx xxxx]
111: if (b0 < 0x80) {
112: c = (char) b0;
113: }
114:
115: // UTF-8: [110y yyyy] [10xx xxxx]
116: // Unicode: [0000 0yyy] [yyxx xxxx]
117: else if ((b0 & 0xE0) == 0xC0) {
118: int b1 = index == fOffset ? fInputStream.read()
119: : fBuffer[index++] & 0x00FF;
120: if (b1 == -1) {
121: expectedByte(2, 2);
122: }
123: if ((b1 & 0xC0) != 0x80) {
124: invalidByte(2, 2, b1);
125: }
126: c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
127: }
128:
129: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
130: // Unicode: [zzzz yyyy] [yyxx xxxx]
131: else if ((b0 & 0xF0) == 0xE0) {
132: int b1 = index == fOffset ? fInputStream.read()
133: : fBuffer[index++] & 0x00FF;
134: if (b1 == -1) {
135: expectedByte(2, 3);
136: }
137: if ((b1 & 0xC0) != 0x80) {
138: invalidByte(2, 3, b1);
139: }
140: int b2 = index == fOffset ? fInputStream.read()
141: : fBuffer[index++] & 0x00FF;
142: if (b2 == -1) {
143: expectedByte(3, 3);
144: }
145: if ((b2 & 0xC0) != 0x80) {
146: invalidByte(3, 3, b2);
147: }
148: c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
149: | (b2 & 0x003F);
150: }
151:
152: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
153: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
154: // [1101 11yy] [yyxx xxxx] (low surrogate)
155: // * uuuuu = wwww + 1
156: else if ((b0 & 0xF8) == 0xF0) {
157: int b1 = index == fOffset ? fInputStream.read()
158: : fBuffer[index++] & 0x00FF;
159: if (b1 == -1) {
160: expectedByte(2, 4);
161: }
162: if ((b1 & 0xC0) != 0x80) {
163: invalidByte(2, 3, b1);
164: }
165: int b2 = index == fOffset ? fInputStream.read()
166: : fBuffer[index++] & 0x00FF;
167: if (b2 == -1) {
168: expectedByte(3, 4);
169: }
170: if ((b2 & 0xC0) != 0x80) {
171: invalidByte(3, 3, b2);
172: }
173: int b3 = index == fOffset ? fInputStream.read()
174: : fBuffer[index++] & 0x00FF;
175: if (b3 == -1) {
176: expectedByte(4, 4);
177: }
178: if ((b3 & 0xC0) != 0x80) {
179: invalidByte(4, 4, b3);
180: }
181: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
182: if (uuuuu > 0x10) {
183: invalidSurrogate(uuuuu);
184: }
185: int wwww = uuuuu - 1;
186: int hs = 0xD800 | ((wwww << 6) & 0x03C0)
187: | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003);
188: int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
189: c = hs;
190: fSurrogate = ls;
191: }
192:
193: // error
194: else {
195: invalidByte(1, 1, b0);
196: }
197: }
198:
199: // use surrogate
200: else {
201: fSurrogate = -1;
202: }
203:
204: // return character
205: if (DEBUG_READ) {
206: System.out.println("read(): 0x" + Integer.toHexString(c));
207: }
208: return c;
209:
210: } // read():int
211:
212: /**
213: * Read characters into a portion of an array. This method will block
214: * until some input is available, an I/O error occurs, or the end of the
215: * stream is reached.
216: *
217: * @param ch Destination buffer
218: * @param offset Offset at which to start storing characters
219: * @param length Maximum number of characters to read
220: *
221: * @return The number of characters read, or -1 if the end of the
222: * stream has been reached
223: *
224: * @exception IOException If an I/O error occurs
225: */
226: public int read(char ch[], int offset, int length)
227: throws IOException {
228:
229: // handle surrogate
230: int out = offset;
231: if (fSurrogate != -1) {
232: ch[offset + 1] = (char) fSurrogate;
233: fSurrogate = -1;
234: length--;
235: out++;
236: }
237:
238: // read bytes
239: int count = 0;
240: if (fOffset == 0) {
241: // adjust length to read
242: if (length > fBuffer.length) {
243: length = fBuffer.length;
244: }
245:
246: // perform read operation
247: count = fInputStream.read(fBuffer, 0, length);
248: if (count == -1) {
249: return -1;
250: }
251: count += out - offset;
252: }
253:
254: // skip read; last character was in error
255: // NOTE: Having an offset value other than zero means that there was
256: // an error in the last character read. In this case, we have
257: // skipped the read so we don't consume any bytes past the
258: // error. By signalling the error on the next block read we
259: // allow the method to return the most valid characters that
260: // it can on the previous block read. -Ac
261: else {
262: count = fOffset;
263: fOffset = 0;
264: }
265:
266: // convert bytes to characters
267: final int total = count;
268: for (int in = 0; in < total; in++) {
269: int b0 = fBuffer[in] & 0x00FF;
270:
271: // UTF-8: [0xxx xxxx]
272: // Unicode: [0000 0000] [0xxx xxxx]
273: if (b0 < 0x80) {
274: ch[out++] = (char) b0;
275: continue;
276: }
277:
278: // UTF-8: [110y yyyy] [10xx xxxx]
279: // Unicode: [0000 0yyy] [yyxx xxxx]
280: if ((b0 & 0xE0) == 0xC0) {
281: int b1 = -1;
282: if (++in < total) {
283: b1 = fBuffer[in] & 0x00FF;
284: } else {
285: b1 = fInputStream.read();
286: if (b1 == -1) {
287: if (out > offset) {
288: fBuffer[0] = (byte) b0;
289: fOffset = 1;
290: return out - offset;
291: }
292: expectedByte(2, 2);
293: }
294: count++;
295: }
296: if ((b1 & 0xC0) != 0x80) {
297: if (out > offset) {
298: fBuffer[0] = (byte) b0;
299: fBuffer[1] = (byte) b1;
300: fOffset = 2;
301: return out - offset;
302: }
303: invalidByte(2, 2, b1);
304: }
305: int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
306: ch[out++] = (char) c;
307: count -= 1;
308: continue;
309: }
310:
311: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
312: // Unicode: [zzzz yyyy] [yyxx xxxx]
313: if ((b0 & 0xF0) == 0xE0) {
314: int b1 = -1;
315: if (++in < total) {
316: b1 = fBuffer[in] & 0x00FF;
317: } else {
318: b1 = fInputStream.read();
319: if (b1 == -1) {
320: if (out > offset) {
321: fBuffer[0] = (byte) b0;
322: fOffset = 1;
323: return out - offset;
324: }
325: expectedByte(2, 3);
326: }
327: count++;
328: }
329: if ((b1 & 0xC0) != 0x80) {
330: if (out > offset) {
331: fBuffer[0] = (byte) b0;
332: fBuffer[1] = (byte) b1;
333: fOffset = 2;
334: return out - offset;
335: }
336: invalidByte(2, 3, b1);
337: }
338: int b2 = -1;
339: if (++in < total) {
340: b2 = fBuffer[in] & 0x00FF;
341: } else {
342: b2 = fInputStream.read();
343: if (b2 == -1) {
344: if (out > offset) {
345: fBuffer[0] = (byte) b0;
346: fBuffer[1] = (byte) b1;
347: fOffset = 2;
348: return out - offset;
349: }
350: expectedByte(3, 3);
351: }
352: count++;
353: }
354: if ((b2 & 0xC0) != 0x80) {
355: if (out > offset) {
356: fBuffer[0] = (byte) b0;
357: fBuffer[1] = (byte) b1;
358: fBuffer[2] = (byte) b2;
359: fOffset = 3;
360: return out - offset;
361: }
362: invalidByte(3, 3, b2);
363: }
364: int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
365: | (b2 & 0x003F);
366: ch[out++] = (char) c;
367: count -= 2;
368: continue;
369: }
370:
371: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
372: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
373: // [1101 11yy] [yyxx xxxx] (low surrogate)
374: // * uuuuu = wwww + 1
375: if ((b0 & 0xF8) == 0xF0) {
376: int b1 = -1;
377: if (++in < total) {
378: b1 = fBuffer[in] & 0x00FF;
379: } else {
380: b1 = fInputStream.read();
381: if (b1 == -1) {
382: if (out > offset) {
383: fBuffer[0] = (byte) b0;
384: fOffset = 1;
385: return out - offset;
386: }
387: expectedByte(2, 4);
388: }
389: count++;
390: }
391: if ((b1 & 0xC0) != 0x80) {
392: if (out > offset) {
393: fBuffer[0] = (byte) b0;
394: fBuffer[1] = (byte) b1;
395: fOffset = 2;
396: return out - offset;
397: }
398: invalidByte(2, 4, b1);
399: }
400: int b2 = -1;
401: if (++in < total) {
402: b2 = fBuffer[in] & 0x00FF;
403: } else {
404: b2 = fInputStream.read();
405: if (b2 == -1) {
406: if (out > offset) {
407: fBuffer[0] = (byte) b0;
408: fBuffer[1] = (byte) b1;
409: fOffset = 2;
410: return out - offset;
411: }
412: expectedByte(3, 4);
413: }
414: count++;
415: }
416: if ((b2 & 0xC0) != 0x80) {
417: if (out > offset) {
418: fBuffer[0] = (byte) b0;
419: fBuffer[1] = (byte) b1;
420: fBuffer[2] = (byte) b2;
421: fOffset = 3;
422: return out - offset;
423: }
424: invalidByte(3, 4, b2);
425: }
426: int b3 = -1;
427: if (++in < total) {
428: b3 = fBuffer[in] & 0x00FF;
429: } else {
430: b3 = fInputStream.read();
431: if (b3 == -1) {
432: if (out > offset) {
433: fBuffer[0] = (byte) b0;
434: fBuffer[1] = (byte) b1;
435: fBuffer[2] = (byte) b2;
436: fOffset = 3;
437: return out - offset;
438: }
439: expectedByte(4, 4);
440: }
441: count++;
442: }
443: if ((b3 & 0xC0) != 0x80) {
444: if (out > offset) {
445: fBuffer[0] = (byte) b0;
446: fBuffer[1] = (byte) b1;
447: fBuffer[2] = (byte) b2;
448: fBuffer[3] = (byte) b3;
449: fOffset = 4;
450: return out - offset;
451: }
452: invalidByte(4, 4, b2);
453: }
454:
455: // decode bytes into surrogate characters
456: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
457: if (uuuuu > 0x10) {
458: invalidSurrogate(uuuuu);
459: }
460: int wwww = uuuuu - 1;
461: int zzzz = b1 & 0x000F;
462: int yyyyyy = b2 & 0x003F;
463: int xxxxxx = b3 & 0x003F;
464: int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2)
465: | (yyyyyy >> 4);
466: int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
467:
468: // set characters
469: ch[out++] = (char) hs;
470: ch[out++] = (char) ls;
471: count -= 2;
472: continue;
473: }
474:
475: // error
476: if (out > offset) {
477: fBuffer[0] = (byte) b0;
478: fOffset = 1;
479: return out - offset;
480: }
481: invalidByte(1, 1, b0);
482: }
483:
484: // return number of characters converted
485: if (DEBUG_READ) {
486: System.out.println("read(char[]," + offset + ',' + length
487: + "): count=" + count);
488: }
489: return count;
490:
491: } // read(char[],int,int)
492:
493: /**
494: * Skip characters. This method will block until some characters are
495: * available, an I/O error occurs, or the end of the stream is reached.
496: *
497: * @param n The number of characters to skip
498: *
499: * @return The number of characters actually skipped
500: *
501: * @exception IOException If an I/O error occurs
502: */
503: public long skip(long n) throws IOException {
504:
505: long remaining = n;
506: final char[] ch = new char[fBuffer.length];
507: do {
508: int length = ch.length < remaining ? ch.length
509: : (int) remaining;
510: int count = read(ch, 0, length);
511: if (count > 0) {
512: remaining -= count;
513: } else {
514: break;
515: }
516: } while (remaining > 0);
517:
518: long skipped = n - remaining;
519: return skipped;
520:
521: } // skip(long):long
522:
523: /**
524: * Tell whether this stream is ready to be read.
525: *
526: * @return True if the next read() is guaranteed not to block for input,
527: * false otherwise. Note that returning false does not guarantee that the
528: * next read will block.
529: *
530: * @exception IOException If an I/O error occurs
531: */
532: public boolean ready() throws IOException {
533: return false;
534: } // ready()
535:
536: /**
537: * Tell whether this stream supports the mark() operation.
538: */
539: public boolean markSupported() {
540: return false;
541: } // markSupported()
542:
543: /**
544: * Mark the present position in the stream. Subsequent calls to reset()
545: * will attempt to reposition the stream to this point. Not all
546: * character-input streams support the mark() operation.
547: *
548: * @param readAheadLimit Limit on the number of characters that may be
549: * read while still preserving the mark. After
550: * reading this many characters, attempting to
551: * reset the stream may fail.
552: *
553: * @exception IOException If the stream does not support mark(),
554: * or if some other I/O error occurs
555: */
556: public void mark(int readAheadLimit) throws IOException {
557: throw new IOException(Localizer.getMessage(
558: "jsp.error.xml.operationNotSupported", "mark()",
559: "UTF-8"));
560: }
561:
562: /**
563: * Reset the stream. If the stream has been marked, then attempt to
564: * reposition it at the mark. If the stream has not been marked, then
565: * attempt to reset it in some way appropriate to the particular stream,
566: * for example by repositioning it to its starting point. Not all
567: * character-input streams support the reset() operation, and some support
568: * reset() without supporting mark().
569: *
570: * @exception IOException If the stream has not been marked,
571: * or if the mark has been invalidated,
572: * or if the stream does not support reset(),
573: * or if some other I/O error occurs
574: */
575: public void reset() throws IOException {
576: fOffset = 0;
577: fSurrogate = -1;
578: } // reset()
579:
580: /**
581: * Close the stream. Once a stream has been closed, further read(),
582: * ready(), mark(), or reset() invocations will throw an IOException.
583: * Closing a previously-closed stream, however, has no effect.
584: *
585: * @exception IOException If an I/O error occurs
586: */
587: public void close() throws IOException {
588: fInputStream.close();
589: } // close()
590:
591: //
592: // Private methods
593: //
594:
595: /** Throws an exception for expected byte. */
596: private void expectedByte(int position, int count)
597: throws UTFDataFormatException {
598:
599: throw new UTFDataFormatException(Localizer.getMessage(
600: "jsp.error.xml.expectedByte", Integer
601: .toString(position), Integer.toString(count)));
602:
603: } // expectedByte(int,int,int)
604:
605: /** Throws an exception for invalid byte. */
606: private void invalidByte(int position, int count, int c)
607: throws UTFDataFormatException {
608:
609: throw new UTFDataFormatException(Localizer.getMessage(
610: "jsp.error.xml.invalidByte",
611: Integer.toString(position), Integer.toString(count)));
612: } // invalidByte(int,int,int,int)
613:
614: /** Throws an exception for invalid surrogate bits. */
615: private void invalidSurrogate(int uuuuu)
616: throws UTFDataFormatException {
617:
618: throw new UTFDataFormatException(Localizer.getMessage(
619: "jsp.error.xml.invalidHighSurrogate", Integer
620: .toHexString(uuuuu)));
621: } // invalidSurrogate(int)
622:
623: } // class UTF8Reader
|