001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.jasper.xmlparser;
019:
020: import java.io.InputStream;
021: import java.io.IOException;
022: import java.io.Reader;
023: import java.io.UTFDataFormatException;
024: import org.apache.jasper.compiler.Localizer;
025:
026: /**
027: * @author Andy Clark, IBM
028: *
029: * @version $Id: UTF8Reader.java 467222 2006-10-24 03:17:11Z markt $
030: */
031: public class UTF8Reader extends Reader {
032:
033: private org.apache.juli.logging.Log log = org.apache.juli.logging.LogFactory
034: .getLog(UTF8Reader.class);
035:
036: //
037: // Constants
038: //
039:
040: /** Default byte buffer size (2048). */
041: public static final int DEFAULT_BUFFER_SIZE = 2048;
042:
043: // debugging
044:
045: /** Debug read. */
046: private static final boolean DEBUG_READ = false;
047:
048: //
049: // Data
050: //
051:
052: /** Input stream. */
053: protected InputStream fInputStream;
054:
055: /** Byte buffer. */
056: protected byte[] fBuffer;
057:
058: /** Offset into buffer. */
059: protected int fOffset;
060:
061: /** Surrogate character. */
062: private int fSurrogate = -1;
063:
064: //
065: // Constructors
066: //
067:
068: /**
069: * Constructs a UTF-8 reader from the specified input stream,
070: * buffer size and MessageFormatter.
071: *
072: * @param inputStream The input stream.
073: * @param size The initial buffer size.
074: */
075: public UTF8Reader(InputStream inputStream, int size) {
076: fInputStream = inputStream;
077: fBuffer = new byte[size];
078: }
079:
080: //
081: // Reader methods
082: //
083:
084: /**
085: * Read a single character. This method will block until a character is
086: * available, an I/O error occurs, or the end of the stream is reached.
087: *
088: * <p> Subclasses that intend to support efficient single-character input
089: * should override this method.
090: *
091: * @return The character read, as an integer in the range 0 to 16383
092: * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
093: * been reached
094: *
095: * @exception IOException If an I/O error occurs
096: */
097: public int read() throws IOException {
098:
099: // decode character
100: int c = fSurrogate;
101: if (fSurrogate == -1) {
102: // NOTE: We use the index into the buffer if there are remaining
103: // bytes from the last block read. -Ac
104: int index = 0;
105:
106: // get first byte
107: int b0 = index == fOffset ? fInputStream.read()
108: : fBuffer[index++] & 0x00FF;
109: if (b0 == -1) {
110: return -1;
111: }
112:
113: // UTF-8: [0xxx xxxx]
114: // Unicode: [0000 0000] [0xxx xxxx]
115: if (b0 < 0x80) {
116: c = (char) b0;
117: }
118:
119: // UTF-8: [110y yyyy] [10xx xxxx]
120: // Unicode: [0000 0yyy] [yyxx xxxx]
121: else if ((b0 & 0xE0) == 0xC0) {
122: int b1 = index == fOffset ? fInputStream.read()
123: : fBuffer[index++] & 0x00FF;
124: if (b1 == -1) {
125: expectedByte(2, 2);
126: }
127: if ((b1 & 0xC0) != 0x80) {
128: invalidByte(2, 2, b1);
129: }
130: c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
131: }
132:
133: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
134: // Unicode: [zzzz yyyy] [yyxx xxxx]
135: else if ((b0 & 0xF0) == 0xE0) {
136: int b1 = index == fOffset ? fInputStream.read()
137: : fBuffer[index++] & 0x00FF;
138: if (b1 == -1) {
139: expectedByte(2, 3);
140: }
141: if ((b1 & 0xC0) != 0x80) {
142: invalidByte(2, 3, b1);
143: }
144: int b2 = index == fOffset ? fInputStream.read()
145: : fBuffer[index++] & 0x00FF;
146: if (b2 == -1) {
147: expectedByte(3, 3);
148: }
149: if ((b2 & 0xC0) != 0x80) {
150: invalidByte(3, 3, b2);
151: }
152: c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
153: | (b2 & 0x003F);
154: }
155:
156: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
157: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
158: // [1101 11yy] [yyxx xxxx] (low surrogate)
159: // * uuuuu = wwww + 1
160: else if ((b0 & 0xF8) == 0xF0) {
161: int b1 = index == fOffset ? fInputStream.read()
162: : fBuffer[index++] & 0x00FF;
163: if (b1 == -1) {
164: expectedByte(2, 4);
165: }
166: if ((b1 & 0xC0) != 0x80) {
167: invalidByte(2, 3, b1);
168: }
169: int b2 = index == fOffset ? fInputStream.read()
170: : fBuffer[index++] & 0x00FF;
171: if (b2 == -1) {
172: expectedByte(3, 4);
173: }
174: if ((b2 & 0xC0) != 0x80) {
175: invalidByte(3, 3, b2);
176: }
177: int b3 = index == fOffset ? fInputStream.read()
178: : fBuffer[index++] & 0x00FF;
179: if (b3 == -1) {
180: expectedByte(4, 4);
181: }
182: if ((b3 & 0xC0) != 0x80) {
183: invalidByte(4, 4, b3);
184: }
185: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
186: if (uuuuu > 0x10) {
187: invalidSurrogate(uuuuu);
188: }
189: int wwww = uuuuu - 1;
190: int hs = 0xD800 | ((wwww << 6) & 0x03C0)
191: | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003);
192: int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
193: c = hs;
194: fSurrogate = ls;
195: }
196:
197: // error
198: else {
199: invalidByte(1, 1, b0);
200: }
201: }
202:
203: // use surrogate
204: else {
205: fSurrogate = -1;
206: }
207:
208: // return character
209: if (DEBUG_READ) {
210: if (log.isDebugEnabled())
211: log.debug("read(): 0x" + Integer.toHexString(c));
212: }
213: return c;
214:
215: } // read():int
216:
217: /**
218: * Read characters into a portion of an array. This method will block
219: * until some input is available, an I/O error occurs, or the end of the
220: * stream is reached.
221: *
222: * @param ch Destination buffer
223: * @param offset Offset at which to start storing characters
224: * @param length Maximum number of characters to read
225: *
226: * @return The number of characters read, or -1 if the end of the
227: * stream has been reached
228: *
229: * @exception IOException If an I/O error occurs
230: */
231: public int read(char ch[], int offset, int length)
232: throws IOException {
233:
234: // handle surrogate
235: int out = offset;
236: if (fSurrogate != -1) {
237: ch[offset + 1] = (char) fSurrogate;
238: fSurrogate = -1;
239: length--;
240: out++;
241: }
242:
243: // read bytes
244: int count = 0;
245: if (fOffset == 0) {
246: // adjust length to read
247: if (length > fBuffer.length) {
248: length = fBuffer.length;
249: }
250:
251: // perform read operation
252: count = fInputStream.read(fBuffer, 0, length);
253: if (count == -1) {
254: return -1;
255: }
256: count += out - offset;
257: }
258:
259: // skip read; last character was in error
260: // NOTE: Having an offset value other than zero means that there was
261: // an error in the last character read. In this case, we have
262: // skipped the read so we don't consume any bytes past the
263: // error. By signalling the error on the next block read we
264: // allow the method to return the most valid characters that
265: // it can on the previous block read. -Ac
266: else {
267: count = fOffset;
268: fOffset = 0;
269: }
270:
271: // convert bytes to characters
272: final int total = count;
273: for (int in = 0; in < total; in++) {
274: int b0 = fBuffer[in] & 0x00FF;
275:
276: // UTF-8: [0xxx xxxx]
277: // Unicode: [0000 0000] [0xxx xxxx]
278: if (b0 < 0x80) {
279: ch[out++] = (char) b0;
280: continue;
281: }
282:
283: // UTF-8: [110y yyyy] [10xx xxxx]
284: // Unicode: [0000 0yyy] [yyxx xxxx]
285: if ((b0 & 0xE0) == 0xC0) {
286: int b1 = -1;
287: if (++in < total) {
288: b1 = fBuffer[in] & 0x00FF;
289: } else {
290: b1 = fInputStream.read();
291: if (b1 == -1) {
292: if (out > offset) {
293: fBuffer[0] = (byte) b0;
294: fOffset = 1;
295: return out - offset;
296: }
297: expectedByte(2, 2);
298: }
299: count++;
300: }
301: if ((b1 & 0xC0) != 0x80) {
302: if (out > offset) {
303: fBuffer[0] = (byte) b0;
304: fBuffer[1] = (byte) b1;
305: fOffset = 2;
306: return out - offset;
307: }
308: invalidByte(2, 2, b1);
309: }
310: int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
311: ch[out++] = (char) c;
312: count -= 1;
313: continue;
314: }
315:
316: // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
317: // Unicode: [zzzz yyyy] [yyxx xxxx]
318: if ((b0 & 0xF0) == 0xE0) {
319: int b1 = -1;
320: if (++in < total) {
321: b1 = fBuffer[in] & 0x00FF;
322: } else {
323: b1 = fInputStream.read();
324: if (b1 == -1) {
325: if (out > offset) {
326: fBuffer[0] = (byte) b0;
327: fOffset = 1;
328: return out - offset;
329: }
330: expectedByte(2, 3);
331: }
332: count++;
333: }
334: if ((b1 & 0xC0) != 0x80) {
335: if (out > offset) {
336: fBuffer[0] = (byte) b0;
337: fBuffer[1] = (byte) b1;
338: fOffset = 2;
339: return out - offset;
340: }
341: invalidByte(2, 3, b1);
342: }
343: int b2 = -1;
344: if (++in < total) {
345: b2 = fBuffer[in] & 0x00FF;
346: } else {
347: b2 = fInputStream.read();
348: if (b2 == -1) {
349: if (out > offset) {
350: fBuffer[0] = (byte) b0;
351: fBuffer[1] = (byte) b1;
352: fOffset = 2;
353: return out - offset;
354: }
355: expectedByte(3, 3);
356: }
357: count++;
358: }
359: if ((b2 & 0xC0) != 0x80) {
360: if (out > offset) {
361: fBuffer[0] = (byte) b0;
362: fBuffer[1] = (byte) b1;
363: fBuffer[2] = (byte) b2;
364: fOffset = 3;
365: return out - offset;
366: }
367: invalidByte(3, 3, b2);
368: }
369: int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0)
370: | (b2 & 0x003F);
371: ch[out++] = (char) c;
372: count -= 2;
373: continue;
374: }
375:
376: // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
377: // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
378: // [1101 11yy] [yyxx xxxx] (low surrogate)
379: // * uuuuu = wwww + 1
380: if ((b0 & 0xF8) == 0xF0) {
381: int b1 = -1;
382: if (++in < total) {
383: b1 = fBuffer[in] & 0x00FF;
384: } else {
385: b1 = fInputStream.read();
386: if (b1 == -1) {
387: if (out > offset) {
388: fBuffer[0] = (byte) b0;
389: fOffset = 1;
390: return out - offset;
391: }
392: expectedByte(2, 4);
393: }
394: count++;
395: }
396: if ((b1 & 0xC0) != 0x80) {
397: if (out > offset) {
398: fBuffer[0] = (byte) b0;
399: fBuffer[1] = (byte) b1;
400: fOffset = 2;
401: return out - offset;
402: }
403: invalidByte(2, 4, b1);
404: }
405: int b2 = -1;
406: if (++in < total) {
407: b2 = fBuffer[in] & 0x00FF;
408: } else {
409: b2 = fInputStream.read();
410: if (b2 == -1) {
411: if (out > offset) {
412: fBuffer[0] = (byte) b0;
413: fBuffer[1] = (byte) b1;
414: fOffset = 2;
415: return out - offset;
416: }
417: expectedByte(3, 4);
418: }
419: count++;
420: }
421: if ((b2 & 0xC0) != 0x80) {
422: if (out > offset) {
423: fBuffer[0] = (byte) b0;
424: fBuffer[1] = (byte) b1;
425: fBuffer[2] = (byte) b2;
426: fOffset = 3;
427: return out - offset;
428: }
429: invalidByte(3, 4, b2);
430: }
431: int b3 = -1;
432: if (++in < total) {
433: b3 = fBuffer[in] & 0x00FF;
434: } else {
435: b3 = fInputStream.read();
436: if (b3 == -1) {
437: if (out > offset) {
438: fBuffer[0] = (byte) b0;
439: fBuffer[1] = (byte) b1;
440: fBuffer[2] = (byte) b2;
441: fOffset = 3;
442: return out - offset;
443: }
444: expectedByte(4, 4);
445: }
446: count++;
447: }
448: if ((b3 & 0xC0) != 0x80) {
449: if (out > offset) {
450: fBuffer[0] = (byte) b0;
451: fBuffer[1] = (byte) b1;
452: fBuffer[2] = (byte) b2;
453: fBuffer[3] = (byte) b3;
454: fOffset = 4;
455: return out - offset;
456: }
457: invalidByte(4, 4, b2);
458: }
459:
460: // decode bytes into surrogate characters
461: int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
462: if (uuuuu > 0x10) {
463: invalidSurrogate(uuuuu);
464: }
465: int wwww = uuuuu - 1;
466: int zzzz = b1 & 0x000F;
467: int yyyyyy = b2 & 0x003F;
468: int xxxxxx = b3 & 0x003F;
469: int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2)
470: | (yyyyyy >> 4);
471: int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
472:
473: // set characters
474: ch[out++] = (char) hs;
475: ch[out++] = (char) ls;
476: count -= 2;
477: continue;
478: }
479:
480: // error
481: if (out > offset) {
482: fBuffer[0] = (byte) b0;
483: fOffset = 1;
484: return out - offset;
485: }
486: invalidByte(1, 1, b0);
487: }
488:
489: // return number of characters converted
490: if (DEBUG_READ) {
491: if (log.isDebugEnabled())
492: log.debug("read(char[]," + offset + ',' + length
493: + "): count=" + count);
494: }
495: return count;
496:
497: } // read(char[],int,int)
498:
499: /**
500: * Skip characters. This method will block until some characters are
501: * available, an I/O error occurs, or the end of the stream is reached.
502: *
503: * @param n The number of characters to skip
504: *
505: * @return The number of characters actually skipped
506: *
507: * @exception IOException If an I/O error occurs
508: */
509: public long skip(long n) throws IOException {
510:
511: long remaining = n;
512: final char[] ch = new char[fBuffer.length];
513: do {
514: int length = ch.length < remaining ? ch.length
515: : (int) remaining;
516: int count = read(ch, 0, length);
517: if (count > 0) {
518: remaining -= count;
519: } else {
520: break;
521: }
522: } while (remaining > 0);
523:
524: long skipped = n - remaining;
525: return skipped;
526:
527: } // skip(long):long
528:
529: /**
530: * Tell whether this stream is ready to be read.
531: *
532: * @return True if the next read() is guaranteed not to block for input,
533: * false otherwise. Note that returning false does not guarantee that the
534: * next read will block.
535: *
536: * @exception IOException If an I/O error occurs
537: */
538: public boolean ready() throws IOException {
539: return false;
540: } // ready()
541:
542: /**
543: * Tell whether this stream supports the mark() operation.
544: */
545: public boolean markSupported() {
546: return false;
547: } // markSupported()
548:
549: /**
550: * Mark the present position in the stream. Subsequent calls to reset()
551: * will attempt to reposition the stream to this point. Not all
552: * character-input streams support the mark() operation.
553: *
554: * @param readAheadLimit Limit on the number of characters that may be
555: * read while still preserving the mark. After
556: * reading this many characters, attempting to
557: * reset the stream may fail.
558: *
559: * @exception IOException If the stream does not support mark(),
560: * or if some other I/O error occurs
561: */
562: public void mark(int readAheadLimit) throws IOException {
563: throw new IOException(Localizer.getMessage(
564: "jsp.error.xml.operationNotSupported", "mark()",
565: "UTF-8"));
566: }
567:
568: /**
569: * Reset the stream. If the stream has been marked, then attempt to
570: * reposition it at the mark. If the stream has not been marked, then
571: * attempt to reset it in some way appropriate to the particular stream,
572: * for example by repositioning it to its starting point. Not all
573: * character-input streams support the reset() operation, and some support
574: * reset() without supporting mark().
575: *
576: * @exception IOException If the stream has not been marked,
577: * or if the mark has been invalidated,
578: * or if the stream does not support reset(),
579: * or if some other I/O error occurs
580: */
581: public void reset() throws IOException {
582: fOffset = 0;
583: fSurrogate = -1;
584: } // reset()
585:
586: /**
587: * Close the stream. Once a stream has been closed, further read(),
588: * ready(), mark(), or reset() invocations will throw an IOException.
589: * Closing a previously-closed stream, however, has no effect.
590: *
591: * @exception IOException If an I/O error occurs
592: */
593: public void close() throws IOException {
594: fInputStream.close();
595: } // close()
596:
597: //
598: // Private methods
599: //
600:
601: /** Throws an exception for expected byte. */
602: private void expectedByte(int position, int count)
603: throws UTFDataFormatException {
604:
605: throw new UTFDataFormatException(Localizer.getMessage(
606: "jsp.error.xml.expectedByte", Integer
607: .toString(position), Integer.toString(count)));
608:
609: } // expectedByte(int,int,int)
610:
611: /** Throws an exception for invalid byte. */
612: private void invalidByte(int position, int count, int c)
613: throws UTFDataFormatException {
614:
615: throw new UTFDataFormatException(Localizer.getMessage(
616: "jsp.error.xml.invalidByte",
617: Integer.toString(position), Integer.toString(count)));
618: } // invalidByte(int,int,int,int)
619:
620: /** Throws an exception for invalid surrogate bits. */
621: private void invalidSurrogate(int uuuuu)
622: throws UTFDataFormatException {
623:
624: throw new UTFDataFormatException(Localizer.getMessage(
625: "jsp.error.xml.invalidHighSurrogate", Integer
626: .toHexString(uuuuu)));
627: } // invalidSurrogate(int)
628:
629: } // class UTF8Reader
|