001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.xerces.readers;
059:
060: import org.apache.xerces.framework.XMLErrorReporter;
061: import org.apache.xerces.utils.CharDataChunk;
062: import org.apache.xerces.utils.StringPool;
063: import org.apache.xerces.utils.ImplementationMessages;
064: import java.io.InputStream;
065:
066: /**
067: * Simple character-based version of a UTF8 reader.
068: *
069: * This class is not commonly used, but is provided as a much simplified
070: * example of the UTF8Reader class that uses the AbstractCharReader to
071: * perform all of the reader functions except for filling each buffer
072: * of the character data when needed (fillCurrentChunk). We read the
073: * input data from an InputStream and perform end-of-line normalization
074: * as we process that data.
075: *
076: * @version
077: */
078: final class UTF8CharReader extends AbstractCharReader {
079: //
080: //
081: //
082: UTF8CharReader(XMLEntityHandler entityHandler,
083: XMLErrorReporter errorReporter,
084: boolean sendCharDataAsCharArray, InputStream dataStream,
085: StringPool stringPool) throws Exception {
086: super (entityHandler, errorReporter, sendCharDataAsCharArray,
087: stringPool);
088: fInputStream = dataStream;
089: fillCurrentChunk();
090: }
091:
092: //
093: //
094: //
095: private InputStream fInputStream = null;
096: //
097: // When we fill a chunk there may be data that was read from the
098: // input stream that has not been "processed". We need to save
099: // that data, and any in-progress state, between the calls to
100: // fillCurrentChunk() in these instance variables.
101: //
102: private boolean fCheckOverflow = false;
103: private byte[] fOverflow = null;
104: private int fOverflowOffset = 0;
105: private int fOverflowEnd = 0;
106: private int fOutputOffset = 0;
107: private boolean fSkipLinefeed = false;
108: private int fPartialMultiByteIn = 0;
109: private byte[] fPartialMultiByteChar = new byte[3];
110: private int fPartialSurrogatePair = 0;
111: private boolean fPartialMultiByteResult = false;
112:
113: //
114: //
115: //
116: protected int fillCurrentChunk() throws Exception {
117: //
118: // See if we can find a way to reuse the buffer that may have been returned
119: // with a recyled data chunk.
120: //
121: char[] recycledData = fCurrentChunk.toCharArray();
122: //
123: // If we have overflow from the last call, normalize from where
124: // we left off, copying into the front of the output buffer.
125: //
126: fOutputOffset = 0;
127: if (fCheckOverflow) {
128: //
129: // The fOverflowEnd should always be equal to CHUNK_SIZE, unless we hit
130: // EOF during the previous call. Copy the remaining data to the front
131: // of the buffer and return it as the final chunk.
132: //
133: fMostRecentData = recycledData;
134: if (fOverflowEnd < CharDataChunk.CHUNK_SIZE) {
135: recycledData = null;
136: if (fOverflowEnd > 0) {
137: if (fMostRecentData == null
138: || fMostRecentData.length < 1
139: + fOverflowEnd - fOverflowOffset)
140: fMostRecentData = new char[1 + fOverflowEnd
141: - fOverflowOffset];
142: copyNormalize(fOverflow, fOverflowOffset,
143: fMostRecentData, fOutputOffset);
144: } else {
145: if (fMostRecentData == null)
146: fMostRecentData = new char[1];
147: }
148: fMostRecentData[fOutputOffset] = 0;
149: //
150: // Update our instance variables
151: //
152: fOverflow = null;
153: fLength += fOutputOffset;
154: fCurrentIndex = 0;
155: fCurrentChunk.setCharArray(fMostRecentData);
156: return (fMostRecentChar = fMostRecentData[0]);
157: }
158: if (fMostRecentData == null
159: || fMostRecentData.length < CharDataChunk.CHUNK_SIZE)
160: fMostRecentData = new char[CharDataChunk.CHUNK_SIZE];
161: else
162: recycledData = null;
163: copyNormalize(fOverflow, fOverflowOffset, fMostRecentData,
164: fOutputOffset);
165: fCheckOverflow = false;
166: } else {
167: if (fOverflow == null)
168: fOverflow = new byte[CharDataChunk.CHUNK_SIZE];
169: fMostRecentData = null;
170: }
171: while (true) {
172: fOverflowOffset = 0;
173: fOverflowEnd = 0;
174: int capacity = CharDataChunk.CHUNK_SIZE;
175: int result = 0;
176: do {
177: try {
178: result = fInputStream.read(fOverflow, fOverflowEnd,
179: capacity);
180: } catch (java.io.IOException ex) {
181: result = -1;
182: }
183: if (result == -1) {
184: //
185: // We have reached the end of the stream.
186: //
187: fInputStream.close();
188: fInputStream = null;
189: if (fMostRecentData == null) {
190: //
191: // There is no previous output data, so we know that all of the
192: // new input data will fit.
193: //
194: fMostRecentData = recycledData;
195: if (fMostRecentData == null
196: || fMostRecentData.length < 1 + fOverflowEnd)
197: fMostRecentData = new char[1 + fOverflowEnd];
198: else
199: recycledData = null;
200: copyNormalize(fOverflow, fOverflowOffset,
201: fMostRecentData, fOutputOffset);
202: fOverflow = null;
203: fMostRecentData[fOutputOffset] = 0;
204: } else {
205: //
206: // Copy the input data to the end of the output buffer.
207: //
208: boolean alldone = copyNormalize(fOverflow,
209: fOverflowOffset, fMostRecentData,
210: fOutputOffset);
211: if (alldone) {
212: if (fOverflowEnd == CharDataChunk.CHUNK_SIZE) {
213: //
214: // Special case - everything fit into the overflow buffer,
215: // except that there is no room for the nul char we use to
216: // indicate EOF. Set the overflow buffer length to zero.
217: // On the next call to this method, we will detect this
218: // case and which we will handle above .
219: //
220: fCheckOverflow = true;
221: fOverflowOffset = 0;
222: fOverflowEnd = 0;
223: } else {
224: //
225: // It all fit into the output buffer.
226: //
227: fOverflow = null;
228: fMostRecentData[fOutputOffset] = 0;
229: }
230: } else {
231: //
232: // There is still input data left over, save the remaining data as
233: // the overflow buffer for the next call.
234: //
235: fCheckOverflow = true;
236: }
237: }
238: break;
239: }
240: if (result > 0) {
241: fOverflowEnd += result;
242: capacity -= result;
243: }
244: } while (capacity > 0);
245: //
246: //
247: //
248: if (result == -1)
249: break;
250: if (fMostRecentData != null) {
251: boolean alldone = copyNormalize(fOverflow,
252: fOverflowOffset, fMostRecentData, fOutputOffset);
253: if (fOutputOffset == CharDataChunk.CHUNK_SIZE) {
254: //
255: // We filled the output buffer.
256: //
257: if (!alldone) {
258: //
259: // The input buffer will become the next overflow buffer.
260: //
261: fCheckOverflow = true;
262: }
263: break;
264: }
265: } else {
266: //
267: // Now normalize the end-of-line characters and see if we need to read more
268: // bytes to fill up the buffer.
269: //
270: fMostRecentData = recycledData;
271: if (fMostRecentData == null
272: || fMostRecentData.length < CharDataChunk.CHUNK_SIZE)
273: fMostRecentData = new char[CharDataChunk.CHUNK_SIZE];
274: else
275: recycledData = null;
276: copyNormalize(fOverflow, fOverflowOffset,
277: fMostRecentData, fOutputOffset);
278: if (fOutputOffset == CharDataChunk.CHUNK_SIZE) {
279: //
280: // The output buffer is full. We can return now.
281: //
282: break;
283: }
284: }
285: //
286: // We will need to get another intput buffer to be able to fill the
287: // overflow buffer completely.
288: //
289: }
290: //
291: // Update our instance variables
292: //
293: fLength += fOutputOffset;
294: fCurrentIndex = 0;
295: fCurrentChunk.setCharArray(fMostRecentData);
296: return (fMostRecentChar = fMostRecentData[0]);
297: }
298:
299: //
300: // Copy and normalize bytes from the overflow buffer into chars in our data buffer.
301: //
302: private boolean copyNormalize(byte[] in, int inOffset, char[] out,
303: int outOffset) throws Exception {
304: //
305: // Handle all edge cases before dropping into the inner loop.
306: //
307: int inEnd = fOverflowEnd;
308: int outEnd = out.length;
309: if (inOffset == inEnd)
310: return true;
311: byte b = in[inOffset];
312: if (fSkipLinefeed) {
313: fSkipLinefeed = false;
314: if (b == 0x0A) {
315: if (++inOffset == inEnd)
316: return exitNormalize(inOffset, outOffset, true);
317: b = in[inOffset];
318: }
319: } else if (fPartialMultiByteIn > 0) {
320: if (!handlePartialMultiByteChar(b, in, inOffset, inEnd,
321: out, outOffset, outEnd))
322: return fPartialMultiByteResult;
323: inOffset = fOverflowOffset;
324: outOffset = fOutputOffset;
325: b = in[inOffset];
326: }
327: while (outOffset < outEnd) {
328: //
329: // Find the longest run that we can guarantee will not exceed the
330: // bounds of the outer loop.
331: //
332: int inCount = inEnd - inOffset;
333: int outCount = outEnd - outOffset;
334: if (inCount > outCount)
335: inCount = outCount;
336: inOffset++;
337: while (true) {
338: while (b == 0x0D || b < 0) {
339: if (b == 0x0D) {
340: out[outOffset++] = 0x0A;
341: if (inOffset == inEnd) {
342: fSkipLinefeed = true;
343: return exitNormalize(inOffset, outOffset,
344: true);
345: }
346: b = in[inOffset];
347: if (b == 0x0A) {
348: if (++inOffset == inEnd)
349: return exitNormalize(inOffset,
350: outOffset, true);
351: b = in[inOffset];
352: }
353: if (outOffset == outEnd)
354: return exitNormalize(inOffset, outOffset,
355: false);
356: } else {
357: if (!handleMultiByteChar(b, in, inOffset,
358: inEnd, out, outOffset, outEnd))
359: return fPartialMultiByteResult;
360: inOffset = fOverflowOffset;
361: outOffset = fOutputOffset;
362: b = in[inOffset];
363: }
364: inCount = inEnd - inOffset;
365: outCount = outEnd - outOffset;
366: if (inCount > outCount)
367: inCount = outCount;
368: inOffset++;
369: }
370: while (true) {
371: out[outOffset++] = (char) b;
372: if (--inCount == 0)
373: break;
374: b = in[inOffset++];
375: if (b == 0x0D || b < 0)
376: break;
377: }
378: if (inCount == 0)
379: break;
380: }
381: if (inOffset == inEnd)
382: break;
383: }
384: return exitNormalize(inOffset, outOffset, inOffset == inEnd);
385: }
386:
387: //
388: //
389: //
390: private boolean exitNormalize(int inOffset, int outOffset,
391: boolean result) {
392: fOverflowOffset = inOffset;
393: fOutputOffset = outOffset;
394: return result;
395: }
396:
397: //
398: //
399: //
400: private void savePartialMultiByte(int inCount, byte bz, byte by,
401: byte bx) {
402: fPartialMultiByteIn = inCount;
403: fPartialMultiByteChar[--inCount] = bz;
404: fPartialMultiByteChar[--inCount] = by;
405: fPartialMultiByteChar[--inCount] = bx;
406: }
407:
408: private void savePartialMultiByte(int inCount, byte bz, byte by) {
409: fPartialMultiByteIn = inCount;
410: fPartialMultiByteChar[--inCount] = bz;
411: fPartialMultiByteChar[--inCount] = by;
412: }
413:
414: private void savePartialMultiByte(int inCount, byte bz) {
415: fPartialMultiByteIn = inCount;
416: fPartialMultiByteChar[--inCount] = bz;
417: }
418:
419: private boolean handleMultiByteChar(byte b, byte[] in,
420: int inOffset, int inEnd, char[] out, int outOffset,
421: int outEnd) throws Exception {
422: if (inOffset == inEnd) {
423: savePartialMultiByte(1, b);
424: fPartialMultiByteResult = exitNormalize(inOffset,
425: outOffset, true);
426: return false;
427: }
428: byte b1 = in[inOffset++];
429: if ((b1 & 0xc0) != 0x80) {
430: Object[] args = { Integer.toHexString(b & 0xff),
431: Integer.toHexString(b1 & 0xff) };
432: deferException(ImplementationMessages.ENC5, args, outOffset);
433: out[outOffset++] = 0;
434: return exitNormalize(inOffset, outOffset, true);
435: }
436: if ((b & 0xe0) == 0xc0) { // 110yyyyy 10xxxxxx
437: int ch = ((0x1f & b) << 6) + (0x3f & b1);
438: out[outOffset++] = (char) ch;
439: if (inOffset == inEnd || outOffset == outEnd) {
440: fPartialMultiByteResult = exitNormalize(inOffset,
441: outOffset, inOffset == inEnd);
442: return false;
443: }
444: } else {
445: if (inOffset == inEnd) {
446: savePartialMultiByte(2, b1, b);
447: fPartialMultiByteResult = exitNormalize(inOffset,
448: outOffset, true);
449: return false;
450: }
451: byte b2 = in[inOffset++];
452: if ((b2 & 0xc0) != 0x80) {
453: Object[] args = { Integer.toHexString(b & 0xff),
454: Integer.toHexString(b1 & 0xff),
455: Integer.toHexString(b2 & 0xff) };
456: deferException(ImplementationMessages.ENC6, args,
457: outOffset);
458: out[outOffset++] = 0;
459: return exitNormalize(inOffset, outOffset, true);
460: }
461: if ((b & 0xf0) == 0xe0) { // 1110zzzz 10yyyyyy 10xxxxxx
462: int ch = ((0x0f & b) << 12) + ((0x3f & b1) << 6)
463: + (0x3f & b2);
464: out[outOffset++] = (char) ch;
465: if (inOffset == inEnd || outOffset == outEnd) {
466: fPartialMultiByteResult = exitNormalize(inOffset,
467: outOffset, inOffset == inEnd);
468: return false;
469: }
470: } else {
471: if ((b & 0xf8) != 0xf0) {
472: Object[] args = { Integer.toHexString(b & 0xff) };
473: deferException(ImplementationMessages.ENC4, args,
474: outOffset);
475: out[outOffset++] = 0;
476: return exitNormalize(inOffset, outOffset, true);
477: }
478: if (inOffset == inEnd) {
479: savePartialMultiByte(3, b2, b1, b);
480: fPartialMultiByteResult = exitNormalize(inOffset,
481: outOffset, true);
482: return false;
483: }
484: byte b3 = in[inOffset++];
485: if ((b3 & 0xc0) != 0x80) {
486: Object[] args = { Integer.toHexString(b & 0xff),
487: Integer.toHexString(b1 & 0xff),
488: Integer.toHexString(b2 & 0xff),
489: Integer.toHexString(b3 & 0xff) };
490: deferException(ImplementationMessages.ENC7, args,
491: outOffset);
492: out[outOffset++] = 0;
493: return exitNormalize(inOffset, outOffset, true);
494: }
495: int ch = ((0x0f & b) << 18) + ((0x3f & b1) << 12)
496: + ((0x3f & b2) << 6) + (0x3f & b3);
497: if (ch >= 0x10000) {
498: out[outOffset++] = (char) (((ch - 0x00010000) >> 10) + 0xd800);
499: ch = (((ch - 0x00010000) & 0x3ff) + 0xdc00);
500: if (outOffset == outEnd) {
501: fPartialSurrogatePair = ch;
502: fPartialMultiByteResult = exitNormalize(
503: inOffset, outOffset, inOffset == inEnd);
504: return false;
505: }
506: }
507: out[outOffset++] = (char) ch;
508: if (inOffset == inEnd || outOffset == outEnd) {
509: fPartialMultiByteResult = exitNormalize(inOffset,
510: outOffset, inOffset == inEnd);
511: return false;
512: }
513: }
514: }
515: return exitNormalize(inOffset, outOffset, true);
516: }
517:
518: private boolean handlePartialMultiByteChar(byte b, byte[] in,
519: int inOffset, int inEnd, char[] out, int outOffset,
520: int outEnd) throws Exception {
521: if (outOffset == outEnd) {
522: fPartialMultiByteResult = exitNormalize(inOffset,
523: outOffset, inOffset == inEnd);
524: return false;
525: }
526: if (fPartialMultiByteIn == 4) {
527: out[outOffset++] = (char) fPartialSurrogatePair;
528: if (outOffset == outEnd) {
529: fPartialMultiByteResult = exitNormalize(inOffset,
530: outOffset, false);
531: return false;
532: }
533: fOutputOffset = outOffset;
534: return true;
535: }
536: int byteIn = fPartialMultiByteIn;
537: fPartialMultiByteIn = 0;
538: byte b1 = 0;
539: byte b2 = 0;
540: byte b3 = 0;
541: switch (byteIn) {
542: case 1:
543: b1 = b;
544: break;
545: case 2:
546: b2 = b;
547: break;
548: case 3:
549: b3 = b;
550: break;
551: }
552: int i = byteIn;
553: switch (byteIn) {
554: case 3:
555: b2 = fPartialMultiByteChar[--i];
556: case 2:
557: b1 = fPartialMultiByteChar[--i];
558: case 1:
559: b = fPartialMultiByteChar[--i];
560: }
561: switch (byteIn) {
562: case 1:
563: if ((b1 & 0xc0) != 0x80) {
564: Object[] args = { Integer.toHexString(b),
565: Integer.toHexString(b1) };
566: deferException(ImplementationMessages.ENC5, args,
567: outOffset);
568: out[outOffset++] = 0;
569: break;
570: }
571: // fall through
572: case 2:
573: if ((b & 0xe0) == 0xc0) { // 110yyyyy 10xxxxxx
574: int ch = ((0x1f & b) << 6) + (0x3f & b1);
575: out[outOffset++] = (char) ch;
576: if (outOffset == outEnd) {
577: fPartialMultiByteResult = exitNormalize(inOffset,
578: outOffset, false);
579: return false;
580: }
581: if (byteIn < 2 && ++inOffset == inEnd) {
582: fPartialMultiByteResult = exitNormalize(inOffset,
583: outOffset, true);
584: return false;
585: }
586: break;
587: }
588: if (byteIn < 2) {
589: if (++inOffset == inEnd) {
590: savePartialMultiByte(2, b1);
591: fPartialMultiByteResult = exitNormalize(inOffset,
592: outOffset, true);
593: return false;
594: }
595: b2 = in[inOffset];
596: }
597: if ((b2 & 0xc0) != 0x80) {
598: Object[] args = { Integer.toHexString(b),
599: Integer.toHexString(b1),
600: Integer.toHexString(b2) };
601: deferException(ImplementationMessages.ENC6, args,
602: outOffset);
603: out[outOffset++] = 0;
604: break;
605: }
606: // fall through
607: case 3:
608: if ((b & 0xf0) == 0xe0) { // 1110zzzz 10yyyyyy 10xxxxxx
609: int ch = ((0x0f & b) << 12) + ((0x3f & b1) << 6)
610: + (0x3f & b2);
611: out[outOffset++] = (char) ch;
612: if (outOffset == outEnd) {
613: fPartialMultiByteResult = exitNormalize(inOffset,
614: outOffset, false);
615: return false;
616: }
617: if (byteIn < 3 && ++inOffset == inEnd) {
618: fPartialMultiByteResult = exitNormalize(inOffset,
619: outOffset, true);
620: return false;
621: }
622: break;
623: }
624: if (byteIn < 3) {
625: if ((b & 0xf8) != 0xf0) {
626: Object[] args = { Integer.toHexString(b) };
627: deferException(ImplementationMessages.ENC4, args,
628: outOffset);
629: out[outOffset++] = 0;
630: break;
631: }
632: if (++inOffset == inEnd) {
633: savePartialMultiByte(3, b2, b1);
634: fPartialMultiByteResult = exitNormalize(inOffset,
635: outOffset, true);
636: return false;
637: }
638: b3 = in[inOffset];
639: }
640: if ((b3 & 0xc0) != 0x80) {
641: Object[] args = { Integer.toHexString(b),
642: Integer.toHexString(b1),
643: Integer.toHexString(b2),
644: Integer.toHexString(b3) };
645: deferException(ImplementationMessages.ENC7, args,
646: outOffset);
647: out[outOffset++] = 0;
648: break;
649: }
650: int ch = ((0x0f & b) << 18) + ((0x3f & b1) << 12)
651: + ((0x3f & b2) << 6) + (0x3f & b3);
652: if (ch >= 0x10000) {
653: out[outOffset++] = (char) (((ch - 0x00010000) >> 10) + 0xd800);
654: ch = (((ch - 0x00010000) & 0x3ff) + 0xdc00);
655: if (outOffset == outEnd) {
656: fPartialSurrogatePair = ch;
657: fPartialMultiByteResult = exitNormalize(inOffset,
658: outOffset, false);
659: return false;
660: }
661: }
662: out[outOffset++] = (char) ch;
663: if (outOffset == outEnd) {
664: fPartialMultiByteResult = exitNormalize(inOffset,
665: outOffset, false);
666: return false;
667: }
668: if (++inOffset == inEnd) {
669: fPartialMultiByteResult = exitNormalize(inOffset,
670: outOffset, true);
671: return false;
672: }
673: break;
674: }
675: return exitNormalize(inOffset, outOffset, true);
676: }
677: }
|