001: /*
002:
003: ============================================================================
004: The Apache Software License, Version 1.1
005: ============================================================================
006:
007: Copyright (C) 1999-2003 The Apache Software Foundation. All rights reserved.
008:
009: Redistribution and use in source and binary forms, with or without modifica-
010: tion, are permitted provided that the following conditions are met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. Redistributions in binary form must reproduce the above copyright notice,
016: this list of conditions and the following disclaimer in the documentation
017: and/or other materials provided with the distribution.
018:
019: 3. The end-user documentation included with the redistribution, if any, must
020: include the following acknowledgment: "This product includes software
021: developed by the Apache Software Foundation (http://www.apache.org/)."
022: Alternately, this acknowledgment may appear in the software itself, if
023: and wherever such third-party acknowledgments normally appear.
024:
025: 4. The names "Batik" and "Apache Software Foundation" must not be
026: used to endorse or promote products derived from this software without
027: prior written permission. For written permission, please contact
028: apache@apache.org.
029:
030: 5. Products derived from this software may not be called "Apache", nor may
031: "Apache" appear in their name, without prior written permission of the
032: Apache Software Foundation.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
035: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
036: FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
037: APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
038: INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
039: DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
040: OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
041: ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044:
045: This software consists of voluntary contributions made by many individuals
046: on behalf of the Apache Software Foundation. For more information on the
047: Apache Software Foundation, please see <http://www.apache.org/>.
048:
049: */
050:
051: package org.apache.batik.util.io;
052:
053: import java.io.IOException;
054: import java.io.InputStream;
055:
056: /**
057: * This class represents an object which decodes UTF-8 characters from
058: * a stream of bytes.
059: *
060: * @author <a href="mailto:stephane@hillion.org">Stephane Hillion</a>
061: * @version $Id$
062: */
063: public class UTF8Decoder extends AbstractCharDecoder {
064:
065: /**
066: * The number of bytes of a UTF-8 sequence indexed by the first
067: * byte of the sequence.
068: */
069: protected final static byte[] UTF8_BYTES = { 1, 1, 1, 1, 1, 1, 1,
070: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
071: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
072: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
073: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
074: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
075: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
076: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
077: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
078: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
079: 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
080: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
081: 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,
082: 4, 0, 0, 0, 0, 0, 0, 0, 0, };
083:
084: /**
085: * The next char, in case of a 4 bytes sequence.
086: */
087: protected int nextChar = -1;
088:
089: /**
090: * Creates a new UTF8Decoder.
091: */
092: public UTF8Decoder(InputStream is) {
093: super (is);
094: }
095:
096: /**
097: * Reads the next character.
098: * @return a character or END_OF_STREAM.
099: */
100: public int readChar() throws IOException {
101: if (nextChar != -1) {
102: int result = nextChar;
103: nextChar = -1;
104: return result;
105: }
106: if (position == count) {
107: fillBuffer();
108: }
109: if (count == -1) {
110: return END_OF_STREAM;
111: }
112: int b1 = buffer[position++] & 0xff;
113: switch (UTF8_BYTES[b1]) {
114: default:
115: charError("UTF-8");
116:
117: case 1:
118: return b1;
119:
120: case 2:
121: if (position == count) {
122: fillBuffer();
123: }
124: if (count == -1) {
125: endOfStreamError("UTF-8");
126: }
127: return ((b1 & 0x1f) << 6) | (buffer[position++] & 0x3f);
128:
129: case 3:
130: if (position == count) {
131: fillBuffer();
132: }
133: if (count == -1) {
134: endOfStreamError("UTF-8");
135: }
136: int b2 = buffer[position++];
137: if (position == count) {
138: fillBuffer();
139: }
140: if (count == -1) {
141: endOfStreamError("UTF-8");
142: }
143: int b3 = buffer[position++];
144: if ((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80) {
145: charError("UTF-8");
146: }
147: return ((b1 & 0x1f) << 12) | ((b2 & 0x3f) << 6)
148: | (b3 & 0x1f);
149:
150: case 4:
151: if (position == count) {
152: fillBuffer();
153: }
154: if (count == -1) {
155: endOfStreamError("UTF-8");
156: }
157: b2 = buffer[position++];
158: if (position == count) {
159: fillBuffer();
160: }
161: if (count == -1) {
162: endOfStreamError("UTF-8");
163: }
164: b3 = buffer[position++];
165: if (position == count) {
166: fillBuffer();
167: }
168: if (count == -1) {
169: endOfStreamError("UTF-8");
170: }
171: int b4 = buffer[position++];
172: if ((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80
173: || (b4 & 0xc0) != 0x80) {
174: charError("UTF-8");
175: }
176: int c = ((b1 & 0x1f) << 18) | ((b2 & 0x3f) << 12)
177: | ((b3 & 0x1f) << 6) | (b4 & 0x1f);
178: nextChar = (c - 0x10000) % 0x400 + 0xdc00;
179: return (c - 0x10000) / 0x400 + 0xd800;
180: }
181: }
182: }
|