001: /*
002: * $Id: UTF8XMLDecoder.java,v 1.5 2004/07/11 09:37:37 yuvalo Exp $
003: *
004: * (C) Copyright 2002-2004 by Yuval Oren. All rights reserved.
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: package com.bluecast.xml;
020:
021: import com.bluecast.io.CharsetDecoder;
022: import com.bluecast.io.IllegalCharException;
023:
024: import java.io.CharConversionException;
025:
026: /**
027: * A decoder for UTF-8 text. Also converts
028: * carriage returns into linefeeds and CRLF into LF.
029: *
030: * @author Yuval Oren, yuval@bluecast.com
031: * @version $Revision: 1.5 $
032: */
033:
034: final public class UTF8XMLDecoder implements XMLDecoder {
035: private boolean sawCR = false;
036:
037: public CharsetDecoder newCharsetDecoder() {
038: return newXMLDecoder();
039: }
040:
041: public XMLDecoder newXMLDecoder() {
042: return new UTF8XMLDecoder();
043: }
044:
045: public int minBytesPerChar() {
046: return 1;
047: }
048:
049: public int maxBytesPerChar() {
050: return 3;
051: }
052:
053: public void reset() {
054: sawCR = false;
055: }
056:
057: public void decode(byte[] in_buf, int in_off, int in_len,
058: char[] out_buf, int out_off, int out_len, int[] result)
059: throws CharConversionException {
060: int i, o;
061:
062: for (i = o = 0; i < in_len && o < out_len; i++) {
063: // A UTF-8 character can be 1 or more bytes. Length
064: // is determined by the bitmask of the first byte
065: int c, c2, c3, c4;
066: c = in_buf[in_off + i];
067: // Check for 1 byte first, since it's the most common.
068: // One byte: 0xxxxxxx
069: if ((c & 0x80) == 0) {
070: /* We got the character */
071: } else {
072: // It's at least two bytes long
073: if (++i < in_len)
074: c2 = in_buf[in_off + i];
075: else {
076: result[0] = i - 1;
077: result[1] = o;
078: return;
079: }
080:
081: // Two bytes: 110xxxxx 10xxxxxx
082: if ((c & 0xE0) == 0xC0) {
083: // Subsequent bytes must begin with 0x10
084: if ((c2 & 0x80) != 0x80)
085: throw new CharConversionException(
086: "Malformed UTF-8 character: 0x"
087: + Integer.toHexString(c & 0xFF)
088: + " 0x"
089: + Integer
090: .toHexString(c2 & 0xFF));
091:
092: c = ((c & 0x1F) << 6) | (c2 & 0x3F);
093: // Make sure this is not an overlong character:
094: // this character must not fit within 7 bits
095: if ((c & 0x780) == 0)
096: throw new CharConversionException(
097: "2-byte UTF-8 character is overlong: 0x"
098: + Integer
099: .toHexString(in_buf[in_off
100: + i - 1] & 0xFF)
101: + " 0x"
102: + Integer
103: .toHexString(c2 & 0xFF));
104:
105: }
106: // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
107: else if ((c & 0xF0) == 0xE0) {
108: if (++i < in_len)
109: c3 = in_buf[in_off + i];
110: else {
111: result[0] = i - 2;
112: result[1] = o;
113: return;
114: }
115:
116: // Subsequent bytes must begin with 0x10
117: if (((c2 & 0x80) != 0x80) || ((c3 & 0x80) != 0x80))
118: throw new CharConversionException(
119: "Malformed UTF-8 character: 0x"
120: + Integer.toHexString(c & 0xFF)
121: + " 0x"
122: + Integer
123: .toHexString(c2 & 0xFF)
124: + " 0x"
125: + Integer
126: .toHexString(c3 & 0xFF));
127:
128: c = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6)
129: | (c3 & 0x3F);
130: // Make sure this is not an overlong character:
131: // this character must not fit within 11 bits
132: if ((c & 0xF800) == 0)
133: throw new CharConversionException(
134: "3-byte UTF-8 character is overlong: 0x"
135: + Integer
136: .toHexString(in_buf[in_off
137: + i - 2] & 0xFF)
138: + " 0x"
139: + Integer
140: .toHexString(c2 & 0xFF)
141: + " 0x"
142: + Integer
143: .toHexString(c3 & 0xFF));
144: }
145:
146: // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
147: // This is returned as a surrogate pair of characters
148: else if ((c & 0xF0) == 0xF0) {
149: if (i + 2 < in_len) {
150: c3 = in_buf[in_off + (++i)];
151: c4 = in_buf[in_off + (++i)];
152: } else {
153: result[0] = i - 2;
154: result[1] = o;
155: return;
156: }
157:
158: // Subsequent bytes must begin with 0x10
159: if (((c2 & 0x80) != 0x80) || ((c3 & 0x80) != 0x80)
160: || ((c4 & 0x80) != 0x80))
161: throw new CharConversionException(
162: "Malformed UTF-8 character: 0x"
163: + Integer.toHexString(c & 0xFF)
164: + " 0x"
165: + Integer
166: .toHexString(c2 & 0xFF)
167: + " 0x"
168: + Integer
169: .toHexString(c3 & 0xFF)
170: + " 0x"
171: + Integer
172: .toHexString(c4 & 0xFF));
173:
174: c = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12)
175: | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
176:
177: if (c < 0x10000 || c > 0x10FFFF)
178: throw new IllegalCharException(
179: "Illegal XML character: 0x"
180: + Integer.toHexString(c));
181:
182: // Construct the surrogate pair
183: c -= 0x10000;
184: out_buf[out_off + (o++)] = (char) ((c >> 10) | 0xD800);
185: out_buf[out_off + (o++)] = (char) ((c & ((1 << 10) - 1)) | 0xDC00);
186: sawCR = false;
187: continue;
188: } else {
189: throw new CharConversionException(
190: "Characters larger than 4 bytes are "
191: + "not supported: byte 0x"
192: + Integer.toHexString(c & 0xFF)
193: + " implies a length of more than 4 bytes");
194: }
195:
196: if ((c >= 0xD800 && c < 0xE000)
197: || (c == 0xFFFE || c == 0xFFFF))
198: throw new IllegalCharException(
199: "Illegal XML character: 0x"
200: + Integer.toHexString(c));
201: }
202: // Now condense CRLF into LF and transform a lone CR into LF
203: if (c >= 0x20) {
204: sawCR = false;
205: out_buf[out_off + o++] = (char) c;
206: } else {
207: switch (c) {
208: case '\n':
209: if (sawCR) {
210: sawCR = false;
211: } else
212: out_buf[out_off + o++] = '\n';
213: break;
214:
215: case '\r':
216: sawCR = true;
217: out_buf[out_off + o++] = '\n';
218: break;
219:
220: case '\t':
221: out_buf[out_off + o++] = '\t';
222: break;
223:
224: default:
225: throw new IllegalCharException(
226: "Illegal XML character: 0x"
227: + Integer.toHexString(c));
228: }
229: }
230: }
231: result[0] = i;
232: result[1] = o;
233: }
234:
235: public void decodeXMLDecl(byte[] in_buf, int in_off, int in_len,
236: char[] out_buf, int out_off, int out_len, int[] result)
237: throws CharConversionException {
238: int i, o;
239: inputLoop: for (i = o = 0; i < in_len && o < out_len; i++) {
240: // A UTF-8 character can be 1 or more bytes. Length
241: // is determined by the bitmask of the first byte
242: int c = in_buf[in_off + i];
243:
244: // Check for 1 byte first, since it's the most common.
245: // One byte: 0xxxxxxx
246: if ((c & 0x80) != 0) {
247: // XML declarations don't use more than 1 byte per char.
248: break inputLoop;
249: }
250:
251: // Now condense CRLF into LF and transform a lone CR into LF
252: if (c >= 0x20) {
253: sawCR = false;
254: out_buf[out_off + o++] = (char) c;
255:
256: // Stop decoding after the first '>' because anything more
257: // can't be part of the declaration
258: if (c == '>') {
259: i++;
260: break inputLoop;
261: }
262: } else {
263: switch (c) {
264: case '\n':
265: if (sawCR) {
266: sawCR = false;
267: } else
268: out_buf[out_off + o++] = '\n';
269: break;
270:
271: case '\r':
272: sawCR = true;
273: out_buf[out_off + o++] = '\n';
274: break;
275:
276: case '\t':
277: out_buf[out_off + o++] = '\t';
278: break;
279:
280: default:
281: // Illegal character. Stop decoding
282: break inputLoop;
283: }
284: }
285: }
286: result[0] = i;
287: result[1] = o;
288: }
289:
290: }
|