001: /*
002: * Copyright 1999-2004 The Apache Software Foundation
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.tomcat.util.buf;
018:
019: import java.io.IOException;
020:
021: /**
022: * Moved from ByteChunk - code to convert from UTF8 bytes to chars.
023: * Not used in the current tomcat3.3 : the performance gain is not very
024: * big if the String is created, only if we avoid that and work only
* on char[]. Until then, it's better to be safe. ( I tested this code
026: * with 2 and 3 bytes chars, and it works fine in xerces )
027: *
028: * Cut from xerces' UTF8Reader.copyMultiByteCharData()
029: *
030: * @author Costin Manolache
031: * @author ( Xml-Xerces )
032: */
033: public final class UTF8Decoder extends B2CConverter {
034: // may have state !!
035:
036: public UTF8Decoder() {
037:
038: }
039:
040: public void recycle() {
041: }
042:
043: public void convert(ByteChunk mb, CharChunk cb) throws IOException {
044: int bytesOff = mb.getOffset();
045: int bytesLen = mb.getLength();
046: byte bytes[] = mb.getBytes();
047:
048: int j = bytesOff;
049: int end = j + bytesLen;
050:
051: while (j < end) {
052: int b0 = 0xff & bytes[j];
053:
054: if ((b0 & 0x80) == 0) {
055: cb.append((char) b0);
056: j++;
057: continue;
058: }
059:
060: // 2 byte ?
061: if (j++ >= end) {
062: // ok, just ignore - we could throw exception
063: throw new IOException("Conversion error - EOF ");
064: }
065: int b1 = 0xff & bytes[j];
066:
067: // ok, let's the fun begin - we're handling UTF8
068: if ((0xe0 & b0) == 0xc0) { // 110yyyyy 10xxxxxx (0x80 to 0x7ff)
069: int ch = ((0x1f & b0) << 6) + (0x3f & b1);
070: if (debug > 0)
071: log("Convert " + b0 + " " + b1 + " " + ch
072: + ((char) ch));
073:
074: cb.append((char) ch);
075: j++;
076: continue;
077: }
078:
079: if (j++ >= end)
080: return;
081: int b2 = 0xff & bytes[j];
082:
083: if ((b0 & 0xf0) == 0xe0) {
084: if ((b0 == 0xED && b1 >= 0xA0)
085: || (b0 == 0xEF && b1 == 0xBF && b2 >= 0xBE)) {
086: if (debug > 0)
087: log("Error " + b0 + " " + b1 + " " + b2);
088:
089: throw new IOException("Conversion error 2");
090: }
091:
092: int ch = ((0x0f & b0) << 12) + ((0x3f & b1) << 6)
093: + (0x3f & b2);
094: cb.append((char) ch);
095: if (debug > 0)
096: log("Convert " + b0 + " " + b1 + " " + b2 + " "
097: + ch + ((char) ch));
098: j++;
099: continue;
100: }
101:
102: if (j++ >= end)
103: return;
104: int b3 = 0xff & bytes[j];
105:
106: if ((0xf8 & b0) == 0xf0) {
107: if (b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90)) {
108: if (debug > 0)
109: log("Convert " + b0 + " " + b1 + " " + b2 + " "
110: + b3);
111: throw new IOException("Conversion error ");
112: }
113: int ch = ((0x0f & b0) << 18) + ((0x3f & b1) << 12)
114: + ((0x3f & b2) << 6) + (0x3f & b3);
115:
116: if (debug > 0)
117: log("Convert " + b0 + " " + b1 + " " + b2 + " "
118: + b3 + " " + ch + ((char) ch));
119:
120: if (ch < 0x10000) {
121: cb.append((char) ch);
122: } else {
123: cb
124: .append((char) (((ch - 0x00010000) >> 10) + 0xd800));
125: cb
126: .append((char) (((ch - 0x00010000) & 0x3ff) + 0xdc00));
127: }
128: j++;
129: continue;
130: } else {
131: // XXX Throw conversion exception !!!
132: if (debug > 0)
133: log("Convert " + b0 + " " + b1 + " " + b2 + " "
134: + b3);
135: throw new IOException("Conversion error 4");
136: }
137: }
138: }
139:
140: private static int debug = 1;
141:
142: void log(String s) {
143: System.out.println("UTF8Decoder: " + s);
144: }
145:
146: }
|