001: /*
002: * @(#)CharToByteUTF8.java 1.19 06/10/10
003: *
004: * Copyright 1990-2006 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: *
026: */
027: package sun.io;
028:
029: /**
030: * UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter.
031: * The UTF-8 bit patterns for each encoded length are shown below.
032: *
033: * # Bits Bit pattern
034: * 1 7 0xxxxxxx
035: * 2 11 110xxxxx 10xxxxxx
036: * 3 16 1110xxxx 10xxxxxx 10xxxxxx
037: * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
038: * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
039: * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
040: *
041: * UCS2 uses 1-3 / UTF16 uses 1-4 / UCS4 uses 1-6
042: */
043:
044: public class CharToByteUTF8 extends CharToByteConverter {
045:
046: private char highHalfZoneCode;
047:
048: public int flush(byte[] output, int outStart, int outEnd)
049: throws MalformedInputException {
050: if (highHalfZoneCode != 0) {
051: highHalfZoneCode = 0;
052: badInputLength = 0;
053: throw new MalformedInputException();
054: }
055: byteOff = charOff = 0;
056: return 0;
057: }
058:
059: /**
060: * Character conversion
061: */
062: public int convert(char[] input, int inOff, int inEnd,
063: byte[] output, int outOff, int outEnd)
064: throws ConversionBufferFullException,
065: MalformedInputException {
066: char inputChar;
067: byte[] outputByte = new byte[6];
068: int inputSize;
069: int outputSize;
070:
071: charOff = inOff;
072: byteOff = outOff;
073:
074: if (highHalfZoneCode != 0) {
075: inputChar = highHalfZoneCode;
076: highHalfZoneCode = 0;
077: if (input[inOff] >= 0xdc00 && input[inOff] <= 0xdfff) {
078: // This is legal UTF16 sequence.
079: int ucs4 = (highHalfZoneCode - 0xd800) * 0x400
080: + (input[inOff] - 0xdc00) + 0x10000;
081: output[0] = (byte) (0xf0 | ((ucs4 >> 18)) & 0x07);
082: output[1] = (byte) (0x80 | ((ucs4 >> 12) & 0x3f));
083: output[2] = (byte) (0x80 | ((ucs4 >> 6) & 0x3f));
084: output[3] = (byte) (0x80 | (ucs4 & 0x3f));
085: charOff++;
086: highHalfZoneCode = 0;
087: } else {
088: // This is illegal UTF16 sequence.
089: badInputLength = 0;
090: throw new MalformedInputException();
091: }
092: }
093:
094: while (charOff < inEnd) {
095: inputChar = input[charOff];
096: if (inputChar < 0x80) {
097: outputByte[0] = (byte) inputChar;
098: inputSize = 1;
099: outputSize = 1;
100: } else if (inputChar < 0x800) {
101: outputByte[0] = (byte) (0xc0 | ((inputChar >> 6) & 0x1f));
102: outputByte[1] = (byte) (0x80 | (inputChar & 0x3f));
103: inputSize = 1;
104: outputSize = 2;
105: } else if (inputChar >= 0xd800 && inputChar <= 0xdbff) {
106: // this is <high-half zone code> in UTF-16
107: if (charOff + 1 >= inEnd) {
108: highHalfZoneCode = inputChar;
109: break;
110: }
111: // check next char is valid <low-half zone code>
112: char lowChar = input[charOff + 1];
113: if (lowChar < 0xdc00 || lowChar > 0xdfff) {
114: badInputLength = 1;
115: throw new MalformedInputException();
116: }
117: int ucs4 = (inputChar - 0xd800) * 0x400
118: + (lowChar - 0xdc00) + 0x10000;
119: outputByte[0] = (byte) (0xf0 | ((ucs4 >> 18)) & 0x07);
120: outputByte[1] = (byte) (0x80 | ((ucs4 >> 12) & 0x3f));
121: outputByte[2] = (byte) (0x80 | ((ucs4 >> 6) & 0x3f));
122: outputByte[3] = (byte) (0x80 | (ucs4 & 0x3f));
123: outputSize = 4;
124: inputSize = 2;
125: } else {
126: outputByte[0] = (byte) (0xe0 | ((inputChar >> 12)) & 0x0f);
127: outputByte[1] = (byte) (0x80 | ((inputChar >> 6) & 0x3f));
128: outputByte[2] = (byte) (0x80 | (inputChar & 0x3f));
129: inputSize = 1;
130: outputSize = 3;
131: }
132: if (byteOff + outputSize > outEnd) {
133: throw new ConversionBufferFullException();
134: }
135: for (int i = 0; i < outputSize; i++) {
136: output[byteOff++] = outputByte[i];
137: }
138: charOff += inputSize;
139: }
140: return byteOff - outOff;
141: }
142:
143: public boolean canConvert(char ch) {
144: return true;
145: }
146:
147: public int getMaxBytesPerChar() {
148: return 3;
149: }
150:
151: public void reset() {
152: byteOff = charOff = 0;
153: highHalfZoneCode = 0;
154: }
155:
156: public String getCharacterEncoding() {
157: return "UTF8";
158: }
159: }
|