001: package uk.org.ponder.stringutil;
002:
003: /** The Sun "ByteToCharUTF8" gives neither control nor
004: * feedback from the UTF8 conversion process.
005: * The following URLs:
006: * <br><a href="http://www-106.ibm.com/developerworks/library/utfencodingforms/">
007: * http://www-106.ibm.com/developerworks/library/utfencodingforms/</a>
008: * <br><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">
009: * http://www.cl.cam.ac.uk/~mgk25/unicode.html</a>
010: * <br>are invaluable in understanding what is going on here.
011: * Note these two are actually in disagreement - IBM agrees with this file,
012: * and says that max UTF-8 is 4 bytes, whereas Kuhn goes up to 6.
013: * RFC2781 says that > 4byte UTF-8 values (> U+1ffff) are simply not convertible to
014: * UTF-16 which is presumably what is used by Java, so the difference is academic.
015: * It would appear characters > U+10000 have not appeared until Unicode 3.1,
016: * standard dated 23/03/01.
017: *
018: * <p>Sun implementation insists on throwing MalformedInputException on all
019: * possible occasions, this is not acceptable strategy for getting user to
020: * correct invalid characters. It should be able to throw UnknownCharacterException,
021: * but this decoder never does.
022: *
023: * <p>Note that this code obeys recommendation against accepting overlong encodings
024: * of the same characer.
025: */
026:
027: public class ByteToCharUTF8 extends ByteToCharConverter {
028:
029: public ByteToCharUTF8() {
030: }
031:
032: /*
033: * Characters to use for automatic substitution.
034: */
035: // This character \uFFFD is actually assigned, NB
036: // http://oss.software.ibm.com/developerworks/opensource/icu/project/archives/icu-bugrfe/icu-bugrfe.0007/msg00062.html
037: // which claims that some converters incorrectly use it as unassigned.
038: // this is "Hollow box, width 1"
039: // Markus Kuhn recommends 0xDCxx rather than 0xfffd
040: // DCxx are in the "low-surrogate" range for UTF-16, which are in error if they appear alone
041: // not preceded by 0xD8xx (Unicode book, ch 13).
042: // Kuhn argues that this is wise because it will generate an error on any further
043: // decoding, but is lossless.
044: // protected char[] subChars = { '\uFFFD' };
045: // small array to avoid output of incomplete UTF-16
046: char[] outputChar = new char[6];
047:
048: // The input_sequence_length variable is left set until the head of this loop,
049: // so that error handlers can find the erroneous sequence, and the input fillers can tell
050: // how much input is being awaited. However, the true position of fully converted input
051: // is always stored in inbufferpos.
052:
053: public void handleEncodingError(String errortype) {
054: for (int i = 0; i < input_sequence_length; ++i) {
055: // remember to mask for sign-extension widening
056: outputChar[i] = (char) (0xdc00 + (inbuffer[inbufferpos + i] & 0xff));
057: }
058: // Kuhn actually recommends output sequence of 1 byte here, but this is
059: // inconsistent with his other advice about lossless error encoding
060: output_sequence_length = input_sequence_length;
061: super .handleEncodingError(errortype);
062: }
063:
064: public int convert() {
065: int byte1, byte2, byte3, byte4;
066: input_sequence_length = 0;
067:
068: // This loop touches:
069: // inbufferpos, inbufferlimit, inbuffer, input_sequence_length
070: // outbufferpos, outbufferlimit, outbuffer, output_sequence_length
071: // (errorhandler)
072:
073: /* System.out.println("ByteToCharUTF8 beginning convert() with inbufferpos "+ inbufferpos +
074: " inbufferlimit "+inbufferlimit);*/
075: while (inbufferpos < inbufferlimit) { // smaller loop to repeatedly get input
076:
077: byte1 = inbuffer[inbufferpos] & 0xff;
078: // useful aide-memoire - the first level boundary gains 4 bits, all subsequent gain 5
079: if ((byte1 & 0x80) == 0) { // level 1 character - single ASCII byte U-0x0 - 0x7f
080: if (byte1 == (int) '\n')
081: ++linenumber;
082: input_sequence_length = 1;
083: outputChar[0] = (char) byte1;
084: output_sequence_length = 1;
085: // System.out.print(outputChar[0]);
086: } else if ((byte1 & 0xe0) == 0xc0) { // level 2 high bits should be 110llll0
087: // level 2 - two bytes U-0x80 - 0x7ff
088: input_sequence_length = 2;
089: if (missing_bytes() > 0) { // overflow of 1 byte into next round
090: return STOP_INPUT_EXHAUSTED;
091: }
092: byte2 = inbuffer[inbufferpos + 1] & 0xff;
093: if ((byte2 & 0xc0) != 0x80) { // level 2 error.
094: handleEncodingError("Invalid 2-byte UTF-8 encoding");
095: } else if ((byte1 & 0x1e) == 0) { // reject overlong sequence 0x7f or less
096: handleEncodingError("Overlong 2-byte UTF-8 encoding");
097: } else {
098: outputChar[0] = (char) (((byte1 & 0x1f) << 6) | (byte2 & 0x3f));
099: output_sequence_length = 1;
100: }
101: //System.out.print("[2]"+outputChar[0]);
102: } else if ((byte1 & 0xf0) == 0xe0) { // level 3 high bits should be 1110llll 10lxxxxx
103: // level 3 - 3 bytes U-0x800 - 0xffff
104: input_sequence_length = 3;
105: if (missing_bytes() > 0) { // overflow of one byte into next round
106: return STOP_INPUT_EXHAUSTED;
107: }
108: byte2 = inbuffer[inbufferpos + 1] & 0xff;
109: byte3 = inbuffer[inbufferpos + 2] & 0xff;
110: if ((byte2 & 0xc0) != 0x80 || (byte3 & 0xc0) != 0x80) { // level 3 error
111: handleEncodingError("Invalid 3-byte UTF-8 encoding");
112: } else if ((byte1 & 0xf) == 0 && (byte2 & 0x20) == 0) {
113: handleEncodingError("Overlong 3-byte UTF-8 encoding");
114: // reject overlong sequence - 0x7ff or less
115: } else {
116: outputChar[0] = (char) (((byte1 & 0x0f) << 12)
117: | ((byte2 & 0x3f) << 6) | (byte3 & 0x3f));
118: output_sequence_length = 1;
119: }
120: // This is the place we would reject incorrect UTF-16 surrogates if we could
121: // be bothered
122: //System.out.print("[3]"+outputChar[0]);
123: } else if ((byte1 & 0xf8) == 0xf0) { // level 4 high bits should be 11110lll 10llxxxxtc.
124: // l bits should be 1 for non-overlong sequence
125: // level 4 - 4 bytes U-0x10000 - 0x1fffff
126: input_sequence_length = 4;
127: if (missing_bytes() > 0) {
128: return STOP_INPUT_EXHAUSTED;
129: }
130: byte2 = inbuffer[inbufferpos + 1] & 0xff;
131: byte3 = inbuffer[inbufferpos + 2] & 0xff;
132: byte4 = inbuffer[inbufferpos + 3] & 0xff;
133: if ((byte2 & 0xc0) != 0x80 || (byte3 & 0xc0) != 0x80
134: || (byte4 & 0xc0) != 0x80) { // level 4 error if all high bits are not 10xxxxxx
135: handleEncodingError("Invalid 4-byte UTF-8 encoding");
136: } else if ((byte1 & 0x7) == 0 && (byte2 & 0x30) == 0) { // reject overlong sequence
137: handleEncodingError("Overlong 4-byte UTF-8 encoding");
138: } else if ((byte1 & 0x4) != 0) {
139: handleEncodingError("4-byte UTF-8 encoding unrepresentable as UTF-16");
140: }
141: // this byte sequence is UTF-16 character, needs encoding as surrogate pair
142: // see RFC2781 for specification.
143: /*
144: * - Characters with values between 0x10000 and 0x10FFFF are
145: * represented by a 16-bit integer with a value between 0xD800 and
146: * 0xDBFF (within the so-called high-half zone or high surrogate
147: * area) followed by a 16-bit integer with a value between 0xDC00 and
148: * 0xDFFF (within the so-called low-half zone or low surrogate area).
149: *
150: * - Characters with values greater than 0x10FFFF cannot be encoded in
151: * UTF-16.
152: */
153: // UTF-16 packs bits as 110110xx xxxxxxxx 110111xx xxxxxxxx for 20 bits.
154: else {
155: int ucs4 = (0x07 & byte1) << 18
156: | (0x3f & byte2) << 12
157: | (0x3f & byte3) << 6 | (0x3f & byte4); // get 3, 6, 6, 6 bits = 21 bits with high 0.
158: outputChar[0] = (char) ((ucs4 - 0x10000) / 0x400 + 0xd800);
159: outputChar[1] = (char) ((ucs4 - 0x10000) % 0x400 + 0xdc00);
160: output_sequence_length = 2;
161: //System.out.print("[4]"+outputChar[0]+outputChar[1]);
162: }
163: } else if ((byte1 & 0xfc) == 0xf8) {
164: input_sequence_length = 5;
165: if (missing_bytes() > 0) {
166: return STOP_INPUT_EXHAUSTED;
167: }
168: byte2 = inbuffer[inbufferpos + 1] & 0xff;
169: byte3 = inbuffer[inbufferpos + 2] & 0xff;
170: byte4 = inbuffer[inbufferpos + 3] & 0xff;
171: int byte5 = inbuffer[inbufferpos + 4] & 0xff;
172: if ((byte2 & 0xc0) != 0x80 || (byte3 & 0xc0) != 0x80
173: || (byte4 & 0xc0) != 0x80
174: || (byte5 & 0xc0) != 0x80) { // level 5 error if all high bits are not 10xxxxxx
175: handleEncodingError("Invalid 5-byte UTF-8 encoding");
176: } else {
177: handleEncodingError("5-byte UTF-8 encoding unrepresentable as UTF-16");
178: }
179: } else if ((byte1 & 0xfe) == 0xfc) {
180: input_sequence_length = 6;
181: if (missing_bytes() > 0) {
182: return STOP_INPUT_EXHAUSTED;
183: }
184: byte2 = inbuffer[inbufferpos + 1] & 0xff;
185: byte3 = inbuffer[inbufferpos + 2] & 0xff;
186: byte4 = inbuffer[inbufferpos + 3] & 0xff;
187: int byte5 = inbuffer[inbufferpos + 4] & 0xff;
188: int byte6 = inbuffer[inbufferpos + 5] & 0xff;
189: if ((byte2 & 0xc0) != 0x80 || (byte3 & 0xc0) != 0x80
190: || (byte4 & 0xc0) != 0x80
191: || (byte5 & 0xc0) != 0x80
192: || (byte6 & 0xc0) != 0x80) { // level 6 error if all high bits are not 10xxxxxx
193: handleEncodingError("Invalid 6-byte UTF-8 encoding");
194: } else {
195: handleEncodingError("6-byte UTF-8 encoding unrepresentable as UTF-16");
196: }
197: } else {
198: input_sequence_length = 1;
199: handleEncodingError("Invalid 1-byte UTF-8 encoding");
200: }
201: // run out of output space --- back up to beginning of sequence and wait
202: // for another output buffer to be supplied.
203: if (outbufferpos + output_sequence_length > outbufferlimit) {
204: /*
205: System.out.println("Output buffer exhausted with sequence length "+output_sequence_length+
206: " remaining");
207: */
208: return STOP_OUTPUT_EXHAUSTED;
209: } else { // not out of space, can step along
210: for (int i = 0; i < output_sequence_length; i++) {
211: outbuffer[outbufferpos + i] = outputChar[i];
212: }
213: outbufferpos += output_sequence_length;
214: inbufferpos += input_sequence_length;
215: totalbytesin += input_sequence_length;
216: input_sequence_length = 0;
217: }
218: } // end loop over this buffer of input
219: return STOP_INPUT_EXHAUSTED_EXACTLY;
220: }
221:
222: /*
223: * Return the character set id
224: */
225: public String getCharacterEncoding() {
226: return "UTF8";
227: }
228:
229: public int getMaxOutput(int inputsize) {
230: // possibilities are 1-1, 2-1, 3-1 or 4-2.
231: return inputsize;
232: }
233: }
|