001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.nio.ByteBuffer;
011: import java.nio.CharBuffer;
012: import java.nio.IntBuffer;
013: import java.nio.charset.CharsetDecoder;
014: import java.nio.charset.CharsetEncoder;
015: import java.nio.charset.CoderResult;
016:
017: import com.ibm.icu.lang.UCharacter;
018: import com.ibm.icu.text.UTF16;
019:
020: /**
021: * @author Niti Hantaweepant
022: */
023: class CharsetUTF8 extends CharsetICU {
024:
025: protected byte[] fromUSubstitution = new byte[] { (byte) 0xef,
026: (byte) 0xbf, (byte) 0xbd };
027:
/**
 * Builds the UTF-8 charset and records its size limits.
 *
 * @param icuCanonicalName  forwarded unchanged to the superclass constructor
 * @param javaCanonicalName forwarded unchanged to the superclass constructor
 * @param aliases           forwarded unchanged to the superclass constructor
 */
public CharsetUTF8(String icuCanonicalName, String javaCanonicalName,
        String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);

    /* size limits: 1..4 bytes per char, at most one char per byte */
    minBytesPerChar = 1;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
035:
036: /* UTF-8 Conversion DATA
037: * for more information see Unicode Strandard 2.0 , Transformation Formats Appendix A-9
038: */
039: private static final long OFFSETS_FROM_UTF8[] = { 0, 0x00000000L,
040: 0x00003080L, 0x000E2080L, 0x03C82080L, 0xFA082080L,
041: 0x82082080L };
042:
043: private static final byte BYTES_FROM_UTF8[] = { 1, 1, 1, 1, 1, 1,
044: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
045: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
046: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
047: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
048: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
049: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
050: 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
051: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
052: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
053: 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
054: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
055: 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
056: 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
057:
058: /*
059: * Starting with Unicode 3.0.1:
060: * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
061: * byte sequences with more than 4 bytes are illegal in UTF-8,
062: * which is tested with impossible values for them
063: */
064: private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L,
065: 0x800L, 0x10000L, 0xffffffffL, 0xffffffffL };
066:
067: class CharsetDecoderUTF8 extends CharsetDecoderICU {
068:
069: public CharsetDecoderUTF8(CharsetICU cs) {
070: super (cs);
071: }
072:
073: protected CoderResult decodeLoop(ByteBuffer source,
074: CharBuffer target, IntBuffer offsets, boolean flush) {
075: CoderResult cr = CoderResult.UNDERFLOW;
076:
077: int sourceArrayIndex = source.position();
078:
079: // Todo: CESU8 implementation
080: // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
081: boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
082: int ch, ch2 = 0;
083: int i, inBytes;
084:
085: donefornow: {
086: if (toUnicodeStatus != 0 && target.hasRemaining()) {
087: inBytes = mode; /* restore # of bytes to consume */
088: i = toULength; /* restore # of bytes consumed */
089:
090: ch = toUnicodeStatus; /*Stores the previously calculated ch from a previous call*/
091: toUnicodeStatus = 0;
092:
093: while (i < inBytes) {
094: if (sourceArrayIndex < source.limit()) {
095: toUBytesArray[i] = (byte) (ch2 = source
096: .get(sourceArrayIndex)
097: & UConverterConstants.UNSIGNED_BYTE_MASK);
098: if (!isTrail((byte) ch2)) {
099: break; /* i < inBytes */
100: }
101: ch = (ch << 6) + ch2;
102: ++sourceArrayIndex;
103: i++;
104: } else {
105: /* stores a partially calculated target*/
106: toUnicodeStatus = ch;
107: mode = inBytes;
108: toULength = (byte) i;
109: break donefornow;
110: }
111: }
112:
113: /* Remove the accumulated high bits */
114: ch -= OFFSETS_FROM_UTF8[inBytes];
115:
116: /*
117: * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
118: * - use only trail bytes after a lead byte (checked above)
119: * - use the right number of trail bytes for a given lead byte
120: * - encode a code point <= U+10ffff
121: * - use the fewest possible number of bytes for their code points
122: * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
123: *
124: * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
125: * There are no irregular sequences any more.
126: * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
127: */
128: if (i == inBytes
129: && ch <= UConverterSharedData.MAXIMUM_UTF
130: && ch >= UTF8_MIN_CHAR32[i]
131: && (isCESU8 ? i <= 3 : !UTF16
132: .isSurrogate((char) ch))) {
133: /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
134: toULength = 0;
135: if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
136: /* fits in 16 bits */
137: target.put((char) ch);
138: } else {
139: /* write out the surrogates */
140: ch -= UConverterSharedData.HALF_BASE;
141: target
142: .put((char) ((ch >> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
143: ch = (ch & UConverterSharedData.HALF_MASK)
144: + UConverterSharedData.SURROGATE_LOW_START;
145: if (target.hasRemaining()) {
146: target.put((char) ch);
147:
148: } else /* targetCapacity==1 */{
149: charErrorBufferArray[charErrorBufferBegin + 0] = (char) ch;
150: charErrorBufferLength = 1;
151: cr = CoderResult.OVERFLOW;
152:
153: }
154: }
155: } else {
156: toULength = (byte) i;
157: cr = CoderResult
158: .malformedForLength(sourceArrayIndex);
159: break donefornow;
160: }
161: }
162:
163: while (sourceArrayIndex < source.limit()
164: && target.hasRemaining()) {
165: ch = source.get(sourceArrayIndex++)
166: & UConverterConstants.UNSIGNED_BYTE_MASK;
167: if (ch < 0x80) /* Simple case */
168: {
169: target.put((char) ch);
170: } else {
171: /* store the first char */
172: toUBytesArray[0] = (byte) ch;
173: inBytes = BYTES_FROM_UTF8[(int) ch]; /* lookup current sequence length */
174: i = 1;
175:
176: while (i < inBytes) {
177: if (sourceArrayIndex < source.limit()) {
178: toUBytesArray[i] = (byte) (ch2 = source
179: .get(sourceArrayIndex)
180: & UConverterConstants.UNSIGNED_BYTE_MASK);
181: if (!isTrail((byte) ch2)) {
182: break; /* i < inBytes */
183: }
184: ch = (ch << 6) + ch2;
185: ++sourceArrayIndex;
186: i++;
187: } else {
188: /* stores a partially calculated target*/
189: toUnicodeStatus = ch;
190: mode = inBytes;
191: toULength = (byte) i;
192: break donefornow;
193: }
194: }
195:
196: /* Remove the accumulated high bits */
197: ch -= OFFSETS_FROM_UTF8[inBytes];
198:
199: /*
200: * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
201: * - use only trail bytes after a lead byte (checked above)
202: * - use the right number of trail bytes for a given lead byte
203: * - encode a code point <= U+10ffff
204: * - use the fewest possible number of bytes for their code points
205: * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
206: *
207: * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
208: * There are no irregular sequences any more.
209: * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
210: */
211: if (i == inBytes
212: && ch <= UConverterSharedData.MAXIMUM_UTF
213: && ch >= UTF8_MIN_CHAR32[i]
214: && (isCESU8 ? i <= 3 : !UTF16
215: .isSurrogate((char) ch))) {
216: /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
217: toULength = 0;
218: if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
219: /* fits in 16 bits */
220: target.put((char) ch);
221: } else {
222: /* write out the surrogates */
223: ch -= UConverterSharedData.HALF_BASE;
224: target
225: .put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
226: ch = (ch & UConverterSharedData.HALF_MASK)
227: + UConverterSharedData.SURROGATE_LOW_START;
228: if (target.hasRemaining()) {
229: target.put((char) ch);
230: } else {
231: /* Put in overflow buffer (not handled here) */
232: charErrorBufferArray[charErrorBufferBegin + 0] = (char) ch;
233: charErrorBufferLength = 1;
234: cr = CoderResult.OVERFLOW;
235: break;
236: }
237: }
238: } else {
239: toULength = (byte) i;
240: cr = CoderResult
241: .malformedForLength(sourceArrayIndex);
242: break;
243: }
244: }
245: }
246: }
247:
248: if (sourceArrayIndex < source.limit()
249: && !target.hasRemaining()) {
250: /* End of target buffer */
251: cr = CoderResult.OVERFLOW;
252: }
253:
254: source.position(sourceArrayIndex);
255:
256: return cr;
257: }
258:
259: }
260:
261: class CharsetEncoderUTF8 extends CharsetEncoderICU {
262:
263: public CharsetEncoderUTF8(CharsetICU cs) {
264: super (cs, fromUSubstitution);
265: implReset();
266: }
267:
268: protected void implReset() {
269: super .implReset();
270: }
271:
272: protected CoderResult encodeLoop(CharBuffer source,
273: ByteBuffer target, IntBuffer offsets, boolean flush) {
274: CoderResult cr = CoderResult.UNDERFLOW;
275:
276: int sourceArrayIndex = source.position();
277:
278: // Todo: CESU8 implementation
279: // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
280: boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
281:
282: int ch;
283: short indexToWrite;
284: byte temp[] = new byte[4];
285: boolean doloop = true;
286:
287: if (fromUChar32 != 0 && target.hasRemaining()) {
288: ch = fromUChar32;
289: fromUChar32 = 0;
290:
291: if (sourceArrayIndex < source.limit()) {
292: /* test the following code unit */
293: char trail = source.get(sourceArrayIndex);
294: if (UTF16.isTrailSurrogate(trail)) {
295: ++sourceArrayIndex;
296: ch = UCharacter.getCodePoint((char) ch, trail);
297: /* convert this supplementary code point */
298: /* exit this condition tree */
299: } else {
300: /* this is an unmatched lead code unit (1st surrogate) */
301: /* callback(illegal) */
302: fromUChar32 = (int) ch;
303: cr = CoderResult
304: .malformedForLength(sourceArrayIndex);
305: doloop = false;
306: }
307: } else {
308: /* no more input */
309: fromUChar32 = (int) ch;
310: doloop = false;
311: }
312:
313: if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
314: indexToWrite = 2;
315: temp[2] = (byte) ((ch >>> 12) | 0xe0);
316: } else {
317: indexToWrite = 3;
318: temp[3] = (byte) ((ch >>> 18) | 0xf0);
319: temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
320: }
321: temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
322: temp[0] = (byte) ((ch & 0x3f) | 0x80);
323:
324: for (; indexToWrite >= 0; indexToWrite--) {
325: if (target.hasRemaining()) {
326: target.put(temp[indexToWrite]);
327: } else {
328: errorBuffer[errorBufferLength++] = temp[indexToWrite];
329: cr = CoderResult.OVERFLOW;
330: }
331: }
332: }
333:
334: if (doloop) {
335: while (sourceArrayIndex < source.limit()
336: && target.hasRemaining()) {
337: ch = source.get(sourceArrayIndex++);
338: if (ch < 0x80) { /* Single byte */
339: target.put((byte) ch);
340: } else if (ch < 0x800) { /* Double byte */
341: target.put((byte) ((ch >>> 6) | 0xc0));
342: if (target.hasRemaining()) {
343: target.put((byte) ((ch & 0x3f) | 0x80));
344: } else {
345: errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80);
346: errorBufferLength = 1;
347: cr = CoderResult.OVERFLOW;
348: break;
349: }
350: } else { /* Check for surrogates */
351: if (UTF16.isSurrogate((char) ch) && !isCESU8) {
352: if (UTF16.isLeadSurrogate((char) ch)) {
353:
354: if (sourceArrayIndex < source.limit()) {
355: /* test the following code unit */
356: char trail = source
357: .get(sourceArrayIndex);
358: if (UTF16.isTrailSurrogate(trail)) {
359: ++sourceArrayIndex;
360: ch = UCharacter.getCodePoint(
361: (char) ch, trail);
362: //ch2 = 0;
363: /* convert this supplementary code point */
364: /* exit this condition tree */
365: } else {
366: /* this is an unmatched lead code unit (1st surrogate) */
367: /* callback(illegal) */
368: fromUChar32 = ch;
369: cr = CoderResult
370: .malformedForLength(sourceArrayIndex);
371: break;
372: }
373: } else {
374: /* no more input */
375: fromUChar32 = ch;
376: break;
377: }
378: } else {
379: fromUChar32 = (int) ch;
380: cr = CoderResult
381: .malformedForLength(sourceArrayIndex);
382: break;
383: }
384: }
385:
386: if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
387: indexToWrite = 2;
388: temp[2] = (byte) ((ch >>> 12) | 0xe0);
389: } else {
390: indexToWrite = 3;
391: temp[3] = (byte) ((ch >>> 18) | 0xf0);
392: temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
393: }
394: temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
395: temp[0] = (byte) ((ch & 0x3f) | 0x80);
396:
397: for (; indexToWrite >= 0; indexToWrite--) {
398: if (target.hasRemaining()) {
399: target.put(temp[indexToWrite]);
400: } else {
401: errorBuffer[errorBufferLength++] = temp[indexToWrite];
402: cr = CoderResult.OVERFLOW;
403: }
404: }
405: }
406: }
407: }
408:
409: if (sourceArrayIndex < source.limit()
410: && !target.hasRemaining()) {
411: cr = CoderResult.OVERFLOW;
412: }
413:
414: source.position(sourceArrayIndex);
415:
416: return cr;
417: }
418: }
419:
420: /* single-code point definitions -------------------------------------------- */
421:
422: /*
423: * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
424: * @param c 8-bit code unit (byte)
425: * @return TRUE or FALSE
426: * @draft ICU 3.6
427: */
428: //static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
429: /*
430: * Is this code unit (byte) a UTF-8 lead byte?
431: * @param c 8-bit code unit (byte)
432: * @return TRUE or FALSE
433: * @draft ICU 3.6
434: */
435: //static final boolean isLead(byte c) {return ((((c)-0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
436: /**
437: * Is this code unit (byte) a UTF-8 trail byte?
438: * @param c 8-bit code unit (byte)
439: * @return TRUE or FALSE
440: * @draft ICU 3.6
441: */
442: static final boolean isTrail(byte c) {
443: return (((c) & 0xc0) == 0x80);
444: }
445:
446: /*
447: * How many code units (bytes) are used for the UTF-8 encoding
448: * of this Unicode code point?
449: * @param c 32-bit code point
450: * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
451: * @draft ICU 3.6
452: */
453: /*static final int length(int c)
454: {
455: long uc = c & UConverterConstants.UNSIGNED_INT_MASK;
456: return
457: (uc<=0x7f ? 1 :
458: (uc<=0x7ff ? 2 :
459: (uc<=0xd7ff ? 3 :
460: (uc<=0xdfff || uc>0x10ffff ? 0 :
461: (uc<=0xffff ? 3 : 4)
462: )
463: )
464: )
465: );
466: }*/
467:
468: public CharsetDecoder newDecoder() {
469: return new CharsetDecoderUTF8(this );
470: }
471:
472: public CharsetEncoder newEncoder() {
473: return new CharsetEncoderUTF8(this);
474: }
475: }
|