001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.nio.ByteBuffer;
011: import java.nio.CharBuffer;
012: import java.nio.IntBuffer;
013: import java.nio.charset.CharsetDecoder;
014: import java.nio.charset.CharsetEncoder;
015: import java.nio.charset.CoderResult;
016:
017: import com.ibm.icu.text.UTF16;
018:
019: class CharsetUTF16 extends CharsetICU {
020:
021: protected byte[] fromUSubstitution = new byte[] { (byte) 0xff,
022: (byte) 0xfd };
023:
024: public CharsetUTF16(String icuCanonicalName,
025: String javaCanonicalName, String[] aliases) {
026: super (icuCanonicalName, javaCanonicalName, aliases);
027: maxBytesPerChar = 4;
028: minBytesPerChar = 2;
029: maxCharsPerByte = 1;
030: }
031:
032: class CharsetDecoderUTF16 extends CharsetDecoderICU {
033:
034: public CharsetDecoderUTF16(CharsetICU cs) {
035: super (cs);
036: }
037:
038: protected CoderResult decodeLoop(ByteBuffer source,
039: CharBuffer target, IntBuffer offsets, boolean flush) {
040: CoderResult cr = CoderResult.UNDERFLOW;
041: if (!source.hasRemaining() && toUnicodeStatus == 0) {
042: /* no input, nothing to do */
043: return cr;
044: }
045: if (!target.hasRemaining()) {
046: return CoderResult.OVERFLOW;
047: }
048:
049: int sourceIndex = 0, count = 0, length, sourceArrayIndex;
050: char c = 0, trail;
051: length = source.remaining();
052: sourceArrayIndex = source.position();
053:
054: /* complete a partial UChar or pair from the last call */
055: if (toUnicodeStatus != 0) {
056: /*
057: * special case: single byte from a previous buffer,
058: * where the byte turned out not to belong to a trail surrogate
059: * and the preceding, unmatched lead surrogate was put into toUBytes[]
060: * for error handling
061: */
062: toUBytesArray[toUBytesBegin + 0] = (byte) toUnicodeStatus;
063: toULength = 1;
064: toUnicodeStatus = 0;
065: }
066: if ((count = toULength) != 0) {
067: byte[] pArray = toUBytesArray;
068: int pArrayIndex = toUBytesBegin;
069: do {
070: pArray[count++] = source.get(sourceArrayIndex++);
071: ++sourceIndex;
072: --length;
073: if (count == 2) {
074: c = (char) (((pArray[pArrayIndex + 0] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
075: if (!UTF16.isSurrogate(c)) {
076: /* output the BMP code point */
077: target.put(c);
078: if (offsets != null) {
079: offsets.put(-1);
080: }
081: count = 0;
082: c = 0;
083: break;
084: } else if (UTF16.isLeadSurrogate(c)) {
085: /* continue collecting bytes for the trail surrogate */
086: c = 0; /* avoid unnecessary surrogate handling below */
087: } else {
088: /* fall through to error handling for an unmatched trail surrogate */
089: break;
090: }
091: } else if (count == 4) {
092: c = (char) (((pArray[pArrayIndex + 0] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
093: trail = (char) (((pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 3] & UConverterConstants.UNSIGNED_BYTE_MASK));
094: if (UTF16.isTrailSurrogate(trail)) {
095: /* output the surrogate pair */
096: target.put(c);
097: if (target.remaining() >= 1) {
098: target.put(trail);
099: if (offsets != null) {
100: offsets.put(-1);
101: offsets.put(-1);
102: }
103: } else /* targetCapacity==1 */{
104: charErrorBufferArray[charErrorBufferBegin + 0] = trail;
105: charErrorBufferLength = 1;
106: return CoderResult.OVERFLOW;
107: }
108: count = 0;
109: c = 0;
110: break;
111: } else {
112: /* unmatched lead surrogate, handle here for consistent toUBytes[] */
113:
114: /* back out reading the code unit after it */
115: if ((source.position() - sourceArrayIndex) >= 2) {
116: sourceArrayIndex -= 2;
117: } else {
118: /*
119: * if the trail unit's first byte was in a previous buffer, then
120: * we need to put it into a special place because toUBytes[] will be
121: * used for the lead unit's bytes
122: */
123: toUnicodeStatus = 0x100 | pArray[pArrayIndex + 2];
124: --sourceArrayIndex;
125: }
126: toULength = 2;
127: cr = CoderResult
128: .malformedForLength(sourceArrayIndex);
129: break;
130: }
131: }
132: } while (length > 0);
133: toULength = (byte) count;
134: }
135:
136: /* copy an even number of bytes for complete UChars */
137: count = 2 * target.remaining();
138: if (count > length) {
139: count = length & ~1;
140: }
141: if (c == 0 && count > 0) {
142: length -= count;
143: count >>= 1;
144: //targetCapacity-=count;
145: if (offsets == null) {
146: do {
147: c = (char) (((source.get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
148: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK));
149: sourceArrayIndex += 2;
150: if (!UTF16.isSurrogate(c)) {
151: target.put(c);
152: } else if (UTF16.isLeadSurrogate(c)
153: && count >= 2
154: && UTF16
155: .isTrailSurrogate(trail = (char) (((source
156: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
157: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
158: sourceArrayIndex += 2;
159: --count;
160: target.put(c);
161: target.put(trail);
162: } else {
163: break;
164: }
165: } while (--count > 0);
166: } else {
167: do {
168: c = (char) (((source.get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
169: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK));
170: sourceArrayIndex += 2;
171: if (!UTF16.isSurrogate(c)) {
172: target.put(c);
173: offsets.put(sourceIndex);
174: sourceIndex += 2;
175: } else if (UTF16.isLeadSurrogate(c)
176: && count >= 2
177: && UTF16
178: .isTrailSurrogate(trail = (char) (((source
179: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
180: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
181: sourceArrayIndex += 2;
182: --count;
183: target.put(c);
184: target.put(trail);
185: offsets.put(sourceIndex);
186: offsets.put(sourceIndex);
187: sourceIndex += 4;
188: } else {
189: break;
190: }
191: } while (--count > 0);
192: }
193:
194: if (count == 0) {
195: /* done with the loop for complete UChars */
196: c = 0;
197: } else {
198: /* keep c for surrogate handling, trail will be set there */
199: length += 2 * (count - 1); /* one more byte pair was consumed than count decremented */
200: }
201: }
202:
203: if (c != 0) {
204: /*
205: * c is a surrogate, and
206: * - source or target too short
207: * - or the surrogate is unmatched
208: */
209: toUBytesArray[toUBytesBegin + 0] = (byte) (c >>> 8);
210: toUBytesArray[toUBytesBegin + 1] = (byte) c;
211: toULength = 2;
212:
213: if (UTF16.isLeadSurrogate(c)) {
214: if (length >= 2) {
215: if (UTF16
216: .isTrailSurrogate(trail = (char) (((source
217: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
218: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
219: /* output the surrogate pair, will overflow (see conditions comment above) */
220: sourceArrayIndex += 2;
221: length -= 2;
222: target.put(c);
223: if (offsets != null) {
224: offsets.put(sourceIndex);
225: }
226: charErrorBufferArray[charErrorBufferBegin + 0] = trail;
227: charErrorBufferLength = 1;
228: toULength = 0;
229: cr = CoderResult.OVERFLOW;
230: } else {
231: /* unmatched lead surrogate */
232: cr = CoderResult
233: .malformedForLength(sourceArrayIndex);
234: }
235: } else {
236: /* see if the trail surrogate is in the next buffer */
237: }
238: } else {
239: /* unmatched trail surrogate */
240: cr = CoderResult
241: .malformedForLength(sourceArrayIndex);
242: }
243: }
244:
245: /* check for a remaining source byte */
246: if (!cr.isError()) {
247: if (length > 0) {
248: if (!target.hasRemaining()) {
249: cr = CoderResult.OVERFLOW;
250: } else {
251: /* it must be length==1 because otherwise the above would have copied more */
252: toUBytesArray[toULength++] = source
253: .get(sourceArrayIndex++);
254: }
255: }
256: }
257: source.position(sourceArrayIndex);
258:
259: return cr;
260: }
261:
262: }
263:
264: class CharsetEncoderUTF16 extends CharsetEncoderICU {
265:
266: public CharsetEncoderUTF16(CharsetICU cs) {
267: super (cs, fromUSubstitution);
268: implReset();
269: }
270:
271: private final static int NEED_TO_WRITE_BOM = 1;
272:
273: protected void implReset() {
274: super .implReset();
275: fromUnicodeStatus = NEED_TO_WRITE_BOM;
276: }
277:
278: protected CoderResult encodeLoop(CharBuffer source,
279: ByteBuffer target, IntBuffer offsets, boolean flush) {
280: CoderResult cr = CoderResult.UNDERFLOW;
281: if (!source.hasRemaining()) {
282: /* no input, nothing to do */
283: return cr;
284: }
285: char c;
286: /* write the BOM if necessary */
287: if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
288: byte bom[] = { (byte) 0xfe, (byte) 0xff };
289: cr = fromUWriteBytes(this , bom, 0, bom.length, target,
290: offsets, -1);
291: if (cr.isError()) {
292: return cr;
293: }
294: fromUnicodeStatus = 0;
295: }
296:
297: if (!target.hasRemaining()) {
298: return CoderResult.OVERFLOW;
299: }
300:
301: int sourceIndex = 0;
302: char trail = 0;
303: int length = source.remaining();
304: int sourceArrayIndex = source.position();
305:
306: /* c!=0 indicates in several places outside the main loops that a surrogate was found */
307:
308: if ((c = (char) fromUChar32) != 0
309: && UTF16.isTrailSurrogate(trail = source
310: .get(sourceArrayIndex))
311: && target.remaining() >= 4) {
312: /* the last buffer ended with a lead surrogate, output the surrogate pair */
313: ++sourceArrayIndex;
314: --length;
315: target.put((byte) (c >>> 8));
316: target.put((byte) c);
317: target.put((byte) (trail >>> 8));
318: target.put((byte) trail);
319: if (offsets != null && offsets.remaining() >= 4) {
320: offsets.put(-1);
321: offsets.put(-1);
322: offsets.put(-1);
323: offsets.put(-1);
324: }
325: sourceIndex = 1;
326: fromUChar32 = c = 0;
327: }
328: byte overflow[/*4*/] = new byte[4];
329:
330: if (c == 0) {
331: /* copy an even number of bytes for complete UChars */
332: int count = 2 * length;
333: int targetCapacity = target.remaining();
334: if (count > targetCapacity) {
335: count = targetCapacity & ~1;
336: }
337: /* count is even */
338: targetCapacity -= count;
339: count >>= 1;
340: length -= count;
341:
342: if (offsets == null) {
343: while (count > 0) {
344: c = source.get(sourceArrayIndex++);
345: if (!UTF16.isSurrogate(c)) {
346: target.put((byte) (c >>> 8));
347: target.put((byte) c);
348:
349: } else if (UTF16.isLeadSurrogate(c)
350: && count >= 2
351: && UTF16
352: .isTrailSurrogate(trail = source
353: .get(sourceArrayIndex))) {
354: ++sourceArrayIndex;
355: --count;
356: target.put((byte) (c >>> 8));
357: target.put((byte) c);
358: target.put((byte) (trail >>> 8));
359: target.put((byte) trail);
360: } else {
361: break;
362: }
363: --count;
364: }
365: } else {
366: while (count > 0) {
367: c = source.get(sourceArrayIndex++);
368: if (!UTF16.isSurrogate(c)) {
369: target.put((byte) (c >>> 8));
370: target.put((byte) c);
371: offsets.put(sourceIndex);
372: offsets.put(sourceIndex++);
373: } else if (UTF16.isLeadSurrogate(c)
374: && count >= 2
375: && UTF16
376: .isTrailSurrogate(trail = source
377: .get(sourceArrayIndex))) {
378: ++sourceArrayIndex;
379: --count;
380: target.put((byte) (c >>> 8));
381: target.put((byte) c);
382: target.put((byte) (trail >>> 8));
383: target.put((byte) trail);
384: offsets.put(sourceIndex);
385: offsets.put(sourceIndex);
386: offsets.put(sourceIndex);
387: offsets.put(sourceIndex);
388: sourceIndex += 2;
389: } else {
390: break;
391: }
392: --count;
393: }
394: }
395:
396: if (count == 0) {
397: /* done with the loop for complete UChars */
398: if (length > 0 && targetCapacity > 0) {
399: /*
400: * there is more input and some target capacity -
401: * it must be targetCapacity==1 because otherwise
402: * the above would have copied more;
403: * prepare for overflow output
404: */
405: if (!UTF16.isSurrogate(c = source
406: .get(sourceArrayIndex++))) {
407: overflow[0] = (byte) (c >>> 8);
408: overflow[1] = (byte) c;
409: length = 2; /* 2 bytes to output */
410: c = 0;
411: /* } else { keep c for surrogate handling, length will be set there */
412: }
413: } else {
414: length = 0;
415: c = 0;
416: }
417: } else {
418: /* keep c for surrogate handling, length will be set there */
419: targetCapacity += 2 * count;
420: }
421: } else {
422: length = 0; /* from here on, length counts the bytes in overflow[] */
423: }
424:
425: if (c != 0) {
426: /*
427: * c is a surrogate, and
428: * - source or target too short
429: * - or the surrogate is unmatched
430: */
431: length = 0;
432: if (UTF16.isLeadSurrogate(c)) {
433: if (sourceArrayIndex < source.limit()) {
434: if (UTF16.isTrailSurrogate(trail = source
435: .get(sourceArrayIndex))) {
436: /* output the surrogate pair, will overflow (see conditions comment above) */
437: ++sourceArrayIndex;
438: overflow[0] = (byte) (c >>> 8);
439: overflow[1] = (byte) c;
440: overflow[2] = (byte) (trail >>> 8);
441: overflow[3] = (byte) trail;
442: length = 4; /* 4 bytes to output */
443: c = 0;
444: } else {
445: /* unmatched lead surrogate */
446: cr = CoderResult
447: .malformedForLength(sourceArrayIndex);
448: }
449: } else {
450: /* see if the trail surrogate is in the next buffer */
451: }
452: } else {
453: /* unmatched trail surrogate */
454: cr = CoderResult
455: .malformedForLength(sourceArrayIndex);
456: }
457: fromUChar32 = c;
458: }
459: source.position(sourceArrayIndex);
460: if (length > 0) {
461: /* output length bytes with overflow (length>targetCapacity>0) */
462: cr = fromUWriteBytes(this , overflow, 0, length, target,
463: offsets, sourceIndex);
464: }
465: return cr;
466: }
467: }
468:
469: public CharsetDecoder newDecoder() {
470: return new CharsetDecoderUTF16(this );
471: }
472:
473: public CharsetEncoder newEncoder() {
474: return new CharsetEncoderUTF16(this);
475: }
476:
477: }
|