001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.nio.ByteBuffer;
011: import java.nio.CharBuffer;
012: import java.nio.IntBuffer;
013: import java.nio.charset.CharsetDecoder;
014: import java.nio.charset.CharsetEncoder;
015: import java.nio.charset.CoderResult;
016:
017: import com.ibm.icu.text.UTF16;
018:
019: /**
020: * @author Niti Hantaweepant
021: */
022: class CharsetUTF16LE extends CharsetICU {
023:
024: protected byte[] fromUSubstitution = new byte[] { (byte) 0xfd,
025: (byte) 0xff };
026:
/**
 * Creates the UTF-16LE charset and records its converter size limits.
 *
 * @param icuCanonicalName  canonical ICU name, forwarded to the superclass
 * @param javaCanonicalName canonical Java name, forwarded to the superclass
 * @param aliases           alias names, forwarded to the superclass
 */
public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName,
        String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);

    /* byte limits per char: a single code unit is 2 bytes, the maximum is declared as 4 */
    minBytesPerChar = 2;
    maxBytesPerChar = 4;

    /* at most one char is produced per input byte */
    maxCharsPerByte = 1;
}
034:
035: class CharsetDecoderUTF16LE extends CharsetDecoderICU {
036:
037: public CharsetDecoderUTF16LE(CharsetICU cs) {
038: super (cs);
039: }
040:
041: protected CoderResult decodeLoop(ByteBuffer source,
042: CharBuffer target, IntBuffer offsets, boolean flush) {
043: CoderResult cr = CoderResult.UNDERFLOW;
044: if (!source.hasRemaining() && toUnicodeStatus == 0) {
045: /* no input, nothing to do */
046: return cr;
047: }
048: if (!target.hasRemaining()) {
049: return CoderResult.OVERFLOW;
050: }
051:
052: int sourceIndex = 0, count = 0, length, sourceArrayIndex;
053: char c = 0, trail;
054: length = source.remaining();
055: sourceArrayIndex = source.position();
056:
057: /* complete a partial UChar or pair from the last call */
058: if (toUnicodeStatus != 0) {
059: /*
060: * special case: single byte from a previous buffer,
061: * where the byte turned out not to belong to a trail surrogate
062: * and the preceding, unmatched lead surrogate was put into toUBytes[]
063: * for error handling
064: */
065: toUBytesArray[toUBytesBegin + 0] = (byte) toUnicodeStatus;
066: toULength = 1;
067: toUnicodeStatus = 0;
068: }
069: if ((count = toULength) != 0) {
070: byte[] pArray = toUBytesArray;
071: int pArrayIndex = toUBytesBegin;
072: do {
073: pArray[count++] = source.get(sourceArrayIndex++);
074: ++sourceIndex;
075: --length;
076: if (count == 2) {
077: c = (char) (((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 0] & UConverterConstants.UNSIGNED_BYTE_MASK));
078: if (!UTF16.isSurrogate(c)) {
079: /* output the BMP code point */
080: target.put(c);
081: if (offsets != null) {
082: offsets.put(-1);
083: }
084: count = 0;
085: c = 0;
086: break;
087: } else if (UTF16.isLeadSurrogate(c)) {
088: /* continue collecting bytes for the trail surrogate */
089: c = 0; /* avoid unnecessary surrogate handling below */
090: } else {
091: /* fall through to error handling for an unmatched trail surrogate */
092: break;
093: }
094: } else if (count == 4) {
095: c = (char) (((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 0] & UConverterConstants.UNSIGNED_BYTE_MASK));
096: trail = (char) (((pArray[pArrayIndex + 3] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK));
097: if (UTF16.isTrailSurrogate(trail)) {
098: /* output the surrogate pair */
099: target.put(c);
100: if (target.remaining() >= 1) {
101: target.put(trail);
102: if (offsets != null) {
103: offsets.put(-1);
104: offsets.put(-1);
105: }
106: } else /* targetCapacity==1 */{
107: charErrorBufferArray[charErrorBufferBegin + 0] = trail;
108: charErrorBufferLength = 1;
109: return CoderResult.OVERFLOW;
110: }
111: count = 0;
112: c = 0;
113: break;
114: } else {
115: /* unmatched lead surrogate, handle here for consistent toUBytes[] */
116:
117: /* back out reading the code unit after it */
118: if ((source.position() - sourceArrayIndex) >= 2) {
119: sourceArrayIndex -= 2;
120: } else {
121: /*
122: * if the trail unit's first byte was in a previous buffer, then
123: * we need to put it into a special place because toUBytes[] will be
124: * used for the lead unit's bytes
125: */
126: toUnicodeStatus = 0x100 | pArray[pArrayIndex + 2];
127: --sourceArrayIndex;
128: }
129: toULength = 2;
130: cr = CoderResult
131: .malformedForLength(sourceArrayIndex);
132: break;
133: }
134: }
135: } while (length > 0);
136: toULength = (byte) count;
137: }
138:
139: /* copy an even number of bytes for complete UChars */
140: count = 2 * target.remaining();
141: if (count > length) {
142: count = length & ~1;
143: }
144: if (c == 0 && count > 0) {
145: length -= count;
146: count >>= 1;
147: //targetCapacity-=count;
148: if (offsets == null) {
149: do {
150: c = (char) (((source.get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
151: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK));
152: sourceArrayIndex += 2;
153: if (!UTF16.isSurrogate(c)) {
154: target.put(c);
155: } else if (UTF16.isLeadSurrogate(c)
156: && count >= 2
157: && UTF16
158: .isTrailSurrogate(trail = (char) (((source
159: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
160: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
161: sourceArrayIndex += 2;
162: --count;
163: target.put(c);
164: target.put(trail);
165: } else {
166: break;
167: }
168: } while (--count > 0);
169: } else {
170: do {
171: c = (char) (((source.get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
172: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK));
173: sourceArrayIndex += 2;
174: if (!UTF16.isSurrogate(c)) {
175: target.put(c);
176: offsets.put(sourceIndex);
177: sourceIndex += 2;
178: } else if (UTF16.isLeadSurrogate(c)
179: && count >= 2
180: && UTF16
181: .isTrailSurrogate(trail = (char) (((source
182: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
183: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
184: sourceArrayIndex += 2;
185: --count;
186: target.put(c);
187: target.put(trail);
188: offsets.put(sourceIndex);
189: offsets.put(sourceIndex);
190: sourceIndex += 4;
191: } else {
192: break;
193: }
194: } while (--count > 0);
195: }
196:
197: if (count == 0) {
198: /* done with the loop for complete UChars */
199: c = 0;
200: } else {
201: /* keep c for surrogate handling, trail will be set there */
202: length += 2 * (count - 1); /* one more byte pair was consumed than count decremented */
203: }
204: }
205:
206: if (c != 0) {
207: /*
208: * c is a surrogate, and
209: * - source or target too short
210: * - or the surrogate is unmatched
211: */
212:
213: toUBytesArray[toUBytesBegin + 0] = (byte) c;
214: toUBytesArray[toUBytesBegin + 1] = (byte) (c >>> 8);
215: toULength = 2;
216:
217: if (UTF16.isLeadSurrogate(c)) {
218: if (length >= 2) {
219: if (UTF16
220: .isTrailSurrogate(trail = (char) (((source
221: .get(sourceArrayIndex + 1) & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (source
222: .get(sourceArrayIndex + 0) & UConverterConstants.UNSIGNED_BYTE_MASK)))) {
223: /* output the surrogate pair, will overflow (see conditions comment above) */
224: sourceArrayIndex += 2;
225: length -= 2;
226: target.put(c);
227: if (offsets != null) {
228: offsets.put(sourceIndex);
229: }
230: charErrorBufferArray[charErrorBufferBegin + 0] = trail;
231: charErrorBufferLength = 1;
232: toULength = 0;
233: cr = CoderResult.OVERFLOW;
234: } else {
235: /* unmatched lead surrogate */
236: cr = CoderResult
237: .malformedForLength(sourceArrayIndex);
238: }
239: } else {
240: /* see if the trail surrogate is in the next buffer */
241: }
242: } else {
243: /* unmatched trail surrogate */
244: cr = CoderResult
245: .malformedForLength(sourceArrayIndex);
246: }
247: }
248:
249: /* check for a remaining source byte */
250: if (!cr.isError()) {
251: if (length > 0) {
252: if (!target.hasRemaining()) {
253: cr = CoderResult.OVERFLOW;
254: } else {
255: /* it must be length==1 because otherwise the above would have copied more */
256: toUBytesArray[toULength++] = source
257: .get(sourceArrayIndex++);
258: }
259: }
260: }
261: source.position(sourceArrayIndex);
262:
263: return cr;
264: }
265:
266: }
267:
268: class CharsetEncoderUTF16LE extends CharsetEncoderICU {
269:
270: public CharsetEncoderUTF16LE(CharsetICU cs) {
271: super (cs, fromUSubstitution);
272: implReset();
273: }
274:
275: private final static int NEED_TO_WRITE_BOM = 1;
276:
277: protected void implReset() {
278: super .implReset();
279: fromUnicodeStatus = NEED_TO_WRITE_BOM;
280: }
281:
282: protected CoderResult encodeLoop(CharBuffer source,
283: ByteBuffer target, IntBuffer offsets, boolean flush) {
284: CoderResult cr = CoderResult.UNDERFLOW;
285: if (!source.hasRemaining()) {
286: /* no input, nothing to do */
287: return cr;
288: }
289: char c;
290: /* write the BOM if necessary */
291: if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
292: byte bom[] = { (byte) 0xff, (byte) 0xfe };
293: cr = fromUWriteBytes(this , bom, 0, bom.length, target,
294: offsets, -1);
295: if (cr.isError()) {
296: return cr;
297: }
298: fromUnicodeStatus = 0;
299: }
300:
301: if (!target.hasRemaining()) {
302: return CoderResult.OVERFLOW;
303: }
304:
305: int sourceIndex = 0;
306: char trail = 0;
307: int length = source.remaining();
308: int sourceArrayIndex = source.position();
309:
310: /* c!=0 indicates in several places outside the main loops that a surrogate was found */
311:
312: if ((c = (char) fromUChar32) != 0
313: && UTF16.isTrailSurrogate(trail = source
314: .get(sourceArrayIndex))
315: && target.remaining() >= 4) {
316: /* the last buffer ended with a lead surrogate, output the surrogate pair */
317: ++sourceArrayIndex;
318: --length;
319: target.put((byte) c);
320: target.put((byte) (c >>> 8));
321: target.put((byte) trail);
322: target.put((byte) (trail >>> 8));
323: if (offsets != null && offsets.remaining() >= 4) {
324: offsets.put(-1);
325: offsets.put(-1);
326: offsets.put(-1);
327: offsets.put(-1);
328: }
329: sourceIndex = 1;
330: fromUChar32 = c = 0;
331: }
332: byte overflow[/*4*/] = new byte[4];
333:
334: if (c == 0) {
335: /* copy an even number of bytes for complete UChars */
336: int count = 2 * length;
337: int targetCapacity = target.remaining();
338: if (count > targetCapacity) {
339: count = targetCapacity & ~1;
340: }
341: /* count is even */
342: targetCapacity -= count;
343: count >>= 1;
344: length -= count;
345:
346: if (offsets == null) {
347: while (count > 0) {
348: c = source.get(sourceArrayIndex++);
349: if (!UTF16.isSurrogate(c)) {
350: target.put((byte) c);
351: target.put((byte) (c >>> 8));
352:
353: } else if (UTF16.isLeadSurrogate(c)
354: && count >= 2
355: && UTF16
356: .isTrailSurrogate(trail = source
357: .get(sourceArrayIndex))) {
358: ++sourceArrayIndex;
359: --count;
360: target.put((byte) c);
361: target.put((byte) (c >>> 8));
362: target.put((byte) trail);
363: target.put((byte) (trail >>> 8));
364: } else {
365: break;
366: }
367: --count;
368: }
369: } else {
370: while (count > 0) {
371: c = source.get(sourceArrayIndex++);
372: if (!UTF16.isSurrogate(c)) {
373: target.put((byte) c);
374: target.put((byte) (c >>> 8));
375: offsets.put(sourceIndex);
376: offsets.put(sourceIndex++);
377: } else if (UTF16.isLeadSurrogate(c)
378: && count >= 2
379: && UTF16
380: .isTrailSurrogate(trail = source
381: .get(sourceArrayIndex))) {
382: ++sourceArrayIndex;
383: --count;
384: target.put((byte) c);
385: target.put((byte) (c >>> 8));
386: target.put((byte) trail);
387: target.put((byte) (trail >>> 8));
388: offsets.put(sourceIndex);
389: offsets.put(sourceIndex);
390: offsets.put(sourceIndex);
391: offsets.put(sourceIndex);
392: sourceIndex += 2;
393: } else {
394: break;
395: }
396: --count;
397: }
398: }
399:
400: if (count == 0) {
401: /* done with the loop for complete UChars */
402: if (length > 0 && targetCapacity > 0) {
403: /*
404: * there is more input and some target capacity -
405: * it must be targetCapacity==1 because otherwise
406: * the above would have copied more;
407: * prepare for overflow output
408: */
409: if (!UTF16.isSurrogate(c = source
410: .get(sourceArrayIndex++))) {
411: overflow[0] = (byte) c;
412: overflow[1] = (byte) (c >>> 8);
413: length = 2; /* 2 bytes to output */
414: c = 0;
415: /* } else { keep c for surrogate handling, length will be set there */
416: }
417: } else {
418: length = 0;
419: c = 0;
420: }
421: } else {
422: /* keep c for surrogate handling, length will be set there */
423: targetCapacity += 2 * count;
424: }
425: } else {
426: length = 0; /* from here on, length counts the bytes in overflow[] */
427: }
428:
429: if (c != 0) {
430: /*
431: * c is a surrogate, and
432: * - source or target too short
433: * - or the surrogate is unmatched
434: */
435: length = 0;
436: if (UTF16.isLeadSurrogate(c)) {
437: if (sourceArrayIndex < source.limit()) {
438: if (UTF16.isTrailSurrogate(trail = source
439: .get(sourceArrayIndex))) {
440: /* output the surrogate pair, will overflow (see conditions comment above) */
441: ++sourceArrayIndex;
442: overflow[0] = (byte) c;
443: overflow[1] = (byte) (c >>> 8);
444: overflow[2] = (byte) trail;
445: overflow[3] = (byte) (trail >>> 8);
446: length = 4; /* 4 bytes to output */
447: c = 0;
448: } else {
449: /* unmatched lead surrogate */
450: cr = CoderResult
451: .malformedForLength(sourceArrayIndex);
452: }
453: } else {
454: /* see if the trail surrogate is in the next buffer */
455: }
456: } else {
457: /* unmatched trail surrogate */
458: cr = CoderResult
459: .malformedForLength(sourceArrayIndex);
460: }
461: fromUChar32 = c;
462: }
463: source.position(sourceArrayIndex);
464: if (length > 0) {
465: /* output length bytes with overflow (length>targetCapacity>0) */
466: cr = fromUWriteBytes(this , overflow, 0, length, target,
467: offsets, sourceIndex);
468: }
469: return cr;
470: }
471: }
472:
473: public CharsetDecoder newDecoder() {
474: return new CharsetDecoderUTF16LE(this );
475: }
476:
477: public CharsetEncoder newEncoder() {
478: return new CharsetEncoderUTF16LE(this);
479: }
480:
481: }
|