001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.nio.CharBuffer;
011: import java.nio.IntBuffer;
012: import java.nio.charset.CharsetDecoder;
013: import java.nio.charset.CoderResult;
014: import java.nio.charset.CodingErrorAction;
015: import java.nio.ByteBuffer;
016:
017: import com.ibm.icu.charset.CharsetCallback;
018: import com.ibm.icu.impl.Assert;
019:
020: /**
021: * An abstract class that provides framework methods of decoding operations for concrete
022: * subclasses.
 * In the future this class will contain API that implements the converter semantics of ICU4C.
024: * @draft ICU 3.6
025: * @provisional This API might change or be removed in a future release.
026: */
027:
028: public abstract class CharsetDecoderICU extends CharsetDecoder {
029:
/* converter state word; this class only clears it to 0 in implReset() */
int toUnicodeStatus;
/*
 * bytes of the input sequence currently being processed; when an error is
 * handled, the first toULength entries are copied into invalidCharBuffer
 */
byte[] toUBytesArray = new byte[128];
int toUBytesBegin = 0;
/* number of bytes currently held in toUBytesArray */
int toULength;
/*
 * output chars that did not fit into the target buffer; filled by
 * toUWriteUChars() and drained at the start of decode()
 */
char[] charErrorBufferArray = new char[128];
/* number of chars currently stored in charErrorBufferArray */
int charErrorBufferLength;
int charErrorBufferBegin;
/* the offending input (one byte per char) handed to the error callback */
char[] invalidCharBuffer = new char[128];
/* number of entries of invalidCharBuffer that are in use */
int invalidCharLength;

/* store previous UChars/chars to continue partial matches */
/* NOTE(review): not allocated anywhere in this class - presumably set up by subclasses; confirm */
byte[] preToUArray;
int preToUBegin;
int preToULength; /* negative: replay */
int preToUFirstLength; /* length of first character */
/* only cleared to 0 in implReset() within this class */
int mode;

/* opaque context object passed through to the error callback */
Object toUContext = null;
048: private CharsetCallback.Decoder onUnmappableInput = CharsetCallback.TO_U_CALLBACK_STOP;
049: private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
050: CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
051: public CoderResult call(CharsetDecoderICU decoder,
052: Object context, ByteBuffer source, CharBuffer target,
053: IntBuffer offsets, char[] buffer, int length,
054: CoderResult cr) {
055: if (cr.isUnmappable()) {
056: return onUnmappableInput.call(decoder, context, source,
057: target, offsets, buffer, length, cr);
058: } else if (cr.isMalformed()) {
059: return onMalformedInput.call(decoder, context, source,
060: target, offsets, buffer, length, cr);
061: }
062: return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder,
063: context, source, target, offsets, buffer, length,
064: cr);
065: }
066: };
067:
/**
 * Constructs a CharsetDecoderICU from the information provided by a
 * CharsetICU object. The average chars-per-byte value passed to the
 * superclass is the reciprocal of <code>cs.maxCharsPerByte</code>.
 * @param cs The CharsetICU object describing the charset to decode.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
CharsetDecoderICU(CharsetICU cs) {
    super(cs, 1.0f / (float) cs.maxCharsPerByte, cs.maxCharsPerByte);
}
080:
081: /**
082: * Sets the action to be taken if an illegal sequence is encountered
083: * @param newAction action to be taken
084: * @exception IllegalArgumentException
085: * @stable ICU 3.6
086: */
087: protected final void implOnMalformedInput(
088: CodingErrorAction newAction) {
089: onMalformedInput = getCallback(newAction);
090: }
091:
092: /**
093: * Sets the action to be taken if an illegal sequence is encountered
094: * @param newAction action to be taken
095: * @exception IllegalArgumentException
096: * @stable ICU 3.6
097: */
098: protected final void implOnUnmappableCharacter(
099: CodingErrorAction newAction) {
100: onUnmappableInput = getCallback(newAction);
101: }
102:
103: private static CharsetCallback.Decoder getCallback(
104: CodingErrorAction action) {
105: if (action == CodingErrorAction.REPLACE) {
106: return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
107: } else if (action == CodingErrorAction.IGNORE) {
108: return CharsetCallback.TO_U_CALLBACK_SKIP;
109: } else if (action == CodingErrorAction.REPORT) {
110: return CharsetCallback.TO_U_CALLBACK_STOP;
111: }
112: return CharsetCallback.TO_U_CALLBACK_STOP;
113: }
114:
115: private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
116:
117: /**
118: * Flushes any characters saved in the converter's internal buffer and
119: * resets the converter.
120: * @param out action to be taken
121: * @return result of flushing action and completes the decoding all input.
122: * Returns CoderResult.UNDERFLOW if the action succeeds.
123: * @stable ICU 3.6
124: */
125: protected final CoderResult implFlush(CharBuffer out) {
126: return decode(EMPTY, out, null, true);
127: }
128:
129: /**
130: * Resets the to Unicode mode of converter
131: * @stable ICU 3.6
132: */
133: protected void implReset() {
134: toUnicodeStatus = 0;
135: toULength = 0;
136: charErrorBufferLength = 0;
137: charErrorBufferBegin = 0;
138:
139: /* store previous UChars/chars to continue partial matches */
140: preToUBegin = 0;
141: preToULength = 0; /* negative: replay */
142: preToUFirstLength = 0;
143:
144: mode = 0;
145: }
146:
147: /**
148: * Decodes one or more bytes. The default behaviour of the converter
149: * is stop and report if an error in input stream is encountered.
150: * To set different behaviour use @see CharsetDecoder.onMalformedInput()
151: * This method allows a buffer by buffer conversion of a data stream.
152: * The state of the conversion is saved between calls to convert.
153: * Among other things, this means multibyte input sequences can be
154: * split between calls. If a call to convert results in an Error, the
155: * conversion may be continued by calling convert again with suitably
156: * modified parameters.All conversions should be finished with a call to
157: * the flush method.
158: * @param in buffer to decode
159: * @param out buffer to populate with decoded result
160: * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
161: * action succeeds or more input is needed for completing the decoding action.
162: * @stable ICU 3.6
163: */
164: protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
165: if (!in.hasRemaining()) {
166: return CoderResult.UNDERFLOW;
167: }
168: in.position(in.position() + toUCountPending());
169: /* do the conversion */
170: CoderResult ret = decode(in, out, null, false);
171:
172: setSourcePosition(in);
173: return ret;
174: }
175:
/**
 * Implements the ICU semantic for the decode operation. Concrete converters
 * supply the actual byte-to-char conversion here; it is invoked repeatedly
 * by toUnicodeWithCallback(), which inspects toULength and preToULength
 * after each call.
 * @param in The input byte buffer
 * @param out The output character buffer
 * @param offsets Buffer for source offsets of the output chars; may be null
 *        (decode() is called with null offsets by this class)
 * @param flush true if, and only if, the invoker can provide no
 *        additional input bytes beyond those in the given buffer.
 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
 *         action succeeds or more input is needed for completing the decoding action.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out,
IntBuffer offsets, boolean flush);
187:
188: /**
189: * Implements the ICU semantic for decode operation
190: * @param source The input byte buffer
191: * @param target The output character buffer
192: * @param offsets
193: * @param flush true if, and only if, the invoker can provide no
194: * additional input bytes beyond those in the given buffer.
195: * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
196: * action succeeds or more input is needed for completing the decoding action.
197: * @draft ICU 3.6
198: * @provisional This API might change or be removed in a future release.
199: */
200: final CoderResult decode(ByteBuffer source, CharBuffer target,
201: IntBuffer offsets, boolean flush) {
202:
203: /* check parameters */
204: if (target == null || source == null) {
205: throw new IllegalArgumentException();
206: }
207: /*
208: * Make sure that the buffer sizes do not exceed the number range for
209: * int32_t because some functions use the size (in units or bytes)
210: * rather than comparing pointers, and because offsets are int32_t values.
211: *
212: * size_t is guaranteed to be unsigned and large enough for the job.
213: *
214: * Return with an error instead of adjusting the limits because we would
215: * not be able to maintain the semantics that either the source must be
216: * consumed or the target filled (unless an error occurs).
217: * An adjustment would be sourceLimit=t+0x7fffffff; for example.
218: */
219: /*agljport:fix
220: if(
221: ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
222: ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
223: ) {
224: *err=U_ILLEGAL_ARGUMENT_ERROR;
225: return;
226: }
227: */
228:
229: /* flush the target overflow buffer */
230: if (charErrorBufferLength > 0) {
231: char[] overflow = null;
232: int i, length;
233:
234: overflow = charErrorBufferArray;
235: length = charErrorBufferLength;
236: i = 0;
237: do {
238: if (target.remaining() <= 0) {
239: /* the overflow buffer contains too much, keep the rest */
240: int j = 0;
241:
242: do {
243: overflow[j++] = overflow[i++];
244: } while (i < length);
245:
246: charErrorBufferLength = (byte) j;
247: return CoderResult.OVERFLOW;
248: }
249:
250: /* copy the overflow contents to the target */
251: target.put(overflow[i++]);
252: if (offsets != null) {
253: offsets.put(-1); /* no source index available for old output */
254: }
255: } while (i < length);
256:
257: /* the overflow buffer is completely copied to the target */
258: charErrorBufferLength = 0;
259: }
260:
261: if (!flush && source.remaining() == 0 && preToULength >= 0) {
262: /* the overflow buffer is emptied and there is no new input: we are done */
263: return CoderResult.UNDERFLOW;
264: }
265:
266: /*
267: * Do not simply return with a buffer overflow error if
268: * !flush && t==targetLimit
269: * because it is possible that the source will not generate any output.
270: * For example, the skip callback may be called;
271: * it does not output anything.
272: */
273:
274: return toUnicodeWithCallback(source, target, offsets, flush);
275: }
276:
/*
 * maximum number of indexed bytes; also used as the capacity of the replay
 * buffer allocated in toUnicodeWithCallback()
 */
private static final int EXT_MAX_BYTES = 0x1f;
279:
280: private void updateOffsets(IntBuffer offsets, int length,
281: int sourceIndex, int errorInputLength) {
282: int limit;
283: int delta, offset;
284:
285: if (sourceIndex >= 0) {
286: /*
287: * adjust each offset by adding the previous sourceIndex
288: * minus the length of the input sequence that caused an
289: * error, if any
290: */
291: delta = sourceIndex - errorInputLength;
292: } else {
293: /*
294: * set each offset to -1 because this conversion function
295: * does not handle offsets
296: */
297: delta = -1;
298: }
299: limit = offsets.position() + length;
300: if (delta == 0) {
301: /* most common case, nothing to do */
302: } else if (delta > 0) {
303: /* add the delta to each offset (but not if the offset is <0) */
304: while (offsets.position() < limit) {
305: offset = offsets.get(offsets.position());
306: if (offset >= 0) {
307: offsets.put(offset + delta);
308: }
309: //FIXME: ++offsets;
310: }
311: } else /* delta<0 */{
312: /*
313: * set each offset to -1 because this conversion function
314: * does not handle offsets
315: * or the error input sequence started in a previous buffer
316: */
317: while (offsets.position() < limit) {
318: offsets.put(-1);
319: }
320: }
321: }
322:
323: final CoderResult toUnicodeWithCallback(ByteBuffer source,
324: CharBuffer target, IntBuffer offsets, boolean flush) {
325:
326: int sourceIndex;
327: int errorInputLength;
328: boolean converterSawEndOfInput, calledCallback;
329: int t = target.position();
330: int s = source.position();
331: /* variables for m:n conversion */
332: ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
333: int replayArrayIndex = 0;
334:
335: ByteBuffer realSource = null;
336: boolean realFlush = false;
337: int realSourceIndex = 0;
338:
339: CoderResult cr = CoderResult.UNDERFLOW;
340:
341: /* get the converter implementation function */
342: sourceIndex = 0;
343:
344: if (preToULength >= 0) {
345: /* normal mode */
346: } else {
347: /*
348: * Previous m:n conversion stored source units from a partial match
349: * and failed to consume all of them.
350: * We need to "replay" them from a temporary buffer and convert them first.
351: */
352: realSource = source;
353: realFlush = flush;
354: realSourceIndex = sourceIndex;
355: //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
356: replayArray.put(preToUArray, 0, -preToULength);
357: source = replayArray;
358: source.position(0);
359: source.limit(replayArrayIndex - preToULength);
360: flush = false;
361: sourceIndex = -1;
362: preToULength = 0;
363: }
364:
365: /*
366: * loop for conversion and error handling
367: *
368: * loop {
369: * convert
370: * loop {
371: * update offsets
372: * handle end of input
373: * handle errors/call callback
374: * }
375: * }
376: */
377: for (;;) {
378: if (cr.isUnderflow()) {
379: /* convert */
380: cr = decodeLoop(source, target, offsets, flush);
381:
382: /*
383: * set a flag for whether the converter
384: * successfully processed the end of the input
385: *
386: * need not check cnv->preToULength==0 because a replay (<0) will cause
387: * s<sourceLimit before converterSawEndOfInput is checked
388: */
389: converterSawEndOfInput = (cr.isUnderflow() && flush
390: && source.remaining() == 0 && toULength == 0);
391: } else {
392: /* handle error from getNextUChar() */
393: converterSawEndOfInput = false;
394: }
395:
396: /* no callback called yet for this iteration */
397: calledCallback = false;
398:
399: /* no sourceIndex adjustment for conversion, only for callback output */
400: errorInputLength = 0;
401:
402: /*
403: * loop for offsets and error handling
404: *
405: * iterates at most 3 times:
406: * 1. to clean up after the conversion function
407: * 2. after the callback
408: * 3. after the callback again if there was truncated input
409: */
410: for (;;) {
411: /* update offsets if we write any */
412: if (offsets != null) {
413:
414: int length = (target.position() - t);
415: if (length > 0) {
416: updateOffsets(offsets, length, sourceIndex,
417: errorInputLength);
418:
419: /*
420: * if a converter handles offsets and updates the offsets
421: * pointer at the end, then pArgs->offset should not change
422: * here;
423: * however, some converters do not handle offsets at all
424: * (sourceIndex<0) or may not update the offsets pointer
425: */
426: //TODO: pArgs->offsets=offsets+=length;
427: }
428:
429: if (sourceIndex >= 0) {
430: sourceIndex += (source.position() - s);
431: }
432:
433: }
434:
435: if (preToULength < 0) {
436: /*
437: * switch the source to new replay units (cannot occur while replaying)
438: * after offset handling and before end-of-input and callback handling
439: */
440: if (realSource == null) {
441: realSource = source;
442: realFlush = flush;
443: realSourceIndex = sourceIndex;
444:
445: //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
446: replayArray.put(preToUArray, 0, -preToULength);
447:
448: source = replayArray;
449: source.limit(replayArrayIndex - preToULength);
450: flush = false;
451: if ((sourceIndex += preToULength) < 0) {
452: sourceIndex = -1;
453: }
454:
455: preToULength = 0;
456: } else {
457: /* see implementation note before _fromUnicodeWithCallback() */
458: //agljport:todo U_ASSERT(realSource==NULL);
459: Assert.assrt(realSource == null);
460: }
461: }
462:
463: /* update pointers */
464: s = source.position();
465: t = target.position();
466:
467: if (cr.isUnderflow()) {
468: if (s < source.limit()) {
469: /*
470: * continue with the conversion loop while there is still input left
471: * (continue converting by breaking out of only the inner loop)
472: */
473: break;
474: } else if (realSource != null) {
475: /* switch back from replaying to the real source and continue */
476: source = realSource;
477: flush = realFlush;
478: sourceIndex = realSourceIndex;
479: realSource = null;
480: break;
481: } else if (flush && toULength > 0) {
482: /*
483: * the entire input stream is consumed
484: * and there is a partial, truncated input sequence left
485: */
486:
487: /* inject an error and continue with callback handling */
488: cr = CoderResult.malformedForLength(toULength);
489: calledCallback = false; /* new error condition */
490: } else {
491: /* input consumed */
492: if (flush) {
493: /*
494: * return to the conversion loop once more if the flush
495: * flag is set and the conversion function has not
496: * successfully processed the end of the input yet
497: *
498: * (continue converting by breaking out of only the inner loop)
499: */
500: if (!converterSawEndOfInput) {
501: break;
502: }
503:
504: /* reset the converter without calling the callback function */
505: implReset();
506: }
507:
508: /* done successfully */
509: return cr;
510: }
511: }
512:
513: /* U_FAILURE(*err) */
514: {
515:
516: if (calledCallback || cr.isOverflow()
517: || (cr.isMalformed() && cr.isUnmappable())) {
518: /*
519: * the callback did not or cannot resolve the error:
520: * set output pointers and return
521: *
522: * the check for buffer overflow is redundant but it is
523: * a high-runner case and hopefully documents the intent
524: * well
525: *
526: * if we were replaying, then the replay buffer must be
527: * copied back into the UConverter
528: * and the real arguments must be restored
529: */
530: if (realSource != null) {
531: int length;
532: Assert.assrt(preToULength == 0);
533: length = (int) (source.limit() - source
534: .position());
535: if (length > 0) {
536: //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
537: source.get(preToUArray, preToUBegin,
538: length);
539: preToULength = (byte) -length;
540: }
541:
542: source = realSource;
543: flush = realFlush;
544: }
545: return cr;
546: }
547: }
548:
549: /* copy toUBytes[] to invalidCharBuffer[] */
550: errorInputLength = invalidCharLength = toULength;
551: if (errorInputLength > 0) {
552: copy(toUBytesArray, 0, invalidCharBuffer, 0,
553: errorInputLength);
554: }
555:
556: /* set the converter state to deal with the next character */
557: toULength = 0;
558:
559: /* call the callback function */
560: cr = toCharErrorBehaviour.call(this , toUContext,
561: source, target, offsets, invalidCharBuffer,
562: errorInputLength, cr);
563: /*
564: * loop back to the offset handling
565: *
566: * this flag will indicate after offset handling
567: * that a callback was called;
568: * if the callback did not resolve the error, then we return
569: */
570: calledCallback = true;
571: }
572: }
573: }
574:
575: /**
576: * Returns the number of chars held in the converter's internal state
577: * because more input is needed for completing the conversion. This function is
578: * useful for mapping semantics of ICU's converter interface to those of iconv,
579: * and this information is not needed for normal conversion.
580: * @return The number of chars in the state. -1 if an error is encountered.
581: * @draft ICU 3.6
582: */
583: /*public*/int toUCountPending() {
584: if (preToULength > 0) {
585: return preToULength;
586: } else if (preToULength < 0) {
587: return -preToULength;
588: } else if (toULength > 0) {
589: return toULength;
590: }
591: return 0;
592: }
593:
594: private final void setSourcePosition(ByteBuffer source) {
595: // ok was there input held in the previous invocation of decodeLoop
596: // that resulted in output in this invocation?
597: source.position(source.position() - toUCountPending());
598:
599: }
600:
601: private void copy(byte[] src, int srcOffset, char[] dst,
602: int dstOffset, int length) {
603: for (int i = srcOffset; i < length; i++) {
604: dst[dstOffset++] = (char) src[srcOffset++];
605: }
606: }
607:
608: /**
609: * ONLY used by ToU callback functions.
610: * This function will write out the specified characters to the target
611: * character buffer.
612: * @return A CoderResult object that contains the error result when an error occurs.
613: * @draft ICU 3.6
614: * @provisional This API might change or be removed in a future release.
615: */
616: static final CoderResult toUWriteUChars(CharsetDecoderICU cnv,
617: char[] ucharsArray, int ucharsBegin, int length,
618: CharBuffer target, IntBuffer offsets, int sourceIndex) {
619:
620: CoderResult cr = CoderResult.UNDERFLOW;
621:
622: /* write UChars */
623: if (offsets == null) {
624: while (length > 0 && target.hasRemaining()) {
625: target.put(ucharsArray[ucharsBegin++]);
626: --length;
627: }
628:
629: } else {
630: /* output with offsets */
631: while (length > 0 && target.hasRemaining()) {
632: target.put(ucharsArray[ucharsBegin++]);
633: offsets.put(sourceIndex);
634: --length;
635: }
636: }
637: /* write overflow */
638: if (length > 0) {
639: cnv.charErrorBufferLength = 0;
640: cr = CoderResult.OVERFLOW;
641: do {
642: cnv.charErrorBufferArray[cnv.charErrorBufferLength++] = ucharsArray[ucharsBegin++];
643: } while (--length > 0);
644: }
645: return cr;
646: }
647:
648: /**
649: * This function will write out the Unicode substitution character to the
650: * target character buffer.
651: * Sub classes to override this method if required
652: * @param decoder
653: * @param source
654: * @param target
655: * @param offsets
656: * @return A CoderResult object that contains the error result when an error occurs.
657: * @draft ICU 3.6
658: * @provisional This API might change or be removed in a future release.
659: */
660: CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
661: ByteBuffer source, CharBuffer target, IntBuffer offsets) {
662: String sub = decoder.replacement();
663: CharsetICU cs = (CharsetICU) decoder.charset();
664: if (decoder.invalidCharLength == 1 && cs.subChar1 != 0x00) {
665: char[] subArr = new char[] { 0x1a };
666: return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0,
667: sub.length(), target, offsets, source.position());
668: } else {
669: return CharsetDecoderICU.toUWriteUChars(decoder, sub
670: .toCharArray(), 0, sub.length(), target, offsets,
671: source.position());
672:
673: }
674: }
675: }
|