001: /**
002: *******************************************************************************
003: * Copyright (C) 2006, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: *
007: *******************************************************************************
008: */package com.ibm.icu.charset;
009:
010: import java.nio.BufferOverflowException;
011: import java.nio.ByteBuffer;
012: import java.nio.CharBuffer;
013: import java.nio.IntBuffer;
014: import java.nio.charset.CharsetEncoder;
015: import java.nio.charset.CoderResult;
016: import java.nio.charset.CodingErrorAction;
017:
018: import com.ibm.icu.impl.Assert;
019: import com.ibm.icu.text.UTF16;
020:
/**
 * An abstract class that provides framework methods of encoding operations for concrete
 * subclasses.
 * In the future this class will contain API that will implement converter semantics of ICU4C.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
028: public abstract class CharsetEncoderICU extends CharsetEncoder {
029:
/*
 * Output bytes that did not fit into the target buffer. fromUWriteBytes()
 * stores them here and encode() drains them into the next target before
 * converting anything else.
 */
byte[] errorBuffer = new byte[30];
/* number of valid bytes currently held in errorBuffer */
int errorBufferLength = 0;

/** these are for encodeLoopICU */
/* converter-specific state; this class only resets it to 0 in implReset() */
int fromUnicodeStatus;
/*
 * Pending code point: non-zero when a conversion stopped on it (it is then
 * handed to the error callback) or when input ended in the middle of a
 * sequence.
 */
int fromUChar32;
/* NOTE(review): not read anywhere in this class - presumably consulted by subclasses; confirm */
boolean useSubChar1;

/* store previous UChars/chars to continue partial matches */
int preFromUFirstCP; /* >=0: partial match */
char[] preFromUArray; /* never allocated in this class - presumably set up by subclasses; confirm */
int preFromUBegin;
int preFromULength; /* negative: replay */

/* UTF-16 form of the code point most recently passed to the error callback */
char[] invalidUCharBuffer = new char[2];
/* number of valid chars in invalidUCharBuffer */
int invalidUCharLength;
/* opaque context object handed through to the error callback */
Object fromUContext;
/* callbacks selected by implOnUnmappableCharacter()/implOnMalformedInput(); both default to "stop" */
private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;
private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;
049: CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder() {
050: public CoderResult call(CharsetEncoderICU encoder,
051: Object context, CharBuffer source, ByteBuffer target,
052: IntBuffer offsets, char[] buffer, int length, int cp,
053: CoderResult cr) {
054: if (cr.isUnmappable()) {
055: return onUnmappableInput.call(encoder, context, source,
056: target, offsets, buffer, length, cp, cr);
057: } else if (cr.isMalformed()) {
058: return onMalformedInput.call(encoder, context, source,
059: target, offsets, buffer, length, cp, cr);
060: }
061: return CharsetCallback.FROM_U_CALLBACK_STOP.call(encoder,
062: context, source, target, offsets, buffer, length,
063: cp, cr);
064:
065: }
066: };
067:
/**
 * Constructs a new encoder for the given charset.
 * The average bytes-per-char value handed to the superclass is the midpoint
 * of the charset's minimum and maximum bytes per char; the maximum is passed
 * through unchanged.
 * @param cs charset for which the encoder is created
 * @param replacement the substitution bytes
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
super (cs, (cs.minBytesPerChar + cs.maxBytesPerChar) / 2,
cs.maxBytesPerChar, replacement);
}
079:
080: /**
081: * Sets the action to be taken if an illegal sequence is encountered
082: * @param newAction action to be taken
083: * @exception IllegalArgumentException
084: * @stable ICU 3.6
085: */
086: protected void implOnMalformedInput(CodingErrorAction newAction) {
087: onMalformedInput = getCallback(newAction);
088: }
089:
090: /**
091: * Sets the action to be taken if an illegal sequence is encountered
092: * @param newAction action to be taken
093: * @exception IllegalArgumentException
094: * @stable ICU 3.6
095: */
096: protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
097: onUnmappableInput = getCallback(newAction);
098: }
099:
100: private static CharsetCallback.Encoder getCallback(
101: CodingErrorAction action) {
102: if (action == CodingErrorAction.REPLACE) {
103: return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
104: } else if (action == CodingErrorAction.IGNORE) {
105: return CharsetCallback.FROM_U_CALLBACK_SKIP;
106: } else if (action == CodingErrorAction.REPORT) {
107: return CharsetCallback.FROM_U_CALLBACK_STOP;
108: }
109: return CharsetCallback.FROM_U_CALLBACK_STOP;
110: }
111:
112: private static final CharBuffer EMPTY = CharBuffer.allocate(0);
113:
114: /**
115: * Flushes any characters saved in the converter's internal buffer and
116: * resets the converter.
117: * @param out action to be taken
118: * @return result of flushing action and completes the decoding all input.
119: * Returns CoderResult.UNDERFLOW if the action succeeds.
120: * @stable ICU 3.6
121: */
122: protected CoderResult implFlush(ByteBuffer out) {
123: return encode(EMPTY, out, null, true);
124: }
125:
126: /**
127: * Resets the from Unicode mode of converter
128: * @stable ICU 3.6
129: */
130: protected void implReset() {
131: errorBufferLength = 0;
132: fromUChar32 = 0;
133: fromUnicodeStatus = 0;
134: preFromUBegin = 0;
135: preFromUFirstCP = 0;
136: preFromULength = 0;
137: }
138:
139: /**
140: * Encodes one or more chars. The default behaviour of the
141: * converter is stop and report if an error in input stream is encountered.
142: * To set different behaviour use @see CharsetEncoder.onMalformedInput()
143: * @param in buffer to decode
144: * @param out buffer to populate with decoded result
145: * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
146: * action succeeds or more input is needed for completing the decoding action.
147: * @stable ICU 3.6
148: */
149: protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
150: if (!in.hasRemaining()) {
151: return CoderResult.UNDERFLOW;
152: }
153: in.position(in.position() + fromUCountPending());
154: /* do the conversion */
155: CoderResult ret = encode(in, out, null, false);
156: setSourcePosition(in);
157: return ret;
158: }
159:
/**
 * Implements ICU semantics of buffer management.
 * This is the converter-specific conversion step; fromUnicodeWithCallback()
 * calls it repeatedly and handles the result (end of input, overflow,
 * error callbacks) in between.
 * @param source the input character buffer
 * @param target the output byte buffer
 * @param offsets receives source indexes for the output bytes; may be null
 *        (the java.nio entry points in this class always pass null)
 * @param flush true if the caller has no further input beyond
 *        <code>source</code>
 * @return A CoderResult object that contains the error result when an error occurs.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
abstract CoderResult encodeLoop(CharBuffer source,
ByteBuffer target, IntBuffer offsets, boolean flush);
171:
172: /**
173: * Implements ICU semantics for encoding the buffer
174: * @param source The input character buffer
175: * @param target The output byte buffer
176: * @param offsets
177: * @param flush true if, and only if, the invoker can provide no
178: * additional input bytes beyond those in the given buffer.
179: * @return A CoderResult object that contains the error result when an error occurs.
180: * @draft ICU 3.6
181: * @provisional This API might change or be removed in a future release.
182: */
183: final CoderResult encode(CharBuffer source, ByteBuffer target,
184: IntBuffer offsets, boolean flush) {
185:
186: /* check parameters */
187: if (target == null || source == null) {
188: throw new IllegalArgumentException();
189: }
190:
191: /*
192: * Make sure that the buffer sizes do not exceed the number range for
193: * int32_t because some functions use the size (in units or bytes)
194: * rather than comparing pointers, and because offsets are int32_t values.
195: *
196: * size_t is guaranteed to be unsigned and large enough for the job.
197: *
198: * Return with an error instead of adjusting the limits because we would
199: * not be able to maintain the semantics that either the source must be
200: * consumed or the target filled (unless an error occurs).
201: * An adjustment would be targetLimit=t+0x7fffffff; for example.
202: */
203:
204: /* flush the target overflow buffer */
205: if (errorBufferLength > 0) {
206: byte[] overflowArray;
207: int i, length;
208:
209: overflowArray = errorBuffer;
210: length = errorBufferLength;
211: i = 0;
212: do {
213: if (target.remaining() == 0) {
214: /* the overflow buffer contains too much, keep the rest */
215: int j = 0;
216:
217: do {
218: overflowArray[j++] = overflowArray[i++];
219: } while (i < length);
220:
221: errorBufferLength = (byte) j;
222: return CoderResult.OVERFLOW;
223: }
224:
225: /* copy the overflow contents to the target */
226: target.put(overflowArray[i++]);
227: if (offsets != null) {
228: offsets.put(-1); /* no source index available for old output */
229: }
230: } while (i < length);
231:
232: /* the overflow buffer is completely copied to the target */
233: errorBufferLength = 0;
234: }
235:
236: if (!flush && source.remaining() == 0 && preFromULength >= 0) {
237: /* the overflow buffer is emptied and there is no new input: we are done */
238: return CoderResult.UNDERFLOW;
239: }
240:
241: /*
242: * Do not simply return with a buffer overflow error if
243: * !flush && t==targetLimit
244: * because it is possible that the source will not generate any output.
245: * For example, the skip callback may be called;
246: * it does not output anything.
247: */
248:
249: return fromUnicodeWithCallback(source, target, offsets, flush);
250:
251: }
252:
253: /* maximum number of indexed UChars */
254: private static final int EXT_MAX_UCHARS = 19;
255:
256: /**
257: * Implementation note for m:n conversions
258: *
259: * While collecting source units to find the longest match for m:n conversion,
260: * some source units may need to be stored for a partial match.
261: * When a second buffer does not yield a match on all of the previously stored
262: * source units, then they must be "replayed", i.e., fed back into the converter.
263: *
264: * The code relies on the fact that replaying will not nest -
265: * converting a replay buffer will not result in a replay.
266: * This is because a replay is necessary only after the _continuation_ of a
267: * partial match failed, but a replay buffer is converted as a whole.
268: * It may result in some of its units being stored again for a partial match,
269: * but there will not be a continuation _during_ the replay which could fail.
270: *
271: * It is conceivable that a callback function could call the converter
272: * recursively in a way that causes another replay to be stored, but that
273: * would be an error in the callback function.
274: * Such violations will cause assertion failures in a debug build,
275: * and wrong output, but they will not cause a crash.
276: * @draft ICU 3.6
277: * @provisional This API might change or be removed in a future release.
278: */
279: final CoderResult fromUnicodeWithCallback(CharBuffer source,
280: ByteBuffer target, IntBuffer offsets, boolean flush) {
281: int sBufferIndex;
282: int sourceIndex;
283: int errorInputLength;
284: boolean converterSawEndOfInput, calledCallback;
285:
286: /* variables for m:n conversion */
287: CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
288: int replayArrayIndex = 0;
289: CharBuffer realSource;
290: boolean realFlush;
291:
292: CoderResult cr = CoderResult.UNDERFLOW;
293:
294: /* get the converter implementation function */
295: sourceIndex = 0;
296:
297: if (preFromULength >= 0) {
298: /* normal mode */
299: realSource = null;
300: realFlush = false;
301: } else {
302: /*
303: * Previous m:n conversion stored source units from a partial match
304: * and failed to consume all of them.
305: * We need to "replay" them from a temporary buffer and convert them first.
306: */
307: realSource = source;
308: realFlush = flush;
309:
310: //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
311: replayArray.put(preFromUArray, 0, -preFromULength);
312: source.position(replayArrayIndex);
313: source.limit(replayArrayIndex - preFromULength); //preFromULength is negative, see declaration
314: source = replayArray;
315: flush = false;
316:
317: preFromULength = 0;
318: }
319:
320: /*
321: * loop for conversion and error handling
322: *
323: * loop {
324: * convert
325: * loop {
326: * update offsets
327: * handle end of input
328: * handle errors/call callback
329: * }
330: * }
331: */
332: for (;;) {
333: /* convert */
334: cr = encodeLoop(source, target, offsets, flush);
335: /*
336: * set a flag for whether the converter
337: * successfully processed the end of the input
338: *
339: * need not check cnv.preFromULength==0 because a replay (<0) will cause
340: * s<sourceLimit before converterSawEndOfInput is checked
341: */
342: converterSawEndOfInput = (boolean) (cr.isUnderflow()
343: && flush && source.remaining() == 0 && fromUChar32 == 0);
344:
345: /* no callback called yet for this iteration */
346: calledCallback = false;
347:
348: /* no sourceIndex adjustment for conversion, only for callback output */
349: errorInputLength = 0;
350:
351: /*
352: * loop for offsets and error handling
353: *
354: * iterates at most 3 times:
355: * 1. to clean up after the conversion function
356: * 2. after the callback
357: * 3. after the callback again if there was truncated input
358: */
359: for (;;) {
360: /* update offsets if we write any */
361: if (offsets != null) {
362: int length = target.remaining();
363: if (length > 0) {
364:
365: /*
366: * if a converter handles offsets and updates the offsets
367: * pointer at the end, then offset should not change
368: * here;
369: * however, some converters do not handle offsets at all
370: * (sourceIndex<0) or may not update the offsets pointer
371: */
372: offsets.position(offsets.position() + length);
373: }
374:
375: if (sourceIndex >= 0) {
376: sourceIndex += (int) (source.position());
377: }
378: }
379:
380: if (preFromULength < 0) {
381: /*
382: * switch the source to new replay units (cannot occur while replaying)
383: * after offset handling and before end-of-input and callback handling
384: */
385: if (realSource == null) {
386: realSource = source;
387: realFlush = flush;
388:
389: //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
390: replayArray.put(preFromUArray, 0,
391: -preFromULength);
392:
393: source = replayArray;
394: source.position(replayArrayIndex);
395: source.limit(replayArrayIndex - preFromULength);
396: flush = false;
397: if ((sourceIndex += preFromULength) < 0) {
398: sourceIndex = -1;
399: }
400:
401: preFromULength = 0;
402: } else {
403: /* see implementation note before _fromUnicodeWithCallback() */
404: //agljport:todo U_ASSERT(realSource==NULL);
405: Assert.assrt(realSource == null);
406: }
407: }
408:
409: /* update pointers */
410: sBufferIndex = source.position();
411: if (cr.isUnderflow()) {
412: if (sBufferIndex < source.limit()) {
413: /*
414: * continue with the conversion loop while there is still input left
415: * (continue converting by breaking out of only the inner loop)
416: */
417: break;
418: } else if (realSource != null) {
419: /* switch back from replaying to the real source and continue */
420: source = realSource;
421: flush = realFlush;
422: sourceIndex = source.position();
423: realSource = null;
424: break;
425: } else if (flush && fromUChar32 != 0) {
426: /*
427: * the entire input stream is consumed
428: * and there is a partial, truncated input sequence left
429: */
430:
431: /* inject an error and continue with callback handling */
432: //err[0]=ErrorCode.U_TRUNCATED_CHAR_FOUND;
433: cr = CoderResult.malformedForLength(1);
434: calledCallback = false; /* new error condition */
435: } else {
436: /* input consumed */
437: if (flush) {
438: /*
439: * return to the conversion loop once more if the flush
440: * flag is set and the conversion function has not
441: * successfully processed the end of the input yet
442: *
443: * (continue converting by breaking out of only the inner loop)
444: */
445: if (!converterSawEndOfInput) {
446: break;
447: }
448:
449: /* reset the converter without calling the callback function */
450: implReset();
451: }
452:
453: /* done successfully */
454: return cr;
455: }
456: }
457:
458: /*U_FAILURE(*err) */
459: {
460:
461: if (calledCallback || cr.isOverflow()
462: || (cr.isMalformed() && cr.isUnmappable())) {
463: /*
464: * the callback did not or cannot resolve the error:
465: * set output pointers and return
466: *
467: * the check for buffer overflow is redundant but it is
468: * a high-runner case and hopefully documents the intent
469: * well
470: *
471: * if we were replaying, then the replay buffer must be
472: * copied back into the UConverter
473: * and the real arguments must be restored
474: */
475: if (realSource != null) {
476: int length;
477:
478: //agljport:todo U_ASSERT(cnv.preFromULength==0);
479:
480: length = source.remaining();
481: if (length > 0) {
482: //UConverterUtility.uprv_memcpy(preFromUArray, 0, sourceArray, pArgs.sourceBegin, length*UMachine.U_SIZEOF_UCHAR);
483: source.get(preFromUArray, 0, length);
484: preFromULength = (byte) -length;
485: }
486: source = realSource;
487: flush = realFlush;
488: }
489: return cr;
490: }
491: }
492:
493: /* callback handling */
494: {
495: /* get and write the code point */
496: errorInputLength = UTF16.append(invalidUCharBuffer,
497: 0, fromUChar32);
498: invalidUCharLength = errorInputLength;
499:
500: /* set the converter state to deal with the next character */
501: fromUChar32 = 0;
502:
503: /* call the callback function */
504: cr = fromCharErrorBehaviour.call(this ,
505: fromUContext, source, target, offsets,
506: invalidUCharBuffer, invalidUCharLength,
507: fromUChar32, cr);
508: }
509:
510: /*
511: * loop back to the offset handling
512: *
513: * this flag will indicate after offset handling
514: * that a callback was called;
515: * if the callback did not resolve the error, then we return
516: */
517: calledCallback = true;
518: }
519: }
520: }
521:
/**
 * Ascertains if a given Unicode code point (32bit value for handling surrogates)
 * can be converted to the target encoding. If the caller wants to test if a
 * surrogate pair can be converted to target encoding then the
 * responsibility of assembling the int value lies with the caller.
 * For assembling a code point the caller can use UTF16 class of ICU4J and do something like:
 * <pre>
 *  boolean passed = true;
 *  int i = 0;
 *  while (i &lt; mySource.length) {
 *      int temp = UTF16.charAt(mySource, 0, mySource.length, i);
 *      if (!((CharsetEncoderICU) myConv).canEncode(temp)) {
 *          passed = false;
 *      }
 *      i += UTF16.getCharCount(temp);
 *  }
 * </pre>
 * or
 * <pre>
 *  String src = new String(mySource);
 *  int i = 0, codepoint;
 *  boolean passed = true;
 *  while (i &lt; src.length()) {
 *      codepoint = UTF16.charAt(src, i);
 *      i += (codepoint &gt; 0xffff) ? 2 : 1;
 *      if (!((CharsetEncoderICU) myConv).canEncode(codepoint)) {
 *          passed = false;
 *      }
 *  }
 * </pre>
 * <p>
 * Note: the implementation in this class does not inspect the code point
 * and always answers <code>true</code>.
 *
 * @param codepoint Unicode code point as int value
 * @return true if a character can be converted
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
public boolean canEncode(int codepoint) {
return true;
}
564:
/**
 * Overrides super class method.
 * This implementation performs no validation: every replacement byte
 * sequence is reported as legal.
 * @param repl the candidate replacement bytes (not inspected)
 * @return always <code>true</code>
 * @stable ICU 3.6
 */
public boolean isLegalReplacement(byte[] repl) {
return true;
}
572:
573: /**
574: * Writes out the specified output bytes to the target byte buffer or to converter internal buffers.
575: * @param cnv
576: * @param bytesArray
577: * @param bytesBegin
578: * @param bytesLength
579: * @param out
580: * @param offsets
581: * @param sourceIndex
582: * @return A CoderResult object that contains the error result when an error occurs.
583: * @draft ICU 3.6
584: * @provisional This API might change or be removed in a future release.
585: */
586: static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
587: byte[] bytesArray, int bytesBegin, int bytesLength,
588: ByteBuffer out, IntBuffer offsets, int sourceIndex) {
589:
590: //write bytes
591: int obl = bytesLength;
592: CoderResult cr = CoderResult.UNDERFLOW;
593: int bytesLimit = bytesBegin + bytesLength;
594: try {
595: for (; bytesBegin < bytesLimit;) {
596: out.put(bytesArray[bytesBegin]);
597: bytesBegin++;
598: }
599: // success
600: bytesLength = 0;
601: } catch (BufferOverflowException ex) {
602: cr = CoderResult.OVERFLOW;
603: }
604:
605: if (offsets != null) {
606: while (obl > bytesLength) {
607: offsets.put(sourceIndex);
608: --obl;
609: }
610: }
611: //write overflow
612: cnv.errorBufferLength = bytesLimit - bytesBegin;
613: if (cnv.errorBufferLength > 0) {
614: if (cnv != null) {
615: int index = 0;
616: while (bytesBegin < bytesLimit) {
617: cnv.errorBuffer[index++] = bytesArray[bytesBegin++];
618: }
619: }
620: cr = CoderResult.OVERFLOW;
621: }
622: return cr;
623: }
624:
625: /**
626: * Returns the number of chars held in the converter's internal state
627: * because more input is needed for completing the conversion. This function is
628: * useful for mapping semantics of ICU's converter interface to those of iconv,
629: * and this information is not needed for normal conversion.
630: * @return The number of chars in the state. -1 if an error is encountered.
631: * @draft ICU 3.4
632: * @provisional This API might change or be removed in a future release.
633: */
634: /*public*/int fromUCountPending() {
635: if (preFromULength > 0) {
636: return UTF16.getCharCount(preFromUFirstCP) + preFromULength;
637: } else if (preFromULength < 0) {
638: return -preFromULength;
639: } else if (fromUChar32 > 0) {
640: return 1;
641: } else if (preFromUFirstCP > 0) {
642: return UTF16.getCharCount(preFromUFirstCP);
643: }
644: return 0;
645: }
646:
647: /**
648: *
649: * @param source
650: */
651: private final void setSourcePosition(CharBuffer source) {
652:
653: // ok was there input held in the previous invocation of decodeLoop
654: // that resulted in output in this invocation?
655: source.position(source.position() - fromUCountPending());
656: }
657:
658: /**
659: * Write the codepage substitution character.
660: * Subclasses to override this method.
661: * For stateful converters, it is typically necessary to handle this
662: * specificially for the converter in order to properly maintain the state.
663: * @param source The input character buffer
664: * @param target The output byte buffer
665: * @param offsets
666: * @return A CoderResult object that contains the error result when an error occurs.
667: * @draft ICU 3.6
668: * @provisional This API might change or be removed in a future release.
669: */
670: CoderResult cbFromUWriteSub(CharsetEncoderICU encoder,
671: CharBuffer source, ByteBuffer target, IntBuffer offsets) {
672: CharsetICU cs = (CharsetICU) encoder.charset();
673: byte[] sub = encoder.replacement();
674: if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) {
675: return CharsetEncoderICU.fromUWriteBytes(encoder,
676: new byte[] { cs.subChar1 }, 0, 1, target, offsets,
677: source.position());
678: } else {
679: return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0,
680: sub.length, target, offsets, source.position());
681: }
682: }
683: }
|