001: /*
002: * @(#)CharToByteConverter.java 1.47 06/10/10
003: *
004: * Copyright 1990-2006 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: *
026: */
027:
028: package sun.io;
029:
030: import java.io.*;
031:
032: /**
033: * An abstract base class for subclasses which convert Unicode
034: * characters into an external encoding.
035: *
036: * @author Asmus Freytag
037: * @author Lloyd Honomichl, Novell, Inc.
038: */
039: public abstract class CharToByteConverter {
040:
041: /**
042: * Substitution mode flag.
043: */
044: protected boolean subMode = true;
045:
046: /**
047: * Bytes to substitute for unmappable input.
048: */
049: protected byte[] subBytes = { (byte) '?' };
050:
051: /**
052: * Offset of next character to be converted.
053: */
054: protected int charOff;
055:
056: /**
057: * Offset of next byte to be output.
058: */
059: protected int byteOff;
060:
061: /**
062: * Length of bad input that caused conversion to stop.
063: */
064: protected int badInputLength;
065:
066: /**
067: * Create an instance of the default CharToByteConverter subclass.
068: */
069: public static CharToByteConverter getDefault() {
070: Object cvt;
071: cvt = Converters.newDefaultConverter(Converters.CHAR_TO_BYTE);
072: return (CharToByteConverter) cvt;
073: }
074:
075: /**
076: * Returns appropriate CharToByteConverter subclass instance.
077: * @param string represets encoding
078: */
079: public static CharToByteConverter getConverter(String encoding)
080: throws UnsupportedEncodingException {
081: Object cvt;
082: cvt = Converters
083: .newConverter(Converters.CHAR_TO_BYTE, encoding);
084: return (CharToByteConverter) cvt;
085: }
086:
087: /**
088: * Returns the character set id for the conversion.
089: */
090: public abstract String getCharacterEncoding();
091:
092: /**
093: * Converts an array of Unicode characters into an array of bytes
094: * in the target character encoding. This method allows a buffer by
095: * buffer conversion of a data stream. The state of the conversion is
096: * saved between calls to convert. If a call to convert results in
097: * an exception, the conversion may be continued by calling convert again
098: * with suitably modified parameters. All conversions should be finished
099: * with a call to the flush method.
100: *
101: * @return the number of bytes written to output.
102: * @param input array containing Unicode characters to be converted.
103: * @param inStart begin conversion at this offset in input array.
104: * @param inEnd stop conversion at this offset in input array (exclusive).
105: * @param output byte array to receive conversion result.
106: * @param outStart start writing to output array at this offset.
107: * @param outEnd stop writing to output array at this offset (exclusive).
108: * @exception MalformedInputException if the input buffer contains any
109: * sequence of chars that is illegal in Unicode (principally unpaired
110: * surrogates and \uFFFF or \uFFFE). After this exception is thrown,
111: * the method nextCharIndex can be called to obtain the index of the
112: * first invalid input character. The MalformedInputException can
113: * be queried for the length of the invalid input.
114: * @exception UnknownCharacterException for any character that
115: * that cannot be converted to the external character encoding. Thrown
116: * only when converter is not in substitution mode.
117: * @exception ConversionBufferFullException if output array is filled prior
118: * to converting all the input.
119: */
120: public abstract int convert(char[] input, int inStart, int inEnd,
121: byte[] output, int outStart, int outEnd)
122: throws MalformedInputException, UnknownCharacterException,
123: ConversionBufferFullException;
124:
125: /*
126: * Converts any array of characters, including malformed surrogate
127: * pairs, into an array of bytes in the target character encoding.
128: * A precondition is that substitution mode is turned on. This method
129: * allows a buffer by buffer conversion of a data stream.
130: * The state of the conversion is saved between calls to convert.
131: * All conversions should be finished with a call to the flushAny method.
132: *
133: * @return the number of bytes written to output.
134: * @param input array containing Unicode characters to be converted.
135: * @param inStart begin conversion at this offset in input array.
136: * @param inEnd stop conversion at this offset in input array (exclusive).
137: * @param output byte array to receive conversion result.
138: * @param outStart start writing to output array at this offset.
139: * @param outEnd stop writing to output array at this offset (exclusive).
140: * @exception ConversionBufferFullException if output array is filled prior
141: * to converting all the input.
142: */
143: public int convertAny(char[] input, int inStart, int inEnd,
144: byte[] output, int outStart, int outEnd)
145: throws ConversionBufferFullException {
146: if (!subMode) { /* Precondition: subMode == true */
147: throw new IllegalStateException(
148: "Substitution mode is not on");
149: }
150: /* Rely on the untested precondition that the indices are meaningful */
151: /* For safety, use the public interface to charOff and byteOff, but
152: badInputLength is directly modified.*/
153: int localInOff = inStart;
154: int localOutOff = outStart;
155: while (localInOff < inEnd) {
156: try {
157: int discard = convert(input, localInOff, inEnd, output,
158: localOutOff, outEnd);
159: return (nextByteIndex() - outStart);
160: } catch (MalformedInputException e) {
161: int subSize = subBytes.length;
162: localOutOff = nextByteIndex();
163: if ((localOutOff + subSize) > outEnd)
164: throw new ConversionBufferFullException();
165: for (int i = 0; i < subSize; i++)
166: output[localOutOff++] = subBytes[i];
167: localInOff = nextCharIndex();
168: localInOff += badInputLength;
169: badInputLength = 0;
170: if (localInOff >= inEnd) {
171: byteOff = localOutOff;
172: return (byteOff - outStart);
173: }
174: continue;
175: } catch (UnknownCharacterException e) {
176: /* Should never occur, since subMode == true */
177: throw new InternalError(
178: "UnknownCharacterException thrown "
179: + "in substititution mode");
180: }
181: }
182: return (nextByteIndex() - outStart);
183: }
184:
185: /**
186: * Converts an array of Unicode characters into an array of bytes
187: * in the target character encoding. Unlike convert, this method
188: * does not do incremental conversion. It assumes that the given
189: * input array contains all the characters to be converted. The
190: * state of the converter is reset at the beginning of this method
191: * and is left in the reset state on successful termination.
192: * The converter is not reset if an exception is thrown.
193: * This allows the caller to determine where the bad input
194: * was encountered by calling nextCharIndex.
195: * <p>
196: * This method uses substitution mode when performing the conversion.
197: * The method setSubstitutionBytes may be used to determine what
198: * bytes are substituted. Even though substitution mode is used,
199: * the state of the converter's substitution mode is not changed
200: * at the end of this method.
201: *
202: * @return an array of bytes containing the converted characters.
203: * @param input array containing Unicode characters to be converted.
204: * @exception MalformedInputException if the input buffer contains any
205: * sequence of chars that is illegal in Unicode (principally unpaired
206: * surrogates and \uFFFF or \uFFFE). After this exception is thrown,
207: * the method nextCharIndex can be called to obtain the index of the
208: * first invalid input character and getBadInputLength can be called
209: * to determine the length of the invalid input.
210: *
211: * @see #nextCharIndex
212: * @see #setSubstitutionMode
213: * @see #setSubstitutionBytes
214: * @see #getBadInputLength
215: */
216: public byte[] convertAll(char input[])
217: throws MalformedInputException {
218: reset();
219: boolean savedSubMode = subMode;
220: subMode = true;
221:
222: byte[] output = new byte[getMaxBytesPerChar() * input.length];
223:
224: try {
225: int outputLength = convert(input, 0, input.length, output,
226: 0, output.length);
227: outputLength += flush(output, nextByteIndex(),
228: output.length);
229:
230: byte[] returnedOutput = new byte[outputLength];
231: System
232: .arraycopy(output, 0, returnedOutput, 0,
233: outputLength);
234: return returnedOutput;
235: } catch (ConversionBufferFullException e) {
236: //Not supposed to happen. If it does, getMaxBytesPerChar() lied.
237: throw new InternalError(
238: "this.getMaxBytesPerChar returned bad value");
239: } catch (UnknownCharacterException e) {
240: // Not supposed to happen since we're in substitution mode.
241: throw new InternalError();
242: } finally {
243: subMode = savedSubMode;
244: }
245: }
246:
247: /**
248: * Writes any remaining output to the output buffer and resets the
249: * converter to its initial state.
250: *
251: * @param output byte array to receive flushed output.
252: * @param outStart start writing to output array at this offset.
253: * @param outEnd stop writing to output array at this offset (exclusive).
254: * @exception MalformedInputException if the output to be flushed contained
255: * a partial or invalid multibyte character sequence. Will occur if the
256: * input buffer on the last call to convert ended with the first character
257: * of a surrogate pair. flush will write what it can to the output buffer
258: * and reset the converter before throwing this exception. An additional
259: * call to flush is not required.
260: * @exception ConversionBufferFullException if output array is filled
261: * before all the output can be flushed. flush will write what it can
262: * to the output buffer and remember its state. An additional call to
263: * flush with a new output buffer will conclude the operation.
264: */
265: public abstract int flush(byte[] output, int outStart, int outEnd)
266: throws MalformedInputException,
267: ConversionBufferFullException;
268:
269: /**
270: * Writes any remaining output to the output buffer and resets the
271: * converter to its initial state. May only be called when substitution
272: * mode is turned on, and never complains about malformed input (always
273: * substitutes).
274: *
275: * @param output byte array to receive flushed output.
276: * @param outStart start writing to output array at this offset.
277: * @param outEnd stop writing to output array at this offset (exclusive).
278: * @return number of bytes writter into output.
279: * @exception ConversionBufferFullException if output array is filled
280: * before all the output can be flushed. flush will write what it can
281: * to the output buffer and remember its state. An additional call to
282: * flush with a new output buffer will conclude the operation.
283: */
284: public int flushAny(byte[] output, int outStart, int outEnd)
285: throws ConversionBufferFullException {
286: if (!subMode) { /* Precondition: subMode == true */
287: throw new IllegalStateException(
288: "Substitution mode is not on");
289: }
290: try {
291: return flush(output, outStart, outEnd);
292: } catch (MalformedInputException e) {
293: /* Assume that if a malformed input exception has occurred,
294: no useful data has been placed in the output buffer.
295: i.e. there is no mixture of left over good + some bad data.
296: Usually occurs with a trailing high surrogate pair element.
297: Special cases occur in Cp970, 949c and 933 that seem
298: to be covered, but may require further investigation */
299: int subSize = subBytes.length;
300: int outIndex = outStart;
301: if ((outStart + subSize) > outEnd)
302: throw new ConversionBufferFullException();
303: for (int i = 0; i < subSize; i++)
304: output[outIndex++] = subBytes[i];
305: byteOff = charOff = 0; // Reset the internal state.
306: badInputLength = 0;
307: return subSize;
308: }
309: }
310:
311: /**
312: * Resets converter to its initial state.
313: */
314: public abstract void reset();
315:
316: /**
317: * Returns true if the given character can be converted to the
318: * target character encoding.
319: * @return true if given character is translatable, false otherwise.
320: * @param c character to test
321: */
322: public boolean canConvert(char c) {
323: try {
324: // (output buffer size should use getMaxBytesPerChar value.)
325: char[] input = new char[1];
326: byte[] output = new byte[3];
327: input[0] = c;
328: convert(input, 0, 1, output, 0, 3);
329: return true;
330: } catch (CharConversionException e) {
331: return false;
332: }
333: }
334:
335: /**
336: * Returns the maximum number of bytes needed to convert a char. Useful
337: * for calculating the maximum output buffer size needed for a particular
338: * input buffer.
339: */
340: public abstract int getMaxBytesPerChar();
341:
342: /**
343: * Returns the length, in chars, of the input which caused a
344: * MalformedInputException. Always refers to the last
345: * MalformedInputException thrown by the converter. If none have
346: * ever been thrown, returns 0.
347: */
348: public int getBadInputLength() {
349: return badInputLength;
350: }
351:
352: /**
353: * Returns the index of the character just past
354: * the last character successfully converted by the previous call
355: * to convert.
356: */
357: public int nextCharIndex() {
358: return charOff;
359: }
360:
361: /**
362: * Returns the index of the byte just past the last byte written by
363: * the previous call to convert.
364: */
365: public int nextByteIndex() {
366: return byteOff;
367: }
368:
369: /**
370: * Sets converter into substitution mode. In substitution mode,
371: * the converter will replace untranslatable characters in the source
372: * encoding with the substitution character set by setSubstitutionBytes.
373: * When not in substitution mode, the converter will throw an
374: * UnknownCharacterException when it encounters untranslatable input.
375: *
376: * @param doSub if true, enable substitution mode.
377: * @see #setSubstitutionBytes
378: */
379: public void setSubstitutionMode(boolean doSub) {
380: subMode = doSub;
381: }
382:
383: /**
384: * Sets the substitution bytes to use when the converter is in
385: * substitution mode. The given bytes should represent a valid
386: * character in the target character encoding and must not be
387: * longer than the value returned by getMaxBytesPerChar for this
388: * converter.
389: *
390: * @param newSubBytes the substitution bytes
391: * @exception IllegalArgumentException if given byte array is longer than
392: * the value returned by the method getMaxBytesPerChar.
393: * @see #setSubstitutionMode
394: * @see #getMaxBytesPerChar
395: */
396: public void setSubstitutionBytes(byte[] newSubBytes)
397: throws IllegalArgumentException {
398: if (newSubBytes.length > getMaxBytesPerChar()) {
399: throw new IllegalArgumentException();
400: }
401:
402: subBytes = new byte[newSubBytes.length];
403: System.arraycopy(newSubBytes, 0, subBytes, 0,
404: newSubBytes.length);
405: }
406:
407: /**
408: * Returns a string representation of the class.
409: */
410: public String toString() {
411: return "CharToByteConverter: " + getCharacterEncoding();
412: }
413: }
|