001: /*
002: *
003: *
004: * Copyright 1990-2007 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: */
026:
027: package com.sun.cdc.i18n.j2me;
028:
029: import java.io.*;
030:
031: /** Reader for UTF-16 encoded input streams. */
032: public class UTF_16_Reader extends com.sun.cdc.i18n.StreamReader {
033:
034: /** the first byte of a pair of bytes that represent a 16-bit char */
035: protected int firstByte = -1;
036: /** the byteOrder variable has this value when the byte order
037: * has not yet been specified or detected */
038: protected static final int UNKNOWN_BYTE_ORDER = 0;
039: /** the byteOrder variable has this value when the byte order
040: * is Big Endian */
041: protected static final int BIG_ENDIAN = 1;
042: /** the byteOrder variable has this value when the byte order
043: * is Little Endian */
044: protected static final int LITTLE_ENDIAN = 2;
045: /** the byte order: one of BIG_ENDIAN, LITTLE_ENDIAN, UNKNOWN_BYTE_ORDER */
046: protected int byteOrder = UNKNOWN_BYTE_ORDER;
047:
048: /** Constructs a UTF-16 reader. */
049: public UTF_16_Reader() {
050: }
051:
052: /**
053: * Open the reader
054: * @param in the input stream to be read
055: * @param enc identifies the encoding to be used
056: * @return a reader for the given input stream and encoding
057: * @throws UnsupportedEncodingException
058: */
059: public Reader open(InputStream in, String enc)
060: throws UnsupportedEncodingException {
061: firstByte = -1;
062: byteOrder = UNKNOWN_BYTE_ORDER;
063: super .open(in, enc);
064: return this ;
065: }
066:
067: /** Convert two bytes to a 16-bit char
068: * assuming the big endian byte order.
069: * @param firstByte the first of two bytes representing a char
070: * @param secondByte the second of two bytes representing a char
071: * @return the character represented by the two bytes
072: */
073: protected char mergeBytesBigEndian(int firstByte, int secondByte) {
074: return (char) ((firstByte << 8) + secondByte);
075: }
076:
077: /** Convert two bytes to a 16-bit char
078: * assuming the little endian byte order.
079: * @param firstByte the first of two bytes representing a char
080: * @param secondByte the second of two bytes representing a char
081: * @return the character represented by the two bytes
082: */
083: protected char mergeBytesLittleEndian(int firstByte, int secondByte) {
084: return (char) ((secondByte << 8) + firstByte);
085: }
086:
087: /** Convert two bytes to a 16-bit char
088: * using the current byte order.
089: * @param firstByte the first of two bytes representing a char
090: * @param secondByte the second of two bytes representing a char
091: * @return the character represented by the two bytes
092: */
093: protected char mergeBytes(int firstByte, int secondByte) {
094: if (byteOrder == BIG_ENDIAN) {
095: return mergeBytesBigEndian(firstByte, secondByte);
096: } else { // if (byteOrder == LITTLE_ENDIAN)
097: return mergeBytesLittleEndian(firstByte, secondByte);
098: }
099: }
100:
101: /**
102: * If the two argument bytes represent a Byte Order Mark (BOM),
103: * set the byteOrder member to the corresponding byte order constant;
104: * else set it to the default byte order.
105: * @param firstByte the first of two bytes representing a char or BOM
106: * @param secondByte the second of two bytes representing a char or BOM
107: * @return true if it was a byte order mark, false it it was data
108: */
109: protected boolean bomDetect(int firstByte, int secondByte) {
110: if (firstByte == 0xFE && secondByte == 0xFF) {
111: byteOrder = BIG_ENDIAN;
112: return true;
113: } else if (firstByte == 0xFF && secondByte == 0xFE) {
114: byteOrder = LITTLE_ENDIAN;
115: return true;
116: } else { // default
117: // The UTF-16 FAQ says that in absence of BOM
118: // big-endian byte serialization is used.
119: byteOrder = BIG_ENDIAN;
120: return false;
121: }
122: }
123:
124: /**
125: * Read a block of UTF16 characters.
126: *
127: * @param cbuf output buffer for converted characters read
128: * @param off initial offset into the provided buffer
129: * @param len length of characters in the buffer
130: * @return the number of converted characters
131: * @exception IOException is thrown if the input stream
132: * could not be read for the raw unconverted character
133: */
134: public int read(char cbuf[], int off, int len) throws IOException {
135: int count = 0;
136: int secondByte;
137: if (len == 0) {
138: return 0;
139: }
140:
141: if (firstByte == -1) {
142: firstByte = in.read();
143: }
144: for (; count < len; firstByte = in.read()) {
145: if (-1 == firstByte || -1 == (secondByte = in.read())) {
146: return (0 == count) ? -1 : count;
147: }
148:
149: if (byteOrder == UNKNOWN_BYTE_ORDER) {
150: // only for the first two bytes: examine BOM
151: final boolean itWasBOM = bomDetect(firstByte,
152: secondByte);
153: if (!itWasBOM) {
154: cbuf[off + count] = mergeBytes(firstByte,
155: secondByte);
156: count++;
157: }
158: } else {
159: cbuf[off + count] = mergeBytes(firstByte, secondByte);
160: count++;
161: }
162: }
163: return count;
164: }
165:
166: /**
167: * Tell whether this reader supports the mark() operation.
168: * The implementation always returns false because it does not
169: * support mark().
170: *
171: * @return false
172: */
173: public boolean markSupported() {
174: /*
175: * For readers mark() is in characters; UTF-16 is easier than UTF-8,
176: * but it's not supported yet.
177: * So this reader does not support mark at this time.
178: */
179: return false;
180: }
181:
182: /**
183: * Mark a read ahead character is not supported for UTF16
184: * readers.
185: * @param readAheadLimit number of characters to buffer ahead
186: * @exception IOException is thrown, for all calls to this method
187: * because marking is not supported for UTF16 readers
188: */
189: public void mark(int readAheadLimit) throws IOException {
190: throw new IOException("mark() not supported");
191: }
192:
193: /**
194: * Reset the read ahead marks is not supported for UTF16 readers.
195: * @exception IOException is thrown, for all calls to this method
196: * because marking is not supported for UTF16 readers
197: */
198: public void reset() throws IOException {
199: throw new IOException("reset() not supported");
200: }
201:
202: /**
203: * Get the size in chars of an array of bytes.
204: *
205: * @param array Source buffer
206: * @param offset Offset at which to start counting characters
207: * @param length number of bytes to use for counting
208: *
209: * @return number of characters that would be converted
210: */
211: /*
212: * This method is only used by our internal Helper class in the method
213: * byteToCharArray to know how much to allocate before using a
214: * reader. If we encounter bad encoding we should return a count
215: * that includes that character so the reader will throw an IOException
216: */
217: public int sizeOf(byte[] array, int offset, int length) {
218: int b1 = 0xff & array[0];
219: int b2 = 0xff & array[1];
220: if ((b1 == 0xfe && b2 == 0xff) || (b1 == 0xff && b2 == 0xfe)) {
221: // do not count BOM, it's not a part of data
222: return length / 2 - 1;
223: }
224: return length / 2;
225: }
226: }
|