/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
 * Reader for UCS-2 and UCS-4 encodings (i.e., encodings from
 * ISO-10646-UCS-(2|4)).
 *
 * <p>
 * Every character occupies a fixed number of bytes on the underlying stream:
 * two for UCS-2 and four for UCS-4. The byte order and width are selected by
 * the <code>encoding</code> constructor argument and never change afterwards.
 *
 * <p>
 * Limitation: {@link #read(char[], int, int)} stores each decoded UCS-4 value
 * into a single <code>char</code>, so values above <code>0xFFFF</code> are
 * truncated to their low 16 bits; no surrogate pairs are produced.
 *
 * @xerces.internal
 *
 * @author Neil Graham, IBM
 *
 * @version $Id: UCSReader.java 449317 2006-09-23 22:12:30Z mrglavas $
 */
public class UCSReader extends Reader {

    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader since it's
     * reasonable to surmise that the average UCS-4-encoded file should be 4 times
     * as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /** Encoding identifier: UCS-2, least significant byte first. */
    public static final short UCS2LE = 1;

    /** Encoding identifier: UCS-2, most significant byte first. */
    public static final short UCS2BE = 2;

    /** Encoding identifier: UCS-4, least significant byte first. */
    public static final short UCS4LE = 4;

    /** Encoding identifier: UCS-4, most significant byte first. */
    public static final short UCS4BE = 8;

    //
    // Data
    //

    /** Input stream. */
    protected final InputStream fInputStream;

    /** Byte buffer used by the bulk read to stage raw bytes before decoding. */
    protected final byte[] fBuffer;

    /**
     * What kind of data we're dealing with. Any value &gt;= 4 is treated as
     * UCS-4, anything smaller as UCS-2; only UCS4BE / UCS2BE select big-endian
     * byte order.
     */
    protected final short fEncoding;

    //
    // Constructors
    //

    /**
     * Constructs a UCS reader from the specified input stream using the default
     * buffer size. The Endian-ness and whether this is UCS-2 or UCS-4 needs also
     * to be known in advance.
     *
     * @param inputStream
     *            The input stream.
     * @param encoding
     *            One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs a UCS reader from the specified input stream and buffer size.
     * The Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known
     * in advance.
     *
     * @param inputStream
     *            The input stream.
     * @param size
     *            The initial buffer size, in bytes.
     * @param encoding
     *            One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        this(inputStream, new byte[size], encoding);
    } // <init>(InputStream,int,short)

    /**
     * Constructs a UCS reader from the specified input stream and buffer. The
     * Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known in
     * advance.
     *
     * @param inputStream
     *            The input stream.
     * @param buffer
     *            The byte buffer. It is used as-is (not copied).
     * @param encoding
     *            One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, byte[] buffer, short encoding) {
        fInputStream = inputStream;
        fBuffer = buffer;
        fEncoding = encoding;
    } // <init>(InputStream,byte[],short)

    //
    // Reader methods
    //

    /**
     * Read a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>
     * The bytes are taken directly from the underlying stream, bypassing the
     * byte buffer. If the stream ends in the middle of a character, the
     * partial character is discarded and -1 is returned.
     *
     * @return The character read, or -1 if the end of the stream has been
     *         reached. For UCS-2 the value is in the range 0 to 65535
     *         (<code>0x0000-0xffff</code>); for UCS-4 it is the 32-bit value
     *         assembled from the four bytes read.
     *
     * @exception IOException
     *                If an I/O error occurs
     */
    public int read() throws IOException {
        // End of stream must be detected BEFORE looking at the byte value:
        // 0xFF is a perfectly valid data byte (e.g. the FF FE byte order mark).
        final int b0 = fInputStream.read();
        if (b0 == -1) {
            return -1;
        }
        final int b1 = fInputStream.read();
        if (b1 == -1) {
            return -1;
        }
        if (fEncoding < 4) {
            return ucs2(b0, b1);
        }
        final int b2 = fInputStream.read();
        if (b2 == -1) {
            return -1;
        }
        final int b3 = fInputStream.read();
        if (b3 == -1) {
            return -1;
        }
        return ucs4(b0, b1, b2, b3);
    } // read():int

    /**
     * Read characters into a portion of an array. This method will block until
     * some input is available, an I/O error occurs, or the end of the stream is
     * reached.
     *
     * <p>
     * At most as many characters as fit in the byte buffer are decoded per
     * call. If the stream ends in the middle of a character, the missing
     * bytes are treated as zero.
     *
     * @param ch
     *            Destination buffer
     * @param offset
     *            Offset at which to start storing characters
     * @param length
     *            Maximum number of characters to read
     *
     * @return The number of characters read, or -1 if the end of the stream has
     *         been reached
     *
     * @exception IOException
     *                If an I/O error occurs
     * @exception IndexOutOfBoundsException
     *                If <code>offset</code> or <code>length</code> is negative,
     *                or <code>offset + length</code> exceeds the size of
     *                <code>ch</code>
     */
    public int read(char ch[], int offset, int length) throws IOException {
        // Reject bad ranges up front so no input is consumed and then lost.
        if (offset < 0 || length < 0 || length > ch.length - offset) {
            throw new IndexOutOfBoundsException(
                    "offset=" + offset + ", length=" + length + ", array size=" + ch.length);
        }

        final int shift = charShift();
        final int bytesPerChar = 1 << shift;

        // Number of whole characters the byte buffer can hold.
        final int maxChars = fBuffer.length >> shift;
        if (maxChars == 0) {
            // The buffer cannot hold even one encoded character; decode a
            // single character straight from the stream instead.
            if (length == 0) {
                return 0;
            }
            final int single = read();
            if (single == -1) {
                return -1;
            }
            ch[offset] = (char) single;
            return 1;
        }

        // Always request a whole number of characters. This keeps the request
        // within the buffer, avoids int overflow of (length << shift), and
        // guarantees there is room to complete a partially read character.
        final int byteLength = Math.min(length, maxChars) << shift;
        int count = fInputStream.read(fBuffer, 0, byteLength);
        if (count == -1) {
            return -1;
        }

        // The stream may have delivered part of a character. Fetch the missing
        // bytes one at a time; if the input ends first, pad with zero bytes.
        boolean exhausted = false;
        while ((count & (bytesPerChar - 1)) != 0) {
            int next = exhausted ? -1 : fInputStream.read();
            if (next == -1) {
                exhausted = true;
                next = 0;
            }
            fBuffer[count++] = (byte) next;
        }

        // count is now a multiple of the character width.
        final int numChars = count >> shift;
        int pos = 0;
        for (int i = 0; i < numChars; i++, pos += bytesPerChar) {
            final int b0 = fBuffer[pos] & 0xff;
            final int b1 = fBuffer[pos + 1] & 0xff;
            if (shift == 1) {
                ch[offset + i] = (char) ucs2(b0, b1);
            } else {
                final int b2 = fBuffer[pos + 2] & 0xff;
                final int b3 = fBuffer[pos + 3] & 0xff;
                // The cast keeps only the low 16 bits of the UCS-4 value.
                ch[offset + i] = (char) ucs4(b0, b1, b2, b3);
            }
        }
        return numChars;
    } // read(char[],int,int)

    /**
     * Skip characters. This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>
     * If the underlying stream stops skipping in the middle of a character,
     * the rest of that character is consumed as well, so that subsequent reads
     * start on a character boundary; that character counts as skipped.
     *
     * @param n
     *            The number of characters to skip
     *
     * @return The number of characters actually skipped
     *
     * @exception IOException
     *                If an I/O error occurs
     */
    public long skip(long n) throws IOException {
        final int shift = charShift();
        final int bytesPerChar = 1 << shift;

        // Clamp so that converting characters to bytes cannot overflow a long.
        final long chars = Math.min(n, Long.MAX_VALUE >> shift);
        final long bytesSkipped = fInputStream.skip(chars << shift);

        final int partial = (int) (bytesSkipped & (bytesPerChar - 1));
        if (partial == 0) {
            return bytesSkipped >> shift;
        }
        if (bytesSkipped > 0) {
            // Consume the remainder of the partially skipped character so the
            // stream stays aligned on a character boundary.
            for (int i = partial; i < bytesPerChar; i++) {
                if (fInputStream.read() == -1) {
                    break;
                }
            }
        }
        return (bytesSkipped >> shift) + 1;
    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return This implementation always returns false. Note that returning
     *         false does not guarantee that the next read will block.
     *
     * @exception IOException
     *                If an I/O error occurs
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation, which is the
     * case exactly when the underlying input stream supports it.
     */
    public boolean markSupported() {
        return fInputStream.markSupported();
    } // markSupported()

    /**
     * Mark the present position in the stream. Subsequent calls to reset() will
     * attempt to reposition the stream to this point. Not all character-input
     * streams support the mark() operation.
     *
     * @param readAheadLimit
     *            Limit on the number of characters that may be read while still
     *            preserving the mark. After reading this many characters,
     *            attempting to reset the stream may fail.
     *
     * @exception IOException
     *                If the stream does not support mark(), or if some other I/O
     *                error occurs
     */
    public void mark(int readAheadLimit) throws IOException {
        // The caller's limit is in characters but the underlying stream counts
        // bytes, so scale it by the character width (saturating at
        // Integer.MAX_VALUE). Non-positive limits are passed through unchanged.
        int byteLimit = readAheadLimit;
        if (readAheadLimit > 0) {
            final long scaled = ((long) readAheadLimit) << charShift();
            byteLimit = (int) Math.min(scaled, (long) Integer.MAX_VALUE);
        }
        fInputStream.mark(byteLimit);
    } // mark(int)

    /**
     * Reset the stream. If the stream has been marked, then attempt to reposition
     * it at the mark. If the stream has not been marked, then attempt to reset it
     * in some way appropriate to the particular stream, for example by
     * repositioning it to its starting point. Not all character-input streams
     * support the reset() operation, and some support reset() without supporting
     * mark().
     *
     * @exception IOException
     *                If the stream has not been marked, or if the mark has been
     *                invalidated, or if the stream does not support reset(), or if
     *                some other I/O error occurs
     */
    public void reset() throws IOException {
        fInputStream.reset();
    } // reset()

    /**
     * Close the stream by closing the underlying input stream.
     *
     * @exception IOException
     *                If an I/O error occurs
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Returns log2 of the number of bytes per character: 2 for UCS-4
     * (four bytes), 1 for UCS-2 (two bytes).
     */
    private int charShift() {
        return (fEncoding >= 4) ? 2 : 1;
    } // charShift():int

    /** Combines two bytes (each 0-255, in stream order) into a UCS-2 value. */
    private int ucs2(int b0, int b1) {
        if (fEncoding == UCS2BE) {
            return (b0 << 8) + b1;
        }
        return (b1 << 8) + b0;
    } // ucs2(int,int):int

    /** Combines four bytes (each 0-255, in stream order) into a UCS-4 value. */
    private int ucs4(int b0, int b1, int b2, int b3) {
        if (fEncoding == UCS4BE) {
            return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
        }
        return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
    } // ucs4(int,int,int,int):int

} // class UCSReader
|