// Reader for UCS-2 and UCS-4 encodings (i.e., encodings from ISO-10646-UCS-(2|4)). Category: File Reader « File Input Output


  

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 * 

 *      http://www.apache.org/licenses/LICENSE-2.0

 * 

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */



import java.io.IOException;

import java.io.InputStream;

import java.io.Reader;



/**

 * Reader for UCS-2 and UCS-4 encodings. (i.e., encodings from

 * ISO-10646-UCS-(2|4)).

 * 

 * @xerces.internal

 * 

 * @author Neil Graham, IBM

 * 

 * @version $Id: UCSReader.java 449317 2006-09-23 22:12:30Z mrglavas $

 */

public class UCSReader extends Reader {

  //
  // Constants
  //

  /**
   * Default byte buffer size (8192, larger than that of ASCIIReader since it's
   * reasonable to surmise that the average UCS-4-encoded file should be 4 times
   * as large as the average ASCII-encoded file).
   */
  public static final int DEFAULT_BUFFER_SIZE = 8192;

  /** Encoding identifier: UCS-2 (2 bytes per character), little-endian. */
  public static final short UCS2LE = 1;

  /** Encoding identifier: UCS-2 (2 bytes per character), big-endian. */
  public static final short UCS2BE = 2;

  /** Encoding identifier: UCS-4 (4 bytes per character), little-endian. */
  public static final short UCS4LE = 4;

  /** Encoding identifier: UCS-4 (4 bytes per character), big-endian. */
  public static final short UCS4BE = 8;

  //
  // Data
  //

  /** Input stream. */
  protected final InputStream fInputStream;

  /** Byte buffer used by {@link #read(char[], int, int)} to stage raw bytes. */
  protected final byte[] fBuffer;

  /**
   * What kind of data we're dealing with. Any value {@code >= 4} is treated as
   * UCS-4, anything smaller as UCS-2.
   */
  protected final short fEncoding;

  //
  // Constructors
  //

  /**
   * Constructs a UCS reader from the specified input stream using the default
   * buffer size. The Endian-ness and whether this is UCS-2 or UCS-4 needs also
   * to be known in advance.
   *
   * @param inputStream
   *          The input stream.
   * @param encoding
   *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
   */
  public UCSReader(InputStream inputStream, short encoding) {
    this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
  } // <init>(InputStream, short)

  /**
   * Constructs a UCS reader from the specified input stream and buffer size.
   * The Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known
   * in advance.
   *
   * @param inputStream
   *          The input stream.
   * @param size
   *          The initial buffer size.
   * @param encoding
   *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
   */
  public UCSReader(InputStream inputStream, int size, short encoding) {
    this(inputStream, new byte[size], encoding);
  } // <init>(InputStream,int,short)

  /**
   * Constructs a UCS reader from the specified input stream and buffer. The
   * Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known in
   * advance.
   *
   * @param inputStream
   *          The input stream.
   * @param buffer
   *          The byte buffer.
   * @param encoding
   *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
   */
  public UCSReader(InputStream inputStream, byte[] buffer, short encoding) {
    fInputStream = inputStream;
    fBuffer = buffer;
    fEncoding = encoding;
  } // <init>(InputStream,byte[],short)

  //
  // Reader methods
  //

  /**
   * Read a single character. This method will block until a character is
   * available, an I/O error occurs, or the end of the stream is reached.
   *
   * <p>
   * The bytes are pulled one at a time straight from the underlying stream
   * (the byte buffer is not used). For UCS-4 the four bytes are combined into
   * a single int and returned as-is, without being narrowed to a char.
   *
   * @return The character read, or -1 if the end of the stream has been
   *         reached (including when the stream ends partway through a
   *         character)
   *
   * @exception IOException
   *              If an I/O error occurs
   */
  public int read() throws IOException {
    // End of stream must be tested on the raw return value: once masked with
    // 0xff, -1 is indistinguishable from a genuine 0xFF data byte.
    int b0 = fInputStream.read();
    if (b0 == -1) {
      return -1;
    }
    int b1 = fInputStream.read();
    if (b1 == -1) {
      return -1;
    }
    b0 &= 0xff;
    b1 &= 0xff;
    // UCS-4
    if (fEncoding >= 4) {
      int b2 = fInputStream.read();
      if (b2 == -1) {
        return -1;
      }
      int b3 = fInputStream.read();
      if (b3 == -1) {
        return -1;
      }
      b2 &= 0xff;
      b3 &= 0xff;
      if (fEncoding == UCS4BE) {
        return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
      }
      return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
    }
    // UCS-2
    if (fEncoding == UCS2BE) {
      return (b0 << 8) + b1;
    }
    return (b1 << 8) + b0;
  } // read():int

  /**
   * Read characters into a portion of an array. This method will block until
   * some input is available, an I/O error occurs, or the end of the stream is
   * reached.
   *
   * <p>
   * At most as many characters as fit in the byte buffer are read per call. If
   * the underlying stream hands back a partial character, the missing bytes
   * are read individually; if the stream ends first, the character is padded
   * with zero bytes. UCS-4 values are narrowed to a char by a plain cast.
   *
   * @param ch
   *          Destination buffer
   * @param offset
   *          Offset at which to start storing characters
   * @param length
   *          Maximum number of characters to read
   *
   * @return The number of characters read, or -1 if the end of the stream has
   *         been reached
   *
   * @exception IOException
   *              If an I/O error occurs
   */
  public int read(char ch[], int offset, int length) throws IOException {
    final int shift = charShift();
    final int width = 1 << shift;

    // Number of whole characters the byte buffer can hold. Working in whole
    // characters keeps the byte count a multiple of the character width (so
    // completing a partial character can never run past the buffer) and
    // avoids int overflow of length << shift for very large lengths.
    final int charCapacity = fBuffer.length >> shift;
    if (length > 0 && charCapacity == 0) {
      // The buffer cannot hold even one character; decode a single character
      // directly from the stream instead.
      int c = read();
      if (c == -1) {
        return -1;
      }
      ch[offset] = (char) c;
      return 1;
    }
    int byteLength = Math.min(length, charCapacity) << shift;

    int count = fInputStream.read(fBuffer, 0, byteLength);
    if (count == -1) {
      return -1;
    }

    // make count a multiple of the character width
    int remainder = count & (width - 1);
    if (remainder != 0) {
      boolean eof = false;
      for (int missing = width - remainder; missing > 0; missing--) {
        int b = eof ? -1 : fInputStream.read();
        if (b == -1) { // end of input; something likely went wrong! Pad with nulls.
          eof = true;
          b = 0;
        }
        // store first, then advance: the byte belongs at index count
        fBuffer[count++] = (byte) b;
      }
    }

    // now count is a multiple of the right number of bytes
    int numChars = count >> shift;
    int curPos = 0;
    for (int i = 0; i < numChars; i++) {
      int b0 = fBuffer[curPos++] & 0xff;
      int b1 = fBuffer[curPos++] & 0xff;
      // UCS-4
      if (fEncoding >= 4) {
        int b2 = fBuffer[curPos++] & 0xff;
        int b3 = fBuffer[curPos++] & 0xff;
        if (fEncoding == UCS4BE) {
          ch[offset + i] = (char) ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
        } else {
          ch[offset + i] = (char) ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
        }
      } else { // UCS-2
        if (fEncoding == UCS2BE) {
          ch[offset + i] = (char) ((b0 << 8) + b1);
        } else {
          ch[offset + i] = (char) ((b1 << 8) + b0);
        }
      }
    }
    return numChars;
  } // read(char[],int,int)

  /**
   * Skip characters. This method will block until some characters are
   * available, an I/O error occurs, or the end of the stream is reached.
   *
   * <p>
   * If the underlying stream stops in the middle of a character, the rest of
   * that character is consumed so that subsequent reads stay aligned on a
   * character boundary; the partially skipped character is counted.
   *
   * @param n
   *          The number of characters to skip
   *
   * @return The number of characters actually skipped
   *
   * @exception IOException
   *              If an I/O error occurs
   */
  public long skip(long n) throws IOException {
    // shift is the number of bits to move n leftward to get the number of
    // bytes to skip, and to move the result rightward to get the number of
    // characters effectively skipped.
    final int shift = charShift();
    final int mask = (1 << shift) - 1;

    // clamp so that n << shift cannot overflow
    final long maxChars = Long.MAX_VALUE >> shift;
    final long bytesToSkip = ((n > maxChars) ? maxChars : n) << shift;

    long bytesSkipped = fInputStream.skip(bytesToSkip);
    int remainder = (int) (bytesSkipped & mask);
    if (remainder == 0) {
      return bytesSkipped >> shift;
    }
    // Stopped mid-character: swallow its remaining bytes (or stop at EOF).
    for (int i = remainder; i <= mask; i++) {
      if (fInputStream.read() == -1) {
        break;
      }
    }
    return (bytesSkipped >> shift) + 1;
  } // skip(long):long

  /**
   * Tell whether this stream is ready to be read.
   *
   * @return This implementation always returns false. Note that returning
   *         false does not guarantee that the next read will block.
   *
   * @exception IOException
   *              If an I/O error occurs
   */
  public boolean ready() throws IOException {
    return false;
  } // ready()

  /**
   * Tell whether this stream supports the mark() operation. The answer is
   * delegated to the underlying input stream.
   */
  public boolean markSupported() {
    return fInputStream.markSupported();
  } // markSupported()

  /**
   * Mark the present position in the stream. Subsequent calls to reset() will
   * attempt to reposition the stream to this point. Not all character-input
   * streams support the mark() operation.
   *
   * @param readAheadLimit
   *          Limit on the number of characters that may be read while still
   *          preserving the mark. After reading this many characters,
   *          attempting to reset the stream may fail.
   *
   * @exception IOException
   *              If the stream does not support mark(), or if some other I/O
   *              error occurs
   */
  public void mark(int readAheadLimit) throws IOException {
    // The caller's limit is in characters; the underlying stream counts
    // bytes, so scale by the character width and saturate at int range.
    long byteLimit = ((long) readAheadLimit) << charShift();
    fInputStream.mark((int) Math.min(byteLimit, (long) Integer.MAX_VALUE));
  } // mark(int)

  /**
   * Reset the stream. If the stream has been marked, then attempt to reposition
   * it at the mark. If the stream has not been marked, then attempt to reset it
   * in some way appropriate to the particular stream, for example by
   * repositioning it to its starting point. Not all character-input streams
   * support the reset() operation, and some support reset() without supporting
   * mark().
   *
   * @exception IOException
   *              If the stream has not been marked, or if the mark has been
   *              invalidated, or if the stream does not support reset(), or if
   *              some other I/O error occurs
   */
  public void reset() throws IOException {
    fInputStream.reset();
  } // reset()

  /**
   * Close the stream by closing the underlying input stream.
   *
   * @exception IOException
   *              If an I/O error occurs
   */
  public void close() throws IOException {
    fInputStream.close();
  } // close()

  //
  // Private methods
  //

  /** Returns log2 of the character width in bytes: 2 for UCS-4, 1 for UCS-2. */
  private int charShift() {
    return (fEncoding >= 4) ? 2 : 1;
  } // charShift():int

} // class UCSReader
// Reader for UCS-2 and UCS-4 encodings (i.e., encodings from ISO-10646-UCS-(2|4)). Category: File Reader « File Input Output « Java