// Read Windows Notepad Unicode files : UTF8 Byte Hex « Development Class


 



/*

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 */



import java.io.IOException;

import java.io.InputStream;

import java.io.PushbackInputStream;



/**
 * An input stream that is aware of Unicode byte order marks (BOMs). This
 * allows you to e.g. read Windows Notepad Unicode files as Velocity templates.
 *
 * <p>The BOM is examined once, while the stream is being constructed. The
 * encoding it announces can be queried afterwards with
 * {@link #getEncodingFromStream()}.
 *
 * <p>This class is not thread safe! When more than one thread wants to use an
 * instance of UnicodeInputStream, the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream extends InputStream {

  /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
  public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte[] { (byte) 0xef,
      (byte) 0xbb, (byte) 0xbf });

  /**
   * BOM Marker for UTF 16, little endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   */
  public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte[] { (byte) 0xff,
      (byte) 0xfe });

  /**
   * BOM Marker for UTF 16, big endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   */
  public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte[] { (byte) 0xfe,
      (byte) 0xff });

  /**
   * BOM Marker for UTF 32, little endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   *
   * TODO: Does Java actually support this? Whether the "UTF-32LE" charset
   * name is available on a given runtime should be verified by the caller.
   */
  public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte[] { (byte) 0xff,
      (byte) 0xfe, (byte) 0x00, (byte) 0x00 });

  /**
   * BOM Marker for UTF 32, big endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   *
   * TODO: Does Java actually support this? Whether the "UTF-32BE" charset
   * name is available on a given runtime should be verified by the caller.
   */
  public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte[] { (byte) 0x00,
      (byte) 0x00, (byte) 0xfe, (byte) 0xff });

  /** The maximum amount of bytes to read for a BOM */
  private static final int MAX_BOM_SIZE = 4;

  /** Buffer holding the bytes consumed from the stream while looking for a BOM. */
  private final byte[] buf = new byte[MAX_BOM_SIZE];

  /** Number of valid bytes currently stored in {@link #buf}. */
  private int pos = 0;

  /** The stream encoding as read from the BOM or null. */
  private final String encoding;

  /** True if the BOM itself should be skipped and not read. */
  private final boolean skipBOM;

  /** Wrapped stream; the pushback capacity covers the longest BOM. */
  private final PushbackInputStream inputStream;

  /**
   * Creates a new UnicodeInputStream object. Skips a BOM which defines the file
   * encoding.
   *
   * @param inputStream
   *          The input stream to use for reading.
   * @throws IllegalStateException
   *           When the BOM could not be read from the stream; the underlying
   *           {@link IOException} is attached as the cause.
   * @throws IOException
   *           Declared for backward compatibility with existing callers.
   */
  public UnicodeInputStream(final InputStream inputStream) throws IllegalStateException,
      IOException {
    this(inputStream, true);
  }

  /**
   * Creates a new UnicodeInputStream object.
   *
   * @param inputStream
   *          The input stream to use for reading.
   * @param skipBOM
   *          If this is set to true, a BOM read from the stream is discarded.
   *          This parameter should normally be true.
   * @throws IllegalStateException
   *           When the BOM could not be read from the stream; the underlying
   *           {@link IOException} is attached as the cause.
   * @throws IOException
   *           Declared for backward compatibility with existing callers.
   */
  public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
      throws IllegalStateException, IOException {
    super();

    this.skipBOM = skipBOM;
    this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

    try {
      this.encoding = readEncoding();
    } catch (IOException ioe) {
      // Keep the original failure as the cause so callers can diagnose it.
      throw new IllegalStateException("Could not read BOM from Stream", ioe);
    }
  }

  /**
   * Returns true if the input stream discards the BOM.
   *
   * @return True if the input stream discards the BOM.
   */
  public boolean isSkipBOM() {
    return skipBOM;
  }

  /**
   * Returns the encoding that was detected from the BOM when this stream was
   * constructed.
   *
   * @return The encoding based on the BOM, or null if no BOM was found.
   */
  public String getEncodingFromStream() {
    return encoding;
  }

  /**
   * This method gets the encoding from the stream contents if a BOM exists. If
   * no BOM exists, the encoding is undefined.
   *
   * <p>All bytes consumed during detection are pushed back onto the stream,
   * except for the BOM itself when a BOM matched and BOM skipping is enabled.
   *
   * @return The encoding of this stream's contents as decided by the BOM or null
   *         if no BOM was found.
   * @throws IOException
   *           When reading from or pushing back to the underlying stream fails.
   */
  protected String readEncoding() throws IOException {
    pos = 0;

    UnicodeBOM bom = null;

    // read first byte.
    if (readByte()) {
      // The first byte selects the candidate BOM(s):
      //
      // 00 00 FE FF --> UTF 32 BE
      // EF BB BF --> UTF 8
      // FE FF --> UTF 16 BE
      // FF FE --> UTF 16 LE
      // FF FE 00 00 --> UTF 32 LE

      switch (buf[0]) {
      case (byte) 0x00: // UTF32 BE
        bom = match(UTF32BE_BOM, null);
        break;
      case (byte) 0xef: // UTF8
        bom = match(UTF8_BOM, null);
        break;
      case (byte) 0xfe: // UTF16 BE
        bom = match(UTF16BE_BOM, null);
        break;
      case (byte) 0xff: // UTF16/32 LE
        bom = match(UTF16LE_BOM, null);

        // The UTF-32LE BOM starts with the UTF-16LE BOM, so the longer marker
        // is only tried once the shorter one matched; UTF-16LE stays the
        // result when the extra bytes do not fit.
        if (bom != null) {
          bom = match(UTF32LE_BOM, bom);
        }
        break;

      default:
        bom = null;
        break;
      }
    }

    pushback(bom);

    return (bom != null) ? bom.getEncoding() : null;
  }

  /**
   * Compares the stream's leading bytes with a BOM, reading further bytes
   * into the buffer only as far as the comparison needs them.
   *
   * @param matchEncoding
   *          The BOM to test for.
   * @param noMatchEncoding
   *          The value to return when the stream ends early or a byte differs.
   * @return matchEncoding if every BOM byte matched, noMatchEncoding otherwise.
   * @throws IOException
   *           When reading from the underlying stream fails.
   */
  private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
      throws IOException {
    byte[] bom = matchEncoding.getBytes();

    for (int i = 0; i < bom.length; i++) {
      // Byte has not yet been read; end of stream means no match.
      if (pos <= i && !readByte()) {
        return noMatchEncoding;
      }

      if (bom[i] != buf[i]) {
        return noMatchEncoding;
      }
    }

    return matchEncoding;
  }

  /**
   * Reads one byte from the underlying stream into the BOM buffer.
   *
   * @return true if a byte was stored, false if the end of the stream was
   *         reached.
   * @throws IOException
   *           When the buffer is already full or the underlying read fails.
   */
  private boolean readByte() throws IOException {
    // Check the capacity before touching the stream, so that no byte is
    // consumed which could neither be stored nor pushed back.
    if (pos >= buf.length) {
      throw new IOException("BOM read error");
    }

    int res = inputStream.read();
    if (res == -1) {
      return false;
    }

    buf[pos++] = (byte) res;
    return true;
  }

  /**
   * Returns the buffered bytes to the underlying stream.
   *
   * @param matchBOM
   *          The BOM that matched, or null if none did.
   * @throws IOException
   *           When pushing the bytes back fails.
   */
  private void pushback(final UnicodeBOM matchBOM) throws IOException {
    int count = pos; // By default, all bytes are pushed back.
    int start = 0;

    if (matchBOM != null && skipBOM) {
      // We have a match (some bytes are part of the BOM)
      // and we want to skip the BOM. Push back only the bytes
      // after the BOM.
      start = matchBOM.getBytes().length;
      count = (pos - start);

      if (count < 0) {
        throw new IllegalStateException("Match has more bytes than available!");
      }
    }

    inputStream.unread(buf, start, count);
  }

  /**
   * @see java.io.InputStream#close()
   */
  @Override
  public void close() throws IOException {
    inputStream.close();
  }

  /**
   * @see java.io.InputStream#available()
   */
  @Override
  public int available() throws IOException {
    return inputStream.available();
  }

  /**
   * @see java.io.InputStream#mark(int)
   */
  @Override
  public void mark(final int readlimit) {
    inputStream.mark(readlimit);
  }

  /**
   * @see java.io.InputStream#markSupported()
   */
  @Override
  public boolean markSupported() {
    return inputStream.markSupported();
  }

  /**
   * @see java.io.InputStream#read()
   */
  @Override
  public int read() throws IOException {
    return inputStream.read();
  }

  /**
   * @see java.io.InputStream#read(byte[])
   */
  @Override
  public int read(final byte[] b) throws IOException {
    return inputStream.read(b);
  }

  /**
   * @see java.io.InputStream#read(byte[], int, int)
   */
  @Override
  public int read(final byte[] b, final int off, final int len) throws IOException {
    return inputStream.read(b, off, len);
  }

  /**
   * @see java.io.InputStream#reset()
   */
  @Override
  public void reset() throws IOException {
    inputStream.reset();
  }

  /**
   * @see java.io.InputStream#skip(long)
   */
  @Override
  public long skip(final long n) throws IOException {
    return inputStream.skip(n);
  }

  /**
   * Helper class to bundle encoding and BOM marker.
   *
   * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
   * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
   */
  static final class UnicodeBOM {
    /** Name of the encoding announced by this BOM. */
    private final String encoding;

    /** The byte sequence of the BOM. */
    private final byte[] bytes;

    private UnicodeBOM(final String encoding, final byte[] bytes) {
      this.encoding = encoding;
      this.bytes = bytes;
    }

    /**
     * @return The name of the encoding announced by this BOM.
     */
    String getEncoding() {
      return encoding;
    }

    /**
     * @return The BOM bytes. This is the internal array, not a copy, so it
     *         must not be modified.
     */
    byte[] getBytes() {
      return bytes;
    }
  }
}
// Read Windows Notepad Unicode files : UTF8 Byte Hex « Development Class « Java