001: package workbench.util;
002:
003: /**
004: * Original pseudocode : Thomas Weidenfeller
005: * Implementation tweaked: Aki Nieminen
006: *
007: * http://www.unicode.org/unicode/faq/utf_bom.html
008: * BOMs:
009: * 00 00 FE FF = UTF-32, big-endian
010: * FF FE 00 00 = UTF-32, little-endian
011: * FE FF = UTF-16, big-endian
012: * FF FE = UTF-16, little-endian
013: * EF BB BF = UTF-8
014: *
015: * Win2k Notepad:
016: * Unicode format = UTF-16LE
017: **/
018: import java.io.*;
019:
/**
 * Generic Unicode text reader that inspects the first bytes of a stream for a
 * byte order mark (BOM) and selects the character encoding accordingly.
 *
 * <p>Recognised signatures:
 * <pre>
 * EF BB BF     UTF-8
 * 00 00 FE FF  UTF-32, big-endian
 * FF FE 00 00  UTF-32, little-endian
 * FE FF        UTF-16, big-endian
 * FF FE        UTF-16, little-endian
 * </pre>
 * When no BOM is present, the encoding passed to the constructor is used; if
 * that is {@code null}, {@link InputStreamReader} falls back to its own default.
 * The BOM bytes themselves are consumed and never returned to the caller.
 */
public class UnicodeReader extends Reader {
    /** Wrapped input; allows the non-BOM look-ahead bytes to be pushed back. */
    PushbackInputStream internalIn;
    /** Decoding reader, created by {@link #init()} once the encoding is known. */
    InputStreamReader internalIn2 = null;
    /** Encoding to use when the stream does not start with a BOM (may be null). */
    String defaultEnc;

    /** Length of the longest BOM, i.e. the number of bytes read ahead. */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a reader on {@code in} and immediately detects the encoding.
     *
     * @param in       the byte stream to decode
     * @param encoding the encoding to fall back to when no BOM is found;
     *                 {@code null} selects the {@link InputStreamReader} default
     * @throws IOException if reading the look-ahead bytes fails or the
     *                     resulting encoding is not supported
     */
    public UnicodeReader(InputStream in, String encoding)
            throws IOException {
        this.internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = encoding;
        this.init();
    }

    /**
     * Returns the fallback encoding given to the constructor.
     *
     * @return the default encoding name, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding actually used for decoding, as reported by the
     * underlying {@link InputStreamReader#getEncoding()}.
     *
     * @return the encoding name, or {@code null} if the decoder has not been
     *         created
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of a BOM are pushed back onto the stream; only the BOM itself is
     * skipped. Does nothing if the decoder already exists.
     *
     * @throws IOException if reading or pushing back bytes fails, or if the
     *                     selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may legally return fewer bytes than requested even
        // when more are available, so keep reading until the look-ahead buffer
        // is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        String encoding;
        int bomLength;
        // The 4-byte UTF-32 signatures must be tested before the 2-byte UTF-16
        // ones: FF FE 00 00 (UTF-32LE) starts with FF FE (UTF-16LE) and would
        // otherwise never be recognised.
        if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read beyond the BOM.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Tells whether the first {@code available} bytes of {@code buf} begin
     * with the given byte signature.
     */
    private static boolean startsWith(byte[] buf, int available, int... signature) {
        if (available < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buf[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the decoding reader, which also closes the wrapped stream. If the
     * decoder was never created, the wrapped stream is closed directly.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else if (internalIn != null) {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        return internalIn2.read(cbuf, off, len);
    }
}
|