01: package fri.util.io;
02:
03: import java.io.*;
04:
/**
 * A {@link Reader} over an {@link InputStream} that detects a Unicode byte
 * order mark (BOM) at the start of the stream, skips it, and decodes the rest
 * of the stream with the encoding the BOM denotes. See
 * http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <pre>
 * 00 00 FE FF = UTF-32, big-endian
 * FF FE 00 00 = UTF-32, little-endian
 * FE FF       = UTF-16, big-endian
 * FF FE       = UTF-16, little-endian
 * EF BB BF    = UTF-8
 * </pre>
 *
 * When no BOM is present, nothing is skipped and the stream is decoded with
 * the default encoding given to the constructor (or the platform default).
 * A stream starting with {@code FF FE 00 00} is always treated as UTF-32LE,
 * because the four-byte marks are tested before the two-byte ones.
 */
public class UnicodeReader extends Reader {
    /** Length in bytes of the longest BOM that is recognized. */
    private static final int BOM_MAX_SIZE = 4;

    /**
     * Known BOM byte sequences. The four-byte UTF-32 marks come first because
     * the UTF-32LE mark begins with the UTF-16LE mark; testing UTF-16LE first
     * would make UTF-32LE undetectable.
     */
    private static final byte[][] BOMS = {
        {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF},
        {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00},
        {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF},
        {(byte) 0xFE, (byte) 0xFF},
        {(byte) 0xFF, (byte) 0xFE}
    };

    /** Charset names, index-aligned with {@link #BOMS}. */
    private static final String[] BOM_ENCODINGS = {
        "UTF-32BE", "UTF-32LE", "UTF-8", "UTF-16BE", "UTF-16LE"
    };

    private final InputStreamReader delegate;

    /**
     * Creates a reader that uses the platform default encoding when the
     * stream does not start with a BOM.
     *
     * @param in the byte stream to decode
     * @throws IOException if reading the first bytes of the stream fails
     */
    public UnicodeReader(InputStream in) throws IOException {
        this(in, null);
    }

    /**
     * Creates a reader that uses {@code defaultEnc} when the stream does not
     * start with a BOM.
     *
     * @param in the byte stream to decode
     * @param defaultEnc charset name used when no BOM is found; {@code null}
     *        selects the platform default encoding
     * @throws IOException if reading the first bytes of the stream fails, or
     *         if the selected encoding is not supported
     */
    public UnicodeReader(InputStream in, String defaultEnc)
            throws IOException {
        this.delegate = openReader(in, defaultEnc);
    }

    /**
     * Returns the name of the encoding actually used for decoding, as reported
     * by {@link InputStreamReader#getEncoding()}. This is the encoding denoted
     * by the BOM when one was found, otherwise the default encoding. The name
     * may be the charset's historical name rather than the name passed in,
     * and is {@code null} once the reader has been closed.
     */
    public String getEncoding() {
        return delegate.getEncoding();
    }

    /**
     * Reads up to four leading bytes, determines whether they start with a
     * BOM, pushes every non-BOM byte back onto the stream and returns a
     * reader using the resulting encoding.
     */
    private static InputStreamReader openReader(InputStream in,
            String defaultEnc) throws IOException {
        PushbackInputStream pushback =
                new PushbackInputStream(in, BOM_MAX_SIZE);
        byte[] head = new byte[BOM_MAX_SIZE];
        int n = 0;

        // A single read() may return fewer bytes than requested, so keep
        // reading - but only while the bytes seen so far could still grow
        // into a BOM. This avoids blocking on streams that clearly carry none.
        while (n < BOM_MAX_SIZE && isProperPrefixOfSomeBom(head, n)) {
            int count = pushback.read(head, n, BOM_MAX_SIZE - n);
            if (count <= 0) {
                break; // end of stream
            }
            n += count;
        }

        String encoding = defaultEnc;
        int bomLength = 0;
        for (int i = 0; i < BOMS.length; i++) {
            // Only bytes really read may take part in the comparison; the
            // zero padding of a short read must never count as BOM bytes.
            if (n >= BOMS[i].length
                    && startsWith(head, BOMS[i], BOMS[i].length)) {
                encoding = BOM_ENCODINGS[i];
                bomLength = BOMS[i].length;
                break;
            }
        }

        // Hand back everything that followed the BOM (or all bytes if none).
        if (n > bomLength) {
            pushback.unread(head, bomLength, n - bomLength);
        }

        // Use BOM or default encoding; null means platform default.
        if (encoding == null) {
            return new InputStreamReader(pushback);
        }
        return new InputStreamReader(pushback, encoding);
    }

    /**
     * Tells whether the first {@code count} bytes of {@code head} are the
     * beginning of a BOM that is longer than {@code count} bytes.
     */
    private static boolean isProperPrefixOfSomeBom(byte[] head, int count) {
        for (int i = 0; i < BOMS.length; i++) {
            if (BOMS[i].length > count && startsWith(head, BOMS[i], count)) {
                return true;
            }
        }
        return false;
    }

    /** Compares the first {@code count} bytes of the two arrays. */
    private static boolean startsWith(byte[] head, byte[] bom, int count) {
        for (int i = 0; i < count; i++) {
            if (head[i] != bom[i]) {
                return false;
            }
        }
        return true;
    }

    /** Closes the delegate reader and thereby the underlying stream. */
    @Override
    public void close() throws IOException {
        delegate.close();
    }

    /** Reads decoded characters from the delegate reader. */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        return delegate.read(cbuf, off, len);
    }

}
|