001: /*
002: * :tabSize=8:indentSize=8:noTabs=false:
003: * :folding=explicit:collapseFolds=1:
004: *
005: * Copyright (C) 2007 Kazutoshi Satoda
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License
009: * as published by the Free Software Foundation; either version 2
010: * of the License, or any later version.
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: * GNU General Public License for more details.
015: *
016: * You should have received a copy of the GNU General Public License
017: * along with this program; if not, write to the Free Software
018: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
019: */
020:
021: package org.gjt.sp.jedit.io;
022:
023: //{{{ Imports
024: import java.io.InputStream;
025: import java.io.OutputStream;
026: import java.io.Reader;
027: import java.io.Writer;
028: import java.io.IOException;
029: import java.nio.charset.UnsupportedCharsetException;
030: import java.nio.charset.MalformedInputException;
031: import java.util.Arrays;
032: import java.util.Map;
033: import java.util.HashMap;
034:
035: //}}}
036:
037: /**
038: * Encodings which have BOM at the beginning of byte stream.
039: *
040: * @since 4.3pre10
041: * @author Kazutoshi Satoda
042: */
043: public class EncodingWithBOM implements Encoding {
044: //{{{ Constructor
045: public EncodingWithBOM(String plain) {
046: byte[] bom = bomMap.get(plain);
047: if (bom == null) {
048: throw new UnsupportedCharsetException(plain + " with BOM");
049: }
050: this .plain = new CharsetEncoding(plain);
051: this .bom = bom;
052: } //}}}
053:
054: //{{{ implements Encoding
055: public Reader getTextReader(InputStream in) throws IOException {
056: byte[] actualMark = new byte[bom.length];
057: int count = in.read(actualMark);
058: if (count < bom.length || !Arrays.equals(actualMark, bom)) {
059: throw new MalformedInputException(0);
060: }
061: return plain.getTextReader(in);
062: }
063:
064: public Writer getTextWriter(OutputStream out) throws IOException {
065: out.write(bom);
066: return plain.getTextWriter(out);
067: }
068:
069: //}}}
070:
071: //{{{ class Detector
072: public static class Detector implements EncodingDetector {
073: public String detectEncoding(InputStream sample)
074: throws IOException {
075: byte[] mark = new byte[4];
076: int count = sample.read(mark);
077:
078: byte low = (byte) (BOM16 & 0xff);
079: byte high = (byte) ((BOM16 >> 8) & 0xff);
080: if (count >= 4) {
081: if (mark[0] == low && mark[1] == high
082: && mark[2] == 0x00 && mark[3] == 0x00) {
083: return "X-UTF-32LE-BOM";
084: } else if (mark[0] == 0x00 && mark[1] == 0x00
085: && mark[2] == high && mark[3] == low) {
086: return "X-UTF-32BE-BOM";
087: }
088: }
089: if (count >= 2) {
090: if (mark[0] == low && mark[1] == high) {
091: return "x-UTF-16LE-BOM";
092: } else if (mark[0] == high && mark[1] == low) {
093: // "x-UTF-16BE-BOM" does not available.
094: // But an encoder for "UTF-16" actually uses
095: // big endian with corresponding BOM. It just
096: // works as "UTF-16BE with BOM".
097: return "UTF-16";
098: }
099: }
100:
101: if (count >= UTF8BOM.length) {
102: int i = 0;
103: while (i < UTF8BOM.length) {
104: if (mark[i] != UTF8BOM[i]) {
105: break;
106: }
107: ++i;
108: }
109: if (i == UTF8BOM.length) {
110: return "UTF-8Y";
111: }
112: }
113:
114: return null;
115: }
116: } //}}}
117:
118: //{{{ Private members
119:
120: //{{{ Statics
121: private static final int BOM16 = 0xfeff;
122: private static final byte[] UTF8BOM = { (byte) 0xef, (byte) 0xbb,
123: (byte) 0xbf };
124:
125: private static final Map<String, byte[]> bomMap = new HashMap<String, byte[]>();
126:
127: static {
128: bomMap.put("UTF-8", UTF8BOM);
129:
130: byte low = (byte) (BOM16 & 0xff);
131: byte high = (byte) ((BOM16 >> 8) & 0xff);
132: bomMap.put("UTF-16LE", new byte[] { low, high });
133: bomMap.put("UTF-16BE", new byte[] { high, low });
134: bomMap.put("UTF-32LE", new byte[] { low, high, 0x00, 0x00 });
135: bomMap.put("UTF-32BE", new byte[] { 0x00, 0x00, high, low });
136: }
137: //}}}
138:
139: //{{{ Instance variables
140: private final CharsetEncoding plain;
141: private final byte[] bom;
142: //}}}
143:
144: //}}}
145: }
|