001: /*
002: * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $
003: *
004: * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
005: *
006: * Redistribution and use of this software and associated documentation
007: * ("Software"), with or without modification, are permitted provided that the
008: * following conditions are met:
009: * 1. Redistributions of source code must retain copyright statements and
010: * notices. Redistributions must also contain a copy of this document.
011: * 2. Redistributions in binary form must reproduce the above copyright
012: * notice, this list of conditions and the following disclaimer in the
013: * documentation and/or other materials provided with the distribution.
014: * 3. The name "groovy" must not be used to endorse or promote products
015: * derived from this Software without prior written permission of The Codehaus.
016: * For written permission, please contact info@codehaus.org.
017: * 4. Products derived from this Software may not be called "groovy" nor may
018: * "groovy" appear in their names without prior written permission of The
019: * Codehaus. "groovy" is a registered trademark of The Codehaus.
020: * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
021: *
022: * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
023: * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
024: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
025: * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
026: * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
027: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
028: * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
029: * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
030: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
031: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
032: * DAMAGE.
033: *
034: */
035:
036: package groovy.util;
037:
038: import java.io.*;
039: import java.nio.charset.Charset;
040: import java.util.*;
041:
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are sampled, which is usually sufficient to be able
 * to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // create a toolkit for the file and guess its encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null) {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes sampled from the beginning of the file. */
    private static final int BUFFER_SIZE = 4096;

    /** The sampled leading bytes of the file (at most {@link #BUFFER_SIZE} bytes). */
    private byte[] buffer;
    /** Charset returned when the content looks like an 8-bit (or plain ASCII) encoding. */
    private Charset defaultCharset;
    /** Cached result of the guess; <code>null</code> until {@link #getCharset()} is first called. */
    private Charset charset;
    /** When true, the default charset is returned instead of US-ASCII. */
    private boolean enforce8Bit = true;
    /** The file whose encoding is guessed. */
    private File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * The beginning of the file (up to 4KB) is read immediately and kept in memory;
     * the file is closed again before the constructor returns.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        this.buffer = readSample(file);
    }

    /**
     * Reads up to {@link #BUFFER_SIZE} bytes from the beginning of the file.
     *
     * @param file the file to sample.
     * @return an array holding exactly the bytes read; it is shorter than
     *         {@link #BUFFER_SIZE} only when the whole file has been read.
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readSample(File file) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() may return fewer bytes than available,
            // so keep reading until the buffer is full or the end of the file is reached
            while (total < BUFFER_SIZE) {
                int bytesRead = input.read(bytes, total, BUFFER_SIZE - total);
                if (bytesRead == -1) {
                    break;
                }
                total += bytesRead;
            }
            if (total == BUFFER_SIZE) {
                return bytes;
            }
            byte[] trimmed = new byte[total];
            System.arraycopy(bytes, 0, trimmed, 0, total);
            return trimmed;
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort: the stream was only read from, so a failure
                // to close it does not affect the sampled bytes
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * <p>The guess is cached by {@link #getCharset()}, so this method has to be called
     * before the first call to {@link #getCharset()} or {@link #getReader()} to be taken
     * into account.</p>
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>getCharset()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> restores the
     * default charset of the system.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the guessed charset of the file. The guess is computed on the first call
     * and cached for subsequent calls.
     *
     * @return the <code>Charset</code> guessed from the beginning of the file.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * <p>The guess is cached by {@link #getCharset()}, so this method has to be called
     * before the first call to {@link #getCharset()} or {@link #getReader()} to be taken
     * into account.</p>
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when the content is not recognized as Unicode.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the sampled buffer.</p>
     *
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is examined. A multi-byte sequence that is cut off by the end of the
     * buffer is tolerated only when the buffer is full, because in that case the cut may be
     * caused by the sampling limit rather than by the file content.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding
        boolean validU8Char = true;

        int length = buffer.length;
        // a full buffer may end in the middle of a multi-byte character of a longer file
        boolean mayBeTruncated = length == BUFFER_SIZE;
        int i = 0;
        while (validU8Char && i < length) {
            byte lead = buffer[i];
            if (lead < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;
                int expected = expectedContinuationBytes(lead);
                int available = Math.min(expected, length - 1 - i);
                if (expected == 0) {
                    // stray continuation byte, 0xFE or 0xFF: not a valid UTF-8 lead byte
                    validU8Char = false;
                } else if (available < expected && !mayBeTruncated) {
                    // the whole file was read and it ends in the middle of a sequence
                    validU8Char = false;
                } else {
                    // every continuation byte present must be of the form 10xxxxxx
                    for (int k = 1; k <= available; k++) {
                        if (!isContinuationChar(buffer[i + k])) {
                            validU8Char = false;
                            break;
                        }
                    }
                    // skip the continuation bytes that were just validated
                    i += available;
                }
            }
            i++;
        }
        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes that must follow the given lead byte.
     *
     * @param b a byte with its high order bit set.
     * @return a number between 1 and 5 for the first byte of a multi-byte sequence,
     *         or 0 if the byte cannot start a multi-byte sequence.
     */
    private static int expectedContinuationBytes(byte b) {
        if (isTwoBytesSequence(b)) {
            return 1;
        }
        if (isThreeBytesSequence(b)) {
            return 2;
        }
        if (isFourBytesSequence(b)) {
            return 3;
        }
        if (isFiveBytesSequence(b)) {
            return 4;
        }
        if (isSixBytesSequence(b)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return (b & 0xE0) == 0xC0;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return (b & 0xF0) == 0xE0;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return (b & 0xF8) == 0xF0;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return (b & 0xFC) == 0xF8;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return (b & 0xFE) == 0xFC;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // 0xEF 0xBB 0xBF
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian.
     * Only the first two bytes of the buffer are examined.
     *
     * @return true if the buffer has a BOM for UTF-16 Little Endian.
     */
    public boolean hasUTF16LEBom() {
        // 0xFF 0xFE
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian.
     * Only the first two bytes of the buffer are examined.
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // 0xFE 0xFF
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset returned by
     * {@link #getCharset()}. If the file starts with a Byte Order Marker, the marker is skipped
     * so that the first character read is the first character of the content.
     *
     * <p>The caller is responsible for closing the returned reader.</p>
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader = new LineNumberReader(
                new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM is decoded as a single character: consume it
                reader.read();
            } catch (IOException ignored) {
                // best effort: only FileNotFoundException may be thrown from here,
                // so a read failure is left for the caller's first read to hit
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        return (Charset[]) Charset.availableCharsets().values().toArray(new Charset[0]);
    }
}
|