001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025: import java.nio.charset.*;
026: import java.net.*;
027:
028: /**
029: * Based on information in:
030: * http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
031: * http://www.w3.org/TR/html401/charset.html#h-5.2
032: */
033: final class StreamEncodingDetector {
034: private final InputStream inputStream;
035: private String encoding = null;
036: private String encodingSpecificationInfo = null;
037: private boolean definitive = true;
038: private boolean documentSpecifiedEncodingPossible = true;
039:
040: private static final String UTF_16 = "UTF-16";
041: private static final String UTF_16BE = "UTF-16BE";
042: private static final String UTF_16LE = "UTF-16LE";
043: private static final String UTF_8 = "UTF-8";
044: private static final String ISO_8859_1 = "ISO-8859-1";
045: private static final String EBCDIC = "Cp037"; // aka IBM037, not guaranteed, but available on most platforms
046:
047: // All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
048: // Specified explicitly using Byte Order Mark:
049: private static final String SCSU = "SCSU";
050: private static final String UTF_7 = "UTF-7";
051: private static final String UTF_EBCDIC = "UTF-EBCDIC";
052: private static final String BOCU_1 = "BOCU-1";
053: private static final String UTF_32 = "UTF-32";
054: // Guessed from presence of 00 bytes in first four bytes:
055: private static final String UTF_32BE = "UTF-32BE";
056: private static final String UTF_32LE = "UTF-32LE";
057:
058: public StreamEncodingDetector(final URL url) throws IOException {
059: final URLConnection urlConnection = url.openConnection();
060: final HttpURLConnection httpURLConnection = (urlConnection instanceof HttpURLConnection) ? (HttpURLConnection) urlConnection
061: : null;
062: // urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
063: final InputStream urlInputStream = urlConnection
064: .getInputStream();
065: final String contentType = urlConnection.getContentType();
066: if (contentType != null) {
067: encoding = Source
068: .getCharsetParameterFromHttpHeaderValue(contentType);
069: if (encoding != null) {
070: inputStream = urlInputStream;
071: encodingSpecificationInfo = "HTTP header Content-Type: "
072: + contentType;
073: return;
074: }
075: }
076: inputStream = urlInputStream.markSupported() ? urlInputStream
077: : new BufferedInputStream(urlInputStream);
078: init();
079: }
080:
081: public StreamEncodingDetector(final InputStream inputStream)
082: throws IOException {
083: this .inputStream = inputStream.markSupported() ? inputStream
084: : new BufferedInputStream(inputStream);
085: init();
086: }
087:
088: public InputStream getInputStream() {
089: return inputStream;
090: }
091:
092: public String getEncoding() {
093: return encoding;
094: }
095:
096: public String getEncodingSpecificationInfo() {
097: return encodingSpecificationInfo;
098: }
099:
100: public boolean isDifinitive() {
101: return definitive;
102: }
103:
104: public boolean isDocumentSpecifiedEncodingPossible() {
105: return documentSpecifiedEncodingPossible;
106: }
107:
108: public Reader openReader() throws UnsupportedEncodingException {
109: if (encoding == null)
110: return new InputStreamReader(inputStream, ISO_8859_1); // encoding==null only if input stream is empty so use an arbitrary encoding.
111: if (!Charset.isSupported(encoding))
112: throw new UnsupportedEncodingException(encoding + " - "
113: + encodingSpecificationInfo);
114: return new InputStreamReader(inputStream, encoding);
115: }
116:
117: private boolean setEncoding(final String encoding,
118: final String encodingSpecificationInfo) {
119: this .encoding = encoding;
120: this .encodingSpecificationInfo = encodingSpecificationInfo;
121: return true;
122: }
123:
124: private boolean init() throws IOException {
125: inputStream.mark(4);
126: final int b1 = inputStream.read();
127: if (b1 == -1)
128: return setEncoding(null, "empty input stream");
129: final int b2 = inputStream.read();
130: final int b3 = inputStream.read();
131: final int b4 = inputStream.read();
132: inputStream.reset();
133: // Check for Unicode Byte Order Mark:
134: if (b1 == 0xEF) {
135: if (b2 == 0xBB && b3 == 0xBF)
136: return setEncoding(UTF_8,
137: "UTF-8 Byte Order Mark (EF BB BF)");
138: } else if (b1 == 0xFE) {
139: if (b2 == 0xFF)
140: return setEncoding(UTF_16,
141: "UTF-16 big-endian Byte Order Mark (FE FF)");
142: } else if (b1 == 0xFF) {
143: if (b2 == 0xFE) {
144: if (b3 == 0 && b4 == 0)
145: return setEncoding(UTF_32,
146: "UTF-32 little-endian Byte Order Mark (FF EE 00 00)");
147: return setEncoding(UTF_16,
148: "UTF-16 little-endian Byte Order Mark (FF EE)");
149: }
150: } else if (b1 == 0) {
151: if (b2 == 0 && b3 == 0xFE && b4 == 0xFF)
152: return setEncoding(UTF_32,
153: "UTF-32 big-endian Byte Order Mark (00 00 FE FF)");
154: } else if (b1 == 0x0E) {
155: if (b2 == 0xFE && b3 == 0xFF)
156: return setEncoding(SCSU,
157: "SCSU Byte Order Mark (0E FE FF)");
158: } else if (b1 == 0x2B) {
159: if (b2 == 0x2F && b3 == 0x76)
160: return setEncoding(UTF_7,
161: "UTF-7 Byte Order Mark (2B 2F 76)");
162: } else if (b1 == 0xDD) {
163: if (b2 == 0x73 && b3 == 0x66 && b4 == 0x73)
164: return setEncoding(UTF_EBCDIC,
165: "UTF-EBCDIC Byte Order Mark (DD 73 66 73)");
166: } else if (b1 == 0xFB) {
167: if (b2 == 0xEE && b3 == 0x28)
168: return setEncoding(BOCU_1,
169: "BOCU-1 Byte Order Mark (FB EE 28)");
170: }
171: // No Unicode Byte Order Mark found. Have to start guessing.
172: definitive = false;
173: // The best we can do is to provide an encoding that reflects the correct number and ordering of bytes for characters in the ASCII range.
174: // The result will be one of ISO_8859_1, EBCDIC, UTF_16BE, UTF_16LE, UTF_32BE or UTF_32LE.
175: // Assumes 00 bytes indicate multi-byte encodings rather than the presence of NUL characters or characters with a code that is a multiple of 0x100.
176: if (b4 == -1) {
177: // The stream contains between 1 and 3 bytes.
178: // This means the document can't possibly specify the encoding, so make a best guess based on the first 3 bytes.
179: documentSpecifiedEncodingPossible = false;
180: // It might be possible to rule out some encodings based on these bytes, but it is impossible to make a definite determination.
181: // The main thing to determine is whether it is an 8-bit or 16-bit encoding.
182: // In order to guess the most likely encoding, assume that the text contains only ASCII characters, and that any 00 bytes indicate a 16-bit encoding.
183: // The only strictly 8-bit encoding guaranteed to be supported on all java platforms is ISO-8859-1 (UTF-8 uses a variable number of bytes per character).
184: // If no 00 bytes are present it is safest to assume ISO-8859-1, as this accepts the full range of values 00-FF in every byte.
185: if (b2 == -1 || b3 != -1)
186: return setEncoding(ISO_8859_1,
187: "default 8-bit ASCII-compatible encoding (stream 3 bytes long)"); // The stream contains exactly 1 or 3 bytes, so assume an 8-bit encoding regardless of whether any 00 bytes are present.
188: // The stream contains exactly 2 bytes.
189: if (b1 == 0)
190: return setEncoding(UTF_16BE,
191: "default 16-bit BE encoding (byte stream starts with 00, stream 2 bytes long)");
192: if (b2 == 0)
193: return setEncoding(UTF_16LE,
194: "default 16-bit LE encoding (byte stream pattern XX 00, stream 2 bytes long)");
195: // No 00 bytes present, assume 8-bit encoding:
196: return setEncoding(
197: ISO_8859_1,
198: "default 8-bit ASCII-compatible encoding (no 00 bytes present, stream 2 bytes long)");
199: }
200: // Stream contains at least 4 bytes.
201: // The patterns used for documentation are made up of:
202: // 0 - zero byte
203: // X - non-zero byte
204: // ? - byte value not yet determined
205: if (b1 == 0) {
206: // pattern 0???
207: if (b2 == 0)
208: return setEncoding(UTF_32BE,
209: "default 32-bit BE encoding (byte stream starts with 00 00)"); // pattern 00?? most likely indicates UTF-32BE
210: // pattern 0X??
211: // Regardless of the final two bytes, assume that the first two bytes indicate a 16-bit BE encoding.
212: // There are many circumstances where this could be an incorrect assumption, for example:
213: // - UTF-16LE encoding with first character U+0100 (or any other character whose code is a multiple of 100Hex)
214: // - any encoding with first character NUL
215: // - UTF-32BE encoding with first character outside of Basic Multilingual Plane (BMP)
216: // Checking the final two bytes might give some clues as to whether any of these other situations are more likely,
217: // but none of the clues will yield less than a 50% chance that the encoding is in fact UTF-16BE as suggested by the first two bytes.
218: return setEncoding(UTF_16BE,
219: "default 16-bit BE encoding (byte stream starts with 00)"); // >=50% chance that encoding is UTF-16BE
220: }
221: // pattern X???
222: if (b4 == 0) {
223: // pattern X??0
224: if (b3 == 0)
225: return setEncoding(UTF_32LE,
226: "default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)"); // pattern X?00 most likely indicates UTF-32LE
227: // pattern X?X0
228: return setEncoding(UTF_16LE,
229: "default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)"); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
230: }
231: // pattern X??X
232: if (b2 == 0) {
233: // pattern X0?X
234: // Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
235: // of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
236: // If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters have a code that is a multiple of 0x100.
237: // This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
238: // will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
239: // If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
240: return setEncoding(UTF_16LE,
241: "default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)");
242: }
243: // pattern XX?X
244: if (b3 == 0)
245: return setEncoding(UTF_16BE,
246: "default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)"); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
247: // pattern XXXX
248: // Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF
249: // Assume the more likely case of four 8-bit characters <= U+00FF.
250: // Check whether it fits some common EBCDIC strings that might be found at the start of a document:
251: if (b1 == 0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
252: if (b2 == 0x6F && b3 == 0xA7 && b4 == 0x94)
253: return setEncoding(EBCDIC,
254: "default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC ("Lo§”" in Windows-1252)
255: if (b2 == 0x5A && b3 == 0xC4 && b4 == 0xD6)
256: return setEncoding(EBCDIC,
257: "default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC ("LZÄÖ" in Windows-1252)
258: if ((b2 & b3 & b4 & 0x80) != 0)
259: return setEncoding(EBCDIC,
260: "default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("LÈãÔ" in Windows-1252), or "<htm" ("Lˆ£”" in Windows-1252)
261: // although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
262: }
263: // Now confident that it is not EBCDIC, but some other 8-bit encoding.
264: // Most other 8-bit encodings are compatible with ASCII.
265: // Since a document specified encoding requires only ASCII characters, just choose an arbitrary 8-bit preliminary encoding.
266: // UTF-8 is however not a good choice as it is not strictly an 8-bit encoding.
267: // UTF-8 bytes with a value >= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
268: // Therefore, choose the only true 8-bit encoding that accepts all byte values and is guaranteed to be available on all java implementations.
269: return setEncoding(
270: ISO_8859_1,
271: "default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)");
272: }
273: }
|