Source Code Cross Referenced for StreamEncodingDetector.java in » HTML-Parser » jericho-html » au » id » jericho » lib » html » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » HTML Parser » jericho html » au.id.jericho.lib.html

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002:        // Version 2.5
003:        // Copyright (C) 2007 Martin Jericho
004:        // http://jerichohtml.sourceforge.net/
005:        //
006:        // This library is free software; you can redistribute it and/or
007:        // modify it under the terms of either one of the following licences:
008:        //
009:        // 1. The Eclipse Public License (EPL) version 1.0,
010:        // included in this distribution in the file licence-epl-1.0.html
011:        // or available at http://www.eclipse.org/legal/epl-v10.html
012:        //
013:        // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014:        // included in this distribution in the file licence-lgpl-2.1.txt
015:        // or available at http://www.gnu.org/licenses/lgpl.txt
016:        //
017:        // This library is distributed on an "AS IS" basis,
018:        // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019:        // See the individual licence texts for more details.
020:
021:        package au.id.jericho.lib.html;
022:
023:        import java.util.*;
024:        import java.io.*;
025:        import java.nio.charset.*;
026:        import java.net.*;
027:
028:        /**
029:         * Based on information in:
030:         * http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
031:         * http://www.w3.org/TR/html401/charset.html#h-5.2
032:         */
033:        final class StreamEncodingDetector {
034:            private final InputStream inputStream;
035:            private String encoding = null;
036:            private String encodingSpecificationInfo = null;
037:            private boolean definitive = true;
038:            private boolean documentSpecifiedEncodingPossible = true;
039:
040:            private static final String UTF_16 = "UTF-16";
041:            private static final String UTF_16BE = "UTF-16BE";
042:            private static final String UTF_16LE = "UTF-16LE";
043:            private static final String UTF_8 = "UTF-8";
044:            private static final String ISO_8859_1 = "ISO-8859-1";
045:            private static final String EBCDIC = "Cp037"; // aka IBM037, not guaranteed, but available on most platforms
046:
047:            // All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
048:            // Specified explicitly using Byte Order Mark:
049:            private static final String SCSU = "SCSU";
050:            private static final String UTF_7 = "UTF-7";
051:            private static final String UTF_EBCDIC = "UTF-EBCDIC";
052:            private static final String BOCU_1 = "BOCU-1";
053:            private static final String UTF_32 = "UTF-32";
054:            // Guessed from presence of 00 bytes in first four bytes:
055:            private static final String UTF_32BE = "UTF-32BE";
056:            private static final String UTF_32LE = "UTF-32LE";
057:
058:            public StreamEncodingDetector(final URL url) throws IOException {
059:                final URLConnection urlConnection = url.openConnection();
060:                final HttpURLConnection httpURLConnection = (urlConnection instanceof  HttpURLConnection) ? (HttpURLConnection) urlConnection
061:                        : null;
062:                // urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
063:                final InputStream urlInputStream = urlConnection
064:                        .getInputStream();
065:                final String contentType = urlConnection.getContentType();
066:                if (contentType != null) {
067:                    encoding = Source
068:                            .getCharsetParameterFromHttpHeaderValue(contentType);
069:                    if (encoding != null) {
070:                        inputStream = urlInputStream;
071:                        encodingSpecificationInfo = "HTTP header Content-Type: "
072:                                + contentType;
073:                        return;
074:                    }
075:                }
076:                inputStream = urlInputStream.markSupported() ? urlInputStream
077:                        : new BufferedInputStream(urlInputStream);
078:                init();
079:            }
080:
081:            public StreamEncodingDetector(final InputStream inputStream)
082:                    throws IOException {
083:                this .inputStream = inputStream.markSupported() ? inputStream
084:                        : new BufferedInputStream(inputStream);
085:                init();
086:            }
087:
088:            public InputStream getInputStream() {
089:                return inputStream;
090:            }
091:
092:            public String getEncoding() {
093:                return encoding;
094:            }
095:
096:            public String getEncodingSpecificationInfo() {
097:                return encodingSpecificationInfo;
098:            }
099:
100:            public boolean isDifinitive() {
101:                return definitive;
102:            }
103:
104:            public boolean isDocumentSpecifiedEncodingPossible() {
105:                return documentSpecifiedEncodingPossible;
106:            }
107:
108:            public Reader openReader() throws UnsupportedEncodingException {
109:                if (encoding == null)
110:                    return new InputStreamReader(inputStream, ISO_8859_1); // encoding==null only if input stream is empty so use an arbitrary encoding.
111:                if (!Charset.isSupported(encoding))
112:                    throw new UnsupportedEncodingException(encoding + " - "
113:                            + encodingSpecificationInfo);
114:                return new InputStreamReader(inputStream, encoding);
115:            }
116:
117:            private boolean setEncoding(final String encoding,
118:                    final String encodingSpecificationInfo) {
119:                this .encoding = encoding;
120:                this .encodingSpecificationInfo = encodingSpecificationInfo;
121:                return true;
122:            }
123:
124:            private boolean init() throws IOException {
125:                inputStream.mark(4);
126:                final int b1 = inputStream.read();
127:                if (b1 == -1)
128:                    return setEncoding(null, "empty input stream");
129:                final int b2 = inputStream.read();
130:                final int b3 = inputStream.read();
131:                final int b4 = inputStream.read();
132:                inputStream.reset();
133:                // Check for Unicode Byte Order Mark:
134:                if (b1 == 0xEF) {
135:                    if (b2 == 0xBB && b3 == 0xBF)
136:                        return setEncoding(UTF_8,
137:                                "UTF-8 Byte Order Mark (EF BB BF)");
138:                } else if (b1 == 0xFE) {
139:                    if (b2 == 0xFF)
140:                        return setEncoding(UTF_16,
141:                                "UTF-16 big-endian Byte Order Mark (FE FF)");
142:                } else if (b1 == 0xFF) {
143:                    if (b2 == 0xFE) {
144:                        if (b3 == 0 && b4 == 0)
145:                            return setEncoding(UTF_32,
146:                                    "UTF-32 little-endian Byte Order Mark (FF EE 00 00)");
147:                        return setEncoding(UTF_16,
148:                                "UTF-16 little-endian Byte Order Mark (FF EE)");
149:                    }
150:                } else if (b1 == 0) {
151:                    if (b2 == 0 && b3 == 0xFE && b4 == 0xFF)
152:                        return setEncoding(UTF_32,
153:                                "UTF-32 big-endian Byte Order Mark (00 00 FE FF)");
154:                } else if (b1 == 0x0E) {
155:                    if (b2 == 0xFE && b3 == 0xFF)
156:                        return setEncoding(SCSU,
157:                                "SCSU Byte Order Mark (0E FE FF)");
158:                } else if (b1 == 0x2B) {
159:                    if (b2 == 0x2F && b3 == 0x76)
160:                        return setEncoding(UTF_7,
161:                                "UTF-7 Byte Order Mark (2B 2F 76)");
162:                } else if (b1 == 0xDD) {
163:                    if (b2 == 0x73 && b3 == 0x66 && b4 == 0x73)
164:                        return setEncoding(UTF_EBCDIC,
165:                                "UTF-EBCDIC Byte Order Mark (DD 73 66 73)");
166:                } else if (b1 == 0xFB) {
167:                    if (b2 == 0xEE && b3 == 0x28)
168:                        return setEncoding(BOCU_1,
169:                                "BOCU-1 Byte Order Mark (FB EE 28)");
170:                }
171:                // No Unicode Byte Order Mark found.  Have to start guessing.
172:                definitive = false;
173:                // The best we can do is to provide an encoding that reflects the correct number and ordering of bytes for characters in the ASCII range.
174:                // The result will be one of ISO_8859_1, EBCDIC, UTF_16BE, UTF_16LE, UTF_32BE or UTF_32LE.
175:                // Assumes 00 bytes indicate multi-byte encodings rather than the presence of NUL characters or characters with a code that is a multiple of 0x100.
176:                if (b4 == -1) {
177:                    // The stream contains between 1 and 3 bytes.
178:                    // This means the document can't possibly specify the encoding, so make a best guess based on the first 3 bytes.
179:                    documentSpecifiedEncodingPossible = false;
180:                    // It might be possible to rule out some encodings based on these bytes, but it is impossible to make a definite determination.
181:                    // The main thing to determine is whether it is an 8-bit or 16-bit encoding.
182:                    // In order to guess the most likely encoding, assume that the text contains only ASCII characters, and that any 00 bytes indicate a 16-bit encoding.
183:                    // The only strictly 8-bit encoding guaranteed to be supported on all java platforms is ISO-8859-1 (UTF-8 uses a variable number of bytes per character).
184:                    // If no 00 bytes are present it is safest to assume ISO-8859-1, as this accepts the full range of values 00-FF in every byte.
185:                    if (b2 == -1 || b3 != -1)
186:                        return setEncoding(ISO_8859_1,
187:                                "default 8-bit ASCII-compatible encoding (stream 3 bytes long)"); // The stream contains exactly 1 or 3 bytes, so assume an 8-bit encoding regardless of whether any 00 bytes are present.
188:                    // The stream contains exactly 2 bytes.
189:                    if (b1 == 0)
190:                        return setEncoding(UTF_16BE,
191:                                "default 16-bit BE encoding (byte stream starts with 00, stream 2 bytes long)");
192:                    if (b2 == 0)
193:                        return setEncoding(UTF_16LE,
194:                                "default 16-bit LE encoding (byte stream pattern XX 00, stream 2 bytes long)");
195:                    // No 00 bytes present, assume 8-bit encoding:
196:                    return setEncoding(
197:                            ISO_8859_1,
198:                            "default 8-bit ASCII-compatible encoding (no 00 bytes present, stream 2 bytes long)");
199:                }
200:                // Stream contains at least 4 bytes.
201:                // The patterns used for documentation are made up of:
202:                //   0 - zero byte
203:                //   X - non-zero byte
204:                //   ? - byte value not yet determined
205:                if (b1 == 0) {
206:                    // pattern 0???
207:                    if (b2 == 0)
208:                        return setEncoding(UTF_32BE,
209:                                "default 32-bit BE encoding (byte stream starts with 00 00)"); // pattern 00?? most likely indicates UTF-32BE
210:                    // pattern 0X??
211:                    // Regardless of the final two bytes, assume that the first two bytes indicate a 16-bit BE encoding.
212:                    // There are many circumstances where this could be an incorrect assumption, for example:
213:                    //   - UTF-16LE encoding with first character U+0100 (or any other character whose code is a multiple of 100Hex)
214:                    //   - any encoding with first character NUL
215:                    //   - UTF-32BE encoding with first character outside of Basic Multilingual Plane (BMP)
216:                    // Checking the final two bytes might give some clues as to whether any of these other situations are more likely,
217:                    // but none of the clues will yield less than a 50% chance that the encoding is in fact UTF-16BE as suggested by the first two bytes.
218:                    return setEncoding(UTF_16BE,
219:                            "default 16-bit BE encoding (byte stream starts with 00)"); // >=50% chance that encoding is UTF-16BE
220:                }
221:                // pattern X???
222:                if (b4 == 0) {
223:                    // pattern X??0
224:                    if (b3 == 0)
225:                        return setEncoding(UTF_32LE,
226:                                "default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)"); // pattern X?00 most likely indicates UTF-32LE
227:                    // pattern X?X0
228:                    return setEncoding(UTF_16LE,
229:                            "default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)"); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
230:                }
231:                // pattern X??X
232:                if (b2 == 0) {
233:                    // pattern X0?X
234:                    // Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
235:                    // of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
236:                    // If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters have a code that is a multiple of 0x100.
237:                    // This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
238:                    // will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
239:                    // If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
240:                    return setEncoding(UTF_16LE,
241:                            "default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)");
242:                }
243:                // pattern XX?X
244:                if (b3 == 0)
245:                    return setEncoding(UTF_16BE,
246:                            "default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)"); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
247:                // pattern XXXX
248:                // Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF
249:                // Assume the more likely case of four 8-bit characters <= U+00FF.
250:                // Check whether it fits some common EBCDIC strings that might be found at the start of a document:
251:                if (b1 == 0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
252:                    if (b2 == 0x6F && b3 == 0xA7 && b4 == 0x94)
253:                        return setEncoding(EBCDIC,
254:                                "default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC ("Lo��" in Windows-1252)
255:                    if (b2 == 0x5A && b3 == 0xC4 && b4 == 0xD6)
256:                        return setEncoding(EBCDIC,
257:                                "default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC ("LZ��" in Windows-1252)
258:                    if ((b2 & b3 & b4 & 0x80) != 0)
259:                        return setEncoding(EBCDIC,
260:                                "default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("L���" in Windows-1252), or "<htm" ("L���" in Windows-1252)
261:                    // although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
262:                }
263:                // Now confident that it is not EBCDIC, but some other 8-bit encoding.
264:                // Most other 8-bit encodings are compatible with ASCII.
265:                // Since a document specified encoding requires only ASCII characters, just choose an arbitrary 8-bit preliminary encoding.
266:                // UTF-8 is however not a good choice as it is not strictly an 8-bit encoding.
267:                // UTF-8 bytes with a value >= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
268:                // Therefore, choose the only true 8-bit encoding that accepts all byte values and is guaranteed to be available on all java implementations.
269:                return setEncoding(
270:                        ISO_8859_1,
271:                        "default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)");
272:            }
273:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.