001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025: import java.nio.charset.*;
026: import java.net.*;
027:
028: final class EncodingDetector {
029: private final InputStream inputStream;
030: private String encoding = null;
031: private String encodingSpecificationInfo = null;
032: private final String preliminaryEncoding;
033: private final String preliminaryEncodingSpecificationInfo;
034: private final String alternativePreliminaryEncoding;
035:
036: private static final int PREVIEW_BYTE_COUNT = 2048;
037:
038: private static final String UTF_8 = "UTF-8";
039: private static final String ISO_8859_1 = "ISO-8859-1";
040:
041: public EncodingDetector(final URL url) throws IOException {
042: this (new StreamEncodingDetector(url));
043: }
044:
045: public EncodingDetector(final InputStream inputStream)
046: throws IOException {
047: this (new StreamEncodingDetector(inputStream));
048: }
049:
050: public EncodingDetector(final InputStream inputStream,
051: final String preliminaryEncoding) throws IOException {
052: this (inputStream, preliminaryEncoding,
053: "preliminary encoding set explicitly", null);
054: if (!Charset.isSupported(preliminaryEncoding))
055: throw new UnsupportedEncodingException(
056: preliminaryEncoding
057: + " specified as preliminaryEncoding constructor argument");
058: detectDocumentSpecifiedEncoding();
059: }
060:
061: private EncodingDetector(
062: final StreamEncodingDetector streamEncodingDetector)
063: throws IOException {
064: this (streamEncodingDetector, ISO_8859_1);
065: }
066:
067: private EncodingDetector(
068: final StreamEncodingDetector streamEncodingDetector,
069: final String alternativePreliminaryEncoding)
070: throws IOException {
071: this (streamEncodingDetector.getInputStream(),
072: streamEncodingDetector.getEncoding(),
073: streamEncodingDetector.getEncodingSpecificationInfo(),
074: alternativePreliminaryEncoding);
075: if (streamEncodingDetector.isDifinitive()
076: || !streamEncodingDetector
077: .isDocumentSpecifiedEncodingPossible()) {
078: // don't try to detect the encoding from the document because there is no need or it is not possible
079: setEncoding(preliminaryEncoding,
080: preliminaryEncodingSpecificationInfo);
081: } else {
082: detectDocumentSpecifiedEncoding();
083: }
084: }
085:
086: private EncodingDetector(final InputStream inputStream,
087: final String preliminaryEncoding,
088: final String preliminaryEncodingSpecificationInfo,
089: final String alternativePreliminaryEncoding)
090: throws IOException {
091: this .inputStream = inputStream.markSupported() ? inputStream
092: : new BufferedInputStream(inputStream);
093: this .preliminaryEncoding = preliminaryEncoding;
094: this .preliminaryEncodingSpecificationInfo = preliminaryEncodingSpecificationInfo;
095: this .alternativePreliminaryEncoding = alternativePreliminaryEncoding;
096: if (alternativePreliminaryEncoding != null
097: && !Charset.isSupported(alternativePreliminaryEncoding))
098: throw new UnsupportedEncodingException(
099: alternativePreliminaryEncoding
100: + " specified as alternativePreliminaryEncoding constructor argument");
101: }
102:
103: public InputStream getInputStream() {
104: return inputStream;
105: }
106:
107: public String getEncoding() {
108: return encoding;
109: }
110:
111: public String getEncodingSpecificationInfo() {
112: return encodingSpecificationInfo;
113: }
114:
115: public String getPreliminaryEncoding() {
116: return preliminaryEncoding;
117: }
118:
119: public String getPreliminaryEncodingSpecificationInfo() {
120: return preliminaryEncodingSpecificationInfo;
121: }
122:
123: public Reader openReader() throws UnsupportedEncodingException {
124: if (encoding == null)
125: return new InputStreamReader(inputStream, ISO_8859_1); // encoding==null only if input stream is empty so use an arbitrary encoding.
126: if (!Charset.isSupported(encoding)) {
127: throw new UnsupportedEncodingException(encoding + ": "
128: + encodingSpecificationInfo);
129: }
130: return new InputStreamReader(inputStream, encoding);
131: }
132:
133: private boolean setEncoding(final String encoding,
134: final String encodingSpecificationInfo) {
135: this .encoding = encoding;
136: this .encodingSpecificationInfo = encodingSpecificationInfo;
137: return true;
138: }
139:
140: private boolean detectDocumentSpecifiedEncoding()
141: throws IOException {
142: inputStream.mark(PREVIEW_BYTE_COUNT);
143: String safePreliminaryEncoding;
144: if (Charset.isSupported(preliminaryEncoding)) {
145: safePreliminaryEncoding = preliminaryEncoding;
146: } else {
147: if (alternativePreliminaryEncoding == null)
148: throw new UnsupportedEncodingException(
149: preliminaryEncoding + ": "
150: + preliminaryEncodingSpecificationInfo);
151: safePreliminaryEncoding = alternativePreliminaryEncoding;
152: }
153: final Source previewSource = getPreviewSource(safePreliminaryEncoding); // should never throw UnsupportedEncodingException
154: inputStream.reset();
155: final Logger logger = previewSource.getLogger();
156: previewSource.setLogger(null);
157: if (preliminaryEncoding != safePreliminaryEncoding
158: && logger.isWarnEnabled())
159: logger
160: .warn("Alternative encoding "
161: + safePreliminaryEncoding
162: + " substituted for unsupported preliminary encoding "
163: + preliminaryEncoding + ": "
164: + preliminaryEncodingSpecificationInfo);
165: String documentSpecifiedEncodingInfoSuffix;
166: if (previewSource.getDocumentSpecifiedEncoding() == null) {
167: if (previewSource.isXML()) {
168: // The source looks like an XML document.
169: // The XML 1.0 specification section 4.3.3 states that an XML file that is not encoded in UTF-8 must contain
170: // either a UTF-16 BOM or an encoding declaration in its XML declaration.
171: // Since no encoding declaration was detected, and if we assume this class is only used if no BOM is present, we can then assume it is UTF-8.
172: return setEncoding(UTF_8,
173: "mandatory XML encoding when no BOM or encoding declaration is present");
174: }
175: documentSpecifiedEncodingInfoSuffix = "no encoding specified in document";
176: } else {
177: if (Charset.isSupported(previewSource
178: .getDocumentSpecifiedEncoding()))
179: return setEncoding(previewSource
180: .getDocumentSpecifiedEncoding(), previewSource
181: .getEncodingSpecificationInfo());
182: // Document specified encoding is not supported. Fall back on preliminary encoding.
183: documentSpecifiedEncodingInfoSuffix = "encoding "
184: + previewSource.getDocumentSpecifiedEncoding()
185: + " specified in document is not supported";
186: if (logger.isWarnEnabled())
187: logger
188: .warn("Unsupported encoding "
189: + previewSource
190: .getDocumentSpecifiedEncoding()
191: + " specified in document, using preliminary encoding "
192: + safePreliminaryEncoding + " instead");
193: }
194: // Document does not look like XML, does not specify an encoding in its transport protocol, has no BOM, and does not specify an encoding in the document itself.
195: // The HTTP protocol states that such a situation should assume ISO-8859-1 encoding.
196: // We will just assume the preliminary encoding, which is the best guess based on the first 4 bytes of the stream.
197: // This means ISO-8859-1 will be used for any 8-bit ASCII compatible encoding, consistent with the HTTP protocol default.
198: if (preliminaryEncoding != safePreliminaryEncoding)
199: return setEncoding(safePreliminaryEncoding,
200: "alternative encoding substituted for unsupported preliminary encoding "
201: + preliminaryEncoding + ": "
202: + preliminaryEncodingSpecificationInfo
203: + ", "
204: + documentSpecifiedEncodingInfoSuffix);
205: return setEncoding(preliminaryEncoding,
206: preliminaryEncodingSpecificationInfo + ", "
207: + documentSpecifiedEncodingInfoSuffix);
208: }
209:
210: private Source getPreviewSource(final String previewEncoding)
211: throws IOException {
212: final byte[] bytes = new byte[PREVIEW_BYTE_COUNT];
213: int i;
214: for (i = 0; i < PREVIEW_BYTE_COUNT; i++) {
215: final int nextByte = inputStream.read();
216: if (nextByte == -1)
217: break;
218: bytes[i] = (byte) nextByte;
219: }
220: return new Source(
221: new InputStreamReader(new ByteArrayInputStream(bytes,
222: 0, i), previewEncoding), null);
223: }
224: }
|