001: /*
002: * Copyright 2004 Sun Microsystems, Inc.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.sun.syndication.io;
018:
019: import java.io.*;
020: import java.net.URL;
021: import java.net.URLConnection;
022: import java.net.HttpURLConnection;
023: import java.util.regex.Pattern;
024: import java.util.regex.Matcher;
025: import java.text.MessageFormat;
026:
027: /**
028: * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out
029: * the charset encoding of the XML document within the stream.
030: * <p>
031: * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
032: * character stream.
033: * <p>
034: * All this has to be done without consuming characters from the stream, if not the XML parser
035: * will not recognized the document as a valid XML. This is not 100% true, but it's close enough
036: * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
037: * parsers).
038: * <p>
039: * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
040: * HTTP streams by offering a wide set of constructors.
041: * <P>
042: * By default the charset encoding detection is lenient, the constructor with the lenient flag
043: * can be used for an script (following HTTP MIME and XML specifications).
044: * All this is nicely explained by Mark Pilgrim in his blog,
045: * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
046: * Determining the character encoding of a feed</a>.
047: * <p>
048: * @author Alejandro Abdelnur
049: *
050: */
051: public class XmlReader extends Reader {
052: private static final int PUSHBACK_MAX_SIZE = 4096;
053:
054: private static final String UTF_8 = "UTF-8";
055: private static final String US_ASCII = "US-ASCII";
056: private static final String UTF_16BE = "UTF-16BE";
057: private static final String UTF_16LE = "UTF-16LE";
058: private static final String UTF_16 = "UTF-16";
059:
060: private Reader _reader;
061: private String _encoding;
062:
063: /**
064: * Creates a Reader for a File.
065: * <p>
066: * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
067: * missing defaults to UTF-8.
068: * <p>
069: * It does a lenient charset encoding detection, check the constructor with the lenient parameter
070: * for details.
071: * <p>
072: * @param file File to create a Reader from.
073: * @throws IOException thrown if there is a problem reading the file.
074: *
075: */
076: public XmlReader(File file) throws IOException {
077: this (new FileInputStream(file));
078: }
079:
080: /**
081: * Creates a Reader for a raw InputStream.
082: * <p>
083: * It follows the same logic used for files.
084: * <p>
085: * It does a lenient charset encoding detection, check the constructor with the lenient parameter
086: * for details.
087: * <p>
088: * @param is InputStream to create a Reader from.
089: * @throws IOException thrown if there is a problem reading the stream.
090: *
091: */
092: public XmlReader(InputStream is) throws IOException {
093: this (is, true);
094: }
095:
096: /**
097: * Creates a Reader for a raw InputStream.
098: * <p>
099: * It follows the same logic used for files.
100: * <p>
101: * If lenient detection is indicated and the detection above fails as per specifications it then attempts
102: * the following:
103: * <p>
104: * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
105: * <p>
106: * Else if the XML prolog had a charset encoding that encoding is used.
107: * <p>
108: * Else if the content type had a charset encoding that encoding is used.
109: * <p>
110: * Else 'UTF-8' is used.
111: * <p>
112: * If lenient detection is indicated an XmlReaderException is never thrown.
113: * <p>
114: * @param is InputStream to create a Reader from.
115: * @param lenient indicates if the charset encoding detection should be relaxed.
116: * @throws IOException thrown if there is a problem reading the stream.
117: * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
118: *
119: */
120: public XmlReader(InputStream is, boolean lenient)
121: throws IOException, XmlReaderException {
122: try {
123: doRawStream(is, lenient);
124: } catch (XmlReaderException ex) {
125: if (!lenient) {
126: throw ex;
127: } else {
128: doLenientDetection(null, ex);
129: }
130: }
131: }
132:
133: /**
134: * Creates a Reader using the InputStream of a URL.
135: * <p>
136: * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
137: * data it uses the same logic used for Files.
138: * <p>
139: * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
140: * data it uses the same logic used for an InputStream with content-type.
141: * <p>
142: * It does a lenient charset encoding detection, check the constructor with the lenient parameter
143: * for details.
144: * <p>
145: * @param url URL to create a Reader from.
146: * @throws IOException thrown if there is a problem reading the stream of the URL.
147: *
148: */
149: public XmlReader(URL url) throws IOException {
150: this (url.openConnection());
151: }
152:
153: /**
154: * Creates a Reader using the InputStream of a URLConnection.
155: * <p>
156: * If the URLConnection is not of type HttpURLConnection and there is not
157: * 'content-type' header in the fetched data it uses the same logic used for files.
158: * <p>
159: * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
160: * data it uses the same logic used for an InputStream with content-type.
161: * <p>
162: * It does a lenient charset encoding detection, check the constructor with the lenient parameter
163: * for details.
164: * <p>
165: * @param conn URLConnection to create a Reader from.
166: * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
167: *
168: */
169: public XmlReader(URLConnection conn) throws IOException {
170: boolean lenient = true;
171: if (conn instanceof HttpURLConnection) {
172: try {
173: doHttpStream(conn.getInputStream(), conn
174: .getContentType(), lenient);
175: } catch (XmlReaderException ex) {
176: doLenientDetection(conn.getContentType(), ex);
177: }
178: } else if (conn.getContentType() != null) {
179: try {
180: doHttpStream(conn.getInputStream(), conn
181: .getContentType(), lenient);
182: } catch (XmlReaderException ex) {
183: doLenientDetection(conn.getContentType(), ex);
184: }
185: } else {
186: try {
187: doRawStream(conn.getInputStream(), lenient);
188: } catch (XmlReaderException ex) {
189: doLenientDetection(null, ex);
190: }
191: }
192: }
193:
194: /**
195: * Creates a Reader using an InputStream an the associated content-type header.
196: * <p>
197: * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
198: * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
199: * prolog encoding uses the default encoding mandated by the content-type MIME type.
200: * <p>
201: * It does a lenient charset encoding detection, check the constructor with the lenient parameter
202: * for details.
203: * <p>
204: * @param is InputStream to create the reader from.
205: * @param httpContentType content-type header to use for the resolution of the charset encoding.
206: * @throws IOException thrown if there is a problem reading the file.
207: *
208: */
209: public XmlReader(InputStream is, String httpContentType)
210: throws IOException {
211: this (is, httpContentType, true);
212: }
213:
214: /**
215: * Creates a Reader using an InputStream an the associated content-type header. This constructor is
216: * lenient regarding the encoding detection.
217: * <p>
218: * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
219: * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
220: * prolog encoding uses the default encoding mandated by the content-type MIME type.
221: * <p>
222: * If lenient detection is indicated and the detection above fails as per specifications it then attempts
223: * the following:
224: * <p>
225: * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
226: * <p>
227: * Else if the XML prolog had a charset encoding that encoding is used.
228: * <p>
229: * Else if the content type had a charset encoding that encoding is used.
230: * <p>
231: * Else 'UTF-8' is used.
232: * <p>
233: * If lenient detection is indicated an XmlReaderException is never thrown.
234: * <p>
235: * @param is InputStream to create the reader from.
236: * @param httpContentType content-type header to use for the resolution of the charset encoding.
237: * @param lenient indicates if the charset encoding detection should be relaxed.
238: * @throws IOException thrown if there is a problem reading the file.
239: * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
240: *
241: */
242: public XmlReader(InputStream is, String httpContentType,
243: boolean lenient) throws IOException, XmlReaderException {
244: try {
245: doHttpStream(is, httpContentType, lenient);
246: } catch (XmlReaderException ex) {
247: if (!lenient) {
248: throw ex;
249: } else {
250: doLenientDetection(httpContentType, ex);
251: }
252: }
253: }
254:
255: private void doLenientDetection(String httpContentType,
256: XmlReaderException ex) throws IOException {
257: if (httpContentType != null) {
258: if (httpContentType.startsWith("text/html")) {
259: httpContentType = httpContentType.substring("text/html"
260: .length());
261: httpContentType = "text/xml" + httpContentType;
262: try {
263: doHttpStream(ex.getInputStream(), httpContentType,
264: true);
265: ex = null;
266: } catch (XmlReaderException ex2) {
267: ex = ex2;
268: }
269: }
270: }
271: if (ex != null) {
272: String encoding = ex.getXmlEncoding();
273: if (encoding == null) {
274: encoding = ex.getContentTypeEncoding();
275: }
276: if (encoding == null) {
277: encoding = UTF_8;
278: }
279: prepareReader(ex.getInputStream(), encoding);
280: }
281: }
282:
283: /**
284: * Returns the charset encoding of the XmlReader.
285: * <p>
286: * @return charset encoding.
287: *
288: */
289: public String getEncoding() {
290: return _encoding;
291: }
292:
293: public int read(char[] buf, int offset, int len) throws IOException {
294: return _reader.read(buf, offset, len);
295: }
296:
297: /**
298: * Closes the XmlReader stream.
299: * <p>
300: * @throws IOException thrown if there was a problem closing the stream.
301: *
302: */
303: public void close() throws IOException {
304: _reader.close();
305: }
306:
307: private void doRawStream(InputStream is, boolean lenient)
308: throws IOException {
309: PushbackInputStream pis = new PushbackInputStream(is,
310: PUSHBACK_MAX_SIZE);
311: String bomEnc = getBOMEncoding(pis);
312: String xmlGuessEnc = getXMLGuessEncoding(pis);
313: String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
314: String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc,
315: xmlEnc, pis);
316: prepareReader(pis, encoding);
317: }
318:
319: private void doHttpStream(InputStream is, String httpContentType,
320: boolean lenient) throws IOException {
321: PushbackInputStream pis = new PushbackInputStream(is,
322: PUSHBACK_MAX_SIZE);
323: String cTMime = getContentTypeMime(httpContentType);
324: String cTEnc = getContentTypeEncoding(httpContentType);
325: String bomEnc = getBOMEncoding(pis);
326: String xmlGuessEnc = getXMLGuessEncoding(pis);
327: String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
328: String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc,
329: xmlGuessEnc, xmlEnc, pis, lenient);
330: prepareReader(pis, encoding);
331: }
332:
333: private void prepareReader(InputStream is, String encoding)
334: throws IOException {
335: _reader = new InputStreamReader(is, encoding);
336: _encoding = encoding;
337: }
338:
339: // InputStream is passed for XmlReaderException creation only
340: private static String calculateRawEncoding(String bomEnc,
341: String xmlGuessEnc, String xmlEnc, InputStream is)
342: throws IOException {
343: String encoding;
344: if (bomEnc == null) {
345: if (xmlGuessEnc == null || xmlEnc == null) {
346: encoding = UTF_8;
347: } else if (xmlEnc.equals(UTF_16)
348: && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc
349: .equals(UTF_16LE))) {
350: encoding = xmlGuessEnc;
351: } else {
352: encoding = xmlEnc;
353: }
354: } else if (bomEnc.equals(UTF_8)) {
355: if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
356: throw new XmlReaderException(RAW_EX_1
357: .format(new Object[] { bomEnc, xmlGuessEnc,
358: xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc,
359: is);
360: }
361: if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
362: throw new XmlReaderException(RAW_EX_1
363: .format(new Object[] { bomEnc, xmlGuessEnc,
364: xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc,
365: is);
366: }
367: encoding = UTF_8;
368: } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
369: if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
370: throw new IOException(RAW_EX_1.format(new Object[] {
371: bomEnc, xmlGuessEnc, xmlEnc }));
372: }
373: if (xmlEnc != null && !xmlEnc.equals(UTF_16)
374: && !xmlEnc.equals(bomEnc)) {
375: throw new XmlReaderException(RAW_EX_1
376: .format(new Object[] { bomEnc, xmlGuessEnc,
377: xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc,
378: is);
379: }
380: encoding = bomEnc;
381: } else {
382: throw new XmlReaderException(RAW_EX_2.format(new Object[] {
383: bomEnc, xmlGuessEnc, xmlEnc }), bomEnc,
384: xmlGuessEnc, xmlEnc, is);
385: }
386: return encoding;
387: }
388:
389: // InputStream is passed for XmlReaderException creation only
390: private static String calculateHttpEncoding(String cTMime,
391: String cTEnc, String bomEnc, String xmlGuessEnc,
392: String xmlEnc, InputStream is, boolean lenient)
393: throws IOException {
394: String encoding;
395: if (lenient & xmlEnc != null) {
396: encoding = xmlEnc;
397: } else {
398: boolean appXml = isAppXml(cTMime);
399: boolean textXml = isTextXml(cTMime);
400: if (appXml || textXml) {
401: if (cTEnc == null) {
402: if (appXml) {
403: encoding = calculateRawEncoding(bomEnc,
404: xmlGuessEnc, xmlEnc, is);
405: } else {
406: encoding = US_ASCII;
407: }
408: } else if (bomEnc != null
409: && (cTEnc.equals(UTF_16BE) || cTEnc
410: .equals(UTF_16LE))) {
411: throw new XmlReaderException(HTTP_EX_1
412: .format(new Object[] { cTMime, cTEnc,
413: bomEnc, xmlGuessEnc, xmlEnc }),
414: cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc,
415: is);
416: } else if (cTEnc.equals(UTF_16)) {
417: if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
418: encoding = bomEnc;
419: } else {
420: throw new XmlReaderException(HTTP_EX_2
421: .format(new Object[] { cTMime, cTEnc,
422: bomEnc, xmlGuessEnc, xmlEnc }),
423: cTMime, cTEnc, bomEnc, xmlGuessEnc,
424: xmlEnc, is);
425: }
426: } else {
427: encoding = cTEnc;
428: }
429: } else {
430: throw new XmlReaderException(HTTP_EX_3
431: .format(new Object[] { cTMime, cTEnc, bomEnc,
432: xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
433: bomEnc, xmlGuessEnc, xmlEnc, is);
434: }
435: }
436: return encoding;
437: }
438:
439: // returns MIME type or NULL if httpContentType is NULL
440: private static String getContentTypeMime(String httpContentType) {
441: String mime = null;
442: if (httpContentType != null) {
443: int i = httpContentType.indexOf(";");
444: mime = ((i == -1) ? httpContentType : httpContentType
445: .substring(0, i)).trim();
446: }
447: return mime;
448: }
449:
450: private static final Pattern CHARSET_PATTERN = Pattern
451: .compile("charset=([.[^; ]]*)");
452:
453: // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
454: private static String getContentTypeEncoding(String httpContentType) {
455: String encoding = null;
456: if (httpContentType != null) {
457: int i = httpContentType.indexOf(";");
458: if (i > -1) {
459: String postMime = httpContentType.substring(i + 1);
460: Matcher m = CHARSET_PATTERN.matcher(postMime);
461: encoding = (m.find()) ? m.group(1) : null;
462: encoding = (encoding != null) ? encoding.toUpperCase()
463: : null;
464: }
465: }
466: return encoding;
467: }
468:
469: // returns the BOM in the stream, NULL if not present,
470: // if there was BOM the in the stream it is consumed
471: private static String getBOMEncoding(PushbackInputStream is)
472: throws IOException {
473: String encoding = null;
474: int[] bytes = new int[3];
475: bytes[0] = is.read();
476: bytes[1] = is.read();
477: bytes[2] = is.read();
478:
479: if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
480: encoding = UTF_16BE;
481: is.unread(bytes[2]);
482: } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
483: encoding = UTF_16LE;
484: is.unread(bytes[2]);
485: } else if (bytes[0] == 0xEF && bytes[1] == 0xBB
486: && bytes[2] == 0xBF) {
487: encoding = UTF_8;
488: } else {
489: for (int i = bytes.length - 1; i >= 0; i--) {
490: is.unread(bytes[i]);
491: }
492: }
493: return encoding;
494: }
495:
496: // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
497: private static String getXMLGuessEncoding(PushbackInputStream is)
498: throws IOException {
499: String encoding = null;
500: int[] bytes = new int[4];
501: bytes[0] = is.read();
502: bytes[1] = is.read();
503: bytes[2] = is.read();
504: bytes[3] = is.read();
505: for (int i = bytes.length - 1; i >= 0; i--) {
506: is.unread(bytes[i]);
507: }
508:
509: if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
510: && bytes[3] == 0x3F) {
511: encoding = UTF_16BE;
512: } else if (bytes[0] == 0x3C && bytes[1] == 0x00
513: && bytes[2] == 0x3F && bytes[3] == 0x00) {
514: encoding = UTF_16LE;
515: } else if (bytes[0] == 0x3C && bytes[1] == 0x3F
516: && bytes[2] == 0x78 && bytes[3] == 0x6D) {
517: encoding = UTF_8;
518: }
519: return encoding;
520: }
521:
522: private static final Pattern ENCODING_PATTERN = Pattern
523: .compile(
524: "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*')).*\\?>",
525: Pattern.MULTILINE);
526:
527: // returns the encoding declared in the <?xml encoding=...?>, NULL if none
528: private static String getXmlProlog(PushbackInputStream is,
529: String guessedEnc) throws IOException {
530: String encoding = null;
531: if (guessedEnc != null) {
532: byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
533: int offset = 0;
534: int max = PUSHBACK_MAX_SIZE;
535: int c = is.read(bytes, offset, max);
536: while (c != -1 && offset < PUSHBACK_MAX_SIZE) {
537: offset += c;
538: max -= c;
539: c = is.read(bytes, offset, max);
540: }
541: int bytesRead = offset;
542: if (bytesRead > 0) {
543: is.unread(bytes, 0, bytesRead);
544: Reader reader = new InputStreamReader(
545: new ByteArrayInputStream(bytes, 0, bytesRead),
546: guessedEnc);
547: BufferedReader br = new BufferedReader(reader);
548: StringBuffer prolog = new StringBuffer(
549: PUSHBACK_MAX_SIZE);
550: String line = br.readLine();
551: while (line != null) {
552: prolog.append(line).append("\n");
553: line = br.readLine();
554: }
555: Matcher m = ENCODING_PATTERN.matcher(prolog);
556: if (m.find()) {
557: encoding = m.group(1).toUpperCase();
558: encoding = encoding.substring(1,
559: encoding.length() - 1);
560: }
561: }
562: }
563: return encoding;
564: }
565:
566: // indicates if the MIME type belongs to the APPLICATION XML family
567: private static boolean isAppXml(String mime) {
568: return mime != null
569: && (mime.equals("application/xml")
570: || mime.equals("application/xml-dtd")
571: || mime
572: .equals("application/xml-external-parsed-entity") || (mime
573: .startsWith("application/") && mime
574: .endsWith("+xml")));
575: }
576:
577: // indicates if the MIME type belongs to the TEXT XML family
578: private static boolean isTextXml(String mime) {
579: return mime != null
580: && (mime.equals("text/xml")
581: || mime
582: .equals("text/xml-external-parsed-entity") || (mime
583: .startsWith("text/") && mime.endsWith("+xml")));
584: }
585:
586: private static final MessageFormat RAW_EX_1 = new MessageFormat(
587: "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
588:
589: private static final MessageFormat RAW_EX_2 = new MessageFormat(
590: "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
591:
592: private static final MessageFormat HTTP_EX_1 = new MessageFormat(
593: "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
594:
595: private static final MessageFormat HTTP_EX_2 = new MessageFormat(
596: "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
597:
598: private static final MessageFormat HTTP_EX_3 = new MessageFormat(
599: "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
600:
601: }
|