/* Copyright (c) 2001 - 2007 TOPP - www.openplans.org. All rights reserved.
 * This code is licensed under the GPL 2.0 license, available at the root
 * application directory.
 */
package org.geoserver.ows.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Provides methods that can be used to detect the charset of an XML
 * document and (optionally) return a reader that is aware of this charset
 * and can correctly decode the document's data.
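 *
 * <p>
 * A minimal usage sketch (<code>in</code> stands for any XML byte stream,
 * e.g. a servlet request body; it is illustrative, not part of this class):
 * </p>
 * <pre>
 * EncodingInfo encInfo = new EncodingInfo();
 * Reader reader = XmlCharsetDetector.getCharsetAwareReader(in, encInfo);
 * // `reader` decodes `in` with the detected charset; `encInfo` describes it.
 * </pre>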
 */
public class XmlCharsetDetector {
    protected static Logger LOGGER = org.geotools.util.logging.Logging
            .getLogger("org.vfny.geoserver.requests");

    /**
     * In the current context, naming this "GT", "GREATER_THAN" or the like
     * would be misleading.
     */
    private static final char RIGHT_ANGLE_BRACKET = '\u003E';
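
    /**
     * Matches the <code>encoding</code> pseudo-attribute of an XML
     * declaration, e.g. <code>encoding = "UTF-8"</code>, capturing the
     * charset name.
     */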
    private static final Pattern ENCODING_PATTERN = Pattern
            .compile("encoding\\s*\\=\\s*\"([^\"]+)\"");

    /**
     * Maximum number of characters we expect in the XML declaration.
     * There will probably be fewer than 100, but just in case...
     */
    private static final int MAX_XMLDECL_SIZE = 100;

    /**
     * Based on Xerces-J code, this method does its best to return a reader
     * that can properly decode the content of the incoming XML document. To
     * achieve this, it first infers the general encoding scheme of the
     * document and then uses this information to extract the actual charset
     * from the XML declaration. In any recoverable error situation a default
     * UTF-8 reader is created.
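     *
     * <p>
     * A hedged sketch of typical use in a servlet context (the
     * <code>request</code> and <code>response</code> names are illustrative):
     * </p>
     * <pre>
     * EncodingInfo encInfo = new EncodingInfo();
     * Reader reader = XmlCharsetDetector.getCharsetAwareReader(
     *         request.getInputStream(), encInfo);
     * // parse the request from `reader`, then reply using the same charset:
     * response.setCharacterEncoding(encInfo.getEncoding());
     * </pre>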
     *
     * @param istream Byte stream (most probably obtained with
     *                <code>HttpServletRequest.getInputStream</code>) that
     *                gives access to the XML document in question.
     *
     * @param encInfo Instance of EncodingInfo where information about the
     *                detected charset will be stored. You can then use it,
     *                for example, to form a response encoded with this
     *                charset.
     *
     * @return a reader that decodes the stream using the detected charset.
     *
     * @throws IOException in case of any unrecoverable I/O errors.
     * @throws UnsupportedCharsetException the <code>InputStreamReader</code>
     *         constructor will probably throw this exception if the inferred
     *         charset of the XML document is not supported by the current JVM.
     */
    public static Reader getCharsetAwareReader(InputStream istream,
            EncodingInfo encInfo) throws IOException,
            UnsupportedCharsetException {
        RewindableInputStream stream;
        stream = new RewindableInputStream(istream, false);

        //
        // Phase 1. Reading first four bytes and determining encoding scheme.
        final byte[] b4 = new byte[4];

        int count = 0;

        for (; count < 4; count++) {
            int b = stream.read();

            if (-1 != b) {
                b4[count] = (byte) b;
            } else {
                break;
            }
        }

        if (LOGGER.isLoggable(Level.FINER)) {
            StringBuilder sb = new StringBuilder("First 4 bytes of XML doc are :");

            for (int i = 0; i < 4; i++) {
                sb.append(' ')
                        .append(Integer.toHexString(b4[i] & 0xff).toUpperCase())
                        .append(" ('").append((char) b4[i]).append("')");
            }

            LOGGER.finer(sb.toString());
        }

        /*
         * `getEncodingName()` is capable of detecting the following encoding
         * schemes: "UTF-8", "UTF-16LE", "UTF-16BE", "ISO-10646-UCS-4", or
         * "CP037". It cannot distinguish between UTF-16 without a BOM and
         * "ISO-10646-UCS-2", so the latter will be interpreted as UTF-16 for
         * the purpose of reading the XML declaration. That shouldn't cause
         * much trouble, as the two formats are identical for the Basic
         * Multilingual Plane, except that UTF-16-encoded text can contain
         * values from the surrogate range while valid UCS-2 input cannot.
         *
         * This somewhat ugly way of copying charset data is required to
         * maintain "reference integrity" of the encInfo argument: since it
         * may be used after this method returns, it must keep pointing to
         * the same object, so plain assignment (or cloning) would not work
         * here.
         */
        encInfo.copyFrom(getEncodingName(b4, count));

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 1. Inferred encoding: "
                    + encInfo.toString());
        }

        // Rewinding to beginning of data
        stream.reset();

        String ENCODING = encInfo.getEncoding().toUpperCase(Locale.ENGLISH);
        Boolean isBigEndian = encInfo.isBigEndian();
        boolean hasBOM = encInfo.hasBOM();

        /*
         * Special case UTF-8 files with BOM created by Microsoft
         * tools. It's more efficient to consume the BOM than make
         * the reader perform extra checks. -Ac
         */
        if (hasBOM && ENCODING.equals("UTF-8")) {
            // ignore first three bytes...
            stream.skip(3);
        }

        /*
         * A peculiarity of `getEncodingName` is that it always reports
         * UTF-16 with a BOM as either UTF-16LE or UTF-16BE, and
         * InputStreamReader doesn't expect a BOM with UTF-16LE|BE
         * encoded data. So this BOM should also be skipped, if present.
         */
        if ((count > 1)
                && (ENCODING.equals("UTF-16LE") || ENCODING.equals("UTF-16BE"))) {
            int b0 = b4[0] & 0xFF;
            int b1 = b4[1] & 0xFF;

            if (((b0 == 0xFF) && (b1 == 0xFE))
                    || ((b0 == 0xFE) && (b1 == 0xFF))) {
                // ignore first two bytes...
                stream.skip(2);
            }
        }

        Reader reader = null;

        /*
         * We must use a custom class to read UCS-4 data: the JVM doesn't
         * support this encoding scheme out of the box, and other JVMs most
         * likely don't either.
         *
         * Xerces also has a specialized reader for UTF-8 encoding
         * (org.apache.xerces.impl.io.UTF8Reader), which is said to be
         * optimized. It may well be faster than the JVM's default decoder,
         * but porting yet another (not so small) class just to "efficiently"
         * extract a couple of chars from the XML declaration hardly seems
         * worth it. Then again, I may be mistaken here. Moreover, Xerces'
         * UTF8Reader has some internal dependencies, and extracting it would
         * take much more effort.
         *
         * Also, at this stage it is impossible for ENCODING to have the
         * value "ISO-10646-UCS-2".
         *
         * You can avoid possible bugs in UCSReader by commenting out this
         * block of code together with the following `if`. You will then get
         * an UnsupportedEncodingException for UCS-4 encoded data.
         */
        if ("ISO-10646-UCS-4".equals(ENCODING)) {
            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();

                if (isBE) {
                    reader = new UCSReader(stream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(stream, UCSReader.UCS4LE);
                }
            } else {
                // Fatal error, UCSReader will fail to decode this properly
                String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
                throw new UnsupportedCharsetException(s);
            }
        }

        if (null == reader) {
            reader = new InputStreamReader(stream, ENCODING);
        }

        //
        // Phase 2. Reading XML declaration and extracting charset info from it.
        String declEncoding = getXmlEncoding(reader);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 2. Charset in XML declaration "
                    + "is `" + declEncoding + "`.");
        }

        stream.reset();

        /*
         * Now RewindableInputStream is allowed to return more than one byte
         * per read operation. It will also no longer buffer bytes read via
         * the `read(byte[], int, int)` method.
         */
        stream.setChunkedMode(true);

        /*
         * Reuse the existing reader if possible; create a new one only if
         * the declared charset name differs from the guessed one.
         */
        if ((null != declEncoding) && !declEncoding.equals(ENCODING)) {
            /*
             * I believe that for UCS-2 encoding the default UTF-16 reader
             * (which is already created at this time) should suffice in most
             * cases. Though, we can always construct a new UCSReader
             * instance, if I am wrong here.
             */
            if (!declEncoding.equals("ISO-10646-UCS-2")) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Declared charset differs from inferred one. "
                            + "Trying to construct InputStreamReader for `"
                            + declEncoding + "`.");
                }

                reader = new InputStreamReader(stream, declEncoding);
                encInfo.setEncoding(declEncoding);
            }
        }

        return reader;
    } // END getCharsetAwareReader(InputStream, EncodingInfo) : Reader

    /**
     * Use this variant when you aren't interested in the encoding data and
     * just want to get a suitable reader for the incoming request.
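     *
     * <p>
     * For example (with <code>request</code> standing in for an illustrative
     * servlet request):
     * </p>
     * <pre>
     * Reader reader = XmlCharsetDetector.getCharsetAwareReader(request.getInputStream());
     * </pre>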
     *
     * @param istream See <code>getCharsetAwareReader(InputStream,
     *                EncodingInfo)</code>.
     *
     */
    public static Reader getCharsetAwareReader(InputStream istream)
            throws IOException, UnsupportedCharsetException {
        return getCharsetAwareReader(istream, new EncodingInfo());
    }

    /**
     * Creates a new reader on top of the given <code>InputStream</code> using
     * existing (external) encoding information. Unlike
     * <code>getCharsetAwareReader</code>, this method never tries to detect
     * the charset or encoding scheme of the <code>InputStream</code>'s data.
     * This also means that it <em>must</em> be provided with a valid
     * <code>EncodingInfo</code> instance, which may be obtained, for example,
     * from a previous
     * <code>getCharsetAwareReader(InputStream, EncodingInfo)</code> call.
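     *
     * <p>
     * A minimal sketch, assuming <code>encInfo</code> was filled in by an
     * earlier <code>getCharsetAwareReader</code> call and <code>in</code> is
     * a fresh stream over the same bytes:
     * </p>
     * <pre>
     * Reader reader = XmlCharsetDetector.createReader(in, encInfo);
     * </pre>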
     *
     * @param istream byte stream containing textual (presumably XML) data
     * @param encInfo correctly initialized object which holds information
     *                about the charset of the above byte stream's contents.
     *
     * @return a reader that decodes the stream using the given charset.
     *
     * @throws IllegalArgumentException if the charset name is not specified
     * @throws UnsupportedEncodingException when the specified charset is not
     *         supported by the platform, or when the byte order for the
     *         <code>ISO-10646-UCS-2|4</code> charsets is missing or invalid.
     *
     */
    public static Reader createReader(InputStream istream,
            EncodingInfo encInfo) throws IllegalArgumentException,
            UnsupportedEncodingException {
        String charset = encInfo.getEncoding();
        Boolean isBigEndian = encInfo.isBigEndian();

        // We MUST know the encoding (in fact, charset) name, and as
        // EncodingInfo has a no-arg constructor, its `getEncoding` can
        // return null.
        if (null == charset) {
            String s = "Name of the charset must not be NULL!";
            throw new IllegalArgumentException(s);
        }

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Trying to create reader based on existing charset "
                    + "information: `" + encInfo + "`.");
        }

        Reader reader = null;

        // UCS-2|4 charsets are handled with custom reader
        if ("ISO-10646-UCS-4".equals(charset)) {
            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();

                if (isBE) {
                    reader = new UCSReader(istream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(istream, UCSReader.UCS4LE);
                }
            } else {
                // Fatal error, UCSReader will fail to decode this properly
                String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
                throw new UnsupportedEncodingException(s);
            }
        } else if ("ISO-10646-UCS-2".equals(charset)) {
            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();

                if (isBE) {
                    reader = new UCSReader(istream, UCSReader.UCS2BE);
                } else {
                    reader = new UCSReader(istream, UCSReader.UCS2LE);
                }
            } else {
                // Cannot construct UCSReader without byte order info
                String s = "Byte order must be specified for ISO-10646-UCS-2.";
                throw new UnsupportedEncodingException(s);
            }
        } else {
            reader = new InputStreamReader(istream, charset);
        }

        return reader;
    } // END createReader(InputStream, EncodingInfo) : Reader

    /**
     * Returns the IANA encoding name that is auto-detected from the bytes
     * specified, with the endianness of that encoding where appropriate.
     * Note that the encoding obtained this way is only the <em>encoding
     * scheme</em> of the request, i.e. step 1 of the detection process. To
     * learn the exact <em>charset</em> of the request data, you should also
     * perform step 2 - read the XML declaration and get the value of its
     * <code>encoding</code> pseudo-attribute.
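     *
     * <p>
     * For example (a sketch; the byte values follow the checks below):
     * </p>
     * <pre>
     * byte[] b4 = { (byte) 0xFE, (byte) 0xFF, 0x00, 0x3C }; // BE BOM + first char
     * EncodingInfo info = XmlCharsetDetector.getEncodingName(b4, 4);
     * // info.getEncoding() is "UTF-16BE" and info.hasBOM() is true
     * </pre>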
     *
     * @param b4 The first four bytes of the input.
     * @param count The number of bytes actually read.
     * @return Instance of EncodingInfo encapsulating all encoding-related data.
     */
    public static EncodingInfo getEncodingName(byte[] b4, int count) {
        if (count < 2) {
            return new EncodingInfo("UTF-8", null);
        }

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;

        if ((b0 == 0xFE) && (b1 == 0xFF)) {
            // UTF-16, big-endian
            return new EncodingInfo("UTF-16BE", Boolean.TRUE, true);
        }

        if ((b0 == 0xFF) && (b1 == 0xFE)) {
            // UTF-16, little-endian
            return new EncodingInfo("UTF-16LE", Boolean.FALSE, true);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3) {
            return new EncodingInfo("UTF-8", null);
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;

        if ((b0 == 0xEF) && (b1 == 0xBB) && (b2 == 0xBF)) {
            return new EncodingInfo("UTF-8", null, true);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 4) {
            return new EncodingInfo("UTF-8", null);
        }

        // other encodings
        int b3 = b4[3] & 0xFF;

        if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x3C)) {
            // UCS-4, big endian (1234)
            return new EncodingInfo("ISO-10646-UCS-4", Boolean.TRUE);
        }

        if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x00)) {
            // UCS-4, little endian (4321)
            return new EncodingInfo("ISO-10646-UCS-4", Boolean.FALSE);
        }

        if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x3C) && (b3 == 0x00)) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be? (Currently this would be
            // an exception :)
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }

        if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x00)) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }

        if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x3F)) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...)
            // REVISIT: What should this be?
            return new EncodingInfo("UTF-16BE", Boolean.TRUE);
        }

        if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x3F) && (b3 == 0x00)) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...)
            return new EncodingInfo("UTF-16LE", Boolean.FALSE);
        }

        if ((b0 == 0x4C) && (b1 == 0x6F) && (b2 == 0xA7) && (b3 == 0x94)) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new EncodingInfo("CP037", null);
        }

        // default encoding
        return new EncodingInfo("UTF-8", null);
    } // END getEncodingName(byte[], int) : EncodingInfo

    /**
     * Gets the encoding declared in the XML declaration of the supplied
     * character stream. This works by reading the first few characters and
     * matching the <code>encoding</code> pseudo-attribute that may be present
     * in the XML declaration. If no encoding is found, or if an
     * <code>IOException</code> is encountered, then <code>null</code> is
     * returned.
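     *
     * <p>
     * For example (a sketch):
     * </p>
     * <pre>
     * Reader r = new StringReader("&lt;?xml version=\"1.0\" encoding=\"ISO-8859-1\"?&gt;&lt;foo/&gt;");
     * String enc = getXmlEncoding(r); // "ISO-8859-1"
     * </pre>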
     *
     * @param reader This character stream is supposed to contain XML data
     *               (i.e. it should start with a valid XML declaration).
     *
     * @return The encoding specified in the XML declaration read from the
     *         supplied character stream.
     */
    protected static String getXmlEncoding(Reader reader) {
        try {
            StringWriter sw = new StringWriter(MAX_XMLDECL_SIZE);

            int c;
            int count = 0;

            for (; (6 > count) && (-1 != (c = reader.read())); count++) {
                sw.write(c);
            }

            /*
             * Check for the case when there is no XML declaration and the
             * document begins with a processing instruction whose target
             * name starts with "<?xml" (e.g. "<?xmlfoo"). It sounds like a
             * nearly impossible thing, but the Xerces guys check for it
             * somewhere in the depths of their code :)
             */
            if ((6 > count) || (!"<?xml ".equals(sw.toString()))) {
                if (LOGGER.isLoggable(Level.FINER)) {
                    LOGGER.finer("Invalid(?) XML declaration: "
                            + sw.toString() + ".");
                }

                return null;
            }

            /*
             * Continue reading the declaration(?) until the first '>'
             * ('\u003E') is encountered. The conversion from `int` to `char`
             * is safe for our purposes; at least no supplementary (0x10000+)
             * characters are expected in an XML declaration. The total number
             * of chars read this way is also limited, so that malformed input
             * (with no '>') cannot force us to read megabytes of useless
             * data :)
             */
            for (; (MAX_XMLDECL_SIZE > count)
                    && (-1 != (c = reader.read()))
                    && (RIGHT_ANGLE_BRACKET != (char) c); count++) {
                sw.write(c);
            }

            Matcher m = ENCODING_PATTERN.matcher(sw.toString());

            if (m.find()) {
                String result = m.group(1);

                return result;
            } else {
                return null;
            }
        } catch (IOException e) {
            if (LOGGER.isLoggable(Level.WARNING)) {
                LOGGER.warning("Failed to extract charset info from XML "
                        + "declaration due to IOException: "
                        + e.getMessage());
            }

            return null;
        }
    } // END getXmlEncoding(Reader) : String
} // END class XmlCharsetDetector
|