001: /*
002: * Copyright (c) 1998 Sun Microsystems, Inc. All Rights Reserved.
003: */
004:
005: package com.sun.xml.dtdparser;
006:
import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
015:
016: // NOTE: Add I18N support to this class when JDK gets the ability to
017: // defer selection of locale for exception messages ... use the same
018: // technique for both.
019:
020: /**
021: * This handles several XML-related tasks that normal java.io Readers
022: * don't support, inluding use of IETF standard encoding names and
023: * automatic detection of most XML encodings. The former is needed
024: * for interoperability; the latter is needed to conform with the XML
025: * spec. This class also optimizes reading some common encodings by
026: * providing low-overhead unsynchronized Reader support.
027: * <p/>
028: * <P> Note that the autodetection facility should be used only on
029: * data streams which have an unknown character encoding. For example,
030: * it should never be used on MIME text/xml entities.
031: * <p/>
032: * <P> Note that XML processors are only required to support UTF-8 and
033: * UTF-16 character encodings. Autodetection permits the underlying Java
034: * implementation to provide support for many other encodings, such as
035: * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
036: *
037: * @author David Brownell
038: * @author Janet Koenig
039: * @version 1.3 00/02/24
040: */
041: // package private
// package private
final class XmlReader extends Reader {
    /** Upper bound on the bytes buffered and pushed back while the
     *  XML/text declaration is being sniffed for an encoding name. */
    private static final int MAXPUSHBACK = 512;

    private Reader in;                  // delegate reader; null once closed
    private String assignedEncoding;    // IETF name reported by getEncoding()
    private boolean closed;             // set by close(); guards the read paths

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

065: /**
066: * Constructs the reader from an input stream, autodetecting
067: * the encoding to use according to the heuristic specified
068: * in the XML 1.0 recommendation.
069: *
070: * @param in the input stream from which the reader is constructed
071: * @throws IOException on error, such as unrecognized encoding
072: */
073: public static Reader createReader(InputStream in)
074: throws IOException {
075: return new XmlReader(in);
076: }
077:
078: /**
079: * Creates a reader supporting the given encoding, mapping
080: * from standard encoding names to ones that understood by
081: * Java where necessary.
082: *
083: * @param in the input stream from which the reader is constructed
084: * @param encoding the IETF standard name of the encoding to use;
085: * if null, autodetection is used.
086: * @throws IOException on error, including unrecognized encoding
087: */
088: public static Reader createReader(InputStream in, String encoding)
089: throws IOException {
090: if (encoding == null)
091: return new XmlReader(in);
092: if ("UTF-8".equalsIgnoreCase(encoding)
093: || "UTF8".equalsIgnoreCase(encoding))
094: return new Utf8Reader(in);
095: if ("US-ASCII".equalsIgnoreCase(encoding)
096: || "ASCII".equalsIgnoreCase(encoding))
097: return new AsciiReader(in);
098: if ("ISO-8859-1".equalsIgnoreCase(encoding)
099: // plus numerous aliases ...
100: )
101: return new Iso8859_1Reader(in);
102:
103: //
104: // What we really want is an administerable resource mapping
105: // encoding names/aliases to classnames. For example a property
106: // file resource, "readers/mapping.props", holding and a set
107: // of readers in that (sub)package... defaulting to this call
108: // only if no better choice is available.
109: //
110: return new InputStreamReader(in, std2java(encoding));
111: }
112:
113: //
114: // JDK doesn't know all of the standard encoding names, and
115: // in particular none of the EBCDIC ones IANA defines (and
116: // which IBM encourages).
117: //
118: static private final Hashtable charsets = new Hashtable(31);
119:
120: static {
121: charsets.put("UTF-16", "Unicode");
122: charsets.put("ISO-10646-UCS-2", "Unicode");
123:
124: // NOTE: no support for ISO-10646-UCS-4 yet.
125:
126: charsets.put("EBCDIC-CP-US", "cp037");
127: charsets.put("EBCDIC-CP-CA", "cp037");
128: charsets.put("EBCDIC-CP-NL", "cp037");
129: charsets.put("EBCDIC-CP-WT", "cp037");
130:
131: charsets.put("EBCDIC-CP-DK", "cp277");
132: charsets.put("EBCDIC-CP-NO", "cp277");
133: charsets.put("EBCDIC-CP-FI", "cp278");
134: charsets.put("EBCDIC-CP-SE", "cp278");
135:
136: charsets.put("EBCDIC-CP-IT", "cp280");
137: charsets.put("EBCDIC-CP-ES", "cp284");
138: charsets.put("EBCDIC-CP-GB", "cp285");
139: charsets.put("EBCDIC-CP-FR", "cp297");
140:
141: charsets.put("EBCDIC-CP-AR1", "cp420");
142: charsets.put("EBCDIC-CP-HE", "cp424");
143: charsets.put("EBCDIC-CP-BE", "cp500");
144: charsets.put("EBCDIC-CP-CH", "cp500");
145:
146: charsets.put("EBCDIC-CP-ROECE", "cp870");
147: charsets.put("EBCDIC-CP-YU", "cp870");
148: charsets.put("EBCDIC-CP-IS", "cp871");
149: charsets.put("EBCDIC-CP-AR2", "cp918");
150:
151: // IANA also defines two that JDK 1.2 doesn't handle:
152: // EBCDIC-CP-GR --> CP423
153: // EBCDIC-CP-TR --> CP905
154: }
155:
156: // returns an encoding name supported by JDK >= 1.1.6
157: // for some cases required by the XML spec
158: private static String std2java(String encoding) {
159: String temp = encoding.toUpperCase();
160: temp = (String) charsets.get(temp);
161: return temp != null ? temp : encoding;
162: }
163:
164: /**
165: * Returns the standard name of the encoding in use
166: */
167: public String getEncoding() {
168: return assignedEncoding;
169: }
170:
    /**
     * Autodetecting constructor: peeks at up to the first four bytes
     * of the stream (the signature table from Appendix F of the XML
     * 1.0 recommendation), then delegates to a reader for whichever
     * encoding was detected.  The peeked bytes are pushed back so
     * that the delegate sees the full document, including any byte
     * order mark.
     *
     * NOTE(review): when the caller supplies its own
     * PushbackInputStream, this assumes its pushback capacity is at
     * least MAXPUSHBACK if an encoding declaration must be parsed --
     * TODO confirm against callers.
     */
    private XmlReader(InputStream stream) throws IOException {
        super (stream);

        PushbackInputStream pb;
        byte buf[];
        int len;

        // Reuse an existing pushback stream where possible; otherwise
        // wrap, with room to push back the whole sniffed prefix.
        if (stream instanceof PushbackInputStream)
            pb = (PushbackInputStream) stream;
        else
            pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        buf = new byte[4];
        len = pb.read(buf);
        if (len > 0)
            pb.unread(buf, 0, len);   // un-peek: delegate must see these bytes

        // Fewer than four bytes is too short for any signature below,
        // so such streams fall through to the UTF-8 default.
        if (len == 4)
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                // ("<?" encoded big-endian with no byte order mark)
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    // ... so parse the declaration to find out which.
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page: '<?xm' in
            // EBCDIC; the declaration says which code page exactly.
            case 0x4c:
                if (buf[1] == 0x6f && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian (fe ff byte order mark)
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian (ff fe byte order mark)
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }
268:
    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Any malformed or missing encoding pseudo-attribute makes this
     * fall back to UTF-8.  On entry, "<?xm" has already been consumed
     * from the peek buffer (but pushed back onto pb).
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer[] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        pb.unread(buffer, 0, len);   // the real reader must re-see it all
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4,
                len), encoding);

        //
        // Next must be "l" (completing "<?xml") else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //     S version="..."     [or single quotes]
        // bit and get any subsequent
        //     S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        // This is a tiny hand-rolled state machine over
        //     key = "value"
        // pairs: keyBuf != null means a key is being accumulated,
        // sawEq means '=' was seen, quoteChar holds the open quote
        // (0 while outside a quoted value).
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl: for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            // (the first character after "<?xml" must be whitespace,
            // since the whitespace branch above comes first)
            if (i == 0)
                break;

            // terminate the loop ASAP: "?>" closes the declaration
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    // first character of a new key; reuse 'buf' as
                    // the key accumulator
                    if (Character.isWhitespace((char) c))
                        continue;
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start accumulating the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    // closing quote: value complete
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
                        // validate; any illegal name falls through to
                        // the UTF-8 default below
                        for (i = 0; i < assignedEncoding.length(); i++) {
                            c = assignedEncoding.charAt(i);
                            if ((c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z'))
                                continue;
                            if (i == 0)
                                break XmlDecl;
                            if (i > 0
                                    && (c == '-'
                                    || (c >= '0' && c <= '9')
                                    || c == '.' || c == '_'))
                                continue;
                            // map illegal names to UTF-8 default
                            break XmlDecl;
                        }

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        // some other pseudo-attribute (version,
                        // standalone); discard and keep scanning
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }
408:
    /**
     * Records the chosen encoding name and builds the delegate reader
     * for it.  Note that the name is assigned before createReader()
     * can throw, so getEncoding() reflects the attempted encoding
     * even when construction of the delegate fails.
     */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }
414:
415: /**
416: * Reads the number of characters read into the buffer, or -1 on EOF.
417: */
418: public int read(char buf[], int off, int len) throws IOException {
419: int val;
420:
421: if (closed)
422: return -1; // throw new IOException ("closed");
423: val = in.read(buf, off, len);
424: if (val == -1)
425: close();
426: return val;
427: }
428:
429: /**
430: * Reads a single character.
431: */
432: public int read() throws IOException {
433: int val;
434:
435: if (closed)
436: throw new IOException("closed");
437: val = in.read();
438: if (val == -1)
439: close();
440: return val;
441: }
442:
443: /**
444: * Returns true iff the reader supports mark/reset.
445: */
446: public boolean markSupported() {
447: return in == null ? false : in.markSupported();
448: }
449:
450: /**
451: * Sets a mark allowing a limited number of characters to
452: * be "peeked", by reading and then resetting.
453: *
454: * @param value how many characters may be "peeked".
455: */
456: public void mark(int value) throws IOException {
457: if (in != null)
458: in.mark(value);
459: }
460:
461: /**
462: * Resets the current position to the last marked position.
463: */
464: public void reset() throws IOException {
465: if (in != null)
466: in.reset();
467: }
468:
469: /**
470: * Skips a specified number of characters.
471: */
472: public long skip(long value) throws IOException {
473: return in == null ? 0 : in.skip(value);
474: }
475:
476: /**
477: * Returns true iff input characters are known to be ready.
478: */
479: public boolean ready() throws IOException {
480: return in == null ? false : in.ready();
481: }
482:
483: /**
484: * Closes the reader.
485: */
486: public void close() throws IOException {
487: if (closed)
488: return;
489: in.close();
490: in = null;
491: closed = true;
492: }
493:
    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //

    /**
     * Shared plumbing for the hand-optimized decoders: a byte buffer
     * refilled from the underlying stream, with {@code start} (index
     * of the next unconsumed byte) and {@code finish} (one past the
     * last valid byte) maintained by the subclasses' read() methods.
     */
    static abstract class BaseReader extends Reader {
        protected InputStream instream;   // null once closed
        protected byte buffer[];          // raw bytes awaiting decode
        protected int start, finish;      // valid window: [start, finish)

        BaseReader(InputStream stream) {
            super (stream);   // the stream doubles as the lock object

            instream = stream;
            buffer = new byte[8192];   // > 2 KB; see buffering note above
        }

        public boolean ready() throws IOException {
            // NOTE(review): when closed (instream == null) this reports
            // true -- the next read() then returns -1 without blocking.
            // Looks deliberate, but the Reader contract would normally
            // throw on a closed stream; confirm before changing.
            return instream == null || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;     // free the buffer eagerly
                instream = null;   // doubles as the "closed" flag
            }
        }
    }
529:
    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of a surrogate pair, carried over when the caller's
        // array had no room left for it on the previous read()
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super (stream);
        }

        /**
         * Decodes UTF-8 bytes into up to len characters of buf,
         * starting at offset.  Handles characters split across buffer
         * refills, and supplementary characters (emitted as UTF-16
         * surrogate pairs, with the low surrogate carried in nextChar
         * if it doesn't fit this call).
         *
         * @return the number of characters produced, or -1 at EOF
         * @throws CharConversionException on malformed or
         *         out-of-range UTF-8 sequences
         */
        public int read(char buf[], int offset, int len)
                throws IOException {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;   // already closed: EOF
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this .close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- check offsets optimistically,
                // ditto the "10xx xxxx" format for subsequent bytes
                // ('off' walks ahead of 'start'; 'start' is only
                // advanced once the whole character is validated)
                //
                int off = start;

                try {
                    // 2 bytes: 110x xxxx lead
                    if ((buffer[off] & 0x0E0) == 0x0C0) {
                        c = (buffer[off++] & 0x1f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0080 <= c <= 0x07ff

                        // 3 bytes: 1110 xxxx lead
                    } else if ((buffer[off] & 0x0F0) == 0x0E0) {
                        c = (buffer[off++] & 0x0f) << 12;
                        c += (buffer[off++] & 0x3f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0800 <= c <= 0xffff

                        // 4 bytes: 1111 0xxx lead
                    } else if ((buffer[off] & 0x0f8) == 0x0F0) {
                        c = (buffer[off++] & 0x07) << 18;
                        c += (buffer[off++] & 0x3f) << 12;
                        c += (buffer[off++] & 0x3f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0001 0000 <= c <= 0x001f ffff

                        // Unicode supports c <= 0x0010 ffff ...
                        if (c > 0x0010ffff)
                            throw new CharConversionException(
                                    "UTF-8 encoding of character 0x00"
                                    + Integer.toHexString(c)
                                    + " can't be converted to Unicode.");

                        // Convert UCS-4 char to surrogate pair (UTF-16)
                        c -= 0x10000;
                        nextChar = (char) (0xDC00 + (c & 0x03ff));
                        c = 0xD800 + (c >> 10);

                        // 5 and 6 byte versions are XML WF errors, but
                        // typically come from mislabeled encodings
                    } else
                        throw new CharConversionException(
                                "Unconvertible UTF-8 character"
                                + " beginning with 0x"
                                + Integer.toHexString(buffer[start] & 0xff));

                } catch (ArrayIndexOutOfBoundsException e) {
                    // ran off the end of 'buffer' mid-character;
                    // c = 0 marks it for the partial-char path below
                    // (off > length && length >= buffer.length)
                    c = 0;
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.
                //
                if (off > finish) {
                    System.arraycopy(buffer, start, buffer, 0, finish
                            - start);
                    finish -= start;
                    start = 0;
                    off = instream.read(buffer, finish, buffer.length
                            - finish);
                    if (off < 0) {
                        // EOF in the middle of a multibyte character
                        this .close();
                        throw new CharConversionException(
                                "Partial UTF-8 char");
                    }
                    finish += off;
                    continue;   // retry decoding the same character
                }

                //
                // check the format of the non-initial bytes
                // (each must match 10xx xxxx); this also advances
                // 'start' past the decoded character
                //
                for (start++; start < off; start++) {
                    if ((buffer[start] & 0xC0) != 0x80) {
                        this .close();
                        throw new CharConversionException(
                                "Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }
698:
699: //
700: // We want ASCII and ISO-8859 Readers since they're the most common
701: // encodings in the US and Europe, and we don't want performance
702: // regressions for them. They're also easy to implement efficiently,
703: // since they're bitmask subsets of UNICODE.
704: //
705: // XXX haven't benchmarked these readers vs what we get out of JDK.
706: //
707: static final class AsciiReader extends BaseReader {
708: AsciiReader(InputStream in) {
709: super (in);
710: }
711:
712: public int read(char buf[], int offset, int len)
713: throws IOException {
714: int i, c;
715:
716: if (instream == null)
717: return -1;
718:
719: for (i = 0; i < len; i++) {
720: if (start >= finish) {
721: start = 0;
722: finish = instream.read(buffer, 0, buffer.length);
723: if (finish <= 0) {
724: if (finish <= 0)
725: this .close();
726: break;
727: }
728: }
729: c = buffer[start++];
730: if ((c & 0x80) != 0)
731: throw new CharConversionException(
732: "Illegal ASCII character, 0x"
733: + Integer.toHexString(c & 0xff));
734: buf[offset + i] = (char) c;
735: }
736: if (i == 0 && finish <= 0)
737: return -1;
738: return i;
739: }
740: }
741:
742: static final class Iso8859_1Reader extends BaseReader {
743: Iso8859_1Reader(InputStream in) {
744: super (in);
745: }
746:
747: public int read(char buf[], int offset, int len)
748: throws IOException {
749: int i;
750:
751: if (instream == null)
752: return -1;
753:
754: for (i = 0; i < len; i++) {
755: if (start >= finish) {
756: start = 0;
757: finish = instream.read(buffer, 0, buffer.length);
758: if (finish <= 0) {
759: if (finish <= 0)
760: this .close();
761: break;
762: }
763: }
764: buf[offset + i] = (char) (0x0ff & buffer[start++]);
765: }
766: if (i == 0 && finish <= 0)
767: return -1;
768: return i;
769: }
770: }
771: }
|