001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.xinclude;
019:
020: import java.io.BufferedInputStream;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.io.InputStreamReader;
024: import java.io.Reader;
025: import java.net.HttpURLConnection;
026: import java.net.URL;
027: import java.net.URLConnection;
028: import java.util.Iterator;
029: import java.util.Locale;
030: import java.util.Map;
031:
032: import org.apache.xerces.impl.XMLEntityManager;
033: import org.apache.xerces.impl.XMLErrorReporter;
034: import org.apache.xerces.impl.io.ASCIIReader;
035: import org.apache.xerces.impl.io.Latin1Reader;
036: import org.apache.xerces.impl.io.UTF8Reader;
037: import org.apache.xerces.impl.msg.XMLMessageFormatter;
038: import org.apache.xerces.util.EncodingMap;
039: import org.apache.xerces.util.HTTPInputSource;
040: import org.apache.xerces.util.MessageFormatter;
041: import org.apache.xerces.util.XMLChar;
042: import org.apache.xerces.xni.XMLString;
043: import org.apache.xerces.xni.parser.XMLInputSource;
044:
045: /**
046: * This class is used for reading resources requested in <include> elements,
047: * when the parse attribute of the <include> element is "text". Using this
048: * class will open the location, detect the encoding, and discard the byte order
049: * mark, if applicable.
050: *
051: * REVISIT:
052: * Much of the code in this class is taken from XMLEntityManager. It would be nice
053: * if this code could be shared in some way. However, since XMLEntityManager is used
054: * for reading files as XML, and this needs to read files as text, there would need
055: * to be some refactoring done.
056: *
057: * @author Michael Glavassevich, IBM
058: * @author Peter McCracken, IBM
059: * @author Ankit Pasricha, IBM
060: * @author Arun Yadav, Sun Microsystems Inc.
061: *
062: * @version $Id: XIncludeTextReader.java 572046 2007-09-02 17:33:57Z mrglavas $
063: *
064: * @see XIncludeHandler
065: */
066: public class XIncludeTextReader {
067:
068: private Reader fReader;
069: private final XIncludeHandler fHandler;
070: private XMLInputSource fSource;
071: private XMLErrorReporter fErrorReporter;
072: private XMLString fTempString = new XMLString();
073:
074: /**
075: * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
076: *
077: * @param source The XMLInputSource to use.
078: * @param handler The XIncludeHandler to use.
079: * @param bufferSize The size of this text reader's buffer.
080: */
081: public XIncludeTextReader(XMLInputSource source,
082: XIncludeHandler handler, int bufferSize) throws IOException {
083: fHandler = handler;
084: fSource = source;
085: fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
086: }
087:
088: /**
089: * Sets the XMLErrorReporter used for reporting errors while
090: * reading the text include.
091: *
092: * @param errorReporter the XMLErrorReporter to be used for
093: * reporting errors.
094: */
095: public void setErrorReporter(XMLErrorReporter errorReporter) {
096: fErrorReporter = errorReporter;
097: }
098:
099: /**
100: * Return the Reader for given XMLInputSource.
101: *
102: * @param source The XMLInputSource to use.
103: */
104: protected Reader getReader(XMLInputSource source)
105: throws IOException {
106: if (source.getCharacterStream() != null) {
107: return source.getCharacterStream();
108: } else {
109: InputStream stream = null;
110:
111: String encoding = source.getEncoding();
112: if (encoding == null) {
113: encoding = "UTF-8";
114: }
115: if (source.getByteStream() != null) {
116: stream = source.getByteStream();
117: // Wrap the InputStream so that it is possible to rewind it.
118: if (!(stream instanceof BufferedInputStream)) {
119: stream = new BufferedInputStream(stream,
120: fTempString.ch.length);
121: }
122: } else {
123: String expandedSystemId = XMLEntityManager
124: .expandSystemId(source.getSystemId(), source
125: .getBaseSystemId(), false);
126:
127: URL url = new URL(expandedSystemId);
128: URLConnection urlCon = url.openConnection();
129:
130: // If this is an HTTP connection attach any request properties to the request.
131: if (urlCon instanceof HttpURLConnection
132: && source instanceof HTTPInputSource) {
133: final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
134: final HTTPInputSource httpInputSource = (HTTPInputSource) source;
135:
136: // set request properties
137: Iterator propIter = httpInputSource
138: .getHTTPRequestProperties();
139: while (propIter.hasNext()) {
140: Map.Entry entry = (Map.Entry) propIter.next();
141: urlConnection.setRequestProperty((String) entry
142: .getKey(), (String) entry.getValue());
143: }
144:
145: // set preference for redirection
146: boolean followRedirects = httpInputSource
147: .getFollowHTTPRedirects();
148: if (!followRedirects) {
149: XMLEntityManager.setInstanceFollowRedirects(
150: urlConnection, followRedirects);
151: }
152: }
153:
154: // Wrap the InputStream so that it is possible to rewind it.
155: stream = new BufferedInputStream(urlCon
156: .getInputStream());
157:
158: // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
159: String rawContentType = urlCon.getContentType();
160:
161: // text/xml and application/xml offer only one optional parameter
162: int index = (rawContentType != null) ? rawContentType
163: .indexOf(';') : -1;
164:
165: String contentType = null;
166: String charset = null;
167: if (index != -1) {
168: // this should be something like "text/xml"
169: contentType = rawContentType.substring(0, index)
170: .trim();
171:
172: // this should be something like "charset=UTF-8", but we want to
173: // strip it down to just "UTF-8"
174: charset = rawContentType.substring(index + 1)
175: .trim();
176: if (charset.startsWith("charset=")) {
177: // 8 is the length of "charset="
178: charset = charset.substring(8).trim();
179: // strip quotes, if present
180: if ((charset.charAt(0) == '"' && charset
181: .charAt(charset.length() - 1) == '"')
182: || (charset.charAt(0) == '\'' && charset
183: .charAt(charset.length() - 1) == '\'')) {
184: charset = charset.substring(1, charset
185: .length() - 1);
186: }
187: } else {
188: charset = null;
189: }
190: } else {
191: contentType = rawContentType.trim();
192: }
193:
194: String detectedEncoding = null;
195: /** The encoding of such a resource is determined by:
196: 1 external encoding information, if available, otherwise
197: -- the most common type of external information is the "charset" parameter of a MIME package
198: 2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
199: 3 the value of the encoding attribute if one exists, otherwise
200: 4 UTF-8.
201: **/
202: if (contentType.equals("text/xml")) {
203: if (charset != null) {
204: detectedEncoding = charset;
205: } else {
206: // see RFC2376 or 3023, section 3.1
207: detectedEncoding = "US-ASCII";
208: }
209: } else if (contentType.equals("application/xml")) {
210: if (charset != null) {
211: detectedEncoding = charset;
212: } else {
213: // see RFC2376 or 3023, section 3.2
214: detectedEncoding = getEncodingName(stream);
215: }
216: } else if (contentType.endsWith("+xml")) {
217: detectedEncoding = getEncodingName(stream);
218: }
219:
220: if (detectedEncoding != null) {
221: encoding = detectedEncoding;
222: }
223: // else 3 or 4.
224: }
225:
226: encoding = encoding.toUpperCase(Locale.ENGLISH);
227:
228: // eat the Byte Order Mark
229: encoding = consumeBOM(stream, encoding);
230:
231: // If the document is UTF-8 or US-ASCII use
232: // the Xerces readers for these encodings. For
233: // US-ASCII consult the encoding map since
234: // this encoding has many aliases.
235: if (encoding.equals("UTF-8")) {
236: return new UTF8Reader(
237: stream,
238: fTempString.ch.length,
239: fErrorReporter
240: .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
241: fErrorReporter.getLocale());
242: }
243:
244: // Try to use a Java reader.
245: String javaEncoding = EncodingMap
246: .getIANA2JavaMapping(encoding);
247:
248: // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
249: // The XIncludeHandler will report this as a ResourceError and then will
250: // attempt to include a fallback if there is one.
251: if (javaEncoding == null) {
252: MessageFormatter aFormatter = fErrorReporter
253: .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
254: Locale aLocale = fErrorReporter.getLocale();
255: throw new IOException(aFormatter.formatMessage(aLocale,
256: "EncodingDeclInvalid",
257: new Object[] { encoding }));
258: } else if (javaEncoding.equals("ASCII")) {
259: return new ASCIIReader(
260: stream,
261: fTempString.ch.length,
262: fErrorReporter
263: .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
264: fErrorReporter.getLocale());
265: } else if (javaEncoding.equals("ISO8859_1")) {
266: return new Latin1Reader(stream, fTempString.ch.length);
267: }
268: return new InputStreamReader(stream, javaEncoding);
269: }
270: }
271:
272: /**
273: * XMLEntityManager cares about endian-ness, since it creates its own optimized
274: * readers. Since we're just using generic Java readers for now, we're not caring
275: * about endian-ness. If this changes, even more code needs to be copied from
276: * XMLEntity manager. -- PJM
277: */
278: protected String getEncodingName(InputStream stream)
279: throws IOException {
280: final byte[] b4 = new byte[4];
281: String encoding = null;
282:
283: // this has the potential to throw an exception
284: // it will be fixed when we ensure the stream is rewindable (see note above)
285: stream.mark(4);
286: int count = stream.read(b4, 0, 4);
287: stream.reset();
288: if (count == 4) {
289: encoding = getEncodingName(b4);
290: }
291:
292: return encoding;
293: }
294:
295: /**
296: * Removes the byte order mark from the stream, if
297: * it exists and returns the encoding name.
298: *
299: * @param stream
300: * @param encoding
301: * @throws IOException
302: */
303: protected String consumeBOM(InputStream stream, String encoding)
304: throws IOException {
305:
306: byte[] b = new byte[3];
307: int count = 0;
308: stream.mark(3);
309: if (encoding.equals("UTF-8")) {
310: count = stream.read(b, 0, 3);
311: if (count == 3) {
312: final int b0 = b[0] & 0xFF;
313: final int b1 = b[1] & 0xFF;
314: final int b2 = b[2] & 0xFF;
315: if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
316: // First three bytes are not BOM, so reset.
317: stream.reset();
318: }
319: } else {
320: stream.reset();
321: }
322: } else if (encoding.startsWith("UTF-16")) {
323: count = stream.read(b, 0, 2);
324: if (count == 2) {
325: final int b0 = b[0] & 0xFF;
326: final int b1 = b[1] & 0xFF;
327: if (b0 == 0xFE && b1 == 0xFF) {
328: return "UTF-16BE";
329: } else if (b0 == 0xFF && b1 == 0xFE) {
330: return "UTF-16LE";
331: }
332: }
333: // First two bytes are not BOM, so reset.
334: stream.reset();
335: }
336: // We could do UTF-32, but since the getEncodingName() doesn't support that
337: // we won't support it here.
338: // To implement UTF-32, look for: 00 00 FE FF for big-endian
339: // or FF FE 00 00 for little-endian
340: return encoding;
341: }
342:
343: /**
344: * REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.
345: * Is there any way we can share the code, without having it implemented twice?
346: * I think we should make it public and static in XMLEntityManager. --PJM
347: *
348: * Returns the IANA encoding name that is auto-detected from
349: * the bytes specified, with the endian-ness of that encoding where appropriate.
350: *
351: * @param b4 The first four bytes of the input.
352: * @return the encoding name, or null if no encoding could be detected
353: */
354: protected String getEncodingName(byte[] b4) {
355:
356: // UTF-16, with BOM
357: int b0 = b4[0] & 0xFF;
358: int b1 = b4[1] & 0xFF;
359: if (b0 == 0xFE && b1 == 0xFF) {
360: // UTF-16, big-endian
361: return "UTF-16BE";
362: }
363: if (b0 == 0xFF && b1 == 0xFE) {
364: // UTF-16, little-endian
365: return "UTF-16LE";
366: }
367:
368: // UTF-8 with a BOM
369: int b2 = b4[2] & 0xFF;
370: if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
371: return "UTF-8";
372: }
373:
374: // other encodings
375: int b3 = b4[3] & 0xFF;
376: if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
377: // UCS-4, big endian (1234)
378: return "ISO-10646-UCS-4";
379: }
380: if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
381: // UCS-4, little endian (4321)
382: return "ISO-10646-UCS-4";
383: }
384: if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
385: // UCS-4, unusual octet order (2143)
386: return "ISO-10646-UCS-4";
387: }
388: if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
389: // UCS-4, unusual octect order (3412)
390: return "ISO-10646-UCS-4";
391: }
392: if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
393: // UTF-16, big-endian, no BOM
394: // (or could turn out to be UCS-2...
395: return "UTF-16BE";
396: }
397: if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
398: // UTF-16, little-endian, no BOM
399: // (or could turn out to be UCS-2...
400: return "UTF-16LE";
401: }
402: if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
403: // EBCDIC
404: // a la xerces1, return CP037 instead of EBCDIC here
405: return "CP037";
406: }
407:
408: // this signals us to use the value from the encoding attribute
409: return null;
410:
411: } // getEncodingName(byte[]):Object[]
412:
413: /**
414: * Read the input stream as text, and pass the text on to the XIncludeHandler
415: * using calls to characters(). This will read all of the text it can from the
416: * resource.
417: *
418: * @throws IOException
419: */
420: public void parse() throws IOException {
421: fReader = getReader(fSource);
422: fSource = null;
423: int readSize = fReader.read(fTempString.ch, 0,
424: fTempString.ch.length - 1);
425: fHandler.fHasIncludeReportedContent = true;
426: while (readSize != -1) {
427: for (int i = 0; i < readSize; ++i) {
428: char ch = fTempString.ch[i];
429: if (!isValid(ch)) {
430: if (XMLChar.isHighSurrogate(ch)) {
431: int ch2;
432: // retrieve next character
433: if (++i < readSize) {
434: ch2 = fTempString.ch[i];
435: }
436: // handle rare boundary case
437: else {
438: ch2 = fReader.read();
439: if (ch2 != -1) {
440: fTempString.ch[readSize++] = (char) ch2;
441: }
442: }
443: if (XMLChar.isLowSurrogate(ch2)) {
444: // convert surrogates to a supplemental character
445: int sup = XMLChar.supplemental(ch,
446: (char) ch2);
447: if (!isValid(sup)) {
448: fErrorReporter
449: .reportError(
450: XMLMessageFormatter.XML_DOMAIN,
451: "InvalidCharInContent",
452: new Object[] { Integer
453: .toString(sup,
454: 16) },
455: XMLErrorReporter.SEVERITY_FATAL_ERROR);
456: }
457: } else {
458: fErrorReporter
459: .reportError(
460: XMLMessageFormatter.XML_DOMAIN,
461: "InvalidCharInContent",
462: new Object[] { Integer
463: .toString(ch2, 16) },
464: XMLErrorReporter.SEVERITY_FATAL_ERROR);
465: }
466: } else {
467: fErrorReporter
468: .reportError(
469: XMLMessageFormatter.XML_DOMAIN,
470: "InvalidCharInContent",
471: new Object[] { Integer
472: .toString(ch, 16) },
473: XMLErrorReporter.SEVERITY_FATAL_ERROR);
474: }
475: }
476: }
477: if (fHandler != null && readSize > 0) {
478: fTempString.offset = 0;
479: fTempString.length = readSize;
480: fHandler.characters(fTempString, fHandler
481: .modifyAugmentations(null, true));
482: }
483: readSize = fReader.read(fTempString.ch, 0,
484: fTempString.ch.length - 1);
485: }
486:
487: }
488:
489: /**
490: * Sets the input source on this text reader.
491: *
492: * @param source The XMLInputSource to use.
493: */
494: public void setInputSource(XMLInputSource source) {
495: fSource = source;
496: }
497:
498: /**
499: * Closes the stream. Call this after parse(), or when there is no longer any need
500: * for this object.
501: *
502: * @throws IOException
503: */
504: public void close() throws IOException {
505: if (fReader != null) {
506: fReader.close();
507: fReader = null;
508: }
509: }
510:
511: /**
512: * Returns true if the specified character is a valid XML character
513: * as per the rules of XML 1.0.
514: *
515: * @param ch The character to check.
516: */
517: protected boolean isValid(int ch) {
518: return XMLChar.isValid(ch);
519: }
520:
521: /**
522: * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
523: * by the reader at a time and passed to the handler
524: *
525: * @param bufferSize The size of the buffer desired
526: */
527: protected void setBufferSize(int bufferSize) {
528: if (fTempString.ch.length != ++bufferSize) {
529: fTempString.ch = new char[bufferSize];
530: }
531: }
532:
533: }
|