Source Code Cross Referenced for XIncludeTextReader.java in » XML » xerces-2_9_1 » org » apache » xerces » xinclude » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » XML » xerces 2_9_1 » org.apache.xerces.xinclude
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         * Licensed to the Apache Software Foundation (ASF) under one or more
003:         * contributor license agreements.  See the NOTICE file distributed with
004:         * this work for additional information regarding copyright ownership.
005:         * The ASF licenses this file to You under the Apache License, Version 2.0
006:         * (the "License"); you may not use this file except in compliance with
007:         * the License.  You may obtain a copy of the License at
008:         * 
009:         *      http://www.apache.org/licenses/LICENSE-2.0
010:         * 
011:         * Unless required by applicable law or agreed to in writing, software
012:         * distributed under the License is distributed on an "AS IS" BASIS,
013:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014:         * See the License for the specific language governing permissions and
015:         * limitations under the License.
016:         */
017:
018:        package org.apache.xerces.xinclude;
019:
020:        import java.io.BufferedInputStream;
021:        import java.io.IOException;
022:        import java.io.InputStream;
023:        import java.io.InputStreamReader;
024:        import java.io.Reader;
025:        import java.net.HttpURLConnection;
026:        import java.net.URL;
027:        import java.net.URLConnection;
028:        import java.util.Iterator;
029:        import java.util.Locale;
030:        import java.util.Map;
031:
032:        import org.apache.xerces.impl.XMLEntityManager;
033:        import org.apache.xerces.impl.XMLErrorReporter;
034:        import org.apache.xerces.impl.io.ASCIIReader;
035:        import org.apache.xerces.impl.io.Latin1Reader;
036:        import org.apache.xerces.impl.io.UTF8Reader;
037:        import org.apache.xerces.impl.msg.XMLMessageFormatter;
038:        import org.apache.xerces.util.EncodingMap;
039:        import org.apache.xerces.util.HTTPInputSource;
040:        import org.apache.xerces.util.MessageFormatter;
041:        import org.apache.xerces.util.XMLChar;
042:        import org.apache.xerces.xni.XMLString;
043:        import org.apache.xerces.xni.parser.XMLInputSource;
044:
045:        /**
046:         * This class is used for reading resources requested in &lt;include&gt; elements,
047:         * when the parse attribute of the &lt;include&gt; element is "text".  Using this
048:         * class will open the location, detect the encoding, and discard the byte order
049:         * mark, if applicable.
050:         * 
051:         * REVISIT:
052:         * Much of the code in this class is taken from XMLEntityManager.  It would be nice
053:         * if this code could be shared in some way.  However, since XMLEntityManager is used
054:         * for reading files as XML, and this needs to read files as text, there would need
055:         * to be some refactoring done.
056:         * 
057:         * @author Michael Glavassevich, IBM
058:         * @author Peter McCracken, IBM
059:         * @author Ankit Pasricha, IBM
060:         * @author Arun Yadav, Sun Microsystems Inc.
061:         *
062:         * @version $Id: XIncludeTextReader.java 572046 2007-09-02 17:33:57Z mrglavas $
063:         *
064:         * @see XIncludeHandler
065:         */
066:        public class XIncludeTextReader {
067:
068:            private Reader fReader;
069:            private final XIncludeHandler fHandler;
070:            private XMLInputSource fSource;
071:            private XMLErrorReporter fErrorReporter;
072:            private XMLString fTempString = new XMLString();
073:
074:            /**
075:             * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
076:             *
077:             * @param source The XMLInputSource to use.
078:             * @param handler The XIncludeHandler to use.
079:             * @param bufferSize The size of this text reader's buffer.
080:             */
081:            public XIncludeTextReader(XMLInputSource source,
082:                    XIncludeHandler handler, int bufferSize) throws IOException {
083:                fHandler = handler;
084:                fSource = source;
085:                fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
086:            }
087:
088:            /**
089:             * Sets the XMLErrorReporter used for reporting errors while
090:             * reading the text include.
091:             *
092:             * @param errorReporter the XMLErrorReporter to be used for
093:             * reporting errors.
094:             */
095:            public void setErrorReporter(XMLErrorReporter errorReporter) {
096:                fErrorReporter = errorReporter;
097:            }
098:
099:            /**
100:             * Return the Reader for given XMLInputSource.
101:             *
102:             * @param source The XMLInputSource to use.
103:             */
104:            protected Reader getReader(XMLInputSource source)
105:                    throws IOException {
106:                if (source.getCharacterStream() != null) {
107:                    return source.getCharacterStream();
108:                } else {
109:                    InputStream stream = null;
110:
111:                    String encoding = source.getEncoding();
112:                    if (encoding == null) {
113:                        encoding = "UTF-8";
114:                    }
115:                    if (source.getByteStream() != null) {
116:                        stream = source.getByteStream();
117:                        // Wrap the InputStream so that it is possible to rewind it.
118:                        if (!(stream instanceof  BufferedInputStream)) {
119:                            stream = new BufferedInputStream(stream,
120:                                    fTempString.ch.length);
121:                        }
122:                    } else {
123:                        String expandedSystemId = XMLEntityManager
124:                                .expandSystemId(source.getSystemId(), source
125:                                        .getBaseSystemId(), false);
126:
127:                        URL url = new URL(expandedSystemId);
128:                        URLConnection urlCon = url.openConnection();
129:
130:                        // If this is an HTTP connection attach any request properties to the request.
131:                        if (urlCon instanceof  HttpURLConnection
132:                                && source instanceof  HTTPInputSource) {
133:                            final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
134:                            final HTTPInputSource httpInputSource = (HTTPInputSource) source;
135:
136:                            // set request properties
137:                            Iterator propIter = httpInputSource
138:                                    .getHTTPRequestProperties();
139:                            while (propIter.hasNext()) {
140:                                Map.Entry entry = (Map.Entry) propIter.next();
141:                                urlConnection.setRequestProperty((String) entry
142:                                        .getKey(), (String) entry.getValue());
143:                            }
144:
145:                            // set preference for redirection
146:                            boolean followRedirects = httpInputSource
147:                                    .getFollowHTTPRedirects();
148:                            if (!followRedirects) {
149:                                XMLEntityManager.setInstanceFollowRedirects(
150:                                        urlConnection, followRedirects);
151:                            }
152:                        }
153:
154:                        // Wrap the InputStream so that it is possible to rewind it.
155:                        stream = new BufferedInputStream(urlCon
156:                                .getInputStream());
157:
158:                        // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
159:                        String rawContentType = urlCon.getContentType();
160:
161:                        // text/xml and application/xml offer only one optional parameter
162:                        int index = (rawContentType != null) ? rawContentType
163:                                .indexOf(';') : -1;
164:
165:                        String contentType = null;
166:                        String charset = null;
167:                        if (index != -1) {
168:                            // this should be something like "text/xml"
169:                            contentType = rawContentType.substring(0, index)
170:                                    .trim();
171:
172:                            // this should be something like "charset=UTF-8", but we want to
173:                            // strip it down to just "UTF-8"
174:                            charset = rawContentType.substring(index + 1)
175:                                    .trim();
176:                            if (charset.startsWith("charset=")) {
177:                                // 8 is the length of "charset="
178:                                charset = charset.substring(8).trim();
179:                                // strip quotes, if present
180:                                if ((charset.charAt(0) == '"' && charset
181:                                        .charAt(charset.length() - 1) == '"')
182:                                        || (charset.charAt(0) == '\'' && charset
183:                                                .charAt(charset.length() - 1) == '\'')) {
184:                                    charset = charset.substring(1, charset
185:                                            .length() - 1);
186:                                }
187:                            } else {
188:                                charset = null;
189:                            }
190:                        } else {
191:                            contentType = rawContentType.trim();
192:                        }
193:
194:                        String detectedEncoding = null;
195:                        /**  The encoding of such a resource is determined by:
196:                            1 external encoding information, if available, otherwise
197:                                 -- the most common type of external information is the "charset" parameter of a MIME package
198:                            2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
199:                            3 the value of the encoding attribute if one exists, otherwise
200:                            4 UTF-8.
201:                         **/
202:                        if (contentType.equals("text/xml")) {
203:                            if (charset != null) {
204:                                detectedEncoding = charset;
205:                            } else {
206:                                // see RFC2376 or 3023, section 3.1
207:                                detectedEncoding = "US-ASCII";
208:                            }
209:                        } else if (contentType.equals("application/xml")) {
210:                            if (charset != null) {
211:                                detectedEncoding = charset;
212:                            } else {
213:                                // see RFC2376 or 3023, section 3.2
214:                                detectedEncoding = getEncodingName(stream);
215:                            }
216:                        } else if (contentType.endsWith("+xml")) {
217:                            detectedEncoding = getEncodingName(stream);
218:                        }
219:
220:                        if (detectedEncoding != null) {
221:                            encoding = detectedEncoding;
222:                        }
223:                        // else 3 or 4.
224:                    }
225:
226:                    encoding = encoding.toUpperCase(Locale.ENGLISH);
227:
228:                    // eat the Byte Order Mark
229:                    encoding = consumeBOM(stream, encoding);
230:
231:                    // If the document is UTF-8 or US-ASCII use 
232:                    // the Xerces readers for these encodings. For
233:                    // US-ASCII consult the encoding map since
234:                    // this encoding has many aliases.
235:                    if (encoding.equals("UTF-8")) {
236:                        return new UTF8Reader(
237:                                stream,
238:                                fTempString.ch.length,
239:                                fErrorReporter
240:                                        .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
241:                                fErrorReporter.getLocale());
242:                    }
243:
244:                    // Try to use a Java reader.
245:                    String javaEncoding = EncodingMap
246:                            .getIANA2JavaMapping(encoding);
247:
248:                    // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
249:                    // The XIncludeHandler will report this as a ResourceError and then will
250:                    // attempt to include a fallback if there is one.
251:                    if (javaEncoding == null) {
252:                        MessageFormatter aFormatter = fErrorReporter
253:                                .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
254:                        Locale aLocale = fErrorReporter.getLocale();
255:                        throw new IOException(aFormatter.formatMessage(aLocale,
256:                                "EncodingDeclInvalid",
257:                                new Object[] { encoding }));
258:                    } else if (javaEncoding.equals("ASCII")) {
259:                        return new ASCIIReader(
260:                                stream,
261:                                fTempString.ch.length,
262:                                fErrorReporter
263:                                        .getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
264:                                fErrorReporter.getLocale());
265:                    } else if (javaEncoding.equals("ISO8859_1")) {
266:                        return new Latin1Reader(stream, fTempString.ch.length);
267:                    }
268:                    return new InputStreamReader(stream, javaEncoding);
269:                }
270:            }
271:
272:            /** 
273:             * XMLEntityManager cares about endian-ness, since it creates its own optimized
274:             * readers. Since we're just using generic Java readers for now, we're not caring
275:             * about endian-ness.  If this changes, even more code needs to be copied from
276:             * XMLEntity manager. -- PJM
277:             */
278:            protected String getEncodingName(InputStream stream)
279:                    throws IOException {
280:                final byte[] b4 = new byte[4];
281:                String encoding = null;
282:
283:                // this has the potential to throw an exception
284:                // it will be fixed when we ensure the stream is rewindable (see note above)
285:                stream.mark(4);
286:                int count = stream.read(b4, 0, 4);
287:                stream.reset();
288:                if (count == 4) {
289:                    encoding = getEncodingName(b4);
290:                }
291:
292:                return encoding;
293:            }
294:
295:            /**
296:             * Removes the byte order mark from the stream, if
297:             * it exists and returns the encoding name.
298:             * 
299:             * @param stream
300:             * @param encoding
301:             * @throws IOException
302:             */
303:            protected String consumeBOM(InputStream stream, String encoding)
304:                    throws IOException {
305:
306:                byte[] b = new byte[3];
307:                int count = 0;
308:                stream.mark(3);
309:                if (encoding.equals("UTF-8")) {
310:                    count = stream.read(b, 0, 3);
311:                    if (count == 3) {
312:                        final int b0 = b[0] & 0xFF;
313:                        final int b1 = b[1] & 0xFF;
314:                        final int b2 = b[2] & 0xFF;
315:                        if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
316:                            // First three bytes are not BOM, so reset.
317:                            stream.reset();
318:                        }
319:                    } else {
320:                        stream.reset();
321:                    }
322:                } else if (encoding.startsWith("UTF-16")) {
323:                    count = stream.read(b, 0, 2);
324:                    if (count == 2) {
325:                        final int b0 = b[0] & 0xFF;
326:                        final int b1 = b[1] & 0xFF;
327:                        if (b0 == 0xFE && b1 == 0xFF) {
328:                            return "UTF-16BE";
329:                        } else if (b0 == 0xFF && b1 == 0xFE) {
330:                            return "UTF-16LE";
331:                        }
332:                    }
333:                    // First two bytes are not BOM, so reset.
334:                    stream.reset();
335:                }
336:                // We could do UTF-32, but since the getEncodingName() doesn't support that
337:                // we won't support it here.
338:                // To implement UTF-32, look for:  00 00 FE FF for big-endian
339:                //                             or  FF FE 00 00 for little-endian
340:                return encoding;
341:            }
342:
343:            /**
344:             * REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.
345:             *          Is there any way we can share the code, without having it implemented twice?
346:             *          I think we should make it public and static in XMLEntityManager. --PJM
347:             *
348:             * Returns the IANA encoding name that is auto-detected from
349:             * the bytes specified, with the endian-ness of that encoding where appropriate.
350:             *
351:             * @param b4    The first four bytes of the input.
352:             * @return the encoding name, or null if no encoding could be detected
353:             */
354:            protected String getEncodingName(byte[] b4) {
355:
356:                // UTF-16, with BOM
357:                int b0 = b4[0] & 0xFF;
358:                int b1 = b4[1] & 0xFF;
359:                if (b0 == 0xFE && b1 == 0xFF) {
360:                    // UTF-16, big-endian
361:                    return "UTF-16BE";
362:                }
363:                if (b0 == 0xFF && b1 == 0xFE) {
364:                    // UTF-16, little-endian
365:                    return "UTF-16LE";
366:                }
367:
368:                // UTF-8 with a BOM
369:                int b2 = b4[2] & 0xFF;
370:                if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
371:                    return "UTF-8";
372:                }
373:
374:                // other encodings
375:                int b3 = b4[3] & 0xFF;
376:                if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
377:                    // UCS-4, big endian (1234)
378:                    return "ISO-10646-UCS-4";
379:                }
380:                if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
381:                    // UCS-4, little endian (4321)
382:                    return "ISO-10646-UCS-4";
383:                }
384:                if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
385:                    // UCS-4, unusual octet order (2143)
386:                    return "ISO-10646-UCS-4";
387:                }
388:                if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
389:                    // UCS-4, unusual octect order (3412)
390:                    return "ISO-10646-UCS-4";
391:                }
392:                if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
393:                    // UTF-16, big-endian, no BOM
394:                    // (or could turn out to be UCS-2...
395:                    return "UTF-16BE";
396:                }
397:                if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
398:                    // UTF-16, little-endian, no BOM
399:                    // (or could turn out to be UCS-2...
400:                    return "UTF-16LE";
401:                }
402:                if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
403:                    // EBCDIC
404:                    // a la xerces1, return CP037 instead of EBCDIC here
405:                    return "CP037";
406:                }
407:
408:                // this signals us to use the value from the encoding attribute
409:                return null;
410:
411:            } // getEncodingName(byte[]):Object[]
412:
413:            /**
414:             * Read the input stream as text, and pass the text on to the XIncludeHandler
415:             * using calls to characters().  This will read all of the text it can from the
416:             * resource.
417:             * 
418:             * @throws IOException
419:             */
420:            public void parse() throws IOException {
421:                fReader = getReader(fSource);
422:                fSource = null;
423:                int readSize = fReader.read(fTempString.ch, 0,
424:                        fTempString.ch.length - 1);
425:                fHandler.fHasIncludeReportedContent = true;
426:                while (readSize != -1) {
427:                    for (int i = 0; i < readSize; ++i) {
428:                        char ch = fTempString.ch[i];
429:                        if (!isValid(ch)) {
430:                            if (XMLChar.isHighSurrogate(ch)) {
431:                                int ch2;
432:                                // retrieve next character
433:                                if (++i < readSize) {
434:                                    ch2 = fTempString.ch[i];
435:                                }
436:                                // handle rare boundary case
437:                                else {
438:                                    ch2 = fReader.read();
439:                                    if (ch2 != -1) {
440:                                        fTempString.ch[readSize++] = (char) ch2;
441:                                    }
442:                                }
443:                                if (XMLChar.isLowSurrogate(ch2)) {
444:                                    // convert surrogates to a supplemental character
445:                                    int sup = XMLChar.supplemental(ch,
446:                                            (char) ch2);
447:                                    if (!isValid(sup)) {
448:                                        fErrorReporter
449:                                                .reportError(
450:                                                        XMLMessageFormatter.XML_DOMAIN,
451:                                                        "InvalidCharInContent",
452:                                                        new Object[] { Integer
453:                                                                .toString(sup,
454:                                                                        16) },
455:                                                        XMLErrorReporter.SEVERITY_FATAL_ERROR);
456:                                    }
457:                                } else {
458:                                    fErrorReporter
459:                                            .reportError(
460:                                                    XMLMessageFormatter.XML_DOMAIN,
461:                                                    "InvalidCharInContent",
462:                                                    new Object[] { Integer
463:                                                            .toString(ch2, 16) },
464:                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
465:                                }
466:                            } else {
467:                                fErrorReporter
468:                                        .reportError(
469:                                                XMLMessageFormatter.XML_DOMAIN,
470:                                                "InvalidCharInContent",
471:                                                new Object[] { Integer
472:                                                        .toString(ch, 16) },
473:                                                XMLErrorReporter.SEVERITY_FATAL_ERROR);
474:                            }
475:                        }
476:                    }
477:                    if (fHandler != null && readSize > 0) {
478:                        fTempString.offset = 0;
479:                        fTempString.length = readSize;
480:                        fHandler.characters(fTempString, fHandler
481:                                .modifyAugmentations(null, true));
482:                    }
483:                    readSize = fReader.read(fTempString.ch, 0,
484:                            fTempString.ch.length - 1);
485:                }
486:
487:            }
488:
489:            /**
490:             * Sets the input source on this text reader.
491:             * 
492:             * @param source The XMLInputSource to use.
493:             */
494:            public void setInputSource(XMLInputSource source) {
495:                fSource = source;
496:            }
497:
498:            /**
499:             * Closes the stream.  Call this after parse(), or when there is no longer any need
500:             * for this object.
501:             * 
502:             * @throws IOException
503:             */
504:            public void close() throws IOException {
505:                if (fReader != null) {
506:                    fReader.close();
507:                    fReader = null;
508:                }
509:            }
510:
511:            /**
512:             * Returns true if the specified character is a valid XML character
513:             * as per the rules of XML 1.0.
514:             *
515:             * @param ch The character to check.
516:             */
517:            protected boolean isValid(int ch) {
518:                return XMLChar.isValid(ch);
519:            }
520:
521:            /**
522:             * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
523:             * by the reader at a time and passed to the handler
524:             * 
525:             * @param bufferSize The size of the buffer desired
526:             */
527:            protected void setBufferSize(int bufferSize) {
528:                if (fTempString.ch.length != ++bufferSize) {
529:                    fTempString.ch = new char[bufferSize];
530:                }
531:            }
532:
533:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.