001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.xerces.readers;
059:
060: import org.apache.xerces.framework.XMLErrorReporter;
061: import org.apache.xerces.utils.ChunkyByteArray;
062: import org.apache.xerces.utils.StringPool;
063: import java.util.Stack;
064:
065: /**
066: * Abstract base class for encoding recognizers.
067: *
068: * When we encounter an external entity, including the document entity,
069: * and do not know what the encoding of the underlying byte stream is,
070: * we need to look at the contents of the stream to find out. We do this
071: * by asking a set of "recognizers" to look at the stream data and if
072: * the recognizer can understand the encoding it will try to read an
073: * XML or text declaration, if present, and construct the appropriate
074: * reader for that encoding. The recognizer subclasses will typically
075: * use the prescanXMLDeclOrTextDecl() method if the stream looks like
076: * it does begin with such a declaration using a temporary reader that
077: * can support the calls needed to scan through the encoding declaration.
078: */
079: public abstract class XMLDeclRecognizer {
080:
081: /**
082: * Register the standard recognizers.
083: *
084: * @param recognizerStack The stack of recognizers used by the parser.
085: */
086: public static void registerDefaultRecognizers(Stack recognizerStack) {
087: recognizerStack.push(new EBCDICRecognizer());
088: recognizerStack.push(new UCSRecognizer());
089: recognizerStack.push(new UTF8Recognizer());
090: }
091:
092: /**
093: * Subclasses override this method to support recognizing their encodings.
094: *
095: * @param readerFactory the factory object to use when constructing the entity reader.
096: * @param entityHandler the entity handler to get entity readers from
097: * @param errorReporter where to report errors
098: * @param sendCharDataAsCharArray true if the reader should use char arrays, not string handles.
099: * @param stringPool the <code>StringPool</code> to put strings in
100: * @param data initial bytes to perform recognition on
101: * @param xmlDecl true if attempting to recognize fron an XMLDecl, false if trying to recognize from a TextDecl.
102: * @param allowJavaEncodingName true if Java's encoding names are allowed, false if they are not.
103: * @return The reader that will be used to process the contents of the data stream.
104: * @exception java.lang.Exception
105: */
106: public abstract XMLEntityHandler.EntityReader recognize(
107: XMLEntityReaderFactory readerFactory,
108: XMLEntityHandler entityHandler,
109: XMLErrorReporter errorReporter,
110: boolean sendCharDataAsCharArray, StringPool stringPool,
111: ChunkyByteArray data, boolean xmlDecl,
112: boolean allowJavaEncodingName) throws Exception;
113:
114: //
115: // From the standard:
116: //
117: // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
118: // [24] VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ")
119: // [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
120: // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
121: // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
122: //
123: /**
124: * Support for getting the value of an EncodingDecl using an XMLReader.
125: *
126: * This is the minimal logic from the scanner to recognize an XMLDecl or TextDecl using
127: * the XMLReader interface.
128: *
129: * @param entityReader data source for prescan
130: * @param xmlDecl true if attempting to recognize from an XMLDecl, false if trying to recognize from a TextDecl.
131: * @return <code>StringPool</code> handle to the name of the encoding recognized
132: * @exception java.lang.Exception
133: */
134: protected int prescanXMLDeclOrTextDecl(
135: XMLEntityHandler.EntityReader entityReader, boolean xmlDecl)
136: throws Exception {
137: if (!entityReader.lookingAtChar('<', true)) {
138: return -1;
139: }
140: if (!entityReader.lookingAtChar('?', true)) {
141: return -1;
142: }
143: if (!entityReader.skippedString(xml_string)) {
144: return -1;
145: }
146: entityReader.skipPastSpaces();
147: boolean single;
148: char qchar;
149: if (entityReader.skippedString(version_string)) {
150: entityReader.skipPastSpaces();
151: if (!entityReader.lookingAtChar('=', true)) {
152: return -1;
153: }
154: entityReader.skipPastSpaces();
155: int versionIndex = entityReader.scanStringLiteral();
156: if (versionIndex < 0) {
157: return -1;
158: }
159: if (!entityReader.lookingAtSpace(true)) {
160: return -1;
161: }
162: entityReader.skipPastSpaces();
163: } else if (xmlDecl) {
164: return -1;
165: }
166: if (!entityReader.skippedString(encoding_string)) {
167: return -1;
168: }
169: entityReader.skipPastSpaces();
170: if (!entityReader.lookingAtChar('=', true)) {
171: return -1;
172: }
173: entityReader.skipPastSpaces();
174: int encodingIndex = entityReader.scanStringLiteral();
175: return encodingIndex;
176: }
177:
178: //
179: // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
180: // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
181: //
182: private static final char[] xml_string = { 'x', 'm', 'l' };
183: //
184: // [24] VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ")
185: //
186: private static final char[] version_string = { 'v', 'e', 'r', 's',
187: 'i', 'o', 'n' };
188: //
189: // [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
190: //
191: private static final char[] encoding_string = { 'e', 'n', 'c', 'o',
192: 'd', 'i', 'n', 'g' };
193: }
|