001: /*
002: * Copyright (c) 1998 Sun Microsystems, Inc. All Rights Reserved.
003: */
004:
005: package com.sun.xml.dtdparser;
006:
007: import org.xml.sax.EntityResolver;
008: import org.xml.sax.InputSource;
009:
010: import java.io.File;
011: import java.io.FileInputStream;
012: import java.io.IOException;
013: import java.io.InputStream;
014: import java.net.URL;
015: import java.net.URLConnection;
016: import java.util.Hashtable;
017:
018: /**
019: * This entity resolver class provides a number of utilities which can help
020: * managment of external parsed entities in XML. These are commonly used
021: * to hold markup declarations that are to be used as part of a Document
022: * Type Declaration (DTD), or to hold text marked up with XML.
023: * <p/>
024: * <P> Features include: <UL>
025: * <p/>
026: * <LI> Static factory methods are provided for constructing SAX InputSource
027: * objects from Files, URLs, or MIME objects. This eliminates a class of
028: * error-prone coding in applications.
029: * <p/>
030: * <LI> Character encodings for XML documents are correctly supported: <UL>
031: * <p/>
032: * <LI> The encodings defined in the RFCs for MIME content types
033: * (2046 for general MIME, and 2376 for XML in particular), are
034: * supported, handling <em>charset=...</em> attributes and accepting
035: * content types which are known to be safe for use with XML;
036: * <p/>
037: * <LI> The character encoding autodetection algorithm identified
038: * in the XML specification is used, and leverages all of
039: * the JDK 1.1 (and later) character encoding support.
040: * <p/>
041: * <LI> The use of MIME typing may optionally be disabled, forcing the
042: * use of autodetection, to support web servers which don't correctly
043: * report MIME types for XML. For example, they may report text that
044: * is encoded in EUC-JP as being US-ASCII text, leading to fatal
045: * errors during parsing.
046: * <p/>
047: * <LI> The InputSource objects returned by this class always
048: * have a <code>java.io.Reader</code> available as the "character
049: * stream" property.
050: * <p/>
051: * </UL>
052: * <p/>
053: * <LI> Catalog entries can map public identifiers to Java resources or
054: * to local URLs. These are used to reduce network dependencies and loads,
055: * and will often be used for external DTD components. For example, packages
056: * shipping DTD files as resources in JAR files can eliminate network traffic
057: * when accessing them, and sites may provide local caches of common DTDs.
058: * Note that no particular catalog syntax is supported by this class, only
059: * the notion of a set of entries.
060: * <p/>
061: * </UL>
062: * <p/>
063: * <P> Subclasses can perform tasks such as supporting new URI schemes for
064: * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
065: * MIME entities which are part of a <em>multipart/related</em> group
066: * (see RFC 2387). They may also be used to support particular catalog
067: * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
068: * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
069: * Public Identifiers (FPIs).
070: *
071: * @author David Brownell
072: * @author Janet Koenig
073: * @version 1.3 00/02/24
074: */
075: public class Resolver implements EntityResolver {
076: private boolean ignoringMIME;
077:
078: // table mapping public IDs to (local) URIs
079: private Hashtable id2uri;
080:
081: // tables mapping public IDs to resources and classloaders
082: private Hashtable id2resource;
083: private Hashtable id2loader;
084:
085: //
086: // table of MIME content types (less attributes!) known
087: // to be mostly "OK" to use with XML MIME entities. the
088: // idea is to rule out obvious braindamage ("image/jpg")
089: // not the subtle stuff ("text/html") that might actually
090: // be (or become) safe.
091: //
092: private static final String types[] = { "application/xml",
093: "text/xml", "text/plain", "text/html", // commonly mis-inferred
094: "application/x-netcdf", // this is often illegal XML
095: "content/unknown" };
096:
097: /**
098: * Constructs a resolver.
099: */
100: public Resolver() {
101: }
102:
103: /**
104: * Returns an input source, using the MIME type information and URL
105: * scheme to statically determine the correct character encoding if
106: * possible and otherwise autodetecting it. MIME carefully specifies
107: * the character encoding defaults, and how attributes of the content
108: * type can change it. XML further specifies two mandatory encodings
109: * (UTF-8 and UTF-16), and includes an XML declaration which can be
110: * used to internally label most documents encoded using US-ASCII
111: * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
112: * more).
113: * <p/>
114: * <P> This method can be used to access XML documents which do not
115: * have URIs (such as servlet input streams, or most JavaMail message
116: * entities) and to support access methods such as HTTP POST or PUT.
117: * (URLs normally return content using the GET method.)
118: * <p/>
119: * <P> <em> The caller should set the system ID in order for relative URIs
120: * found in this document to be interpreted correctly.</em> In some cases,
121: * a custom resolver will need to be used; for example, documents
122: * may be grouped in a single MIME "multipart/related" bundle, and
123: * relative URLs would refer to other documents in that bundle.
124: *
125: * @param contentType The MIME content type for the source for which
126: * an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
127: * @param stream The input byte stream for the input source.
128: * @param checkType If true, this verifies that the content type is known
129: * to support XML documents, such as <em>application/xml</em>.
130: * @param scheme Unless this is "file", unspecified MIME types
131: * default to US-ASCII. Files are always autodetected since most
132: * file systems discard character encoding information.
133: */
134: public static InputSource createInputSource(String contentType,
135: InputStream stream, boolean checkType, String scheme)
136: throws IOException {
137: InputSource retval;
138: String charset = null;
139:
140: if (contentType != null) {
141: int index;
142:
143: contentType = contentType.toLowerCase();
144: index = contentType.indexOf(';');
145: if (index != -1) {
146: String attributes;
147:
148: attributes = contentType.substring(index + 1);
149: contentType = contentType.substring(0, index);
150:
151: // use "charset=..." if it's available
152: index = attributes.indexOf("charset");
153: if (index != -1) {
154: attributes = attributes.substring(index + 7);
155: // strip out subsequent attributes
156: if ((index = attributes.indexOf(';')) != -1)
157: attributes = attributes.substring(0, index);
158: // find start of value
159: if ((index = attributes.indexOf('=')) != -1) {
160: attributes = attributes.substring(index + 1);
161: // strip out rfc822 comments
162: if ((index = attributes.indexOf('(')) != -1)
163: attributes = attributes.substring(0, index);
164: // double quotes are optional
165: if ((index = attributes.indexOf('"')) != -1) {
166: attributes = attributes
167: .substring(index + 1);
168: attributes = attributes.substring(0,
169: attributes.indexOf('"'));
170: }
171: charset = attributes.trim();
172: // XXX "\;", "\)" etc were mishandled above
173: }
174: }
175: }
176:
177: //
178: // Check MIME type.
179: //
180: if (checkType) {
181: boolean isOK = false;
182: for (int i = 0; i < types.length; i++)
183: if (types[i].equals(contentType)) {
184: isOK = true;
185: break;
186: }
187: if (!isOK)
188: throw new IOException("Not XML: " + contentType);
189: }
190:
191: //
192: // "text/*" MIME types have hard-wired character set
193: // defaults, as specified in the RFCs. For XML, we
194: // ignore the system "file.encoding" property since
195: // autodetection is more correct.
196: //
197: if (charset == null) {
198: contentType = contentType.trim();
199: if (contentType.startsWith("text/")) {
200: if (!"file".equalsIgnoreCase(scheme))
201: charset = "US-ASCII";
202: }
203: // "application/*" has no default
204: }
205: }
206:
207: retval = new InputSource(XmlReader
208: .createReader(stream, charset));
209: retval.setByteStream(stream);
210: retval.setEncoding(charset);
211: return retval;
212: }
213:
214: /**
215: * Creates an input source from a given URI.
216: *
217: * @param uri the URI (system ID) for the entity
218: * @param checkType if true, the MIME content type for the entity
219: * is checked for document type and character set encoding.
220: */
221: static public InputSource createInputSource(URL uri,
222: boolean checkType) throws IOException {
223:
224: URLConnection conn = uri.openConnection();
225: InputSource retval;
226:
227: if (checkType) {
228: String contentType = conn.getContentType();
229: retval = createInputSource(contentType, conn
230: .getInputStream(), false, uri.getProtocol());
231: } else {
232: retval = new InputSource(XmlReader.createReader(conn
233: .getInputStream()));
234: }
235: retval.setSystemId(conn.getURL().toString());
236: return retval;
237: }
238:
239: /**
240: * Creates an input source from a given file, autodetecting
241: * the character encoding.
242: */
243: static public InputSource createInputSource(File file)
244: throws IOException {
245: InputSource retval;
246: String path;
247:
248: retval = new InputSource(XmlReader
249: .createReader(new FileInputStream(file)));
250:
251: // On JDK 1.2 and later, simplify this:
252: // "path = file.toURL ().toString ()".
253: path = file.getAbsolutePath();
254: if (File.separatorChar != '/')
255: path = path.replace(File.separatorChar, '/');
256: if (!path.startsWith("/"))
257: path = "/" + path;
258: if (!path.endsWith("/") && file.isDirectory())
259: path = path + "/";
260:
261: retval.setSystemId("file:" + path);
262: return retval;
263: }
264:
265: /**
266: * <b>SAX:</b>
267: * Resolve the given entity into an input source. If the name can't
268: * be mapped to a preferred form of the entity, the URI is used. To
269: * resolve the entity, first a local catalog mapping names to URIs is
270: * consulted. If no mapping is found there, a catalog mapping names
271: * to java resources is consulted. Finally, if neither mapping found
272: * a copy of the entity, the specified URI is used.
273: * <p/>
274: * <P> When a URI is used, <a href="#createInputSource">
275: * createInputSource</a> is used to correctly deduce the character
276: * encoding used by this entity. No MIME type checking is done.
277: *
278: * @param name Used to find alternate copies of the entity, when
279: * this value is non-null; this is the XML "public ID".
280: * @param uri Used when no alternate copy of the entity is found;
281: * this is the XML "system ID", normally a URI.
282: */
283: public InputSource resolveEntity(String name, String uri)
284: throws IOException {
285: InputSource retval;
286: String mappedURI = name2uri(name);
287: InputStream stream;
288:
289: // prefer explicit URI mappings, then bundled resources...
290: if (mappedURI == null && (stream = mapResource(name)) != null) {
291: uri = "java:resource:" + (String) id2resource.get(name);
292: retval = new InputSource(XmlReader.createReader(stream));
293:
294: // ...and treat all URIs the same (as URLs for now).
295: } else {
296: URL url;
297: URLConnection conn;
298:
299: if (mappedURI != null)
300: uri = mappedURI;
301: else if (uri == null)
302: return null;
303:
304: url = new URL(uri);
305: conn = url.openConnection();
306: uri = conn.getURL().toString();
307: // System.out.println ("++ URI: " + url);
308: if (ignoringMIME)
309: retval = new InputSource(XmlReader.createReader(conn
310: .getInputStream()));
311: else {
312: String contentType = conn.getContentType();
313: retval = createInputSource(contentType, conn
314: .getInputStream(), false, url.getProtocol());
315: }
316: }
317: retval.setSystemId(uri);
318: retval.setPublicId(name);
319: return retval;
320: }
321:
322: /**
323: * Returns true if this resolver is ignoring MIME types in the documents
324: * it returns, to work around bugs in how servers have reported the
325: * documents' MIME types.
326: */
327: public boolean isIgnoringMIME() {
328: return ignoringMIME;
329: }
330:
331: /**
332: * Tells the resolver whether to ignore MIME types in the documents it
333: * retrieves. Many web servers incorrectly assign text documents a
334: * default character encoding, even when that is incorrect. For example,
335: * all HTTP text documents default to use ISO-8859-1 (used for Western
336: * European languages), and other MIME sources default text documents
337: * to use US-ASCII (a seven bit encoding). For XML documents which
338: * include text encoding declarations (as most should do), these server
339: * bugs can be worked around by ignoring the MIME type entirely.
340: */
341: public void setIgnoringMIME(boolean value) {
342: ignoringMIME = value;
343: }
344:
345: // maps the public ID to an alternate URI, if one is registered
346: private String name2uri(String publicId) {
347: if (publicId == null || id2uri == null)
348: return null;
349: return (String) id2uri.get(publicId);
350: }
351:
352: /**
353: * Registers the given public ID as corresponding to a particular
354: * URI, typically a local copy. This URI will be used in preference
355: * to ones provided as system IDs in XML entity declarations. This
356: * mechanism would most typically be used for Document Type Definitions
357: * (DTDs), where the public IDs are formally managed and versioned.
358: *
359: * @param publicId The managed public ID being mapped
360: * @param uri The URI of the preferred copy of that entity
361: */
362: public void registerCatalogEntry(String publicId, String uri) {
363: if (id2uri == null)
364: id2uri = new Hashtable(17);
365: id2uri.put(publicId, uri);
366: }
367:
368: // return the resource as a stream
369: private InputStream mapResource(String publicId) {
370: // System.out.println ("++ PUBLIC: " + publicId);
371: if (publicId == null || id2resource == null)
372: return null;
373:
374: String resourceName = (String) id2resource.get(publicId);
375: ClassLoader loader = null;
376:
377: if (resourceName == null)
378: return null;
379: // System.out.println ("++ Resource: " + resourceName);
380:
381: if (id2loader != null)
382: loader = (ClassLoader) id2loader.get(publicId);
383: // System.out.println ("++ Loader: " + loader);
384: if (loader == null)
385: return ClassLoader.getSystemResourceAsStream(resourceName);
386: return loader.getResourceAsStream(resourceName);
387: }
388:
389: /**
390: * Registers a given public ID as corresponding to a particular Java
391: * resource in a given class loader, typically distributed with a
392: * software package. This resource will be preferred over system IDs
393: * included in XML documents. This mechanism should most typically be
394: * used for Document Type Definitions (DTDs), where the public IDs are
395: * formally managed and versioned.
396: * <p/>
397: * <P> If a mapping to a URI has been provided, that mapping takes
398: * precedence over this one.
399: *
400: * @param publicId The managed public ID being mapped
401: * @param resourceName The name of the Java resource
402: * @param loader The class loader holding the resource, or null if
403: * it is a system resource.
404: */
405: public void registerCatalogEntry(String publicId,
406: String resourceName, ClassLoader loader) {
407: if (id2resource == null)
408: id2resource = new Hashtable(17);
409: id2resource.put(publicId, resourceName);
410:
411: if (loader != null) {
412: if (id2loader == null)
413: id2loader = new Hashtable(17);
414: id2loader.put(publicId, loader);
415: }
416: }
417: }
|