001: /* Copyright 2002-2005 Elliotte Rusty Harold
002:
003: This library is free software; you can redistribute it and/or modify
004: it under the terms of version 2.1 of the GNU Lesser General Public
005: License as published by the Free Software Foundation.
006:
007: This library is distributed in the hope that it will be useful,
008: but WITHOUT ANY WARRANTY; without even the implied warranty of
009: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: GNU Lesser General Public License for more details.
011:
012: You should have received a copy of the GNU Lesser General Public
013: License along with this library; if not, write to the
014: Free Software Foundation, Inc., 59 Temple Place, Suite 330,
015: Boston, MA 02111-1307 USA
016:
017: You can contact Elliotte Rusty Harold by sending e-mail to
018: elharo@metalab.unc.edu. Please include the word "XOM" in the
019: subject line. The XOM home page is located at http://www.xom.nu/
020: */
021:
022: package nu.xom.xinclude;
023:
024: import java.io.BufferedInputStream;
025: import java.io.BufferedReader;
026: import java.io.IOException;
027: import java.io.InputStream;
028: import java.io.InputStreamReader;
029: import java.io.Reader;
030: import java.io.UnsupportedEncodingException;
031: import java.net.MalformedURLException;
032: import java.net.URL;
033: import java.net.URLConnection;
034: import java.util.Locale;
035: import java.util.ArrayList;
036:
037: import nu.xom.Attribute;
038: import nu.xom.Builder;
039: import nu.xom.DocType;
040: import nu.xom.Document;
041: import nu.xom.Element;
042: import nu.xom.Elements;
043: import nu.xom.MalformedURIException;
044: import nu.xom.Node;
045: import nu.xom.NodeFactory;
046: import nu.xom.Nodes;
047: import nu.xom.ParentNode;
048: import nu.xom.ParsingException;
049: import nu.xom.Text;
050:
051: /**
052: * <p>
053: * Implements XInclude resolution as specified in
054: * <a href="http://www.w3.org/TR/2004/REC-xinclude-20041220/"
055: * target="_top"><cite>XML Inclusions (XInclude) Version
056: * 1.0</cite></a>. Fallbacks are supported.
057: * The XPointer <code>element()</code> scheme and
058: * shorthand XPointers are also supported. The XPointer
059: * <code>xpointer()</code> scheme is not supported.
060: * The <code>accept</code> and <code>accept-language</code>
061: * attributes are supported.
062: * </p>
063: *
064: * @author Elliotte Rusty Harold
065: * @version 1.1b3
066: *
067: */
068: public class XIncluder {
069:
070: private static String version = System.getProperty("java.version");
071:
072: // could rewrite this to handle only elements in documents
073: // (no parentless elements) and then add code to handle Nodes
074: // and parentless elements by sticking each one in a Document
075:
076: // prevent instantiation
077: private XIncluder() {
078: }
079:
080: /**
081: * <p>
082: * The namespace name of all XInclude elements.
083: * </p>
084: */
085: public final static String XINCLUDE_NS = "http://www.w3.org/2001/XInclude";
086:
087: /**
088: * <p>
089: * Returns a copy of the document in which all
090: * <code>xinclude:include</code> elements have been
091: * replaced by their referenced content. The original
092: * <code>Document</code> object is not modified.
093: * Resolution is recursive; that is, include elements
094: * in the included documents are themselves resolved.
095: * The <code>Document</code> returned contains no
096: * include elements.
097: * </p>
098: *
099: * @param in the document in which include elements
100: * should be resolved
101: *
102: * @return copy of the document in which
103: * all <code>xinclude:include</code> elements
104: * have been replaced by their referenced content
105: *
106: * @throws BadParseAttributeException if an <code>include</code>
107: * element has a <code>parse</code> attribute with any value
108: * other than <code>text</code> or <code>parse</code>
109: * @throws InclusionLoopException if the document
110: * contains an XInclude element that attempts to include
111: * a document in which this element is directly or indirectly
112: * included.
113: * @throws IOException if an included document could not be loaded,
114: * and no fallback was available
115: * @throws NoIncludeLocationException if an <code>xinclude:include</code>
116: * element does not have an <code>href</code> attribute
117: * @throws ParsingException if an included XML document
118: * was malformed
119: * @throws UnsupportedEncodingException if an included document
120: * used an encoding this parser does not support, and no
121: * fallback was available
122: * @throws XIncludeException if the document violates the
123: * syntax rules of XInclude
124: * @throws XMLException if resolving an include element would
125: * result in a malformed document
126: */
127: public static Document resolve(Document in)
128: throws BadParseAttributeException, InclusionLoopException,
129: IOException, NoIncludeLocationException, ParsingException,
130: UnsupportedEncodingException, XIncludeException {
131:
132: Builder builder = new Builder();
133: return resolve(in, builder);
134:
135: }
136:
137: /**
138: * <p>
139: * Returns a copy of the document in which all
140: * <code>xinclude:include</code> elements have been
141: * replaced by their referenced content as loaded by the builder.
142: * The original <code>Document</code> object is not modified.
143: * Resolution is recursive; that is, include elements
144: * in the included documents are themselves resolved.
145: * The document returned contains no <code>include</code> elements.
146: * </p>
147: *
148: * @param in the document in which include elements
149: * should be resolved
150: * @param builder the builder used to build the
151: * nodes included from other documents
152: *
153: * @return copy of the document in which
154: * all <code>xinclude:include</code> elements
155: * have been replaced by their referenced content
156: *
157: * @throws BadParseAttributeException if an <code>include</code>
158: * element has a <code>parse</code> attribute with any value
159: * other than <code>text</code> or <code>parse</code>
160: * @throws InclusionLoopException if the document
161: * contains an XInclude element that attempts to include
162: * a document in which this element is directly or indirectly
163: * included.
164: * @throws IOException if an included document could not be loaded,
165: * and no fallback was available
166: * @throws NoIncludeLocationException if an <code>xinclude:include</code>
167: * element does not have an href attribute.
168: * @throws ParsingException if an included XML document
169: * was malformed
170: * @throws UnsupportedEncodingException if an included document
171: * used an encoding this parser does not support, and no
172: * fallback was available
173: * @throws XIncludeException if the document violates the
174: * syntax rules of XInclude
175: * @throws XMLException if resolving an include element would
176: * result in a malformed document
177: */
178: public static Document resolve(Document in, Builder builder)
179: throws BadParseAttributeException, InclusionLoopException,
180: IOException, NoIncludeLocationException, ParsingException,
181: UnsupportedEncodingException, XIncludeException {
182:
183: Document copy = new Document(in);
184: resolveInPlace(copy, builder);
185: return copy;
186:
187: }
188:
189: /**
190: * <p>
191: * Modifies a document by replacing all
192: * <code>xinclude:include</code> elements
193: * by their referenced content.
194: * Resolution is recursive; that is, include elements
195: * in the included documents are themselves resolved.
196: * The resolved document contains no
197: * <code>xinclude:include</code> elements.
198: * </p>
199: *
200: * <p>
201: * If the inclusion fails for any reason—XInclude syntax
202: * error, missing resource with no fallback, etc.—the document
203: * may be left in a partially resolved state.
204: * </p>
205: *
206: * @param in the document in which include elements
207: * should be resolved
208: *
209: * @throws BadParseAttributeException if an <code>include</code>
210: * element has a <code>parse</code> attribute
211: * with any value other than <code>text</code>
212: * or <code>parse</code>
213: * @throws InclusionLoopException if the document
214: * contains an XInclude element that attempts to include a
215: * document in which this element is directly or indirectly
216: * included
217: * @throws IOException if an included document could not be loaded,
218: * and no fallback was available
219: * @throws NoIncludeLocationException if an <code>xinclude:include</code>
220: * element does not have an <code>href</code> attribute
221: * @throws ParsingException if an included XML document
222: * was malformed
223: * @throws UnsupportedEncodingException if an included document
224: * used an encoding this parser does not support, and no
225: * fallback was available
226: * @throws XIncludeException if the document violates the
227: * syntax rules of XInclude
228: * @throws XMLException if resolving an include element would
229: * result in a malformed document
230: */
231: public static void resolveInPlace(Document in)
232: throws BadParseAttributeException, InclusionLoopException,
233: IOException, NoIncludeLocationException, ParsingException,
234: UnsupportedEncodingException, XIncludeException {
235: resolveInPlace(in, new Builder());
236: }
237:
238: /**
239: * <p>
240: * Modifies a document by replacing all
241: * <code>xinclude:include</code> elements with their referenced
242: * content as loaded by the builder. Resolution is recursive;
243: * that is, <code>include</code> elements in the included documents
244: * are themselves resolved. The resolved document contains no
245: * <code>xinclude:include</code> elements.
246: * </p>
247: *
248: * <p>
249: * If the inclusion fails for any reason — XInclude syntax
250: * error, missing resource with no fallback, etc. — the
251: * document may be left in a partially resolved state.
252: * </p>
253: *
254: * @param in the document in which include elements
255: * should be resolved
256: * @param builder the builder used to build the
257: * nodes included from other documents
258: *
259: * @throws BadParseAttributeException if an <code>include</code>
260: * element has a <code>parse</code> attribute
261: * with any value other than <code>text</code>
262: * or <code>parse</code>
263: * @throws InclusionLoopException if this element
264: * contains an XInclude element that attempts to include a
265: * document in which this element is directly or indirectly
266: * included
267: * @throws IOException if an included document could not be loaded,
268: * and no fallback was available
269: * @throws NoIncludeLocationException if an <code>xinclude:include</code>
270: * element does not have an <code>href</code> attribute.
271: * @throws ParsingException if an included XML document
272: * was malformed
273: * @throws UnsupportedEncodingException if an included document
274: * used an encoding this parser does not support, and no
275: * fallback was available
276: * @throws XIncludeException if the document violates the
277: * syntax rules of XInclude
278: * @throws XMLException if resolving an include element would
279: * result in a malformed document
280: */
281: public static void resolveInPlace(Document in, Builder builder)
282: throws BadParseAttributeException, InclusionLoopException,
283: IOException, NoIncludeLocationException, ParsingException,
284: UnsupportedEncodingException, XIncludeException {
285:
286: ArrayList stack = new ArrayList();
287: resolveInPlace(in, builder, stack);
288:
289: }
290:
291: private static void resolveInPlace(Document in, Builder builder,
292: ArrayList baseURLs) throws IOException, ParsingException,
293: XIncludeException {
294:
295: String base = in.getBaseURI();
296: // workaround a bug in Sun VMs
297: if (base != null && base.startsWith("file:///")) {
298: base = "file:/" + base.substring(8);
299: }
300:
301: baseURLs.add(base);
302: Element root = in.getRootElement();
303: resolve(root, builder, baseURLs);
304: baseURLs.remove(baseURLs.size() - 1);
305:
306: }
307:
308: private static void resolve(Element element, Builder builder,
309: ArrayList baseURLs) throws IOException, ParsingException,
310: XIncludeException {
311:
312: resolve(element, builder, baseURLs, null);
313:
314: }
315:
316: private static void resolve(Element element, Builder builder,
317: ArrayList baseURLs, Document originalDoc)
318: throws IOException, ParsingException, XIncludeException {
319:
320: if (isIncludeElement(element)) {
321: verifyIncludeElement(element);
322:
323: String parse = element.getAttributeValue("parse");
324: if (parse == null)
325: parse = "xml";
326: String xpointer = element.getAttributeValue("xpointer");
327: String encoding = element.getAttributeValue("encoding");
328: String href = element.getAttributeValue("href");
329: // empty string href is same as no href attribute
330: if ("".equals(href))
331: href = null;
332:
333: ParentNode parent = element.getParent();
334: String base = element.getBaseURI();
335: URL baseURL = null;
336: try {
337: baseURL = new URL(base);
338: } catch (MalformedURLException ex) {
339: // don't use base
340: }
341: URL url = null;
342: try {
343: // xml:base attributes added to maintain the
344: // base URI should not have fragment IDs
345:
346: if (baseURL != null && href != null) {
347: url = absolutize(baseURL, href);
348: } else if (href != null) {
349: try {
350: testURISyntax(href);
351: url = new URL(href);
352: } catch (MalformedURIException ex) {
353: if (baseURL == null) {
354: throw new BadHrefAttributeException(
355: "Could not resolve relative URI "
356: + href
357: + " because the xi:include element does"
358: + " not have a base URI.",
359: href);
360: }
361: throw new BadHrefAttributeException(
362: "Illegal IRI in href attribute", href);
363: }
364: }
365:
366: String accept = element.getAttributeValue("accept");
367: checkHeader(accept);
368: String acceptLanguage = element
369: .getAttributeValue("accept-language");
370: checkHeader(acceptLanguage);
371:
372: if (parse.equals("xml")) {
373:
374: String parentLanguage = "";
375: if (parent instanceof Element) {
376: parentLanguage = getXMLLangValue((Element) parent);
377: }
378:
379: Nodes replacements;
380: if (url != null) {
381: replacements = downloadXMLDocument(url,
382: xpointer, builder, baseURLs, accept,
383: acceptLanguage, parentLanguage);
384: // Add base URIs. Base URIs added by XInclusion require
385: // the element to maintain the same base URI as it had
386: // in the original document. Since its base URI in the
387: // original document does not contain a fragment ID,
388: // therefore its base URI after inclusion shouldn't,
389: // and this special case is unnecessary. Base URI fixup
390: // should not add the fragment ID.
391: for (int i = 0; i < replacements.size(); i++) {
392: Node child = replacements.get(i);
393: if (child instanceof Element) {
394: String noFragment = child.getBaseURI();
395: if (noFragment.indexOf('#') >= 0) {
396: noFragment = noFragment.substring(
397: 0, noFragment.indexOf('#'));
398: }
399: Element baseless = (Element) child;
400:
401: // parent is null here; need to get real parent
402: String parentBase = parent.getBaseURI();
403: if (parentBase != null
404: && !"".equals(parentBase)) {
405: parentBase = getDirectoryBase(parentBase);
406: }
407:
408: if (noFragment.startsWith(parentBase)) {
409: noFragment = noFragment
410: .substring(parentBase
411: .length());
412: }
413: Attribute baseAttribute = new Attribute(
414: "xml:base",
415: "http://www.w3.org/XML/1998/namespace",
416: noFragment);
417: baseless.addAttribute(baseAttribute);
418:
419: }
420: }
421: } else {
422: Document parentDoc = element.getDocument();
423: if (parentDoc == null) {
424: parentDoc = originalDoc;
425: }
426: Nodes originals = XPointer.query(parentDoc,
427: xpointer);
428: replacements = new Nodes();
429: for (int i = 0; i < originals.size(); i++) {
430: Node original = originals.get(i);
431: // current implementation of XPointer never returns non-elements
432: if (contains((Element) original, element)) {
433: throw new InclusionLoopException(
434: "Element tried to include itself");
435: }
436: Node copy = original.copy();
437: replacements.append(copy);
438: }
439: replacements = resolveXPointerSelection(
440: replacements, builder, baseURLs,
441: parentDoc);
442:
443: }
444:
445: // Will fail if we're replacing the root element with
446: // a node list containing zero or multiple elements,
447: // but that should fail. However, I may wish to
448: // adjust the type of exception thrown. This is only
449: // relevant if I add support for the xpointer scheme
450: // since otherwise you can only point at one element
451: // or document.
452: if (parent instanceof Element) {
453: int position = parent.indexOf(element);
454: for (int i = 0; i < replacements.size(); i++) {
455: Node child = replacements.get(i);
456: parent.insertChild(child, position + i);
457: }
458: element.detach();
459: } else { // root element needs special treatment
460: // I am assuming here that it is not possible
461: // for parent to be null. I think this is true
462: // in the current version, but it could change
463: // if I made it possible to directly resolve an
464: // element or a Nodes.
465: Document doc = (Document) parent;
466: int i = 0;
467: // prolog and root
468: while (true) {
469: Node child = replacements.get(i);
470: i++;
471: if (child instanceof Element) {
472: doc.setRootElement((Element) child);
473: break;
474: } else {
475: doc.insertChild(child, doc
476: .indexOf(element));
477: }
478:
479: }
480: // epilog
481: Element root = doc.getRootElement();
482: int position = doc.indexOf(root);
483: for (int j = i; j < replacements.size(); j++) {
484: doc.insertChild(replacements.get(j),
485: position + 1 + j - i);
486: }
487: }
488: } else if (parse.equals("text")) {
489: Nodes replacements = downloadTextDocument(url,
490: encoding, builder, accept, acceptLanguage);
491: for (int j = 0; j < replacements.size(); j++) {
492: Node replacement = replacements.get(j);
493: if (replacement instanceof Attribute) {
494: ((Element) parent)
495: .addAttribute((Attribute) replacement);
496: } else {
497: parent.insertChild(replacement, parent
498: .indexOf(element));
499: }
500: }
501: parent.removeChild(element);
502: } else {
503: throw new BadParseAttributeException(
504: "Bad value for parse attribute: " + parse,
505: element.getDocument().getBaseURI());
506: }
507:
508: } catch (IOException ex) {
509: processFallback(element, builder, baseURLs, parent, ex);
510: } catch (XPointerSyntaxException ex) {
511: processFallback(element, builder, baseURLs, parent, ex);
512: } catch (XPointerResourceException ex) {
513: // Process fallbacks; I'm not sure this is correct
514: // behavior. Possibly this should include nothing. See
515: // http://lists.w3.org/Archives/Public/www-xml-xinclude-comments/2003Aug/0000.html
516: // Daniel Veillard thinks this is correct. See
517: // http://lists.w3.org/Archives/Public/www-xml-xinclude-comments/2003Aug/0001.html
518: processFallback(element, builder, baseURLs, parent, ex);
519: }
520:
521: } else if (isFallbackElement(element)) {
522: throw new MisplacedFallbackException(
523: "Fallback element outside include element", element
524: .getDocument().getBaseURI());
525: } else {
526: Elements children = element.getChildElements();
527: for (int i = 0; i < children.size(); i++) {
528: resolve(children.get(i), builder, baseURLs);
529: }
530: }
531:
532: }
533:
534: // ???? Move this into URIUtil when it goes public
535: private static String getDirectoryBase(String parentBase) {
536: if (parentBase.endsWith("/"))
537: return parentBase;
538: int lastSlash = parentBase.lastIndexOf('/');
539: return parentBase.substring(0, lastSlash + 1);
540: }
541:
542: private static void verifyIncludeElement(Element element)
543: throws XIncludeException {
544:
545: testHref(element);
546: testForFragmentIdentifier(element);
547: verifyEncoding(element);
548: testForForbiddenChildElements(element);
549: }
550:
551: private static void testHref(Element include)
552: throws NoIncludeLocationException {
553:
554: String href = include.getAttributeValue("href");
555: String xpointer = include.getAttributeValue("xpointer");
556: if (href == null && xpointer == null) {
557: throw new NoIncludeLocationException(
558: "Missing href attribute", include.getDocument()
559: .getBaseURI());
560: }
561: }
562:
563: private static void testForFragmentIdentifier(Element include)
564: throws BadHrefAttributeException {
565:
566: String href = include.getAttributeValue("href");
567: if (href != null) {
568: if (href.indexOf('#') > -1) {
569: throw new BadHrefAttributeException(
570: "fragment identifier in URI " + href, include
571: .getBaseURI());
572: }
573: }
574:
575: }
576:
577: private static void verifyEncoding(Element include)
578: throws BadEncodingAttributeException {
579:
580: String encoding = include.getAttributeValue("encoding");
581: if (encoding == null)
582: return;
583: // production 81 of XML spec
584: // EncName :=[A-Za-z] ([A-Za-z0-9._] | '-')*
585: char[] text = encoding.toCharArray();
586: if (text.length == 0) {
587: throw new BadEncodingAttributeException(
588: "Empty encoding attribute", include.getBaseURI());
589: }
590: char c = text[0];
591: if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
592: throw new BadEncodingAttributeException(
593: "Illegal value for encoding attribute: " + encoding,
594: include.getBaseURI());
595: }
596: for (int i = 1; i < text.length; i++) {
597: c = text[i];
598: if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
599: || (c >= '0' && c <= '9') || c == '-' || c == '_'
600: || c == '.') {
601: continue;
602: }
603: throw new BadEncodingAttributeException(
604: "Illegal value for encoding attribute: " + encoding,
605: include.getBaseURI());
606: }
607:
608: }
609:
610: // hack because URIUtil isn't public
611: private static URL absolutize(URL baseURL, String href)
612: throws MalformedURLException, BadHrefAttributeException {
613:
614: Element parent = new Element("c");
615: parent.setBaseURI(baseURL.toExternalForm());
616: Element child = new Element("c");
617: parent.appendChild(child);
618: child.addAttribute(new Attribute("xml:base",
619: "http://www.w3.org/XML/1998/namespace", href));
620: URL result = new URL(child.getBaseURI());
621: if (!"".equals(href) && result.equals(baseURL)) {
622: if (!baseURL.toExternalForm().endsWith(href)) {
623: throw new BadHrefAttributeException(href
624: + " is not a syntactically correct IRI");
625: }
626: }
627: return result;
628:
629: }
630:
631: private static void testURISyntax(String href) {
632: Element e = new Element("e");
633: e.setNamespaceURI(href);
634: }
635:
636: private static String getXMLLangValue(Element element) {
637:
638: while (true) {
639: Attribute lang = element.getAttribute("lang",
640: "http://www.w3.org/XML/1998/namespace");
641: if (lang != null)
642: return lang.getValue();
643: ParentNode parent = element.getParent();
644: if (parent == null)
645: return "";
646: else if (parent instanceof Document)
647: return "";
648: else
649: element = (Element) parent;
650: }
651:
652: }
653:
654: // This assumes current implementation of XPointer that
655: // always selects exactly one element or throws an exception.
656: private static Nodes resolveXPointerSelection(Nodes in,
657: Builder builder, ArrayList baseURLs, Document original)
658: throws IOException, ParsingException, XIncludeException {
659:
660: Element preinclude = (Element) in.get(0);
661: return resolveSilently(preinclude, builder, baseURLs, original);
662:
663: }
664:
665: private static boolean contains(ParentNode ancestor, Node descendant) {
666:
667: for (Node parent = descendant; parent != null; parent = parent
668: .getParent()) {
669: if (parent == ancestor)
670: return true;
671: }
672:
673: return false;
674:
675: }
676:
677: private static Nodes resolveSilently(Element element,
678: Builder builder, ArrayList baseURLs, Document originalDoc)
679: throws IOException, ParsingException, XIncludeException {
680:
681: // There is no possibility the element passed to this method
682: // is an include or a fallback element
683: if (isIncludeElement(element) || isFallbackElement(element)) {
684: throw new RuntimeException(
685: "XOM BUG: include or fallback element passed to resolveSilently;"
686: + " please report with a test case");
687: }
688:
689: Elements children = element.getChildElements();
690: for (int i = 0; i < children.size(); i++) {
691: resolve(children.get(i), builder, baseURLs, originalDoc);
692: }
693: return new Nodes(element);
694:
695: }
696:
697: private static void testForForbiddenChildElements(Element element)
698: throws XIncludeException {
699:
700: int fallbacks = 0;
701: Elements children = element.getChildElements();
702: int size = children.size();
703: for (int i = 0; i < size; i++) {
704: Element child = children.get(i);
705: if (XINCLUDE_NS.equals(child.getNamespaceURI())) {
706: if ("fallback".equals(child.getLocalName())) {
707: fallbacks++;
708: if (fallbacks > 1) {
709: throw new XIncludeException(
710: "Multiple fallback elements", element
711: .getDocument().getBaseURI());
712: }
713: } else {
714: throw new XIncludeException(
715: "Include element contains an include child",
716: element.getDocument().getBaseURI());
717: }
718: }
719: }
720:
721: }
722:
723: private static void processFallback(Element includeElement,
724: Builder builder, ArrayList baseURLs, ParentNode parent,
725: Exception ex) throws XIncludeException, IOException,
726: ParsingException {
727:
728: Element fallback = includeElement.getFirstChildElement(
729: "fallback", XINCLUDE_NS);
730: if (fallback == null) {
731: if (ex instanceof IOException)
732: throw (IOException) ex;
733: XIncludeException ex2 = new XIncludeException(ex
734: .getMessage(), includeElement.getDocument()
735: .getBaseURI());
736: ex2.initCause(ex);
737: throw ex2;
738: }
739:
740: while (fallback.getChildCount() > 0) {
741: Node child = fallback.getChild(0);
742: if (child instanceof Element) {
743: resolve((Element) child, builder, baseURLs);
744: }
745: child = fallback.getChild(0);
746: child.detach();
747: parent.insertChild(child, parent.indexOf(includeElement));
748: }
749: includeElement.detach();
750:
751: }
752:
753: // I could probably move the xpointer out of this method
754: private static Nodes downloadXMLDocument(URL source,
755: String xpointer, Builder builder, ArrayList baseURLs,
756: String accept, String acceptLanguage, String parentLanguage)
757: throws IOException, ParsingException, XIncludeException,
758: XPointerSyntaxException, XPointerResourceException {
759:
760: String base = source.toExternalForm();
761: if (xpointer == null && baseURLs.indexOf(base) != -1) {
762: throw new InclusionLoopException(
763: "Tried to include the already included document "
764: + base + " from "
765: + baseURLs.get(baseURLs.size() - 1),
766: (String) baseURLs.get(baseURLs.size() - 1));
767: }
768:
769: URLConnection uc = source.openConnection();
770: setHeaders(uc, accept, acceptLanguage);
771: InputStream in = new BufferedInputStream(uc.getInputStream());
772: Document doc;
773: try {
774: doc = builder.build(in, source.toExternalForm());
775: } finally {
776: in.close();
777: }
778:
779: resolveInPlace(doc, builder, baseURLs);
780: Nodes included;
781: if (xpointer != null && xpointer.length() != 0) {
782: included = XPointer.query(doc, xpointer);
783: // fill in lang attributes here
784: for (int i = 0; i < included.size(); i++) {
785: Node node = included.get(i);
786: // Current implementation can only select elements
787: Element top = (Element) node;
788: Attribute lang = top.getAttribute("lang",
789: "http://www.w3.org/XML/1998/namespace");
790: if (lang == null) {
791: String childLanguage = getXMLLangValue(top);
792: if (!parentLanguage.equals(childLanguage)) {
793: top.addAttribute(new Attribute("xml:lang",
794: "http://www.w3.org/XML/1998/namespace",
795: childLanguage));
796: }
797: }
798: }
799: } else {
800: included = new Nodes();
801: for (int i = 0; i < doc.getChildCount(); i++) {
802: Node child = doc.getChild(i);
803: if (!(child instanceof DocType)) {
804: included.append(child);
805: }
806: }
807: }
808: // so we can detach the old root if necessary
809: doc.setRootElement(new Element("f"));
810: for (int i = 0; i < included.size(); i++) {
811: Node node = included.get(i);
812: // Take account of xml:base attribute, which we normally
813: // don't do when detaching
814: String noFragment = node.getBaseURI();
815: if (noFragment.indexOf('#') >= 0) {
816: noFragment = noFragment.substring(0, noFragment
817: .indexOf('#'));
818: }
819: node.detach();
820: if (node instanceof Element) {
821: ((Element) node).setBaseURI(noFragment);
822: }
823: }
824:
825: return included;
826:
827: }
828:
829: /**
830: * <p>
831: * This utility method reads a document at a specified URL
832: * and returns the contents of that document as a <code>Text</code>.
833: * It's used to include files with <code>parse="text"</code>.
834: * </p>
835: *
836: * @param source <code>URL</code> of the document to download
837: * @param encoding encoding of the document; e.g. UTF-8,
838: * ISO-8859-1, etc.
839: * @param builder the <code>Builder</code> used to build the
840: * nodes included from other documents
841: *
842: * @return the document retrieved from the source <code>URL</code>
843: *
844: * @throws IOException if the remote document cannot
845: * be read due to an I/O error
846: */
847: private static Nodes downloadTextDocument(URL source,
848: String encoding, Builder builder, String accept,
849: String language) throws IOException, XIncludeException {
850:
851: if (encoding == null || encoding.length() == 0) {
852: encoding = "UTF-8";
853: }
854:
855: URLConnection uc = source.openConnection();
856: setHeaders(uc, accept, language);
857:
858: String encodingFromHeader = uc.getContentEncoding();
859: String contentType = uc.getContentType();
860: int contentLength = uc.getContentLength();
861: if (contentLength < 0)
862: contentLength = 1024;
863: InputStream in = new BufferedInputStream(uc.getInputStream());
864: try {
865: if (encodingFromHeader != null)
866: encoding = encodingFromHeader;
867: else {
868: if (contentType != null) {
869: contentType = contentType
870: .toLowerCase(Locale.ENGLISH);
871: if (contentType.equals("text/xml")
872: || contentType.equals("application/xml")
873: || (contentType.startsWith("text/") && contentType
874: .endsWith("+xml"))
875: || (contentType.startsWith("application/") && contentType
876: .endsWith("+xml"))) {
877: encoding = EncodingHeuristics
878: .readEncodingFromStream(in);
879: }
880: }
881: }
882: // workaround for pre-1.3 VMs that don't recognize UTF-16
883: if (version.startsWith("1.2") || version.startsWith("1.1")) {
884: if (encoding.equalsIgnoreCase("UTF-16")) {
885: // is it big-endian or little-endian?
886: in.mark(2);
887: int first = in.read();
888: if (first == 0xFF)
889: encoding = "UnicodeLittle";
890: else
891: encoding = "UnicodeBig";
892: in.reset();
893: } else if (encoding
894: .equalsIgnoreCase("UnicodeBigUnmarked")) {
895: encoding = "UnicodeBig";
896: } else if (encoding
897: .equalsIgnoreCase("UnicodeLittleUnmarked")) {
898: encoding = "UnicodeLittle";
899: }
900: }
901: Reader reader = new BufferedReader(new InputStreamReader(
902: in, encoding));
903: StringBuffer sb = new StringBuffer(contentLength);
904: for (int c = reader.read(); c != -1; c = reader.read()) {
905: sb.append((char) c);
906: }
907:
908: NodeFactory factory = builder.getNodeFactory();
909: if (factory != null) {
910: return factory.makeText(sb.toString());
911: } else
912: return new Nodes(new Text(sb.toString()));
913: } finally {
914: in.close();
915: }
916:
917: }
918:
919: private static void setHeaders(URLConnection uc, String accept,
920: String language) throws BadHTTPHeaderException {
921:
922: if (accept != null) {
923: checkHeader(accept);
924: uc.setRequestProperty("accept", accept);
925: }
926: if (language != null) {
927: checkHeader(language);
928: uc.setRequestProperty("accept-language", language);
929: }
930:
931: }
932:
933: private static void checkHeader(String header)
934: throws BadHTTPHeaderException {
935:
936: if (header == null)
937: return;
938: int length = header.length();
939: for (int i = 0; i < length; i++) {
940: char c = header.charAt(i);
941: if (c < 0x20 || c > 0x7E) {
942: throw new BadHTTPHeaderException(
943: "Header contains illegal character 0x"
944: + Integer.toHexString(c).toUpperCase());
945: }
946: }
947:
948: }
949:
950: private static boolean isIncludeElement(Element element) {
951:
952: return element.getLocalName().equals("include")
953: && element.getNamespaceURI().equals(XINCLUDE_NS);
954:
955: }
956:
957: private static boolean isFallbackElement(Element element) {
958:
959: return element.getLocalName().equals("fallback")
960: && element.getNamespaceURI().equals(XINCLUDE_NS);
961:
962: }
963:
964: }
|