001: /**********************************************************************************
002: *
003: * Copyright (c) 2003, 2004 The Regents of the University of Michigan, Trustees of Indiana University,
004: * Board of Trustees of the Leland Stanford, Jr., University, and The MIT Corporation
005: *
006: * Licensed under the Educational Community License Version 1.0 (the "License");
007: * By obtaining, using and/or copying this Original Work, you agree that you have read,
008: * understand, and will comply with the terms and conditions of the Educational Community License.
009: * You may obtain a copy of the License at:
010: *
011: * http://cvs.sakaiproject.org/licenses/license_1_0.html
012: *
013: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
014: * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE
015: * AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
016: * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
017: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
018: *
019: **********************************************************************************/package edu.indiana.lib.twinpeaks.util;
020:
021: import java.io.*;
022: import java.util.*;
023:
024: import javax.xml.parsers.*;
025: import javax.xml.transform.dom.DOMSource;
026: import javax.xml.transform.stream.StreamResult;
027: import javax.xml.transform.Transformer;
028: import javax.xml.transform.TransformerFactory;
029:
030: import org.w3c.dom.*;
031: import org.w3c.dom.html.*;
032: import org.xml.sax.*;
033:
034: public class DomUtils {
035:
/** Log obtained from the project's LogUtils helper for this class. */
private static org.apache.commons.logging.Log _log =
    LogUtils.getLog(DomUtils.class);

/**
 * NekoHTML property name: default encoding
 */
private static final String ENCODING_OPTION =
    "http://cyberneko.org/html/properties/default-encoding";

/**
 * Private constructor (prevents instantiation from outside this class)
 */
private DomUtils() {
}

/** Default input encoding (paired with ENCODING_OPTION by the disabled NekoHTML parser setup) */
public static final String INPUT_ENCODING = "iso-8859-1";
/** Character encoding used by serialize() when writing document text */
public static final String ENCODING = "UTF-8";
048:
049: /**
050: * Create a new element
051: * @param document Document to contain the new element
052: * @param name the element name
053: * @return new Element
054: */
055: public static Element createElement(Document document, String name) {
056: Element element;
057:
058: return document.createElement(name);
059: }
060:
061: /**
062: * Add a new element to the given parent
063: * @param parent the parent Element
064: * @param name the child name
065: * @return new Element
066: */
067: public static Element createElement(Element parent, String name) {
068: Document document;
069: Element element;
070:
071: document = parent.getOwnerDocument();
072: element = document.createElement(name);
073:
074: parent.appendChild(element);
075: return element;
076: }
077:
078: /**
079: * Add Text object to an Element.
080: * @param element the containing element
081: * @param text the text to add
082: */
083: public static void addText(Element element, String text) {
084: element.appendChild(element.getOwnerDocument().createTextNode(
085: text));
086: }
087:
088: /**
089: * Add an entity to a specified Element.
090: * (eg <code>DomUtils.addEntity(element, "nbsp");</code>)
091: * @param element the containing element
092: * @param entity the entity to add
093: */
094: public static void addEntity(Element element, String entity) {
095: element.appendChild(element.getOwnerDocument()
096: .createEntityReference(entity));
097: }
098:
099: /**
100: * "Normalize" XML text node content to create a simple string
101: * @param update Text to add to the original string
102: * @return Concatenated contents (trimmed, pagination characters (\r, \n, etc.)
103: * removed, with a space seperator)
104: */
105: public static String normalizeText(String update) {
106: return normalizeText(null, update);
107: }
108:
109: /**
110: * "Normalize" XML text node content to create a simple string
111: * @param original Original text
112: * @param update Text to add to the original string
113: * @return Concatenated contents (trimmed, pagination characters (\r, \n, etc.)
114: * removed, with a space seperator)
115: */
116: public static String normalizeText(String original, String update) {
117: StringBuffer result;
118:
119: if (original == null) {
120: return (update == null) ? "" : StringUtils.replace(update
121: .trim(), "\\s", " ");
122: }
123:
124: result = new StringBuffer(original.trim());
125: result.append(' ');
126: result.append(update.trim());
127:
128: return StringUtils.replace(result.toString(), "\\s", " ");
129: }
130:
131: /**
132: * Get the text associated with this element, at this level only
133: * @param parent the node containing text
134: * @return Text (trimmed of leading/trailing whitespace, null if none)
135: */
136: public static String getText(Node parent) {
137: return textSearch(parent, false);
138: }
139:
140: /**
141: * Get the text associated with this element, at all suboordinate levels
142: * @param parent the node containing text
143: * @return Text (trimmed of leading/trailing whitespace, null if none)
144: */
145: public static String getAllTextAtNode(Node parent) {
146: return textSearch(parent, true);
147: }
148:
149: /**
150: * Get the text associated with this element at this level only, or
151: * recursivley, searching through all child elements
152: * @param parent the node containing text
153: * @param recursiveSearch Search all child elements?
154: * @return Text (trimmed of leading/trailing whitespace, null if none)
155: */
156: public static String textSearch(Node parent, boolean recursiveSearch) {
157: String text = null;
158:
159: if (parent != null) {
160: for (Node child = parent.getFirstChild(); child != null; child = child
161: .getNextSibling()) {
162:
163: switch (child.getNodeType()) {
164: case Node.TEXT_NODE:
165: text = normalizeText(text, child.getNodeValue());
166: break;
167:
168: case Node.ELEMENT_NODE:
169: if (recursiveSearch) {
170: text = normalizeText(text, getText(child));
171: }
172: break;
173:
174: default:
175: break;
176: }
177: }
178: }
179: return text == null ? text : text.trim();
180: }
181:
182: /**
183: * Get the first text node associated with this element
184: * @param parent the node containing text
185: * @return Text (trimmed of leanding/trailing whitespace, null if none)
186: */
187: public static String getFirstText(Node parent) {
188: return getTextNodeByNumber(parent, 1);
189: }
190:
191: /**
192: * Get the specified text node associated with this element
193: * @param parent the node containing text
194: * @param number The text node to fetch (1st, 2nd, etc)
195: * @return Text (trimmed of leanding/trailing whitespace, null if none)
196: */
197: public static String getTextNodeByNumber(Node parent, int number) {
198: String text = null;
199: int count = 1;
200:
201: if (parent != null) {
202: for (Node child = parent.getFirstChild(); child != null; child = child
203: .getNextSibling()) {
204:
205: if ((child.getNodeType() == Node.TEXT_NODE)
206: && (count++ == number)) {
207: text = child.getNodeValue();
208: return text.trim();
209: }
210: }
211: }
212: return text;
213: }
214:
215: /**
216: * Get any text associated with this element and it's children. Null if none.
217: * @param parent the node containing text
218: * @return Text
219: */
220: public static String getAllText(Node parent) {
221: String text = null;
222:
223: if (parent != null) {
224:
225: for (Node child = parent.getFirstChild(); child != null; child = child
226: .getNextSibling()) {
227:
228: if (child.getNodeType() == Node.TEXT_NODE) {
229: text = normalizeText(text, child.getNodeValue());
230: continue;
231: }
232:
233: if (child.getNodeType() == Node.ELEMENT_NODE) {
234: String childText = getText(child);
235:
236: if (childText != null) {
237: text = normalizeText(text, childText);
238: }
239: }
240: }
241: }
242: return text;
243: }
244:
245: /**
246: * Get an Attribute from an Element. Returns an empty String if none found
247: * @param element the containing Element
248: * @param name the attribute name
249: * @return Attribute as a String
250: */
251: public static String getAttribute(Element element, String name) {
252: return element.getAttribute(name);
253: }
254:
255: /**
256: * Set an Attribute in an Element
257: * @param element the containing Element
258: * @param name the attribute name
259: * @param value the attribute value
260: */
261: public static void setAttribute(Element element, String name,
262: String value) {
263: element.setAttribute(name, value);
264: }
265:
266: /**
267: * Return a list of named Elements.
268: * @param element the containing Element
269: * @param name the tag name
270: * @return NodeList of matching elements
271: */
272: public static NodeList getElementList(Element element, String name) {
273: return element.getElementsByTagName(name);
274: }
275:
276: /**
277: * Return a list of named Elements with a specific attribute value.
278: * @param element the containing Element
279: * @param name the tag name
280: * @param attribute Attribute name
281: * @param value Attribute value
282: * @return List of matching elements
283: */
284: public static List selectElementsByAttributeValue(Element element,
285: String name, String attribute, String value) {
286: return selectElementsByAttributeValue(element, name, attribute,
287: value, false);
288: }
289:
290: /**
291: * Return the first named Element with a specific attribute value.
292: * @param element the containing Element
293: * @param name the tag name
294: * @param attribute Attribute name
295: * @param value Attribute value
296: * @return The first matching Element (null if none)
297: */
298: public static Element selectFirstElementByAttributeValue(
299: Element element, String name, String attribute, String value) {
300:
301: ArrayList resultList = (ArrayList) selectElementsByAttributeValue(
302: element, name, attribute, value, true);
303: return (resultList.size() == 0) ? null : (Element) resultList
304: .get(0);
305: }
306:
307: /**
308: * Return a list of named Elements with a specific attribute value.
309: * @param element the containing Element
310: * @param name the tag name
311: * @param attribute Attribute name
312: * @param value Attribute value
313: * @param returnFirst Return only the first matching value?
314: * @return List of matching elements
315: */
316: public static List selectElementsByAttributeValue(Element element,
317: String name, String attribute, String value,
318: boolean returnFirst) {
319: NodeList elementList = element.getElementsByTagName(name);
320: List resultList = new ArrayList();
321:
322: for (int i = 0; i < elementList.getLength(); i++) {
323: if (getAttribute((Element) elementList.item(i), attribute)
324: .equals(value)) {
325: resultList.add(elementList.item(i));
326: if (returnFirst) {
327: break;
328: }
329: }
330: }
331: return resultList;
332: }
333:
334: /**
335: * Return the first named Element found. Null if none.
336: * @param element the containing Element
337: * @param name the tag name
338: * @return matching Element (null if none)
339: */
340: public static Element getElement(Element element, String name) {
341: NodeList nodeList = getElementList(element, name);
342: return (nodeList.getLength() == 0) ? null : (Element) nodeList
343: .item(0);
344: }
345:
346: /**
347: * Remove this node from its parent.
348: * @param node the node to remove
349: * @return Node removed
350: */
351: public Node removeNode(Node node) {
352: return node.getParentNode().removeChild(node);
353: }
354:
355: /**
356: * Search up the tree for a given node
357: * @param currentNode Starting point for our search
358: * @param tagName Node name to look up
359: * @return matching Node (null if none)
360: */
361: public static Node getPreviousNodeByName(Node currentNode,
362: String tagName) {
363: Node node = currentNode.getParentNode();
364:
365: while ((node != null) && (!node.getNodeName().equals(tagName))) {
366: node = node.getParentNode();
367: }
368: return node;
369: }
370:
371: /**
372: * Search earlier siblings for a given node
373: * @param currentNode Starting point for our search
374: * @param tagName Node name to look up
375: * @return matching Node (null if none)
376: */
377: public static Node getPreviousSiblingByName(Node currentNode,
378: String tagName) {
379: Node node = currentNode.getPreviousSibling();
380:
381: while ((node != null) && (!node.getNodeName().equals(tagName))) {
382: node = node.getPreviousSibling();
383: }
384: return node;
385: }
386:
387: /**
388: * Search our next siblings for a given node
389: * @param currentNode Starting point for our search
390: * @param tagName Node name to look up
391: * @return matching Node (null if none)
392: */
393: public static Node getNextSiblingByName(Node currentNode,
394: String tagName) {
395: Node node = currentNode.getNextSibling();
396:
397: while ((node != null) && (!node.getNodeName().equals(tagName))) {
398: node = node.getNextSibling();
399: }
400: return node;
401: }
402:
403: /**
404: * Search across the tree for a given sibling
405: * @param currentNode Starting point for our search
406: * @param tagName Node name to look up
407: * @return matching Node (null if none)
408: * @deprecated Replaced by {@link #getNextSiblingByName(Node currentNode, String tagName)}
409: */
410: public static Node getNextNodeByName(Node currentNode,
411: String tagName) {
412: return getNextSiblingByName(currentNode, tagName);
413: }
414:
415: /**
416: * Search for a named child of a given node
417: * @param currentNode Starting point for our search
418: * @param tagName Node name to look up
419: * @return matching Node (null if none)
420: */
421: public static Node getChildSiblingByName(Node currentNode,
422: String tagName) {
423: Node node = currentNode.getFirstChild();
424:
425: while ((node != null) && (!node.getNodeName().equals(tagName))) {
426: node = node.getNextSibling();
427: }
428: return node;
429: }
430:
431: /**
432: * Get a DOM Document builder.
433: * @return The DocumentBuilder
434: * @throws DomException
435: */
436: public static DocumentBuilder getXmlDocumentBuilder()
437: throws DomException {
438: try {
439: DocumentBuilderFactory factory;
440:
441: factory = DocumentBuilderFactory.newInstance();
442: factory.setNamespaceAware(false);
443:
444: return factory.newDocumentBuilder();
445:
446: } catch (Exception e) {
447: throw new DomException(e.toString());
448: }
449: }
450:
451: /**
452: * Start a new XML Document (with root name = xml)
453: * @return the Document
454: * @throws DomException
455: */
456: public static Document createXmlDocument() throws DomException {
457: return createXmlDocument("xml");
458: }
459:
460: /**
461: * Start a new XML Document.
462: * @param rootName The name of the Document root Element (created here)
463: * @return the Document
464: * @throws DomException
465: */
466: public static Document createXmlDocument(String rootName)
467: throws DomException {
468: try {
469: Document document = getXmlDocumentBuilder().newDocument();
470: Element root = document.createElement(rootName);
471:
472: document.appendChild(root);
473: return document;
474:
475: } catch (Exception e) {
476: throw new DomException(e.toString());
477: }
478: }
479:
480: /**
481: * Copy an XML document, adding it as a child of the target document root
482: * @param source Document to copy
483: * @param target Document to contain copy
484: */
485: public static void copyDocument(Document source, Document target) {
486: Node node = target
487: .importNode(source.getDocumentElement(), true);
488:
489: target.getDocumentElement().appendChild(node);
490: }
491:
492: /**
493: * Copy a Node from one source document, adding it to the document
494: * root of a different, target Document
495: * @param source Document to copy
496: * @param target Document to contain copy
497: */
498: public static void copyDocumentNode(Node source, Document target) {
499: Node node = target.importNode(source, true);
500:
501: target.getDocumentElement().appendChild(node);
502: }
503:
504: /**
505: * Parse XML text (from an input stream) into a Document.
506: * @param xmlStream The XML text stream
507: * @return DOM Document
508: * @throws DomException
509: */
510: public static Document parseXmlStream(InputStream xmlStream)
511: throws DomException {
512: try {
513: return getXmlDocumentBuilder().parse(
514: new InputSource(xmlStream));
515:
516: } catch (Exception e) {
517: throw new DomException(e.toString());
518: }
519: }
520:
521: /**
522: * Parse XML text (from a Reader) into a Document.
523: * @param xmlReader The XML Reader
524: * @return DOM Document
525: * @throws DomException
526: */
527: public static Document parseXmlReader(Reader xmlReader)
528: throws DomException {
529:
530: try {
531: return getXmlDocumentBuilder().parse(
532: new InputSource(xmlReader));
533:
534: } catch (Exception e) {
535: throw new DomException(e.toString());
536: }
537: }
538:
539: /**
540: * Parse XML text (from a raw byte array) into a Document.
541: * @param xml The XML text
542: * @return DOM Document
543: * @throws DomException
544: */
545: public static Document parseXmlBytes(byte[] xml)
546: throws DomException {
547: return parseXmlStream(new ByteArrayInputStream(xml));
548: }
549:
550: /**
551: * Parse XML text (from a string) into a Document.
552: * @param xml The XML text
553: * @return DOM Document
554: * @throws DomException
555: */
556: public static Document parseXmlString(String xml)
557: throws DomException {
558: return parseXmlStream(new ByteArrayInputStream(xml.getBytes()));
559: }
560:
561: /**
562: * Parse an XML file into a Document.
563: * @param filename - The filename to parse
564: * @return DOM Document
565: * @throws DomException
566: */
567: public static Document parseXmlFile(String filename)
568: throws DomException {
569: try {
570: return getXmlDocumentBuilder().parse(filename);
571: } catch (Exception exception) {
572: throw new DomException(exception.toString());
573: }
574: }
575:
576: /**
 * Set up and configure an HTML DOM parser. We specify a
578: * default encoding value to be used when no encoding information
579: * is available in the HTML document itself.
580: *
581: * An appropriate META tag will override this default:
582: * <code>
583: * <meta http-equiv="Content-Type" content="text/html; charset=XXXX">
584: * </code>
585: *
586: * @return The parser
587: */
588:
589: /*******************************************************************************
590:
591: * We originally used the Neko HTML parser here. This was a boon as it
 * gracefully handled both HTML and XML (which it wrapped in HTML and
 * BODY tags). Sadly, it is closely tied to Xerces.
594: *
595: * At a future date, we'll look for an appropriate substitute. At present,
596: * parsing only XML is good enough (the Sirsi Web2 Bridge is the only
597: * supported search source, and it's an XML API to SingleSearch).
598:
599: private static org.cyberneko.html.parsers.DOMParser newHtmlDomParser()
600: throws SAXNotRecognizedException, SAXNotSupportedException {
601: org.cyberneko.html.parsers.DOMParser domParser;
602:
603: domParser = new org.cyberneko.html.parsers.DOMParser();
604: domParser.setProperty(ENCODING_OPTION, INPUT_ENCODING);
605:
606: return domParser;
607: }
608:
609: *******************************************************************************/
610:
611: /**
612: * Parse HTML from a Reader
613: * @param reader Reader input
614: * @return DOM Document
615: * @throws DomException
616: */
617:
618: /*******************************************************************************
619: *
620: * See notes on Neko HTML (above)
621: *
622:
623: public static Document parseHtmlReader(Reader reader) throws DomException {
624: return parseHtmlFromInputSource(new InputSource(reader));
625: }
626:
627: *******************************************************************************/
628:
629: /**
630: * Parse HTML from an InputSource
631: * @param in InputSource
632: * @return DOM Document
633: * @throws DomException
634: */
635:
636: /*******************************************************************************
637: *
638: * See notes on Neko HTML (above)
639: *
640:
641: public static Document parseHtmlFromInputSource(InputSource in) throws DomException {
642: try {
643: org.cyberneko.html.parsers.DOMParser domParser;
644:
645: domParser = newHtmlDomParser();
646: domParser.parse(in);
647: return domParser.getDocument();
648:
649: } catch (Exception e) {
650: throw new DomException(e.toString());
651: }
652: }
653:
654: *******************************************************************************/
655:
656: /**
657: * Parse HTML text (from a raw byte array) into a Document.
658: * @param html The HTML text
659: * @return DOM Document
660: * @throws DomException
661: *<p>
662: * The used to be:
663: * <code>parseHtmlStream(new ByteArrayInputStream(html));</code>
664: */
665: public static Document parseHtmlBytes(byte[] html)
666: throws DomException {
667: return parseXmlStream(new ByteArrayInputStream(html));
668: }
669:
670: /**
671: * Parse HTML text (from a String) into a Document.
672: * @param html The HTML text
673: * @return DOM Document
674: * @throws DomException
675: *<p>
676: * This used to be:
677: * <code>return parseHtmlReader(new StringReader(html));</code>
678: */
679: public static Document parseHtmlString(String html)
680: throws DomException {
681: return parseXmlReader(new StringReader(html));
682: }
683:
684: /**
685: * Write formatted XML text to supplied OutputStream.
686: * @param node Node to write
687: * @param target stream to write to
688: * @throws DomException
689: */
690: public static void serializeXml(Node node, OutputStream target)
691: throws DomException {
692: try {
693: Transformer transformer = TransformerFactory.newInstance()
694: .newTransformer();
695: transformer.transform(new DOMSource(node),
696: new StreamResult(target));
697:
698: } catch (Exception e) {
699: throw new DomException(e.toString());
700: }
701: }
702:
703: /**
704: * Write formatted XML text to supplied Writer.
705: * @param node the Node to write
706: * @param writer Writer the document is written to
707: * @throws DomException
708: */
709: public static void serializeXml(Node node, Writer writer)
710: throws DomException {
711: try {
712: Transformer transformer = TransformerFactory.newInstance()
713: .newTransformer();
714: transformer.transform(new DOMSource(node),
715: new StreamResult(writer));
716:
717: } catch (Exception e) {
718: throw new DomException(e.toString());
719: }
720: }
721:
722: /**
723: * Write formatted XML text to a String.
724: * @param object The XML Document, HTML Document, or Element to write
725: * @return String containing the formatted document text
726: * @throws DomException
727: */
728: public static String serialize(Object object) throws DomException {
729: ByteArrayOutputStream stream = null;
730: Writer writer = null;
731:
732: try {
733: stream = new ByteArrayOutputStream();
734: writer = new OutputStreamWriter(stream, ENCODING);
735:
736: if (object instanceof Document) {
737: serializeXml((Node) ((Document) object)
738: .getDocumentElement(), writer);
739: } else if (object instanceof Element) {
740: serializeXml((Node) object, writer);
741: } else {
742: throw new IllegalArgumentException(
743: "Unexpected object for serialzation: "
744: + object.toString());
745: }
746: return stream.toString();
747:
748: } catch (Exception e) {
749: throw new DomException(e.toString());
750:
751: } finally {
752: try {
753: if (writer != null)
754: writer.close();
755: } catch (Exception ignore) {
756: }
757: try {
758: if (stream != null)
759: stream.close();
760: } catch (Exception ignore) {
761: }
762: }
763: }
764: }
|