001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057: package org.apache.html.dom;
058:
059: import java.util.Vector;
060: import org.w3c.dom.*;
061: import org.w3c.dom.html.*;
062: import org.xml.sax.*;
063: import org.apache.xerces.dom.TextImpl;
064: import org.apache.xerces.dom.ElementImpl;
065: import org.apache.xerces.dom.ProcessingInstructionImpl;
066:
067: /**
068: * This is a SAX document handler that is used to build an HTML document.
069: * It can build a document from any SAX parser, but is specifically tuned
070: * for working with the OpenXML HTML parser.
071: *
072: *
073: * @version $Revision: 1.4 $ $Date: 2000/12/21 00:33:38 $
074: * @author <a href="mailto:arkin@openxml.org">Assaf Arkin</a>
075: */
076: public class HTMLBuilder implements DocumentHandler {
077:
078: /**
079: * The document that is being built.
080: */
081: protected HTMLDocumentImpl _document;
082:
083: /**
084: * The current node in the document into which elements, text and
085: * other nodes will be inserted. This starts as the document iself
086: * and reflects each element that is currently being parsed.
087: */
088: protected ElementImpl _current;
089:
090: /**
091: * A reference to the current locator, this is generally the parser
092: * itself. The locator is used to locate errors and identify the
093: * source locations of elements.
094: */
095: private Locator _locator;
096:
097: /**
098: * Applies only to whitespace appearing between element tags in element content,
099: * as per the SAX definition, and true by default.
100: */
101: private boolean _ignoreWhitespace = true;
102:
103: /**
104: * Indicates whether finished building a document. If so, can start building
105: * another document. Must be initially true to get the first document processed.
106: */
107: private boolean _done = true;
108:
109: /**
110: * The document is only created the same time as the document element, however, certain
111: * nodes may precede the document element (comment and PI), and they are accumulated
112: * in this vector.
113: */
114: protected Vector _preRootNodes;
115:
116: public void startDocument() throws SAXException {
117: if (!_done)
118: throw new SAXException(
119: "HTM001 State error: startDocument fired twice on one builder.");
120: _document = null;
121: _done = false;
122: }
123:
124: public void endDocument() throws SAXException {
125: if (_document == null)
126: throw new SAXException(
127: "HTM002 State error: document never started or missing document element.");
128: if (_current != null)
129: throw new SAXException(
130: "HTM003 State error: document ended before end of document element.");
131: _current = null;
132: _done = true;
133: }
134:
135: public synchronized void startElement(String tagName,
136: AttributeList attrList) throws SAXException {
137: ElementImpl elem;
138: int i;
139:
140: if (tagName == null)
141: throw new SAXException("HTM004 Argument 'tagName' is null.");
142:
143: // If this is the root element, this is the time to create a new document,
144: // because only know we know the document element name and namespace URI.
145: if (_document == null) {
146: // No need to create the element explicitly.
147: _document = new HTMLDocumentImpl();
148: elem = (ElementImpl) _document.getDocumentElement();
149: _current = elem;
150: if (_current == null)
151: throw new SAXException(
152: "HTM005 State error: Document.getDocumentElement returns null.");
153:
154: // Insert nodes (comment and PI) that appear before the root element.
155: if (_preRootNodes != null) {
156: for (i = _preRootNodes.size(); i-- > 0;)
157: _document.insertBefore((Node) _preRootNodes
158: .elementAt(i), elem);
159: _preRootNodes = null;
160: }
161:
162: } else {
163: // This is a state error, indicates that document has been parsed in full,
164: // or that there are two root elements.
165: if (_current == null)
166: throw new SAXException(
167: "HTM006 State error: startElement called after end of document element.");
168: elem = (ElementImpl) _document.createElement(tagName);
169: _current.appendChild(elem);
170: _current = elem;
171: }
172:
173: // Add the attributes (specified and not-specified) to this element.
174: if (attrList != null) {
175: for (i = 0; i < attrList.getLength(); ++i)
176: elem.setAttribute(attrList.getName(i), attrList
177: .getValue(i));
178: }
179: }
180:
181: public void endElement(String tagName) throws SAXException {
182:
183: if (_current == null)
184: throw new SAXException(
185: "HTM007 State error: endElement called with no current node.");
186: if (!_current.getNodeName().equals(tagName))
187: throw new SAXException(
188: "HTM008 State error: mismatch in closing tag name "
189: + tagName + "\n" + tagName);
190:
191: // Move up to the parent element. When you reach the top (closing the root element).
192: // the parent is document and current is null.
193: if (_current.getParentNode() == _current.getOwnerDocument())
194: _current = null;
195: else
196: _current = (ElementImpl) _current.getParentNode();
197: }
198:
199: public void characters(String text) throws SAXException {
200: if (_current == null)
201: throw new SAXException(
202: "HTM009 State error: character data found outside of root element.");
203: _current.appendChild(new TextImpl(_document, text));
204: }
205:
206: public void characters(char[] text, int start, int length)
207: throws SAXException {
208: if (_current == null)
209: throw new SAXException(
210: "HTM010 State error: character data found outside of root element.");
211: _current.appendChild(new TextImpl(_document, new String(text,
212: start, length)));
213: }
214:
215: public void ignorableWhitespace(char[] text, int start, int length)
216: throws SAXException {
217: Node node;
218:
219: if (!_ignoreWhitespace)
220: _current.appendChild(new TextImpl(_document, new String(
221: text, start, length)));
222: }
223:
224: public void processingInstruction(String target, String instruction)
225: throws SAXException {
226: Node node;
227:
228: // Processing instruction may appear before the document element (in fact, before the
229: // document has been created, or after the document element has been closed.
230: if (_current == null && _document == null) {
231: if (_preRootNodes == null)
232: _preRootNodes = new Vector();
233: _preRootNodes.addElement(new ProcessingInstructionImpl(
234: null, target, instruction));
235: } else if (_current == null && _document != null)
236: _document.appendChild(new ProcessingInstructionImpl(
237: _document, target, instruction));
238: else
239: _current.appendChild(new ProcessingInstructionImpl(
240: _document, target, instruction));
241: }
242:
243: public HTMLDocument getHTMLDocument() {
244: return (HTMLDocument) _document;
245: }
246:
247: public void setDocumentLocator(Locator locator) {
248: _locator = locator;
249: }
250:
251: }
|