001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.html.dom;
018:
019: import java.util.Vector;
020:
021: import org.apache.xerces.dom.ElementImpl;
022: import org.apache.xerces.dom.ProcessingInstructionImpl;
023: import org.apache.xerces.dom.TextImpl;
024: import org.w3c.dom.Node;
025: import org.w3c.dom.html.HTMLDocument;
026: import org.xml.sax.AttributeList;
027: import org.xml.sax.DocumentHandler;
028: import org.xml.sax.Locator;
029: import org.xml.sax.SAXException;
030:
031: /**
032: * This is a SAX document handler that is used to build an HTML document.
033: * It can build a document from any SAX parser, but is specifically tuned
034: * for working with the OpenXML HTML parser.
035: *
036: *
037: * @version $Revision: 449313 $ $Date: 2006-09-23 18:01:43 -0400 (Sat, 23 Sep 2006) $
038: * @author <a href="mailto:arkin@openxml.org">Assaf Arkin</a>
039: */
040: public class HTMLBuilder implements DocumentHandler {
041:
042: /**
043: * The document that is being built.
044: */
045: protected HTMLDocumentImpl _document;
046:
047: /**
048: * The current node in the document into which elements, text and
049: * other nodes will be inserted. This starts as the document itself
050: * and reflects each element that is currently being parsed.
051: */
052: protected ElementImpl _current;
053:
054: /**
055: * Applies only to whitespace appearing between element tags in element content,
056: * as per the SAX definition, and true by default.
057: */
058: private boolean _ignoreWhitespace = true;
059:
060: /**
061: * Indicates whether finished building a document. If so, can start building
062: * another document. Must be initially true to get the first document processed.
063: */
064: private boolean _done = true;
065:
066: /**
067: * The document is only created the same time as the document element, however, certain
068: * nodes may precede the document element (comment and PI), and they are accumulated
069: * in this vector.
070: */
071: protected Vector _preRootNodes;
072:
073: public void startDocument() throws SAXException {
074: if (!_done)
075: throw new SAXException(
076: "HTM001 State error: startDocument fired twice on one builder.");
077: _document = null;
078: _done = false;
079: }
080:
081: public void endDocument() throws SAXException {
082: if (_document == null)
083: throw new SAXException(
084: "HTM002 State error: document never started or missing document element.");
085: if (_current != null)
086: throw new SAXException(
087: "HTM003 State error: document ended before end of document element.");
088: _current = null;
089: _done = true;
090: }
091:
092: public synchronized void startElement(String tagName,
093: AttributeList attrList) throws SAXException {
094: ElementImpl elem;
095: int i;
096:
097: if (tagName == null)
098: throw new SAXException("HTM004 Argument 'tagName' is null.");
099:
100: // If this is the root element, this is the time to create a new document,
101: // because only know we know the document element name and namespace URI.
102: if (_document == null) {
103: // No need to create the element explicitly.
104: _document = new HTMLDocumentImpl();
105: elem = (ElementImpl) _document.getDocumentElement();
106: _current = elem;
107: if (_current == null)
108: throw new SAXException(
109: "HTM005 State error: Document.getDocumentElement returns null.");
110:
111: // Insert nodes (comment and PI) that appear before the root element.
112: if (_preRootNodes != null) {
113: for (i = _preRootNodes.size(); i-- > 0;)
114: _document.insertBefore((Node) _preRootNodes
115: .elementAt(i), elem);
116: _preRootNodes = null;
117: }
118:
119: } else {
120: // This is a state error, indicates that document has been parsed in full,
121: // or that there are two root elements.
122: if (_current == null)
123: throw new SAXException(
124: "HTM006 State error: startElement called after end of document element.");
125: elem = (ElementImpl) _document.createElement(tagName);
126: _current.appendChild(elem);
127: _current = elem;
128: }
129:
130: // Add the attributes (specified and not-specified) to this element.
131: if (attrList != null) {
132: for (i = 0; i < attrList.getLength(); ++i)
133: elem.setAttribute(attrList.getName(i), attrList
134: .getValue(i));
135: }
136: }
137:
138: public void endElement(String tagName) throws SAXException {
139: if (_current == null)
140: throw new SAXException(
141: "HTM007 State error: endElement called with no current node.");
142: if (!_current.getNodeName().equalsIgnoreCase(tagName))
143: throw new SAXException(
144: "HTM008 State error: mismatch in closing tag name "
145: + tagName + "\n" + tagName);
146:
147: // Move up to the parent element. When you reach the top (closing the root element).
148: // the parent is document and current is null.
149: if (_current.getParentNode() == _current.getOwnerDocument())
150: _current = null;
151: else
152: _current = (ElementImpl) _current.getParentNode();
153: }
154:
155: public void characters(String text) throws SAXException {
156: if (_current == null)
157: throw new SAXException(
158: "HTM009 State error: character data found outside of root element.");
159: _current.appendChild(new TextImpl(_document, text));
160: }
161:
162: public void characters(char[] text, int start, int length)
163: throws SAXException {
164: if (_current == null)
165: throw new SAXException(
166: "HTM010 State error: character data found outside of root element.");
167: _current.appendChild(new TextImpl(_document, new String(text,
168: start, length)));
169: }
170:
171: public void ignorableWhitespace(char[] text, int start, int length)
172: throws SAXException {
173: if (!_ignoreWhitespace)
174: _current.appendChild(new TextImpl(_document, new String(
175: text, start, length)));
176: }
177:
178: public void processingInstruction(String target, String instruction)
179: throws SAXException {
180: // Processing instruction may appear before the document element (in fact, before the
181: // document has been created, or after the document element has been closed.
182: if (_current == null && _document == null) {
183: if (_preRootNodes == null)
184: _preRootNodes = new Vector();
185: _preRootNodes.addElement(new ProcessingInstructionImpl(
186: null, target, instruction));
187: } else if (_current == null && _document != null)
188: _document.appendChild(new ProcessingInstructionImpl(
189: _document, target, instruction));
190: else
191: _current.appendChild(new ProcessingInstructionImpl(
192: _document, target, instruction));
193: }
194:
195: public HTMLDocument getHTMLDocument() {
196: return _document;
197: }
198:
199: public void setDocumentLocator(Locator locator) {
200: // ignored
201: }
202:
203: }
|