001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.parser;
020:
021: import java.io.ByteArrayInputStream;
022: import java.net.MalformedURLException;
023: import java.net.URL;
024: import java.util.Iterator;
025:
026: import org.apache.jorphan.logging.LoggingManager;
027: import org.apache.log.Logger;
028: import org.w3c.dom.Document;
029: import org.w3c.dom.NamedNodeMap;
030: import org.w3c.dom.Node;
031: import org.w3c.dom.NodeList;
032: import org.w3c.tidy.Tidy;
033: import org.xml.sax.SAXException;
034:
035: /**
036: * HtmlParser implementation using JTidy.
037: *
038: */
039: class JTidyHTMLParser extends HTMLParser {
040: private static final Logger log = LoggingManager
041: .getLoggerForClass();
042:
043: protected JTidyHTMLParser() {
044: super ();
045: }
046:
047: protected boolean isReusable() {
048: return true;
049: }
050:
051: /*
052: * (non-Javadoc)
053: *
054: * @see org.apache.jmeter.protocol.http.parser.HTMLParser#getEmbeddedResourceURLs(byte[],
055: * java.net.URL)
056: */
057: public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl,
058: URLCollection urls) throws HTMLParseException {
059: Document dom = null;
060: try {
061: dom = (Document) getDOM(html);
062: } catch (SAXException se) {
063: throw new HTMLParseException(se);
064: }
065:
066: // Now parse the DOM tree
067:
068: scanNodes(dom, urls, baseUrl);
069:
070: return urls.iterator();
071: }
072:
073: /**
074: * Scan nodes recursively, looking for embedded resources
075: *
076: * @param node -
077: * initial node
078: * @param urls -
079: * container for URLs
080: * @param baseUrl -
081: * used to create absolute URLs
082: *
083: * @return new base URL
084: */
085: private URL scanNodes(Node node, URLCollection urls, URL baseUrl)
086: throws HTMLParseException {
087: if (node == null) {
088: return baseUrl;
089: }
090:
091: String name = node.getNodeName();
092:
093: int type = node.getNodeType();
094:
095: switch (type) {
096:
097: case Node.DOCUMENT_NODE:
098: scanNodes(((Document) node).getDocumentElement(), urls,
099: baseUrl);
100: break;
101:
102: case Node.ELEMENT_NODE:
103:
104: NamedNodeMap attrs = node.getAttributes();
105: if (name.equalsIgnoreCase(TAG_BASE)) {
106: String tmp = getValue(attrs, ATT_HREF);
107: if (tmp != null)
108: try {
109: baseUrl = new URL(baseUrl, tmp);
110: } catch (MalformedURLException e) {
111: throw new HTMLParseException(e);
112: }
113: break;
114: }
115:
116: if (name.equalsIgnoreCase(TAG_IMAGE)
117: || name.equalsIgnoreCase(TAG_EMBED)) {
118: urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
119: break;
120: }
121:
122: if (name.equalsIgnoreCase(TAG_APPLET)) {
123: urls.addURL(getValue(attrs, "code"), baseUrl);
124: break;
125: }
126: if (name.equalsIgnoreCase(TAG_INPUT)) {
127: String src = getValue(attrs, ATT_SRC);
128: String typ = getValue(attrs, ATT_TYPE);
129: if ((src != null)
130: && (typ.equalsIgnoreCase(ATT_IS_IMAGE))) {
131: urls.addURL(src, baseUrl);
132: }
133: break;
134: }
135: if (name.equalsIgnoreCase(TAG_LINK)
136: && getValue(attrs, ATT_REL).equalsIgnoreCase(
137: STYLESHEET)) {
138: urls.addURL(getValue(attrs, ATT_HREF), baseUrl);
139: break;
140: }
141: if (name.equalsIgnoreCase(TAG_SCRIPT)) {
142: urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
143: break;
144: }
145: if (name.equalsIgnoreCase(TAG_FRAME)) {
146: urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
147: break;
148: }
149: String back = getValue(attrs, ATT_BACKGROUND);
150: if (back != null) {
151: urls.addURL(back, baseUrl);
152: }
153: if (name.equalsIgnoreCase(TAG_BGSOUND)) {
154: urls.addURL(getValue(attrs, ATT_SRC), baseUrl);
155: break;
156: }
157:
158: String style = getValue(attrs, ATT_STYLE);
159: if (style != null) {
160: HtmlParsingUtils.extractStyleURLs(baseUrl, urls, style);
161: }
162:
163: NodeList children = node.getChildNodes();
164: if (children != null) {
165: int len = children.getLength();
166: for (int i = 0; i < len; i++) {
167: baseUrl = scanNodes(children.item(i), urls, baseUrl);
168: }
169: }
170:
171: break;
172:
173: // case Node.TEXT_NODE:
174: // break;
175:
176: }
177:
178: return baseUrl;
179:
180: }
181:
182: /*
183: * Helper method to get an attribute value, if it exists @param attrs list
184: * of attributs @param attname attribute name @return
185: */
186: private String getValue(NamedNodeMap attrs, String attname) {
187: String v = null;
188: Node n = attrs.getNamedItem(attname);
189: if (n != null)
190: v = n.getNodeValue();
191: return v;
192: }
193:
194: /**
195: * Returns <code>tidy</code> as HTML parser.
196: *
197: * @return a <code>tidy</code> HTML parser
198: */
199: private static Tidy getTidyParser() {
200: log.debug("Start : getParser");
201: Tidy tidy = new Tidy();
202: tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
203: tidy.setQuiet(true);
204: tidy.setShowWarnings(false);
205: if (log.isDebugEnabled()) {
206: log.debug("getParser : tidy parser created - " + tidy);
207: }
208: log.debug("End : getParser");
209: return tidy;
210: }
211:
212: /**
213: * Returns a node representing a whole xml given an xml document.
214: *
215: * @param text
216: * an xml document (as a byte array)
217: * @return a node representing a whole xml
218: *
219: * @throws SAXException
220: * indicates an error parsing the xml document
221: */
222: private static Node getDOM(byte[] text) throws SAXException {
223: log.debug("Start : getDOM");
224: Node node = getTidyParser().parseDOM(
225: new ByteArrayInputStream(text), null);
226: if (log.isDebugEnabled()) {
227: log.debug("node : " + node);
228: }
229: log.debug("End : getDOM");
230: return node;
231: }
232: }
|