001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.parser;
020:
021: import java.net.MalformedURLException;
022: import java.net.URL;
023: import java.util.Iterator;
024:
025: import org.apache.jorphan.logging.LoggingManager;
026: import org.apache.log.Logger;
027: import org.htmlparser.Node;
028: import org.htmlparser.Parser;
029: import org.htmlparser.Tag;
030: import org.htmlparser.tags.AppletTag;
031: import org.htmlparser.tags.BaseHrefTag;
032: import org.htmlparser.tags.BodyTag;
033: import org.htmlparser.tags.CompositeTag;
034: import org.htmlparser.tags.FrameTag;
035: import org.htmlparser.tags.ImageTag;
036: import org.htmlparser.tags.InputTag;
037: import org.htmlparser.tags.LinkTag;
038: import org.htmlparser.tags.ScriptTag;
039: import org.htmlparser.util.NodeIterator;
040: import org.htmlparser.util.ParserException;
041:
042: /**
043: * HtmlParser implementation using SourceForge's HtmlParser.
044: *
045: */
046: class HtmlParserHTMLParser extends HTMLParser {
047: private static final Logger log = LoggingManager
048: .getLoggerForClass();
049:
050: static {
051: org.htmlparser.scanners.ScriptScanner.STRICT = false; // Try to ensure that more javascript code is processed OK ...
052: }
053:
054: protected HtmlParserHTMLParser() {
055: super ();
056: log.info("Using htmlparser version: " + Parser.getVersion());
057: }
058:
059: protected boolean isReusable() {
060: return true;
061: }
062:
063: /*
064: * (non-Javadoc)
065: *
066: * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[],
067: * java.net.URL)
068: */
069: public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl,
070: URLCollection urls) throws HTMLParseException {
071:
072: if (log.isDebugEnabled())
073: log.debug("Parsing html of: " + baseUrl);
074:
075: Parser htmlParser = null;
076: try {
077: String contents = new String(html);
078: htmlParser = new Parser();
079: htmlParser.setInputHTML(contents);
080: } catch (Exception e) {
081: throw new HTMLParseException(e);
082: }
083:
084: // Now parse the DOM tree
085: try {
086: // we start to iterate through the elements
087: parseNodes(htmlParser.elements(), new URLPointer(baseUrl),
088: urls);
089: log.debug("End : parseNodes");
090: } catch (ParserException e) {
091: throw new HTMLParseException(e);
092: }
093:
094: return urls.iterator();
095: }
096:
097: /*
098: * A dummy class to pass the pointer of URL.
099: */
100: private static class URLPointer {
101: private URLPointer(URL newUrl) {
102: url = newUrl;
103: }
104:
105: private URL url;
106: }
107:
108: /**
109: * Recursively parse all nodes to pick up all URL s.
110: * @see e the nodes to be parsed
111: * @see baseUrl Base URL from which the HTML code was obtained
112: * @see urls URLCollection
113: */
114: private void parseNodes(final NodeIterator e,
115: final URLPointer baseUrl, final URLCollection urls)
116: throws HTMLParseException, ParserException {
117: while (e.hasMoreNodes()) {
118: Node node = e.nextNode();
119: // a url is always in a Tag.
120: if (!(node instanceof Tag)) {
121: continue;
122: }
123: Tag tag = (Tag) node;
124: String tagname = tag.getTagName();
125: String binUrlStr = null;
126:
127: // first we check to see if body tag has a
128: // background set
129: if (tag instanceof BodyTag) {
130: binUrlStr = tag.getAttribute(ATT_BACKGROUND);
131: } else if (tag instanceof BaseHrefTag) {
132: BaseHrefTag baseHref = (BaseHrefTag) tag;
133: String baseref = baseHref.getBaseUrl().toString();
134: try {
135: if (!baseref.equals(""))// Bugzilla 30713
136: {
137: baseUrl.url = new URL(baseUrl.url, baseHref
138: .getBaseUrl());
139: }
140: } catch (MalformedURLException e1) {
141: throw new HTMLParseException(e1);
142: }
143: } else if (tag instanceof ImageTag) {
144: ImageTag image = (ImageTag) tag;
145: binUrlStr = image.getImageURL();
146: } else if (tag instanceof AppletTag) {
147: // look for applets
148:
149: // This will only work with an Applet .class file.
150: // Ideally, this should be upgraded to work with Objects (IE)
151: // and archives (.jar and .zip) files as well.
152: AppletTag applet = (AppletTag) tag;
153: binUrlStr = applet.getAppletClass();
154: } else if (tag instanceof InputTag) {
155: // we check the input tag type for image
156: if (ATT_IS_IMAGE.equalsIgnoreCase(tag
157: .getAttribute(ATT_TYPE))) {
158: // then we need to download the binary
159: binUrlStr = tag.getAttribute(ATT_SRC);
160: }
161: } else if (tag instanceof LinkTag) {
162: LinkTag link = (LinkTag) tag;
163: if (link.getChild(0) instanceof ImageTag) {
164: ImageTag img = (ImageTag) link.getChild(0);
165: binUrlStr = img.getImageURL();
166: }
167: } else if (tag instanceof ScriptTag) {
168: binUrlStr = tag.getAttribute(ATT_SRC);
169: } else if (tag instanceof FrameTag) {
170: binUrlStr = tag.getAttribute(ATT_SRC);
171: } else if (tagname.equalsIgnoreCase(TAG_EMBED)
172: || tagname.equalsIgnoreCase(TAG_BGSOUND)) {
173: binUrlStr = tag.getAttribute(ATT_SRC);
174: } else if (tagname.equalsIgnoreCase(TAG_LINK)) {
175: // Putting the string first means it works even if the attribute is null
176: if (STYLESHEET.equalsIgnoreCase(tag
177: .getAttribute(ATT_REL))) {
178: binUrlStr = tag.getAttribute(ATT_HREF);
179: }
180: } else {
181: binUrlStr = tag.getAttribute(ATT_BACKGROUND);
182: }
183:
184: if (binUrlStr != null) {
185: urls.addURL(binUrlStr, baseUrl.url);
186: }
187:
188: // Now look for URLs in the STYLE attribute
189: String styleTagStr = tag.getAttribute(ATT_STYLE);
190: if (styleTagStr != null) {
191: HtmlParsingUtils.extractStyleURLs(baseUrl.url, urls,
192: styleTagStr);
193: }
194:
195: // second, if the tag was a composite tag,
196: // recursively parse its children.
197: if (tag instanceof CompositeTag) {
198: CompositeTag composite = (CompositeTag) tag;
199: parseNodes(composite.elements(), baseUrl, urls);
200: }
201: }
202: }
203:
204: }
|