001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.parser;
020:
021: import java.net.URL;
022: import java.util.Collection;
023: import java.util.Hashtable;
024: import java.util.Iterator;
025: import java.util.LinkedHashSet;
026:
027: import org.apache.jmeter.util.JMeterUtils;
028: import org.apache.jorphan.logging.LoggingManager;
029: import org.apache.log.Logger;
030:
031: /**
032: * HtmlParsers can parse HTML content to obtain URLs.
033: *
034: * @author <a href="mailto:jsalvata@apache.org">Jordi Salvat i Alabart</a>
035: * @version $Revision: 514343 $ updated on $Date: 2007-03-04 03:17:42 +0000 (Sun, 04 Mar 2007) $
036: */
037: public abstract class HTMLParser {
038:
039: private static final Logger log = LoggingManager
040: .getLoggerForClass();
041:
042: protected static final String ATT_BACKGROUND = "background";// $NON-NLS-1$
043: protected static final String ATT_HREF = "href";// $NON-NLS-1$
044: protected static final String ATT_REL = "rel";// $NON-NLS-1$
045: protected static final String ATT_SRC = "src";// $NON-NLS-1$
046: protected static final String ATT_STYLE = "style";// $NON-NLS-1$
047: protected static final String ATT_TYPE = "type";// $NON-NLS-1$
048: protected static final String ATT_IS_IMAGE = "image";// $NON-NLS-1$
049: protected static final String TAG_APPLET = "applet";// $NON-NLS-1$
050: protected static final String TAG_BASE = "base";// $NON-NLS-1$
051: protected static final String TAG_BGSOUND = "bgsound";// $NON-NLS-1$
052: protected static final String TAG_EMBED = "embed";// $NON-NLS-1$
053: protected static final String TAG_FRAME = "frame";// $NON-NLS-1$
054: protected static final String TAG_IMAGE = "img";// $NON-NLS-1$
055: protected static final String TAG_INPUT = "input";// $NON-NLS-1$
056: protected static final String TAG_LINK = "link";// $NON-NLS-1$
057: protected static final String TAG_SCRIPT = "script";// $NON-NLS-1$
058: protected static final String STYLESHEET = "stylesheet";// $NON-NLS-1$
059:
060: // Cache of parsers - parsers must be re-usable
061: private static Hashtable parsers = new Hashtable(3);
062:
063: public final static String PARSER_CLASSNAME = "htmlParser.className"; // $NON-NLS-1$
064:
065: public final static String DEFAULT_PARSER = "org.apache.jmeter.protocol.http.parser.HtmlParserHTMLParser"; // $NON-NLS-1$
066:
067: /**
068: * Protected constructor to prevent instantiation except from within
069: * subclasses.
070: */
071: protected HTMLParser() {
072: }
073:
074: public static final HTMLParser getParser() {
075: return getParser(JMeterUtils.getPropDefault(PARSER_CLASSNAME,
076: DEFAULT_PARSER));
077: }
078:
079: public static final synchronized HTMLParser getParser(
080: String htmlParserClassName) {
081:
082: // Is there a cached parser?
083: HTMLParser pars = (HTMLParser) parsers.get(htmlParserClassName);
084: if (pars != null) {
085: log.debug("Fetched " + htmlParserClassName);
086: return pars;
087: }
088:
089: try {
090: Object clazz = Class.forName(htmlParserClassName)
091: .newInstance();
092: if (clazz instanceof HTMLParser) {
093: pars = (HTMLParser) clazz;
094: } else {
095: throw new HTMLParseError(new ClassCastException(
096: htmlParserClassName));
097: }
098: } catch (InstantiationException e) {
099: throw new HTMLParseError(e);
100: } catch (IllegalAccessException e) {
101: throw new HTMLParseError(e);
102: } catch (ClassNotFoundException e) {
103: throw new HTMLParseError(e);
104: }
105: log.info("Created " + htmlParserClassName);
106: if (pars.isReusable()) {
107: parsers.put(htmlParserClassName, pars);// cache the parser
108: }
109:
110: return pars;
111: }
112:
113: /**
114: * Get the URLs for all the resources that a browser would automatically
115: * download following the download of the HTML content, that is: images,
116: * stylesheets, javascript files, applets, etc...
117: * <p>
118: * URLs should not appear twice in the returned iterator.
119: * <p>
120: * Malformed URLs can be reported to the caller by having the Iterator
121: * return the corresponding RL String. Overall problems parsing the html
122: * should be reported by throwing an HTMLParseException.
123: *
124: * @param html
125: * HTML code
126: * @param baseUrl
127: * Base URL from which the HTML code was obtained
128: * @return an Iterator for the resource URLs
129: */
130: public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl)
131: throws HTMLParseException {
132: // The Set is used to ignore duplicated binary files.
133: // Using a LinkedHashSet to avoid unnecessary overhead in iterating
134: // the elements in the set later on. As a side-effect, this will keep
135: // them roughly in order, which should be a better model of browser
136: // behaviour.
137:
138: Collection col = new LinkedHashSet();
139: return getEmbeddedResourceURLs(html, baseUrl,
140: new URLCollection(col));
141:
142: // An additional note on using HashSets to store URLs: I just
143: // discovered that obtaining the hashCode of a java.net.URL implies
144: // a domain-name resolution process. This means significant delays
145: // can occur, even more so if the domain name is not resolvable.
146: // Whether this can be a problem in practical situations I can't tell,
147: // but
148: // thought I'd keep a note just in case...
149: // BTW, note that using a Vector and removing duplicates via scan
150: // would not help, since URL.equals requires name resolution too.
151: // The above problem has now been addressed with the URLString and
152: // URLCollection classes.
153:
154: }
155:
156: /**
157: * Get the URLs for all the resources that a browser would automatically
158: * download following the download of the HTML content, that is: images,
159: * stylesheets, javascript files, applets, etc...
160: * <p>
161: * All URLs should be added to the Collection.
162: * <p>
163: * Malformed URLs can be reported to the caller by having the Iterator
164: * return the corresponding RL String. Overall problems parsing the html
165: * should be reported by throwing an HTMLParseException.
166: *
167: * N.B. The Iterator returns URLs, but the Collection will contain objects
168: * of class URLString.
169: *
170: * @param html
171: * HTML code
172: * @param baseUrl
173: * Base URL from which the HTML code was obtained
174: * @param coll
175: * URLCollection
176: * @return an Iterator for the resource URLs
177: */
178: public abstract Iterator getEmbeddedResourceURLs(byte[] html,
179: URL baseUrl, URLCollection coll) throws HTMLParseException;
180:
181: /**
182: * Get the URLs for all the resources that a browser would automatically
183: * download following the download of the HTML content, that is: images,
184: * stylesheets, javascript files, applets, etc...
185: *
186: * N.B. The Iterator returns URLs, but the Collection will contain objects
187: * of class URLString.
188: *
189: * @param html
190: * HTML code
191: * @param baseUrl
192: * Base URL from which the HTML code was obtained
193: * @param coll
194: * Collection - will contain URLString objects, not URLs
195: * @return an Iterator for the resource URLs
196: */
197: public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl,
198: Collection coll) throws HTMLParseException {
199: return getEmbeddedResourceURLs(html, baseUrl,
200: new URLCollection(coll));
201: }
202:
203: /**
204: * Parsers should over-ride this method if the parser class is re-usable, in
205: * which case the class will be cached for the next getParser() call.
206: *
207: * @return true if the Parser is reusable
208: */
209: protected boolean isReusable() {
210: return false;
211: }
212: }
|