001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.parser;
020:
021: import java.net.MalformedURLException;
022: import java.net.URL;
023: import java.util.Iterator;
024:
025: import org.apache.jmeter.util.JMeterUtils;
026: import org.apache.jorphan.logging.LoggingManager;
027: import org.apache.log.Logger;
028:
029: // NOTE: Also looked at using Java 1.4 regexp instead of ORO. The change was
030: // trivial. Performance did not improve -- at least not significantly.
031: // Finally decided for ORO following advice from Stefan Bodewig (message
032: // to jmeter-dev dated 25 Nov 2003 8:52 CET) [Jordi]
033: import org.apache.oro.text.regex.MatchResult;
034: import org.apache.oro.text.regex.Pattern;
035: import org.apache.oro.text.regex.PatternMatcherInput;
036: import org.apache.oro.text.regex.Perl5Compiler;
037: import org.apache.oro.text.regex.Perl5Matcher;
038:
039: /**
040: * HtmlParser implementation using regular expressions.
041: * <p>
042: * This class will find RLs specified in the following ways (where <b>url</b>
043: * represents the RL being found:
044: * <ul>
045: * <li><img src=<b>url</b> ... >
046: * <li><script src=<b>url</b> ... >
047: * <li><applet code=<b>url</b> ... >
048: * <li><input type=image src=<b>url</b> ... >
049: * <li><body background=<b>url</b> ... >
050: * <li><table background=<b>url</b> ... >
051: * <li><td background=<b>url</b> ... >
052: * <li><tr background=<b>url</b> ... >
053: * <li><applet ... codebase=<b>url</b> ... >
054: * <li><embed src=<b>url</b> ... >
055: * <li><embed codebase=<b>url</b> ... >
056: * <li><object codebase=<b>url</b> ... >
057: * <li><link rel=stylesheet href=<b>url</b>... gt;
058: * <li><bgsound src=<b>url</b> ... >
059: * <li><frame src=<b>url</b> ... >
060: * </ul>
061: *
062: * <p>
063: * This class will take into account the following construct:
064: * <ul>
065: * <li><base href=<b>url</b>>
066: * </ul>
067: *
068: * <p>
069: * But not the following:
070: * <ul>
071: * <li>< ... codebase=<b>url</b> ... >
072: * </ul>
073: *
074: * @author <a href="mailto:jsalvata@apache.org">Jordi Salvat i Alabart</a>
075: */
076: class RegexpHTMLParser extends HTMLParser {
077: private static final Logger log = LoggingManager
078: .getLoggerForClass();
079:
080: /**
081: * Regexp fragment matching a tag attribute's value (including the equals
082: * sign and any spaces before it). Note it matches unquoted values, which to
083: * my understanding, are not conformant to any of the HTML specifications,
084: * but are still quite common in the web and all browsers seem to understand
085: * them.
086: */
087: private static final String VALUE = "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'\\s>\\\\][^\\s>]*)(?=[\\s>]))";
088:
089: // Note there's 3 capturing groups per value
090:
091: /**
092: * Regexp fragment matching the separation between two tag attributes.
093: */
094: private static final String SEP = "\\s(?:[^>]*\\s)?";
095:
096: /**
097: * Regular expression used against the HTML code to find the URIs of images,
098: * etc.:
099: */
100: private static final String REGEXP = "<(?:" + "!--.*?-->" + "|BASE"
101: + SEP
102: + "HREF"
103: + VALUE
104: + "|(?:IMG|SCRIPT|FRAME|IFRAME|BGSOUND|FRAME)"
105: + SEP
106: + "SRC"
107: + VALUE
108: + "|APPLET"
109: + SEP
110: + "CODE(?:BASE)?"
111: + VALUE
112: + "|(?:EMBED|OBJECT)"
113: + SEP
114: + "(?:SRC|CODEBASE)"
115: + VALUE
116: + "|(?:BODY|TABLE|TR|TD)"
117: + SEP
118: + "BACKGROUND"
119: + VALUE
120: + "|[^<]+?STYLE\\s*=['\"].*?URL\\(\\s*['\"](.+?)['\"]\\s*\\)"
121: + "|INPUT(?:"
122: + SEP
123: + "(?:SRC"
124: + VALUE
125: + "|TYPE\\s*=\\s*(?:\"image\"|'image'|image(?=[\\s>])))){2,}"
126: + "|LINK(?:"
127: + SEP
128: + "(?:HREF"
129: + VALUE
130: + "|REL\\s*=\\s*(?:\"stylesheet\"|'stylesheet'|stylesheet(?=[\\s>])))){2,}"
131: + ")";
132:
133: // Number of capturing groups possibly containing Base HREFs:
134: private static final int NUM_BASE_GROUPS = 3;
135:
136: /**
137: * Thread-local input:
138: */
139: private static ThreadLocal localInput = new ThreadLocal() {
140: protected Object initialValue() {
141: return new PatternMatcherInput(new char[0]);
142: }
143: };
144:
145: protected boolean isReusable() {
146: return true;
147: }
148:
149: /**
150: * Make sure to compile the regular expression upon instantiation:
151: */
152: protected RegexpHTMLParser() {
153: super ();
154: }
155:
156: /*
157: * (non-Javadoc)
158: *
159: * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[],
160: * java.net.URL)
161: */
162: public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl,
163: URLCollection urls) {
164:
165: Perl5Matcher matcher = JMeterUtils.getMatcher();
166: PatternMatcherInput input = (PatternMatcherInput) localInput
167: .get();
168: // TODO: find a way to avoid the cost of creating a String here --
169: // probably a new PatternMatcherInput working on a byte[] would do
170: // better.
171: input.setInput(new String(html));
172: Pattern pattern = JMeterUtils.getPatternCache().getPattern(
173: REGEXP,
174: Perl5Compiler.CASE_INSENSITIVE_MASK
175: | Perl5Compiler.SINGLELINE_MASK
176: | Perl5Compiler.READ_ONLY_MASK);
177:
178: while (matcher.contains(input, pattern)) {
179: MatchResult match = matcher.getMatch();
180: String s;
181: if (log.isDebugEnabled())
182: log.debug("match groups " + match.groups() + " "
183: + match.toString());
184: // Check for a BASE HREF:
185: for (int g = 1; g <= NUM_BASE_GROUPS && g <= match.groups(); g++) {
186: s = match.group(g);
187: if (s != null) {
188: if (log.isDebugEnabled()) {
189: log.debug("new baseUrl: " + s + " - "
190: + baseUrl.toString());
191: }
192: try {
193: baseUrl = new URL(baseUrl, s);
194: } catch (MalformedURLException e) {
195: // Doesn't even look like a URL?
196: // Maybe it isn't: Ignore the exception.
197: if (log.isDebugEnabled()) {
198: log.debug("Can't build base URL from RL "
199: + s + " in page " + baseUrl, e);
200: }
201: }
202: }
203: }
204: for (int g = NUM_BASE_GROUPS + 1; g <= match.groups(); g++) {
205: s = match.group(g);
206: if (s != null) {
207: if (log.isDebugEnabled()) {
208: log
209: .debug("group " + g + " - "
210: + match.group(g));
211: }
212: urls.addURL(s, baseUrl);
213: }
214: }
215: }
216: return urls.iterator();
217: }
218: }
|