001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.parser;
020:
021: import java.io.ByteArrayInputStream;
022: import java.io.UnsupportedEncodingException;
023: import java.net.MalformedURLException;
024: import java.net.URL;
025: import java.net.URLDecoder;
026: import java.util.LinkedList;
027: import java.util.List;
028:
029: import org.apache.jmeter.config.Argument;
030: import org.apache.jmeter.config.Arguments;
031: import org.apache.jmeter.protocol.http.sampler.HTTPSamplerBase;
032: import org.apache.jmeter.protocol.http.sampler.HTTPSamplerFactory;
033: import org.apache.jmeter.testelement.property.PropertyIterator;
034: import org.apache.jmeter.util.JMeterUtils;
035: import org.apache.jorphan.logging.LoggingManager;
036: import org.apache.log.Logger;
037: import org.apache.oro.text.PatternCacheLRU;
038: import org.apache.oro.text.regex.MatchResult;
039: import org.apache.oro.text.regex.Pattern;
040: import org.apache.oro.text.regex.PatternMatcherInput;
041: import org.apache.oro.text.regex.Perl5Compiler;
042: import org.apache.oro.text.regex.Perl5Matcher;
043: import org.w3c.dom.Document;
044: import org.w3c.dom.NamedNodeMap;
045: import org.w3c.dom.Node;
046: import org.w3c.dom.NodeList;
047: import org.w3c.tidy.Tidy;
048:
049: // For Junit tests @see TestHtmlParsingUtils
050:
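/**
 * Static helper methods for parsing HTML with Tidy, for matching links and
 * arguments against patterns, and for building HTTP samplers from the
 * anchors and forms found in a page.
 *
 * @author Michael Stover Created June 14, 2001
 */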
054: public final class HtmlParsingUtils {
055: private static final Logger log = LoggingManager
056: .getLoggerForClass();
057:
/**
 * Not instantiable: every member of this class is static.
 */
private HtmlParsingUtils() {
    // intentionally empty
}
063:
064: /**
065: * Check if anchor matches by checking against:
066: * - protocol
067: * - domain
068: * - path
069: * - parameter names
070: *
071: * @param newLink target to match
072: * @param config pattern to match against
073: *
074: * @return true if target URL matches pattern URL
075: */
076: public static boolean isAnchorMatched(HTTPSamplerBase newLink,
077: HTTPSamplerBase config) {
078: String query = null;
079: try {
080: query = URLDecoder
081: .decode(newLink.getQueryString(), "UTF-8"); // $NON-NLS-1$
082: } catch (UnsupportedEncodingException e) {
083: // UTF-8 unsupported? You must be joking!
084: log.error("UTF-8 encoding not supported!");
085: throw new Error("Should not happen: " + e.toString());
086: }
087:
088: final Arguments arguments = config.getArguments();
089: if (query == null && arguments.getArgumentCount() > 0) {
090: return false;// failed to convert query, so assume no match
091: }
092:
093: final Perl5Matcher matcher = JMeterUtils.getMatcher();
094: final PatternCacheLRU patternCache = JMeterUtils
095: .getPatternCache();
096:
097: if (!isEqualOrMatches(newLink.getProtocol(), config
098: .getProtocol(), matcher, patternCache)) {
099: return false;
100: }
101:
102: final String domain = config.getDomain();
103: if (domain != null && domain.length() > 0) {
104: if (!isEqualOrMatches(newLink.getDomain(), domain, matcher,
105: patternCache)) {
106: return false;
107: }
108: }
109:
110: final String path = config.getPath();
111: if (!newLink.getPath().equals(path)
112: && !matcher.matches(newLink.getPath(), patternCache
113: .getPattern("[/]*" + path, // $NON-NLS-1$
114: Perl5Compiler.READ_ONLY_MASK))) {
115: return false;
116: }
117:
118: PropertyIterator iter = arguments.iterator();
119: while (iter.hasNext()) {
120: Argument item = (Argument) iter.next().getObjectValue();
121: final String name = item.getName();
122: if (query.indexOf(name + "=") == -1) { // $NON-NLS-1$
123: if (!(matcher.contains(query, patternCache.getPattern(
124: name, Perl5Compiler.READ_ONLY_MASK)))) {
125: return false;
126: }
127: }
128: }
129:
130: return true;
131: }
132:
133: /**
134: * Arguments match if the input name matches the corresponding pattern name
135: * and the input value matches the pattern value, where the matching is done
136: * first using String equals, and then Regular Expression matching if the equals test fails.
137: *
138: * @param arg - input Argument
139: * @param patternArg - pattern to match against
140: * @return true if both name and value match
141: */
142: public static boolean isArgumentMatched(Argument arg,
143: Argument patternArg) {
144: final Perl5Matcher matcher = JMeterUtils.getMatcher();
145: final PatternCacheLRU patternCache = JMeterUtils
146: .getPatternCache();
147: return isEqualOrMatches(arg.getName(), patternArg.getName(),
148: matcher, patternCache)
149: && isEqualOrMatches(arg.getValue(), patternArg
150: .getValue(), matcher, patternCache);
151: }
152:
153: /**
154: * Match the input argument against the pattern using String.equals() or pattern matching if that fails.
155: *
156: * @param arg input string
157: * @param pat pattern string
158: * @param matcher Perl5Matcher
159: * @param cache PatternCache
160: *
161: * @return true if input matches the pattern
162: */
163: public static boolean isEqualOrMatches(String arg, String pat,
164: Perl5Matcher matcher, PatternCacheLRU cache) {
165: return arg.equals(pat)
166: || matcher.matches(arg, cache.getPattern(pat,
167: Perl5Compiler.READ_ONLY_MASK));
168: }
169:
170: /**
171: * Match the input argument against the pattern using String.equals() or pattern matching if that fails
172: * using case-insenssitive matching.
173: *
174: * @param arg input string
175: * @param pat pattern string
176: * @param matcher Perl5Matcher
177: * @param cache PatternCache
178: *
179: * @return true if input matches the pattern
180: */
181: public static boolean isEqualOrMatchesCaseBlind(String arg,
182: String pat, Perl5Matcher matcher, PatternCacheLRU cache) {
183: return arg.equalsIgnoreCase(pat)
184: || matcher.matches(arg, cache.getPattern(pat,
185: Perl5Compiler.READ_ONLY_MASK
186: | Perl5Compiler.CASE_INSENSITIVE_MASK));
187: }
188:
189: /**
190: * Match the input argument against the pattern using String.equals() or pattern matching if that fails
191: * using case-insensitive matching.
192: *
193: * @param arg input string
194: * @param pat pattern string
195: *
196: * @return true if input matches the pattern
197: */
198: public static boolean isEqualOrMatches(String arg, String pat) {
199: return isEqualOrMatches(arg, pat, JMeterUtils.getMatcher(),
200: JMeterUtils.getPatternCache());
201: }
202:
203: /**
204: * Match the input argument against the pattern using String.equals() or pattern matching if that fails
205: * using case-insensitive matching.
206: *
207: * @param arg input string
208: * @param pat pattern string
209: *
210: * @return true if input matches the pattern
211: */
212: public static boolean isEqualOrMatchesCaseBlind(String arg,
213: String pat) {
214: return isEqualOrMatchesCaseBlind(arg, pat, JMeterUtils
215: .getMatcher(), JMeterUtils.getPatternCache());
216: }
217:
218: /**
219: * Returns <code>tidy</code> as HTML parser.
220: *
221: * @return a <code>tidy</code> HTML parser
222: */
223: public static Tidy getParser() {
224: log.debug("Start : getParser1");
225: Tidy tidy = new Tidy();
226: tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
227: tidy.setQuiet(true);
228: tidy.setShowWarnings(false);
229:
230: if (log.isDebugEnabled()) {
231: log.debug("getParser1 : tidy parser created - " + tidy);
232: }
233:
234: log.debug("End : getParser1");
235:
236: return tidy;
237: }
238:
239: /**
240: * Returns a node representing a whole xml given an xml document.
241: *
242: * @param text
243: * an xml document
244: * @return a node representing a whole xml
245: */
246: public static Node getDOM(String text) {
247: log.debug("Start : getDOM1");
248:
249: try {
250: Node node = getParser().parseDOM(
251: new ByteArrayInputStream(text.getBytes("UTF-8")),
252: null);// $NON-NLS-1$
253:
254: if (log.isDebugEnabled()) {
255: log.debug("node : " + node);
256: }
257:
258: log.debug("End : getDOM1");
259:
260: return node;
261: } catch (UnsupportedEncodingException e) {
262: log
263: .error("getDOM1 : Unsupported encoding exception - "
264: + e);
265: log.debug("End : getDOM1");
266: throw new RuntimeException("UTF-8 encoding failed");
267: }
268: }
269:
270: public static Document createEmptyDoc() {
271: return Tidy.createEmptyDocument();
272: }
273:
274: /**
275: * Create a new Sampler based on an HREF string plus a contextual URL
276: * object. Given that an HREF string might be of three possible forms, some
277: * processing is required.
278: */
279: public static HTTPSamplerBase createUrlFromAnchor(
280: String parsedUrlString, URL context)
281: throws MalformedURLException {
282: if (log.isDebugEnabled()) {
283: log.debug("Creating URL from Anchor: " + parsedUrlString
284: + ", base: " + context);
285: }
286: URL url = new URL(context, parsedUrlString);
287: HTTPSamplerBase sampler = HTTPSamplerFactory.newInstance();
288: sampler.setDomain(url.getHost());
289: sampler.setProtocol(url.getProtocol());
290: sampler.setPort(url.getPort());
291: sampler.setPath(url.getPath());
292: sampler.parseArguments(url.getQuery());
293:
294: return sampler;
295: }
296:
297: public static List createURLFromForm(Node doc, URL context) {
298: String selectName = null;
299: LinkedList urlConfigs = new LinkedList();
300: recurseForm(doc, urlConfigs, context, selectName, false);
301: /*
302: * NamedNodeMap atts = formNode.getAttributes();
303: * if(atts.getNamedItem("action") == null) { throw new
304: * MalformedURLException(); } String action =
305: * atts.getNamedItem("action").getNodeValue(); UrlConfig url =
306: * createUrlFromAnchor(action, context); recurseForm(doc, url,
307: * selectName,true,formStart);
308: */
309: return urlConfigs;
310: }
311:
312: // N.B. Since the tags are extracted from an HTML Form, any values must already have been encoded
313: private static boolean recurseForm(Node tempNode,
314: LinkedList urlConfigs, URL context, String selectName,
315: boolean inForm) {
316: NamedNodeMap nodeAtts = tempNode.getAttributes();
317: String tag = tempNode.getNodeName();
318: try {
319: if (inForm) {
320: HTTPSamplerBase url = (HTTPSamplerBase) urlConfigs
321: .getLast();
322: if (tag.equalsIgnoreCase("form")) { // $NON-NLS-1$
323: try {
324: urlConfigs.add(createFormUrlConfig(tempNode,
325: context));
326: } catch (MalformedURLException e) {
327: inForm = false;
328: }
329: } else if (tag.equalsIgnoreCase("input")) { // $NON-NLS-1$
330: url.addEncodedArgument(getAttributeValue(nodeAtts,
331: "name"), // $NON-NLS-1$
332: getAttributeValue(nodeAtts, "value")); // $NON-NLS-1$
333: } else if (tag.equalsIgnoreCase("textarea")) { // $NON-NLS-1$
334: try {
335: url
336: .addEncodedArgument(getAttributeValue(
337: nodeAtts, "name"), // $NON-NLS-1$
338: tempNode.getFirstChild()
339: .getNodeValue());
340: } catch (NullPointerException e) {
341: url.addArgument(getAttributeValue(nodeAtts,
342: "name"), ""); // $NON-NLS-1$
343: }
344: } else if (tag.equalsIgnoreCase("select")) { // $NON-NLS-1$
345: selectName = getAttributeValue(nodeAtts, "name"); // $NON-NLS-1$
346: } else if (tag.equalsIgnoreCase("option")) { // $NON-NLS-1$
347: String value = getAttributeValue(nodeAtts, "value"); // $NON-NLS-1$
348: if (value == null) {
349: try {
350: value = tempNode.getFirstChild()
351: .getNodeValue();
352: } catch (NullPointerException e) {
353: value = ""; // $NON-NLS-1$
354: }
355: }
356: url.addEncodedArgument(selectName, value);
357: }
358: } else if (tag.equalsIgnoreCase("form")) { // $NON-NLS-1$
359: try {
360: urlConfigs.add(createFormUrlConfig(tempNode,
361: context));
362: inForm = true;
363: } catch (MalformedURLException e) {
364: inForm = false;
365: }
366: }
367: } catch (Exception ex) {
368: log.warn("Some bad HTML " + printNode(tempNode), ex);
369: }
370: NodeList childNodes = tempNode.getChildNodes();
371: for (int x = 0; x < childNodes.getLength(); x++) {
372: inForm = recurseForm(childNodes.item(x), urlConfigs,
373: context, selectName, inForm);
374: }
375: return inForm;
376: }
377:
378: private static String getAttributeValue(NamedNodeMap att,
379: String attName) {
380: try {
381: return att.getNamedItem(attName).getNodeValue();
382: } catch (Exception ex) {
383: return ""; // $NON-NLS-1$
384: }
385: }
386:
387: private static String printNode(Node node) {
388: StringBuffer buf = new StringBuffer();
389: buf.append("<"); // $NON-NLS-1$
390: buf.append(node.getNodeName());
391: NamedNodeMap atts = node.getAttributes();
392: for (int x = 0; x < atts.getLength(); x++) {
393: buf.append(" "); // $NON-NLS-1$
394: buf.append(atts.item(x).getNodeName());
395: buf.append("=\""); // $NON-NLS-1$
396: buf.append(atts.item(x).getNodeValue());
397: buf.append("\""); // $NON-NLS-1$
398: }
399:
400: buf.append(">"); // $NON-NLS-1$
401:
402: return buf.toString();
403: }
404:
405: private static HTTPSamplerBase createFormUrlConfig(Node tempNode,
406: URL context) throws MalformedURLException {
407: NamedNodeMap atts = tempNode.getAttributes();
408: if (atts.getNamedItem("action") == null) { // $NON-NLS-1$
409: throw new MalformedURLException();
410: }
411: String action = atts.getNamedItem("action").getNodeValue(); // $NON-NLS-1$
412: HTTPSamplerBase url = createUrlFromAnchor(action, context);
413: return url;
414: }
415:
416: public static void extractStyleURLs(final URL baseUrl,
417: final URLCollection urls, String styleTagStr) {
418: Perl5Matcher matcher = JMeterUtils.getMatcher();
419: Pattern pattern = JMeterUtils.getPatternCache().getPattern(
420: "URL\\(\\s*('|\")(.*)('|\")\\s*\\)", // $NON-NLS-1$
421: Perl5Compiler.CASE_INSENSITIVE_MASK
422: | Perl5Compiler.SINGLELINE_MASK
423: | Perl5Compiler.READ_ONLY_MASK);
424: PatternMatcherInput input = null;
425: input = new PatternMatcherInput(styleTagStr);
426: while (matcher.contains(input, pattern)) {
427: MatchResult match = matcher.getMatch();
428: // The value is in the second group
429: String styleUrl = match.group(2);
430: urls.addURL(styleUrl, baseUrl);
431: }
432: }
433: }
|