001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018:
019: package org.apache.jmeter.protocol.http.proxy;
020:
021: import java.util.Map;
022:
023: import org.apache.jorphan.logging.LoggingManager;
024: import org.apache.jorphan.util.JOrphanUtils;
025: import org.apache.log.Logger;
026: import org.apache.jmeter.protocol.http.parser.HTMLParseException;
027: import org.htmlparser.Node;
028: import org.htmlparser.Parser;
029: import org.htmlparser.Tag;
030: import org.htmlparser.tags.CompositeTag;
031: import org.htmlparser.tags.FormTag;
032: import org.htmlparser.util.NodeIterator;
033: import org.htmlparser.util.ParserException;
034:
035: /**
036: * A parser for html, to find the form tags, and their accept-charset value
037: */
038: class FormCharSetFinder {
039: private static final Logger log = LoggingManager
040: .getLoggerForClass();
041:
042: static {
043: log.info("Using htmlparser version: " + Parser.getVersion());
044: }
045:
046: protected FormCharSetFinder() {
047: super ();
048: }
049:
050: /**
051: * Add form action urls and their corresponding encodings for all forms on the page
052: *
053: * @param html the html to parse for form encodings
054: * @param formEncodings the Map where form encodings should be added
055: * @param pageEncoding the encoding used for the whole page
056: * @throws HTMLParseException
057: */
058: public void addFormActionsAndCharSet(String html,
059: Map formEncodings, String pageEncoding)
060: throws HTMLParseException {
061: if (log.isDebugEnabled()) {
062: log.debug("Parsing html of: " + html);
063: }
064:
065: Parser htmlParser = null;
066: try {
067: htmlParser = new Parser();
068: htmlParser.setInputHTML(html);
069: } catch (Exception e) {
070: throw new HTMLParseException(e);
071: }
072:
073: // Now parse the DOM tree
074: try {
075: // we start to iterate through the elements
076: parseNodes(htmlParser.elements(), formEncodings,
077: pageEncoding);
078: log.debug("End : parseNodes");
079: } catch (ParserException e) {
080: throw new HTMLParseException(e);
081: }
082: }
083:
084: /**
085: * Recursively parse all nodes to pick up all form encodings
086: *
087: * @param e the nodes to be parsed
088: * @param formEncodings the Map where we should add form encodings found
089: * @param pageEncoding the encoding used for the page where the nodes are present
090: */
091: private void parseNodes(final NodeIterator e, Map formEncodings,
092: String pageEncoding) throws HTMLParseException,
093: ParserException {
094: while (e.hasMoreNodes()) {
095: Node node = e.nextNode();
096: // a url is always in a Tag.
097: if (!(node instanceof Tag)) {
098: continue;
099: }
100: Tag tag = (Tag) node;
101:
102: // Only check form tags
103: if (tag instanceof FormTag) {
104: // Find the action / form url
105: String action = tag.getAttribute("action");
106: String acceptCharSet = tag
107: .getAttribute("accept-charset");
108: if (action != null && action.length() > 0) {
109: // We use the page encoding where the form resides, as the
110: // default encoding for the form
111: String formCharSet = pageEncoding;
112: // Check if we found an accept-charset attribute on the form
113: if (acceptCharSet != null) {
114: String[] charSets = JOrphanUtils.split(
115: acceptCharSet, ",");
116: // Just use the first one of the possible many charsets
117: if (charSets.length > 0) {
118: formCharSet = charSets[0].trim();
119: if (formCharSet.length() == 0) {
120: formCharSet = null;
121: }
122: }
123: }
124: if (formCharSet != null) {
125: synchronized (formEncodings) {
126: formEncodings.put(action, formCharSet);
127: }
128: }
129: }
130: }
131:
132: // second, if the tag was a composite tag,
133: // recursively parse its children.
134: if (tag instanceof CompositeTag) {
135: CompositeTag composite = (CompositeTag) tag;
136: parseNodes(composite.elements(), formEncodings,
137: pageEncoding);
138: }
139: }
140: }
141: }
|