001: /* HtmlFormCredential
002: *
003: * Created on Apr 7, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.datamodel.credential;
024:
025: import java.util.HashMap;
026: import java.util.Iterator;
027: import java.util.Map;
028: import java.util.logging.Logger;
029:
030: import javax.management.Attribute;
031: import javax.management.AttributeNotFoundException;
032:
033: import org.apache.commons.httpclient.HttpClient;
034: import org.apache.commons.httpclient.HttpMethod;
035: import org.apache.commons.httpclient.HttpMethodBase;
036: import org.apache.commons.httpclient.NameValuePair;
037: import org.apache.commons.httpclient.URIException;
038: import org.apache.commons.httpclient.methods.GetMethod;
039: import org.apache.commons.httpclient.methods.PostMethod;
040: import org.archive.crawler.datamodel.CrawlURI;
041: import org.archive.crawler.settings.MapType;
042: import org.archive.crawler.settings.SimpleType;
043: import org.archive.crawler.settings.Type;
044: import org.archive.net.UURI;
045: import org.archive.net.UURIFactory;
046:
047: /**
048: * Credential that holds all needed to do a GET/POST to a HTML form.
049: *
050: * @author stack
051: * @version $Revision: 4668 $, $Date: 2006-09-26 21:49:01 +0000 (Tue, 26 Sep 2006) $
052: */
053: public class HtmlFormCredential extends Credential {
054:
055: private static final long serialVersionUID = -4732570804435453949L;
056:
057: private static final Logger logger = Logger
058: .getLogger(HtmlFormCredential.class.getName());
059:
060: private static final String ATTR_LOGIN_URI = "login-uri";
061: private static final String ATTR_FORM_ITEMS = "form-items";
062: private static final String ATTR_FORM_METHOD = "http-method";
063: private static final String[] METHODS = { "POST", "GET" };
064:
065: /**
066: * Constructor.
067: *
068: * A constructor that takes name of the credential is required by settings
069: * framework.
070: *
071: * @param name Name of this credential.
072: */
073: public HtmlFormCredential(final String name) {
074: super (name, "Credential that has all necessary"
075: + " for running a POST/GET to an HTML login form.");
076:
077: Type t = addElementToDefinition(new SimpleType(
078: "login-uri",
079: "Full URI of page that contains the HTML login form we're to"
080: + " apply these credentials too: E.g. http://www.archive.org",
081: ""));
082: t.setOverrideable(false);
083: t.setExpertSetting(true);
084:
085: t = addElementToDefinition(new SimpleType(ATTR_FORM_METHOD,
086: "GET or POST", METHODS[0], METHODS));
087: t.setOverrideable(false);
088: t.setExpertSetting(true);
089:
090: t = addElementToDefinition(new MapType(ATTR_FORM_ITEMS,
091: "Form items.", String.class));
092: t.setOverrideable(false);
093: t.setExpertSetting(true);
094: }
095:
096: /**
097: * @param context CrawlURI context to use.
098: * @return login-uri.
099: * @throws AttributeNotFoundException
100: */
101: public String getLoginUri(final CrawlURI context)
102: throws AttributeNotFoundException {
103: return (String) getAttribute(ATTR_LOGIN_URI, context);
104: }
105:
106: /**
107: * @param context CrawlURI context to use.
108: * @return login-uri.
109: * @throws AttributeNotFoundException
110: */
111: public String getHttpMethod(final CrawlURI context)
112: throws AttributeNotFoundException {
113: return (String) getAttribute(ATTR_FORM_METHOD, context);
114: }
115:
116: /**
117: * @param context CrawlURI context to use.
118: * @return Form inputs as convenient map. Returns null if no form items.
119: * @throws AttributeNotFoundException
120: */
121: public Map<String, Object> getFormItems(final CrawlURI context)
122: throws AttributeNotFoundException {
123: Map<String, Object> result = null;
124: MapType items = (MapType) getAttribute(ATTR_FORM_ITEMS, context);
125: if (items != null) {
126: for (Iterator i = items.iterator(context); i.hasNext();) {
127: Attribute a = (Attribute) i.next();
128: if (result == null) {
129: result = new HashMap<String, Object>();
130: }
131: result.put(a.getName(), a.getValue());
132: }
133: }
134: return result;
135: }
136:
137: public boolean isPrerequisite(final CrawlURI curi) {
138: boolean result = false;
139: String curiStr = curi.getUURI().toString();
140: String loginUri = getPrerequisite(curi);
141: if (loginUri != null) {
142: try {
143: UURI uuri = UURIFactory.getInstance(curi.getUURI(),
144: loginUri);
145: if (uuri != null && curiStr != null
146: && uuri.toString().equals(curiStr)) {
147: result = true;
148: if (!curi.isPrerequisite()) {
149: curi.setPrerequisite(true);
150: logger.fine(curi + " is prereq.");
151: }
152: }
153: } catch (URIException e) {
154: logger.severe("Failed to uuri: " + curi + ", "
155: + e.getMessage());
156: }
157: }
158: return result;
159: }
160:
161: public boolean hasPrerequisite(CrawlURI curi) {
162: return getPrerequisite(curi) != null;
163: }
164:
165: public String getPrerequisite(CrawlURI curi) {
166: String loginUri = null;
167: try {
168: loginUri = getLoginUri(curi);
169: } catch (AttributeNotFoundException e) {
170: logger.severe("Failed to getLoginUri: " + this + ", "
171: + curi + "," + e.getMessage());
172: // Not much I can do here. What if I fail every time? Then
173: // this prereq. will not ever be processed. We'll never get on to
174: // this server.
175: }
176: return loginUri;
177: }
178:
179: public String getKey(CrawlURI curi)
180: throws AttributeNotFoundException {
181: return getLoginUri(curi);
182: }
183:
184: public boolean isEveryTime() {
185: // This authentication is one time only.
186: return false;
187: }
188:
189: public boolean populate(CrawlURI curi, HttpClient http,
190: HttpMethod method, String payload) {
191: // http is not used.
192: // payload is not used.
193: boolean result = false;
194: Map formItems = null;
195: try {
196: formItems = getFormItems(curi);
197: } catch (AttributeNotFoundException e1) {
198: logger.severe("Failed get of form items for " + curi);
199: }
200: if (formItems == null || formItems.size() <= 0) {
201: try {
202: logger.severe("No form items for " + method.getURI());
203: } catch (URIException e) {
204: logger
205: .severe("No form items and exception getting uri: "
206: + e.getMessage());
207: }
208: return result;
209: }
210:
211: NameValuePair[] data = new NameValuePair[formItems.size()];
212: int index = 0;
213: String key = null;
214: for (Iterator i = formItems.keySet().iterator(); i.hasNext();) {
215: key = (String) i.next();
216: data[index++] = new NameValuePair(key, (String) formItems
217: .get(key));
218: }
219: if (method instanceof PostMethod) {
220: ((PostMethod) method).setRequestBody(data);
221: result = true;
222: } else if (method instanceof GetMethod) {
223: // Append these values to the query string.
224: // Get current query string, then add data, then get it again
225: // only this time its our data only... then append.
226: HttpMethodBase hmb = (HttpMethodBase) method;
227: String currentQuery = hmb.getQueryString();
228: hmb.setQueryString(data);
229: String newQuery = hmb.getQueryString();
230: hmb.setQueryString(((currentQuery != null) ? currentQuery
231: : "")
232: + "&" + newQuery);
233: result = true;
234: } else {
235: logger.severe("Unknown method type: " + method);
236: }
237: return result;
238: }
239:
240: public boolean isPost(CrawlURI curi) {
241: String method = null;
242: try {
243: method = getHttpMethod(curi);
244: } catch (AttributeNotFoundException e) {
245: logger.severe("Failed to get method for " + curi + ", "
246: + this );
247: }
248: return method != null && method.equalsIgnoreCase("POST");
249: }
250: }
|