0001: /*
0002: * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
0003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
0004: */
0005: package com.sun.portal.providers.urlscraper;
0006:
0007: import java.util.List;
0008: import java.util.ArrayList;
0009: import java.util.Vector;
0010: import java.util.Map;
0011: import java.util.Hashtable;
0012: import java.util.HashMap;
0013: import java.util.Iterator;
0014: import java.util.StringTokenizer;
0015: import java.util.ResourceBundle;
0016: import java.util.Enumeration;
0017: import java.util.logging.Level;
0018: import java.util.logging.Logger;
0019: import java.util.logging.LogRecord;
0020:
0021: import java.io.InputStream;
0022: import java.io.IOException;
0023: import java.io.UnsupportedEncodingException;
0024: import java.io.ByteArrayOutputStream;
0025: import java.io.OutputStream;
0026: import java.io.DataOutputStream;
0027:
0028: import java.net.URL;
0029: import java.net.URLEncoder;
0030: import java.net.URLConnection;
0031: import java.net.HttpURLConnection;
0032: import java.net.MalformedURLException;
0033:
0034: import javax.servlet.http.HttpServletRequest;
0035: import javax.servlet.http.HttpServletResponse;
0036: import javax.servlet.http.Cookie;
0037:
0038: import com.sun.portal.rewriter.RewriterPool;
0039: import com.sun.portal.rewriter.RewriterModule;
0040: import com.sun.portal.rewriter.Rewriter;
0041: import com.sun.portal.rewriter.Translator;
0042: import com.sun.portal.rewriter.AbsoluteTranslator;
0043: import com.sun.portal.rewriter.util.uri.PageSpec;
0044: import com.sun.portal.rewriter.rom.InvalidXMLException;
0045: import com.sun.portal.log.common.PortalLogger;
0046: import sun.misc.BASE64Encoder;
0047:
0048: import com.sun.identity.security.DecryptAction;
0049: import com.sun.identity.security.EncryptAction;
0050: import java.util.regex.Matcher;
0051: import java.util.regex.Pattern;
0052:
0053: /**
0054: * <P> This class fetches and stores the content from an arbitrary url
0055: */
0056:
0057: public class Fetcher {
0058:
/**
 * The following keys have the same values as the URLScraperProvider display profile (DP) property names, so they are passed implicitly.
 * If a key is changed here, it must be passed explicitly in URLScraperProvider.java, which therefore requires a change in
 * URLScraperProvider.
 * E.g.: KEY_LOGIN_URL = "loginUrl"; in the DP the property must also be called "loginUrl", otherwise it needs to be passed explicitly.
 **/
0065:
0066: /**
0067: * The RulesetId as String
0068: */
0069: public static final String KEY_RULESET_ID = "urlScraperRulesetID"; //String
/**
 * Whether to forward all cookies, as a Boolean.
 */
0073: public static final String KEY_COOKIES_TO_FORWARD_ALL = "cookiesToForwardAll"; //Boolean
/**
 * If only selected cookies are forwarded, the names of the cookies to forward, as a List.
 */
0077: public static final String KEY_COOKIES_TO_FORWARD_LIST = "cookiesToForwardList"; //List of CookieNamesAsString
0078: /**
0079: * As ResourceBundle
0080: */
0081: public static final String KEY_RESOURCE_BUNDLE = "rb"; //ResourceBundle
0082: /**
0083: * The Input encoding as String
0084: */
0085: public static final String KEY_INPUT_ENCODING = "inputEncoding"; //String
/**
 * The form data as a String, e.g. login=[uid]&passwd=[password], where the names in square brackets are properties available in the channel.
 */
0089: public static final String KEY_FORM_DATA = "formData"; //String e.g: login=[uid]&passwd=[password]
0090: /**
0091: * login URL as string . If this is provided it will be used once to authenticate for the first time
0092: */
0093: public static final String KEY_LOGIN_URL = "loginUrl"; //http://login.yahoo.com/config/login
0094: /**
0095: * The form data to be sent to during login if provided e.g: login=[uid]&passwd=[password]
0096: */
0097: public static final String KEY_LOGIN_FORM_DATA = "loginFormData"; //String e.g:login=[uid]&passwd=[password]
0098:
0099: /**
0100: * isHttpAuth , set true only if HttpAuth
0101: */
0102: public static final String KEY_HTTP_AUTH = "isHttpAuth"; // Boolean
0103:
0104: /**
0105: * The Uid for Http Auth
0106: */
0107: public static final String KEY_HTTP_AUTH_UID = "uid"; // String
0108:
0109: /**
0110: * the password for Http Auth
0111: */
0112: public static final String KEY_HTTP_AUTH_PASSWORD = "password"; // String
0113:
/**
 * The logout URL as a String. If this is provided, it will be used to log out.
 */
0117:
0118: public static final String KEY_LOGOUT_URL = "logoutUrl";
0119:
0120: /**
0121: * Initial size for the StringBuffer storing the retrieved content.
0122: * Sort of arbitrary, and is probably larger than expected.
0123: */
0124: private static final int CONTENT_BUFFER_SIZE = 2048;
0125:
0126: /**
0127: * Logger
0128: */
0129: private static Logger logger = PortalLogger
0130: .getLogger(Fetcher.class);
0131:
0132: /**
0133: * boolean ubt
0134: */
0135: private boolean ubt = false;
0136: /**
0137: * In case of ubt , append this url
0138: */
0139: private String ubtAppendUrl = null;
0140: /**
0141: * The Map which contains all the data
0142: */
0143: private Map config = new Hashtable();
0144: /**
0145: * The CookieTable where key is cookieName and value is domainPathEntry for the cookies obtained from the remote URL.
0146: */
0147: private Hashtable domainPathCookieTable = new Hashtable();
/**
 * All the cookies obtained from the remote URL; acts as the session for that URL's cookie-managed session.
 */
0151: private Hashtable sessionCookieTable = new Hashtable();
0152:
0153: /**
0154: * if the Fetcher is authenticated to the site
0155: */
0156: private boolean isAuthenticated = false;
0157:
/**
 * After login, the remote website redirects the user to this URL. It is stored so that the next fetch can go to this URL directly instead of going through getLoginUrl() again, since the Fetcher is already authenticated.
 * E.g.: on login to login.yahoo.com/config/login, the user is redirected to my.yahoo.com, which is stored so that my.yahoo.com can be fetched directly.
 */
0162: private String postLoginUrl = null;
0163:
0164: /**
0165: * just to store the conetnType, required for ContentFilter
0166: */
0167: //private String contentType = null ;
0168: int loop = 0;
0169: /**
0170: * Ignore these Headers wile sending to remorte URL, Do not pass these headers
0171: */
0172: private static List ignoreHeaders = new ArrayList();
0173: static {
0174: // keep it all in lowercase
0175: ignoreHeaders.add("host");
0176: ignoreHeaders.add("cookie");
0177: ignoreHeaders.add("connection");
0178: ignoreHeaders.add("content-length");
0179: ignoreHeaders.add("accept-encoding");
0180: ignoreHeaders.add("location");
0181: RewriterModule.initIDSAME();
0182: }
0183:
/**
 * Creates a Fetcher with ubt switched off and no ubt append URL.
 *
 * @param config the configuration map; values are looked up under the
 *               KEY_* constants of this class
 * @exception MalformedURLException declared for callers; not thrown by the
 *            code in this constructor
 */
public Fetcher(Map config) throws MalformedURLException {
    this(config, false, null);
}

/**
 * Creates a Fetcher.
 *
 * @param configMap the configuration map; when null, the empty default
 *                  map is kept
 * @param ubt whether to track links outside the portal
 * @param ubtAppendUrlSt the URL handed to the translator when ubt is true
 * @exception MalformedURLException declared for callers; not thrown by the
 *            code in this constructor
 */
public Fetcher(Map configMap, boolean ubt, String ubtAppendUrlSt)
        throws MalformedURLException {
    // A null map would make every config lookup in this class throw a
    // NullPointerException, so keep the empty default created by the field
    // initializer in that case.
    if (configMap != null) {
        config = configMap;
    }
    this.ubt = ubt;
    this.ubtAppendUrl = ubtAppendUrlSt;
}
0206:
0207: /**
0208: * return cookiesToforwardAll as boolean
0209: * @return boolean
0210: */
0211: private boolean getCookiesToForwardAll() {
0212: Object obj = config.get(KEY_COOKIES_TO_FORWARD_ALL);
0213: if (obj != null && obj instanceof String) {
0214: return (Boolean.getBoolean(obj.toString()));
0215: } else if (obj instanceof Boolean) {
0216: return ((Boolean) obj).booleanValue();
0217: }
0218: return true;
0219: }
0220:
0221: /**
0222: * The List of Cookies that can be forwarded
0223: * @return List
0224: */
0225: private List getCookiesToForwardList() {
0226: return (List) config.get(KEY_COOKIES_TO_FORWARD_LIST);
0227: }
0228:
0229: /**
0230: * creates an Instance of FetcherThread
0231: * @param request
0232: * @param response
0233: * @param u
0234: * @return
0235: */
0236: private FetcherThread createFetcherThread(
0237: HttpServletRequest request, HttpServletResponse response,
0238: String u) {
0239: return new FetcherThread(this , request, response, u);
0240: }
0241:
0242: /**
0243: * <P> Fetch the contents from the specified URL. Populates the "content"
0244: * buffer
0245: * @return None
0246: * @param ft
0247: * @param u
0248: */
0249: StringBuffer fetch(FetcherThread ft, String u) {
0250:
0251: StringBuffer content = this .setFormAuth(ft);
0252:
0253: if (isEmpty(u)) {
0254: u = this .postLoginUrl;
0255: } else if (u.equals(this .postLoginUrl) && content != null) {
0256: return content;
0257: }
0258: return fetch(ft, u, getFormData());
0259: }
0260:
0261: /**
0262: *
0263: * @return
0264: * @param ft
0265: * @param formData
0266: * @param u
0267: */
0268: private StringBuffer fetch(FetcherThread ft, String u,
0269: String formData) {
0270:
0271: StringBuffer content = null;
0272:
0273: InputStream in = null;
0274: String charset = null;
0275: HttpURLConnection uc = null;
0276: try {
0277:
0278: URL url = new URL(getAbsURL(u, ft.getDesktopRequestURL()));
0279:
0280: try {
0281: uc = (HttpURLConnection) url.openConnection();
0282: } catch (ClassCastException ce) {
0283: throw new Exception(
0284: "Fetcher doesn't support ftp, file urls. URL:"
0285: + url, ce);
0286: }
0287: uc.setDoInput(true);
0288: uc.setUseCaches(false);
0289: //setFollowRedirect is a static method, hence may have impact on other classes using URLConnection
0290: //This is reqd as we incase of autoredirect(true) it does not pass the additional cookies obtained before the redirection to the new URL.
0291: uc.setFollowRedirects(false);
0292:
0293: //
0294: // Begin forward cookies
0295: //
0296: if (ft.getHttpServletRequest() != null) {
0297: Cookie[] ca = ft.getHttpServletRequest().getCookies();
0298: HashMap forwardedCookies = new HashMap();
0299: if (ca != null) {
0300: for (int j = 0; j < ca.length; j++) {
0301: if (forward(ca[j].getName(),
0302: getCookiesToForwardAll(),
0303: getCookiesToForwardList())) {
0304: forwardedCookies
0305: .put(ca[j].getName(), ca[j]);
0306: }
0307: }
0308: }
0309: //Add All the session Cookies
0310: for (Iterator it = sessionCookieTable.values()
0311: .iterator(); it.hasNext();) {
0312: Cookie c = (Cookie) it.next();
0313: //It may be already existing in the browser cookies, This will take precedence over that
0314: forwardedCookies.put(c.getName(), c);
0315: }
0316:
0317: // forwardedCookies is a hashmap , so that cookie with same name should not duplicate
0318:
0319: if ((forwardedCookies != null)
0320: && (!forwardedCookies.isEmpty())) {
0321: StringBuffer cs = new StringBuffer();
0322: logger.log(Level.FINEST, "PSCR_CSPPU0001",
0323: domainPathCookieTable);
0324: for (Iterator fci = forwardedCookies.values()
0325: .iterator(); fci.hasNext();) {
0326: Cookie c = (Cookie) fci.next();
0327: String path = null;
0328: String domain = null;
0329: if (domainPathCookieTable != null
0330: && domainPathCookieTable.containsKey(c
0331: .getName())) {
0332: // restore the original domain and path.
0333: domainPathEntry dp = (domainPathEntry) domainPathCookieTable
0334: .get(c.getName());
0335: path = dp.getPath();
0336: domain = dp.getDomain();
0337: }
0338: cs.append(c.getName()).append("=").append(
0339: c.getValue());
0340: //Why should the cookieValue be Encoded , it must have been taken care of before.
0341: //.append(URLEncoder.encode(c.getValue()));
0342:
0343: if (path != null) {
0344: cs.append("; ").append("$Path=" + path);
0345:
0346: }
0347: if (domain != null) {
0348: cs.append("; ").append("$Domain=" + domain);
0349: }
0350:
0351: if (fci.hasNext()) {
0352: cs.append("; ");
0353: }
0354: }
0355: logger.log(Level.FINEST, "PSCR_CSPPU0002", cs
0356: .toString());
0357: logger.log(Level.FINEST, "PSCR_CSPPU0003",
0358: domainPathCookieTable);
0359: uc.setRequestProperty("Cookie", cs.toString());
0360:
0361: }
0362: }
0363: setHeaders(uc, ft.getHttpServletRequest());
0364: setBasicAuthHeader(uc);
0365: int responseCode = 200;
0366:
0367: //if(responseCode == 200 )
0368: //The following may not be required all the time, do it only if responseCode is 200
0369: setPostData(uc, formData);
0370:
0371: responseCode = uc.getResponseCode();
0372: int contentLength = uc.getContentLength();
0373:
0374: // Process cookies that were set in the response by handing
0375: // them back to the browser (if in the cookiesToForward list)
0376: String hkey;
0377: for (int i = 1; (hkey = uc.getHeaderFieldKey(i)) != null; i++) {
0378: if (!hkey.equalsIgnoreCase("Set-cookie")) {
0379: continue;
0380: }
0381: String setCookie = uc.getHeaderField(i);
0382:
0383: processSetCookieHeader(setCookie, ft, url);
0384: }
0385:
0386: logger.log(Level.FINEST, "PSCR_CSPPU0004", new Integer(
0387: responseCode));
0388:
0389: if (responseCode == -1) {
0390: logger.log(Level.INFO, "PSCR_CSPPU0005");
0391: try {
0392: uc.disconnect();
0393: } catch (Exception ex) {
0394: }
0395: return content;
0396: }
0397:
0398: if (responseCode == uc.HTTP_MOVED_PERM
0399: || responseCode == uc.HTTP_MOVED_TEMP) {
0400: //
0401: // do redirect
0402: //
0403: String loc = uc.getHeaderField("Location");
0404: Header h = new Header(loc);
0405: logger
0406: .log(Level.FINEST, "PSCR_CSPPU0006", h
0407: .getValue());
0408: //
0409: // recursively call run, with new url based on location of redirect.
0410: //
0411:
0412: String newUrlSt = getAbsURL(h.getValue(), url);
0413: if (loop < 3) {
0414: if (u.equalsIgnoreCase(newUrlSt)) {
0415: loop++;
0416: }
0417: ft.setLastFetchedUrl(u);
0418: return fetch(ft, newUrlSt, formData);
0419: }
0420: }
0421:
0422: if (responseCode == uc.HTTP_UNAUTHORIZED) {
0423: content = new StringBuffer(getResourceBundle()
0424: .getString("authFailed"));
0425: try {
0426: uc.disconnect();
0427: } catch (Exception ex) {
0428: }
0429: return content;
0430: }
0431:
0432: //
0433: // read the content
0434: //
0435:
0436: ft.setContentType(uc.getContentType());
0437: PageSpec pageSpec = new PageSpec(uc.getURL().toString(), ft
0438: .getContentType());
0439: in = uc.getInputStream();
0440: byte[] bytes = readContent(in, contentLength);
0441: charset = getContentEncoding(ft.getContentType(), bytes,
0442: pageSpec.getMIME(), getInputEncoding());
0443: String data = null;
0444: if (charset != null && charset.length() != 0) {
0445: data = new String(bytes, charset);
0446: } else {
0447: data = new String(bytes);
0448: }
0449: // If there is no ruleset id, do no rewrite.
0450: String rulesetId = getRuleSetId();
0451: Rewriter rewriter = null;
0452: if (rulesetId != null && !rulesetId.equals("")) {
0453:
0454: // To check whether rewriter is available. If so
0455: // call the necessary API's
0456: //Begin rewriter code.
0457: rewriter = RewriterPool.getDefault().getRewriter(
0458: rulesetId, pageSpec);
0459: }
0460:
0461: if (rewriter != null) {
0462: Translator t = ubt ? new AbsoluteTranslator(pageSpec,
0463: ubtAppendUrl)
0464: : new AbsoluteTranslator(pageSpec);
0465: //
0466: // found translator, translate content and return.
0467: //
0468: content = new StringBuffer(rewriter.rewrite(data, t));
0469: } else {
0470: content = new StringBuffer(data);
0471: }
0472: logger.log(Level.FINEST, "PSCR_CSPPU0007", content
0473: .toString());
0474: } catch (InvalidXMLException ixe) {
0475: logger.log(Level.INFO, "PSCR_CSPPU0008", ixe);
0476: content = new StringBuffer(getResourceBundle().getString(
0477: "invalidruleset"));
0478: } catch (UnsupportedEncodingException ue) {
0479: if (logger.isLoggable(Level.INFO)) {
0480: LogRecord record = new LogRecord(Level.INFO,
0481: "PSCR_CSPPU0009");
0482: record.setLoggerName(logger.getName());
0483: record.setParameters(new Object[] { charset });
0484: record.setThrown(ue);
0485: logger.log(record);
0486: }
0487: content = new StringBuffer(getResourceBundle().getString(
0488: "unsupportedencoding"));
0489:
0490: } catch (Exception e) {
0491: logger.log(Level.INFO, "PSCR_CSPPU0008", e);
0492: if (e instanceof InterruptedException) {
0493: content = new StringBuffer(getResourceBundle()
0494: .getString("timeout"));
0495: }
0496: //
0497: // set content to null, this tells the desktop to try to get
0498: // from the cache if available
0499: //
0500: content = null;
0501: }
0502:
0503: finally {
0504: try {
0505: //disconnect will close inputsteam as well for sun jdk
0506: uc.disconnect();
0507: if (in != null) {
0508: in.close();
0509: }
0510: } catch (IOException e2) {
0511: // nothing
0512: }
0513: }
0514: return content;
0515: }
0516:
0517: /**
0518: *
0519: * @param cookieStr
0520: * @return
0521: */
0522: private String getCookieName(String cookieStr) {
0523: String name = null;
0524: int index = cookieStr.indexOf("=");
0525: if (index != -1) {
0526: name = cookieStr.substring(0, index);
0527: }
0528: return name;
0529: }
0530:
0531: /**
0532: *
0533: * @param ft
0534: * @param cookieStr
0535: * @param url
0536: * @throws java.net.MalformedURLException
0537: */
0538: private void processSetCookieHeader(String cookieStr,
0539: FetcherThread ft, URL url) throws MalformedURLException {
0540: // change the domain/path information.
0541: String newCookieStr = modifyCookieHeader(cookieStr, url, ft);
0542:
0543: logger.log(Level.FINEST, "PSCR_CSPPU0010", new Object[] {
0544: cookieStr, newCookieStr });
0545: if (newCookieStr != null) {
0546: cookieStr = newCookieStr;
0547: }
0548: try {
0549: if (cookieStr != null
0550: && ft.getHttpServletResponse() != null) {
0551: String cookieName = getCookieName(cookieStr);
0552: if ((cookieName != null)
0553: && (forward(cookieName,
0554: getCookiesToForwardAll(),
0555: getCookiesToForwardList()))) {
0556: // include it in the response (after rewriting)
0557: synchronized (ft.getHttpServletResponse()) {
0558: ft.getHttpServletResponse().addHeader(
0559: "Set-Cookie", cookieStr);
0560:
0561: }
0562: }
0563: }
0564: } catch (Exception e) {
0565: if (logger.isLoggable(Level.INFO)) {
0566: LogRecord record = new LogRecord(Level.INFO,
0567: "PSCR_CSPPU0011");
0568: record.setLoggerName(logger.getName());
0569: record.setParameters(new Object[] { cookieStr });
0570: record.setThrown(e);
0571: logger.log(record);
0572: }
0573: }
0574: }
0575:
0576: /**
0577: *
0578: * @return
0579: * @param lastFetchedUrl
0580: * @param u
0581: * @throws java.net.MalformedURLException
0582: */
0583: private static String getAbsURL(String u, URL lastFetchedUrl)
0584: throws MalformedURLException {
0585:
0586: if (!u.regionMatches(true, 0, "http://", 0, 7)
0587: && !u.regionMatches(true, 0, "https://", 0, 8)) {
0588: String scheme = lastFetchedUrl.getProtocol();
0589: StringBuffer absURL = new StringBuffer().append(scheme)
0590: .append("://").append(lastFetchedUrl.getHost())
0591: .append(":");
0592: int port = lastFetchedUrl.getPort();
0593: // default port schemes need to fill in port
0594: if (scheme.equals("http") && port <= 0) {
0595: port = 80;
0596: } else if (scheme.equals("https") && port <= 0) {
0597: port = 443;
0598: }
0599:
0600: absURL.append(port);
0601: if (!u.startsWith("/")) {
0602: String path = lastFetchedUrl.getPath();
0603: int lastIndex = path.lastIndexOf("/");
0604: if (lastIndex > 0) {
0605: absURL.append(path.substring(0, lastIndex)).append(
0606: "/");
0607: }
0608: }
0609: absURL.append(u);
0610: return absURL.toString();
0611:
0612: } else {
0613: return u;
0614: }
0615: }
0616:
0617: /**
0618: *
0619: * @return
0620: * @param ft
0621: * @param cookieStr
0622: * @param url
0623: * @throws java.net.MalformedURLException
0624: */
0625: private String modifyCookieHeader(String cookieStr, URL url,
0626: FetcherThread ft) throws MalformedURLException {
0627: if (cookieStr == null) {
0628: return null;
0629: }
0630:
0631: StringTokenizer tokens = new StringTokenizer(cookieStr, ";");
0632:
0633: if (!tokens.hasMoreTokens()) {
0634: return cookieStr;
0635: }
0636:
0637: String nameAndValue = tokens.nextToken().trim();
0638: String cookieName = getCookieName(cookieStr);
0639: String host = url.getHost();
0640: String portalHost = (ft.getDesktopRequestURL()).getHost();
0641:
0642: int equIndex;
0643:
0644: StringBuffer sb = new StringBuffer();
0645: sb.append(nameAndValue);
0646: String cookieValue = getValueFromNameValuePair(nameAndValue);
0647: Cookie sessionCookie = new Cookie(cookieName, cookieValue);
0648: sessionCookieTable.put(cookieName, sessionCookie);
0649:
0650: String token, attr, val;
0651: String path = null;
0652: String domain = null;
0653:
0654: while (tokens.hasMoreTokens()) {
0655: token = tokens.nextToken().trim();
0656: if (token.length() == 0) {
0657: continue;
0658: }
0659:
0660: equIndex = token.indexOf('=');
0661:
0662: if (equIndex < 0) {
0663: attr = token;
0664: val = null;
0665: } else {
0666: attr = token.substring(0, equIndex).trim();
0667: if (equIndex + 1 == token.length()) {
0668: val = null;
0669: } else {
0670: val = token.substring(equIndex + 1).trim();
0671: }
0672: }
0673:
0674: if (attr.equalsIgnoreCase("path")) {
0675: path = val;
0676: if (path != null) {
0677: if (!path.startsWith("/")) {
0678: return null;
0679: }
0680: }
0681: } else if (attr.equalsIgnoreCase("domain")) {
0682: domain = val;
0683: if (domain != null) {
0684: // must start with '.' and at least .a.b
0685: if (domain.charAt(0) != '.')
0686: domain = "." + domain;
0687: if (domain.length() < 4) {
0688: return null;
0689: }
0690: if (domain.charAt(domain.length() - 1) == '.') {
0691: return null;
0692: }
0693:
0694: if (!host.toLowerCase().endsWith(
0695: domain.toLowerCase())) {
0696: return null;
0697: }
0698: }
0699:
0700: } else {
0701: sb.append(";" + token);
0702: }
0703: }
0704: // store the domain and path in cookieMap.
0705: String portalDomain = getDomain(portalHost);
0706: if (domain != null && path != null) {
0707: if (!domain.equalsIgnoreCase(portalDomain)
0708: || !path.equalsIgnoreCase(getPath(ft
0709: .getHttpServletRequest()))) {
0710: domainPathCookieTable.put(cookieName,
0711: new domainPathEntry(domain, path));
0712: }
0713:
0714: } else {
0715: domainPathCookieTable.put(cookieName, new domainPathEntry(
0716: domain, path));
0717: }
0718: // rewrite the domain to portal domain and path to root.
0719: sb.append(";domain=").append(portalDomain).append(";path=/");
0720:
0721: if (path != null)
0722: sessionCookie.setPath(path);
0723: if (domain != null)
0724: sessionCookie.setDomain(domain);
0725:
0726: return sb.toString();
0727:
0728: }
0729:
0730: /**
0731: *
0732: * @param request
0733: * @return
0734: */
0735: private String getPath(HttpServletRequest request) {
0736: return (request.getContextPath() + request.getServletPath());
0737: }
0738:
0739: /**
0740: *
0741: * @param host
0742: * @return
0743: */
0744: private String getDomain(String host) {
0745:
0746: int index1 = host.lastIndexOf('.');
0747: if (index1 == -1) {
0748: return host;
0749: }
0750: int index2 = host.substring(0, index1).lastIndexOf('.');
0751: if (index1 == -1 || index2 == -1) {
0752: return host;
0753: }
0754: return host.substring(index2);
0755: }
0756:
0757: /**
0758: * pass all headers to the URL like userAgent etc.
0759: * Helps in scraping sites which are client-aware
0760: * @params uc the URLConnection object where the header is set
0761: * @params request the source where the header is copied from
0762: * @param uc
0763: * @param request
0764: */
0765: private void setHeaders(URLConnection uc, HttpServletRequest request) {
0766: Enumeration names = request.getHeaderNames();
0767:
0768: if (names != null) {
0769: while (names.hasMoreElements()) {
0770: String name = (String) names.nextElement();
0771: if (name != null) {
0772: String nameLC = name.toLowerCase();
0773: String value = request.getHeader(name);
0774: logger.log(Level.FINEST, "PSCR_CSPPU0012",
0775: new Object[] { name, value });
0776: if (ignoreHeaders.contains(nameLC)) {
0777: continue; // dont pass specific Request headers
0778: }
0779:
0780: if (value != null) {
0781: uc.setRequestProperty(name, value);
0782: logger.log(Level.FINEST, "PSCR_CSPPU0013",
0783: new Object[] { name, value });
0784: }
0785: }
0786: }
0787: }
0788: return;
0789: }
0790:
0791: /**
0792: *
0793: */
0794: /*String getContentType(){
0795: return contentType;
0796: }
0797: */
0798:
0799: static class domainPathEntry {
0800: String domain = null;
0801: String path = null;
0802:
0803: /**
0804: *
0805: * @param d
0806: * @param p
0807: */
0808: domainPathEntry(String d, String p) {
0809: domain = d;
0810: path = p;
0811: }
0812:
0813: /**
0814: *
0815: * @return
0816: */
0817: String getPath() {
0818: return path;
0819: }
0820:
0821: /**
0822: *
0823: * @return
0824: */
0825: String getDomain() {
0826: return domain;
0827: }
0828: }
0829:
0830: /**
0831: *
0832: * @param in
0833: * @param contentLength
0834: * @throws java.io.IOException
0835: * @return
0836: */
0837: static byte[] readContent(InputStream in, int contentLength)
0838: throws IOException {
0839: byte[] contentbytes = new byte[CONTENT_BUFFER_SIZE];
0840: if (contentLength != -1) {
0841: contentbytes = new byte[contentLength];
0842: } else {
0843: contentbytes = new byte[CONTENT_BUFFER_SIZE];
0844: }
0845: ByteArrayOutputStream baos = new ByteArrayOutputStream();
0846: int count = 0;
0847: while ((count = in.read(contentbytes)) > 0) {
0848: baos.write(contentbytes, 0, count);
0849: }
0850: byte[] content = baos.toByteArray();
0851: if (baos != null) {
0852: baos.close();
0853: }
0854: return content;
0855: }
0856:
0857: /**
0858: *
0859: * @param formData
0860: * @param conn
0861: * @throws java.io.IOException
0862: */
0863: private static void setPostData(HttpURLConnection conn,
0864: String formData) throws IOException {
0865:
0866: if (!isEmpty(formData)) {
0867: conn.setRequestMethod("POST");
0868: conn.setDoOutput(true);
0869: OutputStream outSt = conn.getOutputStream();
0870: DataOutputStream dataOS = new DataOutputStream(outSt);
0871: dataOS.writeBytes(formData);
0872: dataOS.flush();
0873: dataOS.close();
0874: }
0875: }
0876:
0877: /**
0878: *
0879: * @return
0880: */
0881: private String getFormData() {
0882: String formData = (String) config.get(KEY_FORM_DATA);
0883: return replaceValuesInFormData(formData);
0884: }
0885:
0886: /**
0887: *
0888: * @return
0889: */
0890: private String getLoginFormData() {
0891: String loginFormData = (String) config.get(KEY_LOGIN_FORM_DATA);
0892: return replaceValuesInFormData(loginFormData);
0893: }
0894:
0895: /**
0896: *
0897: * @param formData
0898: * @return
0899: */
0900: private String replaceValuesInFormData(String formData) {
0901: if (formData != null) {
0902: int startPos = 0;
0903: boolean loopFlag = true;
0904: while (loopFlag) {
0905: int position = formData.indexOf("[", startPos);
0906: if (position > 0) {
0907: startPos = position + 1;
0908: int lastPos = formData.indexOf("]", startPos);
0909: if (lastPos > 0) {
0910: String key = formData.substring(startPos,
0911: lastPos);
0912: String value = getPropertyValue(key);
0913: if (value != null) {
0914: key = "\\[" + key.trim() + "\\]";
0915: formData = formData
0916: .replaceFirst(key, value);
0917: }
0918: }
0919: } else {
0920: loopFlag = false;
0921: }
0922: }
0923: }
0924: return formData;
0925: }
0926:
0927: /**
0928: * Check if a String is null or Empty
0929: * @param param
0930: * @return
0931: */
0932: private static boolean isEmpty(String param) {
0933: if (param == null) {
0934: return true;
0935: }
0936: if (param.trim().equals("")) {
0937: return true;
0938: }
0939: return false;
0940: }
0941:
0942: /**
0943: * <P> This method returns true if allCookies property is true
0944: * otherwise checks if the cookie name exists in the cookiesToForward
0945: * list and returns true if it does or false if it doesn't.
0946: *
0947: * @return boolean value
0948: * @param cookieName
0949: * @param allCookies allCookies property value from display profile
0950: * @param cookiesToForwardList cookiesToForwardList property value from display profile
0951: */
0952: private static boolean forward(String cookieName,
0953: boolean allCookies, List cookiesToForwardList) {
0954: if (allCookies) {
0955: return true;
0956: } else {
0957: if (cookiesToForwardList.contains(cookieName)) {
0958: return true;
0959: }
0960: }
0961: return false;
0962: }
0963:
0964: /**
0965: *
0966: * Gets the charset
0967: *
0968: * <p> This method determines the charset based on the
0969: * contentType header if it is available (only applies to http(s) urls),
0970: * or from the inputEncoding property if it is non-blank, or from the meta
0971: * tag in content, e.g. meta tag in html, xml or wml header if they are
0972: * available (only applies to HTML, XML, WML).
0973: * @return String charset
0974: * or null if charset cannot be determined
0975: * @param profileCharset
0976: * @param contentType If http(s) urls, null otherwise
0977: * @param bytes Bytes from the scraped content
0978: * @param MIMEType MIMEType for the content
0979: */
0980: public static String getContentEncoding(String contentType,
0981: byte[] bytes, String MIMEType, String profileCharset) {
0982: String charset = null;
0983: if (contentType != null) {
0984:
0985: Pattern p = Pattern.compile("charset=([^;]+)");
0986: Matcher m = p.matcher(contentType);
0987:
0988: if (m.find())
0989: return m.group(1).replaceAll("\"", "");
0990: }
0991: //get Encoding from profile
0992: charset = profileCharset;
0993:
0994: if (charset != null && charset.length() != 0) {
0995: return charset;
0996: }
0997: //Look for charset in meta tag if html , xml , wml
0998: if ((MIMEType != null)
0999: && (MIMEType.equalsIgnoreCase("text/html")
1000: || MIMEType.equalsIgnoreCase("text/xml") || MIMEType
1001: .equalsIgnoreCase("application/xml"))
1002: || MIMEType.equalsIgnoreCase("text/vnd.wap.wml")) {
1003: //get charset from meta tag if avaialble
1004: charset = getContentEncodingFromContentBytes(bytes);
1005: }
1006: return charset;
1007: }
1008:
1009: /**
1010: *
1011: * @param key
1012: * @return
1013: */
1014: private String getPropertyValue(String key) {
1015: return (String) config.get(key);
1016: }
1017:
1018: /**
1019: *
1020: * @return
1021: * @throws java.net.MalformedURLException
1022: */
1023: /*
1024: private URL getDesktopRequestURL() throws MalformedURLException {
1025: Object obj = config.get(KEY_DESKTOP_REQUEST_URL) ;
1026: if ( obj instanceof URL){
1027: return (URL)obj;
1028: } else if( obj !=null && obj instanceof String ){
1029: return new URL(obj.toString());
1030: }
1031: return null;
1032: }
1033: */
1034:
1035: /**
1036: *
1037: * @return
1038: */
1039: private ResourceBundle getResourceBundle() {
1040: return (ResourceBundle) config.get(KEY_RESOURCE_BUNDLE);
1041: }
1042:
1043: /**
1044: *
1045: * @return
1046: */
1047: private String getRuleSetId() {
1048: return (String) config.get(KEY_RULESET_ID);
1049: }
1050:
1051: /**
1052: *
1053: * @return
1054: */
1055: private String getInputEncoding() {
1056: return (String) config.get(KEY_INPUT_ENCODING);
1057: }
1058:
1059: /**
1060: *
1061: * @return
1062: */
1063: private String getLoginUrl() {
1064: return (String) config.get(KEY_LOGIN_URL);
1065: }
1066:
1067: /**
1068: *
1069: *
1070: */
1071: private String getLogoutUrl() {
1072: return (String) config.get(KEY_LOGOUT_URL);
1073: }
1074:
1075: /**
1076: * Gets the charset from content
1077: *
1078: * <p>This method determines the charset based on
1079: * meta tag in content
1080: * @param contentBytes Bytes from the scraped content
1081: * @return String charset or null if charset cannot be determined
1082: */
1083: static String getContentEncodingFromContentBytes(byte[] contentBytes) {
1084: String charset = null;
1085: /* The character encoding info was not found in the contentType
1086: * header. We have to parse through the content portion to
1087: * figure it out. It may be specified in the html <meta> tag
1088: * as the following;
1089: *
1090: * <html><head>
1091: * ...
1092: * <meta content="text/html; charset=gb2312">
1093: * ...
1094: * </head>
1095: * ...
1096: */
1097: String contentString = new String(contentBytes);
1098:
1099: String str = contentString.toLowerCase();
1100: int idxMetaTag, idxCloseArrowBracket, idxCharset;
1101: int startIdx = 0;
1102:
1103: while (true) {
1104: idxMetaTag = str.indexOf("<meta", startIdx);
1105: if (idxMetaTag == -1)
1106: break;
1107:
1108: idxCloseArrowBracket = str.indexOf(">", idxMetaTag);
1109:
1110: if (idxCloseArrowBracket == -1)
1111: break;
1112:
1113: String headerstr = contentString.substring(idxMetaTag,
1114: idxCloseArrowBracket);
1115: String header = headerstr.toLowerCase();
1116:
1117: idxCharset = header.indexOf("charset=");
1118: if (idxCharset == -1) {
1119: startIdx = idxCloseArrowBracket + 1;
1120: continue;
1121: }
1122: /* We found one charset within a <meta> tag
1123: */
1124:
1125: int startCE = idxCharset + 8;
1126: char chquotes = header.charAt(startCE);
1127: if (chquotes == '\"') {
1128: startCE = startCE + 1;
1129: }
1130: int endCE = startCE;
1131: char ch = header.charAt(endCE);
1132:
1133: /* the charset value can only contain letter, digit,
1134: charcter '-' or '_'
1135: */
1136: while (Character.isLetterOrDigit(ch) || (ch == '-')
1137: || (ch == '_')) {
1138: endCE++;
1139: ch = header.charAt(endCE);
1140: }
1141: if (endCE > startCE) {
1142: charset = headerstr.substring(startCE, endCE);
1143: return charset;
1144: }
1145:
1146: break;
1147: }
1148: return charset;
1149: }
1150:
1151: /**
1152: *
1153: * @param nameValuePair
1154: * @return
1155: */
1156: private String getValueFromNameValuePair(String nameValuePair) {
1157: int i = nameValuePair.indexOf("=");
1158: if (i > -1) {
1159: return nameValuePair.substring(i + 1);
1160: }
1161: return null;
1162: }
1163:
1164: /**
1165: * check if it is configured for HttpAuth
1166: * @return
1167: */
1168: private boolean isHttpAuth() {
1169: Object obj = config.get(KEY_HTTP_AUTH);
1170: if (obj != null && obj instanceof String) {
1171: return (Boolean.getBoolean(obj.toString()));
1172: } else if (obj instanceof Boolean) {
1173: return ((Boolean) obj).booleanValue();
1174: }
1175: return false;
1176: }
1177:
1178: /**
1179: * gets the uids as String .
1180: * @return String
1181: */
1182: private String getHttpAuthUid() {
1183: return (String) config.get(KEY_HTTP_AUTH_UID);
1184: }
1185:
1186: /**
1187: * get the password as String
1188: * @return String
1189: */
1190: private String getHttpAuthPassword() {
1191: return (String) config.get(KEY_HTTP_AUTH_PASSWORD);
1192: }
1193:
1194: /**
1195: *
1196: * @param conn
1197: */
1198: private void setBasicAuthHeader(HttpURLConnection conn) {
1199: if (isHttpAuth()) {
1200:
1201: String uid = getHttpAuthUid();
1202: String password = getHttpAuthPassword();
1203: String userAndPassword = uid + ":" + password;
1204:
1205: BASE64Encoder encoder = new BASE64Encoder();
1206: String encUidPwd = encoder.encodeBuffer(userAndPassword
1207: .getBytes());
1208:
1209: conn.setRequestProperty("Authorization", "Basic "
1210: + URLEncoder.encode(encUidPwd));
1211:
1212: }
1213: }
1214:
1215: /**
1216: *
1217: * @return
1218: * @param ft
1219: */
1220: private StringBuffer setFormAuth(FetcherThread ft) {
1221: //Don't do form auth if it is supposed to do httpAuth
1222: if (isHttpAuth()) {
1223: return null;
1224: }
1225: StringBuffer content = null;
1226: if (!isAuthenticated) {
1227: String loginUrl = getLoginUrl();
1228: if (!isEmpty(loginUrl)) {
1229: String loginFormData = getLoginFormData();
1230: content = fetch(ft, loginUrl, getLoginFormData());
1231: isAuthenticated = true;
1232: postLoginUrl = ft.getLastFetchedUrl();
1233:
1234: }
1235: }
1236: return content;
1237: }
1238:
1239: /**
1240: * gets the Filtered Content within the timeOut limit else fails
1241: * @return
1242: * @param timeOut
1243: * @param req
1244: * @param res
1245: * @param urlAsString
1246: */
1247: public StringBuffer getFilteredContent(int timeOut,
1248: HttpServletRequest req, HttpServletResponse res,
1249: String urlAsString) {
1250: String url = replaceValuesInFormData(urlAsString);
1251:
1252: FetcherThread ft = createFetcherThread(req, res, url);
1253:
1254: return getFilteredContent(timeOut, ft);
1255: }
1256:
1257: /**
1258: * gets the filtered content using this FetcherThread
1259: * @return
1260: * @param timeOut
1261: * @param ft
1262: */
1263: private StringBuffer getFilteredContent(int timeOut,
1264: FetcherThread ft) {
1265:
1266: StringBuffer buff = null;
1267:
1268: //
1269: // wait for fetcher to get content
1270: //
1271: try {
1272: ft.start();
1273: try {
1274: ft.join(timeOut * 1000);
1275: } catch (InterruptedException ex) {
1276:
1277: buff = new StringBuffer(this .getResourceBundle()
1278: .getString("timeout"));
1279: logger.log(Level.INFO, "PSCR_CSPPU0018", ex);
1280: //logger.log(Level.SEVERE , "PSCR_CSPPU0028", ex);
1281: }
1282:
1283: if (ft.isFinished()) {
1284: buff = ft.getContentBuffer();
1285: if (buff != null) {
1286: String contentType = ft.getContentType();
1287: ContentFilter conFilter = ContentFilterImpl
1288: .getInstance(contentType);
1289: logger.log(Level.FINEST, "PSCR_CSPPU0020",
1290: conFilter.getClass().getName());
1291: buff = conFilter.filter(buff);
1292: }
1293: }
1294: } catch (ContentFilterException ex) {
1295:
1296: logger.log(Level.SEVERE, "PSCR_CSPPU0029", ex);
1297: } finally {
1298: //
1299: // interrupt thread if it hung
1300: //
1301: if (ft.isAlive()) {
1302: ft.terminate();
1303: }
1304:
1305: }
1306: return buff;
1307: }
1308:
1309: /**
1310: * Invokes the logout within the timeOut limit else fails
1311: * @return
1312: * @param timeOut
1313: * @param req
1314: * @param res
1315: */
1316:
1317: public void logout(int timeOut, HttpServletRequest req,
1318: HttpServletResponse res) {
1319: if (isAuthenticated) {
1320: String logoutUrl = getLogoutUrl();
1321: if (!isEmpty(logoutUrl)) {
1322: FetcherThread ft = createFetcherThread(req, res,
1323: logoutUrl);
1324: fetch(ft, logoutUrl, null);
1325: }
1326: isAuthenticated = false;
1327: sessionCookieTable = new Hashtable();
1328: domainPathCookieTable = new Hashtable();
1329: }
1330:
1331: }
1332: }
|