001: /*
002: * Created on Mar 11, 2005
003: */
004: package com.sun.portal.wireless.htmlconversion.servlet;
005:
006: import java.io.BufferedReader;
007: import java.io.IOException;
008: import java.io.InputStreamReader;
009: import java.io.PrintWriter;
010: import java.net.MalformedURLException;
011: import java.net.URL;
012: import java.net.HttpURLConnection;
013: import java.util.Enumeration;
014:
015: import javax.servlet.http.HttpServletRequest;
016: import javax.servlet.http.HttpServletResponse;
017:
018: import com.sun.portal.wireless.htmlconversion.HtmlConverter;
019: import com.sun.portal.wireless.htmlconversion.HtmlConversionException;
020:
021: /**
022: * Retrieves HTML content from a URL using a specified HTTP method, and
023: * provides methods to transform the retrieved HTML content to AML.
024: *
025: * @author ashwin.mathew@sun.com
026: */
027: public class URLScraper {
028:
029: /**
030: * The HTTP request parameter holding the
031: * URL to scrape.
032: */
033: public static final String URL_PARAM = "url";
034:
035: /**
036: * The prefix for all URLs
037: */
038: public static final String URL_PREFIX = "/MobileViewHtml?";
039:
040: /**
041: * The default context path, "/portal".
042: */
043: public static final String DEFAULT_CONTEXT = "/portal";
044:
045: /**
046: * Constant specifying the HTTP POST method.
047: */
048: public static final String HTTP_METHOD_POST = "POST";
049:
050: /**
051: * Constant specifying the HTTP GET method.
052: */
053: public static final String HTTP_METHOD_GET = "GET";
054:
055: private static final String HTTP_PROTOCOL_PREFIX = "http://";
056:
057: private static final String AMP = "&";
058: private static final String EQUALS = "=";
059:
060: private static final String PROPERTY_CONTENT_TYPE = "Content-Type";
061: private static final String PROPERTY_CONTENT_LENGTH = "Content-Length";
062:
063: private static final String CONTENT_TYPE_FORM_ENCODED = "application/x-www-form-urlencoded";
064:
065: // Allowed content types
066: private static final String CONTENT_TYPE_TEXT_HTML = "text/html";
067: private static final String CONTENT_TYPE_TEXT_XML = "text/xml";
068: private static final String CONTENT_TYPE_XHTML = "application/xhtml+xml";
069:
070: private static final String[] allowedContentTypes = new String[3];
071: static {
072: allowedContentTypes[0] = CONTENT_TYPE_TEXT_HTML;
073: allowedContentTypes[1] = CONTENT_TYPE_TEXT_XML;
074: allowedContentTypes[2] = CONTENT_TYPE_XHTML;
075: }
076:
077: private HttpServletRequest request;
078:
079: private HttpServletResponse response;
080:
081: private String urlSpec;
082:
083: private String method;
084:
085: private String content = null;
086:
087: private int responseCode = HttpURLConnection.HTTP_OK;
088:
089: private HtmlConverter converter;
090:
091: /**
092: * Constructs a new URLScraper.
093: *
094: * @param request
095: * @param response
096: */
097: public URLScraper(HttpServletRequest request,
098: HttpServletResponse response, boolean isFragment)
099: throws HtmlConversionException {
100: this .request = request;
101: this .response = response;
102:
103: this .method = request.getMethod();
104:
105: this .urlSpec = URLTranscoder.decode(request
106: .getParameter(URL_PARAM));
107:
108: if (!urlSpec.startsWith(HTTP_PROTOCOL_PREFIX)) {
109: throw new HtmlConversionException(
110: HtmlConversionException.MALFORMED_URL_ERROR,
111: "URL does not start with http:// [" + urlSpec + "]");
112: }
113:
114: getContent();
115: converter = new HtmlConverter(content, isFragment);
116: converter.setEncoder(request, response);
117: }
118:
119: /**
120: * Converts the retrieved content to AML.
121: *
122: * @param isFragment Indicates whether or not the output
123: * AML document is a fragment.
124: * @return
125: */
126: public String getAMLContent() throws HtmlConversionException {
127: return converter.toAML();
128: }
129:
130: /**
131: * Gets content for the specified URL using the specified method.
132: *
133: * @return
134: */
135: public String getContent() throws HtmlConversionException {
136: if (content != null) {
137: return content;
138: }
139:
140: URL url = null;
141:
142: try {
143: url = new URL(urlSpec);
144: } catch (MalformedURLException muEx) {
145: muEx.printStackTrace();
146: throw new HtmlConversionException(
147: HtmlConversionException.MALFORMED_URL_ERROR, muEx);
148: }
149:
150: StringBuffer contentBuffer = new StringBuffer();
151:
152: try {
153: HttpURLConnection.setFollowRedirects(true);
154:
155: HttpURLConnection connection = (HttpURLConnection) url
156: .openConnection();
157:
158: if (method.equals(HTTP_METHOD_POST)) {
159: connection.setDoOutput(true);
160: connection.setRequestProperty(PROPERTY_CONTENT_TYPE,
161: CONTENT_TYPE_FORM_ENCODED);
162:
163: String postParamString = getPostParamString();
164: connection.setRequestProperty(PROPERTY_CONTENT_LENGTH,
165: String.valueOf(postParamString.length()));
166:
167: PrintWriter writer = new PrintWriter(connection
168: .getOutputStream());
169:
170: if (postParamString != null) {
171: writer.write(postParamString);
172: }
173:
174: writer.flush();
175: writer.close();
176: }
177:
178: responseCode = connection.getResponseCode();
179: if (responseCode < 200 || responseCode >= 400) {
180: // An error occurred
181: throw new HtmlConversionException(
182: HtmlConversionException.URL_RETRIEVAL_ERROR,
183: "HTTP Error Code " + responseCode);
184: }
185:
186: checkAllowedContentType(connection.getContentType());
187:
188: BufferedReader in = new BufferedReader(
189: new InputStreamReader(connection.getInputStream()));
190:
191: String inputLine;
192: while ((inputLine = in.readLine()) != null) {
193: contentBuffer.append(inputLine);
194: }
195:
196: in.close();
197: } catch (IOException ioEx) {
198: ioEx.printStackTrace();
199: throw new HtmlConversionException(
200: HtmlConversionException.URL_RETRIEVAL_ERROR, ioEx);
201: }
202:
203: content = contentBuffer.toString();
204:
205: return content;
206: }
207:
208: public int getResponseCode() {
209: return responseCode;
210: }
211:
212: private void checkAllowedContentType(String contentType)
213: throws HtmlConversionException {
214: boolean isAllowedContentType = false;
215:
216: for (int i = 0; i < allowedContentTypes.length; i++) {
217: if (contentType.startsWith(allowedContentTypes[i])) {
218: isAllowedContentType = true;
219: break;
220: }
221: }
222:
223: if (!isAllowedContentType) {
224: throw new HtmlConversionException(
225: HtmlConversionException.UNSUPPORTED_CONTENT_TYPE,
226: "Unsupported content type: " + contentType);
227: }
228: }
229:
230: // Converts post params (either from postParams HashMap
231: // or HttpRequest, whichever is available) in a query
232: // string, of the form "n1=v1&n2=v2&n3=v3".
233: private String getPostParamString() {
234: StringBuffer postParamBuffer = new StringBuffer();
235: boolean isNotEmpty = false;
236:
237: Enumeration paramNames = request.getParameterNames();
238: while (paramNames.hasMoreElements()) {
239: String param = (String) paramNames.nextElement();
240: String value = request.getParameter(param);
241: postParamBuffer.append(param).append(EQUALS).append(value)
242: .append(AMP);
243:
244: isNotEmpty = true;
245: }
246:
247: if (isNotEmpty) {
248: // prune the last "&"
249: postParamBuffer.setLength(postParamBuffer.length() - 1);
250: }
251:
252: return postParamBuffer.toString();
253: }
254: }
|