001: /*
002: * Created on Feb 9, 2005
003: *
004: */
005: package com.sun.portal.wireless.htmlconversion;
006:
007: import java.io.IOException;
008: import java.io.StringReader;
009: import java.io.StringWriter;
010:
011: import javax.servlet.http.HttpServletRequest;
012: import javax.servlet.http.HttpServletResponse;
013: import javax.swing.text.html.parser.ParserDelegator;
014: import javax.xml.parsers.DocumentBuilder;
015: import javax.xml.parsers.DocumentBuilderFactory;
016: import javax.xml.transform.Transformer;
017: import javax.xml.transform.TransformerConfigurationException;
018: import javax.xml.transform.TransformerException;
019: import javax.xml.transform.TransformerFactory;
020: import javax.xml.transform.dom.DOMSource;
021: import javax.xml.transform.stream.StreamResult;
022: import org.w3c.dom.Document;
023: import org.xml.sax.InputSource;
024: import org.xml.sax.SAXException;
025: import org.xml.sax.SAXNotRecognizedException;
026: import org.xml.sax.SAXNotSupportedException;
027: import org.xml.sax.XMLReader;
028: import org.xml.sax.helpers.XMLReaderFactory;
029:
030: import com.sun.portal.wireless.htmlconversion.servlet.URLTranscoder;
031: import com.sun.portal.log.common.PortalLogger;
032: import java.util.logging.Level;
033: import java.util.logging.Logger;
034:
035: /**
036: * Public API for this package that converts HTML input to AML output.
037: *
038: * @author ashwin.mathew@sun.com
039: */
040: public class HtmlConverter {
041:
042: /**
043: * Unknown whether the document is HTML or XHTML, the API will try to
044: * determine the document type.
045: */
046: public static final int DOCUMENT_TYPE_UNKNOWN = 0;
047:
048: /**
049: * Force transformation of the document as HTML.
050: */
051: public static final int DOCUMENT_TYPE_HTML = 1;
052:
053: /**
054: * Force transformation of the document as XHTML.
055: */
056: public static final int DOCUMENT_TYPE_XHTML = 2;
057:
058: // The input HTML
059: private String input;
060:
061: // The output AML
062: private Document output;
063:
064: private URLTranscoder encoder;
065:
066: private boolean isFragment;
067:
068: private int documentType;
069:
070: private boolean isTransformed = false;
071:
072: private static final String AML_PAGE_START_TAG = "<AmlPage>";
073:
074: private static final String AML_PAGE_END_TAG = "</AmlPage>";
075:
076: private static final int AML_PAGE_START_TAG_LENGTH = AML_PAGE_START_TAG
077: .length();
078:
079: private static final String DTD_START = "<!DOCTYPE";
080:
081: private static final char DTD_END = '>';
082:
083: private static final String XHTML_UPPER = "XHTML";
084:
085: private static final String XHTML_LOWER = "xhtml";
086:
087: private static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";
088:
089: private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
090:
091: private static final Logger logger = PortalLogger
092: .getLogger("com.sun.portal.wireless.htmlconversion");
093:
094: /**
095: * Constructs a new HtmlConverter which tries to determine the document type
096: * itself.
097: *
098: * @param input
099: * The input HTML content to be transformed.
100: * @param isFragment
101: * Whether or not the output AML content is a whole AML page
102: * (with AmlDocument and AmlPage tags) or is just a fragment of
103: * AML to be embedded on a larger AML page.
104: */
105: public HtmlConverter(String input, boolean isFragment)
106: throws HtmlConversionException {
107: this (input, isFragment, DOCUMENT_TYPE_UNKNOWN);
108: }
109:
110: /**
111: * Constructs a new HtmlConverter for the specified document type.
112: *
113: * @param input
114: * The input HTML content to be transformed.
115: * @param isFragment
116: * Whether or not the output AML content is a whole AML page
117: * (with AmlDocument and AmlPage tags) or is just a fragment of
118: * AML to be embedded on a larger AML page.
119: * @param documentType
120: * The type of the document, HTML, XHTML or unknown, must be one
121: * of the DOCUMENT_TYPE_* constants defined on this class.
122: */
123: public HtmlConverter(String input, boolean isFragment,
124: int documentType) throws HtmlConversionException {
125: this .input = input;
126: this .isFragment = isFragment;
127: this .documentType = documentType;
128:
129: if (logger.isLoggable(Level.FINEST)) {
130: logger.finest("Transforming HTML [" + input + "]");
131: }
132:
133: try {
134: DocumentBuilderFactory factory = DocumentBuilderFactory
135: .newInstance();
136: DocumentBuilder builder = factory.newDocumentBuilder();
137: output = builder.newDocument();
138: } catch (Exception ex) {
139: // ex.printStackTrace();
140: logger.log(Level.SEVERE, "Error converting HTML", ex);
141: throw new HtmlConversionException(
142: HtmlConversionException.XML_ERROR, ex);
143: }
144:
145: if (documentType == DOCUMENT_TYPE_UNKNOWN) {
146: checkDocumentType();
147: }
148: }
149:
150: /**
151: * Creates and sets the URLEncoder.
152: *
153: * @param request
154: * @param response
155: */
156: public void setEncoder(HttpServletRequest request,
157: HttpServletResponse response) {
158: encoder = new URLTranscoder(request, response);
159: }
160:
161: // Determines whether the input document is HTML or XHTML
162: // The current mechanism only checks for the presence of
163: // the string "XHTML" or "xhtml" in the opening DTD specification.
164: // This may have to be reimplemented later to be a little more
165: // sophisticated, for example, by checking whether or not image
166: // and input tags in the document have a closing "/>" instead of
167: // just ">".
168: private void checkDocumentType() {
169: documentType = DOCUMENT_TYPE_HTML;
170:
171: if (input.startsWith(DTD_START)) {
172: int endIndex = input.indexOf(DTD_END);
173: String dtd = input.substring(0, endIndex);
174:
175: if (dtd.indexOf(XHTML_UPPER) != -1
176: || dtd.indexOf(XHTML_LOWER) != -1) {
177: documentType = DOCUMENT_TYPE_XHTML;
178: }
179: }
180: }
181:
182: /**
183: * Returns the transformed AML output.
184: */
185: public String toAML() throws HtmlConversionException {
186: transform();
187:
188: TransformerFactory tFactory = TransformerFactory.newInstance();
189:
190: Transformer transformer = null;
191: try {
192: transformer = tFactory.newTransformer();
193: } catch (TransformerConfigurationException tce) {
194: // tce.printStackTrace();
195: logger.log(Level.SEVERE, "Error converting HTML", tce);
196: throw new HtmlConversionException(
197: HtmlConversionException.XML_ERROR, tce);
198: }
199:
200: DOMSource source = new DOMSource(output);
201:
202: StringWriter amlDoc = new StringWriter();
203: StreamResult result = new StreamResult(amlDoc);
204:
205: try {
206: transformer.transform(source, result);
207: } catch (TransformerException te) {
208: // te.printStackTrace();
209: logger.log(Level.SEVERE, "Error converting HTML", te);
210: throw new HtmlConversionException(
211: HtmlConversionException.XML_ERROR, te);
212: }
213:
214: String amlOutput = amlDoc.toString();
215:
216: if (isFragment) {
217: // Rip off the AmlDocument and AmlPage tags
218: int amlPageStartIndex = amlOutput
219: .indexOf(AML_PAGE_START_TAG);
220: if (amlPageStartIndex != -1) {
221: amlOutput = amlOutput.substring(amlPageStartIndex
222: + AML_PAGE_START_TAG_LENGTH);
223:
224: int amlPageEndIndex = amlOutput
225: .lastIndexOf(AML_PAGE_END_TAG);
226: amlOutput = amlOutput.substring(0, amlPageEndIndex);
227: }
228: }
229:
230: if (logger.isLoggable(Level.FINEST)) {
231: logger
232: .finest("Transformed HTML to AML [" + amlOutput
233: + "]");
234: }
235:
236: return amlOutput;
237: }
238:
239: private void transform() throws HtmlConversionException {
240: if (isTransformed) {
241: return;
242: }
243:
244: // Assume regular HTML parser for now
245: // Will add XHTML handling in later
246:
247: ParserState state = new ParserState(output, encoder);
248: GenericHtmlParserCallback genericCallback = new GenericHtmlParserCallback(
249: state);
250:
251: // Check documentType and proceed.
252: if (documentType == DOCUMENT_TYPE_HTML) {
253: StringReader inputReader = new StringReader(input);
254:
255: HtmlParserCallback callback = new HtmlParserCallback(
256: genericCallback);
257:
258: try {
259: new ParserDelegator()
260: .parse(inputReader, callback, true);
261: } catch (Exception e) {
262: // e.printStackTrace();
263: logger.log(Level.SEVERE, "Error converting HTML", e);
264: throw new HtmlConversionException(
265: HtmlConversionException.TRANSFORMATION_ERROR, e);
266: }
267: } else // documentType == DOCUMENT_TYPE_XHTML
268: {
269: XhtmlParserCallback callback = new XhtmlParserCallback(
270: genericCallback);
271:
272: XMLReader parser = null;
273:
274: try {
275: parser = XMLReaderFactory.createXMLReader();
276: } catch (SAXException saxEx) {
277: // saxEx.printStackTrace();
278: logger
279: .log(Level.SEVERE, "Error converting HTML",
280: saxEx);
281: throw new HtmlConversionException(
282: HtmlConversionException.XML_ERROR, saxEx);
283: }
284:
285: parser.setContentHandler(callback);
286: parser.setDTDHandler(callback);
287: parser.setEntityResolver(callback);
288: parser.setErrorHandler(callback);
289:
290: try {
291: parser.setFeature(FEATURE_VALIDATION, false);
292: parser.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
293: } catch (SAXNotRecognizedException saxEx) {
294: // Ignore these exceptions, and attempt to
295: // continue processing
296: // saxEx.printStackTrace();
297: logger.log(Level.WARNING, "Error converting HTML",
298: saxEx);
299: } catch (SAXNotSupportedException saxEx) {
300: // Ignore these exceptions, and attempt to
301: // continue processing
302: // saxEx.printStackTrace();
303: logger.log(Level.WARNING, "Error converting HTML",
304: saxEx);
305: }
306:
307: InputSource inputSource = new InputSource(new StringReader(
308: input));
309:
310: try {
311: parser.parse(inputSource);
312: } catch (SAXException saxEx) {
313: // saxEx.printStackTrace();
314: logger
315: .log(Level.SEVERE, "Error converting HTML",
316: saxEx);
317: throw new HtmlConversionException(
318: HtmlConversionException.TRANSFORMATION_ERROR,
319: saxEx);
320: } catch (IOException ioEx) {
321: // ioEx.printStackTrace();
322: logger.log(Level.SEVERE, "Error converting HTML", ioEx);
323: throw new HtmlConversionException(
324: HtmlConversionException.TRANSFORMATION_ERROR,
325: ioEx);
326: }
327: }
328:
329: // Now flatten the tables and reform document structure
330: state.getLayoutManager().reformLayout();
331:
332: isTransformed = true;
333: }
334:
335: }
|