001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlConverter.java,v $
003: * Date : $Date: 2008-02-27 12:05:36 $
004: * Version: $Revision: 1.31 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.util;
033:
034: import org.opencms.file.CmsObject;
035: import org.opencms.file.CmsProperty;
036: import org.opencms.file.CmsPropertyDefinition;
037: import org.opencms.file.CmsResource;
038: import org.opencms.i18n.CmsEncoder;
039: import org.opencms.main.CmsException;
040: import org.opencms.main.CmsLog;
041:
042: import java.io.ByteArrayInputStream;
043: import java.io.ByteArrayOutputStream;
044: import java.io.UnsupportedEncodingException;
045: import java.util.ArrayList;
046: import java.util.List;
047: import java.util.Properties;
048: import java.util.StringTokenizer;
049: import java.util.regex.Pattern;
050:
051: import org.apache.commons.logging.Log;
052:
053: import org.w3c.tidy.Tidy;
054:
055: /**
056: * Html cleaner and pretty printer.<p>
057: *
058: * Used to clean up html code (e.g. remove word tags) and optionally create xhtml from html.<p>
059: *
060: * @author Michael Emmerich
061: * @author Alexander Kandzior
062: *
063: * @version $Revision: 1.31 $
064: *
065: * @since 6.0.0
066: */
067: public class CmsHtmlConverter {
068:
069: /** Parameter value for disabled mode. **/
070: public static final String PARAM_DISABLED = CmsStringUtil.FALSE;
071:
072: /** Parameter value for enabled mode. **/
073: public static final String PARAM_ENABLED = CmsStringUtil.TRUE;
074:
075: /** Parameter value for WORD mode. **/
076: public static final String PARAM_WORD = "cleanup";
077:
078: /** Parameter value for XHTML mode. **/
079: public static final String PARAM_XHTML = "xhtml";
080:
081: /** The log object for this class. */
082: private static final Log LOG = CmsLog
083: .getLog(CmsHtmlConverter.class);
084:
085: /** Regular expression for cleanup. */
086: String[] m_cleanupPatterns = { "<o:p>.*(\\r\\n)*.*</o:p>",
087: "<o:p>.*(\\r\\n)*.*</O:p>", "<\\?xml:.*(\\r\\n).*/>",
088: "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
089: "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
090: "<\\?xml:(.*(\\r\\n)).*/\\?>",
091: "<o:SmartTagType.*(\\r\\n)*.*/>",
092: "<o:smarttagtype.*(\\r\\n)*.*/>" };
093:
094: /** Patterns for cleanup. */
095: Pattern[] m_clearStyle;
096:
097: /** The input encoding. */
098: String m_encoding;
099:
100: /** Regular expression for replace. */
101: String[] m_replacePatterns = { " ", "(\\r\\n){2,}", "–",
102: "(\\n){2,}", "\\(\\r\\n<", "\\(\\n<",
103: "\\(\\r\\n(\\ ){1,}<", "\\(\\n(\\ ){1,}<", "\\r\\n<span",
104: "\\n<span" };
105:
106: /** Patterns for replace. */
107: Pattern[] m_replaceStyle;
108:
109: /** Values for replace. */
110: String[] m_replaceValues = { " ", "", "–", "", "(<",
111: "(<", "(<", "(<", "<span", "<span" };
112:
113: /** The tidy to use. */
114: Tidy m_tidy;
115:
116: /** The length of the line separator. */
117: private int m_lineSeparatorLength;
118:
119: /** Indicates if this converter is enabled or not. */
120: private boolean m_modeEnabled;
121:
122: /** Indicates if word cleanup mode is enabled or not. */
123: private boolean m_modeWord;
124:
125: /** Indicates if xhtml conversion mode is enabled or not. */
126: private boolean m_modeXhtml;
127:
/**
 * Creates a new CmsHtmlConverter with default settings.<p>
 *
 * The encoding is {@link CmsEncoder#ENCODING_UTF_8} and the conversion mode
 * is {@link #PARAM_ENABLED}.<p>
 */
public CmsHtmlConverter() {

    this(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED);
}
137:
/**
 * Creates a new CmsHtmlConverter for the given encoding and conversion mode.<p>
 *
 * The conversion mode accepts these values:<ul>
 * <li>{@link #PARAM_DISABLED}: no conversion takes place.
 * <li>{@link #PARAM_ENABLED}: the html is pretty printed only, without transformation.
 * <li>{@link #PARAM_XHTML}: the html is converted to xhtml.
 * <li>{@link #PARAM_WORD}: word like html tags are cleaned up.
 * </ul>
 * Several values can be combined using <code>;</code> as separator, for example
 * to convert to xhtml and clean up word tags in one go.<p>
 *
 * @param encoding the encoding used for the html code conversion
 * @param mode the conversion mode to use
 */
public CmsHtmlConverter(String encoding, String mode) {

    this.init(encoding, mode);
}
157:
158: /**
159: * Reads the content conversion property of a given resource and returns it's value.<p>
160: *
161: * A default value (disabled) is returned if the property could not be read.<p>
162: *
163: * @param cms the CmsObject
164: * @param resource the resource in the vfs
165: * @return the content conversion property value
166: */
167: public static String getConversionSettings(CmsObject cms,
168: CmsResource resource) {
169:
170: // read the content-conversion property
171: String contentConversion;
172: try {
173: String resourceName = cms.getSitePath(resource);
174: CmsProperty contentConversionProperty = cms
175: .readPropertyObject(
176: resourceName,
177: CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION,
178: true);
179: contentConversion = contentConversionProperty
180: .getValue(CmsHtmlConverter.PARAM_DISABLED);
181: } catch (CmsException e) {
182: // if there was an error reading the property, choose a default value
183: contentConversion = CmsHtmlConverter.PARAM_DISABLED;
184: }
185: return contentConversion;
186: }
187:
188: /**
189: * Tests if the content conversion is enabled.<p>
190: *
191: * @param conversionMode the content conversion mode string
192: * @return ture or false
193: */
194: public static boolean isConversionEnabled(String conversionMode) {
195:
196: boolean value = true;
197: if ((conversionMode == null)
198: || (conversionMode.indexOf(PARAM_DISABLED) != -1)) {
199: value = false;
200: }
201: return value;
202: }
203:
204: /**
205: * Converts the given html code according to the settings of this converter.<p>
206: *
207: * @param htmlInput html input stored in an array of bytes
208: * @return array of bytes contining the converted html
209: *
210: * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
211: */
212: public byte[] convertToByte(byte[] htmlInput)
213: throws UnsupportedEncodingException {
214:
215: if (m_modeEnabled) {
216: // only do any processing if the conversion is enabled
217: return convertToByte(new String(htmlInput, m_encoding));
218: }
219: return htmlInput;
220: }
221:
222: /**
223: * Converts the given html code according to the settings of this converter.<p>
224: *
225: * @param htmlInput html input stored in a string
226: * @return array of bytes contining the converted html
227: *
228: * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
229: */
230: public byte[] convertToByte(String htmlInput)
231: throws UnsupportedEncodingException {
232:
233: return convertToString(htmlInput).getBytes(m_encoding);
234: }
235:
236: /**
237: * Converts the given html code according to the settings of this converter.<p>
238: *
239: * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
240: *
241: * @param htmlInput html input stored in an array of bytes
242: * @return array of bytes contining the converted html
243: */
244: public byte[] convertToByteSilent(byte[] htmlInput) {
245:
246: try {
247: return convertToByte(htmlInput);
248: } catch (Exception e) {
249: if (LOG.isWarnEnabled()) {
250: LOG.warn(Messages.get().getBundle().key(
251: Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
252: }
253: return htmlInput;
254: }
255: }
256:
257: /**
258: * Converts the given html code according to the settings of this converter.<p>
259: *
260: * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
261: *
262: * @param htmlInput html input stored in a string
263: * @return array of bytes contining the converted html
264: */
265: public byte[] convertToByteSilent(String htmlInput) {
266:
267: try {
268: return convertToByte(htmlInput.getBytes(m_encoding));
269: } catch (Exception e) {
270: if (LOG.isWarnEnabled()) {
271: LOG.warn(Messages.get().getBundle().key(
272: Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
273: }
274: try {
275: return htmlInput.getBytes(m_encoding);
276: } catch (UnsupportedEncodingException e1) {
277: if (LOG.isWarnEnabled()) {
278: LOG.warn(Messages.get().getBundle().key(
279: Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
280: }
281: return htmlInput.getBytes();
282: }
283: }
284: }
285:
286: /**
287: * Converts the given html code according to the settings of this converter.<p>
288: *
289: * @param htmlInput html input stored in an array of bytes
290: * @return string contining the converted html
291: *
292: * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
293: */
294: public String convertToString(byte[] htmlInput)
295: throws UnsupportedEncodingException {
296:
297: return convertToString(new String(htmlInput, m_encoding));
298: }
299:
300: /**
301: * Converts the given html code according to the settings of this converter.<p>
302: *
303: * @param htmlInput html input stored in a string
304: * @return string contining the converted html
305: *
306: * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
307: */
308: public String convertToString(String htmlInput)
309: throws UnsupportedEncodingException {
310:
311: // only do parsing if the mode is not set to disabled
312: if (m_modeEnabled) {
313:
314: // do a maximum of 10 loops
315: int max = m_modeWord ? 10 : 1;
316: int count = 0;
317:
318: // we may have to do several parsing runs until all tags are removed
319: int oldSize = htmlInput.length();
320: String workHtml = regExp(htmlInput);
321: while (count < max) {
322: count++;
323:
324: // first add the optional header if in word mode
325: if (m_modeWord) {
326: workHtml = adjustHtml(workHtml);
327: }
328: // now use tidy to parse and format the html
329: workHtml = parse(workHtml, m_encoding);
330: if (m_modeWord) {
331: // cut off the line separator, which is always appended
332: workHtml = workHtml.substring(0, workHtml.length()
333: - m_lineSeparatorLength);
334: }
335:
336: if (workHtml.length() == oldSize) {
337: // no change in html code after last processing loop
338: workHtml = regExp(workHtml);
339: break;
340: }
341: oldSize = workHtml.length();
342: workHtml = regExp(workHtml);
343: }
344: if (LOG.isDebugEnabled()) {
345: LOG.debug(Messages.get().getBundle().key(
346: Messages.LOG_PARSING_RUNS_2,
347: this .getClass().getName(), new Integer(count)));
348: }
349: htmlInput = workHtml;
350: }
351:
352: return htmlInput;
353: }
354:
355: /**
356: * Converts the given html code according to the settings of this converter.<p>
357: *
358: * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
359: *
360: * @param htmlInput html input stored in an array of bytes
361: *
362: * @return string contining the converted html
363: */
364: public String convertToStringSilent(byte[] htmlInput) {
365:
366: try {
367: return convertToString(htmlInput);
368: } catch (Exception e) {
369: if (LOG.isWarnEnabled()) {
370: LOG.warn(Messages.get().getBundle().key(
371: Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
372: }
373: try {
374: return new String(htmlInput, m_encoding);
375: } catch (UnsupportedEncodingException e1) {
376: if (LOG.isWarnEnabled()) {
377: LOG.warn(Messages.get().getBundle().key(
378: Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
379: }
380: return new String(htmlInput);
381: }
382: }
383: }
384:
385: /**
386: * Converts the given html code according to the settings of this converter.<p>
387: *
388: * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
389: *
390: * @param htmlInput html input stored in string
391: *
392: * @return string contining the converted html
393: */
394: public String convertToStringSilent(String htmlInput) {
395:
396: try {
397: return convertToString(htmlInput);
398: } catch (Exception e) {
399: if (LOG.isWarnEnabled()) {
400: LOG.warn(Messages.get().getBundle().key(
401: Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
402: }
403: return htmlInput;
404: }
405: }
406:
407: /**
408: * Returns the encoding used for the html code conversion.<p>
409: *
410: * @return the encoding used for the html code conversion
411: */
412: public String getEncoding() {
413:
414: return m_encoding;
415: }
416:
417: /**
418: * Adjusts the html input code in WORD mode if nescessary.<p>
419: *
420: * When in WORD mode, the html tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office"
421: * attribute, otherwiese tide will not remove the WORD tags from the document.
422: *
423: * @param htmlInput the html input
424: * @return adjusted html input
425: */
426: private String adjustHtml(String htmlInput) {
427:
428: // check if we have some opening and closing html tags
429: if ((htmlInput.toLowerCase().indexOf("<html>") == -1)
430: && (htmlInput.toLowerCase().indexOf("</html>") == -1)) {
431: // add a correct <html> tag for word generated html
432: StringBuffer tmp = new StringBuffer();
433: tmp.append("<html xmlns:o=\"\"><body>");
434: tmp.append(htmlInput);
435: tmp.append("</body></html>");
436: htmlInput = tmp.toString();
437: }
438: return htmlInput;
439: }
440:
441: /**
442: * Extracts all mode parameters from the mode property value and stores them in a list.<p>
443: *
444: * Values must be seperated iwth a semicolon.
445: *
446: * @param mode the mode paramter string
447: * @return list with all extracted nodes
448: */
449: private List extractModes(String mode) {
450:
451: ArrayList extractedModes = new ArrayList();
452: if (mode != null) {
453: StringTokenizer extract = new StringTokenizer(mode, ";");
454: while (extract.hasMoreTokens()) {
455: String tok = extract.nextToken();
456: extractedModes.add(tok);
457: }
458: }
459: return extractedModes;
460: }
461:
462: /**
463: * Initializes the CmsHtmlConverter.<p>
464: *
465: * @param encoding the encoding used for the html code conversion
466: * @param mode the mode parameter to select the operation mode of the converter.
467: */
468: private void init(String encoding, String mode) {
469:
470: // extract all operation mode
471: List modes = extractModes(mode);
472:
473: // confiugurate the tidy depending on the operation mode
474: if (modes.contains(PARAM_ENABLED)) {
475: m_modeEnabled = true;
476: }
477: if (modes.contains(PARAM_XHTML)) {
478: m_modeEnabled = true;
479: m_modeXhtml = true;
480: }
481: if (modes.contains(PARAM_WORD)) {
482: m_modeEnabled = true;
483: m_modeWord = true;
484: }
485:
486: // set the encoding
487: m_encoding = encoding;
488:
489: // get line separator length
490: m_lineSeparatorLength = System.getProperty("line.separator")
491: .length();
492:
493: // we need this only if the conversion is enabled
494: if (m_modeEnabled) {
495:
496: // create the main tidy object
497: m_tidy = new Tidy();
498:
499: // set specified word, xhtml conversion settings
500: m_tidy.setXHTML(m_modeXhtml);
501: m_tidy.setWord2000(m_modeWord);
502:
503: // add additional tags
504: // those are required to handle word 2002 (and newer) documents
505: Properties additionalTags = new Properties();
506: additionalTags.put("new-empty-tags", "o:smarttagtype");
507: additionalTags.put("new-inline-tags", "o:smarttagtype");
508: m_tidy.getConfiguration().addProps(additionalTags);
509:
510: // set the default tidy configuration
511:
512: // set the tidy encoding
513: m_tidy.setInputEncoding(encoding);
514: m_tidy.setOutputEncoding(encoding);
515:
516: // disable the tidy meta element in output
517: m_tidy.setTidyMark(false);
518: // disable clean mode
519: m_tidy.setMakeClean(false);
520: // enable num entities
521: m_tidy.setNumEntities(true);
522: // create output of the body only
523: m_tidy.setPrintBodyOnly(true);
524: // force output creation even if there are tidy errors
525: m_tidy.setForceOutput(true);
526: // set tidy to quiet mode to prevent output
527: m_tidy.setQuiet(true);
528: // disable warning output
529: m_tidy.setShowWarnings(false);
530: // allow comments in the output
531: m_tidy.setHideComments(false);
532: // set no line break before a <br>
533: m_tidy.setBreakBeforeBR(false);
534: // dont wrap attribute values
535: m_tidy.setWrapAttVals(false);
536: // warp lines after 100 chars
537: m_tidy.setWraplen(100);
538: // no indentation
539: m_tidy.setSpaces(0);
540:
541: if (m_modeWord) {
542: // create the regexp for cleanup, only used in word clean mode
543: m_clearStyle = new Pattern[m_cleanupPatterns.length];
544: for (int i = 0; i < m_cleanupPatterns.length; i++) {
545: m_clearStyle[i] = Pattern
546: .compile(m_cleanupPatterns[i]);
547: }
548: }
549:
550: // create the regexp for replace
551: m_replaceStyle = new Pattern[m_replacePatterns.length];
552: for (int i = 0; i < m_replacePatterns.length; i++) {
553: m_replaceStyle[i] = Pattern
554: .compile(m_replacePatterns[i]);
555: }
556: }
557: }
558:
559: /**
560: * Parses a byte array containing html code with different parsing modes.<p>
561: *
562: * @param htmlInput a byte array containing raw html code
563: * @param encoding the encoding
564: *
565: * @return parsed and cleared html code
566: *
567: * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
568: */
569: private String parse(String htmlInput, String encoding)
570: throws UnsupportedEncodingException {
571:
572: // prepare the streams
573: ByteArrayInputStream in = new ByteArrayInputStream(htmlInput
574: .getBytes(encoding));
575: ByteArrayOutputStream out = new ByteArrayOutputStream();
576: // do the parsing
577: m_tidy.parse(in, out);
578: // return the result
579: byte[] result = out.toByteArray();
580: return new String(result, encoding);
581: }
582:
583: /**
584: * Parses the htmlInput with regular expressions for cleanup purposes.<p>
585: *
586: * @param htmlInput the html input
587: * @return processed html
588: */
589: private String regExp(String htmlInput) {
590:
591: String parsedHtml = htmlInput.trim();
592:
593: if (m_modeWord) {
594: // process all cleanup regexp
595: for (int i = 0; i < m_cleanupPatterns.length; i++) {
596: parsedHtml = m_clearStyle[i].matcher(parsedHtml)
597: .replaceAll("");
598: }
599: }
600:
601: // process all replace regexp
602: for (int i = 0; i < m_replacePatterns.length; i++) {
603: parsedHtml = m_replaceStyle[i].matcher(parsedHtml)
604: .replaceAll(m_replaceValues[i]);
605: }
606:
607: return parsedHtml;
608: }
609: }
|