01: /**
02: * $Id: HTMLCleaner.java,v 1.2 2006/04/17 20:08:41 jtb Exp $
03: * Copyright 2003 Sun Microsystems, Inc. All
04: * rights reserved. Use of this product is subject
05: * to license terms. Federal Acquisitions:
06: * Commercial Software -- Government Users
07: * Subject to Standard License Terms and
08: * Conditions.
09: *
10: * Sun, Sun Microsystems, the Sun logo, and Sun ONE
11: * are trademarks or registered trademarks of Sun Microsystems,
12: * Inc. in the United States and other countries.
13: */package com.sun.portal.rssportlet;
14:
15: import java.util.regex.Matcher;
16: import java.util.regex.Pattern;
17:
/**
 * This class accepts HTML input and translates it into plain text.
 * It does this by removing all script/noscript blocks (including their
 * contents), all HTML tags, and all entity references. It also collapses
 * every run of two or more whitespace characters into a single space.
 *
 * Entity references are removed, not decoded. Any text of the form
 * {@code <...>} is treated as a tag and removed.
 */
public class HTMLCleaner {
    private static interface Patterns {
        // A script or noscript element together with its contents.
        //  - reluctant ".*?" stops at the first closing tag, so text between
        //    two separate script blocks is preserved
        //  - the back-reference makes the closing tag match the opening one
        //  - case-insensitive, so <SCRIPT> is handled like <script>
        public static final Pattern SCRIPTS = Pattern.compile(
                "<((?:no)?script)\\b[^>]*>.*?</\\1\\s*>",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        // HTML/XML tags
        public static final Pattern TAGS = Pattern.compile("<[^>]+>");
        // Well-formed entity references only: named (&amp;), decimal (&#169;)
        // or hexadecimal (&#xA9;). A bare ampersand in ordinary text such as
        // "AT&T" must not swallow everything up to the next semicolon.
        public static final Pattern ENTITY_REFS = Pattern.compile(
                "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);");
        // repeated whitespace (two or more whitespace characters in a row)
        public static final Pattern WHITESPACE = Pattern
                .compile("\\s\\s+");
    }

    /**
     * Clean the HTML input.
     *
     * @param s the HTML text to clean; may be <code>null</code>
     * @return the input with script blocks, tags and entity references
     *         removed and repeated whitespace collapsed, or
     *         <code>null</code> if the input was <code>null</code>
     */
    public String clean(String s) {
        if (s == null) {
            return null;
        }

        Matcher m;

        // Scripts must go first: once the tags are stripped, the script
        // bodies could no longer be told apart from ordinary text.
        m = Patterns.SCRIPTS.matcher(s);
        s = m.replaceAll("");
        m = Patterns.TAGS.matcher(s);
        s = m.replaceAll("");
        m = Patterns.ENTITY_REFS.matcher(s);
        s = m.replaceAll("");
        // Whitespace is collapsed last so that gaps left behind by the
        // removals above are normalized as well.
        m = Patterns.WHITESPACE.matcher(s);
        s = m.replaceAll(" ");

        return s;
    }
}
|