01: /*
02: * The contents of this file are subject to the terms
03: * of the Common Development and Distribution License
04: * (the License). You may not use this file except in
05: * compliance with the License.
06: *
07: * You can obtain a copy of the license at
08: * http://www.sun.com/cddl/cddl.html or
09: * at portlet-repository/CDDLv1.0.txt.
10: * See the License for the specific language governing
11: * permissions and limitations under the License.
12: *
13: * When distributing Covered Code, include this CDDL
14: * Header Notice in each file and include the License file
15: * at portlet-repository/CDDLv1.0.txt.
16: * If applicable, add the following below the CDDL Header,
17: * with the fields enclosed by brackets [] replaced by
18: * you own identifying information:
19: * "Portions Copyrighted [year] [name of copyright owner]"
20: *
21: * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
22: */
23:
24: package com.sun.portal.app.blog;
25:
26: import java.util.regex.Matcher;
27: import java.util.regex.Pattern;
28:
/**
 * Accepts HTML input and translates it into plain text.
 * <p>
 * The translation removes, in this order: all {@code <script>} and
 * {@code <noscript>} elements together with their content, all remaining
 * HTML/XML tags, and all well-formed character entity references. Finally,
 * every run of two or more whitespace characters is collapsed to a single
 * space.
 * <p>
 * Instances hold no state; all patterns are compiled once and shared.
 */
public class HTMLCleaner {
    /**
     * A {@code <script>} or {@code <noscript>} element and everything in
     * between. The match is reluctant so that text between two separate
     * script blocks is preserved, case-insensitive because HTML tag names
     * are, and the closing tag must repeat the opening tag's name.
     */
    private static final Pattern SCRIPTS = Pattern.compile(
            "<((?:no)?script)\\b[^>]*>.*?</\\1\\s*>",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** Any HTML/XML tag. */
    private static final Pattern TAGS = Pattern.compile("<[^>]+>");

    /**
     * A well-formed entity reference: named ({@code &amp;}), decimal
     * ({@code &#169;}) or hexadecimal ({@code &#xA9;}). A bare ampersand
     * that merely happens to be followed by a semicolon later in the text
     * (for example "AT&T rocks; really") is not an entity and is kept.
     */
    private static final Pattern ENTITY_REFS = Pattern.compile(
            "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);");

    /** A run of two or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s\\s+");

    /**
     * Clean the HTML input.
     *
     * @param s the HTML text to clean; may be {@code null}
     * @return the input with script blocks, tags and entity references
     *         removed and repeated whitespace collapsed to one space, or
     *         {@code null} if {@code s} is {@code null}
     */
    public String clean(String s) {
        if (s == null) {
            return null;
        }

        // Order matters: script blocks must go before the generic tag
        // pass, otherwise only their tags would be stripped and the
        // script source would remain in the output.
        String text = SCRIPTS.matcher(s).replaceAll("");
        text = TAGS.matcher(text).replaceAll("");
        text = ENTITY_REFS.matcher(text).replaceAll("");
        // Collapse whitespace last so that gaps left by removed markup
        // are normalized as well.
        text = WHITESPACE.matcher(text).replaceAll(" ");

        return text;
    }
}
|