001: /**
002: * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
003: *
004: * This program is free software; you can redistribute it and/or modify
005: * it under the terms of the latest version of the GNU Lesser General
006: * Public License as published by the Free Software Foundation;
007: *
008: * This program is distributed in the hope that it will be useful,
009: * but WITHOUT ANY WARRANTY; without even the implied warranty of
010: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
011: * GNU Lesser General Public License for more details.
012: *
013: * You should have received a copy of the GNU Lesser General Public License
014: * along with this program (LICENSE.txt); if not, write to the Free Software
015: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
016: */package org.jamwiki.parser.jflex;
017:
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.jamwiki.Environment;
import org.jamwiki.parser.ParserInput;
import org.jamwiki.parser.ParserOutput;
import org.jamwiki.utils.WikiLogger;
025:
026: /**
027: * Utility methods used with the Mediawiki lexers.
028: */
029: public class ParserUtil {
030:
031: private static final WikiLogger logger = WikiLogger
032: .getLogger(ParserUtil.class.getName());
033: private static Pattern TAG_PATTERN = null;
034: private static Pattern JAVASCRIPT_PATTERN1 = null;
035: private static Pattern JAVASCRIPT_PATTERN2 = null;
036:
037: static {
038: try {
039: TAG_PATTERN = Pattern
040: .compile("(<[ ]*[/]?[ ]*)([^\\ />]+)([ ]*(.*?))([/]?[ ]*>)");
041: // catch script insertions of the form "onsubmit="
042: JAVASCRIPT_PATTERN1 = Pattern.compile("( on[^=]{3,}=)+",
043: Pattern.CASE_INSENSITIVE);
044: // catch script insertions that use a javascript url
045: JAVASCRIPT_PATTERN2 = Pattern.compile(
046: "(javascript[ ]*\\:)+", Pattern.CASE_INSENSITIVE);
047: } catch (Exception e) {
048: logger.severe("Unable to compile pattern", e);
049: }
050: }
051:
052: /**
053: *
054: */
055: private ParserUtil() {
056: }
057:
058: /**
059: * Provide a way to run the pre-processor against a fragment of text, such
060: * as an image caption. This method should be used sparingly since it is
061: * not very efficient.
062: */
063: protected static String parseFragment(ParserInput parserInput,
064: String raw, int mode) throws Exception {
065: if (StringUtils.isBlank(raw)) {
066: return raw;
067: }
068: JFlexParser parser = new JFlexParser(parserInput);
069: ParserOutput parserOutput = new ParserOutput();
070: return parser.parseFragment(parserOutput, raw, mode);
071: }
072:
073: /**
074: * Clean up HTML tags to make them XHTML compliant (lowercase, no
075: * unnecessary spaces).
076: */
077: protected static String sanitizeHtmlTag(String tag) {
078: String result = tag.trim();
079: result = StringUtils.remove(result, " ").toLowerCase();
080: if (result.endsWith("/>")) {
081: // spaces were stripped, so make sure tag is of the form "<br />"
082: result = result.substring(0, result.length() - 2) + " />";
083: }
084: return result;
085: }
086:
087: /**
088: * Given a tag of the form "<tag>content</tag>", return all content between
089: * the tags. Consider the following examples:
090: *
091: * "<tag>content</tag>" returns "content".
092: * "<tag />" returns and empty string.
093: * "<tag><sub>content</sub></tag>" returns "<sub>content</sub>".
094: *
095: * @param raw The raw tag content to be analyzed.
096: * @return The content for the tag being analyzed.
097: */
098: protected static String tagContent(String raw) {
099: int start = raw.indexOf('>') + 1;
100: int end = raw.lastIndexOf('<');
101: if (start == 0) {
102: // no tags
103: return raw;
104: }
105: if (end <= start) {
106: return "";
107: }
108: return raw.substring(start, end);
109: }
110:
111: /**
112: * Allowing Javascript action tags to be used as attributes (onmouseover, etc) is
113: * a bad thing, so clean up HTML tags to remove any such attributes.
114: */
115: protected static String validateHtmlTag(String tag) {
116: Matcher m = TAG_PATTERN.matcher(tag);
117: if (!m.find()) {
118: logger
119: .severe("Failure while attempting to match html tag for pattern "
120: + tag);
121: return tag;
122: }
123: String tagOpen = m.group(1);
124: String tagKeyword = m.group(2);
125: String attributes = m.group(3);
126: String tagClose = m.group(5);
127: String result = "<";
128: if (tagOpen.indexOf('/') != -1) {
129: result += "/";
130: }
131: result += tagKeyword.toLowerCase().trim();
132: if (!StringUtils.isBlank(attributes)) {
133: attributes = ParserUtil
134: .validateHtmlTagAttributes(attributes);
135: result += " " + attributes.trim();
136: }
137: if (tagClose.indexOf('/') != -1) {
138: tagClose = " />";
139: }
140: result += tagClose.trim();
141: return result;
142: }
143:
144: /**
145: * Allowing Javascript action tags to be used as attributes (onmouseover, etc) is
146: * a bad thing, so clean up HTML tags to remove any such attributes.
147: */
148: protected static String validateHtmlTagAttributes(String attributes) {
149: if (StringUtils.isBlank(attributes)) {
150: return attributes;
151: }
152: if (!Environment
153: .getBooleanValue(Environment.PROP_PARSER_ALLOW_JAVASCRIPT)) {
154: // FIXME - can these two patterns be combined into one?
155: // pattern requires a space prior to the "onFoo", so make sure one exists
156: Matcher m = JAVASCRIPT_PATTERN1.matcher(" " + attributes);
157: if (m.find()) {
158: logger
159: .warning("Attempt to include Javascript in Wiki syntax "
160: + attributes);
161: return "";
162: }
163: m = JAVASCRIPT_PATTERN2.matcher(attributes);
164: if (m.find()) {
165: logger
166: .warning("Attempt to include Javascript in Wiki syntax "
167: + attributes);
168: return "";
169: }
170: }
171: return attributes;
172: }
173: }
|