001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: */
019: package org.archive.util;
020:
021: import java.io.BufferedReader;
022: import java.io.IOException;
023: import java.io.PrintWriter;
024: import java.io.StringReader;
025: import java.io.StringWriter;
026: import java.util.HashMap;
027: import java.util.Map;
028: import java.util.regex.Matcher;
029: import java.util.regex.Pattern;
030:
031: import javax.servlet.jsp.JspWriter;
032:
033: import org.apache.commons.lang.StringEscapeUtils;
034:
035: public class TextUtils {
036: private static final String FIRSTWORD = "^([^\\s]*).*$";
037:
038: /**
039: * Allowable range between & and ;
040: */
041: private static final int MAX_ENTITY_WIDTH = 9;
042:
043: private static final ThreadLocal<Map<String, Matcher>> TL_MATCHER_MAP = new ThreadLocal<Map<String, Matcher>>() {
044: protected Map<String, Matcher> initialValue() {
045: return new HashMap<String, Matcher>(50);
046: }
047: };
048:
049: /**
050: * Get a matcher object for a precompiled regex pattern.
051: *
052: * This method tries to reuse Matcher objects for efficiency.
053: * It can hold for recycling one Matcher per pattern per thread.
054: *
055: * Matchers retrieved should be returned for reuse via the
056: * recycleMatcher() method, but no errors will occur if they
057: * are not.
058: *
059: * This method is a hotspot frequently accessed.
060: *
061: * @param pattern the string pattern to use
062: * @param input the character sequence the matcher should be using
063: * @return a matcher object loaded with the submitted character sequence
064: */
065: public static Matcher getMatcher(String pattern, CharSequence input) {
066: if (pattern == null) {
067: throw new IllegalArgumentException(
068: "String 'pattern' must not be null");
069: }
070: final Map<String, Matcher> matchers = TL_MATCHER_MAP.get();
071: Matcher m = (Matcher) matchers.get(pattern);
072: if (m == null) {
073: m = Pattern.compile(pattern).matcher(input);
074: } else {
075: matchers.put(pattern, null);
076: m.reset(input);
077: }
078: return m;
079: }
080:
081: public static void recycleMatcher(Matcher m) {
082: final Map<String, Matcher> matchers = TL_MATCHER_MAP.get();
083: matchers.put(m.pattern().pattern(), m);
084: }
085:
086: /**
087: * Utility method using a precompiled pattern instead of using the
088: * replaceAll method of the String class. This method will also be reusing
089: * Matcher objects.
090: *
091: * @see java.util.regex.Pattern
092: * @param pattern precompiled Pattern to match against
093: * @param input the character sequence to check
094: * @param replacement the String to substitute every match with
095: * @return the String with all the matches substituted
096: */
097: public static String replaceAll(String pattern, CharSequence input,
098: String replacement) {
099: Matcher m = getMatcher(pattern, input);
100: String res = m.replaceAll(replacement);
101: recycleMatcher(m);
102: return res;
103: }
104:
105: /**
106: * Utility method using a precompiled pattern instead of using the
107: * replaceFirst method of the String class. This method will also be reusing
108: * Matcher objects.
109: *
110: * @see java.util.regex.Pattern
111: * @param pattern precompiled Pattern to match against
112: * @param input the character sequence to check
113: * @param replacement the String to substitute the first match with
114: * @return the String with the first match substituted
115: */
116: public static String replaceFirst(String pattern,
117: CharSequence input, String replacement) {
118: Matcher m = getMatcher(pattern, input);
119: String res = m.replaceFirst(replacement);
120: recycleMatcher(m);
121: return res;
122: }
123:
124: /**
125: * Utility method using a precompiled pattern instead of using the matches
126: * method of the String class. This method will also be reusing Matcher
127: * objects.
128: *
129: * @see java.util.regex.Pattern
130: * @param pattern precompiled Pattern to match against
131: * @param input the character sequence to check
132: * @return true if character sequence matches
133: */
134: public static boolean matches(String pattern, CharSequence input) {
135: Matcher m = getMatcher(pattern, input);
136: boolean res = m.matches();
137: recycleMatcher(m);
138: return res;
139: }
140:
141: /**
142: * Utility method using a precompiled pattern instead of using the split
143: * method of the String class.
144: *
145: * @see java.util.regex.Pattern
146: * @param pattern precompiled Pattern to split by
147: * @param input the character sequence to split
148: * @return array of Strings split by pattern
149: */
150: public static String[] split(String pattern, CharSequence input) {
151: Matcher m = getMatcher(pattern, input);
152: String[] retVal = m.pattern().split(input);
153: recycleMatcher(m);
154: return retVal;
155: }
156:
157: /**
158: * @param s String to find first word in (Words are delimited by
159: * whitespace).
160: * @return First word in the passed string else null if no word found.
161: */
162: public static String getFirstWord(String s) {
163: Matcher m = getMatcher(FIRSTWORD, s);
164: String retVal = (m != null && m.matches()) ? m.group(1) : null;
165: recycleMatcher(m);
166: return retVal;
167: }
168:
169: /**
170: * Escapes a string so that it can be passed as an argument to a javscript
171: * in a JSP page. This method takes a string and returns the same string
172: * with any single quote escaped by prepending the character with a
173: * backslash. Linebreaks are also replaced with '\n'. Also,
174: * less-than signs and ampersands are replaced with HTML entities.
175: *
176: * @param s The string to escape
177: * @return The same string escaped.
178: */
179: public static String escapeForHTMLJavascript(String s) {
180: return escapeForHTML(StringEscapeUtils.escapeJavaScript(s));
181: }
182:
183: /**
184: * Escapes a string so that it can be placed inside XML/HTML attribute.
185: * Replaces ampersand, less-than, greater-than, single-quote, and
186: * double-quote with escaped versions.
187: * @param s The string to escape
188: * @return The same string escaped.
189: */
190: public static String escapeForMarkupAttribute(String s) {
191: return StringEscapeUtils.escapeXml(s);
192: }
193:
194: /**
195: * Minimally escapes a string so that it can be placed inside XML/HTML
196: * attribute.
197: * Escapes lt and amp.
198: * @param s The string to escape
199: * @return The same string escaped.
200: */
201: public static String escapeForHTML(String s) {
202: // TODO: do this in a single pass instead of creating 5 junk strings
203: String escaped = s.replaceAll("&", "&");
204: return escaped.replaceAll("<", "<");
205: }
206:
207: /**
208: * Utility method for writing a (potentially large) String to a JspWriter,
209: * escaping it for HTML display, without constructing another large String
210: * of the whole content.
211: * @param s String to write
212: * @param out destination JspWriter
213: * @throws IOException
214: */
215: public static void writeEscapedForHTML(String s, JspWriter out)
216: throws IOException {
217: BufferedReader reader = new BufferedReader(new StringReader(s));
218: String line;
219: while ((line = reader.readLine()) != null) {
220: out.println(StringEscapeUtils.escapeHtml(line));
221: }
222: }
223:
224: /**
225: * Replaces HTML Entity Encodings.
226: * @param cs The CharSequence to remove html codes from
227: * @return the same CharSequence or an escaped String.
228: */
229: public static CharSequence unescapeHtml(final CharSequence cs) {
230: if (cs == null) {
231: return cs;
232: }
233:
234: // If both of these do not equal zero, then cs has entity code
235: int startEntityCode = -1;
236: int endEntityCode = -1;
237:
238: // Check for encodings, make sure start and end are within certain range
239: for (int i = 0; i < cs.length(); i++) {
240: if (cs.charAt(i) == '&') {
241: startEntityCode = i;
242: } else if (cs.charAt(i) == ';' && startEntityCode >= 0
243: && i > startEntityCode
244: && ((i - startEntityCode) < MAX_ENTITY_WIDTH)) {
245: endEntityCode = i;
246: }
247: }
248:
249: return (startEntityCode != 0 && endEntityCode != 0) ? StringEscapeUtils
250: .unescapeHtml(cs.toString())
251: : cs;
252: }
253:
254: /**
255: * @param message Message to put at top of the string returned. May be
256: * null.
257: * @param e Exception to write into a string.
258: * @return Return formatted string made of passed message and stack trace
259: * of passed exception.
260: */
261: public static String exceptionToString(String message, Throwable e) {
262: StringWriter sw = new StringWriter();
263: if (message == null || message.length() == 0) {
264: sw.write(message);
265: sw.write("\n");
266: }
267: e.printStackTrace(new PrintWriter(sw));
268: return sw.toString();
269: }
270: }
|