Source Code Cross Referenced for TextUtils.java in  » Web-Crawler » heritrix » org » archive » util » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.util 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         */
019:        package org.archive.util;
020:
021:        import java.io.BufferedReader;
022:        import java.io.IOException;
023:        import java.io.PrintWriter;
024:        import java.io.StringReader;
025:        import java.io.StringWriter;
026:        import java.util.HashMap;
027:        import java.util.Map;
028:        import java.util.regex.Matcher;
029:        import java.util.regex.Pattern;
030:
031:        import javax.servlet.jsp.JspWriter;
032:
033:        import org.apache.commons.lang.StringEscapeUtils;
034:
035:        public class TextUtils {
036:            private static final String FIRSTWORD = "^([^\\s]*).*$";
037:
038:            /**
039:             * Allowable range between & and ;
040:             */
041:            private static final int MAX_ENTITY_WIDTH = 9;
042:
043:            private static final ThreadLocal<Map<String, Matcher>> TL_MATCHER_MAP = new ThreadLocal<Map<String, Matcher>>() {
044:                protected Map<String, Matcher> initialValue() {
045:                    return new HashMap<String, Matcher>(50);
046:                }
047:            };
048:
049:            /**
050:             * Get a matcher object for a precompiled regex pattern.
051:             * 
052:             * This method tries to reuse Matcher objects for efficiency.
053:             * It can hold for recycling one Matcher per pattern per thread. 
054:             * 
055:             * Matchers retrieved should be returned for reuse via the
056:             * recycleMatcher() method, but no errors will occur if they
057:             * are not.
058:             * 
059:             * This method is a hotspot frequently accessed.
060:             *
061:             * @param pattern the string pattern to use
062:             * @param input the character sequence the matcher should be using
063:             * @return a matcher object loaded with the submitted character sequence
064:             */
065:            public static Matcher getMatcher(String pattern, CharSequence input) {
066:                if (pattern == null) {
067:                    throw new IllegalArgumentException(
068:                            "String 'pattern' must not be null");
069:                }
070:                final Map<String, Matcher> matchers = TL_MATCHER_MAP.get();
071:                Matcher m = (Matcher) matchers.get(pattern);
072:                if (m == null) {
073:                    m = Pattern.compile(pattern).matcher(input);
074:                } else {
075:                    matchers.put(pattern, null);
076:                    m.reset(input);
077:                }
078:                return m;
079:            }
080:
081:            public static void recycleMatcher(Matcher m) {
082:                final Map<String, Matcher> matchers = TL_MATCHER_MAP.get();
083:                matchers.put(m.pattern().pattern(), m);
084:            }
085:
086:            /**
087:             * Utility method using a precompiled pattern instead of using the
088:             * replaceAll method of the String class. This method will also be reusing
089:             * Matcher objects.
090:             * 
091:             * @see java.util.regex.Pattern
092:             * @param pattern precompiled Pattern to match against
093:             * @param input the character sequence to check
094:             * @param replacement the String to substitute every match with
095:             * @return the String with all the matches substituted
096:             */
097:            public static String replaceAll(String pattern, CharSequence input,
098:                    String replacement) {
099:                Matcher m = getMatcher(pattern, input);
100:                String res = m.replaceAll(replacement);
101:                recycleMatcher(m);
102:                return res;
103:            }
104:
105:            /**
106:             * Utility method using a precompiled pattern instead of using the
107:             * replaceFirst method of the String class. This method will also be reusing
108:             * Matcher objects.
109:             * 
110:             * @see java.util.regex.Pattern
111:             * @param pattern precompiled Pattern to match against
112:             * @param input the character sequence to check
113:             * @param replacement the String to substitute the first match with
114:             * @return the String with the first match substituted
115:             */
116:            public static String replaceFirst(String pattern,
117:                    CharSequence input, String replacement) {
118:                Matcher m = getMatcher(pattern, input);
119:                String res = m.replaceFirst(replacement);
120:                recycleMatcher(m);
121:                return res;
122:            }
123:
124:            /**
125:             * Utility method using a precompiled pattern instead of using the matches
126:             * method of the String class. This method will also be reusing Matcher
127:             * objects.
128:             * 
129:             * @see java.util.regex.Pattern
130:             * @param pattern precompiled Pattern to match against
131:             * @param input the character sequence to check
132:             * @return true if character sequence matches
133:             */
134:            public static boolean matches(String pattern, CharSequence input) {
135:                Matcher m = getMatcher(pattern, input);
136:                boolean res = m.matches();
137:                recycleMatcher(m);
138:                return res;
139:            }
140:
141:            /**
142:             * Utility method using a precompiled pattern instead of using the split
143:             * method of the String class.
144:             * 
145:             * @see java.util.regex.Pattern
146:             * @param pattern precompiled Pattern to split by
147:             * @param input the character sequence to split
148:             * @return array of Strings split by pattern
149:             */
150:            public static String[] split(String pattern, CharSequence input) {
151:                Matcher m = getMatcher(pattern, input);
152:                String[] retVal = m.pattern().split(input);
153:                recycleMatcher(m);
154:                return retVal;
155:            }
156:
157:            /**
158:             * @param s String to find first word in (Words are delimited by
159:             * whitespace).
160:             * @return First word in the passed string else null if no word found.
161:             */
162:            public static String getFirstWord(String s) {
163:                Matcher m = getMatcher(FIRSTWORD, s);
164:                String retVal = (m != null && m.matches()) ? m.group(1) : null;
165:                recycleMatcher(m);
166:                return retVal;
167:            }
168:
169:            /**
170:             * Escapes a string so that it can be passed as an argument to a javscript
171:             * in a JSP page. This method takes a string and returns the same string
172:             * with any single quote escaped by prepending the character with a
173:             * backslash. Linebreaks are also replaced with '\n'.  Also,
174:             * less-than signs and ampersands are replaced with HTML entities.
175:             * 
176:             * @param s The string to escape
177:             * @return The same string escaped.
178:             */
179:            public static String escapeForHTMLJavascript(String s) {
180:                return escapeForHTML(StringEscapeUtils.escapeJavaScript(s));
181:            }
182:
183:            /**
184:             * Escapes a string so that it can be placed inside XML/HTML attribute.
185:             * Replaces ampersand, less-than, greater-than, single-quote, and 
186:             * double-quote with escaped versions.
187:             * @param s The string to escape
188:             * @return The same string escaped.
189:             */
190:            public static String escapeForMarkupAttribute(String s) {
191:                return StringEscapeUtils.escapeXml(s);
192:            }
193:
194:            /**
195:             * Minimally escapes a string so that it can be placed inside XML/HTML
196:             * attribute.
197:             * Escapes lt and amp.
198:             * @param s The string to escape
199:             * @return The same string escaped.
200:             */
201:            public static String escapeForHTML(String s) {
202:                // TODO: do this in a single pass instead of creating 5 junk strings
203:                String escaped = s.replaceAll("&", "&amp;");
204:                return escaped.replaceAll("<", "&lt;");
205:            }
206:
207:            /**
208:             * Utility method for writing a (potentially large) String to a JspWriter,
209:             * escaping it for HTML display, without constructing another large String
210:             * of the whole content. 
211:             * @param s String to write
212:             * @param out destination JspWriter
213:             * @throws IOException
214:             */
215:            public static void writeEscapedForHTML(String s, JspWriter out)
216:                    throws IOException {
217:                BufferedReader reader = new BufferedReader(new StringReader(s));
218:                String line;
219:                while ((line = reader.readLine()) != null) {
220:                    out.println(StringEscapeUtils.escapeHtml(line));
221:                }
222:            }
223:
224:            /**
225:             * Replaces HTML Entity Encodings.
226:             * @param cs The CharSequence to remove html codes from
227:             * @return the same CharSequence or an escaped String.
228:             */
229:            public static CharSequence unescapeHtml(final CharSequence cs) {
230:                if (cs == null) {
231:                    return cs;
232:                }
233:
234:                // If both of these do not equal zero, then cs has entity code
235:                int startEntityCode = -1;
236:                int endEntityCode = -1;
237:
238:                // Check for encodings, make sure start and end are within certain range
239:                for (int i = 0; i < cs.length(); i++) {
240:                    if (cs.charAt(i) == '&') {
241:                        startEntityCode = i;
242:                    } else if (cs.charAt(i) == ';' && startEntityCode >= 0
243:                            && i > startEntityCode
244:                            && ((i - startEntityCode) < MAX_ENTITY_WIDTH)) {
245:                        endEntityCode = i;
246:                    }
247:                }
248:
249:                return (startEntityCode != 0 && endEntityCode != 0) ? StringEscapeUtils
250:                        .unescapeHtml(cs.toString())
251:                        : cs;
252:            }
253:
254:            /**
255:             * @param message Message to put at top of the string returned. May be
256:             * null.
257:             * @param e Exception to write into a string.
258:             * @return Return formatted string made of passed message and stack trace
259:             * of passed exception.
260:             */
261:            public static String exceptionToString(String message, Throwable e) {
262:                StringWriter sw = new StringWriter();
263:                if (message == null || message.length() == 0) {
264:                    sw.write(message);
265:                    sw.write("\n");
266:                }
267:                e.printStackTrace(new PrintWriter(sw));
268:                return sw.toString();
269:            }
270:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.