Source Code Cross Referenced for HtmlParser.java in  » Mail-Clients » columba-1.4 » org » columba » mail » parser » text » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Mail Clients » columba 1.4 » org.columba.mail.parser.text 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        //The contents of this file are subject to the Mozilla Public License Version 1.1
002:        //(the "License"); you may not use this file except in compliance with the
003:        //License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
004:        //
005:        //Software distributed under the License is distributed on an "AS IS" basis,
006:        //WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
007:        //for the specific language governing rights and
008:        //limitations under the License.
009:        //
010:        //The Original Code is "The Columba Project"
011:        //
012:        //The Initial Developers of the Original Code are Frederik Dietz and Timo Stich.
013:        //Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
014:        //
015:        //All Rights Reserved.
016:        package org.columba.mail.parser.text;
017:
018:        import java.io.BufferedReader;
019:        import java.io.StringReader;
020:        import java.nio.ByteBuffer;
021:        import java.nio.charset.Charset;
022:        import java.util.logging.Logger;
023:        import java.util.regex.Matcher;
024:        import java.util.regex.Pattern;
025:
026:        /**
027:         * Contains different utility functions for manipulating Html based
028:         * text. This includes functionality for removing and restoring
029:         * special entities (such as &, <, >, ...) and functionality for
030:         * removing html tags from the text.
031:         *
032:         * @author Karl Peder Olesen (karlpeder), 20030623
033:         *
034:         */
035:        public final class HtmlParser {
036:
037:            /**
038:             * Utility classes should not have a public constructor.
                 * This class is final and is used through its static methods, so the
                 * constructor is private and intentionally left empty.
039:             */
040:            private HtmlParser() {
041:            }
042:
                /** Logger for this class; it is registered under the package name. */
043:            private static final Logger LOG = Logger
044:                    .getLogger("org.columba.mail.parser.text");
045:
                // Matches <br> and </br> in any letter case.
                // NOTE(review): the self-closing forms <br/> and <br /> are not matched.
046:            private static final Pattern BREAK_TO_NL_PATTERN = Pattern.compile(
047:                    "</?br>", Pattern.CASE_INSENSITIVE);
                // Matches a closing paragraph tag.
048:            private static final Pattern P_TO_DOUBLE_NL_PATTERN = Pattern
049:                    .compile("</p>", Pattern.CASE_INSENSITIVE);
                // Matches a closing div tag.
050:            private static final Pattern DIV_TO_DOUBLE_NL_PATTERN = Pattern
051:                    .compile("</div>", Pattern.CASE_INSENSITIVE);
                // Matches a closing heading tag followed by one digit, e.g. </h1>.
052:            private static final Pattern H_TO_DOUBLE_NL_PATTERN = Pattern
053:                    .compile("</h\\d>", Pattern.CASE_INSENSITIVE);
                // Matches a run of one or more whitespace characters.
054:            private static final Pattern WHITE_SPACE_REMOVAL_PATTERN = Pattern
055:                    .compile("\\s+", Pattern.CASE_INSENSITIVE);
                // Matches a newline together with all whitespace that follows it.
                // Since a newline is whitespace too, consecutive newlines are matched as one unit.
056:            private static final Pattern TRIM_SPACE_PATTERN = Pattern.compile(
057:                    "\n\\s+", Pattern.CASE_INSENSITIVE);
                // Matches from "<html" up to and including the opening body tag, but only
                // when no other '<' occurs between the two tags.
                // NOTE(review): a document with a head section therefore does not match.
058:            private static final Pattern HEADER_REMOVAL_PATTERN = Pattern
059:                    .compile("<html[^<]*<body[^>]*>", Pattern.CASE_INSENSITIVE);
                // Matches everything from a '<' to the next '>'.
060:            private static final Pattern STRIP_TAGS_PATTERN = Pattern.compile(
061:                    "<[^>]*>", Pattern.CASE_INSENSITIVE);
                // Matches an html comment that contains no '-' between its delimiters.
                // Not referenced by the methods in the visible part of this file.
062:            private static final Pattern COMMENTS_REMOVAL_PATTERN = Pattern
063:                    .compile("<!--[^-]*-->", Pattern.CASE_INSENSITIVE);
                // Regular expression for an email address; the whole address is one capturing group.
                // The top level domain is limited to 2-4 ASCII letters.
064:            private static final String EMAIL_STR = "([a-zA-Z0-9_+\\.-]+@([a-zA-Z0-9]+([\\.-][a-zA-Z0-9]+)*)+\\.[a-zA-Z]{2,4})";
065:            //do the bug [997599] "\\b([^\\s@]+@[^\\s]+)\\b";
                // Matches a bare email address (case-sensitive, letter cases are listed in EMAIL_STR).
066:            private static final Pattern EMAIL_PATTERN = Pattern
067:                    .compile(EMAIL_STR);
                // Matches a complete anchor element whose href is a mailto: link to an email address.
068:            private static final Pattern EMAIL_PATTERN_INC_LINK = Pattern
069:                    .compile("<a[\\s\\n]*href=(\\\")?(mailto:)" + EMAIL_STR
070:                            + "[^<]*</a>", Pattern.CASE_INSENSITIVE);
071:
                // Building blocks for the url expressions below.
                // PROT: the supported protocols; PUNC: punctuation that may trail a url
                // in running text; ANY: any non-whitespace character.
072:            private static final String PROT = "(http|https|ftp)";
073:            private static final String PUNC = ".,:;?!\\-";
074:            private static final String ANY = "\\S";
075:            private static final String URL_STR = "\\b" + "("
076:                    + "(\\w*(:\\S*)?@)?" + PROT + "://" + "[" + ANY + "]+"
077:                    + ")" + "\\b";
078:
079:            /*
080:                     \\b  Start at word boundary
081:                 (
082:            (\\w*(:\\S*)?@)?  [user:[pass]]@ - Construct
083:            prot + "://  protocol and ://
084:               ["+any+"]+  match one or more non-whitespace characters...
085:                 )
086:                     \\b  ...ending at a word boundary (the expression as coded ends with \\b, not with a (?=\\s|$) lookahead)
087:             */
088:            private static final Pattern URL_PATTERN = Pattern.compile(URL_STR,
089:                    Pattern.CASE_INSENSITIVE);
                // Splits a matched url (group 1) from trailing text that does not belong to it.
090:            private static final String URL_REPAIR_STR = "(.*://.*?)" + "("
091:                    + "(&gt;).*|" + "([" + PUNC + "]*)" + "(<br>)?" + ")$";
092:
093:            /*
094:            (.*://.*?)"  "something" with ://
095:                  (could be .*? but then the Pattern would match whitespace)
096:                     (
097:              (&gt;).*  a html-Encoded > followed by anything
098:                                                          |  or
099:            (["+punc+"]*)"  any Punctuation
100:                (<br>)? 0 or 1 trailing <br>
101:                     )$  end of String
102:             */
103:            private static final Pattern URL_REPAIR_PATTERN = Pattern
104:                    .compile(URL_REPAIR_STR);
                // Matches a complete anchor element whose href value is a url as described by URL_STR.
105:            private static final Pattern URL_PATTERN_INC_LINK = Pattern
106:                    .compile("<a( |\\n)*?href=(\\\")?" + URL_STR
107:                            + "(.|\\n)*?</a>", Pattern.CASE_INSENSITIVE);
108:
109:            // TODO (@author fdietz): Add more special entities beyond quot, amp, lt, gt and the U+00A0 - U+00FF range listed below
110:
111:            /** Special entities recognized by restore special entities */
112:            // The form of the entities must be a regexp!
                // Entry i of this array corresponds to entry i of ENTITY_STRINGS,
                // so both arrays must be kept in the same order and length.
113:            private static final String[] SPECIAL_ENTITIES = { "&quot;",
114:                    "&amp;", "&lt;", "&gt;", "&nbsp;", "&iexcl;", "&cent;",
115:                    "&pound;", "&curren;", "&yen;", "&brvbar;", "&sect;",
116:                    "&uml;", "&copy;", "&ordf;", "&laquo;", "&not;", "&shy;",
117:                    "&reg;", "&macr;", "&deg;", "&plusmn;", "&sup2;", "&sup3;",
118:                    "&acute;", "&micro;", "&para;", "&middot;", "&cedil;",
119:                    "&sup1;", "&ordm;", "&raquo;", "&frac14;", "&frac12;",
120:                    "&frac34;", "&iquest;", "&Agrave;", "&Aacute;", "&Acirc;",
121:                    "&Atilde;", "&Auml;", "&Aring;", "&AElig;", "&Ccedil;",
122:                    "&Egrave;", "&Eacute;", "&Ecirc;", "&Euml;", "&Igrave;",
123:                    "&Iacute;", "&Icirc;", "&Iuml;", "&ETH;", "&Ntilde;",
124:                    "&Ograve;", "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;",
125:                    "&times;", "&Oslash;", "&Ugrave;", "&Uacute;", "&Ucirc;",
126:                    "&Uuml;", "&Yacute;", "&THORN;", "&szlig;", "&agrave;",
127:                    "&aacute;", "&acirc;", "&atilde;", "&auml;", "&aring;",
128:                    "&aelig;", "&ccedil;", "&egrave;", "&eacute;", "&ecirc;",
129:                    "&euml;", "&igrave;", "&iacute;", "&icirc;", "&iuml;",
130:                    "&eth;", "&ntilde;", "&ograve;", "&oacute;", "&ocirc;",
131:                    "&otilde;", "&ouml;", "&divide;", "&oslash;", "&ugrave;",
132:                    "&uacute;", "&ucirc;", "&uuml;", "&yacute;", "&thorn;",
133:                    "&yuml;" };
134:
135:            /** Normal chars corresponding to the defined special entities */
                // After the four markup characters the entries run consecutively
                // from U+00A0 (non-breaking space) to U+00FF.
136:            private static final String[] ENTITY_STRINGS = { "\"", "&", "<",
137:                    ">", "\u00a0", "\u00a1", "\u00a2", "\u00a3", "\u00a4",
138:                    "\u00a5", "\u00a6", "\u00a7", "\u00a8", "\u00a9", "\u00aa",
139:                    "\u00ab", "\u00ac", "\u00ad", "\u00ae", "\u00af", "\u00b0",
140:                    "\u00b1", "\u00b2", "\u00b3", "\u00b4", "\u00b5", "\u00b6",
141:                    "\u00b7", "\u00b8", "\u00b9", "\u00ba", "\u00bb", "\u00bc",
142:                    "\u00bd", "\u00be", "\u00bf", "\u00c0", "\u00c1", "\u00c2",
143:                    "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7", "\u00c8",
144:                    "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce",
145:                    "\u00cf", "\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4",
146:                    "\u00d5", "\u00d6", "\u00d7", "\u00d8", "\u00d9", "\u00da",
147:                    "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df", "\u00e0",
148:                    "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6",
149:                    "\u00e7", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec",
150:                    "\u00ed", "\u00ee", "\u00ef", "\u00f0", "\u00f1", "\u00f2",
151:                    "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f7", "\u00f8",
152:                    "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe",
153:                    "\u00ff" };
154:
                // Matches a decimal numeric character reference such as &#233;
                // group 1 holds the digits.
155:            private static final Pattern SPECIAL_PATTERN = Pattern
156:                    .compile("&#(\\d+);");
157:
                // Matches a "charset=name" declaration; group 1 holds the charset name.
                // Compiled without CASE_INSENSITIVE, so only a lower case "charset=" is found.
158:            private static final Pattern CHARSET_PATTERN = Pattern
159:                    .compile("\\bcharset=([\\w-_\\d]+)\\b");
160:
161:            /**
162:             * Strips html tags and removes extra spaces which occurs due
163:             * to e.g. indentation of the html and the head section, which does
164:             * not contain any textual information.
165:             * <br>
166:             * The conversion rutine does the following:<br>
167:             * 1. Removes the header from the html file, i.e. everything from
168:             *    the html tag until and including the starting body tag.<br>
169:             * 2. Replaces multiple consecutive whitespace characters with a single
170:             *    space (since extra whitespace should be ignored in html).<br>
171:             * 3. Replaces ending br tags with a single newline character<br>
172:             * 4. Replaces ending p, div and heading tags with two newlines characters;
173:             *    resulting in a single empty line btw. paragraphs.<br>
174:             * 5. Strips remaining html tags.<br>
175:             * <br>
176:             * NB: The tag stripping is done using a very simple regular expression,
177:             * which removes everything between &lt and &gt. Therefore too much text
178:             * could in some (hopefully rare!?) cases be removed.
179:             *
180:             * @param        s                Input string
181:             * @return        Input stripped for html tags
182:             * @author        Karl Peder Olesen (karlpeder)
183:             */
184:            public static String stripHtmlTags(String s) {
185:                // initial check of input:
186:                if (s == null) {
187:                    return null;
188:                }
189:
190:                // remove header
191:                s = HEADER_REMOVAL_PATTERN.matcher(s).replaceAll("");
192:
193:                // remove extra whitespace
194:                s = WHITE_SPACE_REMOVAL_PATTERN.matcher(s).replaceAll(" ");
195:
196:                // replace br, p and heading tags with newlines
197:                s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
198:                s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
199:                s = DIV_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
200:                s = H_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
201:
202:                // strip remaining tags
203:                s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
204:
205:                // tag stripping can leave some double spaces at line beginnings
206:                s = TRIM_SPACE_PATTERN.matcher(s).replaceAll("\n").trim();
207:
208:                return s;
209:            }
210:
211:            /**
212:             * Strips html tags. The method used is very simple:
213:             * Everything between tag-start (&lt) and tag-end (&gt) is removed.
214:             * Optionaly br tags are replaced by newline and ending p tags with
215:             * double newline.
216:             *
217:             * @param        s                        input string
218:             * @param        breakToNl        if true, newlines are inserted for br and p tags
219:             * @return        output without html tags (null on error)
220:             * @author        karlpeder, 20030623
221:             *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
222:             *
223:             * @deprecated        Please use the more advanced and correct
224:             *              @see stripHtmlTags(String) method
225:             */
226:            public static String stripHtmlTags(String s, boolean breakToNl) {
227:                // initial check of input:
228:                if (s == null) {
229:                    return null;
230:                }
231:
232:                if (breakToNl) {
233:                    // replace <br> and </br> with newline
234:                    s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
235:
236:                    // replace </p> with double newline
237:                    s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
238:                }
239:
240:                // strip tags
241:                s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
242:
243:                return s;
244:            }
245:
246:            /**
247:             * Performs in large terms the reverse of
248:             * substituteSpecialCharacters (though br tags are not
249:             * converted to newlines, this should be handled separately).
250:             * More preciesly it changes special entities like
251:             * amp, nbsp etc. to their real counter parts: &, space etc.
252:             * <br>
253:             * This includes transformation of special (language specific) chars
254:             * such as the Danish ? ? ? ? ? ?.
255:             *
256:             * @param        s        input string
257:             * @return        output with special entities replaced with their
258:             *                         "real" counter parts (null on error)
259:             * @author  karlpeder, 20030623
260:             *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
261:             */
262:            public static String restoreSpecialCharacters(Charset charset,
263:                    String s) {
264:
265:                //First replace all special entities
266:                for (int i = 0; i < SPECIAL_ENTITIES.length; i++) {
267:                    s = s.replaceAll(SPECIAL_ENTITIES[i], ENTITY_STRINGS[i]);
268:                }
269:
270:                StringBuffer result = new StringBuffer(s.length());
271:
272:                //replace the other entities
273:                Matcher matcher = SPECIAL_PATTERN.matcher(s);
274:                while (matcher.find()) {
275:                    matcher.appendReplacement(result, charset.decode(
276:                            ByteBuffer.wrap(new byte[] { (byte) Integer
277:                                    .parseInt(matcher.group(1)) })).toString());
278:                }
279:                matcher.appendTail(result);
280:
281:                //Convert 4 WS in a row to a tab
282:                return result.toString().replaceAll("    ", "\t");
283:            }
284:
285:            public static Charset getHtmlCharset(String htmlSource) {
286:                Matcher matcher = CHARSET_PATTERN.matcher(htmlSource);
287:                if (matcher.find()) {
288:                    try {
289:                        return Charset.forName(matcher.group(1));
290:                    } catch (RuntimeException e) {
291:                    }
292:                }
293:
294:                return Charset.forName(System.getProperty("file.encoding"));
295:            }
296:
297:            /**
298:             * Strips html tags. and replaces special entities with their
299:             * "normal" counter parts, e.g. <code>&gt; => ></code>.<br>
300:             * Calling this method is the same as calling first stripHtmlTags
301:             * and then restoreSpecialCharacters.
302:             *
303:             * @param        html        input string
304:             * @return        output without html tags and special entities
305:             *                         (null on error)
306:             * @author        karlpeder, 20030623
307:             *                         (moved from org.columba.mail.parser.text.BodyTextParser)
308:             */
309:            public static String htmlToText(String html) {
310:                // stripHtmlTags called with true ~ p & br => newlines
311:                Charset charset = getHtmlCharset(html);
312:
313:                String text = stripHtmlTags(html);
314:
315:                return restoreSpecialCharacters(charset, text);
316:            }
317:
318:            /**
319:             * Replaces special chars - <,>,&,\t,\n," - with the special
320:             * entities used in html (amp, nbsp, ...). Then the complete
321:             * text is surrounded with proper html tags: Starting- and
322:             * ending html tag, header section and body section.
323:             * The complete body section is sorround with p tags.
324:             * <br>
325:             * This is the same as first calling substituteSpecialCharacters
326:             * and then add starting and ending html tags etc.
327:             * <br>
328:             * Further more urls and email adresses are converted into links
329:             * Optionally a title and css definition is inserted in the
330:             * html header.
331:             * <br>
332:             *
333:             * TODO (@author fdietz): Add support for smilies and coloring of quoted text
334:             *
335:             * @param        text        Text to convert to html
336:             * @param        title        Title to include in header, not used if null
337:             * @param        css                Style sheet def. to include in header,
338:             *                                         not used if null.
339:             *                                         The input shall not include the style tag
340:             * @return        Text converted to html
341:             * @author        Karl Peder Olesen (karlpeder), 20030916
342:             */
343:            public static String textToHtml(String text, String title,
344:                    String css, String charset) {
345:                // convert special characters
346:                String html = HtmlParser.substituteSpecialCharacters(text);
347:
348:                // parse for urls / email adresses and substite with HTML-code
349:                // html = HtmlParser.substituteURL(html);
350:                // html = HtmlParser.substituteEmailAddress(html);
351:
352:                // insert surrounding html tags
353:                StringBuffer buf = new StringBuffer();
354:                buf.append("<html><head>");
355:                buf
356:                        .append("<meta http-equiv=\"Content-Type\" content=\"text/html;charset="
357:                                + charset + "\">");
358:
359:                if (title != null) {
360:                    buf.append("<title>");
361:                    buf.append(title);
362:                    buf.append("</title>");
363:                }
364:
365:                if (css != null) {
366:                    buf.append("<style type=\"text/css\"><!-- ");
367:                    buf.append(css);
368:                    buf.append(" --></style>");
369:                }
370:
371:                buf.append("</head><body><p>");
372:                buf.append(html);
373:                buf.append("</p></body></html>");
374:
375:                return buf.toString();
376:            }
377:
378:            /**
379:             * Substitute special characters like:
380:             * <,>,&,\t,\n,"
381:             * with special entities used in html (amp, nbsp, ...)
382:             *
383:             * @param        s        input string containing special characters
384:             * @return        output with special characters substituted
385:             *                         (null on error)
386:             */
387:            public static String substituteSpecialCharacters(String s) {
388:                StringBuffer sb = new StringBuffer(s.length());
389:                StringReader sr = new StringReader(s);
390:                BufferedReader br = new BufferedReader(sr);
391:                String ss = null;
392:
393:                try {
394:                    while ((ss = br.readLine()) != null) {
395:                        int i = 0;
396:
397:                        while (i < ss.length()) {
398:                            switch (ss.charAt(i)) {
399:                            case '<':
400:                                sb.append("&lt;");
401:                                i++;
402:
403:                                break;
404:
405:                            case '>':
406:                                sb.append("&gt;");
407:                                i++;
408:
409:                                break;
410:
411:                            case '&':
412:                                sb.append("&amp;");
413:                                i++;
414:
415:                                break;
416:
417:                            case '"':
418:                                sb.append("&quot;");
419:                                i++;
420:
421:                                break;
422:
423:                            case ' ':
424:
425:                                //sb.append("&nbsp;");
426:                                if (ss.substring(i).startsWith("    ")) {
427:                                    sb.append("&nbsp; ");
428:                                    i = i + 2;
429:                                } else if (ss.substring(i).startsWith("   ")) {
430:                                    sb.append("&nbsp;&nbsp; ");
431:                                    i = i + 3;
432:                                } else if (ss.substring(i).startsWith("  ")) {
433:                                    sb.append("&nbsp; ");
434:                                    i = i + 2;
435:                                } else {
436:                                    sb.append(' ');
437:                                    i++;
438:                                }
439:
440:                                break;
441:
442:                            case '\t':
443:                                sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
444:                                i++;
445:
446:                                break;
447:
448:                            case '\n':
449:                                sb.append("<br>");
450:                                i++;
451:
452:                                break;
453:
454:                            default:
455:                                sb.append(ss.charAt(i));
456:                                i++;
457:
458:                                break;
459:                            }
460:                        }
461:
462:                        sb.append("<br>\n");
463:                    }
464:                } catch (Exception e) {
465:                    LOG.severe("Error substituting special characters: "
466:                            + e.getMessage());
467:
468:                    return null; // error
469:                }
470:
471:                return sb.toString();
472:            }
473:
474:            /**
475:             *
476:             * substitute special characters like:
477:             * <,>,&,\t,\n
478:             * with special entities used in html<br>
479:             * This is the same as substituteSpecialCharacters, but
480:             * here an extra newline character is not inserted.
481:             *
482:             * @param        s        input string containing special characters
483:             * @return        output with special characters substituted
484:             *                         (null on error)
485:             */
486:            public static String substituteSpecialCharactersInHeaderfields(
487:                    String s) {
488:                StringBuffer sb = new StringBuffer(s.length());
489:                StringReader sr = new StringReader(s);
490:                BufferedReader br = new BufferedReader(sr);
491:                String ss = null;
492:
493:                // TODO (@author karlpeder): Extend handling of special entities as in restoreSpecialCharacters
494:
495:                /*
496:                 * *20030623, karlpeder* " and space handled also
497:                 */
498:                try {
499:                    while ((ss = br.readLine()) != null) {
500:                        int i = 0;
501:
502:                        while (i < ss.length()) {
503:                            switch (ss.charAt(i)) {
504:                            case '<':
505:                                sb.append("&lt;");
506:                                i++;
507:
508:                                break;
509:
510:                            case '>':
511:                                sb.append("&gt;");
512:                                i++;
513:
514:                                break;
515:
516:                            case '&':
517:                                sb.append("&amp;");
518:                                i++;
519:
520:                                break;
521:
522:                            case '"':
523:                                sb.append("&quot;");
524:                                i++;
525:
526:                                break;
527:
528:                            /*
529:                            case '\'':
530:                            sb.append("&apos;");
531:                            i++;
532:
533:                            break;*/
534:
535:                            case ' ':
536:
537:                                if (ss.substring(i).startsWith("    ")) {
538:                                    sb.append("&nbsp; ");
539:                                    i = i + 2;
540:                                } else if (ss.substring(i).startsWith("   ")) {
541:                                    sb.append("&nbsp;&nbsp; ");
542:                                    i = i + 3;
543:                                } else if (ss.substring(i).startsWith("  ")) {
544:                                    sb.append("&nbsp; ");
545:                                    i = i + 2;
546:                                } else {
547:                                    sb.append(' ');
548:                                    i++;
549:                                }
550:
551:                                break;
552:
553:                            case '\t':
554:                                sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
555:                                i++;
556:
557:                                break;
558:
559:                            case '\n':
560:                                sb.append("<br>");
561:                                i++;
562:
563:                                break;
564:
565:                            default:
566:                                sb.append(ss.charAt(i));
567:                                i++;
568:
569:                                break;
570:                            }
571:                        }
572:                    }
573:                } catch (Exception e) {
574:                    LOG.severe("Error substituting special characters: "
575:                            + e.getMessage());
576:
577:                    return null; // error
578:                }
579:
580:                return sb.toString();
581:            }
582:
583:            /**
584:             * Tries to fix broken html-strings by inserting
585:             * html start- and end tags if missing, and by
586:             * removing content after the html end tag.
587:             *
588:             * @param        input        html content to be validated
589:             * @return        content with extra tags inserted if necessary
590:             */
591:            public static String validateHTMLString(String input) {
592:                StringBuffer output = new StringBuffer(input);
593:                int index = 0;
594:
595:                String lowerCaseInput = input.toLowerCase();
596:
597:                // Check for missing  <html> tag
598:                if (lowerCaseInput.indexOf("<html>") == -1) {
599:                    if (lowerCaseInput.indexOf("<!doctype") != -1) {
600:                        index = lowerCaseInput.indexOf("\n", lowerCaseInput
601:                                .indexOf("<!doctype")) + 1;
602:                    }
603:
604:                    output.insert(index, "<html>");
605:                }
606:
607:                // Check for missing  </html> tag
608:                if (lowerCaseInput.indexOf("</html>") == -1) {
609:                    output.append("</html>");
610:                }
611:
612:                // remove characters after </html> tag
613:                index = lowerCaseInput.indexOf("</html>");
614:
615:                if (lowerCaseInput.length() >= (index + 7)) {
616:                    lowerCaseInput = lowerCaseInput.substring(0, index + 7);
617:                }
618:
619:                return output.toString();
620:            }
621:
622:            /**
623:             * parse text and transform every email-address
624:             * in a HTML-conform address
625:             *
626:             * @param        s        input text
627:             * @return        text with email-adresses transformed to links
628:             *                         (null on error)
629:             */
630:            public static String substituteEmailAddress(String s) {
631:                // due to bug CA-174 changed: return EMAIL_PATTERN.matcher(s).replaceAll("<A HREF=\"mailto:$1\">$1</A>");
632:                return substituteEmailAddress(s, false);
633:            }
634:
635:            /**
636:             * Transforms email-addresses into HTML just as
637:             * substituteEmailAddress(String), but tries to ignore email-addresses,
638:             * which are already links, if the ignore links flag is set.
639:             * <br>
640:             * This extended functionality is necessary when parsing a text which
641:             * is already (partly) html.
642:             * <br>
643:             * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
644:             *
645:             * @param         s                                input text
646:             * @param        ignoreLinks                if true link tags are ignored. This gives a
647:             *                                                         wrong result if some e-mail adresses are
648:             *                                                         already links (but uses reg. expr. directly,
649:             *                                                         and is therefore faster)
650:             * @return        text with email-adresses transformed to links
651:             */
652:            public static String substituteEmailAddress(String s,
653:                    boolean ignoreLinks) {
654:                if (ignoreLinks) {
655:                    // Do not take existing link tags into account
656:                    return substituteEmailAddress(s);
657:                }
658:
659:                // initialisation
660:                Matcher noLinkMatcher = EMAIL_PATTERN.matcher(s);
661:                Matcher withLinkMatcher = EMAIL_PATTERN_INC_LINK.matcher(s);
662:                int pos = 0; // current position in s
663:                int length = s.length();
664:                StringBuffer buf = new StringBuffer();
665:
666:                while (pos < length) {
667:                    if (noLinkMatcher.find(pos)) {
668:                        // an email adress was found - check whether its already a link
669:                        int s1 = noLinkMatcher.start();
670:                        int e1 = noLinkMatcher.end();
671:                        boolean insertLink;
672:
673:                        if (withLinkMatcher.find(pos)) {
674:                            // found an email address with links - is it the same?
675:                            int s2 = withLinkMatcher.start();
676:                            int e2 = withLinkMatcher.end();
677:
678:                            if ((s2 < s1) && (e2 > e1)) {
679:                                // same email adress - just append and continue
680:                                buf.append(s.substring(pos, e2));
681:                                pos = e2;
682:                                insertLink = false; // already handled
683:                            } else {
684:                                // not the same
685:                                insertLink = true;
686:                            }
687:                        } else {
688:                            // no match with mailto link tags
689:
690:                            insertLink = true;
691:
692:                            // can be an email address in a link BUG CA-174
693:                            // fix that with looking for an open link in the same line before
694:                            // on the way from left to the current position of the email at s1
695:                            // find the last open link <a
696:                            Matcher openLink = Pattern.compile("<a",
697:                                    Pattern.CASE_INSENSITIVE).matcher(s);
698:                            Matcher closeLink = Pattern.compile("</a>",
699:                                    Pattern.CASE_INSENSITIVE).matcher(s);
700:                            int linkPos = 0;
701:                            int savedLinkPos = -1;
702:                            while (linkPos < s1) {
703:                                savedLinkPos = linkPos;
704:                                if (openLink.find(linkPos))
705:                                    linkPos = openLink.end();
706:                                else
707:                                    break;
708:                            }
709:
710:                            // found an open link
711:                            if (savedLinkPos > -1) {
712:                                // check if it is closed
713:                                if (closeLink.find(savedLinkPos)) {
714:                                    // if the closing mark is after the s1 mark do not insert a link
715:                                    if (closeLink.end() >= s1) {
716:                                        buf.append(s.substring(pos, e1));
717:                                        pos = e1;
718:                                        insertLink = false; // already handled
719:                                    }
720:                                }
721:                            }
722:                        }
723:
724:                        // shall we insert a link?
725:                        if (insertLink) {
726:                            String email = s.substring(s1, e1);
727:                            String link = "<a href=\"mailto:" + email + "\">"
728:                                    + email + "</a>";
729:                            buf.append(s.substring(pos, s1));
730:                            buf.append(link);
731:                            pos = e1;
732:                        }
733:                    } else {
734:                        // no more matches - append rest of string
735:                        buf.append(s.substring(pos));
736:                        pos = length;
737:                    }
738:                }
739:
740:                // return result
741:                String result = buf.toString();
742:                LOG.info("Result:\n" + result);
743:
744:                return result;
745:            }
746:
747:            /**
748:             * parse text and transform every url
749:             * in a HTML-conform url
750:             *
751:             * @param        s        input text
752:             * @return        text with urls transformed to links
753:             *                         (null on error)
754:             */
755:            public static String substituteURL(String s) {
756:                String match;
757:                Matcher m = URL_PATTERN.matcher(s);
758:                StringBuffer sb = new StringBuffer();
759:
760:                int pos = 0;
761:                while (m.find()) {
762:                    match = m.group();
763:
764:                    sb.append(s.substring(pos, m.start()));
765:                    String temp = "";
766:                    // Test if there is a trailing html tag
767:                    if (match.matches(".*<\\w+$") && s.length() > m.end()
768:                            && s.charAt(m.end()) == '>') {
769:                        temp = match.substring(match.lastIndexOf('<'));
770:                        match = match.substring(0, match.lastIndexOf('<'));
771:                    }
772:                    sb.append("<A HREF=\"" + match + "\">" + match + "</A>");
773:                    sb.append(temp);
774:                    pos = m.end();
775:                }
776:
777:                sb.append(s.substring(pos));
778:
779:                return sb.toString();
780:            }
781:
782:            /**
783:             * Transforms urls into HTML just as substituteURL(String),
784:             * but tries to ignore urls, which are already links, if the ignore
785:             * links flag is set.
786:             * <br>
787:             * This extended functionality is necessary when parsing a text which
788:             * is already (partly) html.
789:             * <br>
790:             * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
791:             *
792:             * @param         s                                input text
793:             * @param        ignoreLinks                if true link tags are ignored. This gives a
794:             *                                                         wrong result if some urls are already links
795:             *                                                         (but uses reg. expr. directly, and is
796:             *                                                         therefore faster)
797:             * @return        text with urls
798:             */
799:            public static String substituteURL(String s, boolean ignoreLinks) {
800:                if (ignoreLinks) {
801:                    // Do not take existing link tags into account
802:                    return substituteURL(s);
803:                }
804:
805:                // initialisation
806:                Matcher noLinkMatcher = URL_PATTERN.matcher(s);
807:                Matcher withLinkMatcher = URL_PATTERN_INC_LINK.matcher(s);
808:                int pos = 0; // current position in s
809:                int length = s.length();
810:                StringBuffer buf = new StringBuffer();
811:
812:                while (pos < length) {
813:                    if (noLinkMatcher.find(pos)) {
814:                        // an url - check whether its already a link
815:                        int s1 = noLinkMatcher.start();
816:                        int e1 = noLinkMatcher.end();
817:                        boolean insertLink;
818:
819:                        if (withLinkMatcher.find(pos)) {
820:                            // found an url with links - is it the same?
821:                            int s2 = withLinkMatcher.start();
822:                            int e2 = withLinkMatcher.end();
823:
824:                            if ((s2 < s1) && (e2 > e1)) {
825:                                // same url - just append and continue
826:                                buf.append(s.substring(pos, e2));
827:                                pos = e2;
828:                                insertLink = false; // already handled
829:                            } else {
830:                                // not the same
831:                                insertLink = true;
832:                            }
833:                        } else {
834:                            // no match with link tags
835:                            insertLink = true;
836:                        }
837:
838:                        // shall we insert a link?
839:                        if (insertLink) {
840:                            String url = s.substring(s1, e1);
841:                            String link = "<a href=\"" + url + "\">" + url
842:                                    + "</a>";
843:                            buf.append(s.substring(pos, s1));
844:                            buf.append(link);
845:                            pos = e1;
846:                        }
847:                    } else {
848:                        // no more matches - append rest of string
849:                        buf.append(s.substring(pos));
850:                        pos = length;
851:                    }
852:                }
853:
854:                // return result
855:                String result = buf.toString();
856:                LOG.info("Result:\n" + result);
857:
858:                return result;
859:            }
860:
861:            /**
862:             * Extracts the body of a html document, i.e. the html contents
863:             * between (and not including) body start and end tags.
864:             *
865:             * @param        html        The html document to extract the body from
866:             * @return       The body of the html document
867:             *
868:             * @author        Karl Peder Olesen (karlpeder)
869:             */
870:            public static String getHtmlBody(String html) {
871:                // locate body start- and end tags
872:                String lowerCaseContent = html.toLowerCase();
873:                int tagStart = lowerCaseContent.indexOf("<body");
874:
875:                // search for closing bracket separately to account for attributes in tag
876:                int tagStartClose = lowerCaseContent.indexOf(">", tagStart) + 1;
877:                int tagEnd = lowerCaseContent.indexOf("</body>");
878:
879:                // correct limits if body tags where not found
880:                if (tagStartClose < 0) {
881:                    tagStartClose = 0;
882:                }
883:
884:                if ((tagEnd < 0) || (tagEnd > lowerCaseContent.length())) {
885:                    tagEnd = lowerCaseContent.length();
886:                }
887:
888:                // return body
889:                return html.substring(tagStartClose, tagEnd);
890:            }
891:
892:            /**
893:             * Parses a html documents and removes all html comments found.
894:             *
895:             * @param        html        The html document
896:             * @return        Html document without comments
897:             *
898:             * @author        Karl Peder Olesen (karlpeder)
899:             */
900:            public static String removeComments(String html) {
901:                // remove comments
902:                return COMMENTS_REMOVAL_PATTERN.matcher(html).replaceAll("");
903:            }
904:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.