001: //The contents of this file are subject to the Mozilla Public License Version 1.1
002: //(the "License"); you may not use this file except in compliance with the
003: //License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
004: //
005: //Software distributed under the License is distributed on an "AS IS" basis,
006: //WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
007: //for the specific language governing rights and
008: //limitations under the License.
009: //
010: //The Original Code is "The Columba Project"
011: //
012: //The Initial Developers of the Original Code are Frederik Dietz and Timo Stich.
013: //Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
014: //
015: //All Rights Reserved.
016: package org.columba.mail.parser.text;
017:
import java.io.BufferedReader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
025:
/**
 * Contains different utility functions for manipulating Html based
 * text. This includes functionality for removing and restoring
 * special entities (such as amp, lt, gt, ...) and functionality for
 * removing html tags from the text.
 * <p>
 * All methods are static and keep no mutable state; the compiled
 * patterns are shared constants.
 *
 * @author Karl Peder Olesen (karlpeder), 20030623
 */
public final class HtmlParser {

    private static final Logger LOG = Logger
            .getLogger("org.columba.mail.parser.text");

    // ------------------------------------------------------------------
    // patterns used when converting html to plain text
    // ------------------------------------------------------------------

    // br tags in every common spelling: <br>, </br>, <br/>, <br />, <br clear="all">
    private static final Pattern BREAK_TO_NL_PATTERN = Pattern.compile(
            "</?br\\b[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_TO_DOUBLE_NL_PATTERN = Pattern
            .compile("</p>", Pattern.CASE_INSENSITIVE);
    private static final Pattern DIV_TO_DOUBLE_NL_PATTERN = Pattern
            .compile("</div>", Pattern.CASE_INSENSITIVE);
    private static final Pattern H_TO_DOUBLE_NL_PATTERN = Pattern
            .compile("</h\\d>", Pattern.CASE_INSENSITIVE);
    private static final Pattern WHITE_SPACE_REMOVAL_PATTERN = Pattern
            .compile("\\s+");
    // only horizontal whitespace: "\\s" would also eat the newlines that
    // separate paragraphs
    private static final Pattern TRIM_SPACE_PATTERN = Pattern
            .compile("\n[ \\t]+");
    // nested block elements produce long runs of newlines; keep one empty line
    private static final Pattern EXCESS_NL_PATTERN = Pattern
            .compile("\n{3,}");
    // everything from the html tag up to and including the opening body
    // tag; DOTALL because the head section spans tags and lines
    private static final Pattern HEADER_REMOVAL_PATTERN = Pattern.compile(
            "<html\\b.*?<body\\b[^>]*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern STRIP_TAGS_PATTERN = Pattern
            .compile("<[^>]*>");
    // lazy + DOTALL so that comments containing '-' or newlines are matched
    private static final Pattern COMMENTS_REMOVAL_PATTERN = Pattern
            .compile("<!--.*?-->", Pattern.DOTALL);

    // ------------------------------------------------------------------
    // patterns used when turning e-mail addresses / urls into links
    // ------------------------------------------------------------------

    // Group 1 is the complete address. The host part is written without a
    // nested quantifier: the former "(x+(..)*)+" form needed exponential
    // time on long host names that are not followed by a dot.
    private static final String EMAIL_STR =
            "([a-zA-Z0-9_+\\.-]+@[a-zA-Z0-9]+(?:[\\.-][a-zA-Z0-9]+)*\\.[a-zA-Z]{2,63})";
    private static final Pattern EMAIL_PATTERN = Pattern
            .compile(EMAIL_STR);
    private static final Pattern EMAIL_PATTERN_INC_LINK = Pattern.compile(
            "<a\\s*href=\"?mailto:" + EMAIL_STR + "[^<]*</a>",
            Pattern.CASE_INSENSITIVE);
    private static final Pattern OPEN_LINK_PATTERN = Pattern.compile(
            "<a\\b", Pattern.CASE_INSENSITIVE);
    private static final Pattern CLOSE_LINK_PATTERN = Pattern.compile(
            "</a>", Pattern.CASE_INSENSITIVE);

    private static final String PROT = "(http|https|ftp)";
    private static final String PUNC = ".,:;?!\\-";
    private static final String ANY = "\\S";

    /*
     \\b                 start at word boundary
     (
     (\\w*(:\\S*)?@)?    [user[:pass]]@ - construct
     prot + "://"        protocol and ://
     [any]+              match literally anything that is not whitespace...
     )
     \\b                 ...and end at the last word boundary of that run
     */
    private static final String URL_STR = "\\b" + "("
            + "(\\w*(:\\S*)?@)?" + PROT + "://" + "[" + ANY + "]+"
            + ")" + "\\b";
    private static final Pattern URL_PATTERN = Pattern.compile(URL_STR,
            Pattern.CASE_INSENSITIVE);

    /*
     (.*://.*?)          "something" with ://
     (
     (&gt;).*            a html-encoded > followed by anything
     |                   or
     ([punc]*)           any punctuation
     (<br>)?             0 or 1 trailing <br>
     )$                  end of string

     Not referenced by any method of this class at the moment.
     */
    private static final String URL_REPAIR_STR = "(.*://.*?)" + "("
            + "(&gt;).*|" + "([" + PUNC + "]*)" + "(<br>)?" + ")$";
    private static final Pattern URL_REPAIR_PATTERN = Pattern
            .compile(URL_REPAIR_STR);

    // "[\\s\\S]*?" instead of "(.|\\n)*?": the grouped alternation recurses
    // once per character and overflows the stack on long link texts
    private static final Pattern URL_PATTERN_INC_LINK = Pattern.compile(
            "<a[ \\n]*?href=\"?" + URL_STR + "[\\s\\S]*?</a>",
            Pattern.CASE_INSENSITIVE);
    // an unterminated tag ("<br", "</p") at the very end of an url match
    private static final Pattern TRAILING_TAG_PATTERN = Pattern
            .compile("</?\\w+");

    // ------------------------------------------------------------------
    // special entities
    // ------------------------------------------------------------------

    // TODO (@author fdietz): Add more special entities

    /**
     * Names of the entities for the characters U+00A0 .. U+00FF, in code
     * point order: entry i stands for the character (0xA0 + i).
     */
    private static final String[] LATIN1_ENTITY_NAMES = { "nbsp", "iexcl",
            "cent", "pound", "curren", "yen", "brvbar", "sect", "uml",
            "copy", "ordf", "laquo", "not", "shy", "reg", "macr", "deg",
            "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
            "cedil", "sup1", "ordm", "raquo", "frac14", "frac12",
            "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde",
            "Auml", "Aring", "AElig", "Ccedil", "Egrave", "Eacute",
            "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH",
            "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml",
            "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
            "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
            "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
            "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
            "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc",
            "otilde", "ouml", "divide", "oslash", "ugrave", "uacute",
            "ucirc", "uuml", "yacute", "thorn", "yuml" };

    /** Entity name (without ampersand and semicolon) to replacement text. */
    private static final Map<String, String> NAMED_ENTITIES = createNamedEntities();

    // group 1: decimal number, group 2: hexadecimal number, group 3: name
    private static final Pattern ENTITY_PATTERN = Pattern
            .compile("&(?:#(\\d+)|#[xX]([0-9a-fA-F]+)|(\\w+));");

    // four blanks (plain or no-break) in a row are turned into a tab
    private static final Pattern INDENT_PATTERN = Pattern
            .compile("[ \\u00a0]{4}");

    // accepts charset=x, CHARSET = "x" and charset='x'
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "\\bcharset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);

    // ------------------------------------------------------------------
    // patterns used for document structure
    // ------------------------------------------------------------------

    private static final Pattern HTML_START_PATTERN = Pattern.compile(
            "<html\\b[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern HTML_END_PATTERN = Pattern.compile(
            "</html\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern DOCTYPE_PATTERN = Pattern.compile(
            "<!doctype[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern BODY_START_PATTERN = Pattern.compile(
            "<body\\b[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern BODY_END_PATTERN = Pattern.compile(
            "</body\\s*>", Pattern.CASE_INSENSITIVE);

    /**
     * Utility classes should not have a public constructor.
     */
    private HtmlParser() {
    }

    /** Builds the lookup table used by restoreSpecialCharacters. */
    private static Map<String, String> createNamedEntities() {
        Map<String, String> entities = new HashMap<String, String>();
        entities.put("quot", "\"");
        entities.put("amp", "&");
        entities.put("lt", "<");
        entities.put("gt", ">");

        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            entities.put(LATIN1_ENTITY_NAMES[i],
                    String.valueOf((char) (0xA0 + i)));
        }

        return entities;
    }

    /**
     * Strips html tags and removes extra spaces which occurs due
     * to e.g. indentation of the html and the head section, which does
     * not contain any textual information.
     * <br>
     * The conversion routine does the following:<br>
     * 1. Removes html comments and the header from the html file, i.e.
     *    everything from the html tag until and including the starting
     *    body tag.<br>
     * 2. Replaces multiple consecutive whitespace characters with a single
     *    space (since extra whitespace should be ignored in html).<br>
     * 3. Replaces br tags with a single newline character<br>
     * 4. Replaces ending p, div and heading tags with two newline
     *    characters; resulting in a single empty line btw. paragraphs.<br>
     * 5. Strips remaining html tags.<br>
     * 6. Removes blanks at line beginnings and reduces runs of three or
     *    more newlines to one empty line.<br>
     * <br>
     * NB: The tag stripping is done using a very simple regular expression,
     * which removes everything between {@code <} and {@code >}. Therefore
     * too much text could in some (hopefully rare!?) cases be removed.
     *
     * @param        s        Input string
     * @return        Input stripped for html tags (null if s is null)
     * @author        Karl Peder Olesen (karlpeder)
     */
    public static String stripHtmlTags(String s) {
        if (s == null) {
            return null;
        }

        // comments first: a '>' inside a comment would otherwise end the
        // "tag" too early and leave the rest of the comment in the text
        s = COMMENTS_REMOVAL_PATTERN.matcher(s).replaceAll("");
        s = HEADER_REMOVAL_PATTERN.matcher(s).replaceAll("");

        // source formatting carries no meaning in html
        s = WHITE_SPACE_REMOVAL_PATTERN.matcher(s).replaceAll(" ");

        // from here on every newline in s stems from a tag
        s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
        s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        s = DIV_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        s = H_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");

        s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");

        // tag stripping can leave blanks at line beginnings
        s = TRIM_SPACE_PATTERN.matcher(s).replaceAll("\n");
        s = EXCESS_NL_PATTERN.matcher(s).replaceAll("\n\n");

        return s.trim();
    }

    /**
     * Strips html tags. The method used is very simple:
     * Everything between tag-start and tag-end is removed.
     * Optionally br tags are replaced by newline and ending p tags with
     * double newline.
     *
     * @param        s                input string
     * @param        breakToNl        if true, newlines are inserted for br and p tags
     * @return        output without html tags (null if s is null)
     * @author  karlpeder, 20030623
     *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
     *
     * @deprecated        Please use the more advanced and correct
     *              {@link #stripHtmlTags(String)} method
     */
    @Deprecated
    public static String stripHtmlTags(String s, boolean breakToNl) {
        if (s == null) {
            return null;
        }

        if (breakToNl) {
            s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
            s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        }

        return STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Performs in large terms the reverse of
     * substituteSpecialCharacters (though br tags are not
     * converted to newlines, this should be handled separately).
     * More precisely it changes special entities like
     * amp, nbsp etc. to their real counter parts.
     * <br>
     * Handled are the named entities quot, amp, lt, gt and those of the
     * characters U+00A0 to U+00FF, plus decimal and hexadecimal numeric
     * character references. Unknown or invalid entities are left
     * untouched. The text is processed in a single pass, so the result
     * of one replacement is never decoded a second time.
     * <br>
     * Finally every run of four blanks (space or no-break space) is
     * converted to a tab.
     *
     * @param        charset        charset of the document; only consulted for
     *                                 numeric references 128-159, which are decoded
     *                                 as a byte of this charset. May be null.
     * @param        s                input string
     * @return        output with special entities replaced with their
     *                         "real" counter parts (null if s is null)
     * @author  karlpeder, 20030623
     *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
     */
    public static String restoreSpecialCharacters(Charset charset,
            String s) {
        if (s == null) {
            return null;
        }

        StringBuilder result = new StringBuilder(s.length());
        Matcher matcher = ENTITY_PATTERN.matcher(s);
        int copiedUpTo = 0;

        while (matcher.find()) {
            String replacement;

            if (matcher.group(1) != null) {
                replacement = decodeNumericEntity(matcher.group(1), 10,
                        charset);
            } else if (matcher.group(2) != null) {
                replacement = decodeNumericEntity(matcher.group(2), 16,
                        charset);
            } else {
                replacement = NAMED_ENTITIES.get(matcher.group(3));
            }

            // appended directly (no appendReplacement), so '$' and '\'
            // in the replacement need no quoting
            result.append(s, copiedUpTo, matcher.start());
            result.append((replacement != null) ? replacement : matcher
                    .group());
            copiedUpTo = matcher.end();
        }

        result.append(s, copiedUpTo, s.length());

        // Convert 4 WS in a row to a tab
        return INDENT_PATTERN.matcher(result).replaceAll("\t");
    }

    /**
     * Decodes the number of a numeric character reference.
     *
     * @return the character(s), or null if the number is not a usable
     *         code point
     */
    private static String decodeNumericEntity(String digits, int radix,
            Charset charset) {
        int codePoint;

        try {
            codePoint = Integer.parseInt(digits, radix);
        } catch (NumberFormatException e) {
            return null; // does not fit into an int
        }

        // U+0080..U+009F are control characters; in practice such
        // references are byte values of the document's own charset
        if ((codePoint >= 0x80) && (codePoint <= 0x9F) && (charset != null)) {
            return charset.decode(
                    ByteBuffer.wrap(new byte[] { (byte) codePoint }))
                    .toString();
        }

        boolean surrogate = (codePoint >= 0xD800) && (codePoint <= 0xDFFF);

        if ((codePoint <= 0) || surrogate
                || !Character.isValidCodePoint(codePoint)) {
            return null;
        }

        return new String(Character.toChars(codePoint));
    }

    /**
     * Determines the charset declared in a html document by looking for
     * the first {@code charset=...} declaration.
     *
     * @param        htmlSource        the html document, may be null
     * @return        the declared charset; the platform default charset if
     *                         none is declared or the declared one is unknown
     */
    public static Charset getHtmlCharset(String htmlSource) {
        if (htmlSource != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(htmlSource);

            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException e) {
                    // illegal or unsupported name - use the default below
                    LOG.fine("Unknown charset in html source: "
                            + matcher.group(1));
                }
            }
        }

        return Charset.defaultCharset();
    }

    /**
     * Strips html tags and replaces special entities with their
     * "normal" counter parts.<br>
     * Calling this method is the same as calling first stripHtmlTags
     * and then restoreSpecialCharacters with the charset declared in
     * the document.
     *
     * @param        html        input string
     * @return        output without html tags and special entities
     *                         (null if html is null)
     * @author        karlpeder, 20030623
     *                         (moved from org.columba.mail.parser.text.BodyTextParser)
     */
    public static String htmlToText(String html) {
        if (html == null) {
            return null;
        }

        // must be read before the header is stripped
        Charset charset = getHtmlCharset(html);

        return restoreSpecialCharacters(charset, stripHtmlTags(html));
    }

    /**
     * Replaces special chars with the special entities used in html
     * (amp, nbsp, ...). Then the complete text is surrounded with proper
     * html tags: Starting- and ending html tag, header section and body
     * section. The complete body section is surrounded with p tags.
     * <br>
     * This is the same as first calling substituteSpecialCharacters
     * and then add starting and ending html tags etc.
     * <br>
     * Optionally a title and css definition is inserted in the
     * html header. Title and css are inserted as given, without escaping.
     * <br>
     * Urls and email addresses are currently not converted into links.
     *
     * TODO (@author fdietz): Add support for smilies and coloring of quoted text
     *
     * @param        text        Text to convert to html, null is treated as empty
     * @param        title        Title to include in header, not used if null
     * @param        css                Style sheet def. to include in header,
     *                                         not used if null.
     *                                         The input shall not include the style tag
     * @param        charset        Charset name for the Content-Type meta tag,
     *                                         the tag is left out if null
     * @return        Text converted to html
     * @author        Karl Peder Olesen (karlpeder), 20030916
     */
    public static String textToHtml(String text, String title,
            String css, String charset) {
        String html = (text == null) ? "" : substituteSpecialCharacters(text);

        StringBuilder buf = new StringBuilder(html.length() + 160);
        buf.append("<html><head>");

        if (charset != null) {
            buf.append(
                    "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=")
                    .append(charset).append("\">");
        }

        if (title != null) {
            buf.append("<title>").append(title).append("</title>");
        }

        if (css != null) {
            buf.append("<style type=\"text/css\"><!-- ").append(css)
                    .append(" --></style>");
        }

        buf.append("</head><body><p>");
        buf.append(html);
        buf.append("</p></body></html>");

        return buf.toString();
    }

    /**
     * Substitute special characters (less-than, greater-than, ampersand,
     * double quote, tab and runs of spaces) with special entities used
     * in html (amp, nbsp, ...). Every line of the input, whether ended
     * by CR, LF or CRLF or by the end of the input, is followed by a
     * br tag and a newline in the output.
     *
     * @param        s        input string containing special characters
     * @return        output with special characters substituted
     *                         (null if s is null)
     */
    public static String substituteSpecialCharacters(String s) {
        return escapeSpecialCharacters(s, true);
    }

    /**
     * Substitutes the same special characters as
     * substituteSpecialCharacters, but here nothing is inserted for
     * line ends: line terminators in the input are dropped.
     *
     * TODO (@author karlpeder): Extend handling of special entities as in restoreSpecialCharacters
     *
     * @param        s        input string containing special characters
     * @return        output with special characters substituted
     *                         (null if s is null)
     */
    public static String substituteSpecialCharactersInHeaderfields(
            String s) {
        return escapeSpecialCharacters(s, false);
    }

    /**
     * Shared implementation of the two substitute methods.
     *
     * @param lineBreaks if true, every input line is followed by a br tag
     */
    private static String escapeSpecialCharacters(String s,
            boolean lineBreaks) {
        if (s == null) {
            return null;
        }

        final int length = s.length();
        StringBuilder sb = new StringBuilder(length + 16);
        // true while the current line has characters that still need
        // their line break (a terminator at the very end adds no line)
        boolean openLine = false;
        int i = 0;

        while (i < length) {
            char c = s.charAt(i);

            switch (c) {
            case '\r':
            case '\n':
                // CRLF counts as one line terminator
                if ((c == '\r') && (i + 1 < length)
                        && (s.charAt(i + 1) == '\n')) {
                    i++;
                }

                if (lineBreaks) {
                    sb.append("<br>\n");
                }

                openLine = false;
                i++;

                continue;

            case '<':
                sb.append("&lt;");
                i++;

                break;

            case '>':
                sb.append("&gt;");
                i++;

                break;

            case '&':
                sb.append("&amp;");
                i++;

                break;

            case '"':
                sb.append("&quot;");
                i++;

                break;

            case ' ':
                i += appendSpaces(s, i, sb);

                break;

            case '\t':
                sb.append("&nbsp;&nbsp;&nbsp; ");
                i++;

                break;

            default:
                sb.append(c);
                i++;

                break;
            }

            openLine = true;
        }

        if (openLine && lineBreaks) {
            sb.append("<br>\n");
        }

        return sb.toString();
    }

    /**
     * Encodes the run of spaces starting at index i such that a browser
     * does not collapse it: no-break spaces followed by one plain space.
     *
     * @return number of input characters consumed
     */
    private static int appendSpaces(String s, int i, StringBuilder sb) {
        if (s.startsWith("    ", i)) {
            // four or more: take two now, the rest in the next round
            sb.append("&nbsp; ");

            return 2;
        } else if (s.startsWith("   ", i)) {
            sb.append("&nbsp;&nbsp; ");

            return 3;
        } else if (s.startsWith("  ", i)) {
            sb.append("&nbsp; ");

            return 2;
        }

        sb.append(' ');

        return 1;
    }

    /**
     * Tries to fix broken html-strings by inserting
     * html start- and end tags if missing, and by
     * removing content after the html end tag.
     * A missing start tag is inserted after the doctype declaration,
     * if there is one, else at the very beginning.
     *
     * @param input                html content to be validated
     * @return content with extra tags inserted if necessary
     *         (null if input is null)
     */
    public static String validateHTMLString(String input) {
        if (input == null) {
            return null;
        }

        StringBuilder output = new StringBuilder(input);

        Matcher htmlEnd = HTML_END_PATTERN.matcher(input);

        if (htmlEnd.find()) {
            // remove characters after </html> tag
            output.setLength(htmlEnd.end());
        } else {
            output.append("</html>");
        }

        if (!HTML_START_PATTERN.matcher(output).find()) {
            Matcher doctype = DOCTYPE_PATTERN.matcher(output);
            int index = doctype.find() ? doctype.end() : 0;
            output.insert(index, "<html>");
        }

        return output.toString();
    }

    /**
     * parse text and transform every email-address
     * in a HTML-conform address
     *
     * @param        s        input text
     * @return        text with email-adresses transformed to links
     *                         (null if s is null)
     */
    public static String substituteEmailAddress(String s) {
        // due to bug CA-174 existing links are always taken into account
        return substituteEmailAddress(s, false);
    }

    /**
     * Transforms email-addresses into links, but leaves addresses alone
     * which are already part of a link: either a mailto link for that
     * address, or any other a element enclosing the address (its href
     * value included).
     * <br>
     * This functionality is necessary when parsing a text which
     * is already (partly) html.
     *
     * @param        s                        input text
     * @param        ignoreLinks        has no effect any more: since the fix for
     *                                                 bug CA-174 existing links are respected
     *                                                 for both values
     * @return        text with email-adresses transformed to links
     *                         (null if s is null)
     */
    public static String substituteEmailAddress(String s,
            boolean ignoreLinks) {
        if (s == null) {
            return null;
        }

        Matcher plainMatcher = EMAIL_PATTERN.matcher(s);
        Matcher linkedMatcher = EMAIL_PATTERN_INC_LINK.matcher(s);
        final int length = s.length();
        StringBuilder buf = new StringBuilder(length + 64);
        int pos = 0; // everything before pos has been copied to buf

        while ((pos < length) && plainMatcher.find(pos)) {
            int start = plainMatcher.start();
            int end = plainMatcher.end();

            if (linkedMatcher.find(pos) && (linkedMatcher.start() < start)
                    && (linkedMatcher.end() > end)) {
                // address belongs to a mailto link - copy the link as is
                buf.append(s, pos, linkedMatcher.end());
                pos = linkedMatcher.end();
            } else if (isInsideLink(s, start)) {
                // address inside some other link (bug CA-174)
                buf.append(s, pos, end);
                pos = end;
            } else {
                String email = s.substring(start, end);
                buf.append(s, pos, start);
                buf.append("<a href=\"mailto:").append(email).append("\">")
                        .append(email).append("</a>");
                pos = end;
            }
        }

        // no more matches - append rest of string
        buf.append(s, pos, length);

        String result = buf.toString();
        logResult(result);

        return result;
    }

    /**
     * Tells whether the position index lies inside an a element: the
     * last a start tag before index has its end tag behind index.
     * An a tag which is never closed does not count.
     */
    private static boolean isInsideLink(String s, int index) {
        Matcher open = OPEN_LINK_PATTERN.matcher(s);
        int lastOpenEnd = -1;

        while (open.find() && (open.start() < index)) {
            lastOpenEnd = open.end();
        }

        if (lastOpenEnd < 0) {
            return false; // no link starts before index
        }

        Matcher close = CLOSE_LINK_PATTERN.matcher(s);

        return close.find(lastOpenEnd) && (close.start() >= index);
    }

    /**
     * parse text and transform every url
     * in a HTML-conform url. Existing links are not taken into account.
     *
     * @param        s        input text
     * @return        text with urls transformed to links
     *                         (null if s is null)
     */
    public static String substituteURL(String s) {
        if (s == null) {
            return null;
        }

        Matcher m = URL_PATTERN.matcher(s);
        StringBuilder sb = new StringBuilder(s.length() + 64);
        int pos = 0;

        while (m.find()) {
            // the url pattern runs into a directly following tag
            int urlEnd = trailingTagStart(s, m.start(), m.end());
            String url = s.substring(m.start(), urlEnd);

            sb.append(s, pos, m.start());
            sb.append("<A HREF=\"").append(url).append("\">").append(url)
                    .append("</A>");
            sb.append(s, urlEnd, m.end());
            pos = m.end();
        }

        sb.append(s, pos, s.length());

        return sb.toString();
    }

    /**
     * Transforms urls into HTML just as substituteURL(String),
     * but ignores urls which are already links, unless the ignore
     * links flag is set.
     * <br>
     * This extended functionality is necessary when parsing a text which
     * is already (partly) html.
     *
     * @param        s                        input text
     * @param        ignoreLinks        if true link tags are ignored. This gives a
     *                                                 wrong result if some urls are already links
     *                                                 (but uses reg. expr. directly, and is
     *                                                 therefore faster)
     * @return        text with urls transformed to links (null if s is null)
     */
    public static String substituteURL(String s, boolean ignoreLinks) {
        if (s == null) {
            return null;
        }

        if (ignoreLinks) {
            // Do not take existing link tags into account
            return substituteURL(s);
        }

        Matcher plainMatcher = URL_PATTERN.matcher(s);
        Matcher linkedMatcher = URL_PATTERN_INC_LINK.matcher(s);
        final int length = s.length();
        StringBuilder buf = new StringBuilder(length + 64);
        int pos = 0; // everything before pos has been copied to buf

        while ((pos < length) && plainMatcher.find(pos)) {
            int start = plainMatcher.start();
            int end = plainMatcher.end();

            if (linkedMatcher.find(pos) && (linkedMatcher.start() < start)
                    && (linkedMatcher.end() > end)) {
                // url is the target of an existing link - copy link as is
                buf.append(s, pos, linkedMatcher.end());
                pos = linkedMatcher.end();
            } else {
                int urlEnd = trailingTagStart(s, start, end);
                String url = s.substring(start, urlEnd);

                buf.append(s, pos, start);
                buf.append("<a href=\"").append(url).append("\">")
                        .append(url).append("</a>");
                buf.append(s, urlEnd, end);
                pos = end;
            }
        }

        // no more matches - append rest of string
        buf.append(s, pos, length);

        String result = buf.toString();
        logResult(result);

        return result;
    }

    /**
     * An url match s[start, end) may end with the beginning of a tag,
     * e.g. "http://host/path&lt;/p" directly followed by "&gt;".
     *
     * @return index of the '&lt;' starting such a tag, or end if the
     *         match does not run into a tag
     */
    private static int trailingTagStart(String s, int start, int end) {
        if ((end < s.length()) && (s.charAt(end) == '>')) {
            int tagStart = s.lastIndexOf('<', end - 1);

            if ((tagStart > start)
                    && TRAILING_TAG_PATTERN.matcher(
                            s.substring(tagStart, end)).matches()) {
                return tagStart;
            }
        }

        return end;
    }

    /** Logs a conversion result; it is message content, hence FINE only. */
    private static void logResult(String result) {
        if (LOG.isLoggable(Level.FINE)) {
            LOG.fine("Result:\n" + result);
        }
    }

    /**
     * Extracts the body of a html document, i.e. the html contents
     * between (and not including) body start and end tags.
     * If the start tag is missing the result begins at the start of
     * the document, if the end tag is missing it extends to its end.
     *
     * @param        html        The html document to extract the body from
     * @return        The body of the html document (null if html is null)
     *
     * @author        Karl Peder Olesen (karlpeder)
     */
    public static String getHtmlBody(String html) {
        if (html == null) {
            return null;
        }

        // the pattern accounts for attributes in the start tag
        Matcher bodyStart = BODY_START_PATTERN.matcher(html);
        int from = bodyStart.find() ? bodyStart.end() : 0;

        // an end tag before the start tag is of no use
        Matcher bodyEnd = BODY_END_PATTERN.matcher(html);
        int to = bodyEnd.find(from) ? bodyEnd.start() : html.length();

        return html.substring(from, to);
    }

    /**
     * Parses a html documents and removes all html comments found.
     *
     * @param        html        The html document
     * @return        Html document without comments (null if html is null)
     *
     * @author        Karl Peder Olesen (karlpeder)
     */
    public static String removeComments(String html) {
        if (html == null) {
            return null;
        }

        return COMMENTS_REMOVAL_PATTERN.matcher(html).replaceAll("");
    }
}
|