001: package org.opencms.util;
002:
003: import java.util.HashMap;
004: import java.util.Iterator;
005: import java.util.List;
006: import java.util.Map;
007:
008: import org.htmlparser.Tag;
009: import org.htmlparser.Text;
010: import org.htmlparser.util.Translate;
011:
012: /**
013: * Extracts the HTML page content.<p>
014: */
015: public class CmsHtml2TextConverter extends CmsHtmlParser {
016:
017: /** Indicated to append or store the next line breaks. */
018: private boolean m_appendBr;
019:
020: /** Map of stored attributes that must bw written to the output when the tag closes. */
021: private Map m_attributeMap;
022:
023: /** The last appended line break count. */
024: private int m_brCount;
025:
026: /** The current indentation. */
027: private int m_indent;
028:
029: /** The current line length. */
030: private int m_lineLength;
031:
032: /** The marker String (for headlines, bullets etc.). */
033: private String m_marker;
034:
035: /** The maximum line length. */
036: private int m_maxLineLength;
037:
038: /** The last stored, but not appended line break count. */
039: private int m_storedBrCount;
040:
041: /**
042: * Creates a new instance of the html converter.<p>
043: */
044: public CmsHtml2TextConverter() {
045:
046: m_result = new StringBuffer(512);
047: m_maxLineLength = 100;
048: m_attributeMap = new HashMap(16);
049: }
050:
051: /**
052: * Extracts the text from the given html content, assuming the given html encoding.<p>
053: *
054: * @param html the content to extract the plain text from
055: * @param encoding the encoding to use
056: *
057: * @return the text extracted from the given html content
058: *
059: * @throws Exception if something goes wrong
060: */
061: public static String html2text(String html, String encoding)
062: throws Exception {
063:
064: // create the converter instance
065: CmsHtml2TextConverter visitor = new CmsHtml2TextConverter();
066: return visitor.process(html, encoding);
067: }
068:
069: /**
070: * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
071: */
072: public void visitEndTag(Tag tag) {
073:
074: m_appendBr = false;
075: appendLinebreaks(tag, false);
076: String attribute = (String) m_attributeMap.remove(tag
077: .getParent());
078: if (attribute != null) {
079: appendText(attribute);
080: }
081: }
082:
083: /**
084: * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
085: */
086: public void visitStringNode(Text text) {
087:
088: appendText(text.toPlainTextString());
089: }
090:
091: /**
092: * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
093: */
094: public void visitTag(Tag tag) {
095:
096: m_appendBr = true;
097: appendLinebreaks(tag, true);
098:
099: if (tag.getTagName().equals("IMG")) {
100: appendText("##IMG##");
101: }
102:
103: String href = tag.getAttribute("href");
104: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(href)) {
105: appendAttribute(tag, " [" + href.trim() + "]");
106: }
107: String src = tag.getAttribute("src");
108: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(src)) {
109: appendAttribute(tag, " [" + src.trim() + "]");
110: }
111: String title = tag.getAttribute("title");
112: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(title)) {
113: appendAttribute(tag, " {" + title.trim() + "}");
114: }
115: String alt = tag.getAttribute("alt");
116: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(alt)) {
117: appendAttribute(tag, " {" + alt.trim() + "}");
118: }
119: }
120:
121: private void appendAttribute(Tag tag, String text) {
122:
123: if (tag.getTagName().equals("IMG")) {
124: appendText(text);
125: } else {
126: String current = (String) m_attributeMap.get(tag);
127: if (current != null) {
128: text = current + text;
129: }
130: m_attributeMap.put(tag, text);
131: }
132: }
133:
134: private void appendIndentation() {
135:
136: if (m_lineLength <= m_indent) {
137: int len = (m_marker != null) ? m_indent
138: - (m_marker.length() + 1) : m_indent;
139: for (int i = 0; i < len; i++) {
140: m_result.append(' ');
141: }
142: if (m_marker != null) {
143: m_result.append(m_marker);
144: m_result.append(' ');
145: m_marker = null;
146: }
147: }
148: }
149:
150: private void appendLinebreak(int count) {
151:
152: appendLinebreak(count, false);
153: }
154:
155: private void appendLinebreak(int count, boolean force) {
156:
157: if (m_appendBr) {
158: if (m_storedBrCount > count) {
159: count = m_storedBrCount;
160: }
161: m_storedBrCount = 0;
162: if (force) {
163: m_brCount = 0;
164: }
165: while (m_brCount < count) {
166: m_result.append("\r\n");
167: m_brCount++;
168: }
169: m_lineLength = m_indent;
170: } else {
171: while (m_storedBrCount < count) {
172: m_storedBrCount++;
173: }
174: }
175: }
176:
177: private void appendLinebreaks(Tag tag, boolean open) {
178:
179: String name = tag.getTagName();
180: int pos = TAG_LIST.indexOf(name);
181:
182: switch (pos) {
183: case 0: // H1
184: setMarker("=", open);
185: setIndentation(2, open);
186: appendLinebreak(2);
187: break;
188: case 1: // H2
189: setMarker("==", open);
190: setIndentation(3, open);
191: appendLinebreak(2);
192: break;
193: case 2: // H3
194: setMarker("===", open);
195: setIndentation(4, open);
196: appendLinebreak(2);
197: break;
198: case 3: // H4
199: setMarker("====", open);
200: setIndentation(5, open);
201: appendLinebreak(2);
202: break;
203: case 4: // H5
204: setMarker("=====", open);
205: setIndentation(6, open);
206: appendLinebreak(2);
207: break;
208: case 5: // H6
209: setMarker("=======", open);
210: setIndentation(7, open);
211: appendLinebreak(2);
212: break;
213: case 6: // P
214: case 7: // DIV
215: appendLinebreak(2);
216: break;
217: case 8: // SPAN
218: break;
219: case 9: // BR
220: appendLinebreak(1, true);
221: break;
222: case 10: // OL
223: case 11: // UL
224: appendLinebreak(2);
225: break;
226: case 12: // LI
227: setMarker("*", open);
228: setIndentation(5, open);
229: appendLinebreak(1);
230: break;
231: case 13: // TABLE
232: setIndentation(5, open);
233: appendLinebreak(2);
234: if (open) {
235: appendLinebreak(1);
236: appendText("-----");
237: appendLinebreak(1);
238: }
239: break;
240: case 14: // TD
241: setMarker("--", open);
242: appendLinebreak(2);
243: break;
244: case 15: // TR
245: if (!open) {
246: appendLinebreak(1);
247: appendText("-----");
248: appendLinebreak(1);
249: }
250: break;
251: case 16: // TH
252: case 17: // THEAD
253: case 18: // TBODY
254: case 19: // TFOOT
255: appendLinebreak(1);
256: break;
257: default: // unknown tag (ignore)
258: }
259: }
260:
261: private void appendText(String text) {
262:
263: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
264: text = Translate.decode(text);
265: text = collapse(text);
266: }
267: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
268:
269: if (m_storedBrCount > 0) {
270: m_appendBr = true;
271: appendLinebreak(m_storedBrCount);
272: }
273: appendIndentation();
274: m_brCount = 0;
275:
276: List wordList = CmsStringUtil.splitAsList(text, ' ');
277: Iterator i = wordList.iterator();
278: while (i.hasNext()) {
279: String word = (String) i.next();
280: boolean hasNbsp = ((word.charAt(0) == 160) || (word
281: .charAt(word.length() - 1) == 160));
282: if ((word.length() + 1 + m_lineLength) > m_maxLineLength) {
283: m_appendBr = true;
284: appendLinebreak(1);
285: appendIndentation();
286: m_brCount = 0;
287: } else {
288: if (!hasNbsp
289: && (m_lineLength > m_indent)
290: && (m_result.charAt(m_result.length() - 1) != 160)
291: && (m_result.charAt(m_result.length() - 1) != 32)) {
292:
293: m_result.append(' ');
294: m_lineLength++;
295: }
296: }
297: m_result.append(word);
298: m_lineLength += word.length();
299: }
300: }
301: }
302:
303: private void setIndentation(int length, boolean open) {
304:
305: if (open) {
306: m_indent += length;
307: } else {
308: m_indent -= length;
309: if (m_indent < 0) {
310: m_indent = 0;
311: }
312: }
313: }
314:
315: private void setMarker(String marker, boolean open) {
316:
317: if (open) {
318: m_marker = marker;
319: }
320: }
321: }
|