001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025:
026: /**
027: * Performs a simple rendering of HTML markup into text.
028: * <p>
029: * This provides a human readable version of the segment content that is modelled on the way
030: * <a target="_blank" href="http://www.mozilla.com/thunderbird/">Mozilla Thunderbird</a> and other email clients provide an automatic conversion of
031: * HTML content to text in their <a target="_blank" href="http://tools.ietf.org/html/rfc2046#section-5.1.4">alternative MIME encoding</a> of emails.
032: * <p>
033: * The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in
034: * <a target="_blank" href="http://tools.ietf.org/html/rfc3676">RFC3676</a>.
035: * <p>
036: * Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}.
037: * See the individual property descriptions for details.
038: * <p>
039: * Use one of the following methods to obtain the output:
040: * <ul>
041: * <li>{@link #writeTo(Writer)}</li>
042: * <li>{@link #toString()}</li>
043: * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
044: * </ul>
045: * <p>
046: * The rendering of some constructs, especially tables, is very rudimentary.
047: * No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output.
048: * <p>
049: * Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
050: * <p>
051: * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
052: * <p>
053: * To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead.
054: */
055: public final class Renderer implements CharStreamSource {
056: private final Segment rootSegment;
057: private int maxLineLength = 76;
058: private String newLine = "\r\n";
059: private boolean decorateFontStyles = false;
060: private boolean convertNonBreakingSpaces = true;
061: private int blockIndentSize = 4;
062: private int listIndentSize = 6;
063: private char[] listBullets = new char[] { '*', 'o', '+', '#' };
064: private String tableCellSeparator = " \t";
065:
066: /**
067: * Constructs a new <code>Renderer</code> based on the specified {@link Segment}.
068: * @param segment the segment containing the HTML to be rendered.
069: * @see Segment#getRenderer()
070: */
public Renderer(final Segment segment) {
    // Only the reference is stored here; rendering happens when output is requested.
    this.rootSegment = segment;
}
074:
075: // Documentation inherited from CharStreamSource
076: public void writeTo(final Writer writer) throws IOException {
077: new Processor(rootSegment, getMaxLineLength(), getNewLine(),
078: getDecorateFontStyles(), getConvertNonBreakingSpaces(),
079: getBlockIndentSize(), getListIndentSize(),
080: getListBullets(), getTableCellSeparator())
081: .writeTo(writer);
082: }
083:
084: // Documentation inherited from CharStreamSource
085: public long getEstimatedMaximumOutputLength() {
086: return rootSegment.length();
087: }
088:
089: // Documentation inherited from CharStreamSource
090: public String toString() {
091: return CharStreamSourceUtil.toString(this );
092: }
093:
094: /**
095: * Sets the column at which lines are to be wrapped.
096: * <p>
097: * Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary.
098: * <p>
099: * A line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length.
100: * In this case the line is wrapped immediately after the end of the word.
101: * <p>
102: * The default value is <code>76</code>, which reflects the maximum line length for sending
103: * email data specified in <a target="_blank" href="http://rfc.net/rfc2049.html#s3.">RFC2049 section 3.5</a>.
104: *
105: * @param maxLineLength the column at which lines are to be wrapped.
106: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
107: * @see #getMaxLineLength()
108: */
109: public Renderer setMaxLineLength(final int maxLineLength) {
110: this .maxLineLength = maxLineLength;
111: return this ;
112: }
113:
114: /**
115: * Returns the column at which lines are to be wrapped.
116: * <p>
117: * See the {@link #setMaxLineLength(int)} method for a full description of this property.
118: *
119: * @return the column at which lines are to be wrapped.
120: */
121: public int getMaxLineLength() {
122: return maxLineLength;
123: }
124:
125: /**
126: * Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
127: * <p>
128: * The default value is <code>"\r\n"</code> <span title="carriage return + line feed">(CR+LF)</span> regardless of the platform on which the library is running.
129: * This is so that the default configuration produces valid
130: * <a target="_blank" href="http://tools.ietf.org/html/rfc1521#section-7.1.2">MIME text/plain</a> output, which mandates the use of CR+LF for line breaks.
131: * <p>
132: * Specifying a <code>null</code> argument causes the output to use the same new line string as is used in the source document, which is
133: * determined via the {@link Source#getNewLine()} method.
134: * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
135: * or using the value from {@link Config#NewLine}.
136: *
137: * @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
138: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
139: * @see #getNewLine()
140: */
141: public Renderer setNewLine(final String newLine) {
142: this .newLine = newLine;
143: return this ;
144: }
145:
146: /**
147: * Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
148: * <p>
149: * See the {@link #setNewLine(String)} method for a full description of this property.
150: *
151: * @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
152: */
153: public String getNewLine() {
154: if (newLine == null)
155: newLine = rootSegment.source.getBestGuessNewLine();
156: return newLine;
157: }
158:
159: /**
160: * Sets whether decoration characters are to be included around the content of some
161: * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
162: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
163: * <p>
164: * The default value is <code>false</code>.
165: * <p>
166: * Below is a table summarising the decorated elements.
167: * <p>
168: * <style type="text/css">
169: * table#FontStyleElementSummary td, table#FontStyleElementSummary th {text-align: center; padding-bottom: 2px}
170: * </style>
171: * <table id="FontStyleElementSummary" class="bordered" cellspacing="0">
172: * <tr><th title="HTML elements decorated">Elements</th><th title="The character placed around the element content">Character</th><th>Example Output</th></tr>
173: * <tr><td>{@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}</td><td><code>*</code></td><td><code>*bold text*</code></td></tr>
174: * <tr><td>{@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}</td><td><code>/</code></td><td><code>/italic text/</code></td></tr>
175: * <tr><td>{@link HTMLElementName#U U}</td><td><code>_</code></td><td><code>_underlined text_</code></td></tr>
176: * <tr><td>{@link HTMLElementName#CODE CODE}</td><td><code>|</code></td><td><code>|code|</code></td></tr>
177: * </table>
178: *
179: * @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements.
180: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
181: * @see #getDecorateFontStyles()
182: */
183: public Renderer setDecorateFontStyles(
184: final boolean decorateFontStyles) {
185: this .decorateFontStyles = decorateFontStyles;
186: return this ;
187: }
188:
189: /**
190: * Indicates whether decoration characters are to be included around the content of some
191: * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
192: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
193: * <p>
194: * See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property.
195: *
196: * @return <code>true</code> if decoration characters are to be included around the content of some font style elements, otherwise <code>false</code>.
197: */
198: public boolean getDecorateFontStyles() {
199: return decorateFontStyles;
200: }
201:
202: /**
203: * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
204: * <p>
205: * The default value is <code>true</code>.
206: *
207: * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
208: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
209: * @see #getConvertNonBreakingSpaces()
210: */
211: public Renderer setConvertNonBreakingSpaces(
212: boolean convertNonBreakingSpaces) {
213: this .convertNonBreakingSpaces = convertNonBreakingSpaces;
214: return this ;
215: }
216:
217: /**
218: * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
219: * <p>
220: * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
221: *
222: * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
223: */
224: public boolean getConvertNonBreakingSpaces() {
225: return convertNonBreakingSpaces;
226: }
227:
228: /**
229: * Sets the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
230: * <p>
231: * At present this applies to {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} and {@link HTMLElementName#DD DD} elements.
232: * <p>
233: * The default value is <code>4</code>.
234: *
235: * @param blockIndentSize the size of the indent.
236: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
237: * @see #getBlockIndentSize()
238: */
239: public Renderer setBlockIndentSize(final int blockIndentSize) {
240: this .blockIndentSize = blockIndentSize;
241: return this ;
242: }
243:
244: /**
245: * Returns the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
246: * <p>
247: * See the {@link #setBlockIndentSize(int)} method for a full description of this property.
248: *
249: * @return the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
250: */
251: public int getBlockIndentSize() {
252: return blockIndentSize;
253: }
254:
255: /**
256: * Sets the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
257: * <p>
258: * The default value is <code>6</code>.
259: * <p>
260: * This applies to {@link HTMLElementName#LI LI} elements inside both {@link HTMLElementName#UL UL} and {@link HTMLElementName#OL OL} elements.
261: * <p>
262: * The bullet or number of the list item is included as part of the indent.
263: *
264: * @param listIndentSize the size of the indent.
265: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
266: * @see #getListIndentSize()
267: */
268: public Renderer setListIndentSize(final int listIndentSize) {
269: this .listIndentSize = listIndentSize;
270: return this ;
271: }
272:
273: /**
274: * Returns the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
275: * <p>
276: * See the {@link #setListIndentSize(int)} method for a full description of this property.
277: *
278: * @return the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
279: */
280: public int getListIndentSize() {
281: return listIndentSize;
282: }
283:
284: /**
285: * Sets the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
286: * <p>
287: * The values in the default array are <code>*</code>, <code>o</code>, <code>+</code> and <code>#</code>.
288: * <p>
289: * If the nesting of rendered lists goes deeper than the length of this array, the bullet characters start repeating from the first in the array.
290: * <p>
291: * WARNING: If any of the characters in the default array are modified, this will affect all other instances of this class using the default array.
292: *
293: * @param listBullets an array of characters to be used as bullets, must have at least one entry.
294: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
295: * @see #getListBullets()
296: */
297: public Renderer setListBullets(final char[] listBullets) {
298: if (listBullets == null || listBullets.length == 0)
299: throw new IllegalArgumentException(
300: "listBullets argument must be an array of at least one character");
301: this .listBullets = listBullets;
302: return this ;
303: }
304:
305: /**
306: * Returns the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
307: * <p>
308: * See the {@link #setListBullets(char[])} method for a full description of this property.
309: *
310: * @return the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
311: */
312: public char[] getListBullets() {
313: return listBullets;
314: }
315:
316: /**
317: * Sets the string that is to separate table cells.
318: * <p>
319: * The default value is <code>" \t"</code> (a space followed by a tab).
320: *
321: * @param tableCellSeparator the string that is to separate table cells.
322: * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
323: * @see #getTableCellSeparator()
324: */
325: public Renderer setTableCellSeparator(
326: final String tableCellSeparator) {
327: this .tableCellSeparator = tableCellSeparator;
328: return this ;
329: }
330:
331: /**
332: * Returns the string that is to separate table cells.
333: * <p>
334: * See the {@link #setTableCellSeparator(String)} method for a full description of this property.
335: *
336: * @return the string that is to separate table cells.
337: */
338: public String getTableCellSeparator() {
339: return tableCellSeparator;
340: }
341:
342: /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
343: private static final class Processor {
344: private final Segment rootSegment;
345: private final Source source;
346: private final int maxLineLength;
347: private final String newLine;
348: private final boolean decorateFontStyles;
349: private final boolean convertNonBreakingSpaces;
350: private final int blockIndentSize;
351: private final int listIndentSize;
352: private final char[] listBullets;
353: private final String tableCellSeparator;
354:
355: private Writer writer;
356: private int renderedIndex; // keeps track of where rendering is up to in case of overlapping elements
357: private int col;
358: private int startOfLineCol;
359: private int blockIndentLevel;
360: private int listIndentLevel;
361: private int blockVerticalMargin;
362: private boolean preformatted;
363: private boolean lastCharWhiteSpace;
364: private boolean bullet;
365: private int listBulletNumber;
366:
367: private static final int NO_MARGIN = -1;
368: private static final int UNORDERED_LIST = -1;
369:
370: private static Map ELEMENT_HANDLERS = new HashMap();
371: static {
372: ELEMENT_HANDLERS.put(HTMLElementName.A,
373: A_ElementHandler.INSTANCE);
374: ELEMENT_HANDLERS.put(HTMLElementName.ADDRESS,
375: StandardBlockElementHandler.INSTANCE_0_0);
376: ELEMENT_HANDLERS.put(HTMLElementName.APPLET,
377: RemoveElementHandler.INSTANCE);
378: ELEMENT_HANDLERS.put(HTMLElementName.B,
379: FontStyleElementHandler.INSTANCE_B);
380: ELEMENT_HANDLERS.put(HTMLElementName.BLOCKQUOTE,
381: StandardBlockElementHandler.INSTANCE_1_1_INDENT);
382: ELEMENT_HANDLERS.put(HTMLElementName.BR,
383: BR_ElementHandler.INSTANCE);
384: ELEMENT_HANDLERS.put(HTMLElementName.BUTTON,
385: RemoveElementHandler.INSTANCE);
386: ELEMENT_HANDLERS.put(HTMLElementName.CAPTION,
387: StandardBlockElementHandler.INSTANCE_0_0);
388: ELEMENT_HANDLERS.put(HTMLElementName.CENTER,
389: StandardBlockElementHandler.INSTANCE_1_1);
390: ELEMENT_HANDLERS.put(HTMLElementName.CODE,
391: FontStyleElementHandler.INSTANCE_CODE);
392: ELEMENT_HANDLERS.put(HTMLElementName.DD,
393: StandardBlockElementHandler.INSTANCE_0_0_INDENT);
394: ELEMENT_HANDLERS.put(HTMLElementName.DIR,
395: ListElementHandler.INSTANCE_UL);
396: ELEMENT_HANDLERS.put(HTMLElementName.DIV,
397: StandardBlockElementHandler.INSTANCE_0_0);
398: ELEMENT_HANDLERS.put(HTMLElementName.DT,
399: StandardBlockElementHandler.INSTANCE_0_0);
400: ELEMENT_HANDLERS.put(HTMLElementName.EM,
401: FontStyleElementHandler.INSTANCE_I);
402: ELEMENT_HANDLERS.put(HTMLElementName.FIELDSET,
403: StandardBlockElementHandler.INSTANCE_1_1);
404: ELEMENT_HANDLERS.put(HTMLElementName.FORM,
405: StandardBlockElementHandler.INSTANCE_1_1);
406: ELEMENT_HANDLERS.put(HTMLElementName.H1,
407: StandardBlockElementHandler.INSTANCE_2_1);
408: ELEMENT_HANDLERS.put(HTMLElementName.H2,
409: StandardBlockElementHandler.INSTANCE_2_1);
410: ELEMENT_HANDLERS.put(HTMLElementName.H3,
411: StandardBlockElementHandler.INSTANCE_2_1);
412: ELEMENT_HANDLERS.put(HTMLElementName.H4,
413: StandardBlockElementHandler.INSTANCE_2_1);
414: ELEMENT_HANDLERS.put(HTMLElementName.H5,
415: StandardBlockElementHandler.INSTANCE_2_1);
416: ELEMENT_HANDLERS.put(HTMLElementName.H6,
417: StandardBlockElementHandler.INSTANCE_2_1);
418: ELEMENT_HANDLERS.put(HTMLElementName.HEAD,
419: RemoveElementHandler.INSTANCE);
420: ELEMENT_HANDLERS.put(HTMLElementName.HR,
421: HR_ElementHandler.INSTANCE);
422: ELEMENT_HANDLERS.put(HTMLElementName.I,
423: FontStyleElementHandler.INSTANCE_I);
424: ELEMENT_HANDLERS.put(HTMLElementName.LEGEND,
425: StandardBlockElementHandler.INSTANCE_0_0);
426: ELEMENT_HANDLERS.put(HTMLElementName.LI,
427: LI_ElementHandler.INSTANCE);
428: ELEMENT_HANDLERS.put(HTMLElementName.MENU,
429: ListElementHandler.INSTANCE_UL);
430: ELEMENT_HANDLERS.put(HTMLElementName.MAP,
431: RemoveElementHandler.INSTANCE);
432: ELEMENT_HANDLERS.put(HTMLElementName.NOFRAMES,
433: RemoveElementHandler.INSTANCE);
434: ELEMENT_HANDLERS.put(HTMLElementName.NOSCRIPT,
435: RemoveElementHandler.INSTANCE);
436: ELEMENT_HANDLERS.put(HTMLElementName.OL,
437: ListElementHandler.INSTANCE_OL);
438: ELEMENT_HANDLERS.put(HTMLElementName.P,
439: StandardBlockElementHandler.INSTANCE_1_1);
440: ELEMENT_HANDLERS.put(HTMLElementName.PRE,
441: PRE_ElementHandler.INSTANCE);
442: ELEMENT_HANDLERS.put(HTMLElementName.SCRIPT,
443: RemoveElementHandler.INSTANCE);
444: ELEMENT_HANDLERS.put(HTMLElementName.SELECT,
445: RemoveElementHandler.INSTANCE);
446: ELEMENT_HANDLERS.put(HTMLElementName.STRONG,
447: FontStyleElementHandler.INSTANCE_B);
448: ELEMENT_HANDLERS.put(HTMLElementName.STYLE,
449: RemoveElementHandler.INSTANCE);
450: ELEMENT_HANDLERS.put(HTMLElementName.TEXTAREA,
451: RemoveElementHandler.INSTANCE);
452: ELEMENT_HANDLERS.put(HTMLElementName.TD,
453: TD_ElementHandler.INSTANCE);
454: ELEMENT_HANDLERS.put(HTMLElementName.TH,
455: TD_ElementHandler.INSTANCE);
456: ELEMENT_HANDLERS.put(HTMLElementName.TR,
457: TR_ElementHandler.INSTANCE);
458: ELEMENT_HANDLERS.put(HTMLElementName.U,
459: FontStyleElementHandler.INSTANCE_U);
460: ELEMENT_HANDLERS.put(HTMLElementName.UL,
461: ListElementHandler.INSTANCE_UL);
462: }
463:
/**
 * Creates a processor holding fixed copies of the renderer settings.
 * The source is taken from the root segment.
 */
public Processor(final Segment rootSegment,
        final int maxLineLength,
        final String newLine,
        final boolean decorateFontStyles,
        final boolean convertNonBreakingSpaces,
        final int blockIndentSize,
        final int listIndentSize,
        final char[] listBullets,
        final String tableCellSeparator) {
    // What is rendered.
    this.rootSegment = rootSegment;
    this.source = rootSegment.source;

    // Line layout.
    this.maxLineLength = maxLineLength;
    this.newLine = newLine;
    this.blockIndentSize = blockIndentSize;
    this.listIndentSize = listIndentSize;

    // Text substitutions and decorations.
    this.decorateFontStyles = decorateFontStyles;
    this.convertNonBreakingSpaces = convertNonBreakingSpaces;
    this.listBullets = listBullets;
    this.tableCellSeparator = tableCellSeparator;
}
482:
483: public void writeTo(final Writer writer) throws IOException {
484: reset();
485: this .writer = writer;
486: writeIndent();
487: writeSegment(rootSegment);
488: writer.flush();
489: }
490:
491: private void reset() {
492: renderedIndex = 0;
493: col = 0;
494: startOfLineCol = 0;
495: blockIndentLevel = 0;
496: listIndentLevel = 0;
497: blockVerticalMargin = NO_MARGIN;
498: preformatted = false;
499: lastCharWhiteSpace = false;
500: bullet = false;
501: }
502:
503: private void writeSegment(final Segment segment)
504: throws IOException {
505: writeSegmentProcessingChildElements(segment.begin,
506: segment.end, segment.getChildElements());
507: }
508:
509: private void writeElementContent(final Element element)
510: throws IOException {
511: final int contentEnd = element.getContentEnd();
512: if (element.isEmpty() || renderedIndex >= contentEnd)
513: return;
514: final int contentBegin = element.getStartTag().end;
515: writeSegmentProcessingChildElements(Math.max(renderedIndex,
516: contentBegin), contentEnd, element
517: .getChildElements());
518: }
519:
520: private void writeSegmentProcessingChildElements(
521: final int begin, final int end, final List childElements)
522: throws IOException {
523: int index = begin;
524: for (Iterator i = childElements.iterator(); i.hasNext();) {
525: Element childElement = (Element) i.next();
526: if (index >= childElement.end)
527: continue;
528: if (index < childElement.begin)
529: writeSegmentRemovingTags(index, childElement.begin);
530: getElementHandler(childElement).process(this ,
531: childElement);
532: index = Math.max(renderedIndex, childElement.end);
533: }
534: if (index < end)
535: writeSegmentRemovingTags(index, end);
536: }
537:
538: private static ElementHandler getElementHandler(
539: final Element element) {
540: if (element.getStartTag().getStartTagType().isServerTag())
541: return RemoveElementHandler.INSTANCE; // hard-coded configuration does not include server tags in child element hierarchy, so this is normally not executed.
542: ElementHandler elementHandler = (ElementHandler) ELEMENT_HANDLERS
543: .get(element.getName());
544: return (elementHandler != null) ? elementHandler
545: : StandardInlineElementHandler.INSTANCE;
546: }
547:
548: private void writeSegmentRemovingTags(final int begin,
549: final int end) throws IOException {
550: int index = begin;
551: while (true) {
552: Tag tag = source.findNextTag(index);
553: if (tag == null || tag.begin >= end)
554: break;
555: writeSegment(index, tag.begin);
556: index = tag.end;
557: }
558: writeSegment(index, end);
559: }
560:
/**
 * Writes the source text between <code>begin</code> and <code>end</code> to the output.
 * <p>
 * Text before <code>renderedIndex</code> is skipped, and <code>renderedIndex</code> is advanced
 * to <code>end</code> on exit (even if an exception is thrown) unless it is already beyond it.
 * <p>
 * In preformatted mode the decoded text is written character by character, with each
 * CR, LF or CR+LF turned into a call to {@link #newLine()}.
 * Otherwise the decoded, white-space-collapsed text is written chunk by chunk, starting a new line
 * whenever the next chunk would reach <code>maxLineLength</code>.
 *
 * @param begin the position in the source at which to start writing.
 * @param end the position in the source at which to stop writing.
 * @throws IOException if writing to the underlying writer fails.
 */
private void writeSegment(int begin, final int end)
        throws IOException {
    // assert begin<end
    // Never write text that has already been rendered.
    if (begin < renderedIndex)
        begin = renderedIndex;
    if (begin >= end)
        return;
    try {
        if (preformatted) {
            // Emit any pending block margin before the first preformatted text.
            if (blockVerticalMargin != NO_MARGIN)
                writeBlockVerticalMargin();
            final String text = CharacterReference.decode(
                    source.string.subSequence(begin, end),
                    convertNonBreakingSpaces);
            for (int i = 0; i < text.length(); i++) {
                final char ch = text.charAt(i);
                if (ch == '\n') {
                    newLine();
                } else if (ch == '\r') {
                    newLine();
                    // Treat CR+LF as a single line break by skipping the LF that follows the CR.
                    final int nextI = i + 1;
                    if (nextI == text.length())
                        break;
                    if (text.charAt(nextI) == '\n')
                        i++;
                } else {
                    write(ch);
                }
            }
        } else {
            // NOTE(review): decodeCollapseWhiteSpace presumably decodes character references and
            // collapses runs of white space into single spaces - confirm against CharacterReference.
            final String text = CharacterReference
                    .decodeCollapseWhiteSpace(source.string
                            .subSequence(begin, end),
                            convertNonBreakingSpaces);
            if (text.length() == 0) {
                // Nothing to write; record that white space is pending before the next text.
                lastCharWhiteSpace = true;
                return;
            }
            if (blockVerticalMargin != NO_MARGIN) {
                writeBlockVerticalMargin();
            } else if (lastCharWhiteSpace
                    || Segment.isWhiteSpace(source
                            .charAt(begin))) {
                // Separate this text from the preceding output with a single space.
                write(' ');
            }
            int textIndex = 0; // start of the chunk currently being written
            int i = 0; // scan position; ends up at the end of the current chunk
            lastCharWhiteSpace = false;
            while (true) {
                for (; i < text.length(); i++) {
                    if (text.charAt(i) != ' ')
                        continue; // search for end of word
                    // At end of word. To comply with the RFC 2646 Format=Flowed protocol, need to make sure we don't wrap immediately before ">" or "From ".
                    if (i + 1 < text.length()
                            && text.charAt(i + 1) == '>')
                        continue;
                    if (i + 6 < text.length()
                            && text.startsWith("From ", i + 1))
                        continue;
                    break; // OK to wrap here if necessary
                }
                // Wrap if writing the chunk at the current column would reach maxLineLength.
                if (col + i - textIndex + 1 >= maxLineLength) {
                    // A trailing space is written before the line break only when there is no
                    // block or list indentation (presumably the format=flowed soft line break marker).
                    if (lastCharWhiteSpace
                            && (blockIndentLevel | listIndentLevel) == 0)
                        write(' ');
                    startNewLine(0);
                } else if (lastCharWhiteSpace) {
                    write(' ');
                }
                write(text, textIndex, i);
                if (i == text.length())
                    break;
                // The chunk ended at a space; step over it and remember it for the next chunk.
                lastCharWhiteSpace = true;
                textIndex = ++i;
            }
            // Record whether the source range itself ended with white space.
            lastCharWhiteSpace = Segment.isWhiteSpace(source
                    .charAt(end - 1));
        }
    } finally {
        // Mark the range as rendered regardless of how the method exits.
        if (renderedIndex < end)
            renderedIndex = end;
    }
}
644:
645: private void writeBlockVerticalMargin() throws IOException {
646: startNewLine(blockVerticalMargin);
647: blockVerticalMargin = NO_MARGIN;
648: lastCharWhiteSpace = false;
649: }
650:
651: private void blockBoundary(final int verticalMargin)
652: throws IOException {
653: if (blockVerticalMargin < verticalMargin)
654: blockVerticalMargin = verticalMargin;
655: }
656:
657: private void startNewLine(int verticalMargin)
658: throws IOException {
659: final int requiredNewLines = verticalMargin
660: + (atStartOfLine() ? 0 : 1);
661: if (requiredNewLines == 0)
662: return;
663: for (int i = 0; i < requiredNewLines; i++)
664: writer.write(newLine);
665: writeIndent();
666: }
667:
668: private void newLine() throws IOException {
669: writer.write(newLine);
670: writeIndent();
671: }
672:
673: private void writeIndent() throws IOException {
674: for (int i = blockIndentLevel * blockIndentSize; i > 0; i--)
675: writer.write(' ');
676: if (bullet) {
677: for (int i = (listIndentLevel - 1) * listIndentSize; i > 0; i--)
678: writer.write(' ');
679: if (listBulletNumber == UNORDERED_LIST) {
680: for (int i = listIndentSize - 2; i > 0; i--)
681: writer.write(' ');
682: writer.write(listBullets[(listIndentLevel - 1)
683: % listBullets.length]);
684: writer.write(' ');
685: } else {
686: String bulletNumberString = Integer
687: .toString(listBulletNumber);
688: for (int i = listIndentSize
689: - bulletNumberString.length() - 2; i > 0; i--)
690: writer.write(' ');
691: writer.write(bulletNumberString);
692: writer.write(". ");
693: }
694: bullet = false;
695: } else {
696: for (int i = listIndentLevel * listIndentSize; i > 0; i--)
697: writer.write(' ');
698: }
699: col = startOfLineCol = getIndentCol();
700: }
701:
702: private boolean atStartOfLine() {
703: return col == startOfLineCol;
704: }
705:
706: private int getIndentCol() {
707: return blockIndentLevel * blockIndentSize + listIndentLevel
708: * listIndentSize;
709: }
710:
711: private Processor write(final char ch) throws IOException {
712: writer.write(ch);
713: col++;
714: return this ;
715: }
716:
717: private Processor write(final String text) throws IOException {
718: writer.write(text);
719: col += text.length();
720: return this ;
721: }
722:
723: private void write(final CharSequence text, final int begin,
724: final int end) throws IOException {
725: for (int i = begin; i < end; i++)
726: writer.write(text.charAt(i));
727: col += end - begin;
728: }
729:
730: private interface ElementHandler {
731: void process(Processor x, Element element)
732: throws IOException;
733: }
734:
735: private static class RemoveElementHandler implements
736: ElementHandler {
737: public static final ElementHandler INSTANCE = new RemoveElementHandler();
738:
739: public void process(Processor x, Element element) {
740: }
741: }
742:
743: private static class StandardInlineElementHandler implements
744: ElementHandler {
745: public static final ElementHandler INSTANCE = new StandardInlineElementHandler();
746:
747: public void process(Processor x, Element element)
748: throws IOException {
749: x.writeElementContent(element);
750: }
751: }
752:
753: private static class FontStyleElementHandler implements
754: ElementHandler {
755: public static final ElementHandler INSTANCE_B = new FontStyleElementHandler(
756: '*');
757: public static final ElementHandler INSTANCE_I = new FontStyleElementHandler(
758: '/');
759: public static final ElementHandler INSTANCE_U = new FontStyleElementHandler(
760: '_');
761: public static final ElementHandler INSTANCE_CODE = new FontStyleElementHandler(
762: '|');
763: private final char decorationChar;
764:
765: public FontStyleElementHandler(char decorationChar) {
766: this .decorationChar = decorationChar;
767: }
768:
769: public void process(Processor x, Element element)
770: throws IOException {
771: if (x.decorateFontStyles) {
772: if (x.lastCharWhiteSpace) {
773: x.write(' ');
774: x.lastCharWhiteSpace = false;
775: }
776: x.write(decorationChar);
777: x.writeElementContent(element);
778: if (x.decorateFontStyles)
779: x.write(decorationChar);
780: } else {
781: x.writeElementContent(element);
782: }
783: }
784: }
785:
786: private static class StandardBlockElementHandler implements
787: ElementHandler {
788: public static final ElementHandler INSTANCE_0_0 = new StandardBlockElementHandler(
789: 0, 0, false);
790: public static final ElementHandler INSTANCE_1_1 = new StandardBlockElementHandler(
791: 1, 1, false);
792: public static final ElementHandler INSTANCE_2_1 = new StandardBlockElementHandler(
793: 2, 1, false);
794: public static final ElementHandler INSTANCE_0_0_INDENT = new StandardBlockElementHandler(
795: 0, 0, true);
796: public static final ElementHandler INSTANCE_1_1_INDENT = new StandardBlockElementHandler(
797: 1, 1, true);
798: private final int topMargin;
799: private final int bottomMargin;
800: private final boolean indent;
801:
802: public StandardBlockElementHandler(int topMargin,
803: int bottomMargin, boolean indent) {
804: this .topMargin = topMargin;
805: this .bottomMargin = bottomMargin;
806: this .indent = indent;
807: }
808:
809: public void process(Processor x, Element element)
810: throws IOException {
811: x.blockBoundary(topMargin);
812: if (indent)
813: x.blockIndentLevel++;
814: x.writeElementContent(element);
815: if (indent)
816: x.blockIndentLevel--;
817: x.blockBoundary(bottomMargin);
818: }
819: }
820:
821: private static class A_ElementHandler implements ElementHandler {
822: public static final ElementHandler INSTANCE = new A_ElementHandler();
823:
824: public void process(Processor x, Element element)
825: throws IOException {
826: x.writeElementContent(element);
827: String href = element.getAttributeValue("href");
828: if (href == null)
829: return;
830: int linkLength = href.length() + 3;
831: if (x.col + linkLength >= x.maxLineLength) {
832: x.startNewLine(0);
833: } else {
834: x.write(' ');
835: }
836: x.write('<').write(href).write('>');
837: x.lastCharWhiteSpace = true;
838: }
839: }
840:
841: private static class BR_ElementHandler implements
842: ElementHandler {
843: public static final ElementHandler INSTANCE = new BR_ElementHandler();
844:
845: public void process(Processor x, Element element)
846: throws IOException {
847: x.newLine();
848: x.blockBoundary(0);
849: }
850: }
851:
852: private static class HR_ElementHandler implements
853: ElementHandler {
854: public static final ElementHandler INSTANCE = new HR_ElementHandler();
855:
856: public void process(Processor x, Element element)
857: throws IOException {
858: x.blockBoundary(0);
859: x.writeBlockVerticalMargin();
860: for (int i = 0; i < 72; i++)
861: x.write('-');
862: x.blockBoundary(0);
863: }
864: }
865:
866: private static class ListElementHandler implements
867: ElementHandler {
868: public static final ElementHandler INSTANCE_OL = new ListElementHandler(
869: 0);
870: public static final ElementHandler INSTANCE_UL = new ListElementHandler(
871: UNORDERED_LIST);
872: private final int initialListBulletNumber;
873:
874: public ListElementHandler(int initialListBulletNumber) {
875: this .initialListBulletNumber = initialListBulletNumber;
876: }
877:
878: public void process(Processor x, Element element)
879: throws IOException {
880: x.blockBoundary(0);
881: int oldListBulletNumber = x.listBulletNumber;
882: x.listBulletNumber = initialListBulletNumber;
883: x.listIndentLevel++;
884: x.writeElementContent(element);
885: x.listIndentLevel--;
886: x.listBulletNumber = oldListBulletNumber;
887: x.blockBoundary(0);
888: }
889: }
890:
891: private static class LI_ElementHandler implements
892: ElementHandler {
893: public static final ElementHandler INSTANCE = new LI_ElementHandler();
894:
895: public void process(Processor x, Element element)
896: throws IOException {
897: if (x.listBulletNumber != UNORDERED_LIST)
898: x.listBulletNumber++;
899: x.bullet = true;
900: x.writeBlockVerticalMargin(); // force writing of bullet even if no content
901: x.writeElementContent(element);
902: x.bullet = false;
903: x.blockBoundary(0);
904: }
905: }
906:
907: private static class PRE_ElementHandler implements
908: ElementHandler {
909: public static final ElementHandler INSTANCE = new PRE_ElementHandler();
910:
911: public void process(Processor x, Element element)
912: throws IOException {
913: x.blockBoundary(1);
914: boolean oldPreformatted = x.preformatted; // should always be false
915: x.preformatted = true;
916: x.writeElementContent(element);
917: x.preformatted = oldPreformatted;
918: x.blockBoundary(1);
919: }
920: }
921:
922: private static class TD_ElementHandler implements
923: ElementHandler {
924: public static final ElementHandler INSTANCE = new TD_ElementHandler();
925:
926: public void process(Processor x, Element element)
927: throws IOException {
928: if (!x.atStartOfLine())
929: x.write(x.tableCellSeparator);
930: x.lastCharWhiteSpace = false;
931: x.writeElementContent(element);
932: }
933: }
934:
935: private static class TR_ElementHandler implements
936: ElementHandler {
937: public static final ElementHandler INSTANCE = new TR_ElementHandler();
938:
939: public void process(Processor x, Element element)
940: throws IOException {
941: x.blockBoundary(0);
942: x.writeBlockVerticalMargin(); // force writing of new line so first cell knows not to write separator.
943: x.writeElementContent(element);
944: x.blockBoundary(0);
945: }
946: }
947: }
948: }
|