001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025: import java.net.*;
026:
027: /**
028: * Formats HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
029: * <p>
030: * Any indentation present in the original source text is removed.
031: * <p>
032: * Use one of the following methods to obtain the output:
033: * <ul>
034: * <li>{@link #writeTo(Writer)}</li>
035: * <li>{@link #toString()}</li>
036: * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
037: * </ul>
038: * <p>
039: * The output text is functionally equivalent to the original source and should be rendered identically unless specified below.
040: * <p>
041: * The following points describe the process in general terms.
042: * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
043: * <p>
044: * <ul>
045: * <li>Every element that is not an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level element} appears on a new line
046: * with an indent corresponding to its {@linkplain Element#getDepth() depth} in the <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>.
047: * <li>The indent is formed by writing <i>n</i> repetitions of the string specified in the {@link #setIndentString(String) IndentString} property,
048: * where <i>n</i> is the depth of the indentation.
049: * <li>The {@linkplain Element#getContent() content} of an indented element starts on a new line and is indented at a depth one greater than that of the element,
050: * with the end tag appearing on a new line at the same depth as the start tag.
051: * If the content contains only text and {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements},
052: * it may continue on the same line as the start tag. Additionally, if the output content contains no new lines, the end tag may also continue on the same line.
 * <li>The content of preformatted elements such as {@link HTMLElementName#PRE PRE} and {@link HTMLElementName#TEXTAREA TEXTAREA} is not indented,
 * nor is the white space modified in any way.
055: * <li>Only {@linkplain StartTagType#NORMAL normal} and {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} elements are indented.
056: * All others are treated as {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
057: * <li>White space and indentation inside HTML {@linkplain StartTagType#COMMENT comments}, {@linkplain StartTagType#CDATA_SECTION CDATA sections}, or any
058: * {@linkplain TagType#isServerTag() server tag} is preserved,
059: * but with the indentation of new lines starting at a depth one greater than that of the surrounding text.
060: * <li>White space and indentation inside {@link HTMLElementName#SCRIPT SCRIPT} elements is preserved,
061: * but with the indentation of new lines starting at a depth one greater than that of the <code>SCRIPT</code> element.
062: * <li>If the {@link #setTidyTags(boolean) TidyTags} property is set to <code>true</code>,
063: * every tag in the document is replaced with the output from its {@link Tag#tidy()} method.
064: * If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
065: * but with any new lines indented at a depth one greater than that of the element.
066: * <li>If the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} property
067: * is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
068: * located outside of a tag is replaced with a single space in the output.
069: * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
070: * <li>If the {@link #setIndentAllElements(boolean) IndentAllElements} property
071: * is set to <code>true</code>, every element appears indented on a new line, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
072: * This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
073: * but is very likely to introduce white space that compromises the functional equivalency of the document.
074: * <li>The {@link #setNewLine(String) NewLine} property specifies the character sequence
075: * to use for each <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output document.
076: * <li>If the source document contains {@linkplain TagType#isServerTag() server tags}, the functional equivalency of the output document may be compromised.
077: * </ul>
078: * <p>
079: * Formatting an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
080: */
081: public final class SourceFormatter implements CharStreamSource {
/** The segment containing the HTML to be formatted. */
private final Segment segment;
/** String written once per indentation level; defaults to a single tab. */
private String indentString = "\t";
/** Whether each tag is replaced with the output of its tidy() method; off by default. */
private boolean tidyTags;
/** Whether white space between tags is collapsed; off by default. */
private boolean collapseWhiteSpace;
/** Whether every element, including inline-level ones, is indented; off by default. */
private boolean indentAllElements;
/** New line string for the output; stays null until set explicitly or resolved by getNewLine(). */
private String newLine;

/**
 * Creates a <code>SourceFormatter</code> that formats the HTML contained in the given {@link Segment}.
 *
 * @param segment the segment containing the HTML to be formatted.
 * @see Source#getSourceFormatter()
 */
public SourceFormatter(final Segment segment) {
    this.segment = segment;
}
097:
098: // Documentation inherited from CharStreamSource
099: public void writeTo(final Writer writer) throws IOException {
100: new Processor(segment, getIndentString(), getTidyTags(),
101: getCollapseWhiteSpace(), getIndentAllElements(),
102: getIndentAllElements(), getNewLine()).writeTo(writer);
103: }
104:
105: // Documentation inherited from CharStreamSource
106: public long getEstimatedMaximumOutputLength() {
107: return segment.length() * 2;
108: }
109:
110: // Documentation inherited from CharStreamSource
111: public String toString() {
112: return CharStreamSourceUtil.toString(this );
113: }
114:
115: /**
116: * Sets the string to be used for indentation.
117: * <p>
118: * The default value is a string containing a single tab character (U+0009).
119: * <p>
120: * The most commonly used indent strings are <code>"\t"</code> (single tab), <code>" "</code> (single space), <code>" "</code> (2 spaces), and <code>" "</code> (4 spaces).
121: *
122: * @param indentString the string to be used for indentation, must not be <code>null</code>.
123: * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
124: * @see #getIndentString()
125: */
126: public SourceFormatter setIndentString(final String indentString) {
127: if (indentString == null)
128: throw new IllegalArgumentException(
129: "indentString property must not be null");
130: this .indentString = indentString;
131: return this ;
132: }
133:
134: /**
135: * Returns the string to be used for indentation.
136: * <p>
137: * See the {@link #setIndentString(String)} method for a full description of this property.
138: *
139: * @return the string to be used for indentation.
140: */
141: public String getIndentString() {
142: return indentString;
143: }
144:
145: /**
146: * Sets whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
147: * <p>
148: * The default value is <code>false</code>.
149: * <p>
150: * If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
151: * but with any new lines indented at a depth one greater than that of the element.
152: *
153: * @param tidyTags specifies whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
154: * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
155: * @see #getTidyTags()
156: */
157: public SourceFormatter setTidyTags(final boolean tidyTags) {
158: this .tidyTags = tidyTags;
159: return this ;
160: }
161:
162: /**
163: * Indicates whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
164: * <p>
165: * See the {@link #setTidyTags(boolean)} method for a full description of this property.
166: *
167: * @return <code>true</code> if the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method, otherwise <code>false</code>.
168: */
169: public boolean getTidyTags() {
170: return tidyTags;
171: }
172:
173: /**
174: * Sets whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
175: * <p>
176: * The default value is <code>false</code>.
177: * <p>
178: * If this property is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
179: * located outside of a tag is replaced with a single space in the output.
180: * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
181: *
182: * @param collapseWhiteSpace specifies whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
183: * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
184: * @see #getCollapseWhiteSpace()
185: */
186: public SourceFormatter setCollapseWhiteSpace(
187: final boolean collapseWhiteSpace) {
188: this .collapseWhiteSpace = collapseWhiteSpace;
189: return this ;
190: }
191:
192: /**
193: * Indicates whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
194: * <p>
195: * See the {@link #setCollapseWhiteSpace(boolean collapseWhiteSpace)} method for a full description of this property.
196: *
197: * @return <code>true</code> if {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed, otherwise <code>false</code>.
198: */
199: public boolean getCollapseWhiteSpace() {
200: return collapseWhiteSpace;
201: }
202:
203: /**
204: * Sets whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
205: * <p>
206: * The default value is <code>false</code>.
207: * <p>
208: * If this property is set to <code>true</code>, every element appears indented on a new line, including
209: * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
210: * <p>
211: * This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
212: * but is very likely to introduce white space that compromises the functional equivalency of the document.
213: *
214: * @param indentAllElements specifies whether all elements are to be indented.
215: * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
216: * @see #getIndentAllElements()
217: */
218: public SourceFormatter setIndentAllElements(
219: final boolean indentAllElements) {
220: this .indentAllElements = indentAllElements;
221: return this ;
222: }
223:
224: /**
225: * Indicates whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
226: * <p>
227: * See the {@link #setIndentAllElements(boolean)} method for a full description of this property.
228: *
229: * @return <code>true</code> if all elements are to be indented, otherwise <code>false</code>.
230: */
231: public boolean getIndentAllElements() {
232: return indentAllElements;
233: }
234:
235: /**
236: * Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
237: * <p>
238: * The default is to use the same new line string as is used in the source document, which is determined via the {@link Source#getNewLine()} method.
239: * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
240: * or using the value from {@link Config#NewLine}.
241: * <p>
242: * Specifying a <code>null</code> argument resets the property to its default value, which is to use the same new line string as is used in the source document.
243: *
244: * @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
245: * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
246: * @see #getNewLine()
247: */
248: public SourceFormatter setNewLine(final String newLine) {
249: this .newLine = newLine;
250: return this ;
251: }
252:
253: /**
254: * Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
255: * <p>
256: * See the {@link #setNewLine(String)} method for a full description of this property.
257: *
258: * @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
259: */
260: public String getNewLine() {
261: if (newLine == null)
262: newLine = segment.source.getBestGuessNewLine();
263: return newLine;
264: }
265:
266: /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
267: private static final class Processor {
// Fixed inputs, copied from the formatter's properties when the Processor is created.
private final Segment segment;
private final CharSequence sourceText; // full text of the segment's source
private final String indentString;
private final boolean tidyTags;
private final boolean collapseWhiteSpace;
private final boolean indentAllElements;
// At present this parameter is tied to indentAllElements.
// SCRIPT elements need to be inline to keep functional equivalency of output.
private final boolean indentScriptElements;
private final String newLine;

// Mutable state used while writing.
private Writer writer; // destination, assigned by writeTo
private Tag nextTag;   // upcoming tag in the source; refreshed by updateNextTag and writeTag
private int index;     // current position within sourceText

// Stores the supplied settings; sourceText is taken from the segment's source.
public Processor(final Segment segment, final String indentString, final boolean tidyTags,
        final boolean collapseWhiteSpace, final boolean indentAllElements,
        final boolean indentAllScripts, final String newLine) {
    this.segment = segment;
    this.sourceText = segment.source.toString();
    this.indentString = indentString;
    this.newLine = newLine;
    this.tidyTags = tidyTags;
    this.collapseWhiteSpace = collapseWhiteSpace;
    this.indentAllElements = indentAllElements;
    this.indentScriptElements = indentAllScripts;
}
295:
296: public void writeTo(final Writer writer) throws IOException {
297: this .writer = writer;
298: nextTag = segment.source.findNextTag(segment.begin);
299: index = segment.begin;
300: writeContent(segment.end, segment.getChildElements(), 0);
301: writer.flush();
302: }
303:
304: private void writeContent(final int end,
305: final List childElements, final int depth)
306: throws IOException {
307: // sets index to end
308: for (final Iterator i = childElements.iterator(); i
309: .hasNext();) {
310: final Element element = (Element) i.next();
311: final int elementBegin = element.begin;
312: if (elementBegin >= end)
313: break;
314: if (indentAllElements) {
315: writeText(elementBegin, depth, false, false, false,
316: collapseWhiteSpace);
317: writeElement(element, depth, end, false, false);
318: } else {
319: final String elementName = element.getName();
320: if (!indent(element))
321: continue;
322: writeText(elementBegin, depth, false, false, false,
323: collapseWhiteSpace);
324: if (elementName == HTMLElementName.PRE
325: || elementName == HTMLElementName.TEXTAREA) {
326: writeElement(element, depth, end, true, true);
327: } else if (elementName == HTMLElementName.SCRIPT) {
328: writeElement(element, depth, end, true, false);
329: } else {
330: writeElement(
331: element,
332: depth,
333: end,
334: false,
335: !containsNonInlineLevelChildElements(element));
336: }
337: }
338: }
339: writeText(end, depth, false, false, false,
340: collapseWhiteSpace);
341: }
342:
343: private boolean indent(final Element element) {
344: final StartTagType startTagType = element.getStartTag()
345: .getStartTagType();
346: if (startTagType == StartTagType.DOCTYPE_DECLARATION)
347: return true;
348: if (startTagType != StartTagType.NORMAL)
349: return false;
350: final String elementName = element.getName();
351: if (elementName == HTMLElementName.SCRIPT)
352: return indentScriptElements;
353: if (!HTMLElements.getInlineLevelElementNames().contains(
354: elementName))
355: return true;
356: return containsNonInlineLevelChildElements(element);
357: }
358:
359: private void writeText(final int end, int depth,
360: final boolean beginInline, final boolean endInline,
361: final boolean increaseIndentAfterFirstLineBreak,
362: final boolean collapseWhiteSpace) throws IOException {
363: // sets index to end
364: if (index == end)
365: return;
366: while (Segment.isWhiteSpace(sourceText.charAt(index)))
367: if (++index == end)
368: return; // trim whitespace.
369: if (!beginInline)
370: writeIndent(depth);
371: writeTextInline(end, depth,
372: increaseIndentAfterFirstLineBreak,
373: collapseWhiteSpace);
374: if (!endInline)
375: writer.write(newLine);
376: }
377:
378: private void writeElement(final Element element,
379: final int depth, final int end,
380: final boolean preformatted, boolean renderContentInline)
381: throws IOException {
382: // sets index to minimum of element.end or end
383: // assert index==element.begin
384: // assert index < end
385: final StartTag startTag = element.getStartTag();
386: final EndTag endTag = element.getEndTag();
387: writeIndent(depth);
388: writeTag(startTag, depth, end);
389: if (index == end) {
390: writer.write(newLine);
391: return;
392: }
393: if (!renderContentInline)
394: writer.write(newLine);
395: int contentEnd = element.getContentEnd();
396: if (end < contentEnd)
397: contentEnd = end;
398: if (index < contentEnd) {
399: if (preformatted) {
400: if (renderContentInline) {
401: // Preformatted element such as PRE, TEXTAREA
402: writeContentPreformatted(contentEnd, depth);
403: } else {
404: // SCRIPT element
405: writeIndentedScriptContent(contentEnd,
406: depth + 1);
407: }
408: } else {
409: if (renderContentInline) {
410: // Inline-level element
411: if (collapseWhiteSpace) {
412: writeTextCollapseWhiteSpace(contentEnd,
413: depth);
414: } else {
415: if (!writeTextInline(contentEnd, depth,
416: true, false)) {
417: writer.write(newLine);
418: renderContentInline = false;
419: }
420: }
421: } else {
422: // Block-level element
423: writeContent(contentEnd, element
424: .getChildElements(), depth + 1);
425: }
426: }
427: }
428: if (endTag != null && end > endTag.begin) {
429: if (!renderContentInline)
430: writeIndent(depth);
431: // assert index=endTag.begin
432: writeTag(endTag, depth, end);
433: writer.write(newLine);
434: } else if (renderContentInline) {
435: writer.write(newLine);
436: }
437: }
438:
439: private void updateNextTag() {
440: // ensures that nextTag is up to date
441: while (nextTag != null) {
442: if (nextTag.begin >= index)
443: return;
444: nextTag = nextTag.findNextTag();
445: }
446: }
447:
448: private void writeIndentedScriptContent(final int end,
449: final int depth) throws IOException {
450: // sets index to end
451: // assert index < end
452: int startOfLinePos = getStartOfLinePos(end, false);
453: if (index == end)
454: return;
455: if (startOfLinePos == -1) {
456: // Script started on same line as start tag. Use the start of the next line to determine the original indent.
457: writeIndent(depth);
458: writeLineKeepWhiteSpace(end, depth);
459: writer.write(newLine);
460: if (index == end)
461: return;
462: startOfLinePos = getStartOfLinePos(end, true);
463: if (index == end)
464: return;
465: }
466: writeTextPreserveIndentation(end, depth, index
467: - startOfLinePos);
468: writer.write(newLine);
469: }
470:
471: private boolean writeTextPreserveIndentation(final int end,
472: final int depth) throws IOException {
473: // sets index to end
474: // returns true if all text was on one line, otherwise false
475: // assert index < end
476: // Use the start of the next line to determine the original indent.
477: writeLineKeepWhiteSpace(end, depth);
478: if (index == end)
479: return true;
480: int startOfLinePos = getStartOfLinePos(end, true);
481: if (index == end)
482: return true;
483: writer.write(newLine);
484: writeTextPreserveIndentation(end, depth + 1, index
485: - startOfLinePos);
486: return false;
487: }
488:
489: private void writeTextPreserveIndentation(final int end,
490: final int depth, final int originalIndentLength)
491: throws IOException {
492: // assert index < end
493: // sets index to end
494: writeIndent(depth);
495: writeLineKeepWhiteSpace(end, depth);
496: while (index != end) {
497: // Skip over the original indent:
498: for (int x = 0; x < originalIndentLength; x++) {
499: final char ch = sourceText.charAt(index);
500: if (!(ch == ' ' || ch == '\t'))
501: break;
502: if (++index == end)
503: return;
504: }
505: writer.write(newLine);
506: // Insert our indent:
507: writeIndent(depth);
508: // Write the rest of the line including any indent greater than the first line's indent:
509: writeLineKeepWhiteSpace(end, depth);
510: }
511: }
512:
513: private int getStartOfLinePos(final int end,
514: final boolean atStartOfLine) {
515: // returns the starting position of the next complete line containing text, or -1 if texts starts on the current line (hence not a complete line).
516: // sets index to the start of the text following the returned position, or end, whichever comes first.
517: int startOfLinePos = atStartOfLine ? index : -1;
518: while (true) {
519: final char ch = sourceText.charAt(index);
520: if (ch == '\n' || ch == '\r') {
521: startOfLinePos = index + 1;
522: } else if (!(ch == ' ' || ch == '\t'))
523: break;
524: if (++index == end)
525: break;
526: }
527: return startOfLinePos;
528: }
529:
530: private void writeSpecifiedTextInline(final CharSequence text,
531: int depth) throws IOException {
532: final int textLength = text.length();
533: int i = writeSpecifiedLine(text, 0);
534: if (i < textLength) {
535: final int subsequentLineDepth = depth + 1;
536: do {
537: while (Segment.isWhiteSpace(text.charAt(i)))
538: if (++i >= textLength)
539: return; // trim whitespace.
540: writer.write(newLine);
541: writeIndent(subsequentLineDepth);
542: i = writeSpecifiedLine(text, i);
543: } while (i < textLength);
544: }
545: }
546:
547: private int writeSpecifiedLine(final CharSequence text, int i)
548: throws IOException {
549: // Writes the first line from the specified text starting from the specified position.
550: // The line break characters are not written.
551: // Returns the position following the first line break character(s), or text.length() if the text contains no line breaks.
552: final int textLength = text.length();
553: while (true) {
554: final char ch = text.charAt(i);
555: if (ch == '\r') {
556: final int nexti = i + 1;
557: if (nexti < textLength
558: && text.charAt(nexti) == '\n')
559: return i + 2;
560: }
561: if (ch == '\n')
562: return i + 1;
563: writer.write(ch);
564: if (++i >= textLength)
565: return i;
566: }
567: }
568:
569: private boolean writeTextInline(final int end, int depth,
570: final boolean increaseIndentAfterFirstLineBreak,
571: final boolean collapseWhiteSpace) throws IOException {
572: // returns true if all text was on one line, otherwise false
573: // sets index to end
574: // assert index < end
575: writeLine(end, depth, collapseWhiteSpace);
576: if (index == end)
577: return true;
578: final int subsequentLineDepth = increaseIndentAfterFirstLineBreak ? depth + 1
579: : depth;
580: do {
581: while (Segment.isWhiteSpace(sourceText.charAt(index)))
582: if (++index == end)
583: return false; // trim whitespace.
584: writer.write(newLine);
585: writeIndent(subsequentLineDepth);
586: writeLine(end, subsequentLineDepth, collapseWhiteSpace);
587: } while (index < end);
588: return false;
589: }
590:
591: private void writeLine(final int end, final int depth,
592: final boolean collapseWhiteSpace) throws IOException {
593: // sets index to the position following the first line break character(s), or to end if collapseWhiteSpace or the text contains no line breaks.
594: // assert index < end
595: if (collapseWhiteSpace) {
596: writeTextCollapseWhiteSpace(end, depth);
597: } else {
598: writeLineKeepWhiteSpace(end, depth);
599: }
600: }
601:
602: private void writeLineKeepWhiteSpace(final int end,
603: final int depth) throws IOException {
604: // Writes the first line from the source text starting from index, ending at the specified end position.
605: // The line break characters are not written.
606: // Sets index to the position following the first line break character(s), or end if the text contains no line breaks. index is guaranteed <= end.
607: // Any tags encountered are written using the writeTag method, whose output may include line breaks.
608: // assert index < end
609: updateNextTag();
610: while (true) {
611: while (nextTag != null && index == nextTag.begin) {
612: writeTag(nextTag, depth, end);
613: if (index == end)
614: return;
615: }
616: final char ch = sourceText.charAt(index);
617: if (ch == '\r') {
618: final int nextindex = index + 1;
619: if (nextindex < end
620: && sourceText.charAt(nextindex) == '\n') {
621: index += 2;
622: return;
623: }
624: }
625: if (ch == '\n') {
626: index++;
627: return;
628: }
629: writer.write(ch);
630: if (++index == end)
631: return;
632: }
633: }
634:
635: private void writeTextCollapseWhiteSpace(final int end,
636: final int depth) throws IOException {
637: // sets index to end
638: // assert index < end
639: boolean lastWasWhiteSpace = false;
640: updateNextTag();
641: while (index < end) {
642: while (nextTag != null && index == nextTag.begin) {
643: if (lastWasWhiteSpace) {
644: writer.write(' ');
645: lastWasWhiteSpace = false;
646: }
647: writeTag(nextTag, depth, end);
648: if (index == end)
649: return;
650: }
651: final char ch = sourceText.charAt(index++);
652: if (Segment.isWhiteSpace(ch)) {
653: lastWasWhiteSpace = true;
654: } else {
655: if (lastWasWhiteSpace) {
656: writer.write(' ');
657: lastWasWhiteSpace = false;
658: }
659: writer.write(ch);
660: }
661: }
662: if (lastWasWhiteSpace)
663: writer.write(' ');
664: }
665:
666: private void writeContentPreformatted(final int end,
667: final int depth) throws IOException {
668: // sets index to end
669: // assert index < end
670: updateNextTag();
671: do {
672: while (nextTag != null && index == nextTag.begin) {
673: writeTag(nextTag, depth, end);
674: if (index == end)
675: return;
676: }
677: writer.write(sourceText.charAt(index));
678: } while (++index < end);
679: }
680:
681: private void writeTag(final Tag tag, final int depth,
682: final int end) throws IOException {
683: // sets index to last position written, guaranteed < end
684: // assert index==tag.begin
685: // assert index < end
686: nextTag = tag.findNextTag();
687: final int tagEnd = (end > tag.end) ? tag.end : end;
688: // assert index < tagEnd
689: if (tag.getTagType() == StartTagType.COMMENT
690: || tag.getTagType() == StartTagType.CDATA_SECTION) {
691: writeTextPreserveIndentation(tagEnd, depth);
692: } else if (tidyTags) {
693: final String tidyTag = tag.tidy();
694: if ((tag instanceof StartTag)
695: && ((StartTag) tag).getAttributes() != null)
696: writer.write(tidyTag);
697: else
698: writeSpecifiedTextInline(tidyTag, depth);
699: index = tagEnd;
700: } else {
701: writeTextInline(tagEnd, depth, true, false);
702: }
703: if (end <= tag.end || !(tag instanceof StartTag))
704: return;
705: if ((tag.name == HTMLElementName.SCRIPT && !indentScriptElements)
706: || tag.getTagType().isServerTag()) {
707: // this is a server start tag, we may need to write the whole server element:
708: final Element element = tag.getElement();
709: final EndTag endTag = element.getEndTag();
710: if (endTag == null)
711: return;
712: final int contentEnd = (end < endTag.begin) ? end
713: : endTag.begin;
714: final boolean singleLineContent = (index == contentEnd) ? true
715: : writeTextPreserveIndentation(contentEnd,
716: depth);
717: //final boolean singleLineContent=(index==contentEnd) ? true : writeTextInline(contentEnd,depth+1,false,false); // use this line instead of previous if indentation shouldn't be preserved in server elements.
718: if (endTag.begin >= end)
719: return;
720: if (!singleLineContent) {
721: writer.write(newLine);
722: writeIndent(depth);
723: }
724: // assert index==endTag.begin
725: writeTag(endTag, depth, end);
726: }
727: }
728:
729: private void writeIndent(final int depth) throws IOException {
730: for (int x = 0; x < depth; x++)
731: writer.write(indentString);
732: }
733:
734: private boolean containsNonInlineLevelChildElements(
735: final Element element) {
736: // returns true if the element contains any non-inline-level elements or SCRIPT elements.
737: final Collection childElements = element.getChildElements();
738: if (childElements == Collections.EMPTY_LIST)
739: return false;
740: for (final Iterator i = childElements.iterator(); i
741: .hasNext();) {
742: final Element childElement = (Element) i.next();
743: final String elementName = childElement.getName();
744: if (elementName == HTMLElementName.SCRIPT
745: || !HTMLElements.getInlineLevelElementNames()
746: .contains(elementName))
747: return true;
748: if (containsNonInlineLevelChildElements(childElement))
749: return true;
750: }
751: return false;
752: }
753: }
754: }
|