001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.io.*;
024: import java.util.*;
025:
026: /**
027: * Represents a modified version of an original {@link Source} document.
028: * <p>
029: * An <code>OutputDocument</code> represents an original source document that
030: * has been modified by substituting segments of it with other text.
031: * Each of these substitutions must be registered in the output document,
032: * which is most commonly done using the various <code>replace</code>, <code>remove</code> or <code>insert</code> methods in this class.
033: * These methods internally {@linkplain #register(OutputSegment) register} one or more {@link OutputSegment} objects to define each substitution.
034: *
035: * After all of the substitutions have been registered, the modified text can be retrieved using the
036: * {@link #writeTo(Writer)} or {@link #toString()} methods.
037: * <p>
038: * The registered {@linkplain OutputSegment output segments} may be adjacent, and as of version 2.5 may also overlap.
039: * In most cases only output segments that have been {@linkplain #remove(Segment) removed} or {@linkplain #replaceWithSpaces(int,int) replaced with spaces}
040: * legitimately overlap each other. Registering overlapping output segments that generate output will generally yield unexpected results.
041: * <p>
042: * If unexpected results are being generated from an <code>OutputDocument</code>, the {@link #getDebugInfo()} method provides information on each
043: * {@linkplain #getRegisteredOutputSegments() registered output segment}, which should provide enough information to determine the cause of the problem.
044: * In most cases the problem will be caused by overlapping output segments.
045: * <p>
046: * The following example converts all externally referenced style sheets to internal style sheets:
047: * <p>
048: * <pre>
049: * URL sourceUrl=new URL(sourceUrlString);
050: * String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
051: * Source source=new Source(htmlText);
052: * OutputDocument outputDocument=new OutputDocument(source);
053: * StringBuffer sb=new StringBuffer();
054: * List linkStartTags=source.findAllStartTags(Tag.LINK);
055: * for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
056: * StartTag startTag=(StartTag)i.next();
057: * Attributes attributes=startTag.getAttributes();
058: * String rel=attributes.getValue("rel");
059: * if (!"stylesheet".equalsIgnoreCase(rel)) continue;
060: * String href=attributes.getValue("href");
061: * if (href==null) continue;
062: * String styleSheetContent;
063: * try {
064: * styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
065: * } catch (Exception ex) {
066: * continue; // don't convert if URL is invalid
067: * }
068: * sb.setLength(0);
069: * sb.append("<style");
070: * Attribute typeAttribute=attributes.get("type");
071: * if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
072: * sb.append(">\n").append(styleSheetContent).append("\n</style>");
073: * outputDocument.replace(startTag,sb);
074: * }
075: * String convertedHtmlText=outputDocument.toString();
076: * </pre>
077: *
078: * @see OutputSegment
079: */
080: public final class OutputDocument implements CharStreamSource {
081: private CharSequence sourceText;
082: private ArrayList outputSegments = new ArrayList();
083:
084: /**
085: * Constructs a new output document based on the specified source document.
086: * @param source the source document.
087: */
088: public OutputDocument(final Source source) {
089: if (source == null)
090: throw new IllegalArgumentException(
091: "source argument must not be null");
092: this .sourceText = source;
093: }
094:
095: OutputDocument(final ParseText parseText) {
096: this .sourceText = parseText;
097: }
098:
099: /**
100: * Returns the original source text upon which this output document is based.
101: * @return the original source text upon which this output document is based.
102: */
103: public CharSequence getSourceText() {
104: return sourceText;
105: }
106:
107: /**
108: * Removes the specified {@linkplain Segment segment} from this output document.
109: * <p>
110: * This is equivalent to {@link #replace(Segment,CharSequence) replace}<code>(segment,null)</code>.
111: *
112: * @param segment the segment to remove.
113: */
114: public void remove(final Segment segment) {
115: register(new RemoveOutputSegment(segment));
116: }
117:
118: /**
119: * Removes all the segments from this output document represented by the specified source {@linkplain Segment} objects.
120: * <p>
121: * This is equivalent to the following code:<pre>
122: * for (Iterator i=segments.iterator(); i.hasNext();)
123: * {@link #remove(Segment) remove}((Segment)i.next());</pre>
124: *
125: * @param segments a collection of segments to remove, represented by source {@link Segment} objects.
126: */
127: public void remove(final Collection segments) {
128: for (Iterator i = segments.iterator(); i.hasNext();)
129: remove((Segment) i.next());
130: }
131:
132: /**
133: * Inserts the specified text at the specified character position in this output document.
134: * @param pos the character position at which to insert the text.
135: * @param text the replacement text.
136: */
137: public void insert(final int pos, final CharSequence text) {
138: register(new StringOutputSegment(pos, pos, text));
139: }
140:
141: /**
142: * Replaces the specified {@linkplain Segment segment} in this output document with the specified text.
143: * <p>
144: * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
145: * and results in the segment being completely removed from the output document.
146: *
147: * @param segment the segment to replace.
148: * @param text the replacement text, or <code>null</code> to remove the segment.
149: */
150: public void replace(final Segment segment, final CharSequence text) {
151: replace(segment.getBegin(), segment.getEnd(), text);
152: }
153:
154: /**
155: * Replaces the specified segment of this output document with the specified text.
156: * <p>
157: * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
158: * and results in the segment being completely removed from the output document.
159: *
160: * @param begin the character position at which to begin the replacement.
161: * @param end the character position at which to end the replacement.
162: * @param text the replacement text, or <code>null</code> to remove the segment.
163: */
164: public void replace(final int begin, final int end,
165: final CharSequence text) {
166: register(new StringOutputSegment(begin, end, text));
167: }
168:
169: /**
170: * Replaces the specified segment of this output document with the specified character.
171: *
172: * @param begin the character position at which to begin the replacement.
173: * @param end the character position at which to end the replacement.
174: * @param ch the replacement character.
175: */
176: public void replace(final int begin, final int end, final char ch) {
177: register(new CharOutputSegment(begin, end, ch));
178: }
179:
180: /**
181: * Replaces the specified {@link FormControl} in this output document.
182: * <p>
183: * The effect of this method is to {@linkplain #register(OutputSegment) register} zero or more
184: * {@linkplain OutputSegment output segments} in the output document as required to reflect
185: * previous modifications to the control's state.
186: * The state of a control includes its <a href="FormControl.html#SubmissionValue">submission value</a>,
187: * {@linkplain FormControl#setOutputStyle(FormControlOutputStyle) output style}, and whether it has been
188: * {@linkplain FormControl#setDisabled(boolean) disabled}.
189: * <p>
190: * The state of the form control should not be modified after this method is called, as there is no guarantee that
191: * subsequent changes either will or will not be reflected in the final output.
192: * A second call to this method with the same parameter is not allowed.
193: * It is therefore recommended to call this method as the last action before the output is generated.
194: * <p>
195: * Although the specifics of the number and nature of the output segments added in any particular circumstance
196: * is not defined in the specification, it can generally be assumed that only the minimum changes necessary
197: * are made to the original document. If the state of the control has not been modified, calling this method
198: * has no effect at all.
199: *
200: * @param formControl the form control to replace.
201: * @see #replace(FormFields)
202: */
203: public void replace(final FormControl formControl) {
204: formControl.replaceInOutputDocument(this );
205: }
206:
207: /**
208: * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls}
209: * from the specified {@link FormFields} in this output document.
210: * <p>
211: * This is equivalent to the following code:
212: * <pre>for (Iterator i=formFields.{@link FormFields#getFormControls() getFormControls()}.iterator(); i.hasNext();)
213: * {@link #replace(FormControl) replace}((FormControl)i.next());</pre>
214: * <p>
215: * The state of any of the form controls in the specified form fields should not be modified after this method is called,
216: * as there is no guarantee that subsequent changes either will or will not be reflected in the final output.
217: * A second call to this method with the same parameter is not allowed.
218: * It is therefore recommended to call this method as the last action before the output is generated.
219: *
220: * @param formFields the form fields to replace.
221: * @see #replace(FormControl)
222: */
223: public void replace(final FormFields formFields) {
224: formFields.replaceInOutputDocument(this );
225: }
226:
227: /**
228: * Replaces the specified {@link Attributes} segment in this output document with the name/value entries
229: * in the returned <code>Map</code>.
230: * The returned map initially contains entries representing the attributes from the source document,
231: * which can be modified before output.
232: * <p>
233: * The documentation of the {@link #replace(Attributes,Map)} method contains more information about the requirements
234: * of the map entries.
235: * <p>
236: * Specifying a value of <code>true</code> as an argument to the <code>convertNamesToLowerCase</code> parameter
237: * causes all original attribute names to be converted to lower case in the map.
238: * This simplifies the process of finding/updating specific attributes since map keys are case sensitive.
239: * <p>
240: * Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} before
241: * being loaded into the map.
242: * <p>
243: * This method is logically equivalent to:<br />
244: * {@link #replace(Attributes,Map) replace}<code>(attributes, attributes.</code>{@link Attributes#populateMap(Map,boolean) populateMap(new LinkedHashMap(),convertNamesToLowerCase)}<code>)</code>
245: * <p>
246: * The use of <code>LinkedHashMap</code> to implement the map ensures (probably unnecessarily) that
247: * existing attributes are output in the same order as they appear in the source document, and new
248: * attributes are output in the same order as they are added.
249: * <p>
250: * <dl>
251: * <dt>Example:</dt>
252: * <dd><pre>
253: * Source source=new Source(htmlDocument);
254: * Attributes bodyAttributes
255: * =source.findNextStartTag(0,Tag.BODY).getAttributes();
256: * OutputDocument outputDocument=new OutputDocument(source);
257: * Map attributesMap=outputDocument.replace(bodyAttributes,true);
258: * attributesMap.put("bgcolor","green");
259: * String htmlDocumentWithGreenBackground=outputDocument.toString();</pre></dl>
260: *
261: * @param attributes the <code>Attributes</code> segment defining the span of the segment and initial name/value entries of the returned map.
262: * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map.
263: * @return a <code>Map</code> containing the name/value entries to be output.
264: * @see #replace(Attributes,Map)
265: */
266: public Map replace(final Attributes attributes,
267: boolean convertNamesToLowerCase) {
268: AttributesOutputSegment attributesOutputSegment = new AttributesOutputSegment(
269: attributes, convertNamesToLowerCase);
270: register(attributesOutputSegment);
271: return attributesOutputSegment.getMap();
272: }
273:
274: /**
275: * Replaces the specified attributes segment in this source document with the name/value entries in the specified <code>Map</code>.
276: * <p>
277: * This method might be used if the <code>Map</code> containing the new attribute values
278: * should not be preloaded with the same entries as the source attributes, or a map implementation
279: * other than <code>LinkedHashMap</code> is required.
280: * Otherwise, the {@link #replace(Attributes, boolean convertNamesToLowerCase)} method is generally more useful.
281: * <p>
282: * Keys in the map must be <code>String</code> objects, and values must implement the <code>CharSequence</code> interface.
283: * <p>
284: * An attribute with no value is represented by a map entry with a <code>null</code> value.
285: * <p>
286: * Attribute values are stored unencoded in the map, and are automatically
287: * {@linkplain CharacterReference#encode(CharSequence) encoded} if necessary during output.
288: * <p>
289: * The use of invalid characters in attribute names results in unspecified behaviour.
290: * <p>
291: * Note that methods in the <code>Attributes</code> class treat attribute names as case insensitive,
292: * whereas the <code>Map</code> treats them as case sensitive.
293: *
294: * @param attributes the <code>Attributes</code> object defining the span of the segment to replace.
295: * @param map the <code>Map</code> containing the name/value entries.
296: * @see #replace(Attributes, boolean convertNamesToLowerCase)
297: */
298: public void replace(final Attributes attributes, final Map map) {
299: register(new AttributesOutputSegment(attributes, map));
300: }
301:
302: /**
303: * Replaces the specified segment of this output document with a string of spaces of the same length.
304: * <p>
305: * This method is most commonly used to remove segments of the document without affecting the character positions of the remaining elements.
306: * <p>
307: * It is used internally to implement the functionality available through the {@link Segment#ignoreWhenParsing()} method.
308: * <p>
309: * To remove a segment from the output document completely, use the {@link #remove(Segment)} method instead.
310: *
311: * @param begin the character position at which to begin the replacement.
312: * @param end the character position at which to end the replacement.
313: */
314: public void replaceWithSpaces(final int begin, final int end) {
315: register(new BlankOutputSegment(begin, end));
316: }
317:
318: /**
319: * Registers the specified {@linkplain OutputSegment output segment} in this output document.
320: * <p>
321: * Use this method if you want to use a customised {@link OutputSegment} class.
322: *
323: * @param outputSegment the output segment to register.
324: */
325: public void register(final OutputSegment outputSegment) {
326: outputSegments.add(outputSegment);
327: }
328:
329: /**
330: * Writes the final content of this output document to the specified <code>Writer</code>.
331: * <p>
332: * As of version 2.5, the presence of overlapping output segments no longer results in an {@link OverlappingOutputSegmentsException}.
333: * It is now up to the developer to detect unintentional overlapping segments.
334: * <p>
335: * If the output is required in the form of a <code>Reader</code>, use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead.
336: *
337: * @param writer the destination <code>java.io.Writer</code> for the output.
338: * @throws IOException if an I/O exception occurs.
339: * @see #toString()
340: */
341: public void writeTo(final Writer writer) throws IOException {
342: try {
343: if (outputSegments.isEmpty()) {
344: Util.appendTo(writer, sourceText);
345: return;
346: }
347: int pos = 0;
348: Collections.sort(outputSegments, OutputSegment.COMPARATOR);
349: OutputSegment lastOutputSegment = null;
350: for (final Iterator i = outputSegments.iterator(); i
351: .hasNext();) {
352: final OutputSegment outputSegment = (OutputSegment) i
353: .next();
354: if (outputSegment == lastOutputSegment)
355: continue; // silently ignore duplicate output segment
356: if (outputSegment.getBegin() > pos) {
357: Util.appendTo(writer, sourceText, pos,
358: outputSegment.getBegin());
359: }
360: if (outputSegment.getBegin() < pos
361: && outputSegment instanceof BlankOutputSegment) {
362: // Overlapping BlankOutputSegments requires special handling to ensure the correct number of blanks are inserted.
363: for (final int outputSegmentEnd = outputSegment
364: .getEnd(); pos < outputSegmentEnd; pos++)
365: writer.write(' ');
366: } else {
367: outputSegment.writeTo(writer);
368: if (pos < outputSegment.getEnd())
369: pos = outputSegment.getEnd();
370: }
371: lastOutputSegment = outputSegment;
372: }
373: if (pos < sourceText.length())
374: Util.appendTo(writer, sourceText, pos, sourceText
375: .length());
376: } finally {
377: writer.flush();
378: }
379: }
380:
381: public long getEstimatedMaximumOutputLength() {
382: long estimatedMaximumOutputLength = sourceText.length();
383: for (final Iterator i = outputSegments.iterator(); i.hasNext();) {
384: final OutputSegment outputSegment = (OutputSegment) i
385: .next();
386: final int outputSegmentOriginalLength = outputSegment
387: .getEnd()
388: - outputSegment.getBegin();
389: estimatedMaximumOutputLength += (outputSegment
390: .getEstimatedMaximumOutputLength() - outputSegmentOriginalLength);
391: }
392: return estimatedMaximumOutputLength;
393: }
394:
395: /**
396: * Returns the final content of this output document as a <code>String</code>.
397: * @return the final content of this output document as a <code>String</code>.
398: * @see #writeTo(Writer)
399: */
400: public String toString() {
401: return CharStreamSourceUtil.toString(this );
402: }
403:
404: /**
405: * Returns a string representation of this object useful for debugging purposes.
406: * <p>
407: * The output includes details of all the {@link #getRegisteredOutputSegments() registered output segments}.
408: *
409: * @return a string representation of this object useful for debugging purposes.
410: */
411: public String getDebugInfo() {
412: StringBuffer sb = new StringBuffer();
413: for (Iterator i = getRegisteredOutputSegments().iterator(); i
414: .hasNext();) {
415: OutputSegment outputSegment = (OutputSegment) i.next();
416: if (outputSegment instanceof BlankOutputSegment)
417: sb.append("Replace with Spaces: ");
418: else if (outputSegment instanceof RemoveOutputSegment)
419: sb.append("Remove: ");
420: else
421: sb.append("Replace: ");
422: if (sourceText instanceof Source) {
423: Source source = (Source) sourceText;
424: sb.append('(');
425: source.getRowColumnVector(outputSegment.getBegin())
426: .appendTo(sb);
427: sb.append('-');
428: source.getRowColumnVector(outputSegment.getEnd())
429: .appendTo(sb);
430: sb.append(')');
431: } else {
432: sb.append("(p").append(outputSegment.getBegin())
433: .append("-p").append(outputSegment.getEnd())
434: .append(')');
435: }
436: sb.append(' ');
437: String outputFromSegment = outputSegment.toString();
438: if (outputFromSegment.length() <= 20) {
439: sb.append(outputFromSegment);
440: } else {
441: sb.append(outputFromSegment.substring(0, 20)).append(
442: "...");
443: }
444: sb.append(Config.NewLine);
445: }
446: return sb.toString();
447: }
448:
449: /**
450: * Returns a list all of the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
451: * <p>
452: * The output segments are sorted in order of their {@linkplain OutputSegment#getBegin() starting position} in the document.
453: * <p>
454: * The returned list is modifiable and any changes will affect the output generated by this <code>OutputDocument</code>.
455: *
456: * @return a list all of the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
457: */
458: public List getRegisteredOutputSegments() {
459: Collections.sort(outputSegments, OutputSegment.COMPARATOR);
460: return outputSegments;
461: }
462: }
|