001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024:
025: /**
026: * Represents a segment of a {@link Source} document.
027: * <p>
028: * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
029: * <p>
030: * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
031: */
032: public class Segment implements Comparable, CharSequence {
033: final int begin;
034: final int end;
035: final Source source;
036:
037: List childElements = null;
038:
039: private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t',
040: '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
041:
042: /**
043: * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
044: * @param source the {@link Source} document, must not be <code>null</code>.
045: * @param begin the character position in the source where this segment begins.
046: * @param end the character position in the source where this segment ends.
047: */
048: public Segment(final Source source, final int begin, final int end) {
049: if (begin == -1 || end == -1 || begin > end)
050: throw new IllegalArgumentException();
051: this .begin = begin;
052: this .end = end;
053: if (source == null)
054: throw new IllegalArgumentException(
055: "source argument must not be null");
056: this .source = source;
057: }
058:
059: // Only called from Source constructor
060: Segment(final int length) {
061: begin = 0;
062: this .end = length;
063: source = (Source) this ;
064: }
065:
066: // Only used for creating dummy flag instances of this type (see Element.NOT_CACHED)
067: Segment() {
068: begin = 0;
069: end = 0;
070: source = null;
071: }
072:
073: /**
074: * Returns the character position in the {@link Source} document at which this segment begins.
075: * @return the character position in the {@link Source} document at which this segment begins.
076: */
077: public final int getBegin() {
078: return begin;
079: }
080:
081: /**
082: * Returns the character position in the {@link Source} document immediately after the end of this segment.
083: * <p>
084: * The character at the position specified by this property is <b>not</b> included in the segment.
085: *
086: * @return the character position in the {@link Source} document immediately after the end of this segment.
087: */
088: public final int getEnd() {
089: return end;
090: }
091:
092: /**
093: * Compares the specified object with this <code>Segment</code> for equality.
094: * <p>
095: * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
096: * and both segments have the same {@link Source}, and the same begin and end positions.
097: * @param object the object to be compared for equality with this <code>Segment</code>.
098: * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
099: */
100: public final boolean equals(final Object object) {
101: if (this == object)
102: return true;
103: if (object == null || !(object instanceof Segment))
104: return false;
105: final Segment segment = (Segment) object;
106: return segment.begin == begin && segment.end == end
107: && segment.source == source;
108: }
109:
110: /**
111: * Returns a hash code value for the segment.
112: * <p>
113: * The current implementation returns the sum of the begin and end positions, although this is not
114: * guaranteed in future versions.
115: *
116: * @return a hash code value for the segment.
117: */
118: public int hashCode() {
119: return begin + end;
120: }
121:
122: /**
123: * Returns the length of the segment.
124: * This is defined as the number of characters between the begin and end positions.
125: * @return the length of the segment.
126: */
127: public final int length() {
128: return end - begin;
129: }
130:
131: /**
132: * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
133: * <p>
134: * This is the case if {@link #getBegin()}<code><=segment.</code>{@link #getBegin()}<code> && </code>{@link #getEnd()}<code>>=segment.</code>{@link #getEnd()}.
135: *
136: * @param segment the segment to be tested for being enclosed by this segment.
137: * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
138: */
139: public final boolean encloses(final Segment segment) {
140: return begin <= segment.begin && end >= segment.end;
141: }
142:
143: /**
144: * Indicates whether this segment encloses the specified character position in the source document.
145: * <p>
146: * This is the case if {@link #getBegin()}<code> <= pos < </code>{@link #getEnd()}.
147: *
148: * @param pos the position in the {@link Source} document.
149: * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
150: */
151: public final boolean encloses(final int pos) {
152: return begin <= pos && pos < end;
153: }
154:
155: /**
156: * Returns the source text of this segment as a <code>String</code>.
157: * <p>
158: * The returned <code>String</code> is newly created with every call to this method, unless this
159: * segment is itself an instance of {@link Source}.
160: * <p>
161: * Note that before version 2.0 this returned a representation of this object useful for debugging purposes,
162: * which can now be obtained via the {@link #getDebugInfo()} method.
163: *
164: * @return the source text of this segment as a <code>String</code>.
165: */
166: public String toString() {
167: return source.string.substring(begin, end).toString();
168: }
169:
170: /**
171: * Performs a simple rendering of the HTML markup in this segment into text.
172: * <p>
173: * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
174: * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
175: *
176: * @return an instance of {@link Renderer} based on this segment.
177: * @see #getTextExtractor()
178: */
179: public Renderer getRenderer() {
180: return new Renderer(this );
181: }
182:
183: /**
184: * Extracts the textual content from the HTML markup of this segment.
185: * <p>
186: * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
187: * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
188: * <p>
189: * @return an instance of {@link TextExtractor} based on this segment.
190: * @see #getRenderer()
191: */
192: public TextExtractor getTextExtractor() {
193: return new TextExtractor(this );
194: }
195:
196: /**
197: * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
198: * <p>
199: * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
200: * if this method is to be used on a large proportion of the source.
201: * It is called automatically if this method is called on the {@link Source} object itself.
202: * <p>
203: * See the {@link Tag} class documentation for more details about the behaviour of this method.
204: *
205: * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
206: */
207: public List findAllTags() {
208: return findAllTags(null);
209: }
210:
211: /**
212: * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
213: * <p>
214: * See the {@link Tag} class documentation for more details about the behaviour of this method.
215: * <p>
216: * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #findAllTags()}.
217: *
218: * @param tagType the {@linkplain TagType type} of tags to find.
219: * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
220: */
221: public List findAllTags(final TagType tagType) {
222: Tag tag = checkEnclosure(Tag.findPreviousOrNextTag(source,
223: begin, tagType, false));
224: if (tag == null)
225: return Collections.EMPTY_LIST;
226: final ArrayList list = new ArrayList();
227: do {
228: list.add(tag);
229: tag = checkEnclosure(Tag.findPreviousOrNextTag(source,
230: tag.begin + 1, tagType, false));
231: } while (tag != null);
232: return list;
233: }
234:
235: /**
236: * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
237: * <p>
238: * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
239: * if this method is to be used on a large proportion of the source.
240: * It is called automatically if this method is called on the {@link Source} object itself.
241: * <p>
242: * See the {@link Tag} class documentation for more details about the behaviour of this method.
243: *
244: * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
245: */
246: public List findAllStartTags() {
247: return findAllStartTags(null);
248: }
249:
250: /**
251: * Returns a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
252: * <p>
253: * See the {@link Tag} class documentation for more details about the behaviour of this method.
254: * <p>
255: * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllStartTags()}.
256: * <p>
257: * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
258: *
259: * @param name the {@linkplain StartTag#getName() name} of the start tags to find.
260: * @return a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
261: */
262: public List findAllStartTags(String name) {
263: if (name != null)
264: name = name.toLowerCase();
265: final boolean isXMLTagName = Tag.isXMLName(name);
266: StartTag startTag = (StartTag) checkEnclosure(StartTag
267: .findPreviousOrNext(source, begin, name, isXMLTagName,
268: false));
269: if (startTag == null)
270: return Collections.EMPTY_LIST;
271: final ArrayList list = new ArrayList();
272: do {
273: list.add(startTag);
274: startTag = (StartTag) checkEnclosure(StartTag
275: .findPreviousOrNext(source, startTag.begin + 1,
276: name, isXMLTagName, false));
277: } while (startTag != null);
278: return list;
279: }
280:
281: /**
282: * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair
283: * that are {@linkplain #encloses(Segment) enclosed} by this segment.
284: * <p>
285: * See the {@link Tag} class documentation for more details about the behaviour of this method.
286: *
287: * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
288: * @param value the value of the specified attribute to search for, must not be <code>null</code>.
289: * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
290: * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
291: */
292: public List findAllStartTags(final String attributeName,
293: final String value, final boolean valueCaseSensitive) {
294: StartTag startTag = (StartTag) checkEnclosure(source
295: .findNextStartTag(begin, attributeName, value,
296: valueCaseSensitive));
297: if (startTag == null)
298: return Collections.EMPTY_LIST;
299: final ArrayList list = new ArrayList();
300: do {
301: list.add(startTag);
302: startTag = (StartTag) checkEnclosure(source
303: .findNextStartTag(startTag.begin + 1,
304: attributeName, value, valueCaseSensitive));
305: } while (startTag != null);
306: return list;
307: }
308:
309: /**
310: * Returns a list of the immediate children of this segment in the document element hierarchy.
311: * <p>
312: * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
313: * <p>
314: * An element found at the start of this segment is included in the list.
315: * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
316: * which only returns the children of the element.
317: * <p>
318: * Calling <code>getChildElements()</code> on an <code>Element</code> is usually more efficient than calling it on a <code>Segment</code>.
319: * <p>
320: * The objects in the list are all of type {@link Element}.
321: * <p>
322: * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
323: * if this method is to be used on a large proportion of the source.
324: * It is called automatically if this method is called on the {@link Source} object itself.
325: * <p>
326: * See the {@link Source#getChildElements()} method for more details.
327: *
328: * @return the a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
329: * @see Element#getParentElement()
330: */
331: public List getChildElements() {
332: if (childElements == null) {
333: if (length() == 0) {
334: childElements = Collections.EMPTY_LIST;
335: } else {
336: childElements = new ArrayList();
337: int pos = begin;
338: while (true) {
339: final StartTag childStartTag = source
340: .findNextStartTag(pos);
341: if (childStartTag == null
342: || childStartTag.begin >= end)
343: break;
344: if (!Config.IncludeServerTagsInElementHierarchy
345: && childStartTag.getTagType().isServerTag()) {
346: pos = childStartTag.end;
347: continue;
348: }
349: final Element childElement = childStartTag
350: .getElement();
351: childElements.add(childElement);
352: childElement.getChildElements();
353: pos = childElement.end;
354: }
355: }
356: }
357: return childElements;
358: }
359:
360: /**
361: * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
362: * <p>
363: * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
364: * if this method is to be used on a large proportion of the source.
365: * It is called automatically if this method is called on the {@link Source} object itself.
366: * <p>
367: * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
368: *
369: * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
370: */
371: public List findAllElements() {
372: return findAllElements((String) null);
373: }
374:
375: /**
376: * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
377: * <p>
378: * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags(String name)} method.
379: * <p>
380: * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllElements()}.
381: * <p>
382: * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
383: *
384: * @param name the {@linkplain Element#getName() name} of the elements to find.
385: * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
386: */
387: public List findAllElements(String name) {
388: if (name != null)
389: name = name.toLowerCase();
390: final List startTags = findAllStartTags(name);
391: if (startTags.isEmpty())
392: return Collections.EMPTY_LIST;
393: final ArrayList elements = new ArrayList(startTags.size());
394: for (final Iterator i = startTags.iterator(); i.hasNext();) {
395: final StartTag startTag = (StartTag) i.next();
396: final Element element = startTag.getElement();
397: if (element.end > end)
398: break;
399: elements.add(element);
400: }
401: return elements;
402: }
403:
404: /**
405: * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
406: * <p>
407: * The elements returned correspond exactly with the start tags returned in the {@link #findAllTags(TagType)} method.
408: *
409: * @param startTagType the {@linkplain StartTagType type} of start tags to find, must not be <code>null</code>.
410: * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
411: */
412: public List findAllElements(final StartTagType startTagType) {
413: final List startTags = findAllTags(startTagType);
414: if (startTags.isEmpty())
415: return Collections.EMPTY_LIST;
416: final ArrayList elements = new ArrayList(startTags.size());
417: for (final Iterator i = startTags.iterator(); i.hasNext();) {
418: final StartTag startTag = (StartTag) i.next();
419: final Element element = startTag.getElement();
420: if (element.end > end)
421: break;
422: elements.add(element);
423: }
424: return elements;
425: }
426:
427: /**
428: * Returns a list of all {@link Element} objects with the specified attribute name/value pair
429: * that are {@linkplain #encloses(Segment) enclosed} by this segment.
430: * <p>
431: * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method.
432: *
433: * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
434: * @param value the value of the specified attribute to search for, must not be <code>null</code>.
435: * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
436: * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
437: */
438: public List findAllElements(final String attributeName,
439: final String value, final boolean valueCaseSensitive) {
440: final List startTags = findAllStartTags(attributeName, value,
441: valueCaseSensitive);
442: if (startTags.isEmpty())
443: return Collections.EMPTY_LIST;
444: final ArrayList elements = new ArrayList(startTags.size());
445: for (final Iterator i = startTags.iterator(); i.hasNext();) {
446: final StartTag startTag = (StartTag) i.next();
447: final Element element = startTag.getElement();
448: if (element.end > end)
449: break;
450: elements.add(element);
451: }
452: return elements;
453: }
454:
455: /**
456: * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
457: * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
458: */
459: public List findAllCharacterReferences() {
460: CharacterReference characterReference = findNextCharacterReference(begin);
461: if (characterReference == null)
462: return Collections.EMPTY_LIST;
463: final ArrayList list = new ArrayList();
464: do {
465: list.add(characterReference);
466: characterReference = findNextCharacterReference(characterReference.end);
467: } while (characterReference != null);
468: return list;
469: }
470:
471: /**
472: * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
473: * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
474: */
475: public List findFormControls() {
476: return FormControl.findAll(this );
477: }
478:
479: /**
480: * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
481: * <p>
482: * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #findFormControls()}<code>)</code>.
483: *
484: * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
485: * @see #findFormControls()
486: */
487: public FormFields findFormFields() {
488: return new FormFields(findFormControls());
489: }
490:
491: /**
492: * Parses any {@link Attributes} within this segment.
493: * This method is only used in the unusual situation where attributes exist outside of a start tag.
494: * The {@link StartTag#getAttributes()} method should be used in normal situations.
495: * <p>
496: * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
497: *
498: * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
499: */
500: public Attributes parseAttributes() {
501: return source.parseAttributes(begin, end);
502: }
503:
504: /**
505: * Causes the this segment to be ignored when parsing.
506: * <p>
507: * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
508: * <p>
509: * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
510: * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags.
511: * The most common scenario is where the {@linkplain Attributes attributes} of a normal tag uses server tags to dynamically set the values of the attributes.
512: * <p>
513: * As of version 2.4 it is no longer necessary to use this method to ignore {@linkplain StartTagType#SERVER_COMMON common server tags} inside normal tags,
514: * as the attributes parser now automatically ignores common server tags.
515: * <p>
516: * As of version 2.5 it is also unnecessary to use this method to ignore the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
517: * as the parser automatically ignores this content when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
518: * <p>
519: * This leaves only a few scenarios where calling this method still provides a significant benefit.
520: * <p>
521: * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
522: * Here is an example using an XML-style JSP tag:
523: * <blockquote class="code"><code><a href="<i18n:resource path="/Portal"/>?BACK=TRUE">back</a></code></blockquote>
524: * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
525: * as there is no way for the parser to recognise the <code>il8n:resource</code> element as a server tag.
526: * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
527: * but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
528: * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
529: * <p>
530: * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
531: * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
532: * tools such as {@link TextExtractor} and {@link Renderer}.
533: * <p>
534: * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
535: * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
536: * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
537: * and perform the desired operations on this new source object.
538: * <p>
539: * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
540: * <p>
541: * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
542: * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
543: * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
544: * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
545: * <p>
546: * For best performance, this method should be called on all segments that need to be ignored without calling
547: * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
548: *
549: * @see Source#ignoreWhenParsing(Collection segments)
550: */
551: public void ignoreWhenParsing() {
552: source.ignoreWhenParsing(begin, end);
553: }
554:
555: /**
556: * Compares this <code>Segment</code> object to another object.
557: * <p>
558: * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
559: * <p>
560: * A segment is considered to be before another segment if its begin position is earlier,
561: * or in the case that both segments begin at the same position, its end position is earlier.
562: * <p>
563: * Segments that begin and end at the same position are considered equal for
564: * the purposes of this comparison, even if they relate to different source documents.
565: * <p>
566: * Note: this class has a natural ordering that is inconsistent with equals.
567: * This means that this method may return zero in some cases where calling the
568: * {@link #equals(Object)} method with the same argument returns <code>false</code>.
569: *
570: * @param o the segment to be compared
571: * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
572: * @throws ClassCastException if the argument is not a <code>Segment</code>
573: */
574: public int compareTo(final Object o) {
575: if (this == o)
576: return 0;
577: final Segment segment = (Segment) o;
578: if (begin < segment.begin)
579: return -1;
580: if (begin > segment.begin)
581: return 1;
582: if (end < segment.end)
583: return -1;
584: if (end > segment.end)
585: return 1;
586: return 0;
587: }
588:
589: /**
590: * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
591: * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
592: */
593: public final boolean isWhiteSpace() {
594: for (int i = begin; i < end; i++)
595: if (!isWhiteSpace(source.charAt(i)))
596: return false;
597: return true;
598: }
599:
600: /**
601: * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
602: * <p>
603: * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
604: * specifies the following white space characters:
605: * <ul>
606: * <li>space (U+0020)
607: * <li>tab (U+0009)
608: * <li>form feed (U+000C)
609: * <li>line feed (U+000A)
610: * <li>carriage return (U+000D)
611: * <li>zero-width space (U+200B)
612: * </ul>
613: * <p>
614: * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
615: * recognise them as whitespace and renders them as an unprintable character (empty square).
616: * Even zero-width spaces included using the numeric character reference <code>&#x200B;</code> are rendered this way.
617: *
618: * @param ch the character to test.
619: * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
620: */
621: public static final boolean isWhiteSpace(final char ch) {
622: for (int i = 0; i < WHITESPACE.length; i++)
623: if (ch == WHITESPACE[i])
624: return true;
625: return false;
626: }
627:
628: /**
629: * Returns a string representation of this object useful for debugging purposes.
630: * @return a string representation of this object useful for debugging purposes.
631: */
632: public String getDebugInfo() {
633: final StringBuffer sb = new StringBuffer(50);
634: sb.append('(');
635: source.getRowColumnVector(begin).appendTo(sb);
636: sb.append('-');
637: source.getRowColumnVector(end).appendTo(sb);
638: sb.append(')');
639: return sb.toString();
640: }
641:
642: /**
643: * Returns the character at the specified index.
644: * <p>
645: * This is logically equivalent to <code>toString().charAt(index)</code>
646: * for valid argument values <code>0 <= index < length()</code>.
647: * <p>
648: * However because this implementation works directly on the underlying document source string,
649: * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
650: * for an invalid argument value.
651: *
652: * @param index the index of the character.
653: * @return the character at the specified index.
654: */
655: public final char charAt(final int index) {
656: return source.string.charAt(begin + index);
657: }
658:
659: /**
660: * Returns a new character sequence that is a subsequence of this sequence.
661: * <p>
662: * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
663: * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
664: * <p>
665: * However because this implementation works directly on the underlying document source string,
666: * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
667: * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
668: *
669: * @param beginIndex the begin index, inclusive.
670: * @param endIndex the end index, exclusive.
671: * @return a new character sequence that is a subsequence of this sequence.
672: */
673: public final CharSequence subSequence(final int beginIndex,
674: final int endIndex) {
675: return source.string.subSequence(begin + beginIndex, begin
676: + endIndex);
677: }
678:
679: /**
680: * Extracts the textual content from the HTML markup of this segment.
681: * <p>
682: * This method has been deprecated as of version 2.4 and replaced with the {@link #getTextExtractor()} method.
683: *
684: * @return the textual content from the HTML markup of this segment.
685: * @deprecated Use {@link #getTextExtractor()}<code>.</code>{@link TextExtractor#toString() toString()} instead.
686: */
687: public String extractText() {
688: return new TextExtractor(this ).toString();
689: }
690:
691: /**
692: * Extracts the textual content from the HTML markup of this segment.
693: * <p>
694: * This method has been deprecated as of version 2.4 and replaced with the {@link #getTextExtractor()} method.
695: *
696: * @param includeAttributes specifies whether the values of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>, <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>, <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a> attributes are included in the output.
697: * @return the textual content from the HTML markup of this segment.
698: * @deprecated Use {@link #getTextExtractor()}<code>.</code>{@link TextExtractor#setIncludeAttributes(boolean) setIncludeAttributes(includeAttributes)}<code>.</code>{@link TextExtractor#toString() toString()} instead.
699: */
700: public String extractText(final boolean includeAttributes) {
701: return new TextExtractor(this ).setIncludeAttributes(
702: includeAttributes).toString();
703: }
704:
705: /**
706: * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
707: * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
708: */
709: static final StringBuffer appendCollapseWhiteSpace(
710: final StringBuffer sb, final CharSequence text) {
711: final int textLength = text.length();
712: int i = 0;
713: boolean lastWasWhiteSpace = false;
714: while (true) {
715: if (i >= textLength)
716: return sb;
717: if (!isWhiteSpace(text.charAt(i)))
718: break;
719: i++;
720: }
721: do {
722: final char ch = text.charAt(i++);
723: if (isWhiteSpace(ch)) {
724: lastWasWhiteSpace = true;
725: } else {
726: if (lastWasWhiteSpace) {
727: sb.append(' ');
728: lastWasWhiteSpace = false;
729: }
730: sb.append(ch);
731: }
732: } while (i < textLength);
733: return sb;
734: }
735:
736: private Tag checkEnclosure(final Tag tag) {
737: if (tag == null || tag.end > end)
738: return null;
739: return tag;
740: }
741:
742: private CharacterReference findNextCharacterReference(final int pos) {
743: final CharacterReference characterReference = source
744: .findNextCharacterReference(pos);
745: if (characterReference == null || characterReference.end > end)
746: return null;
747: return characterReference;
748: }
749: }
|