Source Code Cross Referenced for Segment.java in » HTML-Parser » jericho-html » au » id » jericho » lib » html » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » HTML Parser » jericho html » au.id.jericho.lib.html
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002:        // Version 2.5
003:        // Copyright (C) 2007 Martin Jericho
004:        // http://jerichohtml.sourceforge.net/
005:        //
006:        // This library is free software; you can redistribute it and/or
007:        // modify it under the terms of either one of the following licences:
008:        //
009:        // 1. The Eclipse Public License (EPL) version 1.0,
010:        // included in this distribution in the file licence-epl-1.0.html
011:        // or available at http://www.eclipse.org/legal/epl-v10.html
012:        //
013:        // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014:        // included in this distribution in the file licence-lgpl-2.1.txt
015:        // or available at http://www.gnu.org/licenses/lgpl.txt
016:        //
017:        // This library is distributed on an "AS IS" basis,
018:        // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019:        // See the individual licence texts for more details.
020:
021:        package au.id.jericho.lib.html;
022:
023:        import java.util.*;
024:
025:        /**
026:         * Represents a segment of a {@link Source} document.
027:         * <p>
028:         * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
029:         * <p>
030:         * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
031:         */
032:        public class Segment implements  Comparable, CharSequence {
033:            final int begin; // position in the source at which this segment begins (see getBegin())
034:            final int end; // position in the source immediately after the end of this segment (see getEnd())
035:            final Source source; // document this segment belongs to; null only for dummy flag instances built by Segment()
036:
037:            List childElements = null; // lazily populated cache used by getChildElements(); null until first built
038:
039:            private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t',
040:                    '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
041:
042:            /**
043:             * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
044:             * @param source  the {@link Source} document, must not be <code>null</code>.
045:             * @param begin  the character position in the source where this segment begins.
046:             * @param end  the character position in the source where this segment ends.
047:             */
048:            public Segment(final Source source, final int begin, final int end) {
049:                // Validate everything before assigning; the span failure now reports the offending positions.
050:                if (begin == -1 || end == -1 || begin > end)
051:                    throw new IllegalArgumentException("invalid segment span: begin=" + begin + ", end=" + end);
052:                if (source == null)
053:                    throw new IllegalArgumentException("source argument must not be null");
054:                this.begin = begin;
055:                this.end = end;
056:                this.source = source;
057:            }
058:
059:            // Only called from Source constructor
060:            Segment(final int length) {
061:                this.source = (Source) this; // the segment acts as its own source document
062:                this.begin = 0;
063:                this.end = length;
064:            }
065:
066:            // Only used for creating dummy flag instances of this type (see Element.NOT_CACHED)
067:            Segment() {
068:                this.source = null; // dummy instances are not attached to any document
069:                this.begin = 0;
070:                this.end = 0;
071:            }
072:
073:            /**
074:             * Returns the character position in the {@link Source} document at which this segment begins.
075:             * @return the character position in the {@link Source} document at which this segment begins.
076:             */
077:            public final int getBegin() {
078:                return this.begin;
079:            }
080:
081:            /**
082:             * Returns the character position in the {@link Source} document immediately after the end of this segment.
083:             * <p>
084:             * The character at the position specified by this property is <b>not</b> included in the segment.
085:             *
086:             * @return the character position in the {@link Source} document immediately after the end of this segment.
087:             */
088:            public final int getEnd() {
089:                return this.end;
090:            }
091:
092:            /**
093:             * Compares the specified object with this <code>Segment</code> for equality.
094:             * <p>
095:             * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
096:             * and both segments have the same {@link Source}, and the same begin and end positions.
097:             * @param object  the object to be compared for equality with this <code>Segment</code>.
098:             * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
099:             */
100:            public final boolean equals(final Object object) {
101:                if (object == this)
102:                    return true;
103:                // instanceof evaluates to false for null, so no separate null test is needed
104:                if (!(object instanceof Segment))
105:                    return false;
106:                final Segment other = (Segment) object;
107:                return source == other.source && begin == other.begin && end == other.end;
108:            }
109:
110:            /**
111:             * Returns a hash code value for the segment.
112:             * <p>
113:             * The current implementation returns the sum of the begin and end positions, although this is not
114:             * guaranteed in future versions.
115:             *
116:             * @return a hash code value for the segment.
117:             */
118:            public int hashCode() {
119:                return this.end + this.begin;
120:            }
121:
122:            /**
123:             * Returns the length of the segment.
124:             * This is defined as the number of characters between the begin and end positions.
125:             * @return the length of the segment.
126:             */
127:            public final int length() {
128:                return getEnd() - getBegin();
129:            }
130:
131:            /**
132:             * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
133:             * <p>
134:             * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
135:             *
136:             * @param segment  the segment to be tested for being enclosed by this segment.
137:             * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
138:             */
139:            public final boolean encloses(final Segment segment) {
140:                return segment.begin >= begin && segment.end <= end;
141:            }
142:
143:            /**
144:             * Indicates whether this segment encloses the specified character position in the source document.
145:             * <p>
146:             * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
147:             *
148:             * @param pos  the position in the {@link Source} document.
149:             * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
150:             */
151:            public final boolean encloses(final int pos) {
152:                return pos >= begin && pos < end;
153:            }
154:
155:            /**
156:             * Returns the source text of this segment as a <code>String</code>.
157:             * <p>
158:             * The returned <code>String</code> is newly created with every call to this method, unless this
159:             * segment is itself an instance of {@link Source}.
160:             * <p>
161:             * Note that before version 2.0 this returned a representation of this object useful for debugging purposes,
162:             * which can now be obtained via the {@link #getDebugInfo()} method.
163:             *
164:             * @return the source text of this segment as a <code>String</code>.
165:             */
166:            public String toString() {
167:                return this.source.string.substring(this.begin, this.end).toString();
168:            }
169:
170:            /**
171:             * Performs a simple rendering of the HTML markup in this segment into text.
172:             * <p>
173:             * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
174:             * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
175:             * 
176:             * @return an instance of {@link Renderer} based on this segment.
177:             * @see #getTextExtractor()
178:             */
179:            public Renderer getRenderer() {
180:                return new Renderer(this); // a new instance is created on every call
181:            }
182:
183:            /**
184:             * Extracts the textual content from the HTML markup of this segment.
185:             * <p>
186:             * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
187:             * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
188:             * <p>
189:             * @return an instance of {@link TextExtractor} based on this segment.
190:             * @see #getRenderer()
191:             */
192:            public TextExtractor getTextExtractor() {
193:                return new TextExtractor(this); // a new instance is created on every call
194:            }
195:
196:            /**
197:             * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
198:             * <p>
199:             * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
200:             * if this method is to be used on a large proportion of the source.
201:             * It is called automatically if this method is called on the {@link Source} object itself.
202:             * <p>
203:             * See the {@link Tag} class documentation for more details about the behaviour of this method.
204:             *
205:             * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
206:             */
207:            public List findAllTags() {
208:                return this.findAllTags(null); // a null tag type places no restriction on the type
209:            }
210:
211:            /**
212:             * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
213:             * <p>
214:             * See the {@link Tag} class documentation for more details about the behaviour of this method.
215:             * <p>
216:             * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #findAllTags()}.
217:             *
218:             * @param tagType  the {@linkplain TagType type} of tags to find.
219:             * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
220:             */
221:            public List findAllTags(final TagType tagType) {
222:                Tag tag = checkEnclosure(Tag.findPreviousOrNextTag(source,
223:                        begin, tagType, false));
224:                if (tag == null)
225:                    return Collections.EMPTY_LIST;
226:                final ArrayList list = new ArrayList();
227:                do {
228:                    list.add(tag);
229:                    tag = checkEnclosure(Tag.findPreviousOrNextTag(source,
230:                            tag.begin + 1, tagType, false));
231:                } while (tag != null);
232:                return list;
233:            }
234:
235:            /**
236:             * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
237:             * <p>
238:             * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
239:             * if this method is to be used on a large proportion of the source.
240:             * It is called automatically if this method is called on the {@link Source} object itself.
241:             * <p>
242:             * See the {@link Tag} class documentation for more details about the behaviour of this method.
243:             *
244:             * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
245:             */
246:            public List findAllStartTags() {
247:                return this.findAllStartTags(null); // a null name places no restriction on the tag name
248:            }
249:
250:            /**
251:             * Returns a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
252:             * <p>
253:             * See the {@link Tag} class documentation for more details about the behaviour of this method.
254:             * <p>
255:             * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllStartTags()}.
256:             * <p>
257:             * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
258:             *
259:             * @param name  the {@linkplain StartTag#getName() name} of the start tags to find.
260:             * @return a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
261:             */
262:            public List findAllStartTags(String name) {
263:                if (name != null)
264:                    name = name.toLowerCase();
265:                final boolean isXMLTagName = Tag.isXMLName(name);
266:                final StartTag first = (StartTag) checkEnclosure(
267:                        StartTag.findPreviousOrNext(source, begin, name, isXMLTagName, false));
268:                if (first == null)
269:                    return Collections.EMPTY_LIST;
270:                final ArrayList startTags = new ArrayList();
271:                StartTag current = first;
272:                while (current != null) {
273:                    startTags.add(current);
274:                    // resume one character past the begin of the tag just collected
275:                    current = (StartTag) checkEnclosure(
276:                            StartTag.findPreviousOrNext(source, current.begin + 1, name, isXMLTagName, false));
277:                }
278:                return startTags;
279:            }
280:
281:            /**
282:             * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair 
283:             * that are {@linkplain #encloses(Segment) enclosed} by this segment.
284:             * <p>
285:             * See the {@link Tag} class documentation for more details about the behaviour of this method.
286:             *
287:             * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
288:             * @param value  the value of the specified attribute to search for, must not be <code>null</code>.
289:             * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
290:             * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
291:             */
292:            public List findAllStartTags(final String attributeName,
293:                    final String value, final boolean valueCaseSensitive) {
294:                StartTag startTag = (StartTag) checkEnclosure(source
295:                        .findNextStartTag(begin, attributeName, value,
296:                                valueCaseSensitive));
297:                if (startTag == null)
298:                    return Collections.EMPTY_LIST;
299:                final ArrayList list = new ArrayList();
300:                do {
301:                    list.add(startTag);
302:                    startTag = (StartTag) checkEnclosure(source
303:                            .findNextStartTag(startTag.begin + 1,
304:                                    attributeName, value, valueCaseSensitive));
305:                } while (startTag != null);
306:                return list;
307:            }
308:
309:            /**
310:             * Returns a list of the immediate children of this segment in the document element hierarchy.
311:             * <p>
312:             * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
313:             * <p>
314:             * An element found at the start of this segment is included in the list.
315:             * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
316:             * which only returns the children of the element.
317:             * <p>
318:             * Calling <code>getChildElements()</code> on an <code>Element</code> is usually more efficient than calling it on a <code>Segment</code>.
319:             * <p>
320:             * The objects in the list are all of type {@link Element}.
321:             * <p>
322:             * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
323:             * if this method is to be used on a large proportion of the source.
324:             * It is called automatically if this method is called on the {@link Source} object itself.
325:             * <p>
326:             * See the {@link Source#getChildElements()} method for more details.
327:             *
328:             * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
329:             * @see Element#getParentElement()
330:             */
331:            public List getChildElements() {
332:                // Serve the cached list when it has already been built.
333:                if (childElements != null)
334:                    return childElements;
335:                // A zero-length segment has no children.
336:                if (length() == 0) {
337:                    childElements = Collections.EMPTY_LIST;
338:                    return childElements;
339:                }
340:                childElements = new ArrayList();
341:                int searchPos = begin;
342:                StartTag startTag = source.findNextStartTag(searchPos);
343:                // Only start tags that begin before the end of this segment are considered.
344:                while (startTag != null && startTag.begin < end) {
345:                    if (!Config.IncludeServerTagsInElementHierarchy
346:                            && startTag.getTagType().isServerTag()) {
347:                        // Excluded server tag: continue searching from the end of the tag itself.
348:                        searchPos = startTag.end;
349:                    } else {
350:                        final Element child = startTag.getElement();
351:                        childElements.add(child);
352:                        child.getChildElements(); // result unused; NOTE(review): presumably pre-builds the nested hierarchy - confirm
353:                        searchPos = child.end; // continue after the whole child element
354:                    }
355:                    startTag = source.findNextStartTag(searchPos);
356:                }
357:                return childElements;
358:            }
359:
360:            /**
361:             * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
362:             * <p>
363:             * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
364:             * if this method is to be used on a large proportion of the source.
365:             * It is called automatically if this method is called on the {@link Source} object itself.
366:             * <p>
367:             * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
368:             *
369:             * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
370:             */
371:            public List findAllElements() {
372:                return this.findAllElements((String) null); // the cast selects the name-based overload
373:            }
374:
375:            /**
376:             * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
377:             * <p>
378:             * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags(String name)} method.
379:             * <p>
380:             * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllElements()}.
381:             * <p>
382:             * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
383:             *
384:             * @param name  the {@linkplain Element#getName() name} of the elements to find.
385:             * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
386:             */
387:            public List findAllElements(String name) {
388:                if (name != null)
389:                    name = name.toLowerCase();
390:                final List startTags = findAllStartTags(name);
391:                if (startTags.isEmpty())
392:                    return Collections.EMPTY_LIST;
393:                final ArrayList elements = new ArrayList(startTags.size());
394:                for (final Iterator i = startTags.iterator(); i.hasNext();) {
395:                    final Element element = ((StartTag) i.next()).getElement();
396:                    // Skip (rather than stop at) an element that runs past the end of this segment:
397:                    // start tags found later, e.g. nested inside it, may still yield fully enclosed elements.
398:                    if (element.end <= end)
399:                        elements.add(element);
400:                }
401:                return elements;
402:            }
403:
404:            /**
405:             * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
406:             * <p>
407:             * The elements returned correspond exactly with the start tags returned in the {@link #findAllTags(TagType)} method.
408:             *
409:             * @param startTagType  the {@linkplain StartTagType type} of start tags to find, must not be <code>null</code>.
410:             * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
411:             */
412:            public List findAllElements(final StartTagType startTagType) {
413:                final List startTags = findAllTags(startTagType);
414:                if (startTags.isEmpty())
415:                    return Collections.EMPTY_LIST;
416:                final ArrayList elements = new ArrayList(startTags.size());
417:                for (final Iterator i = startTags.iterator(); i.hasNext();) {
418:                    final Element element = ((StartTag) i.next()).getElement();
419:                    // Skip (rather than stop at) an element that runs past the end of this segment:
420:                    // start tags found later, e.g. nested inside it, may still yield fully enclosed elements.
421:                    if (element.end <= end)
422:                        elements.add(element);
423:                }
424:                return elements;
425:            }
426:
427:            /**
428:             * Returns a list of all {@link Element} objects with the specified attribute name/value pair 
429:             * that are {@linkplain #encloses(Segment) enclosed} by this segment.
430:             * <p>
431:             * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method.
432:             *
433:             * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
434:             * @param value  the value of the specified attribute to search for, must not be <code>null</code>.
435:             * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
436:             * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
437:             */
438:            public List findAllElements(final String attributeName,
439:                    final String value, final boolean valueCaseSensitive) {
440:                final List startTags = findAllStartTags(attributeName, value,
441:                        valueCaseSensitive);
442:                if (startTags.isEmpty())
443:                    return Collections.EMPTY_LIST;
444:                final ArrayList elements = new ArrayList(startTags.size());
445:                for (final Iterator i = startTags.iterator(); i.hasNext();) {
446:                    final Element element = ((StartTag) i.next()).getElement();
447:                    // Skip (rather than stop at) an element that runs past the end of this segment:
448:                    // start tags found later, e.g. nested inside it, may still yield fully enclosed elements.
449:                    if (element.end <= end)
450:                        elements.add(element);
451:                }
452:                return elements;
453:            }
454:
455:            /**
456:             * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
457:             * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
458:             */
459:            public List findAllCharacterReferences() {
460:                final CharacterReference first = findNextCharacterReference(begin);
461:                if (first == null)
462:                    return Collections.EMPTY_LIST;
463:                final ArrayList references = new ArrayList();
464:                // each search resumes at the end position of the previously found reference
465:                for (CharacterReference ref = first; ref != null; ref = findNextCharacterReference(ref.end)) {
466:                    references.add(ref);
467:                }
468:                return references;
469:            }
470:
471:            /**
472:             * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
473:             * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
474:             */
475:            public List findFormControls() {
476:                return FormControl.findAll(this); // the search itself is delegated to FormControl
477:            }
478:
479:            /**
480:             * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
481:             * <p>
482:             * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #findFormControls()}<code>)</code>.
483:             *
484:             * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
485:             * @see #findFormControls()
486:             */
487:            public FormFields findFormFields() {
488:                return new FormFields(this.findFormControls());
489:            }
490:
491:            /**
492:             * Parses any {@link Attributes} within this segment.
493:             * This method is only used in the unusual situation where attributes exist outside of a start tag.
494:             * The {@link StartTag#getAttributes()} method should be used in normal situations.
495:             * <p>
496:             * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
497:             *
498:             * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
499:             */
500:            public Attributes parseAttributes() {
501:                return this.source.parseAttributes(getBegin(), getEnd());
502:            }
503:
504:            /**
505:             * Causes this segment to be ignored when parsing.
506:             * <p>
507:             * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
508:             * <p>
509:             * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
510:             * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags.
511:             * The most common scenario is where the {@linkplain Attributes attributes} of a normal tag uses server tags to dynamically set the values of the attributes.
512:             * <p>
513:             * As of version 2.4 it is no longer necessary to use this method to ignore {@linkplain StartTagType#SERVER_COMMON common server tags} inside normal tags,
514:             * as the attributes parser now automatically ignores common server tags.
515:             * <p>
516:             * As of version 2.5 it is also unnecessary to use this method to ignore the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
517:             * as the parser automatically ignores this content when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
518:             * <p>
519:             * This leaves only a few scenarios where calling this method still provides a significant benefit.
520:             * <p>
521:             * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
522:             * Here is an example using an XML-style JSP tag:
523:             * <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
524:             * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
525:             * as there is no way for the parser to recognise the <code>il8n:resource</code> element as a server tag.
526:             * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
527:             * but it is nevertheless valid JSP.  The only way to ensure that this library is able to parse the normal tag surrounding it is to
528:             * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
529:             * <p>
530:             * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
531:             * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
532:             * tools such as {@link TextExtractor} and {@link Renderer}.
533:             * <p>
534:             * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
535:             * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
536:             * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
537:             * and perform the desired operations on this new source object.
538:             * <p>
539:             * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
540:             * <p>
541:             * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
542:             * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
543:             * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
544:             * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
545:             * <p>
546:             * For best performance, this method should be called on all segments that need to be ignored without calling
547:             * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
548:             *
549:             * @see Source#ignoreWhenParsing(Collection segments)
550:             */
551:            public void ignoreWhenParsing() {
552:                source.ignoreWhenParsing(begin, end);
553:            }
554:
555:            /**
556:             * Compares this <code>Segment</code> object to another object.
557:             * <p>
558:             * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
559:             * <p>
560:             * A segment is considered to be before another segment if its begin position is earlier,
561:             * or in the case that both segments begin at the same position, its end position is earlier.
562:             * <p>
563:             * Segments that begin and end at the same position are considered equal for
564:             * the purposes of this comparison, even if they relate to different source documents.
565:             * <p>
566:             * Note: this class has a natural ordering that is inconsistent with equals.
567:             * This means that this method may return zero in some cases where calling the
568:             * {@link #equals(Object)} method with the same argument returns <code>false</code>.
569:             *
570:             * @param o  the segment to be compared
571:             * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
572:             * @throws ClassCastException if the argument is not a <code>Segment</code>
573:             */
574:            public int compareTo(final Object o) {
575:                if (this  == o)
576:                    return 0;
577:                final Segment segment = (Segment) o;
578:                if (begin < segment.begin)
579:                    return -1;
580:                if (begin > segment.begin)
581:                    return 1;
582:                if (end < segment.end)
583:                    return -1;
584:                if (end > segment.end)
585:                    return 1;
586:                return 0;
587:            }
588:
589:            /**
590:             * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
591:             * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
592:             */
593:            public final boolean isWhiteSpace() {
594:                for (int i = begin; i < end; i++)
595:                    if (!isWhiteSpace(source.charAt(i)))
596:                        return false;
597:                return true;
598:            }
599:
600:            /**
601:             * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
602:             * <p>
603:             * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
604:             * specifies the following white space characters:
605:             * <ul>
606:             *  <li>space (U+0020)
607:             *  <li>tab (U+0009)
608:             *  <li>form feed (U+000C)
609:             *  <li>line feed (U+000A)
610:             *  <li>carriage return (U+000D)
611:             *  <li>zero-width space (U+200B)
612:             * </ul>
613:             * <p>
614:             * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
615:             * recognise them as whitespace and renders them as an unprintable character (empty square).
616:             * Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
617:             *
618:             * @param ch  the character to test.
619:             * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
620:             */
621:            public static final boolean isWhiteSpace(final char ch) {
622:                for (int i = 0; i < WHITESPACE.length; i++)
623:                    if (ch == WHITESPACE[i])
624:                        return true;
625:                return false;
626:            }
627:
628:            /**
629:             * Returns a string representation of this object useful for debugging purposes.
630:             * @return a string representation of this object useful for debugging purposes.
631:             */
632:            public String getDebugInfo() {
633:                final StringBuffer sb = new StringBuffer(50);
634:                sb.append('(');
635:                source.getRowColumnVector(begin).appendTo(sb);
636:                sb.append('-');
637:                source.getRowColumnVector(end).appendTo(sb);
638:                sb.append(')');
639:                return sb.toString();
640:            }
641:
642:            /**
643:             * Returns the character at the specified index.
644:             * <p>
645:             * This is logically equivalent to <code>toString().charAt(index)</code>
646:             * for valid argument values <code>0 <= index < length()</code>.
647:             * <p>
648:             * However because this implementation works directly on the underlying document source string,
649:             * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
650:             * for an invalid argument value.
651:             *
652:             * @param index  the index of the character.
653:             * @return the character at the specified index.
654:             */
655:            public final char charAt(final int index) {
656:                return source.string.charAt(begin + index);
657:            }
658:
659:            /**
660:             * Returns a new character sequence that is a subsequence of this sequence.
661:             * <p>
662:             * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
663:             * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
664:             * <p>
665:             * However because this implementation works directly on the underlying document source string,
666:             * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
667:             * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
668:             *
669:             * @param beginIndex  the begin index, inclusive.
670:             * @param endIndex  the end index, exclusive.
671:             * @return a new character sequence that is a subsequence of this sequence.
672:             */
673:            public final CharSequence subSequence(final int beginIndex,
674:                    final int endIndex) {
675:                return source.string.subSequence(begin + beginIndex, begin
676:                        + endIndex);
677:            }
678:
679:            /**
680:             * Extracts the textual content from the HTML markup of this segment.
681:             * <p>
682:             * This method has been deprecated as of version 2.4 and replaced with the {@link #getTextExtractor()} method.
683:             *
684:             * @return the textual content from the HTML markup of this segment.
685:             * @deprecated  Use {@link #getTextExtractor()}<code>.</code>{@link TextExtractor#toString() toString()} instead.
686:             */
687:            public String extractText() {
688:                return new TextExtractor(this ).toString();
689:            }
690:
691:            /**
692:             * Extracts the textual content from the HTML markup of this segment.
693:             * <p>
694:             * This method has been deprecated as of version 2.4 and replaced with the {@link #getTextExtractor()} method.
695:             *
696:             * @param includeAttributes  specifies whether the values of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>, <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>, <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a> attributes are included in the output.
697:             * @return the textual content from the HTML markup of this segment.
698:             * @deprecated  Use {@link #getTextExtractor()}<code>.</code>{@link TextExtractor#setIncludeAttributes(boolean) setIncludeAttributes(includeAttributes)}<code>.</code>{@link TextExtractor#toString() toString()} instead.
699:             */
700:            public String extractText(final boolean includeAttributes) {
701:                return new TextExtractor(this ).setIncludeAttributes(
702:                        includeAttributes).toString();
703:            }
704:
705:            /**
706:             * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
707:             * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
708:             */
709:            static final StringBuffer appendCollapseWhiteSpace(
710:                    final StringBuffer sb, final CharSequence text) {
711:                final int textLength = text.length();
712:                int i = 0;
713:                boolean lastWasWhiteSpace = false;
714:                while (true) {
715:                    if (i >= textLength)
716:                        return sb;
717:                    if (!isWhiteSpace(text.charAt(i)))
718:                        break;
719:                    i++;
720:                }
721:                do {
722:                    final char ch = text.charAt(i++);
723:                    if (isWhiteSpace(ch)) {
724:                        lastWasWhiteSpace = true;
725:                    } else {
726:                        if (lastWasWhiteSpace) {
727:                            sb.append(' ');
728:                            lastWasWhiteSpace = false;
729:                        }
730:                        sb.append(ch);
731:                    }
732:                } while (i < textLength);
733:                return sb;
734:            }
735:
736:            private Tag checkEnclosure(final Tag tag) {
737:                if (tag == null || tag.end > end)
738:                    return null;
739:                return tag;
740:            }
741:
742:            private CharacterReference findNextCharacterReference(final int pos) {
743:                final CharacterReference characterReference = source
744:                        .findNextCharacterReference(pos);
745:                if (characterReference == null || characterReference.end > end)
746:                    return null;
747:                return characterReference;
748:            }
749:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.