001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.util.Enumeration;
036:
/**
 * Element in an HTML page. An element runs from a start tag
 * (like &lt;ul&gt;) to its matching end tag (&lt;/ul&gt;),
 * inclusive.
 * An element may have an optional end tag (like &lt;p&gt;),
 * in which case the element runs up to (but not including)
 * the tag that implicitly closes it. For example:
 * <PRE>&lt;p&gt;Paragraph 1&lt;p&gt;Paragraph 2</PRE>
 * contains two elements, <PRE>&lt;p&gt;Paragraph 1</PRE>
 * and <PRE>&lt;p&gt;Paragraph 2</PRE>.
 */
048: public class Element extends Region {
049:
050: protected Tag startTag;
051: protected Tag endTag;
052:
053: protected Element sibling; // next sibling
054: protected Element parent;
055: protected Element child; // first child
056:
057: /**
058: * Make an Element from a start tag and end tag. The tags
059: * must be on the same Page.
060: * @param startTag Start tag of element
061: * @param endTag End tag of element (may be null)
062: */
063: public Element(Tag startTag, Tag endTag) {
064: super (startTag.source, startTag.start,
065: endTag != null ? endTag.end : startTag.end);
066: this .startTag = startTag;
067: this .endTag = endTag;
068: }
069:
070: /**
071: * Make an Element from a start tag and an end position. Used
072: * when the end tag has been omitted (like <p>, frequently).
073: * @param startTag Start tag of element
074: * @param end Ending offset of element
075: */
076: public Element(Tag startTag, int end) {
077: super (startTag.source, startTag.start, end);
078: this .startTag = startTag;
079: this .endTag = null;
080: }
081:
082: /**
083: * Get tag name.
084: * @return tag name (like "p"), in lower-case, String.intern()'ed form.
085: * Thus you can compare tag names with ==, as in:
086: * <CODE>getTagName() == Tag.IMG</CODE>.
087: */
088: public String getTagName() {
089: return startTag.getTagName();
090: }
091:
092: /**
093: * Get start tag.
094: * @return start tag of element
095: */
096: public Tag getStartTag() {
097: return startTag;
098: }
099:
100: /**
101: * Get end tag.
102: * @return end tag of element, or null if element has no end tag.
103: */
104: public Tag getEndTag() {
105: return endTag;
106: }
107:
108: /**
109: * Get element's parent.
110: * @return element that contains this element, or null if at top-level.
111: */
112: public Element getParent() {
113: return parent;
114: }
115:
116: /**
117: * Get element's next sibling.
118: * @return element that follows this element, or null if at end of
119: * parent's children.
120: */
121: public Element getSibling() {
122: return sibling;
123: }
124:
125: /**
126: * Get element's first child.
127: * @return first element contained by this element, or null if no children.
128: */
129: public Element getChild() {
130: return child;
131: }
132:
133: /**
134: * Return next element in an inorder walk of the tree,
135: * assuming this element and its children have been visited.
136: * @return next element
137: */
138: public Element getNext() {
139: if (sibling != null)
140: return sibling;
141: else if (parent != null)
142: return parent.getNext();
143: else
144: return null;
145: }
146:
147: /**
148: * Test if tag has an HTML attribute.
149: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
150: * converted with toHTMLAttributeName().
151: * @return true if tag has the attribute, false if not
152: */
153: public boolean hasHTMLAttribute(String name) {
154: return startTag.hasHTMLAttribute(name);
155: }
156:
157: /**
158: * Get an HTML attribute's value.
159: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
160: * converted with toHTMLAttributeName().
161: * @return value of attribute if it exists, TRUE if the attribute exists but has no value, or null if tag lacks the attribute.
162: */
163: public String getHTMLAttribute(String name) {
164: return startTag.getHTMLAttribute(name);
165: }
166:
167: /**
168: * Get an HTML attribute's value, with a default value if it doesn't exist.
169: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
170: * converted with toHTMLAttributeName().
171: * @param defaultValue default value to return if the attribute
172: * doesn't exist
173: * @return value of attribute if it exists, TRUE if the attribute exists but has no value, or defaultValue if tag lacks the attribute.
174: */
175: public String getHTMLAttribute(String name, String defaultValue) {
176: return startTag.getHTMLAttribute(name, defaultValue);
177: }
178:
179: /**
180: * Enumerate the HTML attributes found on this tag.
181: * @return enumeration of the attribute names found on this tag.
182: */
183: public Enumeration enumerateHTMLAttributes() {
184: return startTag.enumerateHTMLAttributes();
185: }
186:
187: }
|