001:/*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033:package websphinx;
034:
035:import java.util.Enumeration;
036:import rcm.enum.ArrayEnumeration;
037:
038:/**
039: * Tag in an HTML page.
040: */
041:public class Tag extends Region {
042:
043: String tagName;
044: boolean startTag;
045: String[] htmlAttributes;// HTML attributes on this tag (lower case and interned)
046: Element element;
047:
048: /**
049: * Make a Tag.
050: * @param page Page containing tag
051: * @param start Starting offset of tag in page
052: * @param end Ending offset of tag
053: * @param tagName Name of tag (like "p")
054: * @param startTag true for start tags (like "<p>"), false for end tags ("</p>")
055: */
056: public Tag (Page page, int start, int end, String tagName, boolean startTag) {
057: super (page, start, end);
058: this .tagName = tagName.toLowerCase ().intern ();
059: this .startTag = startTag;
060: this .htmlAttributes = null;
061: }
062:
063: /**
064: * Get tag name.
065: * @return tag name (like "p"), in lower-case, String.intern()'ed form.
066: */
067: public String getTagName () {
068: return tagName;
069: }
070:
071: /**
072: * Get element to which this tag is the start or end tag.
073: * @return element, or null if tag has no element.
074: */
075: public Element getElement () {
076: return element;
077: }
078:
079: /**
080: * Convert a String to a tag name. Tag names are lower-case, intern()'ed
081: * Strings. Thus you can compare tag names with ==, as in:
082: * <CODE>getTagName() == Tag.IMG</CODE>.
083: * @param name Name to convert (e.g., "P")
084: * @return tag name (e.g. "p"), in lower-case, String.intern()'ed form.
085: */
086: public static String toTagName (String name) {
087: return name.toLowerCase().intern ();
088: }
089:
090: /**
091: * Test if tag is a start tag. Equivalent to !isEndTag().
092: * @return true if and only if tag is a start tag (like "<P>")
093: */
094: public boolean isStartTag () {
095: return startTag;
096: }
097:
098: /**
099: * Test if tag is an end tag. Equivalent to !isStartTag().
100: * @return true if and only if tag is a start tag (like "</P>")
101: */
102: public boolean isEndTag () {
103: return !startTag;
104: }
105:
106: /**
107: * Test if tag is a block-level tag. Equivalent to !isFlowTag().
108: * @return true if and only if tag is a block-level tag (like "<P>")
109: */
110: public boolean isBlockTag () {
111: return HTMLParser.blocktag.containsKey (tagName);
112: }
113:
114: /**
115: * Test if tag is a flow-level tag. Equivalent to !isBlockTag().
116: * @return true if and only if tag is a block-level tag (like "<A>")
117: */
118: public boolean isFlowTag () {
119: return !isBlockTag ();
120: }
121:
122: /**
123: * Test if tag belongs in the <HEAD> element.
124: * @return true if and only if tag is a HEAD-level tag (like "<TITLE>")
125: */
126: public boolean isHeadTag () {
127: return HTMLParser.headtag.containsKey (tagName);
128: }
129:
130: /**
131: * Test if tag belongs in the <BODY> element.
132: * @return true if and only if tag is a BODY-level tag (like "<A>")
133: */
134: public boolean isBodyTag () {
135: return !isHeadTag()
136: && tagName != HTML
137: && tagName != HEAD
138: && tagName != BODY;
139: }
140:
141: /**
142: * Convert a String to an HTML attribute name. Attribute names are
143: * lower-case, intern()'ed
144: * Strings. Thus you can compare attribute names with ==.
145: * @param name Name to convert (e.g., "HREF")
146: * @return tag name (e.g. "href"), in lower-case, String.intern()'ed form.
147: */
148: public static String toHTMLAttributeName (String name) {
149: return name.toLowerCase ().intern ();
150: }
151:
152: /**
153: * Test if tag has an HTML attribute.
154: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
155: * converted with toHTMLAttributeName().
156: * @return true if tag has the attribute, false if not
157: */
158: public boolean hasHTMLAttribute (String name) {
159: if (htmlAttributes == null)
160: return false;
161: name = toHTMLAttributeName (name);
162: for (int i=0; i<htmlAttributes.length; ++i)
163: if (htmlAttributes[i] == name)
164: return true;
165: return false;
166: }
167:
168: /**
169: * Get an HTML attribute's value.
170: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
171: * converted with toHTMLAttributeName().
172: * @return value of attribute if it exists, TRUE if the attribute exists but has no value, or null if tag lacks the attribute.
173: */
174: public String getHTMLAttribute (String name) {
175: if (htmlAttributes == null)
176: return null;
177: name = toHTMLAttributeName (name);
178: for (int i=0; i<htmlAttributes.length; ++i)
179: if (htmlAttributes[i] == name)
180: return getLabel (name);
181: return null;
182: }
183:
184: /**
185: * Get an HTML attribute's value, with a default value if it doesn't exist.
186: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
187: * converted with toHTMLAttributeName().
188: * @param defaultValue default value to return if the attribute
189: * doesn't exist
190: * @return value of attribute if it exists, TRUE if the attribute exists but has no value, or defaultValue if tag lacks the attribute.
191: */
192: public String getHTMLAttribute (String name, String defaultValue) {
193: String val = getHTMLAttribute (name);
194: return val != null ? val : defaultValue;
195: }
196:
197: /**
198: * Get number of HTML attributes on this tag.
199: * @return number of HTML attributes
200: */
201: public int countHTMLAttributes () {
202: return htmlAttributes != null ? htmlAttributes.length : 0;
203: }
204:
205: /**
206: * Get all the HTML attributes found on this tag.
207: * @return array of name-value pairs, alternating between
208: * names and values. Thus array[0] is a name, array[1] is a value,
209: * array[2] is a name, etc.
210: */
211: public String[] getHTMLAttributes () {
212: if (htmlAttributes == null)
213: return new String[0];
214:
215: String[] result = new String[htmlAttributes.length * 2];
216: for (int i=0, j=0; i<htmlAttributes.length; ++i) {
217: String name = htmlAttributes[i];
218: result[j++] = name;
219: result[j++] = getLabel (name);
220: }
221: return result;
222: }
223:
224: /**
225: * Enumerate the HTML attributes found on this tag.
226: * @return enumeration of the attribute names found on this tag.
227: */
228: public Enumeration enumerateHTMLAttributes () {
229: return new ArrayEnumeration (htmlAttributes);
230: }
231:
232: /**
233: * Copy this tag, removing an HTML attribute.
234: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
235: * converted with toHTMLAttributeName().
236: * @return copy of this tag with named attribute removed. The copy is
237: * a region of a fresh page containing only the tag.
238: */
239: public Tag removeHTMLAttribute (String name) {
240: return replaceHTMLAttribute (name, null);
241: }
242:
243: /**
244: * Copy this tag, setting an HTML attribute's value to TRUE.
245: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
246: * converted with toHTMLAttributeName().
247: * @return copy of this tag with named attribute set to TRUE. The copy is
248: * a region of a fresh page containing only the tag.
249: */
250: public Tag replaceHTMLAttribute (String name) {
251: return replaceHTMLAttribute (name, TRUE);
252: }
253:
254: /**
255: * Copy this tag, setting an HTML attribute's value.
256: * @param name Name of HTML attribute (e.g. "HREF"). Doesn't have to be
257: * converted with toHTMLAttributeName().
258: * @param value New value for the attribute
259: * @return copy of this tag with named attribute set to value.
260: * The copy is
261: * a region of a fresh page containing only the tag.
262: */
263: public Tag replaceHTMLAttribute (String name, String value) {
264: name = toHTMLAttributeName (name);
265:
266: if (!startTag)
267: return this ; // illegal!
268:
269: StringBuffer newstr = new StringBuffer ();
270: String[] newattrs = null;
271:
272: newstr.append ('<');
273: newstr.append (tagName);
274:
275: boolean foundit = false;
276:
277: int len = htmlAttributes.length;
278: for (int i=0; i < len; ++i) {
279: String attrName = htmlAttributes[i];
280: String attrVal;
281:
282: // FIX: entity-encode attrVal
283: if (attrName == name) {
284: newattrs = htmlAttributes;
285: foundit = true;
286: if (value == null)
287: continue;
288:
289: attrVal = value;
290: }
291: else
292: attrVal = getLabel (attrName);
293:
294: newstr.append (' ');
295: newstr.append (attrName);
296: if (attrVal != TRUE) {
297: newstr.append ('=');
298: if (attrVal.indexOf ('"') == -1) {
299: newstr.append ('"');
300: newstr.append (attrVal);
301: newstr.append ('"');
302: }
303: else {
304: newstr.append ('\'');
305: newstr.append (attrVal);
306: newstr.append ('\'');
307: }
308: }
309: }
310: if (!foundit && value != null) {
311: // add new attribute at end
312: newstr.append (' ');
313: newstr.append (name);
314: if (value != name) {
315: newstr.append ('=');
316: if (value.indexOf ('"') == -1) {
317: newstr.append ('"');
318: newstr.append (value);
319: newstr.append ('"');
320: }
321: else {
322: newstr.append ('\'');
323: newstr.append (value);
324: newstr.append ('\'');
325: }
326: }
327:
328: // append name to list of attribute names
329: newattrs = new String[len + 1];
330: System.arraycopy (htmlAttributes, 0, newattrs, 0, len);
331: newattrs[len] = name;
332: }
333:
334: newstr.append ('>');
335:
336: Tag newTag = new Tag (new Page (newstr.toString()), 0,
337: newstr.length(), tagName, startTag);
338: newTag.names = names;
339: newTag.htmlAttributes = newattrs;
340: newTag.setLabel (name, value);
341:
342: return newTag;
343: }
344:
345: /**
346: * Commonly useful tag names.
347: * Derived from <a href="http://www.sandia.gov/sci_compute/elements.html">HTML Elements</a>
348: * at Sandia National Labs.
349: */
350:
351: public static final String A = "a".intern();
352: public static final String ABBREV = "abbrev".intern();
353: public static final String ACRONYM = "acronym".intern();
354: public static final String ADDRESS = "address".intern();
355: public static final String APPLET = "applet".intern();
356: public static final String AREA = "area".intern();
357: public static final String B = "b".intern();
358: public static final String BASE = "base".intern();
359: public static final String BASEFONT = "basefont".intern();
360: public static final String BDO = "bdo".intern();
361: public static final String BGSOUND = "bgsound".intern();
362: public static final String BIG = "big".intern();
363: public static final String BLINK = "blink".intern();
364: public static final String BLOCKQUOTE = "blockquote".intern();
365: public static final String BODY = "body".intern();
366: public static final String BR = "br".intern();
367: public static final String CAPTION = "caption".intern();
368: public static final String CENTER = "center".intern();
369: public static final String CITE = "cite".intern();
370: public static final String CODE = "code".intern();
371: public static final String COL = "col".intern();
372: public static final String COLGROUP = "colgroup".intern();
373: public static final String COMMENT = "!".intern();
374: public static final String DD = "dd".intern();
375: public static final String DEL = "del".intern();
376: public static final String DFN = "dfn".intern();
377: public static final String DIR = "dir".intern();
378: public static final String DIV = "div".intern();
379: public static final String DL = "dd".intern();
380: public static final String DT = "dt".intern();
381: public static final String EM = "em".intern();
382: public static final String EMBED = "embed".intern();
383: public static final String FONT = "font".intern();
384: public static final String FRAME = "frame".intern();
385: public static final String FRAMESET = "frameset".intern();
386: public static final String FORM = "form".intern();
387: public static final String H1 = "h1".intern();
388: public static final String H2 = "h2".intern();
389: public static final String H3 = "h3".intern();
390: public static final String H4 = "h4".intern();
391: public static final String H5 = "h5".intern();
392: public static final String H6 = "h6".intern();
393: public static final String HEAD = "head".intern();
394: public static final String HR = "hr".intern();
395: public static final String HTML = "html".intern();
396: public static final String I = "i".intern();
397: public static final String IMG = "img".intern();
398: public static final String INPUT = "input".intern();
399: public static final String ISINDEX = "isindex".intern();
400: public static final String KBD = "kbd".intern();
401: public static final String LI = "li".intern();
402: public static final String LINK = "link".intern();
403: public static final String LISTING = "listing".intern();
404: public static final String MAP = "map".intern();
405: public static final String MARQUEE = "marquee".intern();
406: public static final String MENU = "menu".intern();
407: public static final String META = "meta".intern();
408: public static final String NEXTID = "nextid".intern();
409: public static final String NOBR = "nobr".intern();
410: public static final String NOEMBED = "noembed".intern();
411: public static final String NOFRAMES = "noframes".intern();
412: public static final String OBJECT = "object".intern();
413: public static final String OL = "ol".intern();
414: public static final String OPTION = "option".intern();
415: public static final String P = "p".intern();
416: public static final String PARAM = "param".intern();
417: public static final String PLAINTEXT = "plaintext".intern();
418: public static final String PRE = "pre".intern();
419: public static final String SAMP = "samp".intern();
420: public static final String SCRIPT = "script".intern();
421: public static final String SELECT = "select".intern();
422: public static final String SMALL = "small".intern();
423: public static final String SPACER = "spacer".intern();
424: public static final String STRIKE = "strike".intern();
425: public static final String STRONG = "strong".intern();
426: public static final String STYLE = "style".intern();
427: public static final String SUB = "sub".intern();
428: public static final String SUP = "sup".intern();
429: public static final String TABLE = "table".intern();
430: public static final String TD = "td".intern();
431: public static final String TEXTAREA = "textarea".intern();
432: public static final String TH = "th".intern();
433: public static final String TITLE = "title".intern();
434: public static final String TR = "tr".intern();
435: public static final String TT = "tt".intern();
436: public static final String U = "u".intern();
437: public static final String UL = "ul".intern();
438: public static final String VAR = "var".intern();
439: public static final String WBR = "wbr".intern();
440: public static final String XMP = "xmp".intern();
441:
442: /**
443: * Length of longest tag name.
444: */
445: public static int MAX_LENGTH = 10; // longest tag name is BLOCKQUOTE
446:}
|