001: /*
002: * LexHTML.java
003: *
004: * Brazil project web application Framework,
005: * export version: 1.1
006: * Copyright (c) 1999-2000 Sun Microsystems, Inc.
007: *
008: * Sun Public License Notice
009: *
010: * The contents of this file are subject to the Sun Public License Version
011: * 1.0 (the "License"). You may not use this file except in compliance with
012: * the License. A copy of the License is included as the file "license.terms",
013: * and also available at http://www.sun.com/
014: *
015: * The Original Code is from:
016: * Brazil project web application Framework release 1.1.
017: * The Initial Developer of the Original Code is: cstevens.
018: * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
019: * All Rights Reserved.
020: *
021: * Contributor(s): cstevens, suhler.
022: *
023: * Version: 1.9
024: * Created by cstevens on 99/09/29
025: * Last modified by suhler on 00/05/31 13:52:39
026: */
027:
028: package sunlabs.brazil.util;
029:
030: import java.util.Vector;
031:
032: /**
033: * This class breaks up HTML into tokens.
034: * <p>
035: * <a name=intro></a>
036: * This class differs slightly from LexML as follows: after certain tags,
037: * like the <code><script></code> tag, the body that follows is
038: * uninterpreted data and ends only at the next, in this case,
039: * <code></script></code> tag, not at the just the next
040: * "<" or ">" character. This is one way that HTML is not fully
041: * compliant with XML.
042: * <p>
043: * The default set of tags that have this special processing is
044: * <code><script></code>, <code><style></code>, and
045: * <code><xmp></code>. The user can change this by retrieving
046: * the Vector of special tags via
047: * <code>getClosingTags</code>, and modifying it as needed.
048: *
049: * @author Colin Stevens (colin.stevens@sun.com)
050: * @version 1.9, 00/05/31
051: */
052: public class LexHTML extends LexML {
053: static final String[] defaultClosingTags = { "script", "style",
054: "xmp", "server" };
055: Vector closingTags;
056:
057: String bodyTag;
058:
059: /**
060: * Creates a new HTML parser, which can be used to iterate over the
061: * tokens in the given string.
062: *
063: * @param str
064: * The HTML to parse.
065: */
066: public LexHTML(String str) {
067: super (str);
068: closingTags = new Vector();
069: for (int i = 0; i < defaultClosingTags.length; i++) {
070: closingTags.addElement(defaultClosingTags[i]);
071: }
072: }
073:
074: /**
075: * Get the set of HTML tags that have the special body-processing
076: * behavior mentioned <a href=#intro>above</a>. The Vector
077: * is returned; the caller may modify it after calling this method,
078: * which will affect this parser's settings.
079: *
080: * @param tags
081: * The array of case-insensitive tag names that are only
082: * closed by seeing their "slashed" version.
083: */
084: public Vector getClosingTags() {
085: return closingTags;
086: }
087:
088: /**
089: * Advances to the next token, correctly handling HTML tags that have
090: * the special body-processing behavior mentioned <a href=#intro>above</a>.
091: * The user can then call the other methods in this class to get
092: * information about the new current token.
093: * <p>
094: * This method returns the uninterpreted data making up the body of a
095: * special HTML tag as a token of type <code>LexML.STRING</code>, even
096: * if the body was actually a comment or another tag.
097: *
098: * @return <code>true</code> if a token was found, <code>false</code>
099: * if there were no more tokens left.
100: */
101: public boolean nextToken() {
102: if (bodyTag != null) {
103: /*
104: * bodyTag was set when we saw one of the special tags that
105: * are only closed when we see "</" + tag + ">". Look for that
106: * closing tag and return everything between the last tag and
107: * the start of the closing tag as the current token.
108: */
109:
110: String end = "</" + bodyTag + ">";
111: int endLength = end.length();
112:
113: String rest = rest();
114: int restLength = rest.length() - endLength;
115:
116: bodyTag = null;
117: replace(rest);
118:
119: int i;
120: for (i = 0; i < restLength; i++) {
121: if (rest.regionMatches(true, i, end, 0, endLength)) {
122: break;
123: }
124: }
125: if (i <= 0) {
126: return false;
127: }
128: type = STRING;
129: tokenEnd = i;
130: return true;
131: }
132:
133: if (super .nextToken() == false) {
134: return false;
135: }
136: if (getType() == TAG) {
137: String tag = getTag();
138: for (int i = 0; i < closingTags.size(); i++) {
139: if (closingTags.elementAt(i).equals(tag)) {
140: bodyTag = tag;
141: break;
142: }
143: }
144: }
145: return true;
146: }
147:
148: /**
149: * Gets the tag name at the begining of the current tag. In HTML,
150: * tag names are defined as case-insensitive, so the name returned
151: * is converted to lower case for the convenience of the user.
152: *
153: * @return The lower-cased tag name, or <code>null</code> if the
154: * current token does not have a tag name.
155: *
156: * @see LexML#getTag
157: */
158: public String getTag() {
159: String tag = super .getTag();
160: if (tag == null) {
161: return null;
162: }
163: return tag.toLowerCase();
164: }
165:
166: /**
167: * Changes the string that this LexHTML is parsing.
168: *
169: * @param str
170: * The string that this LexHTML should now parse.
171: */
172: public void replace(String str) {
173: this.bodyTag = null;
174: super.replace(str);
175: }
176:
177: }
|