001: /*
002: * LexML.java
003: *
004: * Brazil project web application Framework,
005: * export version: 1.1
006: * Copyright (c) 1999-2001 Sun Microsystems, Inc.
007: *
008: * Sun Public License Notice
009: *
010: * The contents of this file are subject to the Sun Public License Version
011: * 1.0 (the "License"). You may not use this file except in compliance with
012: * the License. A copy of the License is included as the file "license.terms",
013: * and also available at http://www.sun.com/
014: *
015: * The Original Code is from:
016: * Brazil project web application Framework release 1.1.
017: * The Initial Developer of the Original Code is: cstevens.
018: * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
019: * All Rights Reserved.
020: *
021: * Contributor(s): cstevens, suhler.
022: *
023: * Version: 1.6
024: * Created by cstevens on 99/09/29
025: * Last modified by suhler on 01/01/16 14:19:08
026: */
027:
028: package sunlabs.brazil.util;
029:
030: /**
031: * This class breaks angle-bracket-separated markup languages like SGML, XML,
032: * and HTML into tokens. It understands three types of tokens: <dl>
033: * <dt> tags
034: * <dd> Formally known as "entities", tags are delimited by "<" and
035: * ">". The first word in the tag is the tag name and the
036: * rest of the tag consists of the attributes, a set of
037: * "name=value" or "name" data. Spaces in tags are not significant
038: * except for quoted values in the attributes.
039: *
040: * <dt> string
041: * <dd> Plain strings that are not in angle-brackets. Spaces are
042: * significant and preserved.
043: *
044: * <dt> comments
045: * <dd> Delimited by "<!--" and "-->". All text between the
046: * delimiters is part of the comment. However, by convention,
047: * some comments actually contain data and so the methods that
048: * extract the fields from tags can be used to attempt to extract
049: * the fields from comments, too. Spaces are significant and
050: * preserved in a comment, unless the comment is treated as a
051: * tag, in which the tag rules apply.
052: * </dl>
053: * <p>
054: * This class is intended to parse markup languages, not to validate them.
055: * "Malformed" data is interpreted as graciously as possible, in order to
056: * extract as much information as possible. For instance: spaces are
057: * allowed between the "<" and the tag name, values in tags do not need
058: * to be quoted, and unbalanced quotes are accepted.
059: * <p>
060: * <a name=badquote></a>
061: * One type of "malformed" data specifically not handled is a quoted
062: * ">" character occurring within the body of a tag. Even if it is
063: * quoted, a ">" in the attributes of a tag will be interpreted as the
064: * end of the tag. For example, the single tag <code><img src='foo.jpg'
065: * alt='xyz > abc'></code> will be erroneously broken by
066: * this parser into two tokens: <ul>
067: * <li> the tag <code><img src='foo.jpg' alt='xyz ></code>
068: * <li> the string "abc'>" (and possibly whatever text follows after).
069: * </ul>
070: * Unfortunately, this type of "malformed" data is known to occur regularly.
071: * <p>
072: * This class also may not properly parse all well-formed XML tags, such
073: * as tags with extended paired delimiters <code><&</code> and
074: * <code>&></code>, <code><?</code> and <code>?></code>, or
075: * <code><![CDATA[</code> and <code>]]></code>.
076: * Additionally, XML tags that have embedded comments containing the
077: * ">" character will not be parsed correctly (for example:
078: * <code><!DOCTYPE foo SYSTEM -- a > b -- foo.dtd></code>),
079: * since the ">" in the comment will be interpreted as
080: * the end of declaration tag, for the same reason mentioned
081: * <a href=#badquote>above</a>.
082: *
083: * @author Colin Stevens (colin.stevens@sun.com)
084: * @version 1.6, 01/01/16
085: */
086:
087: public class LexML {
088: /**
089: * The value returned by <code>getType</code> for comment tokens
090: */
091: public static final int COMMENT = 0;
092:
093: /**
094: * The value returned by <code>getType</code> for tag tokens
095: */
096: public static final int TAG = 1;
097:
098: /**
099: * The value returned by <code>getType</code> for string tokens
100: */
101: public static final int STRING = 2;
102:
103: private static final String SPACE = " \t\r\n";
104: private static final String SPACE_EQUAL = SPACE + "=";
105:
106: int type;
107:
108: String str;
109: int strEnd;
110: int tokenStart;
111: int tokenEnd;
112: int tagStart;
113: int tagEnd;
114: int argsStart;
115: int argsEnd;
116:
117: /**
118: * Create a new ML parser, which can be used to iterate over the
119: * tokens in the given string.
120: *
121: * @param str
122: * The ML to parse.
123: */
124: public LexML(String str) {
125: replace(str);
126: }
127:
128: /**
129: * Advances to the next token. The user can then call the other methods
130: * in this class to get information about the new current token.
131: *
132: * @return <code>true</code> if a token was found, <code>false</code>
133: * if there were no more tokens left.
134: */
135: public boolean nextToken() {
136: if (tokenEnd >= strEnd) {
137: return false;
138: }
139:
140: tokenStart = tokenEnd;
141: if (str.startsWith("<!--", tokenStart)) {
142: try {
143: tokenEnd = str.indexOf("-->", tokenStart + 4);
144: } catch (StringIndexOutOfBoundsException e) {
145: tokenEnd = -1;
146: }
147: if (tokenEnd < 0) {
148: str += "-->";
149: tokenEnd = strEnd;
150: strEnd += 3;
151: }
152: tokenEnd += 3;
153: type = COMMENT;
154: } else if (str.charAt(tokenStart) == '<') {
155: tokenEnd = str.indexOf('>', tokenStart);
156: if (tokenEnd < 0) {
157: str += ">";
158: strEnd++;
159: tokenEnd = strEnd;
160: }
161: tokenEnd++;
162: type = TAG;
163: } else {
164: tokenEnd = str.indexOf('<', tokenStart);
165: if (tokenEnd < 0) {
166: tokenEnd = strEnd;
167: }
168: type = STRING;
169: }
170: return true;
171: }
172:
173: /**
174: * Gets the type of the current token.
175: *
176: * @return The type.
177: *
178: * @see #COMMENT
179: * @see #TAG
180: * @see #STRING
181: */
182: public int getType() {
183: return type;
184: }
185:
186: /**
187: * Gets the string making up the whole current token, including the
188: * brackets or comment delimiters, if appropriate.
189: *
190: * @return The current token.
191: */
192: public String getToken() {
193: return str.substring(tokenStart, tokenEnd);
194: }
195:
196: /**
197: * Gets the string making up the current token, not including the angle
198: * brackets or comment delimiters, if appropriate.
199: *
200: * @return The body of the token.
201: */
202: public String getBody() {
203: if (type == TAG) {
204: return str.substring(tokenStart + 1, tokenEnd - 1);
205: } else if (type == COMMENT) {
206: return str.substring(tokenStart + 4, tokenEnd - 3);
207: } else {
208: return str.substring(tokenStart, tokenEnd);
209: }
210: }
211:
212: private void split() {
213: if (tagStart <= tokenStart) {
214: int off = tokenStart + 1;
215: int end = (type == TAG) ? tokenEnd - 1 : tokenEnd - 3;
216:
217: tagStart = skip(SPACE, str, off, end);
218: tagEnd = next(SPACE, str, tagStart, end);
219: argsStart = skip(SPACE, str, tagEnd, end);
220: argsEnd = end;
221: }
222: }
223:
224: /**
225: * Gets the tag name at the beginning of the current tag. In other
226: * words, the tag name for <code><table border=3></code> is
227: * "table". Any surrounding space characters are removed, but the
228: * case of the tag is preserved.
229: * <p>
230: * For comments, the "tag" is the first word in the comment. This can
231: * be used to help parse comments that are structured similar to regular
232: * tags, such as server-side include comments like
233: * <code><!--#include virtual="file.inc"></code>. The tag in
234: * this case would be "!--#include".
235: *
236: * @return The tag name, or <code>null</code> if the current token
237: * was a string.
238: *
239: */
240: public String getTag() {
241: if (type == STRING) {
242: return null;
243: }
244: split();
245: return str.substring(tagStart, tagEnd);
246: }
247:
248: /**
249: * Gets the name/value pairs in the body of the current tag as a
250: * string.
251: *
252: * @return The name/value pairs, or <code>null</code> if
253: * the current token was a string.
254: */
255: public String getArgs() {
256: if (type == STRING) {
257: return null;
258: }
259: split();
260: return str.substring(argsStart, argsEnd);
261: }
262:
263: /**
264: * Gets the name/value pairs in the body of the current tag as a
265: * table.
266: * <p>
267: * Any quote marks in the body, either single or double quotes, are
268: * left on the values, so that the values can be easily re-emitted
269: * and still form a valid body.
270: * <p>
271: * For names that have no associated value in the tag, the value is
272: * stored as the empty string "". Therefore, the two tags
273: * <code><table border></code> and
274: * <code><table border=""></code> cannot be distinguished
275: * based on the result of calling <code>getAttributes</code>.
276: *
277: * @return The table of name/value pairs, or <code>null</code> if
278: * the current token was a string.
279: */
280: public StringMap getAttributes() {
281: if (type == STRING) {
282: return null;
283: }
284:
285: StringMap map = new StringMap();
286:
287: split();
288: int off = argsStart;
289: int end = argsEnd;
290: String token = str;
291:
292: while (off < end) {
293: int nameStart = off;
294: int nameEnd = next(SPACE_EQUAL, token, off + 1, end);
295:
296: String name = token.substring(nameStart, nameEnd);
297:
298: off = skip(SPACE, token, nameEnd, end);
299: if ((off < end) && (token.charAt(off) == '=')) {
300: off = skip(SPACE, token, off + 1, end);
301: if (off < end) {
302: char ch = token.charAt(off);
303: int valueStart = off;
304: int valueEnd;
305:
306: if ((ch == '"') || (ch == '\'')) {
307: off++;
308: if (off < end) {
309: off = token.indexOf(ch, off);
310: if (off < 0) {
311: off = end;
312: }
313: }
314: off++;
315: valueEnd = off;
316: } else {
317: off = next(SPACE, token, off, end);
318: valueEnd = off;
319: }
320: map
321: .add(name, token.substring(valueStart,
322: valueEnd));
323: off = skip(SPACE, token, off, end);
324: continue;
325: }
326: }
327: map.add(name, "");
328: }
329: return map;
330: }
331:
332: /**
333: * Gets the rest of the string that has not yet been parsed.
334: * <p>
335: * Example use: to help the parser in circumstances such as the HTML
336: * "<script>" tag where the script body doesn't the obey the rules
337: * because it might contain lone "<" or ">" characters, which this
338: * parser would interpret as the start or end of funny-looking tags.
339: *
340: * @return The unparsed remainder of the string.
341: *
342: * @see #replace
343: */
344: public String rest() {
345: return str.substring(tokenEnd);
346: }
347:
348: /**
349: * Changes the string that this LexML is parsing.
350: * <p>
351: * Example use: the caller decided to parse part of the body,
352: * and now wants this LexML to pick up and parse the rest of it.
353: *
354: * @param str
355: * The string that this LexML should now parse. Whatever
356: * string this LexML was parsing is forgotten, and it now
357: * starts parsing at the beginning of the new string.
358: *
359: * @see #rest
360: */
361: public void replace(String str) {
362: this .str = str;
363: this .strEnd = str.length();
364: this .tokenStart = 0;
365: this .tokenEnd = 0;
366:
367: this .tagStart = 0;
368: }
369:
370: private int skip(String pattern, String str, int i, int end) {
371: for (; i < end; i++) {
372: if (pattern.indexOf(str.charAt(i)) < 0) {
373: break;
374: }
375: }
376: return i;
377: }
378:
379: private int next(String pattern, String str, int i, int end) {
380: for (; i < end; i++) {
381: if (pattern.indexOf(str.charAt(i)) >= 0) {
382: break;
383: }
384: }
385: return i;
386: }
387: }
|