001: /*
002: * HtmlRewriter.java
003: *
004: * Brazil project web application Framework,
005: * export version: 1.1
006: * Copyright (c) 1999-2000 Sun Microsystems, Inc.
007: *
008: * Sun Public License Notice
009: *
010: * The contents of this file are subject to the Sun Public License Version
011: * 1.0 (the "License"). You may not use this file except in compliance with
012: * the License. A copy of the License is included as the file "license.terms",
013: * and also available at http://www.sun.com/
014: *
015: * The Original Code is from:
016: * Brazil project web application Framework release 1.1.
017: * The Initial Developer of the Original Code is: cstevens.
018: * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
019: * All Rights Reserved.
020: *
021: * Contributor(s): cstevens, suhler.
022: *
023: * Version: 1.9
024: * Created by cstevens on 99/09/29
025: * Last modified by suhler on 00/12/27 12:12:09
026: */
027:
028: package sunlabs.brazil.handler;
029:
030: import sunlabs.brazil.util.LexHTML;
031: import sunlabs.brazil.util.StringMap;
032:
033: import java.util.Enumeration;
034:
035: /**
036: * This class helps with parsing and rewriting an HTML document. The
037: * source document is not changed; a new HTML document is built.
038: * <p>
039: * The user can sequentially examine and rewrite each token in the source
040: * HTML document. As each token in the document is seen, the user has
041: * two choices: <ul>
042: * <li> modify the current token.
043: * <li> don't modify the current token.
044: * </ul>
045: * If the user modifies (or replaces, deletes, etc.) the current token,
046: * then the resultant HTML document will contain that modification. On
047: * the other hand, if the user doesn't do anything with the current token,
048: * it will appear, unchanged, in the resultant HTML document.
049: * <p>
050: * Parsing is implemented lazily, meaning, for example, that unless the
051: * user actually asks for attributes of an HTML tag, this parser
052: * does not have to spend the time breaking up the attributes.
053: * <p>
054: * This class is used by HTML filters to maintain the state of the
055: * document and allow the filters to perform arbitrary rewriting.
056: *
057: * @author Colin Stevens (colin.stevens@sun.com)
058: * @version 1.9, 00/12/27
059: */
060: public class HtmlRewriter {
061: /**
062: * The parser for the source HTML document.
063: */
064: public LexHTML lex;
065:
066: /**
067: * Storage holding the resultant HTML document.
068: */
069: public StringBuffer sb;
070:
071: /**
072: * <code>true</code> if the last token was pushed back and should be
073: * presented again next time. Made <code>false</code> once the
074: * pushedback token is presented.
075: */
076: boolean pushback;
077:
078: /**
079: * <code>true</code> if <code>nextToken</code> should automatically
080: * append unmodified tokens to the result.
081: */
082: boolean accumulate;
083:
084: /**
085: * <code>true</code> if the user has already explicitly appended
086: * something, so <code>nextToken</code> shouldn't append the
087: * unmodified token.
088: */
089: boolean appendToken;
090:
091: /**
092: * <code>true</code> if the user has modified the tag name or
093: * attributes of the current tag, so when this tag is appended, we
094: * need to write out its parts rather than just emitting the raw token.
095: */
096: boolean tokenModified;
097:
098: int type;
099: String token;
100: String tag;
101: StringMap map;
102:
103: /**
104: * Creates a new <code>HtmlRewriter</code> from the given HTML parser.
105: *
106: * @param lex
107: * The HTML parser.
108: */
109: public HtmlRewriter(LexHTML lex) {
110: this .lex = lex;
111:
112: sb = new StringBuffer();
113: accumulate = true;
114: }
115:
116: /**
117: * Creates a new <code>HtmlRewriter</code> that will operate on the given
118: * string.
119: *
120: * @param str
121: * The HTML document.
122: */
123: public HtmlRewriter(String str) {
124: this (new LexHTML(str));
125: }
126:
127: /**
128: * Returns the "new" rewritten HTML document. This is normally called
129: * once all of the tokens have been processed, and the user wants to
130: * send on this rewritten document.
131: * <p>
132: * At any time, this method can be called to return the current state
133: * of the HTML document. The return value is the result of
134: * processing the source document up to this point in time; the
135: * unprocessed remainder of the source document is not considered.
136: * <p>
137: * Due to the implementation, calling this method may be expensive.
138: * Specifically, calling this method a second (or further) time for
139: * a given <code>HtmlRewriter</code> may involve copying temporary
140: * strings around. The pessimal case would be to call this method
141: * every time a new token is appended.
142: *
143: * @return The rewritten HTML document, up to this point in time.
144: */
145: public String toString() {
146: return sb.toString();
147: }
148:
149: /**
150: * Advances to the next token in the source HTML document.
151: * <p>
152: * The other purpose of this function is to "do the right thing", which
153: * is to append the token we just processed to the resultant HTML
154: * document, unless the user has already appended something else.
155: * <p>
156: * A sample program follows. This program changes all
157: * <code><img></code> tags to <code><form></code> tags,
158: * deletes all <code><table></code> tags, capitalizes
159: * and bolds each string token, and passes all other tokens through
160: * unchanged, to illustrate how <code>nextToken</code> interacts with
161: * some of the other methods in this class.
162: * <pre>
163: * HtmlRewriter hr = new HtmlRewriter(str);
164: * while (hr.nextToken()) {
165: * switch (hr.getType()) {
166: * case LexHTML.TAG:
167: * if (hr.getTag().equals("img")) {
168: * // Change the tag name w/o affecting the attributes.
169: *
170: * hr.setTag("form");
171: * } else if (hr.getTag().equals("table")) {
172: * // Eliminate the entire "table" token.
173: *
174: * hr.killToken();
175: * }
176: * break;
177: *
178: * case LexHTML.STRING:
179: * // Append a new sequence in place of the existing token.
180: *
181: * hr.append("<b>" + hr.getToken().toUpperCase() + "</b>");
182: * break;
183: * }
184: * // Any tokens we didn't modify get copied through unchanged.
185: * }
186: * </pre>
187: *
188: * @return <code>true</code> if there are tokens left to process,
189: * <code>false</code> otherwise.
190: */
191: public boolean nextToken() {
192: if (pushback) {
193: pushback = false;
194: return true;
195: }
196:
197: if (appendToken && accumulate) {
198: appendToken();
199: }
200:
201: token = null;
202: tag = null;
203: map = null;
204:
205: appendToken = true;
206: tokenModified = false;
207:
208: if (lex.nextToken()) {
209: type = lex.getType();
210: return true;
211: }
212: return false;
213: }
214:
215: /**
216: * A convenence method built on top of <code>nextToken</code>.
217: * Advances to the next HTML tag. All intervening strings and comments
218: * between the last tag and the new current tag are copied through
219: * unchanged. This method can be used when the caller wants to process
220: * only HTML tags, without having to manually check the type of each
221: * token to see if it is actually a tag.
222: *
223: * @return <code>true</code> if there are tokens left to process,
224: * <code>false</code> otherwise.
225: */
226: public boolean nextTag() {
227: while (nextToken()) {
228: if (getType() == LexHTML.TAG) {
229: return true;
230: }
231: }
232: return false;
233: }
234:
235: /**
236: * Gets the type of the current token.
237: *
238: * @return The type.
239: *
240: * @see LexHTML#getType
241: */
242: public int getType() {
243: return type;
244: }
245:
246: /**
247: * Sets the type of the current token.
248: */
249: public void setType(int type) {
250: this .type = type;
251: tokenModified = true;
252: }
253:
254: /**
255: * Gets the raw string making up the entire current token, including
256: * the angle brackets or comment delimiters, if applicable.
257: *
258: * @return The current token.
259: *
260: * @see LexHTML#getToken
261: */
262: public String getToken() {
263: if (token == null) {
264: token = lex.getToken();
265: }
266: return token;
267: }
268:
269: /**
270: * Gets the current tag's name. The name returned is converted to
271: * lower case.
272: *
273: * @return The lower-cased tag name, or <code>null</code> if the
274: * current token does not have a tag name
275: *
276: * @see LexHTML#getTag
277: */
278: public String getTag() {
279: if (tag == null) {
280: tag = lex.getTag();
281: }
282: return tag;
283: }
284:
285: /**
286: * Changes the current tag's name. The tag's attributes are not changed.
287: *
288: * @param tag
289: * New tag name
290: */
291: public void setTag(String tag) {
292: this .tag = tag;
293: tokenModified = true;
294: }
295:
296: /**
297: * Gets the body of the current token as a string.
298: *
299: * @return The body.
300: *
301: * @see LexHTML#getBody
302: */
303: public String getBody() {
304: return lex.getBody();
305: }
306:
307: /**
308: * Gets the arguments of the current token as a string.
309: *
310: * @return The body.
311: *
312: * @see LexHTML#getArgs
313: */
314: public String getArgs() {
315: return lex.getArgs();
316: }
317:
318: /**
319: * Returns the value that the specified case-insensitive key maps
320: * to in the attributes for the current tag. For keys that were
321: * present in the tag's attributes without a value, the value returned
322: * is the empty string. In other words, for the tag
323: * <code><table border rows=2></code>: <ul>
324: * <li> <code>get("border")</code> returns the empty string "".
325: * <li> <code>get("rows")</code> returns <i>2</i>.
326: * </ul>
327: * <p>
328: * Surrounding single and double quote marks that occur in the literal
329: * tag are removed from the values reported. So, for the tag
330: * <code><a href="/foo.html" target=_top onclick='alert("hello")'></code>: <ul>
331: * <li> <code>get("href")</code> returns <i>/foo.html</i> .
332: * <li> <code>get("target")</code> returns <i>_top</i> .
333: * <li> <code>get("onclick")</code> returns <i>alert("hello")</i> .
334: * </ul>
335: *
336: * @param The key to lookup in the current tag's attributes.
337: *
338: * @return The value to which the specified key is mapped, or
339: * <code>null</code> if the key was not in the attributes.
340: *
341: * @see LexHTML#getAttributes
342: */
343: public String get(String key) {
344: getAttributes();
345:
346: String str = map.get(key);
347: if (str == null) {
348: return null;
349: }
350:
351: /*
352: * Strip off the quote marks, if necessary.
353: */
354:
355: int length = str.length();
356: if (length < 2) {
357: return str;
358: }
359: char ch = str.charAt(0);
360: if (((ch == '"') || (ch == '\''))
361: && (str.charAt(length - 1) == ch)) {
362: return str.substring(1, length - 1);
363: }
364: return str;
365: }
366:
367: /**
368: * Maps the given case-insensitive key to the specified value in the
369: * current tag's attributes.
370: * <p>
371: * The value can be retrieved by calling <code>get</code> with a
372: * key that is case-insensitive equal to the given key.
373: * <p>
374: * If the attributes already contained a mapping for the given key,
375: * the old value is forgotten and the new specified value is used.
376: * The case of the prior key is retained in that case. Otherwise
377: * the case of the new key is used and a new mapping is made.
378: *
379: * @param key
380: * The new key. May not be <code>null</code>.
381: *
382: * @param value
383: * The new value. May be not be <code>null</code>.
384: */
385: public void put(String key, String value) {
386: getAttributes();
387: map.put(key, quote(value));
388: tokenModified = true;
389: }
390:
391: /**
392: * Removes the given case-insensitive key and its corresponding value
393: * from the current tag's attributes. This method does nothing if the
394: * key is not in the attributes.
395: *
396: * @param key
397: * The key that needs to be removed. Must not be
398: * <code>null</code>.
399: */
400: public void remove(String key) {
401: getAttributes();
402: map.remove(key);
403: tokenModified = true;
404: }
405:
406: /**
407: * Returns an enumeration of the keys in the current tag's attributes.
408: * The elements of the enumeration are the string keys. The keys can
409: * be passed to <code>get</code> to get the values of the attributes.
410: *
411: * @return An enumeration of the keys.
412: */
413: public Enumeration keys() {
414: getAttributes();
415: return map.keys();
416: }
417:
418: /**
419: * Instead of modifying an existing token, this method allows the user
420: * to completely replace the current token with arbitrary new content.
421: * <p>
422: * This method may be called multiple times while processing the current
423: * token to add more and more data to the resultant HTML document.
424: * Before and/or after calling this method, the <code>appendToken</code>
425: * method may also be called explicitly in order to add the current token
426: * to the resultant HTML document.
427: * <p>
428: * Following is sample code illustrating how to use this method
429: * to put bold tags around all the <code><a></code> tags.
430: * <pre>
431: * HtmlRewriter hr = new HtmlRewriter(str);
432: * while (hr.nextTag()) {
433: * if (hr.getTag().equals("a")) {
434: * hr.append("<b>");
435: * hr.appendToken();
436: * } else if (hr.getTag().equals("/a")) {
437: * hr.appendToken();
438: * hr.append("</b>");
439: * }
440: * }
441: * </pre>
442: * The calls to <code>appendToken</code> are necessary. Otherwise,
443: * the <code>HtmlRewriter</code> could not know where and when to
444: * append the existing token in addition to the new content provided
445: * by the user.
446: *
447: * @param str
448: * The new content to append. May be <code>null</code>,
449: * in which case no new content is appended (the equivalent
450: * of appending "").
451: *
452: * @see #appendToken
453: * @see #killToken
454: */
455: public void append(String str) {
456: if (str != null) {
457: sb.append(str);
458: }
459: appendToken = false;
460: }
461:
462: /**
463: * Appends the current token to the resultant HTML document.
464: * If the caller has changed the current token using the
465: * <code>setTag</code>, <code>set</code>, or <code>remove</code>
466: * methods, those changes will be reflected.
467: * <p>
468: * By default, this method is automatically called after each token is
469: * processed unless the user has already appended something to the
470: * resultant HTML document. Therefore, if the user appends something
471: * and also wants to append the current token, or if the user wants
472: * to append the current token a number of times, this method must
473: * be called.
474: *
475: * @see #append
476: * @see #killToken
477: */
478: public void appendToken() {
479: appendToken = false;
480: if (tokenModified) {
481: getTag();
482: getAttributes();
483:
484: if (getType() == LexHTML.COMMENT) {
485: sb.append("<--");
486: } else {
487: sb.append('<');
488: }
489: sb.append(tag);
490: int length = map.size();
491: for (int i = 0; i < length; i++) {
492: sb.append(' ').append(map.getKey(i));
493: String value = map.get(i);
494: if ((value != null) && (value.length() > 0)) {
495: sb.append('=').append(value);
496: }
497: }
498: if (getType() == LexHTML.COMMENT) {
499: sb.append("-->");
500: } else {
501: sb.append('>');
502: }
503: } else {
504: sb.append(getToken());
505: }
506: }
507:
508: /**
509: * Tells this <code>HtmlRewriter</code> not to append the current token
510: * to the resultant HTML document. Even if the user hasn't appended
511: * anything else, the current token will be ignored rather than appended.
512: *
513: * @see #append
514: * @see #killToken
515: */
516: public void killToken() {
517: appendToken = false;
518: }
519:
520: /**
521: * Turns on or off the automatic accumulation of each token.
522: * <p>
523: * After each token is processed, the current token is appended to
524: * to the resultant HTML document unless the user has already appended
525: * something else. By setting <code>accumulate</code> to
526: * <code>false</code>, this behavior is turned off. The user must then
527: * explicitly call <code>appendToken</code> to cause the current token
528: * to be appended.
529: * <p>
530: * Turning off accumulation takes effect immediately, while turning
531: * on accumulation takes effect on the next token. In other words,
532: * whether the user turns this setting off or on, the current token
533: * will not be added to the resultant HTML document unless the user
534: * explicitly calls <code>appendToken</code>.
535: * <p>
536: * Following is sample code that illustrates how to use this method
537: * to extract the contents of the <code><head></code> of the
538: * source HTML document.
539: * <pre>
540: * HtmlRewriter hr = new HtmlRewriter(str);
541: * // Don't accumulate tokens until we see the <head> below.
542: * hr.accumulate(false);
543: * while (hr.nextTag()) {
544: * if (hr.getTag().equals("head")) {
545: * // Start remembering the contents of the HTML document,
546: * // not including the <head> tag itself.
547: *
548: * hr.accumulate(true);
549: * } else if (hr.getTag().equals("/head")) {
550: * // Return everything accumulated so far.
551: *
552: * return hr.toString();
553: * }
554: * }
555: * </pre>
556: * This method can be called any number of times while processing
557: * the source HTML document.
558: *
559: * @param accumulate
560: * <code>true</code> to automatically accumulate tokens in the
561: * resultant HTML document, <code>false</code> to require
562: * that the user explicitly accumulate them.
563: * @return The previous accumulate setting
564: *
565: * @see #reset
566: */
567: public boolean accumulate(boolean accumulate) {
568: boolean was = this .accumulate;
569: this .accumulate = accumulate;
570: appendToken = false;
571: return was;
572: }
573:
574: /**
575: * Forgets all the tokens that have been appended to the resultant
576: * HTML document so far, including the current token.
577: */
578: public void reset() {
579: sb.setLength(0);
580: appendToken = false;
581: }
582:
583: /**
584: * Puts the current token back. The next time <code>nextToken</code>
585: * is called, it will be the current token again, rather than
586: * advancing to the next token in the source HTML document.
587: * <p>
588: * This is useful when a code fragment needs to read an indefinite
589: * number of tokens, but that once some distinguished token is found,
590: * needs to push that token back so that normal processing can occur
591: * on that token.
592: */
593: public void pushback() {
594: pushback = true;
595: }
596:
597: /**
598: * Helper class to quote a attribute's value when the value is being
599: * written to the resultant HTML document. Values set by the
600: * <code>put</code> method are automatically quoted as needed. This
601: * method is provided in case the user is dynamically constructing a new
602: * tag to be appended with <code>append</code> and needs to quote some
603: * arbitrary values.
604: * <p>
605: * The quoting algorithm is as follows: <br>
606: * If the string contains double-quotes, put single quotes around it. <br>
607: * If the string contains single-quotes or spaces, put double-quotes
608: * around it.
609: * <p>
610: * This algorithm is, of course, insufficient for complicated
611: * strings that include both single and double quotes. In that case,
612: * it is the user's responsibility to escape the special characters
613: * in the string using the HTML special symbols like
614: * <code>&quot;</code> or <code>&#34;</code>
615: *
616: * @return The quoted string, or the original string if it did not
617: * need to be quoted.
618: */
619: public static String quote(String str) {
620: if (str.indexOf('\"') >= 0) {
621: return "\'" + str + "\'";
622: } else if (str.indexOf('\'') >= 0) {
623: return "\"" + str + "\"";
624: } else if (str.indexOf(' ') >= 0) {
625: return "\"" + str + "\"";
626: } else {
627: return str;
628: }
629: }
630:
631: private void getAttributes() {
632: if (map == null) {
633: map = lex.getAttributes();
634: }
635: }
636: }
|