001: /*
002: * Copyright (c) 1997-1999 The Java Apache Project. All rights reserved.
003: *
004: * Redistribution and use in source and binary forms, with or without
005: * modification, are permitted provided that the following conditions
006: * are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright
009: * notice, this list of conditions and the following disclaimer.
010: *
011: * 2. Redistributions in binary form must reproduce the above copyright
012: * notice, this list of conditions and the following disclaimer in
013: * the documentation and/or other materials provided with the
014: * distribution.
015: *
016: * 3. All advertising materials mentioning features or use of this
017: * software must display the following acknowledgment:
018: * "This product includes software developed by the Java Apache
019: * Project for use in the Apache JServ servlet engine project
020: * (http://java.apache.org/)."
021: *
022: * 4. The names "Apache JServ", "Apache JServ Servlet Engine" and
023: * "Java Apache Project" must not be used to endorse or promote products
024: * derived from this software without prior written permission.
025: *
026: * 5. Products derived from this software may not be called "Apache JServ"
027: * nor may "Apache" nor "Apache JServ" appear in their names without
028: * prior written permission of the Java Apache Project.
029: *
030: * 6. Redistributions of any form whatsoever must retain the following
031: * acknowledgment:
032: * "This product includes software developed by the Java Apache
033: * Project for use in the Apache JServ servlet engine project
034: * (http://java.apache.org/)."
035: *
036: * THIS SOFTWARE IS PROVIDED BY THE JAVA APACHE PROJECT "AS IS" AND ANY
037: * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
038: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
039: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE JAVA APACHE PROJECT OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
042: * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
043: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
044: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
045: * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
046: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
047: * OF THE POSSIBILITY OF SUCH DAMAGE.
048: *
049: * This software consists of voluntary contributions made by many
050: * individuals on behalf of the Java Apache Group. For more information
051: * on the Java Apache Project and the Apache JServ Servlet Engine project,
052: * please see <http://java.apache.org/>.
053: */
054:
055: /*
056: * Copyright 2000,2005 wingS development team.
057: *
058: * This file is part of wingS (http://wingsframework.org).
059: *
060: * wingS is free software; you can redistribute it and/or modify
061: * it under the terms of the GNU Lesser General Public License
062: * as published by the Free Software Foundation; either version 2.1
063: * of the License, or (at your option) any later version.
064: *
065: * Please see COPYING for the complete licence.
066: */
067: package org.wings.template.parser;
068:
069: import java.io.IOException;
070: import java.io.Reader;
071: import java.io.StringReader;
072: import java.util.HashMap;
073: import java.util.Iterator;
074: import java.util.LinkedHashMap;
075: import java.util.LinkedList;
076:
077: import org.wings.util.SStringBuilder;
078:
079: /**
080: * Convenient class for parsing SGML tokens from a page.
081: * <p/>
 * <p>This class is optimized for speed, not ease of use.
 * (Though I'd contend it's fairly easy to use anyway!).
084: * <p/>
 * <p>Unlike earlier versions of this class, this one reads
 * its content from a <code>Reader</code> to avoid reading
 * the whole file into a String before parsing it.
 * The Reader is required to support the <code>mark()</code>
 * operation.
090: * <p/>
091: * <p>Tags are only read enough to find out what the tag name is;
092: * If you want to read the full tag call <code>parse(inputReader)</code>.
093: * This is done so that applications don't spend time processing
094: * tags about which they care little.
095: * <p/>
096: * <p>Here's a sample piece of code which uses this class to read
097: * all SGML tags on a page:
098: * <p/>
099: * <pre>
100: * void showTags(PrintWriter out, Reader input)
101: * {
102: * SGMLTag tag = new SGMLTag(input);
103: * while (!tag.finished()) {
104: * out.println ("tag: " + tag.toString());
105: * tag = new SGMLTag (input);
106: * }
107: * }
108: * </pre>
109: *
110: * @author <a href="mailto:williams@ugsolutions.com">Tim Williams</a>
111: * @author <a href="mailto:zeller@think.de">Henner Zeller</a>
112: */
113:
114: /*
115: * TODO: (hen)
116: * - read incomplete TAGs <input type="checkbox" checked> => checked=1
117: */
118:
119: public class SGMLTag {
120: public final static char singleQuote = '\'';
121: public final static char doubleQuote = '\"';
122:
123: /**
124: * Name of this SGML tag, in uppercase format.
125: * This is only public for compatibility reasons.
126: */
127: private String name = null;
128:
129: /**
130: * The token that closes this tag.
131: * Different for SSI and SGML tags.
132: */
133: private String closeTag = null;
134:
135: /**
136: * Number of characters skipped <
137: */
138: private int offset = 0;
139:
140: /* These attributes are to be compatible with the 'old'
141: * SGMLTag using Strings
142: */
143: private int start = 0;
144: private int end = 0;
145:
146: // private stuff
147: private LinkedList attrs = null; // tag attributes (mixed)
148: private LinkedHashMap values = null; // tag attribute values (uc)
149: private boolean wellFormed = true; // looks good?
150: private boolean attr_ready = false;
151:
152: // comment delimitation
153: static final String COMMENT_START = "!--", COMMENT_END = "-->";
154: static final String SSI_START = COMMENT_START + "#",
155: SSI_END = COMMENT_END;
156:
157: /**
158: * for historical reasons only; behaves like the
159: * old SGMLTag().
160: */
161: private SGMLTag(String textContent, int begin) {
162: PositionReader r = new PositionReader(new StringReader(
163: textContent));
164: try {
165: r.skip(begin);
166: offset = begin;
167: searchStart(r);
168: start = offset;
169: // do a full parse here; since the usage of the
170: // String based SGMLTag() is deprecated this
171: // performance penalty doesn't matter
172: parse(r);
173: } catch (IOException reading_from_string_should_never_fail) {
174: offset = -1;
175: }
176: end = (int) r.getPosition();
177: }
178:
179: /**
180: * Create new SGML tag reference, starting at current location
181: * of the Reader.
182: * At first, only the type of tag (first argument) is read if
183: * <code>parseIt</code> is false.
184: * Tag may not be well-formed: if interested, call "parse(input)"
185: * directly afterwards (without reading any characters
186: * from the Reader) to get the attributes.
187: * <p/>
188: * <p>Note that this constructor skips over any HTML-style comments,
189: * as denoted by matched <tt><--</tt> ... <tt>--></tt> pairs.
190: *
191: * @param input the Reader being parsed for SGML tags
192: * @param parseIt boolean which denotes if SGMLTag should be
193: * parsed fully
194: * @see #attributes
195: */
196: public SGMLTag(Reader input, boolean parseIt) throws IOException {
197: searchStart(input);
198: if (parseIt)
199: readAttributes(input);
200: }
201:
202: /**
203: * Create new SGML tag reference, starting at current location
204: * of the Reader. Read all attributes.
205: * <p/>
206: * <p>Note that this constructor skips over any HTML-style comments,
207: * as denoted by matched <tt><--</tt> ... <tt>--></tt> pairs.
208: *
209: * @param input the Reader being parsed for SGML tags
210: * @see #attributes
211: */
212: public SGMLTag(Reader input) throws IOException {
213: this (input, true);
214: }
215:
216: public void parse(Reader input) throws IOException {
217: readAttributes(input);
218: }
219:
220: /**
221: * Skip over any HTML-style comments,
222: * as denoted by matched <tt><--</tt> ... <tt>--></tt> pairs.
223: *
224: * @param input the reader being parsed for SGMLtags
225: */
226: protected void searchStart(Reader input) throws IOException {
227: int c = 0, num;
228: char buff[] = new char[8]; // must at least hold the length of COMMENT_(START|END)
229: String cmpStr;
230:
231: // skipping over comments, find first tag
232: while (true) {
233: // find starting character of SGML tag
234: while (c >= 0 && c != '<') {
235: c = input.read();
236: offset++;
237: }
238: if (c == -1) {
239: offset = -1;
240: return;
241: } // EOF
242: offset--;
243:
244: /* -- check if we just found a comment
245: * <!--# - SSI Commands start just like
246: * ordinary comments, so we've to make sure
247: * that exclude these (<!--) but not those (<!--#)
248: */
249: input.mark(SSI_START.length());
250: int pos;
251: num = 0;
252: for (pos = 0; pos >= 0 && num < SSI_START.length(); num += pos)
253: pos = input.read(buff, pos, SSI_START.length() - pos);
254: if (pos == -1) {
255: offset = -1;
256: return;
257: } // EOF
258:
259: cmpStr = new String(buff, 0, num);
260: if (SSI_START.equals(cmpStr)
261: || !(cmpStr.startsWith(COMMENT_START))) {
262: input.reset();
263: break; // No comment .. real start of a SGML / SSI Tag
264: }
265:
266: /*
267: * ok, we got an comment; but since we read SSI_START length
268: * characters, we've to reset and just read COMMENT_START so
269: * we're in a defined state ..
270: */
271: input.reset();
272: num = 0;
273: for (pos = 0; pos >= 0 && num < COMMENT_START.length(); num += pos)
274: pos = input.read(buff, pos, COMMENT_START.length()
275: - pos);
276: // since length(COMMENT_START) < length(SSI_START) (which we
277: // already successfully read), we don't have to check for EOF here
278:
279: offset += COMMENT_START.length() + 1; // +1 for the starting '<'
280: // otherwise skip extent of commented area
281: boolean endOfComment = false;
282: int len = 0, ringHead = 0;
283: int checkpos, p;
284: while (!endOfComment) {
285: c = input.read();
286: if (c == -1) {
287: offset = -1;
288: return;
289: } // EOF
290: len++;
291: offset++;
292: // since we don't have '-1' here anymore, cast is save:
293: buff[ringHead] = (char) c; // buffer is a ringbuffer
294: if (len >= COMMENT_END.length()) {
295: // compare, beginning from the last position backward
296: for (checkpos = ringHead + buff.length, p = COMMENT_END
297: .length() - 1; p >= 0; --checkpos, --p) {
298: if (COMMENT_END.charAt(p) != buff[checkpos
299: % buff.length])
300: break;
301: }
302: endOfComment = (p == -1);
303: }
304: ringHead = (++ringHead) % buff.length;
305: }
306:
307: }
308:
309: // get the name
310: // do not skip Whitespaces, since the Tagname must
311: // start just after the '<'
312: name = nextToken(input, false);
313: if (name != null)
314: name = name.toUpperCase();
315:
316: // set the token that closes this tag
317: if (name != null && name.startsWith(SSI_START)) {
318: closeTag = SSI_END; // SSI tag
319: } else {
320: closeTag = ">"; // SGML tag
321: }
322: }
323:
324: /**
325: * Checked whether this tag indicates we're at the end of the list.
326: * Note: The end tag is not usuable as an SGML tag.
327: *
328: * @return true if this tag represents end of tags, and is not usuable
329: */
330: public boolean finished() {
331: return offset == -1 && name == null;
332: }
333:
334: /**
335: * Check name of tag.
336: * (Comparision is case-insensitive.)
337: *
338: * @return true if passed tag matches this one.
339: */
340: public boolean isNamed(String name) {
341: return this .name != null
342: && this .name.equals(name.toUpperCase());
343: }
344:
345: /**
346: * Check for well-formedness of this tag.
347: * Note that calling this method causes rest of tag to be parsed.
348: *
349: * @return true if tag is a well-formed SGML tag, false otherwise
350: */
351: public boolean isWellFormed() {
352: if (name == null)
353: return false;
354: if (!attr_ready || values == null)
355: return false;
356: return wellFormed;
357: }
358:
359: /**
360: * returns the number of chars skipped before the
361: * starting '<'
362: */
363: public int getOffset() {
364: return offset;
365: }
366:
367: /**
368: * get the Name of this SGML tag, in uppercase format.
369: * For example, P for paragraph, B for bold, etc.
370: * This value is set to null when whitespace or another
371: * problem was encountered where the tag would be.
372: */
373: public String getName() {
374: return name;
375: }
376:
377: /**
378: * Get list of attribute names.
379: *
380: * @param upperCase true returns names in all uppercase (good for
381: * case-insensitive applications), false returns attribute names
382: * with same case as in original text
383: * @return enumeration of attribute names specified as strings,
384: * or null if this tag is poorly formed
385: */
386: public Iterator attributes(boolean upperCase) {
387: // check to make sure attributes have been read
388: if (!isWellFormed())
389: return null;
390:
391: // or return uppercase names?
392: if (upperCase) {
393: return values.keySet().iterator();
394: } else {
395: return attrs.iterator();
396: }
397: }
398:
399: /**
400: * Get attribute value, or default if not set.
401: * Case is ignored, <tt>value("a")</tt> will return the same
402: * result as <tt>value("A")</tt>. Note also that if wish to
403: * check whether value was set, you can pass <tt>null</tt>
404: * as the defaultValue.
405: *
406: * @param attributeName attribute for which to check
407: * @param defaultValue value if attribute unset
408: * @return value of attribute, or defaultValue if not available
409: */
410: public String value(String attributeName, String defaultValue) {
411: if (!isWellFormed())
412: return null;
413: String value = (String) values.get(attributeName.toUpperCase());
414: return value == null ? defaultValue : value;
415: }
416:
417: /**
418: * Attempt to read attributes from tag if not already read.
419: *
420: * @return true if everything was read fine, false otherwise
421: */
422: private boolean readAttributes(Reader input) throws IOException {
423: // just try to read Attributes once
424:
425: if (attr_ready)
426: return wellFormed && values != null;
427: attr_ready = true;
428:
429: if (values == null && wellFormed) {
430: String key = null, token;
431: wellFormed = false;
432: attrs = new LinkedList();
433: values = new LinkedHashMap();
434:
435: while (true) {
436: // check for valid value tag (or end delimiter)
437: if (key == null)
438: key = nextToken(input);
439:
440: // close-Tag
441: if (key != null && key.equals(closeTag)) {
442: wellFormed = true;
443: break;
444: }
445:
446: // close-Tag
447: if (key != null && key.equals("/>")) {
448: wellFormed = true;
449: break;
450: }
451:
452: // 'key'-part
453: if (key == null || isDelimiter(key.charAt(0))
454: || key.charAt(0) == doubleQuote
455: || key.charAt(0) == singleQuote)
456: break;
457:
458: // ok, we have a key. Now insure that we have an equals sign
459: token = nextToken(input);
460: if (token == null || token.charAt(0) != '=') {
461: attrs.add(key);
462: if (token == null)
463: break;
464: key = token; // this token is the next key
465: continue;
466: }
467:
468: // read value of tag
469: token = nextToken(input);
470: if (token == null || isDelimiter(token.charAt(0)))
471: break;
472:
473: // strip quotes
474: if (token.charAt(0) == doubleQuote
475: || token.charAt(0) == singleQuote)
476: token = token.substring(1, token.length() - 1);
477:
478: // store attribute name with original case
479: String upperCase = key.toUpperCase();
480: if (!values.containsKey(upperCase))
481: attrs.add(key);
482:
483: // store assignment in case-insensitive manner
484: values.put(upperCase, token);
485: key = null; // clear this key; next token is our next key.
486: }
487: }
488: return wellFormed && values != null;
489: }
490:
491: /**
492: * Read next token from string.
493: * A token is a space-delimited word, a string in quotes
494: * (returned with quotes), a delimiter such as a greater-than,
495: * less-than, or equals sign.
496: * Quotes marks inside quoted strings may be escaped with a
497: * backslash (\) character.
498: *
499: * @return next token, or null if whitespace was encountered
500: */
501: public String nextToken(Reader input) throws IOException {
502: return nextToken(input, true);
503: }
504:
505: /**
506: * Read next token from string.
507: * A token is a space-delimited word, a string in quotes
508: * (returned with quotes), a delimiter such as a greater-than,
509: * less-than, or equals sign.
510: * Quotes marks inside quoted strings may be escaped with a
511: * backslash (\) character.
512: *
513: * @return next token, or null if whitespace was encountered
514: */
515: public String nextToken(Reader input, boolean skipWhitespaces)
516: throws IOException {
517: SStringBuilder token = new SStringBuilder();
518:
519: if (skipWhitespaces)
520: skipWhiteSpace(input);
521:
522: input.mark(1);
523: int c = input.read();
524:
525: if (c == -1) {
526: offset = -1;
527: return null;
528: }
529:
530: // quoted string? (handle both single and double)
531: if (c == doubleQuote || c == singleQuote) {
532: boolean inSingle = false;
533: boolean inDouble = false;
534: if (c == singleQuote)
535: inSingle = true;
536: else
537: inDouble = true;
538: token.append((char) c);
539: do {
540: c = input.read();
541: if (c == -1) {
542: offset = -1;
543: String reportString = token.toString();
544: if (reportString.length() > 30) {
545: reportString = reportString.substring(0, 30)
546: + " (truncated, length is "
547: + reportString.length() + ")";
548: }
549: throw new IOException("EOF in String: "
550: + reportString);
551: }
552: if (c == '\\') {
553: int quoted = input.read();
554: if (quoted >= 0)
555: token.append((char) quoted);
556: } else
557: token.append((char) c);
558: } while ((inDouble && c != doubleQuote)
559: || (inSingle && c != singleQuote));
560: }
561:
562: // parameter delimiter? read just one
563: else if (isDelimiter((char) c)) {
564: token.append((char) c);
565: }
566:
567: // Inserted for token "-->".
568: // Like a word token, but includes the delimiter ">".
569: else if (c == '-') {
570: do {
571: token.append((char) c);
572: input.mark(1);
573: c = input.read();
574: } while (c >= 0 && !Character.isWhitespace((char) c)
575: && !isDelimiter((char) c));
576: input.reset();
577: token.append((char) input.read());
578: }
579:
580: // If we did not skip Whitespaces but actually got one
581: // this token is empty.
582: else if (!skipWhitespaces && Character.isWhitespace((char) c)) {
583: input.reset();
584: return null;
585: }
586:
587: // word token or />
588: else {
589: do {
590: token.append((char) c);
591: input.mark(1);
592: c = input.read();
593: } while (c >= 0 && !Character.isWhitespace((char) c)
594: && !isDelimiter((char) c));
595: if (token.length() == 1 && token.charAt(0) == '/')
596: token.append((char) c);
597: else
598: input.reset();
599: }
600: return token.toString();
601: }
602:
603: /**
604: * could be overwritten
605: */
606: public static int skipWhiteSpace(Reader r) throws IOException {
607: int c, len = 0;
608: do {
609: r.mark(1);
610: c = r.read();
611: len++;
612: } while (c >= 0 && Character.isWhitespace((char) c));
613: r.reset();
614: return len - 1;
615: }
616:
617: /**
618: * Return value of attribute (parameter) setting in SGML tag.
619: * @param key name (uppercase) of attribute for which to check
620: * @param defaultValue value if attribute unset
621: * @deprecated use <tt>attributes()</tt> and <tt>value()</tt> instead
622: * @see #attributes
623: * @see #value
624: * @return value of that attribute, or default if not defined
625: */
626: public String getAttribute(String key, String defaultValue) {
627: return value(key, defaultValue);
628: }
629:
630: /**
631: * Return tag attributes and values.
632: * @return parameter key / value pairs
633: * @deprecated use <tt>attributes()</tt> and <tt>value()</tt> instead
634: * @see #attributes
635: * @see #value
636: */
637: public HashMap getAttributes() {
638: return isWellFormed() ? values : null;
639: }
640:
641: /**
642: * Decide whether character is SGML delimiter or equals.
643: *
644: * @param c character in question
645: * @return true if character is an SGML delimiter
646: */
647: private static boolean isDelimiter(char c) {
648: return c == '<' || c == '=' || c == '>';
649: }
650:
651: /**
652: * Render this tag as a string.
653: *
654: * @return SGML tag as string, showing range and values
655: */
656: public String toString() {
657: SStringBuilder str = new SStringBuilder();
658: str.append("[SGMLTag ").append(name).append(": (").append(
659: getOffset()).append(",---)");
660: if (attrs != null && wellFormed) {
661: Iterator iter = attributes(true);
662: while (iter.hasNext()) {
663: String key = (String) iter.next();
664: str.append(" ").append(key).append("=\"").append(
665: value(key, null)).append("\"");
666: }
667: } else {
668: str.append(" *MALFORMED TAG*");
669: }
670: str.append(" ]");
671: return str.toString();
672: }
673: }
|