001: /*
002: * Copyright (c) 2001, Jacob Smullyan.
003: *
004: * This is part of SkunkDAV, a WebDAV client. See http://skunkdav.sourceforge.net/
005: * for the latest version.
006: *
007: * SkunkDAV is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License as published
009: * by the Free Software Foundation; either version 2, or (at your option)
010: * any later version.
011: *
012: * SkunkDAV is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015: * General Public License for more details.
016: *
017: * You should have received a copy of the GNU General Public License
018: * along with SkunkDAV; see the file COPYING. If not, write to the Free
019: * Software Foundation, 59 Temple Place - Suite 330, Boston, MA
020: * 02111-1307, USA.
021: */
022:
023: /**
024: * XMLParser.java
025: * a parser with a very minimal feature set
026: * Created: Wed Sep 13 17:54:20 2000
027: *
028: * @author <a href="mailto:smulloni@smullyan.org">Jacob Smullyan</a>
029: * @version $Revision: 1.10 $
030: */package org.skunk.minixml;
031:
032: import java.io.BufferedReader;
033: import java.io.IOException;
034: import java.io.FileReader;
035: import java.io.Reader;
036: import java.io.StreamTokenizer;
037: import java.io.StringReader;
038: import java.util.StringTokenizer;
039:
040: public class XMLParser {
041: public static final int OPEN = 0;
042: public static final int CLOS = 1;
043: public static final int QUES = 2;
044: public static final int EXCL = 3;
045: public static final int SLAS = 4;
046: public static final int EQUA = 5;
047: public static final int QUOT = 6;
048: public static final int DASH = 7;
049: public static final int COLO = 8;
050: public static final int WORD = 9;
051: public static final int EOF = 10;
052: public static final int UNKN = 11;
053: public static final int BRAC = 12;
054:
055: public XMLDocument parse(String s) throws MalformedXMLException,
056: IOException {
057: return parse(s, false);
058: }
059:
060: public XMLDocument parse(Reader reader)
061: throws MalformedXMLException, IOException {
062: return parse(reader, false);
063: }
064:
065: public XMLDocument parse(String s, boolean lenient)
066: throws MalformedXMLException, IOException {
067: return parse(new StringReader(s), lenient);
068: }
069:
070: public XMLDocument parse(Reader reader, boolean lenient)
071: throws MalformedXMLException, IOException {
072: Lexer lex = new Lexer(reader);
073: XMLDocument doc = new XMLDocument(lenient);
074: Object o;
075: while ((o = getNextElement(lex)) != null) {
076: if (o.toString().trim().length() > 0)
077: doc.addElement(o);
078: }
079: return doc;
080: }
081:
082: private Object getNextElement(Lexer lex)
083: throws MalformedXMLException, IOException {
084: int token = lex.getNextToken();
085: Object elem = null;
086: switch (token) {
087: case EOF:
088: //nothing to do, leave elem as null
089: break;
090: case OPEN:
091: int nextTok;
092: while ((nextTok = lex.getNextToken()) != EOF) {
093: if (nextTok == UNKN) {
094: continue;
095: }
096: if (nextTok == QUES) //an instruction; eat up tokens
097: {
098: elem = getProcessingOrXMLInstruction(lex);
099: break;
100: } else if (nextTok == EXCL) {
101: nextTok = lex.getNextToken();
102: if (nextTok == WORD) {
103: lex.pushBack();
104: elem = getDoctypeInstruction(lex);
105: break;
106: } else if (nextTok == DASH) {
107: if (lex.getNextToken() == DASH) //discard next dash
108: {
109: elem = getComment(lex);
110: break;
111: } else
112: throw new MalformedXMLException(
113: "unknown token: \"<!-\"");
114: } else if (nextTok == BRAC) {
115: if ((lex.getNextToken() == WORD)
116: && lex.getTokenString().equals("CDATA")) {
117: nextTok = lex.getNextToken();
118: if (nextTok == BRAC) {
119: elem = getCData(lex);
120: break;
121: } else
122: throw new MalformedXMLException(
123: "unknown token: \"<![CDATA\" without \"[\"");
124:
125: } else
126: throw new MalformedXMLException(
127: "unknown token: \"<![\" without CDATA");
128: }
129: } else if (nextTok == WORD) {
130: lex.pushBack();
131: elem = getElement(lex);
132: break;
133: } else if (nextTok == SLAS) {
134: break;
135: }
136: }
137: break;
138: case SLAS:
139: //fall through to case WORD
140: case QUOT:
141: case EQUA:
142: case DASH:
143: case COLO:
144: case QUES:
145: case EXCL:
146: case WORD:
147: //leave the word to be gathered by the following routine
148: lex.pushBack();
149: elem = getPCData(lex);
150: break;
151: case UNKN:
152: //drop through to default
153: default:
154: return getNextElement(lex);
155: }
156: return elem;
157: }
158:
159: /**
160: *
161: */
162: private XMLPInstruction getProcessingOrXMLInstruction(Lexer lex)
163: throws MalformedXMLException, IOException {
164: String s = lex.readUntil("?>");
165: if (s != null)
166: s = s.trim();
167: return new XMLPInstruction(s);
168: }
169:
170: /**
171: *
172: */
173: private String getDoctypeInstruction(Lexer lex)
174: throws MalformedXMLException, IOException {
175: String s = lex.readUpToCloseTag("<", ">");
176: if (s != null)
177: s = s.trim();
178: return "<!" + s;
179: }
180:
181: /**
182: *
183: */
184: private XMLCData getCData(Lexer lex) throws MalformedXMLException,
185: IOException {
186: String data = lex.readUntil("]]>");
187: return new XMLCData(data);
188: //return "<![CDATA" + lex.readUntil("]]>") + "]]>";
189: }
190:
191: /**
192: *
193: */
194: private XMLComment getComment(Lexer lex)
195: throws MalformedXMLException, IOException {
196: String s = lex.readUntil("-->");
197: if (s != null)
198: s = s.trim();
199: return new XMLComment(s);
200: }
201:
202: /**
203: * Precondition: lexer's next token will return a word
204: */
205: private XMLElement getElement(Lexer lex)
206: throws MalformedXMLException, IOException {
207: String tag = lex.readUntil(">");
208: if (tag.endsWith("/")) //empty element
209: {
210: XMLElement elem = parseOpeningTag(tag.substring(0, tag
211: .length() - 1));
212: return elem;
213: } else {
214: XMLElement elem = parseOpeningTag(tag);
215: Object o;
216: while ((o = getNextElement(lex)) != null
217: && o.toString().trim().length() > 0) {
218: elem.addChild(o);
219: }
220: //should now be on a closing tag
221: lex.readUntil(">");
222: return elem;
223: }
224: }
225:
226: /**
227: * Returns an XMLElement without children, based on an opening xml tag
228: */
229: private XMLElement parseOpeningTag(String tag)
230: throws MalformedXMLException, IOException {
231: XMLElement elem = null;
232: StringTokenizer stk = new StringTokenizer(tag);
233: if (stk.hasMoreTokens()) {
234: String tagName = stk.nextToken();
235: elem = new XMLElement(tagName);
236: while (stk.hasMoreTokens()) {
237: String attrName = stk.nextToken("= \n\r\t");
238: String attrValue = null;
239: if (stk.hasMoreTokens()) {
240: attrValue = stk.nextToken();
241: if (attrValue.equals("=") && stk.hasMoreTokens()) {
242: attrValue = stk.nextToken();
243: }
244: if (attrValue.length() > 1
245: && (attrValue.startsWith("\"") && attrValue
246: .endsWith("\""))
247: || (attrValue.startsWith("'") && attrValue
248: .endsWith("'"))) {
249: attrValue = attrValue.substring(1, attrValue
250: .length() - 1);
251: }
252: }
253: if (attrValue != null)
254: elem.setAttribute(attrName, attrValue);
255: }
256: }
257: return elem;
258: }
259:
260: private String getPCData(Lexer lex) throws MalformedXMLException,
261: IOException {
262: String s = lex.readUntil("<");
263: lex.pushBack();
264: return s.trim();
265: }
266:
267: public static void main(String[] args)
268: throws MalformedXMLException, IOException {
269: if (args.length == 0) {
270: System.out
271: .println("for testing, pass file or files as arguments to be parsed");
272: System.exit(1);
273: }
274: XMLParser parser = new XMLParser();
275: for (int i = 0; i < args.length; i++) {
276: XMLDocument doc = parser.parse(new FileReader(args[i]));
277: XMLViewer.viewDocumentDialog(doc);
278: }
279: System.exit(0);
280: }
281: }
282:
283: class Lexer {
284: private StreamTokenizer st;
285:
286: public Lexer(Reader r) {
287: if (r instanceof BufferedReader)
288: st = new StreamTokenizer(r);
289: else
290: st = new StreamTokenizer(new BufferedReader(r));
291: resetSyntax();
292: }
293:
294: private static final int TT_EOF = StreamTokenizer.TT_EOF;
295: private static final int TT_WORD = StreamTokenizer.TT_WORD;
296: private static final int TT_EOL = StreamTokenizer.TT_EOL;
297:
298: public int getNextToken() throws IOException {
299: int tok = st.nextToken();
300: int retTok = XMLParser.UNKN;
301:
302: switch (tok) {
303: case TT_EOF:
304: retTok = XMLParser.EOF;
305: break;
306: case '<':
307: retTok = XMLParser.OPEN;
308: break;
309: case '>':
310: retTok = XMLParser.CLOS;
311: break;
312: case '?':
313: retTok = XMLParser.QUES;
314: break;
315: case '!':
316: retTok = XMLParser.EXCL;
317: break;
318: case '/':
319: retTok = XMLParser.SLAS;
320: break;
321: case ':':
322: retTok = XMLParser.COLO;
323: break;
324: case '-':
325: retTok = XMLParser.DASH;
326: break;
327: case '=':
328: retTok = XMLParser.EQUA;
329: break;
330: case '"':
331: retTok = XMLParser.QUOT;
332: break;
333: case '[':
334: retTok = XMLParser.BRAC;
335: break;
336: case TT_WORD:
337: retTok = XMLParser.WORD;
338: break;
339: case TT_EOL:
340: //do nothing
341: break;
342: default:
343: System.err.println("unexpected token: " + tok);
344: return getNextToken();
345: }
346: return retTok;
347: }
348:
349: public String getTokenString() {
350: return st.sval;
351: }
352:
353: public void pushBack() {
354: st.pushBack();
355: }
356:
357: /**
358: * this method stops with the first terminator that matches
359: */
360: public String readUntil(String terminator)
361: throws MalformedXMLException, IOException {
362: StringBuffer sb = new StringBuffer();
363: boolean found = false;
364: st.resetSyntax();
365: st.eolIsSignificant(false);
366: while (st.nextToken() != st.TT_EOF) {
367: if (st.ttype == st.TT_WORD)
368: sb.append(st.sval);
369: else
370: sb.append((char) st.ttype);
371: found = endsWith(sb, terminator);
372: if (found)
373: break;
374: }
375: resetSyntax();
376: if (!found)
377: throw new MalformedXMLException("terminator " + terminator
378: + " not found before EOF");
379: String s = sb.toString();
380: return s.substring(0, s.indexOf(terminator));
381: }
382:
383: private boolean endsWith(StringBuffer sb, String terminator) {
384: if (sb == null || terminator == null)
385: return false;
386: int termlen = terminator.length();
387: int sblen = sb.length();
388: if (sblen < termlen)
389: return false;
390: boolean matches = true;
391: int offset = sblen - termlen;
392: for (int i = 0; i < termlen; i++) {
393: matches &= (terminator.charAt(i) == sb.charAt(offset++));
394: if (!matches)
395: break;
396: }
397: return matches;
398: }
399:
400: public String readUpToCloseTag(String openTag, String closeTag)
401: throws MalformedXMLException, IOException {
402: StringBuffer sb = new StringBuffer();
403: String retStr = "";
404: int tagsOpen = 1;
405: st.resetSyntax();
406: st.eolIsSignificant(false);
407: while (st.nextToken() != st.TT_EOF) {
408: if (st.ttype == st.TT_WORD)
409: sb.append(st.sval);
410: else
411: sb.append((char) st.ttype);
412: retStr = sb.toString();
413: if (retStr.endsWith(openTag))
414: tagsOpen++;
415: else if (retStr.endsWith(closeTag))
416: tagsOpen--;
417: if (tagsOpen == 0)
418: break;
419: }
420: this .resetSyntax();
421: if (tagsOpen != 0)
422: throw new MalformedXMLException("unmatched tags at "
423: + st.lineno());
424: return retStr;
425: }
426:
427: private void resetSyntax() {
428: st.resetSyntax();
429: st.eolIsSignificant(false);
430: st.wordChars('a', 'z');
431: st.wordChars('A', 'Z');
432: st.wordChars('0', '9');
433: st.wordChars(35, 38); // # $ % &
434: st.wordChars(40, 46); // ( ) * + , - .
435: // thanks to Chris Knight for this patch:
436: st.wordChars(91, 96); // [ \ ] ^ _ `
437: // end patch
438: st.wordChars(123, 126);
439: st.whitespaceChars(0, 32);
440: }
441:
442: public String toString() {
443: return st.toString();
444: }
445: }
446:
447: /* $Log: XMLParser.java,v $
448: /* Revision 1.10 2001/07/31 22:26:20 smulloni
449: /* included Chris Knight's patch for underscores and other well-beloved
450: /* characters.
451: /*
452: /* Revision 1.9 2001/07/17 03:00:04 smulloni
453: /* added XMLCData class to represent CDATA, and fixed CDATA parsing, which was
454: /* wrong in any case; added license preambles where I had forgotten to put them.
455: /*
456: /* Revision 1.8 2001/07/11 02:15:01 smulloni
457: /* the previous spi build targets have been folded into the main build;
458: /* also added a javadoc target.
459: /*
460: /* Revision 1.7 2000/12/08 05:50:30 smulloni
461: /* fixed MessageCatalogEditor. The spi features are now a special build option,
462: /* and editors are loaded through reflection.
463: /*
464: /* Revision 1.6 2000/12/06 23:59:56 smulloni
465: /* added commented-out code that I mean to check (possible performance
466: /* improvement)
467: /*
468: /* Revision 1.5 2000/11/09 23:35:12 smullyan
469: /* log added to every Java file, with the help of python. Lock stealing
470: /* implemented, and treatment of locks made more robust.
471: /* */
|