001: package org.methodize.nntprss.rss.parser;
002:
003: /* -----------------------------------------------------------
004: * nntp//rss - a bridge between the RSS world and NNTP clients
005: * Copyright (c) 2002, 2003 Jason Brome. All Rights Reserved.
006: *
007: * email: nntprss@methodize.org
008: * mail: Methodize Solutions
009: * PO Box 3865
010: * Grand Central Station
011: * New York NY 10163
012: *
013: * This file is part of nntp//rss
014: *
015: * nntp//rss is free software; you can redistribute it
016: * and/or modify it under the terms of the GNU General
017: * Public License as published by the Free Software Foundation;
018: * either version 2 of the License, or (at your option) any
019: * later version.
020: *
021: * This program is distributed in the hope that it will be
022: * useful, but WITHOUT ANY WARRANTY; without even the implied
023: * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
024: * PURPOSE. See the GNU General Public License for more
025: * details.
026: *
027: * You should have received a copy of the GNU General Public
028: * License along with this program; if not, write to the
029: * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
030: * Boston, MA 02111-1307 USA
031: * ----------------------------------------------------- */
032:
033: import java.io.CharArrayReader;
034: import java.io.CharArrayWriter;
035: import java.io.FileInputStream;
036: import java.io.IOException;
037: import java.io.InputStream;
038: import java.io.InputStreamReader;
039: import java.io.Reader;
040: import java.util.ArrayList;
041: import java.util.List;
042:
043: import javax.swing.text.html.parser.DTD;
044: import javax.swing.text.html.parser.DTDConstants;
045: import javax.swing.text.html.parser.DocumentParser;
046: import javax.xml.parsers.DocumentBuilder;
047: import javax.xml.parsers.ParserConfigurationException;
048:
049: import org.methodize.nntprss.rss.Channel;
050: import org.methodize.nntprss.util.AppConstants;
051: import org.methodize.nntprss.util.XMLHelper;
052: import org.w3c.dom.Document;
053: import org.w3c.dom.Element;
054:
055: /**
056: * @author Jason Brome <jason@methodize.org>
057: * @version $Id: LooseParser.java,v 1.1 2003/03/22 16:32:02 jasonbrome Exp $
058: *
059: * 'Loose' Parser - when enabled, will parse those
060: * not well-formed RSS/RDF xml documents on which the
061: * standard XML parser chokes.
062: *
063: * This first version is an interesting kludge based
064: * upon the HTML parsing capabilities available within
065: * Swing.
066: *
067: * TODO: More tests on internationalization
068: *
069: */
070: public class LooseParser {
071:
072: private static DTD dtd = null;
073:
074: static {
075: try {
076: dtd = DTD.getDTD("html32");
077: } catch (IOException ie) {
078: }
079:
080: dtd.getElement("description");
081: javax.swing.text.html.parser.Element element = dtd
082: .getElement("rss");
083: element.getAttribute("version");
084:
085: dtd.getElement("rdf");
086: dtd.getElement("channel");
087: dtd.getElement("category");
088: dtd.getElement("link");
089: dtd.getElement("language");
090: dtd.getElement("title");
091: dtd.getElement("admin");
092: dtd.getElement("item");
093:
094: dtd.defEntity("lt", DTDConstants.GENERAL, '<');
095: dtd.defEntity("gt", DTDConstants.GENERAL, '>');
096: dtd.defEntity("nbsp", DTDConstants.GENERAL, ' ');
097: dtd.defEntity("amp", DTDConstants.GENERAL, '&');
098: dtd.defEntity("quot", DTDConstants.GENERAL, '"');
099: dtd.defEntity("apos", DTDConstants.GENERAL, '\'');
100: }
101:
102: public static Document parse(InputStream is) throws IOException,
103: ParserConfigurationException {
104:
105: DocumentBuilder db = AppConstants.newDocumentBuilder();
106:
107: Document doc = db.newDocument();
108: Element rootElm = doc.createElement("rss");
109: doc.appendChild(rootElm);
110: Element channelElm = doc.createElement("channel");
111: rootElm.appendChild(channelElm);
112:
113: Reader reader = new InputStreamReader(is);
114:
115: CharArrayWriter caw = new CharArrayWriter();
116:
117: // A little hack to use Swing's parser
118: // This informs the parser that the content is within
119: // the body, and therefore all text should be passed
120: // to the callback.
121:
122: // TODO: Extract XML PI
123: caw.write("<html><body>");
124: char[] buf = new char[1024];
125: int charsRead = reader.read(buf);
126: while (charsRead > -1) {
127: if (charsRead > 0) {
128: caw.write(buf, 0, charsRead);
129: }
130: charsRead = reader.read(buf);
131: }
132: caw.write("</body></html>");
133: caw.flush();
134: caw.close();
135: reader.close();
136:
137: reader = new CharArrayReader(caw.toCharArray());
138:
139: DocumentParser docParser = new DocumentParser(dtd);
140: docParser.parse(reader, new ParserCallback(doc), false);
141:
142: reader.close();
143:
144: return doc;
145:
146: }
147:
148: public static void main(String args[]) {
149: try {
150: Channel tstChannel = new Channel("test",
151: "http://localhost/");
152: List items = new ArrayList();
153:
154: InputStream is = new FileInputStream("c:\\test.xml");
155: Document doc = parse(is);
156:
157: System.out.println("Channel: "
158: + XMLHelper.getChildElementValue((Element) doc
159: .getDocumentElement().getElementsByTagName(
160: "channel").item(0), "title"));
161: System.out.println("Items: "
162: + doc.getDocumentElement().getElementsByTagName(
163: "item").getLength());
164: System.out.println("Version: "
165: + ((Element) doc.getDocumentElement()
166: .getElementsByTagName("channel").item(0))
167: .getAttribute("version"));
168:
169: } catch (Exception e) {
170: e.printStackTrace();
171: }
172: System.out.println("finished...");
173: }
174:
175: }
|