001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.jetspeed.rewriter.xml;
018:
019: import java.io.ByteArrayInputStream;
020: import java.io.ByteArrayOutputStream;
021: import java.io.IOException;
022: import java.io.Reader;
023: import java.io.Writer;
024: import java.net.URL;
025: import java.util.HashMap;
026: import java.util.Map;
027:
028: import javax.xml.parsers.ParserConfigurationException;
029: import javax.xml.parsers.SAXParser;
030: import javax.xml.parsers.SAXParserFactory;
031:
032: import org.apache.commons.logging.Log;
033: import org.apache.commons.logging.LogFactory;
034: import org.apache.jetspeed.rewriter.ParserAdaptor;
035: import org.apache.jetspeed.rewriter.Rewriter;
036: import org.apache.jetspeed.rewriter.RewriterException;
037: import org.apache.jetspeed.rewriter.MutableAttributes;
038: import org.apache.jetspeed.util.Streams;
039: import org.xml.sax.InputSource;
040: import org.xml.sax.SAXException;
041: import org.xml.sax.helpers.DefaultHandler;
042:
043: /**
044: * SaxParserAdaptor
045: *
046: * @author <a href="mailto:taylor@apache.org">David Sean Taylor</a>
047: * @version $Id: SaxParserAdaptor.java 516448 2007-03-09 16:25:47Z ate $
048: */
049: public class SaxParserAdaptor implements ParserAdaptor {
050: protected final static Log log = LogFactory
051: .getLog(SaxParserAdaptor.class);
052: private String lineSeparator;
053:
054: private Rewriter rewriter;
055:
056: public SaxParserAdaptor() {
057: lineSeparator = System.getProperty("line.separator", "\r\n");
058: }
059:
060: /* (non-Javadoc)
061: * @see org.apache.jetspeed.syndication.services.crawler.rewriter.ParserAdaptor#parse(org.apache.jetspeed.syndication.services.crawler.rewriter.Rewriter, java.io.Reader)
062: */
063: public void parse(Rewriter rewriter, Reader reader)
064: throws RewriterException {
065: try {
066: this .rewriter = rewriter;
067: SAXParser sp = getParser();
068: sp.parse(new InputSource(reader),
069: new SaxFormatHandler(null));
070: } catch (Exception e) {
071: e.printStackTrace();
072: throw new RewriterException(e);
073: }
074:
075: }
076:
077: /* (non-Javadoc)
078: * @see org.apache.jetspeed.syndication.services.crawler.rewriter.ParserAdaptor#rewrite(org.apache.jetspeed.syndication.services.crawler.rewriter.Rewriter, java.io.Reader, java.io.Writer)
079: */
080: public void rewrite(Rewriter rewriter, Reader reader, Writer writer)
081: throws RewriterException {
082: // TODO Auto-generated method stub
083: }
084:
085: /**
086: * Get a Parser from the SAX Parser factory
087: *
088: * @return A SAXParser
089: */
090: protected SAXParser getParser()
091: throws ParserConfigurationException, SAXException {
092: SAXParserFactory spf = SAXParserFactory.newInstance();
093: spf.setValidating(false);
094:
095: return spf.newSAXParser();
096: }
097:
098: /**
099: * Inner class to handle SAX parsing of XML files
100: */
101: public class SaxFormatHandler extends DefaultHandler {
102: private int elementCount = 0;
103: private boolean emit = true;
104: private Writer writer = null;
105:
106: public SaxFormatHandler(Writer writer) {
107: super ();
108: this .writer = writer;
109: }
110:
111: private void write(String text) throws IOException {
112: if (writer != null) {
113: writer.write(text);
114: }
115: }
116:
117: public void characters(char[] values, int start, int length) {
118: if (false == emit)
119: return;
120:
121: if (false == rewriter.enterText(values, start))
122: return;
123:
124: if (writer != null) {
125: try {
126: writer.write(values);
127: } catch (IOException e) {
128: }
129: }
130: }
131:
132: public void startElement(String uri, String localName,
133: String qName, MutableAttributes attributes)
134: throws SAXException {
135: // System.out.println("qName = " + qName);
136: // System.out.println("localName = " + localName);
137: // System.out.println("uri = " + uri);
138: String tag = qName;
139:
140: if (false == rewriter.enterStartTagEvent(tag.toString(),
141: attributes))
142: return;
143:
144: try {
145: appendTagToResult(tag, attributes);
146: write(lineSeparator);
147: String appended = rewriter.exitStartTagEvent(tag
148: .toString(), attributes);
149: if (null != appended) {
150: write(appended);
151: }
152: } catch (Exception e) {
153: log.error("Start tag parsing error", e);
154: }
155: }
156:
157: public void endElement(String uri, String localName,
158: String qName) throws SAXException {
159: String tag = qName;
160: elementCount++;
161: if (false == rewriter.enterEndTagEvent(tag.toString()))
162: return;
163:
164: try {
165: addToResult("</").addToResult(tag).addToResult(">");
166:
167: write(lineSeparator);
168: String appended = rewriter.exitEndTagEvent(tag
169: .toString());
170: if (null != appended) {
171: write(appended);
172: }
173: } catch (Exception e) {
174: log.error("End tag parsing error", e);
175: }
176:
177: }
178:
179: /*
180: * Writes output to the final stream for all attributes of a given tag.
181: *
182: * @param tag The HTML tag being output.
183: * @param attrs The mutable HTML attribute set for the current HTML tag.
184: */
185: private void appendTagToResult(String tag,
186: MutableAttributes attrs) {
187: convertURLS(tag, attrs);
188: addToResult("<").addToResult(tag);
189: for (int ix = 0; ix < attrs.getLength(); ix++) {
190: String value = attrs.getValue(ix);
191: addToResult(" ").addToResult(value).addToResult("=\"")
192: .addToResult(value).addToResult("\"");
193: }
194: addToResult(">");
195: }
196:
197: /*
198: * Used to write tag and attribute objects to the output stream.
199: * Returns a reference to itself so that these calls can be chained.
200: *
201: * @param txt Any text to be written out to stream with toString method.
202: * The object being written should implement its toString method.
203: * @return A handle to the this, the callback, for chaining results.
204: *
205: */
206: private SaxFormatHandler addToResult(Object txt) {
207: // to allow for implementation using Stringbuffer or StringWriter
208: // I don't know yet, which one is better in this case
209: // if (ignoreLevel > 0 ) return this;
210:
211: try {
212: write(txt.toString());
213: } catch (Exception e) {
214: System.err.println("Error parsing:" + e);
215: }
216: return this ;
217: }
218:
219: /*
220: * Determines which HTML Tag/Element is being inspected, and calls the
221: * appropriate converter for that context. This method contains all the
222: * logic for determining how tags are rewritten.
223: *
224: * TODO: it would be better to drive this logic off a state table that is not
225: * tied to the Hot Java parser.
226: *
227: * @param tag TAG from the Callback-Interface.
228: * @param attrs The mutable HTML attribute set for the current HTML element.
229: */
230:
231: private void convertURLS(String tag, MutableAttributes attrs) {
232: rewriter.enterConvertTagEvent(tag.toString(), attrs);
233: }
234:
235: public InputSource resolveEntity(String publicId,
236: String systemId) {
237:
238: try {
239: Map dtds = getDtds();
240: byte[] dtd = (byte[]) dtds.get(systemId);
241: if (dtd == null) {
242: ByteArrayOutputStream baos = new ByteArrayOutputStream();
243: URL url = new URL(systemId);
244: Streams.drain(url.openStream(), baos);
245: dtd = baos.toByteArray();
246: dtds.put(systemId, dtd);
247: }
248:
249: if (dtd != null) {
250: ByteArrayInputStream bais = new ByteArrayInputStream(
251: dtd);
252: InputSource is = new InputSource(bais);
253: is.setPublicId(publicId);
254: is.setSystemId(systemId);
255:
256: return is;
257: }
258: } catch (Throwable t) // java.io.IOException x
259: {
260: t.printStackTrace();
261: log.error("failed to get URL input source", t);
262: }
263:
264: // forces to get dtd over internet
265: return null;
266: }
267:
268: }
269:
270: // DTD Map
271: static private Map dtds = new HashMap();
272:
273: public static Map getDtds() {
274: return dtds;
275: }
276:
277: public static void clearDtdCache() {
278: dtds.clear();
279: }
280:
281: }
|