001: /*
002: * Copyright 2004 Outerthought bvba and Schaubroeck nv
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package org.outerj.daisy.textextraction.impl;
017:
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.*;
import org.outerj.daisy.xmlutil.LocalSAXParserFactory;
import org.outerj.daisy.textextraction.TextExtractor;
import org.outerj.daisy.plugin.PluginRegistry;

import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
030:
031: /**
032: * Extracts all text between tags in an XML document. Only works (of course)
033: * for well formed XML documents.
034: *
035: */
036: public class XmlTextExtractor extends AbstractTextExtractor implements
037: TextExtractor {
038:
039: public XmlTextExtractor() {
040: super ();
041: }
042:
043: public XmlTextExtractor(List<String> mimeTypes,
044: PluginRegistry pluginRegistry) {
045: super (mimeTypes, pluginRegistry);
046: }
047:
048: protected String getName() {
049: return getClass().getName();
050: }
051:
052: public String getText(InputStream is) throws Exception {
053: SAXParserFactory factory = LocalSAXParserFactory
054: .getSAXParserFactory();
055: SAXParser parser = factory.newSAXParser();
056:
057: // Try to disable loading of external things as much as possible
058: // (e.g. external DTD, can be slow or impossible to load)
059: XMLReader xmlReader = parser.getXMLReader();
060: safeSetFeature(xmlReader,
061: "http://xml.org/sax/features/validation", false);
062: safeSetFeature(
063: xmlReader,
064: "http://xml.org/sax/features/external-general-entities",
065: false);
066: safeSetFeature(
067: xmlReader,
068: "http://xml.org/sax/features/external-parameter-entities",
069: false);
070: safeSetFeature(
071: xmlReader,
072: "http://apache.org/xml/features/nonvalidating/load-external-dtd",
073: false);
074:
075: MyHandler handler = new MyHandler();
076: parser.parse(is, handler);
077: return handler.getText();
078: }
079:
080: private void safeSetFeature(XMLReader reader, String feature,
081: boolean value) {
082: try {
083: reader.setFeature(feature, value);
084: } catch (SAXNotRecognizedException e) {
085: // ignore
086: } catch (SAXNotSupportedException e) {
087: // ignore
088: }
089: }
090:
091: private static class MyHandler extends DefaultHandler {
092: private StringBuilder text = new StringBuilder();
093: private int nestingLevel = -1;
094: private int ignoreContentNestingLevel = -1;
095:
096: private static Set<String> SPECIAL_PRE_CLASSES;
097: static {
098: SPECIAL_PRE_CLASSES = new HashSet<String>();
099: SPECIAL_PRE_CLASSES.add("query");
100: SPECIAL_PRE_CLASSES.add("include");
101: SPECIAL_PRE_CLASSES.add("query-and-include");
102: }
103:
104: private static Set<String> SPECIAL_SPAN_CLASSES;
105: static {
106: SPECIAL_SPAN_CLASSES = new HashSet<String>();
107: SPECIAL_SPAN_CLASSES.add("indexentry");
108: SPECIAL_SPAN_CLASSES.add("crossreference");
109: }
110:
111: public void startElement(String uri, String localName,
112: String qName, Attributes attributes)
113: throws SAXException {
114: // Special handling to ignore the content of includes instructions etc.
115: nestingLevel++;
116: if (ignoreContentNestingLevel != -1) {
117: // we're already ignoring things
118: } else if (uri.equals("")
119: && localName.equals("pre")
120: && SPECIAL_PRE_CLASSES.contains(attributes
121: .getValue("class"))) {
122: ignoreContentNestingLevel = nestingLevel;
123: } else if (uri.equals("")
124: && localName.equals("span")
125: && SPECIAL_SPAN_CLASSES.contains(attributes
126: .getValue("class"))) {
127: ignoreContentNestingLevel = nestingLevel;
128: }
129: }
130:
131: public void endElement(String uri, String localName,
132: String qName) throws SAXException {
133: if (ignoreContentNestingLevel != -1
134: && ignoreContentNestingLevel == nestingLevel) {
135: ignoreContentNestingLevel = -1;
136: }
137: nestingLevel--;
138: }
139:
140: public void characters(char ch[], int start, int length)
141: throws SAXException {
142: if (ignoreContentNestingLevel == -1)
143: text.append(ch, start, length);
144: }
145:
146: public String getText() {
147: return text.toString();
148: }
149: }
150: }
|