001: /*
002: * Copyright 2004 Outerthought bvba and Schaubroeck nv
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package org.outerj.daisy.textextraction.impl;
017:
018: import java.io.InputStream;
019: import java.util.zip.ZipEntry;
020: import java.util.zip.ZipInputStream;
021: import java.util.List;
022:
023: import org.xmlpull.mxp1.MXParser;
024: import org.xmlpull.v1.XmlPullParser;
025: import org.outerj.daisy.xmlutil.XmlReader;
026: import org.outerj.daisy.textextraction.TextExtractor;
027: import org.outerj.daisy.plugin.PluginRegistry;
028:
029: /**
030: * Extracts all text from an OpenOffice document.
031: */
032: public class OpenOfficeTextExtractor extends AbstractTextExtractor
033: implements TextExtractor {
034: private static final String TEXTNAMESPACE = "http://openoffice.org/2000/text";
035:
036: public OpenOfficeTextExtractor() {
037: super ();
038: }
039:
040: public OpenOfficeTextExtractor(List<String> mimeTypes,
041: PluginRegistry pluginRegistry) {
042: super (mimeTypes, pluginRegistry);
043: }
044:
045: protected String getName() {
046: return getClass().getName();
047: }
048:
049: public String getText(InputStream is) throws Exception {
050: /*
051: * the byte array we receive here is in fact a ZIP containing the
052: * content.xml, styles.xml,meta.xml and META-INF/manifest.xml files. We
053: * are only interested in the content.xml because that's the file
054: * containing the actual content (duh)
055: */
056:
057: ZipInputStream zis = new ZipInputStream(is);
058:
059: ZipEntry ze;
060: String zipEntryName = null;
061: StringBuilder text = new StringBuilder();
062:
063: while ((ze = zis.getNextEntry()) != null
064: && !(zipEntryName = ze.getName()).equals("content.xml")) {
065: }
066:
067: if (zipEntryName != null && zipEntryName.equals("content.xml")) {
068: /*
069: * we found the correct zip entry. This means the "read pointer" of
070: * the zipinputstream points correctly to the beginning of this zip
071: * entry and we can pass it to the xml parser like this (will
072: * return -1 as soon as the end of the zip entry is reached)
073: */
074:
075: /* We are using this XmlPullParser because it was impossible to work
076: * with a sax parser. The sax parser always wanted to have access to the
077: * openoffice dtd. Even tried to write our own entityresolver to work
078: * around this problem but didnt work out. In order not to pin ourselves
079: * down to a specific sax implementor (where we eg. would be able to
080: * specify that we explicitly don't want any check at all against a dtd)
081: * we choose not to use sax at all and use a very lightweight type of
082: * parsing for this specific goal.
083: */
084:
085: XmlPullParser parser = new MXParser();
086: parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES,
087: true);
088: parser.setInput(new XmlReader(zis));
089: boolean inText = false;
090:
091: int eventType = parser.getEventType();
092: while (eventType != XmlPullParser.END_DOCUMENT) {
093: eventType = parser.next();
094: if (eventType == XmlPullParser.START_TAG) {
095: if (parser.getName().equals("p")
096: && parser.getNamespace().equals(
097: TEXTNAMESPACE)) {
098: text.append(' ');
099: inText = true;
100: }
101: } else if (eventType == XmlPullParser.END_TAG) {
102: if (parser.getName().equals("p")
103: && parser.getNamespace().equals(
104: TEXTNAMESPACE)) {
105: inText = false;
106: }
107: } else if (eventType == XmlPullParser.TEXT) {
108: if (inText) {
109: String gotText = parser.getText();
110: text.append(gotText);
111: }
112: }
113: }
114:
115: } else {
116: throw new Exception(
117: "Invalid OpenOffice document format (content.xml not found)");
118: }
119:
120: return text.toString();
121: }
122: }
|