001: /*
002: * Copyright 2008 Hippo Webworks.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package nl.hippo.slide.extractor;
017:
018: import java.io.InputStream;
019: import java.util.Collections;
020: import java.util.Enumeration;
021: import java.util.HashMap;
022: import java.util.Iterator;
023: import java.util.List;
024: import java.util.Map;
025: import java.text.SimpleDateFormat;
026: import org.apache.poi.hpsf.NoPropertySetStreamException;
027: import org.apache.poi.hpsf.Property;
028: import org.apache.poi.hpsf.PropertySet;
029: import org.apache.poi.hpsf.PropertySetFactory;
030: import org.apache.poi.hpsf.Section;
031: import org.apache.poi.poifs.eventfilesystem.POIFSReader;
032: import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
033: import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
034: import org.apache.slide.extractor.*;
035: import org.apache.slide.common.PropertyName;
036: import org.apache.slide.util.conf.Configurable;
037: import org.apache.slide.util.conf.Configuration;
038: import org.apache.slide.util.conf.ConfigurationException;
039:
040: /**
041: * Property extractor for Microsoft office documents.
042: *
043: * <p>This property extractor extracts properties from <code>SummaryInformation</code> and
044: * <code>DocumentSummaryInformation</code> headers of office documents.
045: *
046: * <p>Sample configuration:
047: * <pre>
* &lt;extractor classname="nl.hippo.slide.extractor.OfficeExtractor" uri="/files/docs/"&gt;
*   &lt;configuration&gt;
*     &lt;instruction property="author" namespace="http://mycomp.com/namespaces/webdav" summary-information="4" /&gt;
*     &lt;instruction property="application" namespace="http://mycomp.com/namespaces/webdav" summary-information="18" /&gt;
*     &lt;instruction property="title" namespace="http://mycomp.com/namespaces/webdav" summary-information="2" /&gt;
*     &lt;instruction property="category" namespace="http://mycomp.com/namespaces/webdav" document-summary-information="2" /&gt;
*     &lt;instruction property="docid" namespace="http://mycomp.com/namespaces/webdav" label="Document-ID" /&gt;
*   &lt;/configuration&gt;
* &lt;/extractor&gt;
* </pre>
* The sample configuration
* <ul>
* <li>maps the <em>author</em> info of office documents to the <code>author</code>
* property. The author info can be found in the <code>SummaryInformation</code> header and
* has the <code>id</code> 4.
* <li>and maps the <em>category</em> entry of the <code>DocumentSummaryInformation</code> header,
* which has the <code>id</code> 2, to the WebDAV property <code>category</code>.
* <li><code>SummaryInformation</code> headers can also contain "labeled" entries, e.g. for user
* defined metadata. In the sample the labeled entries with the label <code>Document-ID</code>
* will be mapped to the WebDAV property <code>docid</code>.
* </ul>
* <p>An instruction that uses <code>summary-information</code> or
* <code>document-summary-information</code> may additionally carry a <code>date-format</code>
* attribute holding a <code>java.text.SimpleDateFormat</code> pattern; date values of that
* property are then stored as strings formatted with that pattern.
*
* <p>All WebDAV properties in the sample will have the namespace
* <code>http://mycomp.com/namespaces/webdav</code>.
071: *
072: * <p>The IDs in the <code>DocumentSummaryInformation</code> and <code>SummaryInformation</code>
073: * headers are somewhat mystical. Samples for <code>SummaryInformation</code> are:
074: * <pre>
075: * 1: codepage
076: * 2: title
* 3: subject
078: * 4: author
079: * 5: keywords
080: * 6: comments
* 7: template (e.g. "Normal.dot")
082: * 8: last author
083: * 9: revision number
084: * 11: last printing date
085: * 12: creation date
086: * 13: last saved date
087: * 14: number of pages
088: * 15: number of words
089: * 16: number of characters
090: * 18: application name (e.g. "Microsoft Word 9.0")
* 19: security
092: * </pre>
093: * Samples for <code>DocumentSummaryInformation</code> are:
094: * <pre>
095: * 1: codepage
096: * 2: category
097: * 5: number of lines
098: * 6: number of paragraphs
099: * 14: manager
100: * 15: company
101: * </pre>
102: */
103: public class OfficeExtractor extends AbstractPropertyExtractor
104: implements Configurable {
105: // maps SummaryInformation IDs to PropertyNames
106: protected Map propertyMapSI = new HashMap();
107: // maps DocumentSummaryInformation IDs to PropertyNames
108: protected Map propertyMapDSI = new HashMap();
109: // maps labled properties to PropertyNames
110: protected Map propertyMapLbl = new HashMap();
111: // maps labled properties to output formats
112: protected Map propertyMapOutFrmt = new HashMap();
113:
114: static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = "application/msword,application/vnd.ms-word,"
115: + // WORD
116: "application/mspowerpoint,application/vnd.ms-powerpoint,"
117: + "application/msexcel,application/vnd.ms-excel";
118:
119: public OfficeExtractor(String uri, String contentType,
120: String namespace) {
121: super (uri, contentType, namespace);
122: }
123:
124: public Map extract(InputStream content) throws ExtractorException {
125: OfficePropertiesListener listener = new OfficePropertiesListener();
126: try {
127: POIFSReader r = new POIFSReader();
128: r.registerListener(listener);
129: r.read(content);
130: } catch (Exception e) {
131: throw new ExtractorException(
132: "Exception while extracting properties in OfficeExtractor: "
133: + e);
134: }
135: return listener.getProperties();
136: }
137:
138: class OfficePropertiesListener implements POIFSReaderListener {
139:
140: private HashMap extractedProperties = new HashMap();
141:
142: public Map getProperties() {
143: return extractedProperties;
144: }
145:
146: public void processPOIFSReaderEvent(POIFSReaderEvent event) {
147: PropertySet ps = null;
148: try {
149: ps = PropertySetFactory.create(event.getStream());
150: } catch (NoPropertySetStreamException ex) {
151: return;
152: } catch (Exception ex) {
153: throw new RuntimeException("Property set stream \""
154: + event.getPath() + event.getName() + "\": "
155: + ex);
156: }
157:
158: // we are only interested in the global properties
159: if (event.getPath().length() > 0) // length 0 is 'root'
160: return;
161:
162: //System.out.println("Extracting properties!");
163:
164: Map idMap = null;
165:
166: if (ps.isDocumentSummaryInformation()) {
167: idMap = propertyMapDSI;
168: } else if (ps.isSummaryInformation()) {
169: idMap = propertyMapSI;
170: } else {
171: // can this happen?
172: idMap = Collections.EMPTY_MAP;
173: }
174:
175: List sections = ps.getSections();
176:
177: for (Iterator i = sections.iterator(); i.hasNext();) {
178: Section sec = (Section) i.next();
179: //System.out.println("section: " + sec);
180:
181: if (sec.getProperty(0) == null) {
182: for (Iterator j = idMap.entrySet().iterator(); j
183: .hasNext();) {
184: Map.Entry e = (Map.Entry) j.next();
185:
186: Object propertyValue = sec
187: .getProperty(((Integer) e.getKey())
188: .intValue());
189:
190: //if(propertyValue!=null)
191: //System.out.println("Class = "+propertyValue.getClass());
192:
193: //System.out.println("PropertyValue is: "+propertyValue);
194:
195: try {
196: Object outFormatObj = propertyMapOutFrmt
197: .get(e.getValue());
198: //System.out.println("DateFormat Obj is: "+outFormatObj);
199: if (outFormatObj != null
200: && propertyValue instanceof java.util.Date) {
201: SimpleDateFormat outFormat = (SimpleDateFormat) outFormatObj;
202:
203: String propval = outFormat
204: .format((java.util.Date) propertyValue);
205: propertyValue = propval;
206: }
207: } catch (Throwable ignored) { /*System.out.println("Problem: "); ignored.printStackTrace(); */
208: }
209:
210: //System.out.println("PropertyValue is now: "+propertyValue);
211:
212: if (propertyValue != null) {
213: //System.out.println("\t" + e.getValue() + "=" + propertyValue);
214: extractedProperties.put(e.getValue(),
215: propertyValue);
216: }
217: }
218: } else {
219: Map dict = (Map) sec.getProperty(0);
220: // this section has a dictionary
221: Property property[] = sec.getProperties();
222: for (int j = 0; j < property.length; j++) {
223: //String label = sec.getPIDString(property[j].getID()); TODO why doesn't this work
224: String label = (String) dict.get(new Long(
225: property[j].getID()));
226: PropertyName slideProperty = (PropertyName) propertyMapLbl
227: .get(label);
228: if (slideProperty != null) {
229: //System.out.println("\t" + slideProperty + "=" + property[j].getValue());
230: extractedProperties.put(slideProperty,
231: property[j].getValue());
232: }
233: }
234: }
235: }
236: }
237: }
238:
239: public void configure(Configuration configuration)
240: throws ConfigurationException {
241: Enumeration instructions = configuration
242: .getConfigurations("instruction");
243:
244: //System.out.println("OfficeExtractor configuring!");
245:
246: while (instructions.hasMoreElements()) {
247: Configuration instruction = (Configuration) instructions
248: .nextElement();
249: PropertyName propertyName = new PropertyName(instruction
250: .getAttribute("property"), instruction
251: .getAttribute("namespace", "DAV:"));
252:
253: //System.out.println("Configuring property "+propertyName);
254:
255: try {
256:
257: String id = instruction.getAttribute(
258: "summary-information", null);
259:
260: String format = instruction.getAttribute("date-format",
261: null);
262: //System.out.println("got format string: "+format);
263: if (format != null) {
264: this .propertyMapOutFrmt.put(propertyName,
265: new SimpleDateFormat(format));
266: }
267:
268: if (id != null) {
269: this .propertyMapSI.put(Integer.valueOf(id),
270: propertyName);
271: continue;
272: }
273:
274: id = instruction.getAttribute(
275: "document-summary-information", null);
276: if (id != null) {
277: this .propertyMapDSI.put(Integer.valueOf(id),
278: propertyName);
279: continue;
280: }
281:
282: id = instruction.getAttribute("label", null);
283: if (id != null) {
284: this .propertyMapLbl.put(id, propertyName);
285: continue;
286: }
287:
288: // for backward compatibility
289: // old style id atributes like SummaryInformation-0-4
290: id = instruction.getAttribute("id", null);
291: if (id != null) {
292: Integer intId = Integer.valueOf(id.substring(id
293: .lastIndexOf('-') + 1));
294: if (id.startsWith("SummaryInformation")) {
295: this .propertyMapSI.put(intId, propertyName);
296: }
297: if (id.startsWith("DocumentSummaryInformation")) {
298: this .propertyMapDSI.put(intId, propertyName);
299: }
300: }
301: } catch (NumberFormatException e) {
302: throw new ConfigurationException(
303: "Invalid instruction: " + e, instruction);
304: }
305: }
306: }
307:
308: /* (non-Javadoc)
309: * @see org.apache.slide.extractor.Extractor#getContentType()
310: */
311: public String getContentType() {
312: if (super.getContentType() == null) {
313: return CONTENT_TYPE_MS_OFFICE_ALL_CSV;
314: }
315: return super.getContentType();
316: }
317:
318: }
|