01: /**********************************************************************************
02: * $URL: https://source.sakaiproject.org/svn/search/tags/sakai_2-4-1/search-impl/impl/src/java/org/sakaiproject/search/component/adapter/contenthosting/WordContentDigester.java $
03: * $Id: WordContentDigester.java 21946 2007-02-27 11:41:58Z ian@caret.cam.ac.uk $
04: ***********************************************************************************
05: *
06: * Copyright (c) 2003, 2004, 2005, 2006 The Sakai Foundation.
07: *
08: * Licensed under the Educational Community License, Version 1.0 (the "License");
09: * you may not use this file except in compliance with the License.
10: * You may obtain a copy of the License at
11: *
12: * http://www.opensource.org/licenses/ecl1.php
13: *
14: * Unless required by applicable law or agreed to in writing, software
15: * distributed under the License is distributed on an "AS IS" BASIS,
16: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17: * See the License for the specific language governing permissions and
18: * limitations under the License.
19: *
20: **********************************************************************************/package org.sakaiproject.search.component.adapter.contenthosting;
21:
22: import java.io.IOException;
23: import java.io.InputStream;
24: import java.io.Reader;
25: import java.io.StringReader;
26:
27: import org.apache.commons.logging.Log;
28: import org.apache.commons.logging.LogFactory;
29: import org.apache.poi.hwpf.HWPFDocument;
30: import org.apache.poi.hwpf.extractor.WordExtractor;
31: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
32: import org.sakaiproject.content.api.ContentResource;
33: import org.sakaiproject.search.api.SearchUtils;
34:
35: /**
36: * @author ieb
37: */
38: public class WordContentDigester extends BaseContentDigester {
39:
40: private static Log log = LogFactory
41: .getLog(WordContentDigester.class);
42:
43: static {
44: System.setProperty("org.apache.poi.util.POILogger",
45: "org.apache.poi.util.NullLogger");
46: }
47:
48: /*
49: * (non-Javadoc)
50: *
51: * @see org.sakaiproject.search.component.adapter.contenthosting.BaseContentDigester#getContent(org.sakaiproject.content.api.ContentResource)
52: */
53:
54: public String getContent(ContentResource contentResource) {
55: if (contentResource != null
56: && contentResource.getContentLength() > maxDigestSize) {
57: throw new RuntimeException(
58: "Attempt to get too much content as a string on "
59: + contentResource.getReference());
60: }
61: InputStream contentStream = null;
62: try {
63: contentStream = contentResource.streamContent();
64: POIFSFileSystem poifs = new POIFSFileSystem(contentStream);
65: HWPFDocument hwpf = new HWPFDocument(poifs);
66: WordExtractor wordExtractor = new WordExtractor(hwpf);
67:
68: String paragraphs = wordExtractor.getTextFromPieces();
69:
70: StringBuilder sb = new StringBuilder();
71: SearchUtils.appendCleanString(paragraphs, sb);
72: return sb.toString();
73: } catch (Exception e) {
74: throw new RuntimeException(
75: "Failed to read content for indexing ", e);
76: } finally {
77: if (contentStream != null) {
78: try {
79: contentStream.close();
80: } catch (IOException e) {
81: }
82: }
83: }
84: }
85:
86: /*
87: * (non-Javadoc)
88: *
89: * @see org.sakaiproject.search.component.adapter.contenthosting.BaseContentDigester#getContentReader(org.sakaiproject.content.api.ContentResource)
90: */
91:
92: public Reader getContentReader(ContentResource contentResource) {
93: return new StringReader(getContent(contentResource));
94: }
95:
96: }
|