01: /**********************************************************************************
02: * $URL: https://source.sakaiproject.org/svn/search/tags/sakai_2-4-1/search-impl/impl/src/java/org/sakaiproject/search/component/adapter/contenthosting/PDFContentDigester.java $
03: * $Id: PDFContentDigester.java 21387 2007-02-11 19:37:04Z ian@caret.cam.ac.uk $
04: ***********************************************************************************
05: *
06: * Copyright (c) 2003, 2004, 2005, 2006 The Sakai Foundation.
07: *
08: * Licensed under the Educational Community License, Version 1.0 (the "License");
09: * you may not use this file except in compliance with the License.
10: * You may obtain a copy of the License at
11: *
12: * http://www.opensource.org/licenses/ecl1.php
13: *
14: * Unless required by applicable law or agreed to in writing, software
15: * distributed under the License is distributed on an "AS IS" BASIS,
16: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17: * See the License for the specific language governing permissions and
18: * limitations under the License.
19: *
20: **********************************************************************************/package org.sakaiproject.search.component.adapter.contenthosting;
21:
22: import java.io.IOException;
23: import java.io.InputStream;
24: import java.io.Reader;
25: import java.io.StringReader;
26:
27: import org.apache.commons.logging.Log;
28: import org.apache.commons.logging.LogFactory;
29: import org.pdfbox.pdmodel.PDDocument;
30: import org.pdfbox.util.PDFTextStripper;
31: import org.sakaiproject.content.api.ContentResource;
32: import org.sakaiproject.search.api.SearchUtils;
33:
34: /**
35: * @author ieb
36: */
37: public class PDFContentDigester extends BaseContentDigester {
38: private static Log log = LogFactory
39: .getLog(PDFContentDigester.class);
40:
41: public String getContent(ContentResource contentResource) {
42: if (contentResource != null
43: && contentResource.getContentLength() > maxDigestSize) {
44: throw new RuntimeException(
45: "Attempt to get too much content as a string on "
46: + contentResource.getReference());
47: }
48:
49: InputStream contentStream = null;
50: PDDocument pddoc = null;
51: try {
52: contentStream = contentResource.streamContent();
53: PDFTextStripper stripper = new PDFTextStripper();
54: pddoc = PDDocument.load(contentStream);
55: String text = stripper.getText(pddoc);
56: pddoc.close();
57: return SearchUtils.appendCleanString(text, null).toString();
58: } catch (Exception ex) {
59: throw new RuntimeException(
60: "Failed to get content for indexing", ex);
61: } finally {
62: if (contentStream != null) {
63: try {
64: contentStream.close();
65: } catch (IOException e) {
66: }
67: }
68: try {
69: pddoc.close();
70: } catch (Exception ex) {
71:
72: }
73: }
74: }
75:
76: public Reader getContentReader(ContentResource contentResource) {
77: return new StringReader(getContent(contentResource));
78: }
79:
80: }
|