01: /*
02: * regain - A file search engine providing plenty of formats
03: * Copyright (C) 2004 Til Schneider
04: *
05: * This library is free software; you can redistribute it and/or
06: * modify it under the terms of the GNU Lesser General Public
07: * License as published by the Free Software Foundation; either
08: * version 2.1 of the License, or (at your option) any later version.
09: *
10: * This library is distributed in the hope that it will be useful,
11: * but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: * Lesser General Public License for more details.
14: *
15: * You should have received a copy of the GNU Lesser General Public
16: * License along with this library; if not, write to the Free Software
17: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18: *
19: * Contact: Til Schneider, info@murfman.de
20: *
21: * CVS information:
22: * $RCSfile$
23: * $Source$
24: * $Date: 2005-11-21 11:20:09 +0100 (Mo, 21 Nov 2005) $
25: * $Author: til132 $
26: * $Revision: 180 $
27: */
28: package net.sf.regain.crawler.preparator;
29:
30: import java.io.InputStream;
31: import java.io.InputStreamReader;
32: import java.io.StringWriter;
33:
34: import net.sf.regain.RegainException;
35: import net.sf.regain.RegainToolkit;
36: import net.sf.regain.crawler.document.AbstractPreparator;
37: import net.sf.regain.crawler.document.RawDocument;
38: import net.sf.regain.crawler.preparator.rtf.RtfFilterReader;
39:
40: /**
41: * Präpariert ein RTF-Dokument für die Indizierung. Dazu wird sämtliche
42: * Formatierungsinformation einfach ignoriert.
43: * <p>
44: * Dabei werden die Rohdaten des Dokuments von Formatierungsinformation befreit.
45: *
46: * @author Til Schneider, www.murfman.de
47: */
48: public class SimpleRtfPreparator extends AbstractPreparator {
49:
50: /**
51: * Creates a new instance of SimpleRtfPreparator.
52: *
53: * @throws RegainException If creating the preparator failed.
54: */
55: public SimpleRtfPreparator() throws RegainException {
56: super ("rtf");
57: }
58:
59: /**
60: * Präpariert ein Dokument für die Indizierung.
61: *
62: * @param rawDocument Das zu präpariernde Dokument.
63: *
64: * @throws RegainException Wenn die Präparation fehl schlug.
65: */
66: public void prepare(RawDocument rawDocument) throws RegainException {
67: InputStream stream = null;
68: try {
69: stream = rawDocument.getContentAsStream();
70: RtfFilterReader reader = new RtfFilterReader(
71: new InputStreamReader(stream));
72: StringWriter writer = new StringWriter();
73:
74: RegainToolkit.pipe(reader, writer);
75:
76: stream.close();
77: reader.close();
78: writer.close();
79:
80: String cleanedContent = writer.toString();
81: setCleanedContent(cleanedContent);
82: } catch (Exception exc) {
83: throw new RegainException("Reading RTF dokument failed: "
84: + rawDocument.getUrl(), exc);
85: } finally {
86: if (stream != null) {
87: try {
88: stream.close();
89: } catch (Exception exc) {
90: }
91: }
92: }
93: }
94:
95: }
|