01: /***************************************************************
02: * This file is part of the [fleXive](R) project.
03: *
04: * Copyright (c) 1999-2008
05: * UCS - unique computing solutions gmbh (http://www.ucs.at)
06: * All rights reserved
07: *
08: * The [fleXive](R) project is free software; you can redistribute
09: * it and/or modify it under the terms of the GNU General Public
10: * License as published by the Free Software Foundation;
11: * either version 2 of the License, or (at your option) any
12: * later version.
13: *
14: * The GNU General Public License can be found at
15: * http://www.gnu.org/copyleft/gpl.html.
16: * A copy is found in the textfile GPL.txt and important notices to the
17: * license from the author are found in LICENSE.txt distributed with
18: * these libraries.
19: *
20: * This library is distributed in the hope that it will be useful,
21: * but WITHOUT ANY WARRANTY; without even the implied warranty of
22: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23: * GNU General Public License for more details.
24: *
25: * For further information about UCS - unique computing solutions gmbh,
26: * please see the company website: http://www.ucs.at
27: *
28: * For further information about [fleXive](R), please see the
29: * project website: http://www.flexive.org
30: *
31: *
32: * This copyright notice MUST APPEAR in all copies of the file!
33: ***************************************************************/package com.flexive.extractor;
34:
35: import java.io.InputStream;
36:
37: /**
38: * This class allows meta data and text extraction from a HTML stream (file).
39: *
40: * @author Gregor Schober (gregor.schober@flexive.com), UCS - unique computing solutions gmbh (http://www.ucs.at)
41: */
42: public class HtmlExtractor {
43:
44: /**
45: * Extracts the text informations from the html stream.
46: *
47: * @param in the input stream to read from
48: * @return the extraxted informations, or null if no text extraction was possible
49: */
50: public static ExtractedData extract(final InputStream in) {
51: com.flexive.extractor.htmlExtractor.HtmlExtractor result = new com.flexive.extractor.htmlExtractor.HtmlExtractor(
52: in, true);
53: FxSummaryInformation si = new FxSummaryInformation(result
54: .getAuthor()/*author*/,
55: result.getGenerator()/*appName*/, result
56: .getCharacterCount(), ""/*comments*/, result
57: .getCreated()/*createdAt*/, result
58: .getCreated()/*editTime*/, result
59: .getKeywords(),
60: result.getAuthor()/*lastModifiedBy*/,
61: null/*lastPrintedAt*/, result.getTitle(), result
62: .getCreated()/*lastModifiedAt*/,
63: 1/*pageCount*/, null/*revNumber*/, result
64: .getWordCount(), false/*encrypted*/, result
65: .getTagText());
66: return new ExtractedData(si, result.getText());
67: }
68:
69: /**
70: * Extracts the text informations from the html stream.
71: *
72: * @param html the HTML data
73: * @return the extraxted informations, or null if no text extraction was possible
74: */
75: public static ExtractedData extract(final String html) {
76: com.flexive.extractor.htmlExtractor.HtmlExtractor result = new com.flexive.extractor.htmlExtractor.HtmlExtractor(
77: html, true);
78: FxSummaryInformation si = new FxSummaryInformation(result
79: .getAuthor()/*author*/,
80: result.getGenerator()/*appName*/, result
81: .getCharacterCount(), ""/*comments*/, result
82: .getCreated()/*createdAt*/, result
83: .getCreated()/*editTime*/, result
84: .getKeywords(),
85: result.getAuthor()/*lastModifiedBy*/,
86: null/*lastPrintedAt*/, result.getTitle(), result
87: .getCreated()/*lastModifiedAt*/,
88: 1/*pageCount*/, null/*revNumber*/, result
89: .getWordCount(), false/*encrypted*/, result
90: .getTagText());
91: return new ExtractedData(si, result.getText());
92: }
93: }
|