001: /***************************************************************
002: * This file is part of the [fleXive](R) project.
003: *
004: * Copyright (c) 1999-2008
005: * UCS - unique computing solutions gmbh (http://www.ucs.at)
006: * All rights reserved
007: *
008: * The [fleXive](R) project is free software; you can redistribute
009: * it and/or modify it under the terms of the GNU General Public
010: * License as published by the Free Software Foundation;
011: * either version 2 of the License, or (at your option) any
012: * later version.
013: *
014: * The GNU General Public License can be found at
015: * http://www.gnu.org/copyleft/gpl.html.
016: * A copy is found in the textfile GPL.txt and important notices to the
017: * license from the author are found in LICENSE.txt distributed with
018: * these libraries.
019: *
020: * This library is distributed in the hope that it will be useful,
021: * but WITHOUT ANY WARRANTY; without even the implied warranty of
022: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: * GNU General Public License for more details.
024: *
025: * For further information about UCS - unique computing solutions gmbh,
026: * please see the company website: http://www.ucs.at
027: *
028: * For further information about [fleXive](R), please see the
029: * project website: http://www.flexive.org
030: *
031: *
032: * This copyright notice MUST APPEAR in all copies of the file!
033: ***************************************************************/package com.flexive.extractor.htmlExtractor;
034:
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Hashtable;
import java.util.Locale;
import java.util.StringTokenizer;
041:
042: /**
043: * HTML Text Extractor.
044: * Part of the fleXive 3.X Framework
045: *
046: * @author Gregor Schober (gregor.schober@flexive.com), UCS - unique computing solutions gmbh (http://www.ucs.at)
047: */
048: public class HtmlExtractor {
049:
050: boolean convertSpecialHtmlChars;
051: private StringBuffer result;
052: private StringBuffer tagText;
053: private Exception error = null;
054: private Hashtable meta = new Hashtable(25);
055: private int characterCount;
056: private int wordCount = 0;
057: private final static String META_AUTHOR = "AUTHOR";
058: private final static String META_DESCRIPTION = "DESCRIPTION";
059: private final static String META_KEYWORDS = "KEYWORDS";
060: private final static String META_DATE = "DATE";
061: private final static String META_TITLE = "TITLE";
062: private final static String META_CREATOR = "CREATOR";
063: private final static String META_SUBJECT = "SUBJECT";
064: private final static String META_PUBLISHER = "PUBLISHER";
065: private final static String META_CONTRIBUTER = "CONTRIBUTER";
066: private final static String META_TYPE = "TYPE";
067: private final static String META_LANGUAGE = "LANGUAGE";
068: private final static String META_ROBOTS = "ROBOTS";
069: protected final static String META_CREATED = "CREATED";
070: protected final static String META_LAST_MODIFIED = "LAST_MODIFIED";
071: private final static String META_GENERATOR = "GENERATOR";
072:
073: final static String META_TAGS[] = { META_AUTHOR, META_DESCRIPTION,
074: META_KEYWORDS, META_DATE, META_TITLE, META_CREATOR,
075: META_SUBJECT, META_PUBLISHER, META_CONTRIBUTER, META_TYPE,
076: META_LANGUAGE, META_ROBOTS, META_GENERATOR };
077:
078: /**
079: * Returns the number of words in the EXTRACTED text.
080: *
081: * @return the number of words in the EXTRACTED text.
082: */
083: public int getWordCount() {
084: return wordCount;
085: }
086:
087: /**
088: * The total number of characters in the HTML file.
089: *
090: * @return the total number of characters in the HTML file
091: */
092: public int getCharacterCount() {
093: return characterCount;
094: }
095:
096: /**
097: * Constructor.
098: *
099: * @param convertSpecialHtmlChars if set to true special HTML characters are replaced
100: * to a readable form in text files (eg german umlaute).
101: * @param html the html to parse
102: */
103: public HtmlExtractor(String html, boolean convertSpecialHtmlChars) {
104: this .convertSpecialHtmlChars = convertSpecialHtmlChars;
105: extract(html);
106: }
107:
108: /**
109: * Constructor.
110: *
111: * @param convertSpecialHtmlChars if set to true special HTML characters are replaced
112: * to a readable form in text files (eg german umlaute).
113: * @param in the html to parse
114: */
115: public HtmlExtractor(InputStream in, boolean convertSpecialHtmlChars) {
116: this .convertSpecialHtmlChars = convertSpecialHtmlChars;
117: StringBuffer buffer = null;
118: try {
119: buffer = new StringBuffer(in.available());
120: int achar;
121: while ((achar = in.read()) != -1) {
122: buffer.append((char) achar);
123: }
124: } catch (Exception exc) {
125: //
126: }
127: extract(buffer == null ? "" : buffer.toString());
128: }
129:
130: /**
131: * Extracts the text informations from the html file.
132: *
133: * @param html the HTML
134: */
135: private void extract(final String html) {
136: // Store character count
137: this .characterCount = html.length();
138: // Get TEXT
139: this .result = new StringBuffer(html.length() / 5);
140: this .tagText = new StringBuffer(1024);
141: try {
142: ByteArrayInputStream byis = new ByteArrayInputStream(html
143: .getBytes("UTF-8"));
144: new HtmlExtractorParser(byis, "UTF-8").extract(this );
145: } catch (Exception exc) {
146: exc.printStackTrace();
147: error = exc;
148: }
149: // Store word count
150: StringTokenizer st = new StringTokenizer(result.toString(),
151: " ", false);
152: while (st.hasMoreTokens()) {
153: this .wordCount++;
154: st.nextToken();
155: }
156: }
157:
158: protected void setTitle(Token tk) {
159: String title = tk.image;
160: title = title.substring("<title>".length(), title.length()
161: - "</title>".length());
162: //noinspection unchecked
163: meta.put(META_TITLE, title.trim());
164: }
165:
166: /**
167: * Returns the extracted text.
168: *
169: * @return the extracted text
170: */
171: public String getText() {
172: return this .result.toString();
173: }
174:
175: /**
176: * Returns the text extracted from tag attributes like 'title' and 'alt'.
177: *
178: * @return the text extracted from tag attributes like 'title' and 'alt'.
179: */
180: public String getTagText() {
181: return tagText.toString();
182: }
183:
184: public String getAuthor() {
185: return (String) meta.get(META_AUTHOR);
186: }
187:
188: public String getGenerator() {
189: return (String) meta.get(META_GENERATOR);
190: }
191:
192: public Date getCreated() {
193: return metaToDate((String) meta.get(META_CREATED));
194: }
195:
196: public String getKeywords() {
197: return (String) meta.get(META_KEYWORDS);
198: }
199:
200: public String getTitle() {
201: return (String) meta.get(META_TITLE);
202: }
203:
204: private Date metaToDate(String value) {
205: if (value == null) {
206: return null;
207: }
208: // eg "6 Feb 1999 09:31:30+01:00 "
209: SimpleDateFormat sdf = new SimpleDateFormat(
210: "d MMM yyyy HH:mm:ss");
211: try {
212: return sdf.parse(value.trim());
213: } catch (Exception exc) {
214: /* ignore */
215: return null;
216: }
217: }
218:
219: protected void addMeta(String key, String value) {
220: //noinspection unchecked
221: meta.put(key, value);
222: }
223:
224: protected void append(Token text) {
225: append(text.image);
226: }
227:
228: protected void append(String text) {
229: this .result.append(getValue(text));
230: }
231:
232: protected void appendTagText(String text) {
233: if (tagText.length() > 0) {
234: tagText.append(" ");
235: }
236: tagText.append(getValue(text).trim());
237: }
238:
239: private String getValue(String value) {
240: if (value.length() == 0) {
241: return "";
242: }
243: if (convertSpecialHtmlChars) {
244: value = value.replaceAll("’", "'");
245: value = value.replaceAll(" ", " ");
246: value = value.replaceAll("&", "&");
247: value = value.replaceAll(""", "\"");
248: // kl Umlaute
249: value = value.replaceAll("ä", "\u00E4");
250: value = value.replaceAll("ü", "\u00FC");
251: value = value.replaceAll("ö", "\u00F6");
252: // gr Umlaute
253: value = value.replaceAll("Ä", "\u00C4");
254: value = value.replaceAll("Ü", "\u00DC");
255: value = value.replaceAll("Ö", "\u00D6");
256: // Scharfes s
257: value = value.replaceAll("ß", "\u00DF");
258: // greater / less than
259: value = value.replaceAll(">", ">");
260: value = value.replaceAll("<", "<");
261: // Euro
262: value = value.replaceAll("€", "?");
263: value = value.replaceAll("€", "?");
264: value = value.replaceAll("€", "?");
265: }
266: return value;
267: }
268:
269: /**
270: * Returns true if a error occured during the parseing - in this case only the
271: * text extracted up to the error is returned.
272: *
273: * @return true if a error occured during the parseing
274: */
275: public boolean hadError() {
276: return this .error != null;
277: }
278:
279: /**
280: * Returns null if the parser was successfully, or the parser error.
281: *
282: * @return null if the parser was successfully, or the parser error
283: */
284: public Exception getError() {
285: return error;
286: }
287:
288: }
|