001: /***************************************************************
002: * This file is part of the [fleXive](R) project.
003: *
004: * Copyright (c) 1999-2008
005: * UCS - unique computing solutions gmbh (http://www.ucs.at)
006: * All rights reserved
007: *
008: * The [fleXive](R) project is free software; you can redistribute
009: * it and/or modify it under the terms of the GNU General Public
010: * License as published by the Free Software Foundation;
011: * either version 2 of the License, or (at your option) any
012: * later version.
013: *
014: * The GNU General Public License can be found at
015: * http://www.gnu.org/copyleft/gpl.html.
016: * A copy is found in the textfile GPL.txt and important notices to the
017: * license from the author are found in LICENSE.txt distributed with
018: * these libraries.
019: *
020: * This library is distributed in the hope that it will be useful,
021: * but WITHOUT ANY WARRANTY; without even the implied warranty of
022: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: * GNU General Public License for more details.
024: *
025: * For further information about UCS - unique computing solutions gmbh,
026: * please see the company website: http://www.ucs.at
027: *
028: * For further information about [fleXive](R), please see the
029: * project website: http://www.flexive.org
030: *
031: *
032: * This copyright notice MUST APPEAR in all copies of the file!
033: ***************************************************************/package com.flexive.extractor;
034:
035: import java.io.Serializable;
036: import java.util.Enumeration;
037: import java.util.Hashtable;
038:
039: public class ExtractedData implements Serializable {
040: private FxSummaryInformation si;
041: private String text;
042: private String compressed = null;
043:
044: protected ExtractedData(FxSummaryInformation si, String text) {
045: this .si = si;
046: if (text == null) {
047: this .text = "";
048: return;
049: }
050: this .text = text.trim().replaceAll("[\\x00-\\x09\\x0B-\\x1F]",
051: "");
052: }
053:
054: /**
055: * Returns the meta information extracted from the document.
056: *
057: * @return the meta information extracted from the document
058: */
059: public FxSummaryInformation getSummaryInformation() {
060: return si;
061: }
062:
063: /**
064: * Returns the text extracted from the document.
065: *
066: * @return the text extracted from the document
067: */
068: public String getText() {
069: return text;
070: }
071:
072: /**
073: * Returns a compressed form of the extracted text that only contains words with at least
074: * 4 characters, and contains every distinct uppercase word only one time.
075: * Additional text (eg text from html tag attributes like 'title' and 'alt') stored in the
076: * FxSummaryInformation will be included.
077: *
078: * @return a compressed form of the extracted text
079: */
080: public String getCompressedText() {
081: if (compressed == null) {
082: Hashtable<String, Boolean> words = new Hashtable<String, Boolean>(
083: 5000);
084: StringBuffer concateWords = new StringBuffer(30000);
085: // Split by whitespaces and other chars, also remove dups.
086: // Only keep words that have more than 3 characters
087: String txt = this .text;
088: if (si != null)
089: txt += (si.getTitle() == null ? "" : " "
090: + si.getTitle())
091: + (si.getKeywords() == null ? "" : " "
092: + si.getKeywords())
093: + (si.getAuthor() == null ? "" : " "
094: + si.getAuthor())
095: + (si.getComments() == null ? "" : " "
096: + si.getComments())
097: + (si.getRevNumber() == null ? "" : " "
098: + si.getRevNumber())
099: + (si.getAdditionalText() == null ? " " : si
100: .getAdditionalText());
101:
102: txt = txt.replace("?", " ").replace(".", " ").replace("!",
103: " ");
104: String[] sw = txt.split("[\\s,;:=\\(\\)\"-']");
105: for (String word : sw) {
106: if (word.length() < 4)
107: continue;
108: words.put(word.toUpperCase(), Boolean.TRUE);
109: }
110: for (Enumeration e = words.keys(); e.hasMoreElements();) {
111: concateWords.append(" ").append(e.nextElement());
112: }
113: compressed = concateWords.toString();
114: }
115: return compressed;
116: }
117:
118: public String toXML() {
119: StringBuilder sb = new StringBuilder(1000);
120: sb.append("<extract>");
121: if (getSummaryInformation() != null)
122: sb.append(getSummaryInformation().toXML());
123: text = text.replaceAll("<!\\[CDATA\\[", "<![CDATA[")
124: .replaceAll("\\]\\]>", "]]>");
125: sb.append("<text><![CDATA[").append(
126: getText().replace('\f', ' ')).append("]]></text>");
127: sb.append("<compressed><![CDATA[").append(getCompressedText())
128: .append("]]></compressed>");
129: sb.append("</extract>");
130: return sb.toString();
131: }
132:
133: public static String toEmptyXML() {
134: return "<extract><summary></summary><compressed></compressed></extract>";
135: }
136: }
|