01: package org.contineo.core.text;
02:
03: import java.util.Collection;
04: import java.util.Iterator;
05:
06: import org.contineo.core.document.Term;
07: import org.contineo.core.document.dao.TermDAO;
08: import org.contineo.core.text.analyze.Analyzer;
09: import org.contineo.core.text.analyze.AnalyzerFactory;
10: import org.contineo.core.text.analyze.Entry;
11: import org.contineo.util.Context;
12:
13: /**
14: * Class for analysing texts like extracting keywords from a given text.
15: * Created on 24.03.2004
16: *
17: * @author Michael Scholz
18: */
19: public class AnalyzeText {
20:
21: public AnalyzeText() {
22: }
23:
24: /**
25: * This method selects 20 keywords of a given text in a specified language
26: * and stores these keywords in a database.
27: *
28: * @param menuId MenuId of the document the text is from.
29: * @param text Text of a document.
30: * @param language Identified language of the text.
31: * @throws Exception
32: */
33: public void storeTerms(int menuId, String text, String language)
34: throws Exception {
35: TermDAO termDao = (TermDAO) Context.getInstance().getBean(
36: TermDAO.class);
37: Analyzer analyzer = AnalyzerFactory.getAnalyzer(language);
38: analyzer.analyze(text);
39:
40: long words = analyzer.getWordCount();
41: Collection terms = analyzer.getTopWords(20);
42: Iterator iter = terms.iterator();
43:
44: while (iter.hasNext()) {
45: Entry entry = (Entry) iter.next();
46: Term term = new Term();
47: term.setMenuId(menuId);
48: term.setStem(entry.getWord());
49: term.setValue(entry.getNumber() * 1000 / words);
50: term.setWordCount(entry.getNumber());
51: term.setOriginWord(entry.getOriginWord());
52: termDao.store(term);
53: }
54: }
55:
56: /**
57: * This method extracts a specified number of keywords and appends them to a
58: * String
59: *
60: * @param count Number of keywords.
61: * @param text Given text of a document.
62: * @param language Identified language of the text.
63: * @return String of keywords like "Information, Retrieval, DMS, CMS"
64: * @throws Exception
65: */
66: public String getTerms(int count, String text, String language)
67: throws Exception {
68: StringBuffer result = new StringBuffer();
69: Analyzer analyzer = AnalyzerFactory.getAnalyzer(language);
70: analyzer.analyze(text);
71:
72: Collection terms = analyzer.getTopWords(count);
73: Iterator iter = terms.iterator();
74: int temp = 0;
75:
76: while (iter.hasNext() && (temp < count)) {
77: Entry entry = (Entry) iter.next();
78:
79: if (temp > 0) {
80: result.append(", ");
81: }
82:
83: result.append(entry.getOriginWord());
84: temp++;
85: }
86:
87: return result.toString();
88: }
89: }
|