001: /*
002:
003: * LIUS - Lucene Index Update and Search
004: * http://sourceforge.net/projects/lius/
005: *
006: * Copyright (c) 2005, Laval University Library. All rights reserved.
007: *
008: * This library is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Lesser General Public
010: * License as published by the Free Software Foundation; either
011: * version 2.1 of the License, or (at your option) any later version.
012: *
013: * This library is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public
019: * License along with this library; if not, write to the Free Software
020: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
021: */
022:
023: package ca.ulaval.bibl.lius.index.MSWord;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.InputStream;
028: import java.util.ArrayList;
029: import java.util.Collection;
030: import java.util.Iterator;
031:
032: import org.apache.log4j.Logger;
033: import org.apache.lucene.document.Document;
034: import org.textmining.text.extraction.WordExtractor;
035:
036: import ca.ulaval.bibl.lius.Lucene.LuceneActions;
037: import ca.ulaval.bibl.lius.config.LiusConfig;
038: import ca.ulaval.bibl.lius.config.LiusConfigBuilder;
039: import ca.ulaval.bibl.lius.config.LiusField;
040: import ca.ulaval.bibl.lius.index.Indexer;
041:
042: /**
043: *
044: * Classe permettant d'indexer des fichiers Microsoft Word
045: *
046: * <br/><br/>
047: *
048: * Class for indexing Microsoft Word documents.
049: *
050: * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
051: *
052: */
053:
054: public class WordIndexer
055:
056: extends Indexer {
057:
058: static Logger logger = Logger.getRootLogger();
059:
060: public Object parse(Object file) {
061:
062: File wordDoc = new File((String) file);
063:
064: WordExtractor we = new WordExtractor();
065:
066: InputStream in = null;
067:
068: String text = "";
069:
070: try {
071:
072: in = new FileInputStream(wordDoc);
073:
074: text = we.extractText(in);
075:
076: }
077:
078: catch (Exception e) {
079:
080: logger.error(e.getMessage());
081:
082: }
083:
084: return text;
085:
086: }
087:
088: /**
089: *
090: * Méthode retournant un objet de type Lucene document à partir du fichier
091: *
092: * à indexer et du fichier de configuration de Lius exprimé sous forme
093: *
094: * d'objet de type LiusConfig.
095: *
096: * <br/><br/>
097: *
098: * Method that returns a Lucene Document object from a file to index and
099: *
100: * the Lius Configuration as a LiusConfig object.
101: *
102: */
103:
104: public Document createLuceneDocument(String file, LiusConfig lc) {
105:
106: Document doc = createLuceneDocument(file, lc.getMsWordFields());
107:
108: return doc;
109:
110: }
111:
112: /**
113: *
114: * Méthode retournant un objet de type Lucene document à partir du fichier à
115: *
116: * indexer et d'une collection d'objets de type LiusField. Chaque objet
117: *
118: * LiusField contient de l'information sur le nom du champs Lucene, le type,
119: *
120: * etc.
121: *
122: * <br/><br/>
123: *
124: * Method that return a Lucene object from the configuration file and a
125: * collection
126: *
127: * of LiusField objects. Each LiusField object contains information about
128: * the Lucene
129: *
130: * field, the type, etc.
131: *
132: *
133: *
134: */
135:
136: public Collection getPopulatedCollection(Object file,
137: Collection liusFields) {
138:
139: LuceneActions la = LuceneActions.getSingletonInstance();
140:
141: Collection coll = new ArrayList();
142:
143: Iterator it = liusFields.iterator();
144:
145: while (it.hasNext()) {
146:
147: Object field = it.next();
148:
149: if (field instanceof LiusField) {
150:
151: LiusField lf = (LiusField) field;
152:
153: if (lf.getGet() != null) {
154:
155: if (lf.getGet().equalsIgnoreCase("content")) {
156:
157: String text = (String) parse(file);
158:
159: lf.setValue(text);
160:
161: coll.add(lf);
162:
163: }
164:
165: }
166:
167: }
168:
169: else {
170:
171: coll.add(field);
172:
173: }
174:
175: }
176:
177: return coll;
178:
179: }
180:
181: public Collection getPopulatedCollection(Object file,
182: String liusConfig) {
183:
184: LiusConfig lc = LiusConfigBuilder.getSingletonInstance()
185: .getLiusConfig(
186:
187: liusConfig);
188:
189: return getPopulatedCollection(file, lc);
190:
191: }
192:
193: public Collection getPopulatedCollection(Object file, LiusConfig lc) {
194:
195: return getPopulatedCollection(file, lc.getMsWordFields());
196:
197: }
198:
199: /**
200: *
201: * Permet de récupérer les champs de Lius à partir du fichier de
202: * configuration
203: *
204: * pour effectuer l'indexation.
205: *
206: * <br/><br/>
207: *
208: * Gets Lius fiels from the configuration file for indexation.
209: *
210: */
211:
212: public Collection getLiusFields(LiusConfig lc) {
213:
214: return lc.getMsWordFields();
215:
216: }
217:
218: }
|