001: /*
002: * LIUS - Lucene Index Update and Search
003: * http://sourceforge.net/projects/lius/
004: *
005: * Copyright (c) 2005, Laval University Library. All rights reserved.
006: *
007: * This library is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU Lesser General Public
009: * License as published by the Free Software Foundation; either
010: * version 2.1 of the License, or (at your option) any later version.
011: *
012: * This library is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015: * Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public
018: * License along with this library; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
020: */
021:
022: package ca.ulaval.bibl.lius.index.HTML;
023:
024: /**
025: * <p>Titre : Lius (Lucene Index, Update and Search)</p>
026: * <p>Description : Application Java permettant d'indexer du XML, PDF, HTML, JSP, ASP, PHP, Word, Excel et des objets Java</p>
027: * <p>Copyright : Copyright (c) 2003 Rida Benjelloun</p>
028: * @author Rida Benjelloun
029: * @e-mail rbenjelloun@hotmail.com
030: * rida.benjelloun@bibl.ulaval.ca
031: * @version 0.0.1
032: * @date 09-08-2003
033: */
034:
035: import java.io.BufferedReader;
036: import java.io.BufferedWriter;
037: import java.io.File;
038: import java.io.FileInputStream;
039: import java.io.FileNotFoundException;
040: import java.io.FileOutputStream;
041: import java.io.IOException;
042: import java.io.InputStreamReader;
043: import java.io.OutputStreamWriter;
044: import java.util.Collection;
045:
046: import org.apache.log4j.Logger;
047: import org.cyberneko.html.parsers.DOMParser;
048: import org.jdom.JDOMException;
049: import org.jdom.input.DOMBuilder;
050: import org.xml.sax.SAXException;
051:
052: import ca.ulaval.bibl.lius.config.LiusConfig;
053: import ca.ulaval.bibl.lius.config.LiusConfigBuilder;
054: import ca.ulaval.bibl.lius.index.XML.XmlFileIndexer;
055:
056: /**
057: * Classe permettant d'indexer des fichiers HTML <br/><br/>Class for indexing
058: * HTML files.
059: *
060: * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
061: */
062:
063: public class HtmlIndexer extends XmlFileIndexer {
064: private String fileToDelete = null;
065:
066: static Logger logger = Logger.getRootLogger();
067:
068: public Object parse(Object file) {
069: org.jdom.Document jdomDoc = null;
070: try {
071: String newFile = "file:"
072: + omitXMLDeclaration((String) file);
073: DOMParser parser = new DOMParser();
074: parser.parse(newFile);
075: org.w3c.dom.Document domDoc = parser.getDocument();
076: jdomDoc = convert(domDoc);
077: } catch (SAXException e) {
078: logger.error(e.getMessage());
079: } catch (IOException e) {
080: logger.error(e.getMessage());
081: } catch (JDOMException e) {
082: logger.error(e.getMessage());
083: } catch (Exception e) {
084:
085: logger.error(e.getMessage());
086: } finally {
087: deleteTmp(fileToDelete);
088: }
089: return jdomDoc;
090: }
091:
092: public org.jdom.Document convert(org.w3c.dom.Document domDoc)
093: throws JDOMException, IOException {
094: DOMBuilder builder = new DOMBuilder();
095: org.jdom.Document jdomDoc = builder.build(domDoc);
096: return jdomDoc;
097: }
098:
099: /**
100: * Méthode retournant un objet de type Lucene document à partir du fichier à
101: * indexer et du fichier de configuration de Lius exprimé sous forme d'objet
102: * de type LiusConfig. <br/><br/>Method that returns a Lucene Document
103: * object from a file to index and the Lius configuration as a LiusConfig
104: * object.
105: */
106:
107: public org.apache.lucene.document.Document createLuceneDocument(
108: String file, LiusConfig lc) {
109: org.apache.lucene.document.Document doc = createLuceneDocument(
110: file, lc.getHtmlFields());
111: return doc;
112: }
113:
114: /**
115: * Méthode retournant un objet de type Lucene document à partir du fichier à
116: * indexer et d'une collection d'objets de type LiusField. Chaque objet
117: * LiusField contient de l'information sur le nom du champs Lucene, le type,
118: * etc. <br/><br/>Method that returns a Lucene object from the
119: * configuration file and a collection of LiusField objects. Each LiusField
120: * object contains information about the Lucene field, the type, etc.
121: */
122:
123: public Collection getPopulatedCollection(Object file,
124: Collection liusFields) {
125: org.jdom.Document jdomDoc = (org.jdom.Document) this
126: .parse(file);
127: Collection coll = super .getPopulatedCollection(jdomDoc,
128: liusFields);
129: deleteTmp(fileToDelete);
130: return coll;
131: }
132:
133: /**
134: * Permet de récupérer les champs de Lius à partir du fichier de
135: * configuration pour effectuer l'indexation. <br/><br/>Get Lius fiels
136: * from the configuration file for indexation.
137: */
138:
139: public Collection getLiusFields(LiusConfig lc) {
140: return lc.getHtmlFields();
141: }
142:
143: public String omitXMLDeclaration(String file)
144: throws FileNotFoundException, IOException {
145: String line = null;
146: String extension = file.substring(file.lastIndexOf("."));
147: String base = file.substring(0, file.lastIndexOf("."));
148: String fOut = base + "_liusTmpFileToIndex" + extension;
149: FileInputStream fis = new FileInputStream(file);
150: FileOutputStream fos = new FileOutputStream(fOut);
151: BufferedReader in = new BufferedReader(new InputStreamReader(
152: fis));
153: BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
154: fos));
155: while ((line = in.readLine()) != null) {
156: if (line.startsWith("<?xml")) {
157: int offset = line.indexOf("?>");
158: out.write(line.substring(offset + 2));
159: //new String((line.substring(offset+2)).getBytes(), "UTF8"));
160: } else {
161: out.write(line);
162: //new String(line.getBytes(),"UTF8"));
163: }
164: }
165: out.close();
166: fileToDelete = fOut;
167: return fOut;
168: }
169:
170: public Collection getPopulatedCollection(Object file,
171: String liusConfig) {
172: LiusConfig lc = LiusConfigBuilder.getSingletonInstance()
173: .getLiusConfig(liusConfig);
174: return getPopulatedCollection(file, lc);
175: }
176:
177: public Collection getPopulatedCollection(Object file, LiusConfig lc) {
178: return getPopulatedCollection(file, lc.getHtmlFields());
179: }
180:
181: private void deleteTmp(String file) {
182: File toDelete = new File(file);
183: toDelete.delete();
184: }
185: }
|