001: /*
002: Copyright (C) 2003 Know Gate S.L. All rights reserved.
003: C/Oña, 107 1º2 28050 Madrid (Spain)
004:
005: Redistribution and use in source and binary forms, with or without
006: modification, are permitted provided that the following conditions
007: are met:
008:
009: 1. Redistributions of source code must retain the above copyright
010: notice, this list of conditions and the following disclaimer.
011:
012: 2. The end-user documentation included with the redistribution,
013: if any, must include the following acknowledgment:
014: "This product includes software parts from hipergate
015: (http://www.hipergate.org/)."
016: Alternately, this acknowledgment may appear in the software itself,
017: if and wherever such third-party acknowledgments normally appear.
018:
019: 3. The name hipergate must not be used to endorse or promote products
020: derived from this software without prior written permission.
021: Products derived from this software may not be called hipergate,
022: nor may hipergate appear in their name, without prior written
023: permission.
024:
025: This library is distributed in the hope that it will be useful,
026: but WITHOUT ANY WARRANTY; without even the implied warranty of
027: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
028:
029: You should have received a copy of hipergate License with this code;
030: if not, visit http://www.hipergate.org or mail to info@hipergate.org
031: */
032:
033: package com.knowgate.lucene;
034:
035: import java.util.Properties;
036:
037: import java.io.FileNotFoundException;
038: import java.io.IOException;
039: import java.io.File;
040: import java.io.FilenameFilter;
041: import java.io.FileReader;
042: import java.io.FileInputStream;
043:
044: import org.apache.lucene.analysis.*;
045: import org.apache.lucene.index.*;
046: import org.apache.lucene.document.*;
047:
048: import org.apache.oro.text.regex.*;
049:
050: import com.knowgate.debug.DebugFile;
051:
052: /**
053: * <p>Simple HTML crawler for Lucene</p>
054: * @author Sergio Montoro Ten
055: * @version 1.0
056: * @see http://jakarta.apache.org/lucene/docs/index.html
057: */
058:
059: public class Crawler {
060:
061: class RegExpFilter implements FilenameFilter {
062:
063: private Pattern oPattern;
064: private PatternMatcher oMatcher;
065: private PatternCompiler oCompiler;
066:
067: RegExpFilter(String sPattern) throws MalformedPatternException {
068: oMatcher = new Perl5Matcher();
069: oCompiler = new Perl5Compiler();
070: oPattern = oCompiler.compile(sPattern);
071: }
072:
073: public boolean accept(File oFile, String sName) {
074: return oFile.isDirectory()
075: || oMatcher.matches(sName, oPattern);
076: }
077: } // RegExpFilter
078:
079: // ---------------------------------------------------------------------------
080: // Private Variables
081:
082: private String sSeparator;
083: private PatternMatcher oMatcher;
084: private PatternCompiler oCompiler;
085: private Pattern oTagPattern;
086:
087: // ---------------------------------------------------------------------------
088:
089: public Crawler() {
090: oMatcher = new Perl5Matcher();
091: oCompiler = new Perl5Compiler();
092:
093: try {
094: oTagPattern = oCompiler.compile("<[^>]*>");
095: } catch (MalformedPatternException mpe) {
096: }
097:
098: sSeparator = System.getProperty("file.separator");
099: }
100:
101: // ---------------------------------------------------------------------------
102:
103: private Document makeHTMLDocument(String sRelativePath,
104: String sName, String sHTMLText) {
105: int iTitleStart, iTitleEnd;
106:
107: if (DebugFile.trace)
108: DebugFile.writeln("Crawler.addHTMLDocument("
109: + sRelativePath + "," + sName + ")");
110:
111: iTitleStart = sHTMLText.indexOf("<TITLE>");
112: if (iTitleStart < 0)
113: iTitleStart = sHTMLText.indexOf("<title>");
114:
115: if (iTitleStart >= 0) {
116: iTitleEnd = sHTMLText.indexOf("</TITLE>");
117: if (iTitleEnd < 0)
118: iTitleEnd = sHTMLText.indexOf("</title>");
119: } else
120: iTitleEnd = -1;
121:
122: String sTitle;
123:
124: if (iTitleStart >= 0 && iTitleEnd > 0)
125:
126: sTitle = sHTMLText.substring(iTitleStart + 7, iTitleEnd)
127: .trim();
128:
129: else {
130:
131: sTitle = null;
132:
133: // ***************************************************************
134: // Código ñapa para indexar las listas de correo waltrappa de Iván
135:
136: iTitleStart = sHTMLText.indexOf("<H1>");
137: if (iTitleStart < 0)
138: iTitleStart = sHTMLText.indexOf("<h1>");
139:
140: if (iTitleStart >= 0) {
141: iTitleEnd = sHTMLText.indexOf("</H1>");
142: if (iTitleEnd < 0)
143: iTitleEnd = sHTMLText.indexOf("</h1>");
144: }
145:
146: if (iTitleStart >= 0 && iTitleEnd > 0)
147: sTitle = sHTMLText
148: .substring(iTitleStart + 4, iTitleEnd).trim();
149:
150: iTitleStart = sHTMLText.indexOf("<H2>");
151: if (iTitleStart < 0)
152: iTitleStart = sHTMLText.indexOf("<h2>");
153:
154: if (iTitleStart >= 0) {
155: iTitleEnd = sHTMLText.indexOf("</H2>");
156: if (iTitleEnd < 0)
157: iTitleEnd = sHTMLText.indexOf("</h2>");
158: }
159:
160: if (iTitleStart >= 0 && iTitleEnd > 0)
161: if (null == sTitle)
162: sTitle = sHTMLText.substring(iTitleStart + 4,
163: iTitleEnd).trim();
164: else
165: sTitle += " "
166: + sHTMLText.substring(iTitleStart + 4,
167: iTitleEnd).trim();
168:
169: // Fin de ñapa
170: // ***************************************************************
171:
172: if (sTitle == null)
173: sTitle = "untitled";
174: }
175:
176: Document oDoc = new Document();
177:
178: oDoc
179: .add(new Field("subpath", sRelativePath, true, false,
180: false));
181: oDoc.add(new Field("name", sName, true, false, false));
182: oDoc.add(Field.Keyword("title", sTitle));
183: oDoc.add(Field.UnStored("text", Util.substitute(oMatcher,
184: oTagPattern, new StringSubstitution(""), sHTMLText,
185: Util.SUBSTITUTE_ALL)));
186:
187: return oDoc;
188: } // makeHTMLDocument
189:
190: // ---------------------------------------------------------------------------
191:
192: private void crawlDir(IndexWriter oIWrt, String sBasePath,
193: int iBasePathlen, RegExpFilter oFileFilter)
194: throws IOException, FileNotFoundException {
195:
196: if (DebugFile.trace) {
197: DebugFile.writeln("Begin Crawler.crawlDir(" + sBasePath
198: + ")");
199: DebugFile.incIdent();
200: }
201:
202: File oBaseDir = new File(sBasePath);
203: String sName;
204:
205: if (!oBaseDir.exists())
206: throw new FileNotFoundException(sBasePath
207: + " directory does not exist");
208:
209: if (!oBaseDir.isDirectory())
210: throw new IOException(sBasePath + " is not a directory");
211:
212: File[] aFiles = oBaseDir.listFiles();
213: int iFiles = aFiles.length;
214:
215: int iBuffer;
216: char[] aBuffer;
217: String sBuffer;
218: String sText;
219: Document oDoc;
220:
221: sBasePath += sSeparator;
222:
223: for (int f = 0; f < iFiles; f++) {
224:
225: if (aFiles[f].isDirectory()) {
226:
227: crawlDir(oIWrt, sBasePath + aFiles[f].getName(),
228: iBasePathlen, oFileFilter);
229: }
230:
231: else {
232:
233: sName = aFiles[f].getName().toLowerCase();
234:
235: if (sName.endsWith(".htm") || sName.endsWith(".html")
236: || sName.endsWith(".shtml")
237: || sName.endsWith(".shtm")) {
238: iBuffer = new Long(aFiles[f].length()).intValue();
239:
240: if (iBuffer > 0) {
241: FileReader oReader = new FileReader(aFiles[f]);
242: aBuffer = new char[iBuffer];
243: oReader.read(aBuffer);
244: sBuffer = new String(aBuffer);
245:
246: oIWrt.addDocument(makeHTMLDocument(sBasePath
247: .substring(iBasePathlen), aFiles[f]
248: .getName(), sBuffer));
249: } // fi (iBuffer>0)
250: } // fi (sName.endsWith(".htm") || sName.endsWith(".html"))
251: }
252: } // next
253:
254: if (DebugFile.trace) {
255: DebugFile.decIdent();
256: DebugFile.writeln("End Crawler.crawlDir()");
257: }
258: } // crawlDir
259:
260: // ---------------------------------------------------------------------------
261:
262: /**
263: * <p>Add contents to a Lucene Index
264: * @param sBasePath Base Path for crawling
265: * @param sFileFilter Perl5 Regular Expression filter for file names
266: * @param sIndexDirectory Lucene index target directory
267: * @param bRebuild <b>true</b> if index must be deleted and fully rebuild.
268: * @throws IOException
269: * @throws FileNotFoundException If sBasePath direcory does not exist
270: * @throws MalformedPatternException If sFileFilter is not a valid Perl5 regular expression pattern
271: */
272: public void crawl(String sBasePath, String sFileFilter,
273: String sIndexDirectory, boolean bRebuild)
274: throws IOException, MalformedPatternException {
275:
276: if (DebugFile.trace) {
277: DebugFile.writeln("Begin Crawler.crawl(" + sBasePath + ","
278: + sFileFilter + "," + sIndexDirectory + ")");
279: DebugFile.incIdent();
280: }
281:
282: IndexWriter oIWrt = new IndexWriter(sIndexDirectory,
283: new SimpleAnalyzer(), bRebuild);
284:
285: if (sBasePath.endsWith(sSeparator))
286: sBasePath = sBasePath.substring(0, sBasePath.length() - 1);
287:
288: crawlDir(oIWrt, sBasePath, sBasePath.length(),
289: new RegExpFilter(sFileFilter));
290:
291: oIWrt.optimize();
292: oIWrt.close();
293:
294: if (DebugFile.trace) {
295: DebugFile.decIdent();
296: DebugFile.writeln("End Crawler.crawl()");
297: }
298: } // crawl
299:
300: // ---------------------------------------------------------------------------
301:
302: private static void printUsage() {
303: System.out.println("");
304: System.out.println("Usage:");
305: System.out
306: .println("Crawler cnf_path rebuild index_name base_path");
307: }
308:
309: // ---------------------------------------------------------------------------
310:
311: public static void main(String[] argv) throws NoSuchFieldException,
312: IOException, FileNotFoundException,
313: MalformedPatternException {
314:
315: if (argv.length != 4)
316: printUsage();
317: else if (!argv[1].equals("rebuild")) {
318: printUsage();
319: } else {
320: Properties oProps = new Properties();
321: FileInputStream oCNF = new FileInputStream(argv[0]);
322: oProps.load(oCNF);
323: oCNF.close();
324:
325: String sDirectory = oProps.getProperty("luceneindex");
326:
327: if (null == sDirectory)
328: throw new NoSuchFieldException(
329: "Cannot find luceneindex property");
330:
331: if (!sDirectory.endsWith(System
332: .getProperty("file.separator")))
333: sDirectory += System.getProperty("file.separator");
334:
335: new Crawler().crawl(argv[3], ".*htm*$", sDirectory
336: + argv[2], true);
337: }
338: } // main
339:
340: } // Crawler
|