/*
 * (C) Copyright 2004 Nabh Information Systems, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
package com.nabhinc.portlet.search;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.HashMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.portlet.PortletException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

/**
 * Crawls a web site breadth-first from a seed URL and builds a Lucene
 * search index of the HTML and PDF documents it finds. Links can be
 * filtered with inclusion and exclusion regular expressions, and the
 * crawl depth is bounded.
 *
 * @author Padmanabh Dabke
 * (c) 2004 Nabh Information Systems, Inc. All Rights Reserved.
 */
public class WebSiteIndexer {

    private static final char FILE_SEPARATOR =
            System.getProperty("file.separator").charAt(0);
    private static Log wsiLogger = LogFactory.getLog(WebSiteIndexer.class);

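    /**
     * Command-line entry point for manual testing. The seed URL, index
     * directory, and regular expression filters below are sample values;
     * adjust them for the site being indexed.
     */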
    public static void main(String[] args) {
        String seedURL = "http://localhost/stringbeans";
        String indexDir = "C:/temp/index";
        try {
            String[] includeRegex = { "http://localhost" };
            String[] excludeRegex = { ".*maximized.*", ".*minimized.*",
                    ".*process_action.*" };
            createSiteIndex(seedURL, indexDir, includeRegex, excludeRegex, 10);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

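    /**
     * Crawls a web site breadth-first starting from the given seed URL and
     * indexes the HTML and PDF documents it finds.
     *
     * @param seedURLStr starting URL for the crawl
     * @param indexDirStr directory in which the Lucene index is created
     * @param includeRegex a link is followed only if it matches at least one
     *        of these patterns
     * @param excludeRegex a link is skipped if it matches any of these patterns
     * @param maxDepth maximum number of link generations to follow
     * @throws PortletException if the index cannot be created or optimized
     */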
    public static void createSiteIndex(String seedURLStr, String indexDirStr,
            String[] includeRegex, String[] excludeRegex, int maxDepth)
            throws PortletException {
        // IndexReader reader; // existing index
        IndexWriter writer; // new index being built
        // TermEnum uidIter; // document id iterator
        final TagNameFilter filter = new TagNameFilter("A");
        NodeList list;
        Parser parser = null;

        // Check for a valid index directory. If the directory does not exist,
        // create one.
        File indexDir = new File(indexDirStr);
        if (indexDir.isFile()) {
            throw new PortletException(indexDirStr
                    + " is a file, not a directory.");
        }
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }

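        // Create a new index; the third IndexWriter argument is the "create"
        // flag, so any existing index in this directory is overwritten rather
        // than incrementally updated.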
        try {
            writer = new IndexWriter(indexDirStr, new StandardAnalyzer(), true);
            writer.setMaxFieldLength(1000000);
        } catch (Exception ex) {
            throw new PortletException(
                    "Could not create search index writer.", ex);
        }

        // Holds all URLs we have visited
        HashMap urlCollection = new HashMap();

        // URLs we are currently working on
        Vector currentURLs = new Vector();

        // Links found in processing current generation documents
        Vector nextGenURLs = new Vector();

        // Start off with the seed URL, recording it as visited so that links
        // back to it are not indexed twice.
        currentURLs.addElement(seedURLStr);
        urlCollection.put(seedURLStr, "");
        int depth = 0;
        int includeLen = includeRegex == null ? 0 : includeRegex.length;
        int excludeLen = excludeRegex == null ? 0 : excludeRegex.length;
        // Create pattern objects corresponding to inclusion/exclusion pattern
        // strings
        Pattern[] includePatterns = new Pattern[includeLen];
        for (int j = 0; j < includeLen; j++) {
            includePatterns[j] = Pattern.compile(includeRegex[j]);
        }
        Pattern[] excludePatterns = new Pattern[excludeLen];
        for (int j = 0; j < excludeLen; j++) {
            excludePatterns[j] = Pattern.compile(excludeRegex[j]);
        }
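        // Note: Matcher.find() is used when filtering links, so a pattern
        // matches if it occurs anywhere within a URL, not only when it
        // matches the whole URL.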

        // Index URLs till we run out of depth or URLs
        while (depth < maxDepth && currentURLs.size() > 0) {
            int len = currentURLs.size();
            for (int count = 0; count < len; count++) {
                String currentURL = (String) currentURLs.elementAt(count);
                wsiLogger.info("Processing: " + currentURL);
                boolean isNotHTML = true;
                try {
                    URL docURL = new URL(currentURL);
                    URLConnection conn = docURL.openConnection();
                    String contentType = conn.getContentType();
                    Document doc = null;
                    if (contentType == null) {
                        // Unknown content type; skip the document for now.
                        continue;
                    } else if (contentType.indexOf("text/html") > -1) {
                        parser = new Parser(conn);
                        doc = getHTMLDocument(parser);
                        if (doc == null)
                            continue;
                        isNotHTML = false;
                    } else if (contentType.indexOf("application/pdf") > -1
                            || contentType.indexOf("application/x-pdf") > -1) {
                        // Index PDF document
                        doc = getPDFDocument(conn, currentURL);
                    } else {
                        continue;
                    }

                    // Add the url as a field named "url". Use an UnIndexed field, so
                    // that the url is just stored with the document, but is not searchable.
                    doc.add(new Field("url", docURL.toExternalForm(),
                            Field.Store.YES, Field.Index.NO));

                    // Add the last modified date of the file as a field named "modified".
                    // Use a Keyword field, so that it is searchable, but no attempt is
                    // made to tokenize the field into words.
                    doc.add(new Field("modified",
                            DateField.timeToString(conn.getLastModified()),
                            Field.Store.YES, Field.Index.UN_TOKENIZED));

                    String uid = docURL.toExternalForm().replace(
                            FILE_SEPARATOR, '\u0000')
                            + "\u0000"
                            + DateField.timeToString(conn.getLastModified());

                    // Add the uid as a field, so that the index can be
                    // incrementally maintained. This field is not stored with
                    // the document; it is indexed, but not tokenized prior to
                    // indexing.
                    doc.add(new Field("uid", uid, Field.Store.NO,
                            Field.Index.UN_TOKENIZED));

                    // Add document to the index
                    writer.addDocument(doc);

                    if (isNotHTML)
                        continue;

                    // Extract links in the document
                    parser.reset();
                    list = parser.extractAllNodesThatMatch(filter);
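                    // The "skiplink" label lets the filter checks below
                    // abandon the current link and move on to the next one.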
                    skiplink: for (int i = 0; i < list.size(); i++) {
                        LinkTag link = (LinkTag) list.elementAt(i);
                        String url = link.getLink();
                        if (link.isHTTPLink()) {
                            boolean found = false;
                            for (int j = 0; j < includeLen; j++) {
                                Matcher m = includePatterns[j].matcher(url);
                                if (m.find()) {
                                    found = true;
                                    break;
                                }
                            }

                            if (!found) {
                                continue skiplink;
                            }

                            for (int j = 0; j < excludeLen; j++) {
                                Matcher m = excludePatterns[j].matcher(url);
                                if (m.find()) {
                                    continue skiplink;
                                }
                            }
                            if (!urlCollection.containsKey(url)) {
                                urlCollection.put(url, "");
                                nextGenURLs.addElement(url);
                            }
                        }
                    }
                } catch (IOException ioe) {
                    wsiLogger.warn("Broken URL: " + currentURL, ioe);
                } catch (ParserException ex) {
                    wsiLogger.warn("Broken URL: " + currentURL, ex);
                }
            }
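            // Move on to the next generation of links: swap the two vectors
            // and clear the old one for reuse.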
            depth++;
            Vector tempVec = currentURLs;
            currentURLs = nextGenURLs;
            nextGenURLs = tempVec;
            nextGenURLs.clear();
        }

        wsiLogger.info("Optimizing index...");

        try {
            writer.optimize();
            writer.close();
        } catch (Exception ex) {
            throw new PortletException(
                    "Could not optimize search index.", ex);
        }

    }

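    /**
     * Returns a short summary of the given content: roughly the first 200
     * characters, extended to the next word boundary.
     */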
    private static String getSummary(String content) {
        if (content == null)
            return "";
        if (content.length() < 200)
            return content;
        int spaceIndex = content.indexOf(" ", 200);
        if (spaceIndex == -1)
            return content;
        return content.substring(0, spaceIndex);
    }

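    /**
     * Builds a Lucene document from a parsed HTML page, indexing its title
     * and tag-stripped text and storing a short summary for display.
     */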
    private static Document getHTMLDocument(Parser parser)
            throws PortletException {
        final NodeFilter titleFilter = new TagNameFilter("TITLE");
        Document doc = new Document();

        try {

            // Add the title as a separate Text field, so that it can be
            // searched separately.
            NodeList list = parser.extractAllNodesThatMatch(titleFilter);
            String title = "Untitled";
            if (list != null && list.size() != 0) {
                TitleTag titleTag = (TitleTag) list.elementAt(0);
                title = titleTag.getChildrenHTML();
            }
            doc.add(new Field("title", title, Field.Store.YES,
                    Field.Index.TOKENIZED));
            wsiLogger.debug("Doc Title: " + title);

            // Add the tag-stripped contents as a stored Text field so it will
            // get tokenized and indexed.
            parser.reset();
            StringBean sb = new StringBean();
            sb.setLinks(false);
            parser.visitAllNodesWith(sb);
            String content = sb.getStrings();
            if (content == null)
                return null;
            doc.add(new Field("contents", content, Field.Store.YES,
                    Field.Index.TOKENIZED));

            // Add the summary as an UnIndexed field, so that it is stored and
            // returned with hit documents for display.
            String summary = getSummary(content);
            wsiLogger.debug("Summary: " + summary);
            doc.add(new Field("summary", summary, Field.Store.YES,
                    Field.Index.NO));

            return doc;
        } catch (Exception ex) {
            throw new PortletException(
                    "Failed to index HTML document.", ex);
        }

    }

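    /**
     * Builds a Lucene document from a PDF document served at the given URL
     * by streaming the connection content through PDFBox.
     */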
    private static Document getPDFDocument(URLConnection connection,
            String docURL) throws PortletException {
        try {
            Document document = new Document();
            connection.connect();
            InputStream input = null;
            try {
                input = connection.getInputStream();
                addPDFContent(document, input, docURL);
            } finally {
                if (input != null) {
                    input.close();
                }
            }

            // return the document
            return document;

        } catch (Exception ex) {
            throw new PortletException("Failed to index PDF document.", ex);
        }
    }

    /**
     * Adds the PDF contents and metadata to the Lucene document.
     *
     * @param document The document to add the contents to.
     * @param is The stream to get the contents from.
     * @param documentLocation The location of the document, used just for error messages.
     *
     * @throws PortletException If there is an error parsing the document.
     */
    private static void addPDFContent(Document document, InputStream is,
            String documentLocation) throws PortletException {
        PDDocument pdfDocument = null;
        try {
            pdfDocument = PDDocument.load(is);

            if (pdfDocument.isEncrypted()) {
                // Just try using the default password and move on
                pdfDocument.decrypt("");
            }

            // Extract the PDF text into an in-memory buffer.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfDocument, writer);
            writer.close();

            byte[] contents = out.toByteArray();
            InputStreamReader input = new InputStreamReader(
                    new ByteArrayInputStream(contents));
            // Add the extracted text as a Reader-valued field so it gets
            // tokenized and indexed. Reader-valued fields are not stored.
            document.add(new Field("contents", input));

            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if (info.getAuthor() != null) {
                document.add(new Field("Author", info.getAuthor(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getCreationDate() != null) {
                Date date = info.getCreationDate().getTime();
                // Lucene cannot handle dates before the epoch and throws a
                // RuntimeException, so verify that this does not happen.
                if (date.getTime() >= 0) {
                    document.add(new Field("CreationDate",
                            DateField.dateToString(date), Field.Store.YES,
                            Field.Index.TOKENIZED));
                }
            }
            if (info.getCreator() != null) {
                document.add(new Field("Creator", info.getCreator(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getKeywords() != null) {
                document.add(new Field("Keywords", info.getKeywords(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getModificationDate() != null) {
                Date date = info.getModificationDate().getTime();
                // Lucene cannot handle dates before the epoch and throws a
                // RuntimeException, so verify that this does not happen.
                if (date.getTime() >= 0) {
                    document.add(new Field("ModificationDate",
                            DateField.dateToString(date), Field.Store.YES,
                            Field.Index.NO));
                }
            }
            if (info.getProducer() != null) {
                document.add(new Field("Producer", info.getProducer(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getSubject() != null) {
                document.add(new Field("Subject", info.getSubject(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getTitle() != null) {
                document.add(new Field("Title", info.getTitle(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getTrapped() != null) {
                document.add(new Field("Trapped", info.getTrapped(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }

            int summarySize = Math.min(contents.length, 500);
            // Add the summary as an UnIndexed field, so that it is stored and
            // returned with hit documents for display.
            document.add(new Field("summary",
                    new String(contents, 0, summarySize),
                    Field.Store.YES, Field.Index.NO));
        } catch (CryptographyException e) {
            throw new PortletException("Error decrypting document ("
                    + documentLocation + ").", e);
        } catch (InvalidPasswordException e) {
            // They did not supply a password and the default of "" was wrong.
            throw new PortletException("Error: The document ("
                    + documentLocation
                    + ") is encrypted and will not be indexed.", e);
        } catch (IOException e) {
            throw new PortletException(
                    "IO Exception in indexing PDF document "
                            + documentLocation + ".", e);
        } finally {
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                } catch (Exception ex) {
                    // ignore
                }
            }
        }
    }

}
|