001: /*
002: * $Id: AbstractSearchEngine.java 644 2006-04-23 07:52:28Z wrh2 $
003: *
004: * Filename : AbstractSearchEngine.java
005: * Project : vqwiki-classic
006: */
007: package vqwiki;
008:
009: import java.io.File;
010: import java.io.IOException;
011: import java.io.Reader;
012: import java.io.StringReader;
013: import java.lang.reflect.InvocationTargetException;
014: import java.lang.reflect.Method;
015: import java.util.ArrayList;
016: import java.util.Collection;
017: import java.util.Collections;
018: import java.util.HashSet;
019: import java.util.Iterator;
020: import java.util.List;
021: import java.util.Set;
022:
023: import javax.servlet.ServletContext;
024:
025: import org.apache.commons.httpclient.HttpClient;
026: import org.apache.commons.httpclient.HttpException;
027: import org.apache.commons.httpclient.HttpMethod;
028: import org.apache.commons.httpclient.methods.GetMethod;
029: import org.apache.log4j.Logger;
030: import org.apache.lucene.analysis.Analyzer;
031: import org.apache.lucene.analysis.Token;
032: import org.apache.lucene.analysis.TokenStream;
033: import org.apache.lucene.document.Document;
034: import org.apache.lucene.document.Field;
035: import org.apache.lucene.index.IndexReader;
036: import org.apache.lucene.index.IndexWriter;
037: import org.apache.lucene.index.Term;
038: import org.apache.lucene.queryParser.QueryParser;
039: import org.apache.lucene.search.BooleanQuery;
040: import org.apache.lucene.search.Hits;
041: import org.apache.lucene.search.IndexSearcher;
042: import org.apache.lucene.search.Searcher;
043: import org.apache.lucene.store.Directory;
044: import org.apache.lucene.store.FSDirectory;
045: import org.apache.lucene.store.InputStream;
046: import org.apache.lucene.store.OutputStream;
047: import org.apache.lucene.store.RAMDirectory;
048:
049: import vqwiki.lex.BackLinkLex;
050: import vqwiki.utils.Utilities;
051: import vqwiki.utils.lucene.HTMLParser;
052: import vqwiki.utils.lucene.LuceneTools;
053: import vqwiki.utils.lucene.SimpleKeepNumbersAnalyzer;
054:
/**
 * Abstract class to do the search.
 *
 * Holds the Lucene-based indexing and querying logic shared by all concrete
 * {@link SearchEngine} implementations; subclasses supply storage specifics
 * (see {@link #getFilename(String, String)}).
 *
 * This class was created on 09:59:41 04.08.2003
 *
 * @author tobsch
 */
public abstract class AbstractSearchEngine implements SearchEngine {

    /** Lucene field: file name of the topic (stored only, not indexed) */
    protected static final String ITYPE_FILE = "file";
    /** Lucene field: tokenized topic name (searchable) */
    protected static final String ITYPE_TOPIC = "topic";
    /** Lucene field: tokenized topic content (searchable) */
    protected static final String ITYPE_CONTENT = "content";
    /** Lucene field: raw topic content (stored only, used for snippet extraction) */
    protected static final String ITYPE_CONTENT_PLAIN = "content_plain";
    /** Lucene field: untokenized topic name, used as the document key for deletes */
    protected static final String ITYPE_TOPIC_PLAIN = "topic_plain";
    /** Where to log to */
    private static final Logger logger = Logger
            .getLogger(AbstractSearchEngine.class);
    /** File separator */
    protected static String sep = System.getProperty("file.separator");
    /** Temp directory - where to store the indexes (initialized via getInstance method) */
    protected static String indexPath = null;
    /** Index is stored in RAM */
    private static final int RAM_BASED = 0;
    /** Index is stored in the file system */
    private static final int FS_BASED = 1;

    /** where is the index stored (one of RAM_BASED / FS_BASED) */
    private transient int fsType = FS_BASED;
    /** Can we parse HTML files? (set by rebuild() via a reflective class check) */
    private transient boolean canParseHTML = false;
    /** Can we parse PDF files? (set by rebuild() via a reflective class check) */
    private transient boolean canParsePDF = false;
092:
093: /**
094: * Index the given text for the search engine database
095: */
096: public void indexText(String virtualWiki, String topic, String text)
097: throws IOException {
098: // put keywords into index db - ignore particles etc
099: add(virtualWiki, topic, text);
100: }
101:
102: /**
103: * Should be called by a monitor thread at regular intervals, rebuilds the
104: * entire seach index to account for removed items. Due to the additive rather
105: * than subtractive nature of a Wiki, it probably only needs to be called once
106: * or twice a day
107: */
108: public void refreshIndex() throws Exception {
109: rebuild();
110: }
111:
112: /**
113: * Find topics that contain the given term.
114: * Note: Use this method ONLY to search for topics!
115: *
116: * @param virtualWiki The virtual wiki to use
117: * @param text The text to find
118: * @param fuzzy true, if fuzzy search should be used, false otherwise
119: *
120: * @return A collection of SearchResultEntry, containing the search results
121: */
122: public Collection find(String virtualWiki, String text,
123: boolean doTextBeforeAndAfterParsing) {
124: return doSearch(virtualWiki, text, false,
125: doTextBeforeAndAfterParsing);
126: }
127:
128: /**
129: * Find topics that contain a link to the given topic name
130: * @param virtualWiki the virtual wiki to look in
131: * @param topicName the topic being searched for
132: * @return A collection of SearchResultEntry, containing the search results
133: */
134: public Collection findLinkedTo(String virtualWiki, String topicName)
135: throws Exception {
136: // create a set to hold the valid back linked topics
137: Set results = new HashSet();
138: // find all topics that actually mention the name of the topic in the text somewhere
139: Collection all = doSearch(virtualWiki, topicName, false, false);
140: // iterate the results from the general search
141: for (Iterator iterator = all.iterator(); iterator.hasNext();) {
142: SearchResultEntry searchResultEntry = (SearchResultEntry) iterator
143: .next();
144: // the topic where the hit was is the topic that will potentially contain a link back to our topicName
145: String topicFoundIn = searchResultEntry.getTopic();
146: if (!topicName.equalsIgnoreCase(topicFoundIn)) {
147: logger.debug("checking links in topic " + topicFoundIn
148: + " to " + topicName);
149: // read the raw content of the topic the hit was in
150: String topicContents = WikiBase.getInstance().readRaw(
151: virtualWiki, topicFoundIn);
152: StringReader reader = new StringReader(topicContents);
153: BackLinkLex backLinkLex = new BackLinkLex(reader);
154: // lex the whole file with a back link lexer that simply catalogues all the valid intrawiki links
155: while (backLinkLex.yylex() != null)
156: ;
157: reader.close();
158: // get the intrawiki links
159: List backLinks = backLinkLex.getLinks();
160: logger.debug("links: " + backLinks);
161: if (Utilities.containsStringIgnoreCase(backLinks,
162: topicName)) {
163: // only add the topic if there is an actual link
164: results.add(searchResultEntry);
165: logger.debug("'" + topicFoundIn
166: + "' does contain a link to '" + topicName
167: + "'");
168: } else {
169: logger.debug("'" + topicFoundIn
170: + "' contains no link to '" + topicName
171: + "'");
172: }
173: } else {
174: // the topic itself does not count as a back link
175: logger.debug("the topic itself is not a back link");
176: }
177: }
178: return results;
179: }
180:
181: /**
182: * Find topics that contain any of the space delimited terms.
183: * Note: Use this method for full text search.
184: *
185: * @param virtualWiki The virtual wiki to use
186: * @param text The text to find
187: * @param fuzzy true, if fuzzy search should be used, false otherwise
188: *
189: * @return A collection of SearchResultEntry, containing the search results
190: */
191: public Collection findMultiple(String virtualWiki, String text,
192: boolean fuzzy) {
193: return doSearch(virtualWiki, text, true, true);
194: }
195:
196: /**
197: * @param indexPath
198: */
199: protected void initSearchEngine(ServletContext ctx)
200: throws Exception {
201: // Initialize the temp directory used to store search indexes.
202: // In order to avoid collisions in the case of multiple deployments
203: // of this very application, the temp directory supplied by the
204: // servlet container (which is required to be private per servlet
205: // context by ?? 3.7.1 of the Java Servlet Specification) is used
206: // rather than the global temp directory as defined in the system
207: // property 'java.io.tmpdir'.
208: try {
209: File tmpDir = (File) ctx
210: .getAttribute("javax.servlet.context.tempdir");
211: indexPath = tmpDir.getPath();
212: } catch (Throwable t) {
213: logger
214: .warn(
215: "'javax.servlet.context.tempdir' attribute undefined or invalid, using java.io.tmpdir",
216: t);
217: indexPath = System.getProperty("java.io.tmpdir");
218: }
219: refreshIndex();
220: }
221:
222: /**
223: * @param indexPath
224: */
225: protected void initSearchEngine(String iP) throws Exception {
226: indexPath = iP;
227: refreshIndex();
228: }
229:
/**
 * Actually perform the search.
 *
 * @param virtualWiki The virtual wiki to use
 * @param text The text to find
 * @param caseInsensitiveSearch true, if case does not matter in search, false otherwise
 * @param doTextBeforeAndAfterParsing true to extract the text surrounding each
 *        hit into the result entries, false to return bare topic hits
 *
 * @return A collection of SearchResultEntry, containing the search results
 */
protected Collection doSearch(String virtualWiki, String text,
        boolean caseInsensitiveSearch,
        boolean doTextBeforeAndAfterParsing) {
    // engine not initialized yet -> nothing to search
    if (indexPath == null) {
        return Collections.EMPTY_LIST;
    }
    String indexFilename = getSearchIndexPath(virtualWiki);
    Analyzer analyzer = new SimpleKeepNumbersAnalyzer();
    Collection result = new ArrayList();
    logger.debug("search text: " + text);
    try {
        // search both the topic-name field and the content field;
        // add(query, required=false, prohibited=false) ORs the clauses
        BooleanQuery query = new BooleanQuery();
        if (caseInsensitiveSearch) {
            query.add(QueryParser
                    .parse(text, ITYPE_TOPIC, analyzer), false,
                    false);
            query.add(QueryParser.parse(text, ITYPE_CONTENT,
                    analyzer), false, false);
        } else {
            // quote the text so it is parsed as an exact phrase
            query.add(QueryParser.parse("\"" + text + "\"",
                    ITYPE_TOPIC, analyzer), false, false);
            query.add(QueryParser.parse("\"" + text + "\"",
                    ITYPE_CONTENT, analyzer), false, false);
        }
        Searcher searcher = new IndexSearcher(getIndexDirectory(
                indexFilename, false));
        // actually perform the search
        Hits hits = searcher.search(query);
        for (int i = 0; i < hits.length(); i++) {
            SearchResultEntry entry = new SearchResultEntry();
            entry.setTopic(hits.doc(i).get(ITYPE_TOPIC_PLAIN));
            entry.setRanking(hits.score(i));
            boolean canBeAdded = true;
            boolean found = false;
            if (doTextBeforeAndAfterParsing) {
                String content = hits.doc(i).get(
                        ITYPE_CONTENT_PLAIN);
                if (content != null) {
                    if (!caseInsensitiveSearch) {
                        // phrase search: a plain substring test is sufficient
                        if (content.indexOf(text) != -1) {
                            found = true;
                        }
                    } else {
                        if (content.toLowerCase().indexOf(
                                text.toLowerCase()) != -1) {
                            found = true;
                        }
                        if (!found) {
                            // no literal match: check whether any analyzed token
                            // of the content matches one of the query terms
                            HashSet terms = new HashSet();
                            LuceneTools.getTerms(query, terms,
                                    false);
                            Token token;
                            TokenStream stream = new SimpleKeepNumbersAnalyzer()
                                    .tokenStream(
                                            ITYPE_CONTENT,
                                            new java.io.StringReader(
                                                    content));
                            while ((token = stream.next()) != null) {
                                // does query contain current token?
                                if (terms
                                        .contains(token.termText())) {
                                    found = true;
                                }
                            }
                        }
                        if (!found) {
                            // we had a keyword hit: build a snippet from the
                            // start of the content (LuceneTools.findAfter
                            // presumably locates word boundaries - see its docs)
                            int firstword = LuceneTools.findAfter(
                                    content, 1, 0);
                            if (firstword == -1) {
                                firstword = 0;
                            }
                            entry.setTextBefore("");
                            entry.setFoundWord(content.substring(0,
                                    firstword));
                            if ((firstword + 1) < content.length()) {
                                firstword++;
                            }
                            int lastword = LuceneTools.findAfter(
                                    content, 1, 19);
                            if (lastword < 0) {
                                lastword = content.length();
                            }
                            if (firstword < 0) {
                                firstword = 0;
                            }
                            // min/max guard against an inverted range
                            entry.setTextAfter(content.substring(
                                    Math.min(firstword, lastword),
                                    Math.max(firstword, lastword))
                                    + " ...");
                        } else {
                            // we had a regular hit: LuceneTools extracts the
                            // text before [0], the matched word [1] and the
                            // text after [2] the hit
                            String[] tempresult = LuceneTools
                                    .outputHits(
                                            hits
                                                    .doc(i)
                                                    .get(
                                                            ITYPE_CONTENT_PLAIN),
                                            query,
                                            new Analyzer[] {
                                                    new SimpleKeepNumbersAnalyzer(),
                                                    new SimpleKeepNumbersAnalyzer() });
                            entry.setTextBefore("... "
                                    + tempresult[0]);
                            entry.setTextAfter(tempresult[2]
                                    + " ...");
                            entry.setFoundWord(tempresult[1]);
                        }
                    }
                }
                // for phrase (case-sensitive) searches drop hits whose raw
                // content does not actually contain the phrase
                if (!caseInsensitiveSearch && !found) {
                    canBeAdded = false;
                }
            } else {
                // no surrounding text requested: return the bare topic
                canBeAdded = true;
                entry.setTextBefore("");
                entry.setTextAfter("");
                entry.setFoundWord(entry.getTopic());
            }
            if (canBeAdded) {
                result.add(entry);
            }
        }
    } catch (IOException e) {
        // index may be missing or broken: trigger a background rebuild
        logger.warn("Error (IOExcpetion) while searching for "
                + text + "; Refreshing search index");
        SearchRefreshThread.refreshNow();
    } catch (Exception e) {
        logger.fatal("Excpetion while searching for " + text, e);
    }
    return result;
}
371:
372: /**
373: * Adds to the in-memory table. Does not remove indexed items that are
374: * no longer valid due to deletions, edits etc.
375: */
376: public synchronized void add(String virtualWiki, String topic,
377: String contents) throws IOException {
378: String indexFilename = getSearchIndexPath(virtualWiki);
379: try {
380: Directory directory = getIndexDirectory(indexFilename,
381: false);
382: if (IndexReader.isLocked(directory)) {
383: // wait up to ten seconds until unlocked
384: int count = 0;
385: while (IndexReader.isLocked(directory) && count < 20) {
386: try {
387: Thread.sleep(500);
388: } catch (InterruptedException ie) {
389: ; // do nothing
390: }
391: count++;
392: }
393: // if still locked, force to unlock it
394: if (IndexReader.isLocked(directory)) {
395: IndexReader.unlock(directory);
396: logger.fatal("Unlocking search index by force");
397: }
398: }
399: // delete the current document
400: IndexReader reader = IndexReader.open(directory);
401: reader.delete(new Term(ITYPE_TOPIC_PLAIN, topic));
402: reader.close();
403: directory.close();
404: // add new document
405: IndexWriter writer = new IndexWriter(directory,
406: new SimpleKeepNumbersAnalyzer(), false);
407: writer.optimize();
408: Document doc = createDocument(virtualWiki, topic);
409: try {
410: writer.addDocument(doc);
411: } catch (IOException ex) {
412: logger.error(ex);
413: } finally {
414: try {
415: if (writer != null) {
416: writer.optimize();
417: }
418: } catch (IOException ioe) {
419: logger.fatal("IOException during optimize", ioe);
420: }
421: try {
422: if (writer != null) {
423: writer.close();
424: }
425: } catch (IOException ioe) {
426: logger.fatal("IOException during closing", ioe);
427: }
428: writer = null;
429: }
430: } catch (IOException e) {
431: logger.fatal("Excpetion while adding topic " + topic
432: + "; Refreshing search index", e);
433: SearchRefreshThread.refreshNow();
434: } catch (Exception e) {
435: logger.error("Excpetion while adding topic " + topic, e);
436: }
437: }
438:
/**
 * Trawls all the files in the wiki directory and indexes them.
 * Each virtual wiki's index is first built in a RAM directory and then
 * copied to the file system as a whole.
 */
public synchronized void rebuild() throws Exception {
    logger.info("Building index");
    Collection allWikis = WikiBase.getInstance()
            .getVirtualWikiList();
    if (!allWikis.contains(WikiBase.DEFAULT_VWIKI)) {
        allWikis.add(WikiBase.DEFAULT_VWIKI);
    }
    try {
        // check if the (optional) HTML parser classes are available
        Class.forName("vqwiki.utils.lucene.HTMLParser");
        canParseHTML = true;
    } catch (ClassNotFoundException e) {
        canParseHTML = false;
    }
    try {
        // check if the (optional) PDFBox classes are available
        Class.forName("org.pdfbox.pdfparser.PDFParser");
        canParsePDF = true;
    } catch (ClassNotFoundException e) {
        canParsePDF = false;
    }
    for (Iterator iterator = allWikis.iterator(); iterator
            .hasNext();) {
        String currentWiki = (String) iterator.next();
        logger.debug("indexing virtual wiki " + currentWiki);
        File indexFile = new File(indexPath, "index" + currentWiki);
        logger.debug("Index file path = " + indexFile);
        // the default wiki is addressed with an empty name internally
        if (currentWiki.equals(WikiBase.DEFAULT_VWIKI)) {
            currentWiki = "";
        }
        // NOTE(review): as written this loop can never execute more than
        // once - retrycounter is >= 1 after the first pass (1000 on
        // success, 1 on failure), so the while condition is always false.
        int retrycounter = 0;
        do {
            // initially create index in ram
            RAMDirectory ram = new RAMDirectory();
            Analyzer analyzer = new SimpleKeepNumbersAnalyzer();
            IndexWriter writer = new IndexWriter(ram, analyzer,
                    true);
            try {
                // index every topic of the current virtual wiki
                Collection topics = getAllTopicNames(currentWiki);
                for (Iterator iter = topics.iterator(); iter
                        .hasNext();) {
                    String topic = (String) iter.next();
                    Document doc = createDocument(currentWiki,
                            topic);
                    writer.addDocument(doc);
                }
            } catch (IOException ex) {
                logger.error(ex);
            } finally {
                try {
                    if (writer != null) {
                        writer.optimize();
                    }
                } catch (IOException ioe) {
                    logger
                            .fatal("IOException during optimize",
                                    ioe);
                }
                try {
                    if (writer != null) {
                        writer.close();
                        // success marker; forces the retry loop to exit
                        retrycounter = 999;
                    }
                } catch (IOException ioe) {
                    logger.fatal("IOException during close", ioe);
                }
                writer = null;
            }
            // write back to disc
            copyRamIndexToFileIndex(ram, indexFile);
            retrycounter++;
        } while (retrycounter < 1);
    }
}
516:
/**
 * Copy an index from RAM to file. The on-disk index is first re-created
 * empty, then every file of the RAM index is copied over byte by byte.
 *
 * @param ram The index in RAM (source)
 * @param indexFile The index on disc (destination directory)
 * @throws IOException if the copy fails
 */
private void copyRamIndexToFileIndex(RAMDirectory ram,
        File indexFile) throws IOException {
    Directory index = getIndexDirectory(indexFile, true);
    try {
        if (IndexReader.isLocked(index)) {
            // wait up to ten seconds (20 x 500ms) until unlocked
            int count = 0;
            while (IndexReader.isLocked(index) && count < 20) {
                try {
                    Thread.sleep(500);
                } catch (InterruptedException ie) {
                    ; // do nothing
                }
                count++;
            }
            // if still locked, force to unlock it
            if (IndexReader.isLocked(index)) {
                IndexReader.unlock(index);
                logger.fatal("Unlocking search index by force");
            }
        }
        // creating a writer with create==true empties the target index
        IndexWriter indexWriter = new IndexWriter(index, null, true);
        indexWriter.close();
    } catch (Exception e) {
        logger.fatal("Cannot create empty directory: ", e);
        // fall back to deleting all files in the target directory by hand
        if (fsType == FS_BASED) {
            File[] files = indexFile.listFiles();
            for (int i = 0; i < files.length; i++) {
                files[i].delete();
            }
        }
    }
    // actually copy the files from the RAM index to the file index
    String[] ar = ram.list();
    for (int i = 0; i < ar.length; i++) {
        // create the destination file in the on-disk index
        OutputStream os = index.createFile(ar[i]);
        // open the source file in the RAM index
        InputStream is = ram.openFile(ar[i]);
        // and copy its full contents in a single read/write
        int len = (int) is.length();
        byte[] buf = new byte[len];
        is.readBytes(buf, 0, len);
        os.writeBytes(buf, len);
        // graceful cleanup
        is.close();
        os.close();
    }
}
573:
574: /**
575: * @param indexFile
576: */
577: protected Directory getIndexDirectory(File indexFile, boolean create)
578: throws IOException {
579: if (fsType == FS_BASED) {
580: return FSDirectory.getDirectory(indexFile, create);
581: } else {
582: return null;
583: }
584: }
585:
586: /**
587: * @param indexFilename
588: */
589: protected Directory getIndexDirectory(String indexFilename,
590: boolean create) throws IOException {
591: if (fsType == FS_BASED) {
592: return FSDirectory.getDirectory(indexFilename, create);
593: } else {
594: return null;
595: }
596: }
597:
/**
 * Create a document to add to the search index. The document contains the
 * topic name and content; the content is enriched with the text of
 * attachments (txt/asc, html, pdf - when the optional parsers are
 * available) and, optionally, with the text of externally linked pages.
 *
 * @param currentWiki Name of this wiki
 * @param topic Name of the topic to add
 * @return The document to add
 * @throws Exception if the topic content cannot be read
 */
protected Document createDocument(String currentWiki, String topic)
        throws Exception {
    // get content
    StringBuffer contents = new StringBuffer(WikiBase.getInstance()
            .getHandler().read(currentWiki, topic));
    // find attachments ("attach:" references, possibly quoted)
    List attachments = extractByKeyword(contents, "attach:", true);
    // find links; extractByKeyword strips the protocol prefix, so it is
    // re-added below
    List links = new ArrayList();
    List linksNonsecure = extractByKeyword(contents, "http://",
            false);
    for (Iterator iter = linksNonsecure.iterator(); iter.hasNext();) {
        links.add("http://" + (String) iter.next());
    }
    List linksSecure = extractByKeyword(contents, "https://", false);
    for (Iterator iter = linksSecure.iterator(); iter.hasNext();) {
        links.add("https://" + (String) iter.next());
    }
    if (Environment.getInstance().isAttachmentIndexingEnabled()) {
        for (Iterator iter = attachments.iterator(); iter.hasNext();) {
            String attachmentFileName = (String) iter.next();
            // determine the lower-cased file extension
            String extension = "";
            if (attachmentFileName.lastIndexOf('.') != -1) {
                extension = attachmentFileName.substring(
                        attachmentFileName.lastIndexOf('.') + 1)
                        .toLowerCase();
            }
            File attachmentFile = Environment.getInstance()
                    .uploadPath(currentWiki, attachmentFileName);
            // plain-text attachments are appended verbatim
            if ("txt".equals(extension) || "asc".equals(extension)) {
                StringBuffer textFileBuffer = Utilities
                        .readFile(attachmentFile);
                contents.append(" ").append(textFileBuffer);
            }
            if (canParseHTML
                    && ("htm".equals(extension) || "html"
                            .equals(extension))) {
                HTMLParser parser = new HTMLParser(attachmentFile);
                // Add the tag-stripped contents as a Reader-valued Text field so it will
                // get tokenized and indexed.
                contents.append(" ");
                Reader inStream = parser.getReader();
                while (true) {
                    int read = inStream.read();
                    if (read == -1) {
                        break;
                    }
                    contents.append((char) read);
                }
                inStream.close();
            }
            if (canParsePDF && ("pdf".equals(extension))) {
                // PDFDocument is looked up reflectively so that the class
                // (and its PDFBox dependency) can be removed for JDK 1.3.x
                // compatibility; see the changelog entry for revision 1.12
                try {
                    Class pdfclass = Class
                            .forName("vqwiki.utils.lucene.PDFDocument");
                    Object pdfdocument = pdfclass.newInstance();
                    Method method = pdfclass.getMethod(
                            "getContentOfPDFFile", new Class[] {
                                    String.class, File.class });
                    Object result = method.invoke(pdfdocument,
                            new Object[] { attachmentFileName,
                                    attachmentFile });
                    if (result instanceof StringBuffer) {
                        contents.append((StringBuffer) result);
                    }
                } catch (SecurityException e) {
                    // Actually do nothing
                } catch (IllegalArgumentException e) {
                    // Actually do nothing
                } catch (ClassNotFoundException e) {
                    // Actually do nothing
                } catch (InstantiationException e) {
                    // Actually do nothing
                } catch (IllegalAccessException e) {
                    // Actually do nothing
                } catch (NoSuchMethodException e) {
                    // Actually do nothing
                } catch (InvocationTargetException e) {
                    // Actually do nothing
                }
            }
            // otherwise we cannot index it -> ignore it!
        }
        if (canParseHTML
                && Environment.getInstance()
                        .isExtLinksIndexingEnabled()) {
            // also fetch and index the contents of externally linked pages
            for (Iterator iter = links.iterator(); iter.hasNext();) {
                try {
                    String link = (String) iter.next();
                    // get page
                    HttpClient client = new HttpClient();
                    // establish a connection within 15 seconds
                    client.setConnectionTimeout(15000);
                    client.setTimeout(15000);
                    HttpMethod method = new GetMethod(link);
                    method.setFollowRedirects(true);
                    client.executeMethod(method);
                    HTMLParser parser = new HTMLParser(method
                            .getResponseBodyAsStream());
                    // Add the tag-stripped contents as a Reader-valued Text field so it will
                    // get tokenized and indexed.
                    contents.append(" ");
                    Reader inStream = parser.getReader();
                    while (true) {
                        int read = inStream.read();
                        if (read == -1) {
                            break;
                        }
                        contents.append((char) read);
                    }
                    inStream.close();
                } catch (HttpException e) {
                    // Actually do nothing
                } catch (IOException e) {
                    // Actually do nothing
                } catch (IllegalArgumentException e) {
                    // Actually do nothing
                }
            }
        }
    }
    // add remaining information
    String fileName = getFilename(currentWiki, topic);
    if (fileName != null) {
        logger.debug("Indexing topic " + topic + " in file "
                + fileName);
    } else {
        logger.debug("Indexing topic " + topic);
    }
    Document doc = new Document();
    // tokenized topic name for searching, untokenized copy as document key
    doc.add(Field.Text(ITYPE_TOPIC, new StringReader(topic)));
    doc.add(Field.Keyword(ITYPE_TOPIC_PLAIN, topic));
    if (fileName != null) {
        doc.add(Field.UnIndexed(ITYPE_FILE, fileName));
    }
    // tokenized content for searching, stored plain copy for snippets
    doc.add(Field.Text(ITYPE_CONTENT, new StringReader(contents
            .toString())));
    doc.add(Field.UnIndexed(ITYPE_CONTENT_PLAIN, contents
            .toString()));
    return doc;
}
745:
746: /**
747: * Get a list of all keywords in a given text. The list returned contains all words
748: * following the keyword. For example if the keyword is "attach:" all attachments
749: * are returned.
750: * @param contents The content to search
751: * @param keyword The keyword to search
752: * @return A list of all words
753: */
754: private ArrayList extractByKeyword(StringBuffer contents,
755: String keyword, boolean possibleQuoted) {
756: ArrayList returnList = new ArrayList();
757: int attPos = contents.toString().indexOf(keyword);
758: while (attPos != -1) {
759: int endPos = attPos + keyword.length() + 1;
760: boolean beginQuote = contents.charAt(attPos
761: + keyword.length()) == '\"';
762: while (endPos < contents.length()) {
763: // attach: can have quotes, so we need a special handling if there are
764: // begin and end quotes.
765: if (possibleQuoted && beginQuote) {
766: if (contents.charAt(endPos) == '\"'
767: || contents.charAt(endPos) == '\n'
768: || contents.charAt(endPos) == '\r') {
769: attPos++;
770: break;
771: }
772: } else if (contents.charAt(endPos) == ' '
773: || contents.charAt(endPos) == ')'
774: || contents.charAt(endPos) == '|'
775: || contents.charAt(endPos) == '\"'
776: || contents.charAt(endPos) == '\n'
777: || contents.charAt(endPos) == '\r'
778: || contents.charAt(endPos) == '\t') {
779: break;
780: }
781: endPos++;
782: }
783: returnList.add(contents.substring(
784: attPos + keyword.length(), endPos));
785: attPos = contents.toString().indexOf(keyword, endPos);
786: }
787: return returnList;
788: }
789:
/**
 * Returns the name of the file the given topic is stored in.
 * May return null (callers such as createDocument check for it),
 * presumably when the storage backend is not file based - implementations
 * define the exact contract.
 *
 * @param currentWiki the virtual wiki the topic belongs to
 * @param topic the topic name
 * @return the file name of the topic, or null if none is available
 */
protected abstract String getFilename(String currentWiki,
        String topic);
797:
798: /**
799: * Get the path, which holds all index files
800: */
801: public String getSearchIndexPath(String virtualWiki) {
802: return indexPath + sep + "index" + virtualWiki;
803: }
804: }
805:
806: /*
807: * Log:
808: *
809: * $Log$
810: * Revision 1.23 2006/04/23 07:52:28 wrh2
811: * Coding style updates (VQW-73).
812: *
813: * Revision 1.22 2006/04/20 01:32:17 wrh2
814: * Use standard variable name for logger (VQW-73).
815: *
816: * Revision 1.21 2006/03/15 00:23:21 studer
817: * Fixing bug http://www.vqwiki.org/jira/browse/VQW-26
818: * Adding new parameter on the admin console to switch off indexing of http:-resources
819: *
820: * Revision 1.20 2006/03/01 17:34:57 studer
821: * fixing http://www.vqwiki.org/jira/browse/VQW-51
822: *
823: * Revision 1.19 2006/02/28 21:23:40 studer
824: * Humm.. a guess for the VQW-26 bug. Maybe it is something with the jdk and maybe it helps Pete...
825: *
826: * Revision 1.18 2006/01/31 21:57:13 studer
827: * http://www.vqwiki.org/jira/browse/VQW-26
828: * Fixed definitively a bug. Quoted attach:-files wheren't indexed at all. But I guess it's not the one which got us some problems?!
829: *
830: * Revision 1.17 2004/07/14 04:58:51 garethc
831: * fix
832: *
833: * Revision 1.16 2004/06/28 09:42:06 mrgadget4711
834: * Fix with searching referred HTML pages
835: *
836: * Revision 1.15 2004/06/24 18:55:22 mrgadget4711
837: * ADD: The search engine now also searches external links
838: *
839: * Revision 1.14 2004/04/02 12:51:54 mrgadget4711
840: * ADD: Ignore numbers, when doing a search
841: *
842: * Revision 1.13 2004/02/28 04:05:42 garethc
843: * General bug fixes, panic on admin console
844: *
845: * Revision 1.12 2003/11/29 23:40:12 mrgadget4711
846: * MOD: Using the PDFDocument by dynamic lookup. By this, you can
847: * delete it if you need JDK 1.3.x compatibility.
848: *
849: * Revision 1.11 2003/11/29 21:53:08 mrgadget4711
850: * MOD: In case, search throws an IOException, only give a warning
851: * and do not dump the whole stacktrace.
852: *
853: * Revision 1.10 2003/11/29 21:24:26 mrgadget4711
854: * MOD: catching a null pointer exception in PDFparser.close();
855: *
856: * Revision 1.9 2003/11/27 01:57:11 garethc
857: * fixes
858: *
859: * Revision 1.8 2003/10/05 05:07:30 garethc
860: * fixes and admin file encoding option + merge with contributions
861: *
862: * Revision 1.7 2003/09/12 14:06:43 makub
863: * Made code JDK1.3 compatible by removing calls to StringBuffer.indexOf() and STring.split()
864: *
865: * Revision 1.6 2003/08/22 12:34:34 mrgadget4711
866: * Search extendes, so that text attachments,
867: * html attachments and pdf attachments are searched as well
868: *
869: * Revision 1.5 2003/08/20 20:45:45 mrgadget4711
870: * Avoid scanning of result, if not needed
871: *
872: * Revision 1.3 2003/08/05 05:41:45 mrgadget4711
873: * MOD: Wait up to 10 seconds until a lock is released
874: * ADD: More specific log information
875: *
876: * Revision 1.2 2003/08/04 17:23:58 mrgadget4711
877: * MOD: Use RAM to build up index, then copy it by brute force into the file system
878: *
879: * Revision 1.1 2003/08/04 09:06:47 mrgadget4711
880: * MOD: Extracted all core search engine functionality into an AbstractSearchEngine
881: * MOD: Try really hard to delete a lock
882: *
883: * ------------END------------
884: */
|