001: package org.tigris.scarab.util.word;
002:
003: /* ================================================================
004: * Copyright (c) 2001 Collab.Net. All rights reserved.
005: *
006: * Redistribution and use in source and binary forms, with or without
007: * modification, are permitted provided that the following conditions are
008: * met:
009: *
010: * 1. Redistributions of source code must retain the above copyright
011: * notice, this list of conditions and the following disclaimer.
012: *
013: * 2. Redistributions in binary form must reproduce the above copyright
014: * notice, this list of conditions and the following disclaimer in the
015: * documentation and/or other materials provided with the distribution.
016: *
017: * 3. The end-user documentation included with the redistribution, if
018: * any, must include the following acknowlegement: "This product includes
019: * software developed by Collab.Net <http://www.Collab.Net/>."
020: * Alternately, this acknowlegement may appear in the software itself, if
021: * and wherever such third-party acknowlegements normally appear.
022: *
023: * 4. The hosted project names must not be used to endorse or promote
024: * products derived from this software without prior written
025: * permission. For written permission, please contact info@collab.net.
026: *
027: * 5. Products derived from this software may not use the "Tigris" or
028: * "Scarab" names nor may "Tigris" or "Scarab" appear in their names without
029: * prior written permission of Collab.Net.
030: *
031: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
032: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
033: * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
034: * IN NO EVENT SHALL COLLAB.NET OR ITS CONTRIBUTORS BE LIABLE FOR ANY
035: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
036: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
037: * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
038: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
039: * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
040: * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
041: * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
042: *
043: * ====================================================================
044: *
045: * This software consists of voluntary contributions made by many
046: * individuals on behalf of Collab.Net.
047: */
048:
049: // JDK classes
050: import java.io.File;
051: import java.io.IOException;
052: import java.util.ArrayList;
053: import java.util.HashMap;
054: import java.util.Iterator;
055: import java.util.List;
056: import java.util.Map;
057:
058: import org.apache.avalon.framework.activity.Initializable;
059: import org.apache.avalon.framework.configuration.Configurable;
060: import org.apache.avalon.framework.configuration.Configuration;
061: import org.apache.avalon.framework.context.Context;
062: import org.apache.avalon.framework.context.ContextException;
063: import org.apache.avalon.framework.context.Contextualizable;
064: import org.apache.lucene.document.Document;
065: import org.apache.lucene.document.Field;
066: import org.apache.lucene.index.IndexReader;
067: import org.apache.lucene.index.IndexWriter;
068: import org.apache.lucene.index.Term;
069: import org.apache.lucene.queryParser.QueryParser;
070: import org.apache.lucene.search.Hits;
071: import org.apache.lucene.search.IndexSearcher;
072: import org.apache.lucene.search.Query;
073: import org.apache.torque.util.Criteria;
074: import org.apache.turbine.RunData;
075: import org.apache.turbine.TemplateContext;
076: import org.tigris.scarab.actions.admin.UpdateSearchIndex.UpdateThread;
077: import org.tigris.scarab.om.Attachment;
078: import org.tigris.scarab.om.AttachmentPeer;
079: import org.tigris.scarab.om.AttributeValue;
080: import org.tigris.scarab.om.AttributeValuePeer;
081: import org.tigris.scarab.om.IssuePeer;
082: import org.tigris.scarab.tools.ScarabLocalizationTool;
083: import org.tigris.scarab.tools.ScarabRequestTool;
084: import org.tigris.scarab.tools.localization.L10NKeySet;
085: import org.tigris.scarab.tools.localization.L10NMessage;
086: import org.tigris.scarab.tools.localization.Localizable;
087: import org.tigris.scarab.util.Log;
088: import org.tigris.scarab.util.ScarabException;
089:
090: import com.workingdogs.village.Record;
091:
092: /**
093: * Support for searching/indexing text
094: *
095: * @author <a href="mailto:jmcnally@collab.net">John McNally</a>
096: * @version $Id: LuceneSearchIndex.java 10193 2006-06-30 12:49:42Z dabbous $
097: */
098: public class LuceneSearchIndex implements SearchIndex, Configurable,
099: Contextualizable, Initializable {
100: private String applicationRoot;
101: // used to occasionally optimize the index
102: private static int counter = 0;
103:
104: /** the location of the index */
105: private String path;
106:
107: /** the attributes that will be searched */
108: private List attributeIds;
109:
110: /** the words and boolean operators */
111: private List queryText;
112:
113: /** the attachments that will be searched */
114: private List attachmentIds;
115:
116: /** the words and boolean operators */
117: private List attachmentQueryText;
118:
119: static private ThreadGroup tg = null;
120:
/**
 * Creates an unconfigured search index.
 * <p>
 * The constructor does not touch the file system: the index location is
 * supplied through {@link #configure(Configuration)} and the index
 * directory is created, if necessary, by {@link #initialize()}.  Only
 * the (empty) query lists are set up here, so that queries can be added
 * to a freshly constructed instance without a NullPointerException.
 *
 * @throws IOException not thrown by this implementation; kept in the
 *         signature so existing callers continue to compile
 */
public LuceneSearchIndex() throws IOException {
    // Same initial capacities as clear().  clear() itself is not called
    // because it is overridable and must not run during construction.
    attributeIds = new ArrayList(5);
    queryText = new ArrayList(5);
    attachmentIds = new ArrayList(2);
    attachmentQueryText = new ArrayList(2);
}
128:
129: public void addQuery(Integer[] ids, String text) {
130: attributeIds.add(ids);
131: queryText.add(text);
132: }
133:
134: public void addAttachmentQuery(Integer[] ids, String text) {
135: attachmentIds.add(ids);
136: attachmentQueryText.add(text);
137: }
138:
139: public Long[] getRelatedIssues() throws Exception {
140: return getRelatedIssues(false); // perform AND operation
141: }
142:
143: /**
144: * returns a list of related issue IDs sorted by relevance descending.
145: * Should return an empty/length=0 array if search returns no results.
146: * If mergeResults==true, internally merges results of partial queries,
147: * otherwise performs an implicit AND operation on partial queries.
148: */
149: public Long[] getRelatedIssues(boolean mergeResults)
150: throws Exception {
151: Long[] result;
152: List issueIds = null;
153: // if there are no words to search for return no results
154: if (queryText.size() != 0 || attachmentQueryText.size() != 0) {
155: // attributes
156: for (int j = attributeIds.size() - 1; j >= 0; j--) {
157: Integer[] ids = (Integer[]) attributeIds.get(j);
158: String query = (String) queryText.get(j);
159: issueIds = performPartialQuery(ATTRIBUTE_ID, ids,
160: query, issueIds, mergeResults);
161: }
162:
163: // attachments
164: for (int j = attachmentIds.size() - 1; j >= 0; j--) {
165: Integer[] ids = (Integer[]) attachmentIds.get(j);
166: String query = (String) attachmentQueryText.get(j);
167: issueIds = performPartialQuery(ATTACHMENT_TYPE_ID, ids,
168: query, issueIds, mergeResults);
169: }
170:
171: // put results into final form
172: result = new Long[issueIds.size()];
173: for (int i = 0; i < issueIds.size(); i++) {
174: result[i] = (Long) issueIds.get(i);
175: }
176: } else {
177: result = EMPTY_LIST;
178: }
179:
180: return result;
181: }
182:
183: private List performPartialQuery(String key, Integer[] ids,
184: String query, List issueIds, boolean mergeResults)
185: throws ScarabException, IOException {
186: StringBuffer fullQuery = new StringBuffer(query.length() + 100);
187:
188: if (query.length() > 0) {
189: query.trim();
190: }
191:
192: if (ids != null && ids.length != 0) {
193: fullQuery.append("+((");
194: for (int i = ids.length - 1; i >= 0; i--) {
195: fullQuery.append(key).append(':').append(
196: ids[i].toString());
197: if (i != 0) {
198: fullQuery.append(" OR ");
199: }
200: }
201: fullQuery.append(") AND (").append(query).append("))");
202: } else {
203: fullQuery.append("+(").append(query).append(')');
204: }
205:
206: Query q = null;
207: try {
208: Log.get().debug("Querybefore=" + fullQuery);
209: q = QueryParser.parse(fullQuery.toString(), TEXT,
210: new PorterStemAnalyzer());
211: Log.get().debug("Queryafter=" + q.toString("text"));
212: } catch (Throwable t) {
213: throw new ScarabException(L10NKeySet.ExceptionParseError,
214: fullQuery, t);
215: }
216:
217: IndexSearcher is = new IndexSearcher(path);
218: Hits hits = is.search(q);
219: // remove duplicates
220: Map deduper = new HashMap((int) (1.25 * hits.length() + 1));
221: for (int i = 0; i < hits.length(); i++) {
222: deduper.put(hits.doc(i).get(ISSUE_ID), null);
223: Log.get().debug(
224: "Possible issueId from search: "
225: + hits.doc(i).get(ISSUE_ID));
226: }
227: is.close();
228:
229: if (issueIds == null) {
230: issueIds = new ArrayList(deduper.size());
231: Iterator iter = deduper.keySet().iterator();
232: while (iter.hasNext()) {
233: issueIds.add(new Long((String) iter.next()));
234: Log.get().debug(
235: "Adding issueId from search: "
236: + issueIds.get(issueIds.size() - 1));
237: }
238: } else {
239: if (mergeResults) {
240: // perform OR operation
241: mergeResults(issueIds, deduper);
242: } else {
243: // perform an AND operation
244: removeUniqueElements(issueIds, deduper);
245: }
246: }
247: return issueIds;
248: }
249:
250: /**
251: * Elements from the list that are not in map are removed from the list
252: */
253: private void removeUniqueElements(List list, Map map) {
254: for (int i = list.size() - 1; i >= 0; i--) {
255: Object obj = list.get(i);
256: if (!map.containsKey(obj.toString())) {
257: Log.get().debug("removing issueId from search: " + obj);
258: list.remove(i);
259: }
260: }
261: }
262:
263: /**
264: * Elements from the map, which are not in list are added to the list
265: */
266: private void mergeResults(List list, Map map) {
267: for (int i = list.size() - 1; i >= 0; i--) {
268: Long issueId = (Long) list.get(i);
269: String id = issueId.toString();
270: if (map.containsKey(id)) {
271: map.remove(id);
272: Log.get().debug(
273: "removed duplicate issueId from map: " + id);
274: }
275: }
276: Iterator iter = map.keySet().iterator();
277: while (iter.hasNext()) {
278: String id = (String) iter.next();
279: list.add(new Long(Long.parseLong(id)));
280: Log.get().debug("Add issueId from map to List: " + id);
281: }
282: }
283:
284: /**
285: * Store index information for an AttributeValue
286: */
287: public void index(AttributeValue attributeValue) throws Exception {
288: String valId = attributeValue.getValueId().toString();
289:
290: // make sure any old data stored for this attribute value is deleted.
291: Term term = new Term(VALUE_ID, valId);
292: int deletedDocs = 0;
293: try {
294: synchronized (getClass()) {
295: IndexReader reader = null;
296: try {
297: reader = IndexReader.open(path);
298: deletedDocs = reader.delete(term);
299: } finally {
300: if (reader != null) {
301: reader.close();
302: }
303: }
304: }
305: } catch (NullPointerException npe) {
306: /* Lucene is throwing npe in reader.delete, so have to explicitely
307: search. Not sure if the npe will be thrown in the
308: case where the attribute has previously been indexed, so
309: test whether the npe is harmful.
310: */
311: IndexSearcher is = new IndexSearcher(path);
312: Query q = QueryParser.parse("+" + VALUE_ID + ":" + valId,
313: TEXT, new PorterStemAnalyzer());
314: Hits hits = is.search(q);
315: if (hits.length() > 0) {
316: Localizable l10nInstance = new L10NMessage(
317: L10NKeySet.ExceptionLucene, valId, npe);
318: Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
319: throw new ScarabException(l10nInstance);
320: }
321: }
322: if (deletedDocs > 1) {
323: throw new ScarabException(
324: L10NKeySet.ExceptionMultipleAttValues, valId);
325: }
326: /*
327: System.out.println("deleting valId: " + valId);
328: IndexSearcher is = new IndexSearcher(path);
329: Hits hits = is.search("+" + VALUE_ID + ":" + valId);
330: System.out.println("deleting previous: " + hits.length());
331: if (hits.length() > 1)
332: {
333: throw new ScarabException("Multiple AttributeValues in Lucene" +
334: "index with same ValueId: " + valId);
335: }
336: Document doc = hits.doc(0);
337: */
338:
339: if (attributeValue.getValue() == null) {
340: Log.get().warn(
341: "Attribute value pk=" + valId
342: + " has a null value.");
343: } else {
344: Document doc = new Document();
345: Field valueId = Field.Keyword(VALUE_ID, valId);
346: Field issueId = Field.UnIndexed(ISSUE_ID, attributeValue
347: .getIssueId().toString());
348: Field attributeId = Field.Keyword(ATTRIBUTE_ID,
349: attributeValue.getAttributeId().toString());
350: Field text = Field
351: .UnStored(TEXT, attributeValue.getValue());
352: doc.add(valueId);
353: doc.add(issueId);
354: doc.add(attributeId);
355: doc.add(text);
356: addDoc(doc);
357: }
358: }
359:
360: private void addDoc(Document doc) throws IOException {
361: synchronized (getClass()) {
362: IndexWriter indexer = null;
363: try {
364: indexer = new IndexWriter(path,
365: new PorterStemAnalyzer(), false);
366: indexer.addDocument(doc);
367:
368: if (++counter % 100 == 0) {
369: indexer.optimize();
370: }
371: } finally {
372: if (indexer != null) {
373: indexer.close();
374: }
375: }
376: }
377: }
378:
379: /**
380: * Store index information for an Attachment
381: */
382: public void index(Attachment attachment) throws Exception {
383: String attId = attachment.getAttachmentId().toString();
384:
385: // make sure any old data stored for this attribute value is deleted.
386: Term term = new Term(ATTACHMENT_ID, attId);
387: int deletedDocs = 0;
388: try {
389: synchronized (getClass()) {
390: IndexReader reader = null;
391: try {
392: reader = IndexReader.open(path);
393: deletedDocs = reader.delete(term);
394: } finally {
395: if (reader != null) {
396: reader.close();
397: }
398: }
399: }
400: } catch (NullPointerException npe) {
401: /* Lucene is throwing npe in reader.delete, so have to explicitely
402: search. Not sure if the npe will be thrown in the
403: case where the attribute has previously been indexed, so
404: test whether the npe is harmful.
405: */
406: IndexSearcher is = new IndexSearcher(path);
407: Query q = QueryParser.parse("+" + ATTACHMENT_ID + ":"
408: + attId, TEXT, new PorterStemAnalyzer());
409: Hits hits = is.search(q);
410: if (hits.length() > 0) {
411: Localizable l10nInstance = new L10NMessage(
412: L10NKeySet.ExceptionLucene, attId, npe);
413: Log.get().debug(l10nInstance.getMessage());//[HD: create english message for logging!
414: throw new ScarabException(l10nInstance);
415: }
416: }
417: if (deletedDocs > 1) {
418: throw new ScarabException(
419: L10NKeySet.ExceptionMultipleAttachements, attId);
420: }
421:
422: if (attachment.getData() == null) {
423: Log.get().warn(
424: "Attachment pk=" + attId + " has a null data.");
425: } else {
426: Document doc = new Document();
427: Field attachmentId = Field.Keyword(ATTACHMENT_ID, attId);
428: Field issueId = Field.UnIndexed(ISSUE_ID, attachment
429: .getIssueId().toString());
430: Field typeId = Field.Keyword(ATTACHMENT_TYPE_ID, attachment
431: .getTypeId().toString());
432: Field text = Field.UnStored(TEXT, attachment.getData());
433: doc.add(attachmentId);
434: doc.add(issueId);
435: doc.add(typeId);
436: doc.add(text);
437: addDoc(doc);
438: }
439: }
440:
441: /**
442: * update the index for all entities that currently exist
443: */
444: public void updateIndex() throws Exception {
445: Log.get().info("find estimate of max id...");
446: Criteria crit = new Criteria();
447: crit
448: .addSelectColumn("max(" + AttributeValuePeer.VALUE_ID
449: + ")");
450: List records = AttributeValuePeer.doSelectVillageRecords(crit);
451: long max = ((Record) records.get(0)).getValue(1).asLong();
452:
453: long i = 0L;
454: List avs = null;
455:
456: Log.get().info("index attribute values in database ...");
457: do {
458: crit = new Criteria();
459: Criteria.Criterion low = crit.getNewCriterion(
460: AttributeValuePeer.VALUE_ID, new Long(i),
461: Criteria.GREATER_THAN);
462: i += 100L;
463: Criteria.Criterion high = crit.getNewCriterion(
464: AttributeValuePeer.VALUE_ID, new Long(i),
465: Criteria.LESS_EQUAL);
466: crit.add(low.and(high));
467: crit.add(AttributeValuePeer.DELETED, false);
468: // don't index issues that have been deleted
469: crit.addJoin(AttributeValuePeer.ISSUE_ID,
470: IssuePeer.ISSUE_ID);
471: crit.add(IssuePeer.DELETED, false);
472: avs = AttributeValuePeer.doSelect(crit);
473: if (!avs.isEmpty()) {
474: Iterator avi = avs.iterator();
475: while (avi.hasNext()) {
476: AttributeValue av = (AttributeValue) avi.next();
477: index(av);
478: }
479: if (Log.get().isDebugEnabled()) {
480: Log.get().debug(
481: "Updated index for attribute values ("
482: + (i - 100L) + "-" + i + "]");
483: Log.debugMemory();
484: }
485: }
486: } while (i < max || !avs.isEmpty());
487:
488: Log.get().info("Index attachments ...");// Attachments
489:
490: crit = new Criteria();
491: crit.addSelectColumn("max(" + AttachmentPeer.ATTACHMENT_ID
492: + ")");
493: records = AttachmentPeer.doSelectVillageRecords(crit);
494: max = ((Record) records.get(0)).getValue(1).asLong();
495: i = 0L;
496: List atts = null;
497: do {
498: crit = new Criteria();
499: Criteria.Criterion low = crit.getNewCriterion(
500: AttachmentPeer.ATTACHMENT_ID, new Long(i),
501: Criteria.GREATER_THAN);
502: i += 100L;
503: Criteria.Criterion high = crit.getNewCriterion(
504: AttachmentPeer.ATTACHMENT_ID, new Long(i),
505: Criteria.LESS_EQUAL);
506: crit.add(low.and(high));
507: crit.add(AttachmentPeer.DELETED, false);
508: // don't index issues that have been deleted
509: crit.addJoin(AttachmentPeer.ISSUE_ID, IssuePeer.ISSUE_ID);
510: crit.add(IssuePeer.DELETED, false);
511: atts = AttachmentPeer.doSelect(crit);
512: if (!atts.isEmpty()) {
513: Iterator atti = atts.iterator();
514: while (atti.hasNext()) {
515: Attachment att = (Attachment) atti.next();
516: if (att.getData() != null
517: && att.getData().length() > 0
518: && att.getIssueId() != null
519: && att.getTypeId() != null) {
520: index(att);
521: }
522: }
523:
524: if (Log.get().isDebugEnabled()) {
525: Log.get().debug(
526: "Updated index for attachments ("
527: + (i - 100L) + "-" + i + "]");
528: Log.debugMemory();
529: }
530: }
531: } while (i < max || !atts.isEmpty());
532:
533: Log.get().info("Finish off with an optimized index...");
534: synchronized (getClass()) {
535: IndexWriter indexer = null;
536: try {
537: indexer = new IndexWriter(path,
538: new PorterStemAnalyzer(), false);
539: indexer.optimize();
540: } finally {
541: if (indexer != null) {
542: indexer.close();
543: }
544: }
545: }
546: Log.get().info("Indexing terminated.");
547: }
548:
549: // ---------------- Avalon Lifecycle Methods ---------------------
550: /**
551: * Avalon component lifecycle method
552: */
553: public void configure(Configuration conf) {
554: path = conf.getAttribute(INDEX_PATH, null);
555:
556: }
557:
558: /**
559: * @see org.apache.avalon.framework.context.Contextualizable
560: * @avalon.entry key="urn:avalon:home" type="java.io.File"
561: */
562: public void contextualize(Context context) throws ContextException {
563: this .applicationRoot = context.get("urn:avalon:home")
564: .toString();
565: }
566:
567: /**
568: * Avalon component lifecycle method
569: * Initializes the service by loading default class loaders
570: * and customized object factories.
571: *
572: * @throws InitializationException if initialization fails.
573: */
574: public void initialize() throws Exception {
575:
576: File indexDir = new File(path);
577: if (!indexDir.isAbsolute()) {
578: path = getRealPath(path);
579: indexDir = new File(path);
580: }
581:
582: boolean createIndex = false;
583: if (indexDir.exists()) {
584: int length = indexDir.listFiles().length;
585: if (length < 3) {
586: createIndex = true;
587: }
588: } else {
589: indexDir.mkdirs();
590: createIndex = true;
591: }
592:
593: if (createIndex) {
594: Log.get().info("Creating index at '" + path + '\'');
595: synchronized (getClass()) {
596: doCreateIndex();
597: }
598: }
599:
600: clear();
601: }
602:
603: private String getRealPath(String path) {
604: String absolutePath = null;
605: if (applicationRoot == null) {
606: absolutePath = new File(path).getAbsolutePath();
607: } else {
608: absolutePath = new File(applicationRoot, path)
609: .getAbsolutePath();
610: }
611: return absolutePath;
612: }
613:
614: /* (non-Javadoc)
615: * @see org.tigris.scarab.util.word.SearchIndex#clear()
616: */
617: public void clear() {
618: attributeIds = new ArrayList(5);
619: queryText = new ArrayList(5);
620: attachmentIds = new ArrayList(2);
621: attachmentQueryText = new ArrayList(2);
622: }
623:
624: public void doCreateIndex() throws Exception {
625: synchronized (this ) {
626:
627: try {
628: if (tg == null) {
629: tg = new ThreadGroup("UpdateIndex");
630: }
631: Thread updateThread = new Thread(tg, new UpdateThread());
632: updateThread.start();
633: } catch (Exception e) {
634: Log.get().warn(
635: "Could not start SearchIndex initialization:",
636: e);
637: }
638:
639: }
640:
641: }
642:
643: public class UpdateThread implements Runnable {
644: public void run() {
645: try {
646: IndexWriter indexWriter = null;
647: try {
648: indexWriter = new IndexWriter(path,
649: new PorterStemAnalyzer(), true);
650: } finally {
651: if (indexWriter != null) {
652: indexWriter.close();
653: }
654: }
655:
656: Log.get().info("Update index started!");
657: SearchIndex indexer = SearchFactory.getInstance();
658: indexer.updateIndex();
659: Log.get().info("Update index completed!");
660:
661: } catch (Exception e) {
662: Log.get().error("Update index failed:", e);
663: }
664: }
665: }
666: }
|