001: /*
002: * The contents of this file are subject to the
003: * Mozilla Public License Version 1.1 (the "License");
004: * you may not use this file except in compliance with the License.
005: * You may obtain a copy of the License at http://www.mozilla.org/MPL/
006: *
007: * Software distributed under the License is distributed on an "AS IS"
008: * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
009: * See the License for the specific language governing rights and
010: * limitations under the License.
011: *
012: * The Initial Developer of the Original Code is Simulacra Media Ltd.
013: * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
014: *
015: * All Rights Reserved.
016: *
017: * Contributor(s):
018: */
019:
020: package org.openharmonise.rm.search;
021:
022: import java.io.*;
023: import java.util.*;
024: import java.util.logging.*;
025:
026: import javax.xml.transform.*;
027: import javax.xml.transform.dom.*;
028: import javax.xml.transform.stream.*;
029:
030: import org.apache.lucene.analysis.*;
031: import org.apache.lucene.analysis.standard.*;
032: import org.apache.lucene.document.*;
033: import org.apache.lucene.document.Document;
034: import org.apache.lucene.index.*;
035: import org.apache.lucene.queryParser.*;
036: import org.apache.lucene.search.*;
037: import org.apache.lucene.store.*;
038: import org.openharmonise.commons.xml.*;
039: import org.openharmonise.rm.*;
040: import org.openharmonise.rm.config.*;
041: import org.openharmonise.rm.resources.*;
042: import org.openharmonise.rm.resources.content.*;
043: import org.pdfbox.pdfparser.*;
044: import org.pdfbox.pdmodel.*;
045: import org.pdfbox.util.*;
046: import EDU.oswego.cs.dl.util.concurrent.*;
047:
048: /**
049: * Interface to the Lucene text searching and indexing API for use with objects within Harmonise.
050: *
051: * @author Michael Bell
052: * @author jejking
053: * @version $Revision: 1.4 $
054: *
055: */
056: public class HarmoniseIndexer {
057:
058: private static HarmoniseIndexer m_instance = null;
059: private static String m_indexHome = "";
060: private static Templates m_striptags_xsl = null;
061: private static ArrayList keywordFieldList = new ArrayList();
062:
063: private static final String FIELD_UNIQUEID = "uniqueid";
064: private static final String FIELD_ID = "id";
065: private static final String FIELD_NAME = "name";
066: private static final String FIELD_DISPLAY_NAME = "display_name";
067: private static final String FIELD_SUMMARY = "summary";
068: private static final String FIELD_GROUP = "group";
069: private static final String FIELD_CONTENTS = "contents";
070: private static final String FIELD_CLASS = "class";
071: private static final String INDEX_LOC_PROP = "INDEX_LOCATION";
072: public static final String TAG_INDEXER = "indexer";
073: public static final String TAG_INDEXABLE = "indexable";
074: public static final String TAG_TEMPLATE = "template";
075: public static final String TAG_COMPARISON = "comparison";
076: public static final String TAG_INDEX = "index";
077: public static final String ATTRIB_CLASSNAME = "classname";
078: private static final String PNAME_STRIPTAGS_XSL = "STRIPTAGS";
079:
080: private Executor executor;
081:
082: /**
083: * Logger.
084: */
085: private static Logger m_logger = Logger
086: .getLogger(HarmoniseIndexer.class.getName());
087:
088: static {
089: //initialise array list of key word fields
090: keywordFieldList.add(FIELD_UNIQUEID);
091: keywordFieldList.add(FIELD_ID);
092: keywordFieldList.add(FIELD_GROUP);
093: }
094:
/**
 * Creates the indexer, reading the index location from the
 * <code>INDEX_LOCATION</code> configuration property and setting up the
 * executor used for index updates.
 *
 * @throws HarmoniseIndexerException if the index location is not configured
 *         or the configuration cannot be read
 */
private HarmoniseIndexer() throws HarmoniseIndexerException {
    try {
        m_indexHome = ConfigSettings.getProperty(INDEX_LOC_PROP);
        if ((m_indexHome == null) || (m_indexHome.length() == 0)) {
            throw new HarmoniseIndexerException(
                    "Index location is not defined!!");
        }
        executor = new QueuedExecutor();
    } catch (HarmoniseIndexerException e) {
        // already the right type: log and rethrow as-is instead of
        // wrapping it in a second HarmoniseIndexerException
        m_logger.log(Level.SEVERE,
                "Could not instantiate HarmoniseIndexer", e);
        throw e;
    } catch (Exception e) {
        m_logger.log(Level.SEVERE,
                "Could not instantiate HarmoniseIndexer", e);
        throw new HarmoniseIndexerException(e.getMessage(), e);
    }
}
114:
115: /**
116: * Returns singleton instance of <code>HarmoniseIndexer</code>.
117: *
118: * @return instance of <code>HarmoniseIndexer</code>.
119: * @throws HarmoniseIndexerException
120: */
121: public static HarmoniseIndexer getInstance()
122: throws HarmoniseIndexerException {
123: if (m_instance == null) {
124: m_instance = new HarmoniseIndexer();
125: }
126: return m_instance;
127: }
128:
129: public static HarmoniseIndexer getIndexer(String indexHome)
130: throws HarmoniseIndexerException {
131: if (m_instance == null) {
132: m_instance = new HarmoniseIndexer();
133: }
134: HarmoniseIndexer.m_indexHome = indexHome;
135: return m_instance;
136: }
137:
138: /**
139: * Returns <code>true</code> if the given object is indexed.
140: *
141: * @param xobj
142: * @return
143: * @throws HarmoniseIndexerException
144: */
145: public static boolean isIndexed(AbstractObject xobj)
146: throws HarmoniseIndexerException {
147: boolean bExists = false;
148:
149: try {
150: Directory directory = FSDirectory.getDirectory(
151: HarmoniseIndexer.m_indexHome, false);
152: IndexReader reader = IndexReader.open(directory);
153: Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, xobj
154: .getClass().getName()
155: + String.valueOf(xobj.getId()));
156:
157: if (reader.docFreq(term) > 0) {
158: bExists = true;
159: }
160: reader.close();
161: } catch (FileNotFoundException e) {
162: bExists = false;
163: } catch (Exception e) {
164: m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
165: throw new HarmoniseIndexerException(e.getMessage(), e);
166: }
167:
168: return bExists;
169: }
170:
171: /**
172: * Indexes the given object.
173: *
174: * @param pObj
175: * @throws HarmoniseIndexerException
176: */
177: public void indexObject(AbstractObject pObj)
178: throws HarmoniseIndexerException {
179: if (pObj == null || (pObj instanceof AbstractObject) == false) {
180: throw new HarmoniseIndexerException(
181: "Object must be AbstractObject - "
182: + pObj.getClass().getName());
183: }
184: IndexRunnable indexer = new IndexRunnable(pObj);
185: try {
186: executor.execute(indexer); // hands off to single background thread
187: } catch (InterruptedException e) {
188: throw new HarmoniseIndexerException(
189: "Problem running indexer asynchronously", e);
190: }
191: }
192:
193: /**
194: * Searches the index for objects of the type given by the <code>Class</code> and
195: * fulfilling the conditions given by the other arguments and returns a <code>List</code>
196: * of object IDs.
197: *
198: * @param xobj
199: * @param groupIds
200: * @param sName
201: * @param sSummary
202: * @param sContent
203: * @return
204: * @throws HarmoniseIndexerException
205: */
206: public List searchContents(Class xobjClass, Vector groupIds,
207: String sName, String sSummary, String sContent)
208: throws HarmoniseIndexerException {
209: return searchContents(getQuery(xobjClass, groupIds, sName,
210: null, sSummary, sContent));
211: }
212:
213: /**
214: * Returns the Lucene query string built from the conditions given for 'name', 'summary', etc.
215: *
216: * @param xobjClass
217: * @param groupIds
218: * @param sName
219: * @param sDisplayName
220: * @param sSummary
221: * @param sContent
222: *
223: * @return
224: */
225: public String getQuery(Class xobjClass, Vector groupIds,
226: String sName, String sDisplayName, String sSummary,
227: String sContent) {
228: StringBuffer sQuery = new StringBuffer();
229:
230: sQuery.append(HarmoniseIndexer.FIELD_CLASS).append(":").append(
231: xobjClass.getName());
232:
233: if ((groupIds != null) && (groupIds.size() > 0)) {
234: sQuery.append(" AND (");
235:
236: for (int i = 0; i < groupIds.size(); i++) {
237: if (i > 0) {
238: sQuery.append(" OR ");
239: }
240:
241: sQuery.append(HarmoniseIndexer.FIELD_GROUP).append(":")
242: .append(groupIds.elementAt(i));
243: }
244:
245: sQuery.append(") ");
246: }
247:
248: sQuery.append(" AND (");
249:
250: boolean bOR = false;
251:
252: // Process Name, if it has been submitted
253: if ((sName != null) && (sName.length() > 0)) {
254: buildFieldQueryString(sQuery, FIELD_NAME, sName);
255: bOR = true;
256: }
257:
258: if (sDisplayName != null) {
259: if (bOR) {
260: sQuery.append(" OR ");
261: }
262: buildFieldQueryString(sQuery, FIELD_DISPLAY_NAME,
263: sDisplayName);
264:
265: bOR = true;
266: }
267:
268: // Process Summary, it it has been submitted
269: if ((sSummary != null) && (sSummary.length() > 0)) {
270: if (bOR) {
271: sQuery.append(" OR ");
272: }
273:
274: buildFieldQueryString(sQuery, FIELD_SUMMARY, sSummary);
275: bOR = true;
276: }
277:
278: // Process Content, if it is been submitted
279: if ((sContent != null) && (sContent.length() > 0)) {
280: if (bOR) {
281: sQuery.append(" OR ");
282: }
283:
284: buildFieldQueryString(sQuery, FIELD_CONTENTS, sContent);
285: }
286:
287: sQuery.append(")");
288:
289: return sQuery.toString();
290: }
291:
292: /**
293: * Runs the given query against the index and returns a <code>List</code> of object
294: * IDs.
295: *
296: * @param queryString
297: * @return List of hits
298: * @throws HarmoniseIndexerException
299: */
300: public List searchContents(String queryString)
301: throws HarmoniseIndexerException {
302: Vector vec = new Vector();
303:
304: if (m_logger.getLevel() == Level.FINE) {
305: m_logger.log(Level.FINE, "Lucene query - " + queryString);
306: }
307:
308: try {
309: Searcher searcher = new IndexSearcher(m_indexHome);
310: StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
311:
312: //need a PerFieldAnalyzerWrapper so that our PorterStem
313: //analyzer isn't applied to the keywords we've set
314: PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
315: new HarmoniseAnalyzer());
316:
317: Iterator iter = keywordFieldList.iterator();
318:
319: while (iter.hasNext()) {
320: String field = (String) iter.next();
321: analyzer.addAnalyzer(field, standardAnalyzer);
322: }
323:
324: Query query = QueryParser.parse(queryString,
325: FIELD_CONTENTS, analyzer);
326:
327: Hits hits = searcher.search(query);
328: if (m_logger.getLevel() == Level.FINE) {
329: m_logger.log(Level.FINE, "Lucene query found "
330: + hits.length() + " hits in the index");
331: }
332:
333: for (int i = 0; i < hits.length(); i++) {
334: vec.addElement(hits.doc(i).get(FIELD_ID));
335: }
336:
337: searcher.close();
338: } catch (Exception e) {
339: m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
340: }
341: // if we encounter any errors, rather than propagating the exception up
342: // we'll just return the vector, but it'll be empty
343: return vec;
344: }
345:
346: /**
347: * Utility method to query the Lucene index independently of the Harmonise API.
348: *
349: * @param queryString a correctly formatted Lucene query to be parsed.
350: * @return List of Strings containing some summary info about the Hits returned
351: * @throws HarmoniseIndexerException
352: */
353: public List search(String queryString)
354: throws HarmoniseIndexerException {
355: List hitsList = new ArrayList();
356: Hits hits = null;
357: try {
358: Searcher searcher = new IndexSearcher(m_indexHome);
359: Query query = QueryParser.parse(queryString,
360: FIELD_CONTENTS, new HarmoniseAnalyzer());
361: hits = searcher.search(query);
362: // iterate through hits and build a list to return
363:
364: for (int i = 0; i < hits.length(); i++) {
365: StringBuffer sb = new StringBuffer();
366: org.apache.lucene.document.Document doc = hits.doc(i);
367: sb.append("Unique id: " + doc.get(FIELD_UNIQUEID));
368: sb.append("\n");
369: sb.append("Summary: " + doc.get(FIELD_SUMMARY));
370: sb.append("\n");
371: sb.append("Name: " + doc.get(FIELD_NAME));
372: sb.append("\n");
373: sb.append("Class: " + doc.get(FIELD_CLASS));
374: hitsList.add(sb.toString());
375: }
376: searcher.close();
377: } catch (Exception e) {
378: throw new HarmoniseIndexerException(e.getMessage(), e);
379: }
380: return hitsList;
381: }
382:
383: /**
384: * Deletes the given object from the index.
385: *
386: * @param xobj
387: * @throws HarmoniseIndexerException
388: */
389: public void deleteFromIndex(AbstractObject xobj)
390: throws HarmoniseIndexerException {
391: DeleterRunnable deleter = new DeleterRunnable(xobj);
392: try {
393: executor.execute(deleter);
394: } catch (InterruptedException e) {
395: throw new HarmoniseIndexerException(
396: "Problem running delete asynchronously", e);
397: }
398: }
399:
400: /**
401: * Utility method to process boolean operators in the raw query string correctly into the
402: * so that the terms are associated with the correct Lucene field name.
403: *
404: * @param queryBuf StringBuffer being used to assemble the final Lucene query
405: * @param fieldName name of the field, so we can prepend it so that Lucene field is searched
406: * @param inputString the raw input, to be processed to make it field specific
407: */
408: private void buildFieldQueryString(StringBuffer queryBuf,
409: String fieldName, String inputString) {
410: //just tokenise on white space as per default
411: StringTokenizer tokeniser = new StringTokenizer(inputString);
412:
413: // ** Use the following code to build simple phrases
414: // ** when this is used, make sure that two successive quotes "" are replaced with
415: // ** one single quote or the parser falls over
416: boolean buildingPhrase = false;
417: while (tokeniser.hasMoreTokens()) {
418: String token = tokeniser.nextToken();
419: token = token.replaceAll("\"\"", "\"");
420: if (token.equals("AND") || token.equals("OR")
421: || token.equals("NOT")) {
422: queryBuf.append(token + " "); // it's an operator, just append it raw
423: } else {
424: // are we building a phrase?
425: if (buildingPhrase == true) {
426: queryBuf.append(token + " "); // no need to prepend field:
427: // do we need to stop building the phrase/
428: if (token.endsWith("\"")) {
429: buildingPhrase = false;
430: }
431: continue;
432: }
433: queryBuf.append(fieldName + ":" + token + " ");
434: // are we going to start building a phrase ?
435: if (token.startsWith("\"")) {
436: buildingPhrase = true;
437: }
438: }
439: }
440: }
441:
442: /**
443: * Utility to add objects to the Lucene index. Extracts the indexable fields, including contents for
444: * PDF and XML docments and writes them to the index.
445: *
446: * @author John King
447: */
448: private class IndexRunnable implements Runnable {
449:
450: private AbstractObject obj;
451: private String contents;
452:
453: public IndexRunnable(AbstractObject obj) {
454: this .obj = obj;
455: }
456:
457: /* (non-Javadoc)
458: * @see java.lang.Runnable#run()
459: */
460: public void run() {
461: contents = getContents();
462: Document doc = new Document(); // new Lucene document to hold details we're indexing
463:
464: String classname = obj.getClass().getName();
465:
466: doc.add(Field.Keyword(FIELD_UNIQUEID, classname
467: + String.valueOf(obj.getId())));
468: doc.add(Field.UnIndexed(FIELD_ID, String.valueOf(obj
469: .getId())));
470:
471: try {
472: AbstractParentObject grp = ((AbstractChildObject) obj)
473: .getRealParent();
474: if (grp != null) {
475: doc.add(Field.Keyword(FIELD_GROUP, String
476: .valueOf(grp.getId())));
477: }
478:
479: doc.add(Field.Text(FIELD_CLASS, classname));
480: doc.add(Field.Text(FIELD_NAME, obj.getName()));
481: if (obj.getSummary() != null) {
482: doc
483: .add(Field.Text(FIELD_SUMMARY, obj
484: .getSummary()));
485: }
486:
487: if (obj instanceof AbstractEditableObject) {
488: AbstractEditableObject edObj = (AbstractEditableObject) obj;
489: String sDispName = edObj.getDisplayName();
490:
491: if (sDispName != null) {
492: doc.add(Field.Text(FIELD_DISPLAY_NAME,
493: sDispName));
494: }
495: }
496:
497: if (contents != null) {
498: doc.add(Field.Text(FIELD_CONTENTS,
499: new StringReader(contents)));
500: }
501:
502: if (HarmoniseIndexer.isIndexed(obj) == true) {
503: Directory directory = FSDirectory.getDirectory(
504: HarmoniseIndexer.m_indexHome, false);
505:
506: if (IndexReader.indexExists(directory)) {
507: IndexReader reader = IndexReader
508: .open(directory);
509: Term term = new Term(
510: HarmoniseIndexer.FIELD_UNIQUEID, obj
511: .getClass().getName()
512: + String.valueOf(obj.getId()));
513: reader.delete(term);
514: reader.close();
515: }
516: }
517: IndexWriter writer = null;
518: try {
519: writer = new IndexWriter(
520: HarmoniseIndexer.m_indexHome,
521: new HarmoniseAnalyzer(), false);
522: } catch (FileNotFoundException e) {
523: writer = new IndexWriter(
524: HarmoniseIndexer.m_indexHome,
525: new HarmoniseAnalyzer(), true);
526: }
527:
528: writer.addDocument(doc);
529: writer.optimize();
530: writer.close();
531: HarmoniseIndexer.m_logger.log(Level.INFO, "indexed "
532: + obj.getType() + ", ID: " + obj.getId());
533: } catch (DataAccessException e) {
534: HarmoniseIndexer.m_logger.log(Level.WARNING,
535: "Data Access Exception", e);
536: } catch (IOException e) {
537: HarmoniseIndexer.m_logger.log(Level.WARNING,
538: "IOException", e);
539: } catch (HarmoniseIndexerException e) {
540: HarmoniseIndexer.m_logger.log(Level.WARNING,
541: "Harmonise Indexer Exception", e);
542: }
543: }
544:
545: private String getContents() {
546:
547: String objContents = null;
548:
549: try {
550: if (obj instanceof org.openharmonise.rm.resources.content.Document) {
551: org.openharmonise.rm.resources.content.Document doc = (org.openharmonise.rm.resources.content.Document) obj;
552: org.w3c.dom.Document xmlcontent = XMLDocument
553: .getXMLDocumentFromString(doc.getContent());
554: objContents = getStringFromXML(xmlcontent);
555: } else if (obj instanceof Asset) {
556: Asset asset = (Asset) obj;
557: if (asset.getContentType().equalsIgnoreCase(
558: "application/pdf")) {
559: objContents = getStringFromPDF(asset
560: .getContentFile());
561: }
562: }
563: } catch (Exception e) {
564: HarmoniseIndexer.m_logger.log(Level.WARNING,
565: "Exception", e);
566: }
567: return objContents;
568: }
569:
570: /**
571: * Returns the text content of an XML document.
572: *
573: * @param xml
574: * @return String representing content of document once tags have been stripped off.
575: * @throws HarmoniseIndexerException
576: */
577: private String getStringFromXML(org.w3c.dom.Document xml)
578: throws HarmoniseIndexerException {
579:
580: String sResult = "";
581: try {
582: if (HarmoniseIndexer.m_striptags_xsl == null) {
583: //get strip tags xsl if not already created
584: String stripFileName = ConfigSettings
585: .getProperty(PNAME_STRIPTAGS_XSL);
586:
587: if (stripFileName != null
588: && stripFileName.length() > 0) {
589: StreamSource ssource = new StreamSource(
590: new File(stripFileName));
591: HarmoniseIndexer.m_striptags_xsl = (Templates) org.apache.xalan.xsltc.trax.TransformerFactoryImpl
592: .newInstance().newTemplates(ssource);
593: }
594: }
595: //if m_striptags_xsl is null here don't do anything
596: if (m_striptags_xsl != null) {
597: Transformer trans = HarmoniseIndexer.m_striptags_xsl
598: .newTransformer();
599: DOMSource ds = new DOMSource(xml
600: .getDocumentElement());
601: StringWriter sw = new StringWriter();
602: StreamResult res = new StreamResult(sw);
603: trans.transform(ds, res);
604: sResult = sw.toString();
605: sw.close();
606: }
607: } catch (ConfigException e) {
608: throw new HarmoniseIndexerException("Config error", e);
609: } catch (TransformerConfigurationException e) {
610: throw new HarmoniseIndexerException(
611: "Transformer Configuration Exception", e);
612: } catch (TransformerFactoryConfigurationError e) {
613: throw new HarmoniseIndexerException(
614: "Transformer Factory Configuration error", e);
615: } catch (TransformerException e) {
616: throw new HarmoniseIndexerException(
617: "Transformer error", e);
618: } catch (IOException e) {
619: throw new HarmoniseIndexerException("IO error", e);
620: }
621:
622: return sResult;
623: }
624:
625: /**
626: * Returns the text content of a PDF file as a String.
627: *
628: * @param pdfFile
629: * @return
630: * @throws HarmoniseIndexerException
631: */
632: private String getStringFromPDF(File pdfFile)
633: throws HarmoniseIndexerException {
634: String sText = "";
635:
636: try {
637: FileInputStream pdfStream = new FileInputStream(pdfFile);
638: PDFParser pdfParser = new PDFParser(pdfStream);
639: pdfParser.parse();
640: PDDocument pdf = pdfParser.getPDDocument();
641: PDFTextStripper textstripper = new PDFTextStripper();
642: sText = textstripper.getText(pdf);
643: HarmoniseIndexer.m_logger.log(Level.FINEST, sText);
644: pdf.close();
645: } catch (FileNotFoundException e) {
646: throw new HarmoniseIndexerException("File not found", e);
647: } catch (IOException e) {
648: throw new HarmoniseIndexerException("IO exception", e);
649: }
650:
651: return sText;
652: }
653: }
654:
655: /**
656: * Deletion utility to remove objects from the Lucene index.
657: *
658: * @author jejking
659: */
660: private class DeleterRunnable implements Runnable {
661:
662: private AbstractObject obj;
663:
664: /**
665: * @param obj the Harmonise object to delete
666: */
667: public DeleterRunnable(AbstractObject obj) {
668: if (obj == null) {
669: throw new NullPointerException("obj cannot be null");
670: }
671: this .obj = obj;
672: }
673:
674: /* (non-Javadoc)
675: * @see java.lang.Runnable#run()
676: */
677: public void run() {
678: try {
679: Directory directory = FSDirectory.getDirectory(
680: HarmoniseIndexer.m_indexHome, false);
681:
682: if (IndexReader.indexExists(directory)) {
683: IndexReader reader = IndexReader.open(directory);
684: Term term = new Term(
685: HarmoniseIndexer.FIELD_UNIQUEID, obj
686: .getClass().getName()
687: + String.valueOf(obj.getId()));
688: reader.delete(term);
689: reader.close();
690: HarmoniseIndexer.m_logger.log(Level.FINE,
691: "deleted " + obj.getType() + ", ID: "
692: + obj.getId() + " from index");
693: }
694: } catch (Exception e) {
695: HarmoniseIndexer.m_logger.log(Level.WARNING,
696: "problem deleting object", e);
697: }
698: }
699: }
700:
701: }
|