001: //=== Copyright (C) 2001-2007 Food and Agriculture Organization of the
002: //=== United Nations (FAO-UN), United Nations World Food Programme (WFP)
003: //=== and United Nations Environment Programme (UNEP)
004: //===
005: //=== This program is free software; you can redistribute it and/or modify
006: //=== it under the terms of the GNU General Public License as published by
007: //=== the Free Software Foundation; either version 2 of the License, or (at
008: //=== your option) any later version.
009: //===
010: //=== This program is distributed in the hope that it will be useful, but
011: //=== WITHOUT ANY WARRANTY; without even the implied warranty of
012: //=== MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
013: //=== General Public License for more details.
014: //===
015: //=== You should have received a copy of the GNU General Public License
016: //=== along with this program; if not, write to the Free Software
017: //=== Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
018: //===
019: //=== Contact: Jeroen Ticheler - FAO - Viale delle Terme di Caracalla 2,
020: //=== Rome - Italy. email: geonetwork@osgeo.org
021: //==============================================================================
022:
023: package org.fao.geonet.kernel.search;
024:
025: import com.k_int.IR.Searchable;
026: import com.k_int.hss.HeterogeneousSetOfSearchable;
027: import com.k_int.util.LoggingFacade.LogContextFactory;
028: import com.k_int.util.LoggingFacade.LoggingContext;
029: import com.k_int.util.Repository.CollectionDirectory;
030: import java.io.File;
031: import java.util.Enumeration;
032: import java.util.Hashtable;
033: import java.util.Iterator;
034: import java.util.List;
035: import java.util.Properties;
036: import java.util.Vector;
037: import javax.naming.Context;
038: import javax.naming.InitialContext;
039: import jeeves.utils.Log;
040: import jeeves.utils.Xml;
041: import org.apache.lucene.analysis.standard.StandardAnalyzer;
042: import org.apache.lucene.document.Document;
043: import org.apache.lucene.document.Field;
044: import org.apache.lucene.index.IndexReader;
045: import org.apache.lucene.index.IndexWriter;
046: import org.apache.lucene.index.Term;
047: import org.apache.lucene.index.TermEnum;
048: import org.fao.geonet.constants.Geonet;
049: import org.jdom.Element;
050:
051: /**
052: * Indexes metadata using Lucene.
053: */
054: public class SearchManager {
055: public static final int LUCENE = 1;
056: public static final int Z3950 = 2;
057: public static final int UNUSED = 3;
058:
059: private static final String SEARCH_STYLESHEETS_DIR_PATH = "xml/search";
060: private static final String SCHEMA_STYLESHEETS_DIR_PATH = "xml/schemas";
061:
062: private File _stylesheetsDir;
063: private File _schemasDir;
064: private File _luceneDir;
065: private LoggingContext _cat;
066: private Searchable _hssSearchable;
067:
068: //-----------------------------------------------------------------------------
069:
070: /**
071: * @param appPath
072: * @param luceneDir
073: * @throws Exception
074: */
075: public SearchManager(String appPath, String luceneDir)
076: throws Exception {
077: _stylesheetsDir = new File(appPath, SEARCH_STYLESHEETS_DIR_PATH);
078: _schemasDir = new File(appPath, SCHEMA_STYLESHEETS_DIR_PATH);
079:
080: if (!_stylesheetsDir.isDirectory())
081: throw new Exception("directory " + _stylesheetsDir
082: + " not found");
083:
084: initLucene(appPath, luceneDir);
085: initZ3950(appPath);
086: }
087:
088: //-----------------------------------------------------------------------------
089:
090: public void end() throws Exception {
091: endZ3950();
092: }
093:
094: //-----------------------------------------------------------------------------
095:
096: public MetaSearcher newSearcher(int type, String stylesheetName)
097: throws Exception {
098: switch (type) {
099: case LUCENE:
100: return new LuceneSearcher(this , stylesheetName);
101: case Z3950:
102: return new Z3950Searcher(this , stylesheetName);
103: case UNUSED:
104: return new UnusedSearcher();
105:
106: default:
107: throw new Exception("unknown MetaSearcher type: " + type);
108: }
109: }
110:
111: /**
112: * Lucene init/end methods. Creates the Lucene index directory.
113: * @param appPath
114: * @param luceneDir
115: * @throws Exception
116: */
117: private void initLucene(String appPath, String luceneDir)
118: throws Exception {
119: _luceneDir = new File(luceneDir);
120:
121: if (!_luceneDir.isAbsolute())
122: _luceneDir = new File(appPath + luceneDir);
123:
124: //--- the lucene dir cannot be inside the CVS so it is better to create it here
125:
126: _luceneDir.mkdirs();
127:
128: setupIndex(false); // RGFIX: check if this is correct
129: }
130:
131: //-----------------------------------------------------------------------------
132: // Z39.50 init/end methods
133:
134: /**
135: * Initializes the Z3950 client searcher.
136: * @param appPath
137: * @throws Exception
138: */
139: private void initZ3950(String appPath) throws Exception {
140: _cat = LogContextFactory.getContext("GeoNetwork"); // FIXME: maybe it should use the webapp path
141:
142: String configClass = "com.k_int.util.Repository.XMLDataSource";
143: String configUrl = "file:///" + appPath
144: + jeeves.constants.Jeeves.Path.XML
145: + "/repositories.xml";
146: String directoryNamingLocation = "/Services/IR/Directory"; // RGFIX: change to use servlet context
147:
148: Properties props = new Properties();
149: props.setProperty("CollectionDataSourceClassName", configClass);
150: props.setProperty("RepositoryDataSourceURL", configUrl);
151: props.setProperty("DirectoryServiceName",
152: directoryNamingLocation); // RGFIX: check this
153: // set up the collection directory and register it with the naming service in the
154: // default way
155: // RGFIX: this could not work for different servlet instances, should be changed to use servlet context
156: CollectionDirectory cd = new CollectionDirectory(configClass,
157: configUrl);
158: Context context = new InitialContext();
159: Context services_context = context.createSubcontext("Services");
160: Context ir_context = services_context.createSubcontext("IR");
161: ir_context.bind("Directory", cd);
162:
163: // pull in the repository
164: _hssSearchable = new HeterogeneousSetOfSearchable();
165: _hssSearchable.init(props);
166: }
167:
168: /** deinitializes the Z3950 client searcher
169: */
170: private void endZ3950() {
171: if (_hssSearchable != null) {
172: _hssSearchable.destroy();
173: _hssSearchable = null;
174: }
175: }
176:
177: //--------------------------------------------------------------------------------
178: // indexing methods
179:
180: /**
181: * Indexes a metadata record.
182: * @param type
183: * @param metadata
184: * @param id
185: * @param moreFields
186: * @param isTemplate
187: * @param title
188: * @throws Exception
189: */
190: public synchronized void index(String type, Element metadata,
191: String id, List moreFields, String isTemplate, String title)
192: throws Exception {
193: delete("_id", id);
194:
195: Element xmlDoc;
196:
197: // check for subtemplates
198: if (isTemplate.equals("s")) {
199: // create empty document with only title and "any" fields
200: xmlDoc = new Element("Document");
201:
202: StringBuffer sb = new StringBuffer();
203: allText(metadata, sb);
204: addField(xmlDoc, "title", title, true, true, true);
205: addField(xmlDoc, "any", sb.toString(), true, true, true);
206: } else {
207: Log.debug(Geonet.INDEX_ENGINE, "Metadata to index:\n"
208: + Xml.getString(metadata));
209:
210: xmlDoc = getIndexFields(type, metadata);
211:
212: Log.debug(Geonet.INDEX_ENGINE, "Indexing fields:\n"
213: + Xml.getString(xmlDoc));
214: }
215: // add _id field
216: addField(xmlDoc, "_id", id, true, true, false);
217:
218: // add more fields
219: for (Iterator iter = moreFields.iterator(); iter.hasNext();) {
220: Element field = (Element) iter.next();
221: xmlDoc.addContent(field);
222: }
223:
224: Log.debug(Geonet.INDEX_ENGINE, "Lucene document:\n"
225: + Xml.getString(xmlDoc));
226:
227: Document doc = newDocument(xmlDoc);
228: IndexWriter writer = new IndexWriter(_luceneDir,
229: new StandardAnalyzer(new String[] {}), false);
230: try {
231: writer.addDocument(doc);
232: lazyOptimize(writer);
233: } finally {
234: writer.close();
235: }
236: }
237:
238: /**
239: * Creates a new field for the Lucene index.
240: * @param xmlDoc
241: * @param name
242: * @param value
243: * @param store
244: * @param index
245: * @param token
246: */
247: private void addField(Element xmlDoc, String name, String value,
248: boolean store, boolean index, boolean token) {
249: Element field = new Element("Field");
250: field.setAttribute("name", name);
251: field.setAttribute("string", value);
252: field.setAttribute("store", store + "");
253: field.setAttribute("index", index + "");
254: field.setAttribute("token", token + "");
255: xmlDoc.addContent(field);
256: }
257:
258: /**
259: * Extracts text from metadata record.
260: * @param metadata
261: * @param sb
262: * @return all text in the metadata elements for indexing
263: */
264: private void allText(Element metadata, StringBuffer sb) {
265: String text = metadata.getText().trim();
266: if (text.length() > 0) {
267: if (sb.length() > 0)
268: sb.append(" ");
269: sb.append(text);
270: }
271: List children = metadata.getChildren();
272: if (children.size() > 0) {
273: for (Iterator i = children.iterator(); i.hasNext();)
274: allText((Element) i.next(), sb);
275: }
276: }
277:
278: //--------------------------------------------------------------------------------
279: // delete a document
280:
281: public synchronized void delete(String fld, String txt)
282: throws Exception {
283: // possibly remove old document
284: IndexReader reader = IndexReader.open(_luceneDir);
285: try {
286: reader.deleteDocuments(new Term(fld, txt));
287:
288: // RGFIX: should I optimize here, or at least increase updateCount?
289: } finally {
290: reader.close();
291: }
292: }
293:
294: //--------------------------------------------------------------------------------
295:
296: public Hashtable getDocs() throws Exception {
297: IndexReader reader = IndexReader.open(_luceneDir);
298: try {
299: Hashtable docs = new Hashtable();
300: for (int i = 0; i < reader.numDocs(); i++) {
301: if (reader.isDeleted(i))
302: continue; // FIXME: strange lucene hack: sometimes it tries to load a deleted document
303:
304: Hashtable record = new Hashtable();
305: Document doc = reader.document(i);
306: String id = doc.get("_id");
307: for (Enumeration j = doc.fields(); j.hasMoreElements();) {
308: Field field = (Field) j.nextElement();
309: record.put(field.name(), field.stringValue());
310: }
311: docs.put(id, record);
312: }
313: return docs;
314: } finally {
315: reader.close();
316: }
317: }
318:
319: //--------------------------------------------------------------------------------
320:
321: public Vector getTerms(String fld) throws Exception {
322: Vector terms = new Vector();
323:
324: IndexReader reader = IndexReader.open(_luceneDir);
325: try {
326: TermEnum enu = reader.terms(new Term(fld, ""));
327: while (enu.next()) {
328: Term term = enu.term();
329: if (term.field().equals(fld))
330: terms.add(enu.term().text());
331: }
332: } finally {
333: reader.close();
334: }
335: return terms;
336: }
337:
338: //-----------------------------------------------------------------------------
339: // utilities
340:
341: Element getIndexFields(String schema, Element xml) throws Exception {
342: File schemaDir = new File(_schemasDir, schema);
343:
344: try {
345: String styleSheet = new File(schemaDir, "index-fields.xsl")
346: .getAbsolutePath();
347: return Xml.transform(xml, styleSheet);
348: } catch (Exception e) {
349: Log.error(Geonet.SEARCH_ENGINE,
350: "Indexing stylesheet contains errors : "
351: + e.getMessage());
352: throw e;
353: }
354: }
355:
356: //-----------------------------------------------------------------------------
357: // utilities
358:
359: Element transform(String styleSheetName, Element xml)
360: throws Exception {
361: try {
362: String styleSheetPath = new File(_stylesheetsDir,
363: styleSheetName).getAbsolutePath();
364: return Xml.transform(xml, styleSheetPath);
365: } catch (Exception e) {
366: Log.error(Geonet.SEARCH_ENGINE,
367: "Search stylesheet contains errors : "
368: + e.getMessage());
369: throw e;
370: }
371: }
372:
373: public File getLuceneDir() {
374: return _luceneDir;
375: }
376:
377: Searchable getSearchable() {
378: return _hssSearchable;
379: }
380:
381: //-----------------------------------------------------------------------------
382: // private methods
383:
384: // creates an index in directory luceneDir with StandardAnalyzer if not present
385: private void setupIndex(boolean rebuild) throws Exception {
386: // if rebuild forced don't check
387: boolean badIndex = true;
388: if (!rebuild) {
389: try {
390: IndexReader reader = IndexReader.open(_luceneDir);
391: reader.close();
392: badIndex = false;
393: } catch (Exception e) {
394: System.err
395: .println("exception while opening lucene index, going to rebuild it: "
396: + e.getMessage());
397: }
398: }
399: // if rebuild forced or bad index then rebuild index
400: if (rebuild || badIndex) {
401: System.err.println("rebuilding lucene index");
402:
403: IndexWriter writer = new IndexWriter(_luceneDir,
404: new StandardAnalyzer(new String[] {}), true);
405: writer.close();
406: }
407: }
408:
409: // creates a new document
410: private Document newDocument(Element xml) {
411: Document doc = new Document();
412: for (Iterator iter = xml.getChildren().iterator(); iter
413: .hasNext();) {
414: Element field = (Element) iter.next();
415: String name = field.getAttributeValue("name");
416: String string = field.getAttributeValue("string")
417: .toLowerCase(); // RGFIX: should be only needed for non-tokenized fields
418: if (string.trim().length() > 0) {
419: String sStore = field.getAttributeValue("store");
420: String sIndex = field.getAttributeValue("index");
421: String sToken = field.getAttributeValue("token");
422: boolean bStore = sStore != null
423: && sStore.equals("true");
424: boolean bIndex = sIndex != null
425: && sIndex.equals("true");
426: boolean token = sToken != null && sToken.equals("true");
427: Field.Store store = null;
428: if (bStore) {
429: store = Field.Store.YES;
430: } else {
431: store = Field.Store.NO;
432: }
433: Field.Index index = null;
434: if (bIndex && token) {
435: index = Field.Index.TOKENIZED;
436: }
437: if (bIndex && !token) {
438: index = Field.Index.UN_TOKENIZED;
439: }
440: if (!bIndex) {
441: index = Field.Index.NO;
442: }
443: doc.add(new Field(name, string, store, index));
444: }
445: }
446: return doc;
447: }
448:
449: //--------------------------------------------------------------------------------
450:
451: private static final long TIME_BETWEEN_OPTS = 1000; // time between two optimizations in ms
452: private static final int UPDTATES_BETWEEN_OPTS = 10; // number of updates between two optimizations
453:
454: private long lastOptTime = 0; // time since last optimization
455: private int updateCount = UPDTATES_BETWEEN_OPTS - 1; // number of updates since last uptimization
456: private boolean optimizing = false; // true iff optimization is in progress
457: private Object mutex = new Object(); // RGFIX: check concurrent access from multiple servlets
458:
459: /**
460: * lazy optimization: optimize index if
461: * at least TIME_BETWEEN_OPTS time passed or
462: * at least UPDTATES_BETWEEN_OPTS updates were performed
463: * since last optimization
464: * @param writer
465: * @throws Exception
466: */
467: private void lazyOptimize(IndexWriter writer) throws Exception {
468: if (optimizing)
469: return;
470:
471: boolean doOptimize;
472: synchronized (mutex) {
473: if (System.currentTimeMillis() - lastOptTime < TIME_BETWEEN_OPTS
474: && ++updateCount < UPDTATES_BETWEEN_OPTS)
475: doOptimize = false;
476: else {
477: doOptimize = true;
478: optimizing = true;
479: updateCount = 0;
480: }
481: }
482: if (doOptimize) {
483: // System.out.println("**** OPTIMIZING"); // DEBUG
484:
485: writer.optimize();
486: lastOptTime = System.currentTimeMillis();
487: optimizing = false;
488: }
489: }
490: }
|