001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.components.search;
018:
019: import org.apache.avalon.framework.activity.Disposable;
020: import org.apache.avalon.framework.configuration.Configurable;
021: import org.apache.avalon.framework.configuration.Configuration;
022: import org.apache.avalon.framework.configuration.ConfigurationException;
023: import org.apache.avalon.framework.logger.AbstractLogEnabled;
024: import org.apache.avalon.framework.service.ServiceException;
025: import org.apache.avalon.framework.service.ServiceManager;
026: import org.apache.avalon.framework.service.Serviceable;
027: import org.apache.cocoon.ProcessingException;
028: import org.apache.cocoon.components.crawler.CocoonCrawler;
029: import org.apache.lucene.analysis.Analyzer;
030: import org.apache.lucene.document.Document;
031: import org.apache.lucene.index.IndexReader;
032: import org.apache.lucene.index.IndexWriter;
033: import org.apache.lucene.index.Term;
034: import org.apache.lucene.index.TermEnum;
035: import org.apache.lucene.store.Directory;
036:
037: import java.io.IOException;
038: import java.net.URL;
039: import java.util.Iterator;
040:
041: /**
042: * A lucene indexer.
043: *
044: * <p>
045: * XML documents are indexed using lucene.
046: * Links to XML documents are supplied by
047: * a crawler, requesting links of documents by specifying a cocoon-view, and
048: * HTTP protocol.
049: * </p>
050: *
051: * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
052: * @version CVS $Id: SimpleLuceneCocoonIndexerImpl.java 433543 2006-08-22 06:22:54Z crossley $
053: */
054: public class SimpleLuceneCocoonIndexerImpl extends AbstractLogEnabled
055: implements LuceneCocoonIndexer, Configurable, Serviceable,
056: Disposable {
057:
058: /**
059: * configuration tagname for specifying the analyzer class
060: */
061: public final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
062:
063: /**
064: * configuration default analyzer class
065: */
066: public final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
067:
068: /**
069: * configuration tagname for specifying lucene's index directory
070: */
071: public final static String DIRECTORY_CONFIG = "directory";
072:
073: /**
074: * configuration default directory, ie. no default.
075: */
076: public final static String DIRECTORY_DEFAULT = null;
077:
078: /**
079: * configuration tagname for specifying lucene's merge factor.
080: */
081: public final static String MERGE_FACTOR_CONFIG = "merge-factor";
082:
083: /**
084: * configuration default value for
085: * <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00373.html">lucene's merge factor</a>.
086: */
087: public final static int MERGE_FACTOR_DEFAULT = 10;
088:
089: /**
090: * The service manager for looking up components used.
091: */
092: protected ServiceManager manager = null;
093:
094: protected Analyzer analyzer;
095: // private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;
096: private int mergeFactor = MERGE_FACTOR_DEFAULT;
097:
098: /**
099: *Sets the analyzer attribute of the SimpleLuceneCocoonIndexerImpl object
100: *
101: * @param analyzer The new analyzer value
102: */
103: public void setAnalyzer(Analyzer analyzer) {
104: this .analyzer = analyzer;
105: }
106:
107: /**
108: * Configure this component.
109: *
110: * @param conf is the configuration
111: * @exception ConfigurationException is thrown if configuring fails
112: */
113: public void configure(Configuration conf)
114: throws ConfigurationException {
115: Configuration child;
116:
117: /* child = conf.getChild(ANALYZER_CLASSNAME_CONFIG, false);
118: if (child != null) {
119: // fix Bugzilla Bug 25277, use child.getValue
120: // and in all following blocks
121: String value = child.getValue(ANALYZER_CLASSNAME_DEFAULT);
122: if (value != null) {
123: analyzerClassnameDefault = value;
124: }
125: }
126: */
127: child = conf.getChild(MERGE_FACTOR_CONFIG, false);
128: if (child != null) {
129: // fix Bugzilla Bug 25277, use child instead of conf
130: int int_value = child
131: .getValueAsInteger(MERGE_FACTOR_DEFAULT);
132: mergeFactor = int_value;
133: }
134: }
135:
136: /**
137: * Set the current <code>ServiceManager</code> instance used by this
138: * <code>Serviceable</code>.
139: *
140: * @param manager used by this component
141: * @exception ServiceException is never thrown
142: */
143: public void service(ServiceManager manager) throws ServiceException {
144: this .manager = manager;
145: }
146:
147: /**
148: * Dispose this component.
149: */
150: public void dispose() {
151: }
152:
153: /**
154: * index content of base_url, index content of links from base_url.
155: *
156: * @param index the lucene store to write the index to
157: * @param create if true create, or overwrite existing index, else
158: * update existing index.
159: * @param base_url index content of base_url, and crawl through all its
160: * links recursivly.
161: * @exception ProcessingException is thrown if indexing fails
162: */
163: public void index(Directory index, boolean create, URL base_url)
164: throws ProcessingException {
165:
166: IndexWriter writer = null;
167: LuceneXMLIndexer lxi = null;
168: CocoonCrawler cocoonCrawler = null;
169:
170: try {
171: lxi = (LuceneXMLIndexer) manager
172: .lookup(LuceneXMLIndexer.ROLE);
173:
174: writer = new IndexWriter(index, analyzer, create);
175: writer.mergeFactor = this .mergeFactor;
176:
177: cocoonCrawler = (CocoonCrawler) manager
178: .lookup(CocoonCrawler.ROLE);
179: cocoonCrawler.crawl(base_url);
180:
181: Iterator cocoonCrawlerIterator = cocoonCrawler.iterator();
182: while (cocoonCrawlerIterator.hasNext()) {
183: URL crawl_url = (URL) cocoonCrawlerIterator.next();
184: // result of fix Bugzilla Bug 25270, in SimpleCocoonCrawlerImpl
185: // check if crawl_url is null
186: if (crawl_url == null) {
187: continue;
188: } else if (!crawl_url.getHost().equals(
189: base_url.getHost())
190: || crawl_url.getPort() != base_url.getPort()) {
191:
192: // skip urls using different host, or port than host,
193: // or port of base url
194: if (getLogger().isDebugEnabled()) {
195: getLogger().debug(
196: "Skipping crawling URL "
197: + crawl_url.toString()
198: + " as base_url is "
199: + base_url.toString());
200: }
201: continue;
202: }
203:
204: // build lucene documents from the content of the crawl_url
205: Iterator i = lxi.build(crawl_url).iterator();
206:
207: // add all built lucene documents
208: while (i.hasNext()) {
209: writer.addDocument((Document) i.next());
210: }
211: }
212: // optimize it
213: writer.optimize();
214: } catch (IOException ioe) {
215: throw new ProcessingException("IOException in index()", ioe);
216: } catch (ServiceException se) {
217: throw new ProcessingException(
218: "Could not lookup service in index()", se);
219: } finally {
220: if (writer != null) {
221: try {
222: writer.close();
223: } catch (IOException ioe) {
224: }
225: writer = null;
226: }
227:
228: if (lxi != null) {
229: manager.release(lxi);
230: lxi = null;
231: }
232: if (cocoonCrawler != null) {
233: manager.release(cocoonCrawler);
234: cocoonCrawler = null;
235: }
236: }
237: }
238:
239: /**
240: * A document iterator deleting "old" documents form the index.
241: *
242: * TODO: use this class before indexing, in non-creating mode.
243: */
244: static class DocumentDeletableIterator {
245: private IndexReader reader;
246: // existing index
247: private TermEnum uidIter;
248:
249: // document id iterator
250:
251: /**
252: *Constructor for the DocumentDeletableIterator object
253: *
254: * @param directory Description of Parameter
255: * @exception IOException Description of Exception
256: */
257: public DocumentDeletableIterator(Directory directory)
258: throws IOException {
259: reader = IndexReader.open(directory);
260: // open existing index
261: uidIter = reader.terms(new Term("uid", ""));
262: // init uid iterator
263: }
264:
265: /**
266: *Description of the Method
267: *
268: * @exception IOException Description of Exception
269: */
270: public void deleteAllStaleDocuments() throws IOException {
271: while (uidIter.term() != null
272: && uidIter.term().field().equals("uid")) {
273: reader.delete(uidIter.term());
274: uidIter.next();
275: }
276: }
277:
278: /**
279: *Description of the Method
280: *
281: * @param uid Description of Parameter
282: * @exception IOException Description of Exception
283: */
284: public void deleteModifiedDocuments(String uid)
285: throws IOException {
286: while (documentHasBeenModified(uidIter.term(), uid)) {
287: reader.delete(uidIter.term());
288: uidIter.next();
289: }
290: if (documentHasNotBeenModified(uidIter.term(), uid)) {
291: uidIter.next();
292: }
293: }
294:
295: /**
296: *Description of the Method
297: *
298: * @exception Throwable Description of Exception
299: */
300: protected void finalize() throws Throwable {
301: super .finalize();
302: if (uidIter != null) {
303: uidIter.close();
304: // close uid iterator
305: uidIter = null;
306: }
307: if (reader != null) {
308: reader.close();
309: // close existing index
310: reader = null;
311: }
312: }
313:
314: /**
315: *Description of the Method
316: *
317: * @param term Description of Parameter
318: * @return Description of the Returned Value
319: */
320: boolean documentIsDeletable(Term term) {
321: return term != null && term.field() == "uid";
322: }
323:
324: /**
325: *Description of the Method
326: *
327: * @param term Description of Parameter
328: * @param uid Description of Parameter
329: * @return Description of the Returned Value
330: */
331: boolean documentHasBeenModified(Term term, String uid) {
332: return documentIsDeletable(term)
333: && term.text().compareTo(uid) < 0;
334: }
335:
336: /**
337: *Description of the Method
338: *
339: * @param term Description of Parameter
340: * @param uid Description of Parameter
341: * @return Description of the Returned Value
342: */
343: boolean documentHasNotBeenModified(Term term, String uid) {
344: return documentIsDeletable(term)
345: && term.text().compareTo(uid) == 0;
346: }
347: }
348: }
|