001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.transformation;
018:
019: import java.io.File;
020: import java.io.IOException;
021: import java.io.Serializable;
022: import java.util.Map;
023: import java.util.Stack;
024:
025: import org.apache.avalon.framework.configuration.Configurable;
026: import org.apache.avalon.framework.configuration.Configuration;
027: import org.apache.avalon.framework.configuration.ConfigurationException;
028: import org.apache.avalon.framework.context.Context;
029: import org.apache.avalon.framework.context.ContextException;
030: import org.apache.avalon.framework.context.Contextualizable;
031: import org.apache.avalon.framework.parameters.Parameters;
032:
033: import org.apache.cocoon.Constants;
034: import org.apache.cocoon.ProcessingException;
035: import org.apache.cocoon.caching.CacheableProcessingComponent;
036: import org.apache.cocoon.components.search.LuceneCocoonHelper;
037: import org.apache.cocoon.components.search.LuceneXMLIndexer;
038: import org.apache.cocoon.environment.SourceResolver;
039: import org.apache.commons.lang.BooleanUtils;
040: import org.apache.excalibur.source.SourceValidity;
041: import org.apache.excalibur.source.impl.validity.NOPValidity;
042:
043: import org.apache.lucene.analysis.Analyzer;
044: import org.apache.lucene.document.Document;
045: import org.apache.lucene.document.Field;
046: import org.apache.lucene.index.IndexWriter;
047: import org.apache.lucene.index.IndexReader;
048: import org.apache.lucene.index.Term;
049: import org.apache.lucene.store.Directory;
050: import org.xml.sax.Attributes;
051: import org.xml.sax.SAXException;
052: import org.xml.sax.helpers.AttributesImpl;
053:
054: /**
055: * <p style="font-weight: bold;">A lucene index creation transformer.</p>
056: * <p>This transformer reads a document with elements in the namespace
057: * <code>http://apache.org/cocoon/lucene/1.0</code>, and creates a new Lucene Index,
058: * or updates an existing one.</p>
059: * <p>It has several parameters which can be set in the sitemap component configuration or as
060: * parameters to the transformation step in the pipeline, or finally as attributes of the root element
061: * in the source XML document. The source document over-rides the transformation parameters,
062: * which in turn over-ride any configuration parameters.</p>
063: * <dl>
065: * <dt style="font-weight: bold;">directory</dt>
066: * <dd><p>Location of directory where index files are stored.
067: * This path is relative to the Cocoon work directory</p></dd>
068: * <dt style="font-weight: bold;">create</dt>
069: * <dd><p>This attribute controls whether the index is recreated. </p>
070: * <ul><li><p>If create = "false" and the index already exists then the index will be updated.
071: * Any documents which had already been indexed will be removed from the index and reinserted.</p></li>
072: * <li><p>If the index does not exist then it will be created even if <code>create</code>="false".</p></li>
073: * <li><p>If <code>create</code>="true" then any existing index will be destroyed and a new index created.
074: * If you are rebuilding your entire index then you should set <code>create</code>="true" because the
075: * indexer doesn't need to remove old documents from the index, so it will be faster.</p></li></ul>
076: * </dd>
077: * <dt style="font-weight: bold;">max-field-length</dt>
078: * <dd><p>Maximum number of terms to index in a field (as far as the index is concerned,
079: * the document will effectively be truncated at this point). The default value, 10k, may not be sufficient for large documents.</p></dd>
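080: * <dt style="font-weight: bold;">analyzer</dt>
081: * <dd><p>Class name of the Lucene text analyzer to use. Typically depends on the language of the text being indexed.
082: * See the Lucene documentation for more information. In the sitemap component configuration and in the transformation parameters this setting is named <code>analyzer-classname</code>; only the attribute in the source document is named <code>analyzer</code>.</p></dd>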
083: * <dt style="font-weight: bold;">merge-factor</dt>
084: * <dd>Determines how often segment indices are merged. See the Lucene documentation for more information.</dd>
085: * </dl>
086: * <dl>
087: * <dt style="font-weight: bold;">A simple example of the input:</dt>
088: * <dd>
089: * <pre>&lt;?xml version="1.0" encoding="UTF-8"?&gt;
090: * &lt;lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
091: * merge-factor="20"
092: * create="false"
093: * directory="index"
094: * max-field-length="10000"
095: * analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"&gt;
096: * &lt;lucene:document url="a.html"&gt;
097: * &lt;documentTitle lucene:store="true"&gt;Doggerel&lt;/documentTitle&gt;
098: * &lt;body&gt;The quick brown fox jumped over the lazy dog&lt;/body&gt;
099: * &lt;/lucene:document&gt;
100: * &lt;lucene:document url="b.html"&gt;
101: * &lt;documentTitle lucene:store="true"&gt;Lorem Ipsum&lt;/documentTitle&gt;
102: * &lt;body&gt;Lorem ipsum dolor sit amet, consectetuer adipiscing elit.&lt;/body&gt;
103: * &lt;body&gt;Nunc a mauris blandit ligula scelerisque tristique.&lt;/body&gt;
104: * &lt;/lucene:document&gt;
105: * &lt;/lucene:index&gt;
106: * </pre>
107: * </dd>
108: * </dl>
109: *
110: * @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko</a>
111: * @author <a href="mailto:conal@nzetc.org">Conal Tuohy</a>
112: * @version $Id: LuceneIndexTransformer.java 433543 2006-08-22 06:22:54Z crossley $
113: */
114: public class LuceneIndexTransformer extends AbstractTransformer
115: implements CacheableProcessingComponent, Configurable,
116: Contextualizable {
117:
// Sitemap settings. For each setting, *_CONFIG names the child element read from
// the component configuration in configure(), *_PARAMETER names the transform
// parameter read in setup(), and *_DEFAULT is the value configure() falls back to.
118: public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
119: public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
120: public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
121: public static final String DIRECTORY_CONFIG = "directory";
122: public static final String DIRECTORY_PARAMETER = "directory";
123: public static final String DIRECTORY_DEFAULT = "index";
124: public static final String MERGE_FACTOR_CONFIG = "merge-factor";
125: public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
126: public static final int MERGE_FACTOR_DEFAULT = 20;
127: public static final String MAX_FIELD_LENGTH_CONFIG = "max-field-length";
128: public static final String MAX_FIELD_LENGTH_PARAMETER = "max-field-length";
129: public static final int MAX_FIELD_LENGTH_DEFAULT = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
130:
// Namespace, element and attribute names recognised in the input document
// (matched in startElement() and endElement()).
131: public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
132: public static final String LUCENE_QUERY_ELEMENT = "index";
133: public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
134: public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
135: public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
136: public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
137: public static final String LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE = "max-field-length";
138: public static final String LUCENE_DOCUMENT_ELEMENT = "document";
139: public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
// lucene:text-attr - when this attribute is present on an element inside a
// lucene:document, endElement() also appends the values of the element's
// non-lucene attributes to the indexed text.
140: public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
// lucene:store - when this attribute is present (its value is not inspected),
// endElement() adds the element text with Field.Text instead of Field.UnStored.
141: public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
// Name of the attribute endElement() adds to every lucene:document element it
// passes downstream; its value is the elapsed time in milliseconds.
142: public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
// SAX attribute type used for the generated elapsed-time attribute.
143: public static final String CDATA = "CDATA";
144:
145: // The 3 states of the state machine
146: private static final int STATE_GROUND = 0; // initial or "ground" state
147: private static final int STATE_QUERY = 1; // processing a lucene:index (Query) element
148: private static final int STATE_DOCUMENT = 2; // processing a lucene:document element
149:
150: // Initialization time variables
// Set in contextualize() from Constants.CONTEXT_WORK_DIR; relative index
// directories are resolved against it in openWriter() and openReader().
151: protected File workDir = null;
152:
153: // Declaration time parameters values (specified in sitemap component config)
154: private IndexerConfiguration configureConfiguration;
155: // Invocation time parameters values (specified in sitemap transform parameters)
156: private IndexerConfiguration setupConfiguration;
157: // Parameters specified in the input document
158: private IndexerConfiguration queryConfiguration;
159:
160: // Runtime variables
// Current state of the SAX state machine (one of the STATE_* constants).
161: private int processing;
// True when the index is to be (re)created rather than updated; set from the
// "create" attribute and forced to true when no usable index is found.
162: private boolean createIndex = false;
// Writer currently open on the index, or null when none is open.
163: private IndexWriter writer;
// Text of the current lucene:document, accumulated in characters() and endElement().
164: private StringBuffer bodyText;
// Lucene document being built for the current lucene:document element.
165: private Document bodyDocument;
// Value of the url attribute of the current lucene:document element.
166: private String bodyDocumentURL;
// Holds one IndexHelperField per open element inside the current lucene:document.
167: private Stack elementStack = new Stack();
168: /**
169: * Storage for the document element's attributes until the document
170: * has been indexed, so that they can be copied to the output
171: * along with a boolean <code>indexed</code> attribute.
172: */
173: private AttributesImpl documentAttributes;
// System.currentTimeMillis() taken when the current lucene:document element started.
174: private long documentStartTime;
175:
176: private static String uid(String url) {
177: return url.replace('/', '\u0000'); // + "\u0000" + DateField.timeToString(urlConnection.getLastModified());
178: }
179:
180: /**
181: * Configure the transformer. The configuration parameters are stored as
182: * general defaults, which may be over-ridden by parameters specified as
183: * parameters in the sitemap pipeline, or by attributes of the query
184: * element(s) in the XML input document.
185: */
186: public void configure(Configuration conf)
187: throws ConfigurationException {
188: this .configureConfiguration = new IndexerConfiguration(conf
189: .getChild(ANALYZER_CLASSNAME_CONFIG).getValue(
190: ANALYZER_CLASSNAME_DEFAULT), conf.getChild(
191: DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT), conf
192: .getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(
193: MERGE_FACTOR_DEFAULT), conf.getChild(
194: MAX_FIELD_LENGTH_CONFIG).getValueAsInteger(
195: MAX_FIELD_LENGTH_DEFAULT));
196: }
197:
198: /**
199: * Setup the transformer.
200: * Called when the pipeline is assembled.
201: * The parameters are those specified as child elements of the
202: * <code><map:transform></code> element in the sitemap.
203: * These parameters are optional:
204: * If no parameters are specified here then the defaults are
205: * supplied by the component configuration.
206: * Any parameters specified here may be over-ridden by attributes
207: * of the lucene:index element in the input document.
208: */
209: public void setup(SourceResolver resolver, Map objectModel,
210: String src, Parameters parameters)
211: throws ProcessingException, SAXException, IOException {
212: setupConfiguration = new IndexerConfiguration(parameters
213: .getParameter(ANALYZER_CLASSNAME_PARAMETER,
214: configureConfiguration.analyzerClassname),
215: parameters.getParameter(DIRECTORY_PARAMETER,
216: configureConfiguration.indexDirectory),
217: parameters.getParameterAsInteger(
218: MERGE_FACTOR_PARAMETER,
219: configureConfiguration.mergeFactor), parameters
220: .getParameterAsInteger(
221: MAX_FIELD_LENGTH_PARAMETER,
222: configureConfiguration.maxFieldLength));
223: }
224:
225: /**
226: * Contextualize this class
227: */
228: public void contextualize(Context context) throws ContextException {
229: this .workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
230: }
231:
232: public void recycle() {
233: this .processing = STATE_GROUND;
234: if (this .writer != null) {
235: try {
236: this .writer.close();
237: } catch (IOException ioe) {
238: }
239: this .writer = null;
240: }
241: this .bodyText = null;
242: this .bodyDocument = null;
243: this .bodyDocumentURL = null;
244: this .elementStack.clear();
245: super .recycle();
246: }
247:
248: /**
249: * Generate the unique key.
250: * This key must be unique inside the space of this component.
251: *
252: * @return The generated key
253: */
254: public Serializable getKey() {
255: return "1";
256: }
257:
258: /**
259: * Generate the validity object.
260: *
261: * @return The generated validity object or <code>null</code> if the
262: * component is currently not cacheable.
263: */
264: public SourceValidity getValidity() {
265: return NOPValidity.SHARED_INSTANCE;
266: }
267:
268: public void startDocument() throws SAXException {
269: super .startDocument();
270: }
271:
272: public void endDocument() throws SAXException {
273: super .endDocument();
274: }
275:
276: /**
277: * Begin the scope of a prefix-URI Namespace mapping.
278: *
279: * @param prefix The Namespace prefix being declared.
280: * @param uri The Namespace URI the prefix is mapped to.
281: */
282: public void startPrefixMapping(String prefix, String uri)
283: throws SAXException {
284: if (processing == STATE_GROUND) {
285: super .startPrefixMapping(prefix, uri);
286: }
287: }
288:
289: /**
290: * End the scope of a prefix-URI mapping.
291: *
292: * @param prefix The prefix that was being mapping.
293: */
294: public void endPrefixMapping(String prefix) throws SAXException {
295: if (processing == STATE_GROUND) {
296: super .endPrefixMapping(prefix);
297: }
298: }
299:
300: public void startElement(String namespaceURI, String localName,
301: String qName, Attributes atts) throws SAXException {
302:
303: if (processing == STATE_GROUND) {
304: if (LUCENE_URI.equals(namespaceURI)
305: && LUCENE_QUERY_ELEMENT.equals(localName)) {
306: String sCreate = atts
307: .getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
308: createIndex = BooleanUtils.toBoolean(sCreate);
309:
310: String analyzerClassname = atts
311: .getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
312: String indexDirectory = atts
313: .getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
314: String mergeFactor = atts
315: .getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
316: String maxFieldLength = atts
317: .getValue(LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE);
318:
319: queryConfiguration = new IndexerConfiguration(
320: analyzerClassname != null ? analyzerClassname
321: : setupConfiguration.analyzerClassname,
322: indexDirectory != null ? indexDirectory
323: : setupConfiguration.indexDirectory,
324: mergeFactor != null ? Integer
325: .parseInt(mergeFactor)
326: : setupConfiguration.mergeFactor,
327: maxFieldLength != null ? Integer
328: .parseInt(maxFieldLength)
329: : setupConfiguration.maxFieldLength);
330:
331: if (!createIndex) {
332: // Not asked to create the index - but check if this is necessary anyway:
333: try {
334: IndexReader reader = openReader();
335: reader.close();
336: } catch (IOException ioe) {
337: // couldn't open the index - so recreate it
338: createIndex = true;
339: }
340: }
341: // propagate the lucene:index to the next stage in the pipeline
342: super
343: .startElement(namespaceURI, localName, qName,
344: atts);
345: processing = STATE_QUERY;
346: } else {
347: super
348: .startElement(namespaceURI, localName, qName,
349: atts);
350: }
351: } else if (processing == STATE_QUERY) {
352: // processing a lucene:index - expecting a lucene:document
353: if (LUCENE_URI.equals(namespaceURI)
354: && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
355: this .bodyDocumentURL = atts
356: .getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
357: if (this .bodyDocumentURL == null) {
358: throw new SAXException(
359: "<lucene:document> must have @url attribute");
360: }
361:
362: // Remember the time the document indexing began
363: this .documentStartTime = System.currentTimeMillis();
364: // remember these attributes so they can be passed on to the next stage in the pipeline,
365: // when this document element is ended.
366: this .documentAttributes = new AttributesImpl(atts);
367: this .bodyText = new StringBuffer();
368: this .bodyDocument = new Document();
369: this .elementStack.clear();
370: processing = STATE_DOCUMENT;
371: } else {
372: throw new SAXException(
373: "<lucene:index> element can contain only <lucene:document> elements!");
374: }
375: } else if (processing == STATE_DOCUMENT) {
376: elementStack.push(new IndexHelperField(localName,
377: new AttributesImpl(atts)));
378: }
379: }
380:
381: public void endElement(String namespaceURI, String localName,
382: String qName) throws SAXException {
383:
384: if (processing == STATE_QUERY) {
385: if (LUCENE_URI.equals(namespaceURI)
386: && LUCENE_QUERY_ELEMENT.equals(localName)) {
387: // End query processing
388: try {
389: if (this .writer == null) {
390: openWriter();
391: }
392: this .writer.optimize();
393: this .writer.close();
394: this .writer = null;
395: } catch (IOException e) {
396: throw new SAXException(e);
397: }
398: // propagate the query element to the next stage in the pipeline
399: super .endElement(namespaceURI, localName, qName);
400: this .processing = STATE_GROUND;
401: } else {
402: throw new SAXException("</lucene:index> was expected!");
403: }
404: } else if (processing == STATE_DOCUMENT) {
405: if (LUCENE_URI.equals(namespaceURI)
406: && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
407: // End document processing
408: this .bodyDocument.add(Field.UnStored(
409: LuceneXMLIndexer.BODY_FIELD, this .bodyText
410: .toString()));
411: this .bodyText = null;
412:
413: this .bodyDocument.add(Field.UnIndexed(
414: LuceneXMLIndexer.URL_FIELD,
415: this .bodyDocumentURL));
416: // store: false, index: true, tokenize: false
417: this .bodyDocument.add(new Field(
418: LuceneXMLIndexer.UID_FIELD,
419: uid(this .bodyDocumentURL), false, true, false));
420: try {
421: reindexDocument();
422: } catch (IOException e) {
423: throw new SAXException(e);
424: }
425: this .bodyDocumentURL = null;
426:
427: // propagate the lucene:document element to the next stage in the pipeline
428: long elapsedTime = System.currentTimeMillis()
429: - this .documentStartTime;
430: //documentAttributes = new AttributesImpl();
431: this .documentAttributes.addAttribute("",
432: LUCENE_ELAPSED_TIME_ATTRIBUTE,
433: LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA, String
434: .valueOf(elapsedTime));
435: super .startElement(namespaceURI, localName, qName,
436: this .documentAttributes);
437: super .endElement(namespaceURI, localName, qName);
438: this .processing = STATE_QUERY;
439: } else {
440: // End element processing
441: IndexHelperField tos = (IndexHelperField) elementStack
442: .pop();
443: StringBuffer text = tos.getText();
444:
445: Attributes atts = tos.getAttributes();
446: boolean attributesToText = atts.getIndex(LUCENE_URI,
447: LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
448: for (int i = 0; i < atts.getLength(); i++) {
449: // Ignore Lucene attributes
450: if (LUCENE_URI.equals(atts.getURI(i))) {
451: continue;
452: }
453:
454: String atts_lname = atts.getLocalName(i);
455: String atts_value = atts.getValue(i);
456: bodyDocument.add(Field.UnStored(localName + "@"
457: + atts_lname, atts_value));
458: if (attributesToText) {
459: text.append(atts_value);
460: text.append(' ');
461: bodyText.append(atts_value);
462: bodyText.append(' ');
463: }
464: }
465:
466: boolean store = atts.getIndex(LUCENE_URI,
467: LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
468: if (text != null && text.length() > 0) {
469: if (store) {
470: bodyDocument.add(Field.Text(localName, text
471: .toString()));
472: } else {
473: bodyDocument.add(Field.UnStored(localName, text
474: .toString()));
475: }
476: }
477: }
478: } else {
479: // All other tags
480: super .endElement(namespaceURI, localName, qName);
481: }
482: }
483:
484: public void characters(char[] ch, int start, int length)
485: throws SAXException {
486:
487: if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0
488: && length > 1 && elementStack.size() > 0) {
489: String text = new String(ch, start, length);
490: ((IndexHelperField) elementStack.peek()).append(text);
491: bodyText.append(text);
492: bodyText.append(' ');
493: } else if (processing == STATE_GROUND) {
494: super .characters(ch, start, length);
495: }
496: }
497:
498: private void openWriter() throws IOException {
499: File indexDirectory = new File(
500: queryConfiguration.indexDirectory);
501: if (!indexDirectory.isAbsolute()) {
502: indexDirectory = new File(workDir,
503: queryConfiguration.indexDirectory);
504: }
505:
506: // If the index directory doesn't exist, then always create it.
507: boolean indexExists = IndexReader.indexExists(indexDirectory);
508: if (!indexExists) {
509: createIndex = true;
510: }
511:
512: // Get the index directory, creating it if necessary
513: Directory directory = LuceneCocoonHelper.getDirectory(
514: indexDirectory, createIndex);
515: Analyzer analyzer = LuceneCocoonHelper
516: .getAnalyzer(queryConfiguration.analyzerClassname);
517: this .writer = new IndexWriter(directory, analyzer, createIndex);
518: this .writer.mergeFactor = queryConfiguration.mergeFactor;
519: this .writer.maxFieldLength = queryConfiguration.maxFieldLength;
520: }
521:
522: private IndexReader openReader() throws IOException {
523: File indexDirectory = new File(
524: queryConfiguration.indexDirectory);
525: if (!indexDirectory.isAbsolute()) {
526: indexDirectory = new File(workDir,
527: queryConfiguration.indexDirectory);
528: }
529: Directory directory = LuceneCocoonHelper.getDirectory(
530: indexDirectory, createIndex);
531: IndexReader reader = IndexReader.open(directory);
532: return reader;
533: }
534:
535: private void reindexDocument() throws IOException {
536: if (this .createIndex) {
537: // The index is being created, so there's no need to delete the doc from an existing index.
538: // This means we can keep a single IndexWriter open throughout the process.
539: if (this .writer == null) {
540: openWriter();
541: }
542: this .writer.addDocument(this .bodyDocument);
543: } else {
544: // This is an incremental reindex, so the document should be removed from the index before adding it
545: try {
546: IndexReader reader = openReader();
547: reader.delete(new Term(LuceneXMLIndexer.UID_FIELD,
548: uid(this .bodyDocumentURL)));
549: reader.close();
550: } catch (IOException e) { /* ignore */
551: }
552: openWriter();
553: this .writer.addDocument(this .bodyDocument);
554: this .writer.close();
555: this .writer = null;
556: }
557: this .bodyDocument = null;
558: }
559:
560: static class IndexHelperField {
561: String localName;
562: StringBuffer text;
563: Attributes attributes;
564:
565: IndexHelperField(String localName, Attributes atts) {
566: this .localName = localName;
567: this .attributes = atts;
568: this .text = new StringBuffer();
569: }
570:
571: public Attributes getAttributes() {
572: return attributes;
573: }
574:
575: public StringBuffer getText() {
576: return text;
577: }
578:
579: public void append(String text) {
580: this .text.append(text);
581: }
582:
583: public void append(char[] str, int offset, int length) {
584: this .text.append(str, offset, length);
585: }
586: }
587:
588: static class IndexerConfiguration {
589: String analyzerClassname;
590: String indexDirectory;
591: int mergeFactor;
592: int maxFieldLength;
593:
594: public IndexerConfiguration(String analyzerClassname,
595: String indexDirectory, int mergeFactor,
596: int maxFieldLength) {
597: this.analyzerClassname = analyzerClassname;
598: this.indexDirectory = indexDirectory;
599: this.mergeFactor = mergeFactor;
600: this.maxFieldLength = maxFieldLength;
601: }
602: }
603: }
|