/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
017: package org.apache.cocoon.transformation;
018:
019: import java.io.File;
020: import java.io.IOException;
021: import java.io.Serializable;
022: import java.util.Map;
023: import java.util.Stack;
024:
025: import org.apache.avalon.framework.configuration.Configurable;
026: import org.apache.avalon.framework.configuration.Configuration;
027: import org.apache.avalon.framework.configuration.ConfigurationException;
028: import org.apache.avalon.framework.context.Context;
029: import org.apache.avalon.framework.context.ContextException;
030: import org.apache.avalon.framework.context.Contextualizable;
031: import org.apache.avalon.framework.parameters.Parameters;
032: import org.apache.avalon.framework.service.ServiceException;
033: import org.apache.avalon.framework.service.ServiceManager;
034: import org.apache.avalon.framework.service.Serviceable;
035: import org.apache.cocoon.Constants;
036: import org.apache.cocoon.ProcessingException;
037: import org.apache.cocoon.caching.CacheableProcessingComponent;
038: import org.apache.cocoon.components.search.IndexException;
039: import org.apache.cocoon.components.search.LuceneCocoonHelper;
040: import org.apache.cocoon.components.search.LuceneXMLIndexer;
041: import org.apache.cocoon.components.search.components.Indexer;
042: import org.apache.cocoon.environment.SourceResolver;
043: import org.apache.commons.lang.BooleanUtils;
044: import org.apache.excalibur.source.SourceValidity;
045: import org.apache.excalibur.source.impl.validity.NOPValidity;
046: import org.apache.lucene.analysis.Analyzer;
047: import org.apache.lucene.document.Document;
048: import org.apache.lucene.document.Field;
049: import org.apache.lucene.index.IndexReader;
050: import org.apache.lucene.store.Directory;
051: import org.xml.sax.Attributes;
052: import org.xml.sax.SAXException;
053: import org.xml.sax.helpers.AttributesImpl;
054:
055: /**
056: * A lucene index creation transformer.
057: * <p>
058: * See <a
059: * href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer
060: * </a> documentation on the Cocoon Wiki.
061: * </p>
062: * <p>
063: * TODO: Write more documentation.
064: * </p>
065: *
066: * @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko </a>
067: * @author <a href="mailto:conal@nzetc.org">Conal Tuohy </a>
068: * @author Nicolas Maisonneuve
069: */
070: public class LuceneIndexTransformerOptimized extends
071: AbstractTransformer implements CacheableProcessingComponent,
072: Configurable, Contextualizable, Serviceable {
073:
074: public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
075:
076: public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
077:
078: public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
079:
080: public static final String DIRECTORY_CONFIG = "directory";
081:
082: public static final String DIRECTORY_PARAMETER = "directory";
083:
084: public static final String DIRECTORY_DEFAULT = "index";
085:
086: public static final String MERGE_FACTOR_CONFIG = "merge-factor";
087:
088: public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
089:
090: public static final int MERGE_FACTOR_DEFAULT = 20;
091:
092: public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
093:
094: public static final String LUCENE_QUERY_ELEMENT = "index";
095:
096: public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
097:
098: public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
099:
100: public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
101:
102: public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
103:
104: public static final String LUCENE_DOCUMENT_ELEMENT = "document";
105:
106: public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
107:
108: public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
109:
110: public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
111:
112: public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
113:
114: public static final String CDATA = "CDATA";
115:
116: // The 3 states of the state machine
117: private static final int STATE_GROUND = 0; // initial or "ground" state
118:
119: private static final int STATE_QUERY = 1; // processing a lucene:index
120: // (Query) element
121:
122: private static final int STATE_DOCUMENT = 2; // processing a
123: // lucene:document element
124:
125: // Initialization time variables
126: protected File workDir = null;
127:
128: // service manager
129: private ServiceManager manager;
130:
131: private Indexer indexer;
132:
133: // Declaration time parameters values (specified in sitemap component
134: // config)
135: private IndexerConfiguration configureConfiguration;
136:
137: // Invocation time parameters values (specified in sitemap transform
138: // parameters)
139: private IndexerConfiguration setupConfiguration;
140:
141: // Parameters specified in the input document
142: private IndexerConfiguration queryConfiguration;
143:
144: // Runtime variables
145: private int processing;
146:
147: private boolean createIndex = false;
148:
149: private StringBuffer bodyText;
150:
151: private Document bodyDocument;
152:
153: private String bodyDocumentURL;
154:
155: private Stack elementStack = new Stack();
156:
157: /**
158: * Storage for the document element's attributes until the document has been
159: * indexed, so that they can be copied to the output along with a boolean
160: * <code>indexed</code> attribute.
161: */
162: private AttributesImpl documentAttributes;
163:
164: private long documentStartTime;
165:
166: private static String uid(String url) {
167: return url.replace('/', '\u0000'); // + "\u0000" +
168: // DateField.timeToString(urlConnection.getLastModified());
169: }
170:
171: public void service(ServiceManager manager) throws ServiceException {
172: this .manager = manager;
173: }
174:
175: /**
176: * Configure the transformer. The configuration parameters are stored as
177: * general defaults, which may be over-ridden by parameters specified as
178: * parameters in the sitemap pipeline, or by attributes of the query
179: * element(s) in the XML input document.
180: */
181: public void configure(Configuration conf)
182: throws ConfigurationException {
183: this .configureConfiguration = new IndexerConfiguration(conf
184: .getChild(ANALYZER_CLASSNAME_CONFIG).getValue(
185: ANALYZER_CLASSNAME_DEFAULT), conf.getChild(
186: DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT), conf
187: .getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(
188: MERGE_FACTOR_DEFAULT));
189: }
190:
191: /**
192: * Setup the transformer. Called when the pipeline is assembled. The
193: * parameters are those specified as child elements of the
194: * <code><map:transform></code> element in the sitemap. These
195: * parameters are optional: If no parameters are specified here then the
196: * defaults are supplied by the component configuration. Any parameters
197: * specified here may be over-ridden by attributes of the lucene:index
198: * element in the input document.
199: */
200: public void setup(SourceResolver resolver, Map objectModel,
201: String src, Parameters parameters)
202: throws ProcessingException, SAXException, IOException {
203: setupConfiguration = new IndexerConfiguration(parameters
204: .getParameter(ANALYZER_CLASSNAME_PARAMETER,
205: configureConfiguration.analyzerClassname),
206: parameters.getParameter(DIRECTORY_PARAMETER,
207: configureConfiguration.indexDirectory),
208: parameters.getParameterAsInteger(
209: MERGE_FACTOR_PARAMETER,
210: configureConfiguration.mergeFactor));
211: }
212:
213: /**
214: * Contextualize this class
215: */
216: public void contextualize(Context context) throws ContextException {
217: this .workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
218: }
219:
220: public void recycle() {
221: this .processing = STATE_GROUND;
222: if (this .indexer != null) {
223: manager.release(indexer);
224: indexer = null;
225: }
226:
227: this .bodyText = null;
228: this .bodyDocument = null;
229: this .bodyDocumentURL = null;
230: this .elementStack.clear();
231: super .recycle();
232: }
233:
234: /**
235: * Generate the unique key. This key must be unique inside the space of this
236: * component.
237: *
238: * @return The generated key
239: */
240: public Serializable getKey() {
241: return "1";
242: }
243:
244: /**
245: * Generate the validity object.
246: *
247: * @return The generated validity object or <code>null</code> if the
248: * component is currently not cacheable.
249: */
250: public SourceValidity getValidity() {
251: return NOPValidity.SHARED_INSTANCE;
252: }
253:
254: public void startDocument() throws SAXException {
255: super .startDocument();
256: }
257:
258: public void endDocument() throws SAXException {
259: super .endDocument();
260: }
261:
262: /**
263: * Begin the scope of a prefix-URI Namespace mapping.
264: *
265: * @param prefix
266: * The Namespace prefix being declared.
267: * @param uri
268: * The Namespace URI the prefix is mapped to.
269: */
270: public void startPrefixMapping(String prefix, String uri)
271: throws SAXException {
272: if (processing == STATE_GROUND) {
273: super .startPrefixMapping(prefix, uri);
274: }
275: }
276:
277: /**
278: * End the scope of a prefix-URI mapping.
279: *
280: * @param prefix
281: * The prefix that was being mapping.
282: */
283: public void endPrefixMapping(String prefix) throws SAXException {
284: if (processing == STATE_GROUND) {
285: super .endPrefixMapping(prefix);
286: }
287: }
288:
289: public void startElement(String namespaceURI, String localName,
290: String qName, Attributes atts) throws SAXException {
291:
292: if (processing == STATE_GROUND) {
293: if (LUCENE_URI.equals(namespaceURI)
294: && LUCENE_QUERY_ELEMENT.equals(localName)) {
295: String sCreate = atts
296: .getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
297: createIndex = BooleanUtils.toBoolean(sCreate);
298:
299: String analyzerClassname = atts
300: .getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
301: String indexDirectory = atts
302: .getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
303: String mergeFactor = atts
304: .getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
305:
306: queryConfiguration = new IndexerConfiguration(
307: analyzerClassname != null ? analyzerClassname
308: : setupConfiguration.analyzerClassname,
309: indexDirectory != null ? indexDirectory
310: : setupConfiguration.indexDirectory,
311: mergeFactor != null ? Integer
312: .parseInt(mergeFactor)
313: : setupConfiguration.mergeFactor);
314:
315: // propagate the lucene:index to the next stage in the pipeline
316: super
317: .startElement(namespaceURI, localName, qName,
318: atts);
319: processing = STATE_QUERY;
320: } else {
321: super
322: .startElement(namespaceURI, localName, qName,
323: atts);
324: }
325: } else if (processing == STATE_QUERY) {
326: // processing a lucene:index - expecting a lucene:document
327: if (LUCENE_URI.equals(namespaceURI)
328: && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
329: this .bodyDocumentURL = atts
330: .getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
331: if (this .bodyDocumentURL == null) {
332: throw new SAXException(
333: "<lucene:document> must have @url attribute");
334: }
335:
336: // Remember the time the document indexing began
337: this .documentStartTime = System.currentTimeMillis();
338: // remember these attributes so they can be passed on to the
339: // next stage in the pipeline,
340: // when this document element is ended.
341: this .documentAttributes = new AttributesImpl(atts);
342: this .bodyText = new StringBuffer();
343: this .bodyDocument = new Document();
344: this .elementStack.clear();
345: processing = STATE_DOCUMENT;
346: } else {
347: throw new SAXException(
348: "<lucene:index> element can contain only <lucene:document> elements!");
349: }
350: } else if (processing == STATE_DOCUMENT) {
351: elementStack.push(new IndexHelperField(localName,
352: new AttributesImpl(atts)));
353: }
354: }
355:
356: public void endElement(String namespaceURI, String localName,
357: String qName) throws SAXException {
358:
359: if (processing == STATE_QUERY) {
360: if (LUCENE_URI.equals(namespaceURI)
361: && LUCENE_QUERY_ELEMENT.equals(localName)) {
362: // propagate the query element to the next stage in the pipeline
363: super .endElement(namespaceURI, localName, qName);
364: this .processing = STATE_GROUND;
365: } else {
366: throw new SAXException("</lucene:index> was expected!");
367: }
368: } else if (processing == STATE_DOCUMENT) {
369: if (LUCENE_URI.equals(namespaceURI)
370: && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
371: // End document processing
372: this .bodyDocument.add(Field.UnStored(
373: LuceneXMLIndexer.BODY_FIELD, this .bodyText
374: .toString()));
375: this .bodyText = null;
376:
377: this .bodyDocument.add(Field.UnIndexed(
378: LuceneXMLIndexer.URL_FIELD,
379: this .bodyDocumentURL));
380: // store: false, index: true, tokenize: false
381: this .bodyDocument.add(new Field(
382: LuceneXMLIndexer.UID_FIELD,
383: uid(this .bodyDocumentURL), false, true, false));
384: try {
385: reindexDocument();
386: } catch (IndexException e) {
387: throw new SAXException(e);
388: }
389: this .bodyDocumentURL = null;
390:
391: // propagate the lucene:document element to the next stage in
392: // the pipeline
393: long elapsedTime = System.currentTimeMillis()
394: - this .documentStartTime;
395: // documentAttributes = new AttributesImpl();
396: this .documentAttributes.addAttribute("",
397: LUCENE_ELAPSED_TIME_ATTRIBUTE,
398: LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA, String
399: .valueOf(elapsedTime));
400: super .startElement(namespaceURI, localName, qName,
401: this .documentAttributes);
402: super .endElement(namespaceURI, localName, qName);
403: this .processing = STATE_QUERY;
404: } else {
405: // End element processing
406: IndexHelperField tos = (IndexHelperField) elementStack
407: .pop();
408: StringBuffer text = tos.getText();
409:
410: Attributes atts = tos.getAttributes();
411: boolean attributesToText = atts.getIndex(LUCENE_URI,
412: LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
413: for (int i = 0; i < atts.getLength(); i++) {
414: // Ignore Lucene attributes
415: if (LUCENE_URI.equals(atts.getURI(i))) {
416: continue;
417: }
418:
419: String atts_lname = atts.getLocalName(i);
420: String atts_value = atts.getValue(i);
421: bodyDocument.add(Field.UnStored(localName + "@"
422: + atts_lname, atts_value));
423: if (attributesToText) {
424: text.append(atts_value);
425: text.append(' ');
426: bodyText.append(atts_value);
427: bodyText.append(' ');
428: }
429: }
430:
431: boolean store = atts.getIndex(LUCENE_URI,
432: LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
433: if (text != null && text.length() > 0) {
434: if (store) {
435: bodyDocument.add(Field.Text(localName, text
436: .toString()));
437: } else {
438: bodyDocument.add(Field.UnStored(localName, text
439: .toString()));
440: }
441: }
442: }
443: } else {
444: // All other tags
445: super .endElement(namespaceURI, localName, qName);
446: }
447: }
448:
449: public void characters(char[] ch, int start, int length)
450: throws SAXException {
451:
452: if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0
453: && length > 1 && elementStack.size() > 0) {
454: String text = new String(ch, start, length);
455: ((IndexHelperField) elementStack.peek()).append(text);
456: bodyText.append(text);
457: bodyText.append(' ');
458: } else if (processing == STATE_GROUND) {
459: super .characters(ch, start, length);
460: }
461: }
462:
463: private void openWriter() throws IndexException {
464: getLogger().debug(
465: "use luceneIndexTransformer with indexer component");
466: // lookup the indexer
467: try {
468: indexer = (Indexer) this .manager.lookup(Indexer.ROLE
469: + "/default");
470: } catch (ServiceException e) {
471: throw new IndexException(e);
472: }
473:
474: File indexDirectory = new File(
475: queryConfiguration.indexDirectory);
476: if (!indexDirectory.isAbsolute()) {
477: indexDirectory = new File(workDir,
478: queryConfiguration.indexDirectory);
479: }
480: // If the index directory doesn't exist, then always create it.
481: boolean indexExists = IndexReader.indexExists(indexDirectory);
482: if (!indexExists) {
483: createIndex = true;
484: }
485: // Get the index directory, creating it if necessary
486: try {
487: Directory directory = LuceneCocoonHelper.getDirectory(
488: indexDirectory, createIndex);
489: indexer.setIndex(directory);
490: } catch (IOException e) {
491: throw new IndexException("set directory " + indexDirectory
492: + " error", e);
493: }
494: // Get the analyzer
495: Analyzer analyzer = LuceneCocoonHelper
496: .getAnalyzer(queryConfiguration.analyzerClassname);
497: indexer.setAnalyzer(analyzer);
498:
499: this .indexer.setMergeFactor(queryConfiguration.mergeFactor);
500: if (this .createIndex) {
501: this .indexer.clearIndex();
502: }
503: }
504:
505: private void reindexDocument() throws IndexException {
506: // The index is being created, so there's no need to delete the doc from
507: // an existing index.
508: // This means we can keep a single IndexWriter open throughout the
509: // process.
510: if (this .indexer == null) {
511: openWriter();
512: }
513: this .indexer.index(this .bodyDocument);
514: this .bodyDocument = null;
515: }
516:
517: class IndexHelperField {
518: String localName;
519:
520: StringBuffer text;
521:
522: Attributes attributes;
523:
524: IndexHelperField(String localName, Attributes atts) {
525: this .localName = localName;
526: this .attributes = atts;
527: this .text = new StringBuffer();
528: }
529:
530: public Attributes getAttributes() {
531: return attributes;
532: }
533:
534: public StringBuffer getText() {
535: return text;
536: }
537:
538: public void append(String text) {
539: this .text.append(text);
540: }
541:
542: public void append(char[] str, int offset, int length) {
543: this .text.append(str, offset, length);
544: }
545: }
546:
547: class IndexerConfiguration {
548: String analyzerClassname;
549:
550: String indexDirectory;
551:
552: int mergeFactor;
553:
554: public IndexerConfiguration(String analyzerClassname,
555: String indexDirectory, int mergeFactor) {
556: this.analyzerClassname = analyzerClassname;
557: this.indexDirectory = indexDirectory;
558: this.mergeFactor = mergeFactor;
559: }
560: }
561:
562: }
|