001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.cocoon.transformation;
017:
018: import java.io.IOException;
019: import java.util.Map;
020:
021: import org.apache.avalon.excalibur.pool.Recyclable;
022: import org.apache.avalon.framework.configuration.Configurable;
023: import org.apache.avalon.framework.configuration.Configuration;
024: import org.apache.avalon.framework.configuration.ConfigurationException;
025: import org.apache.avalon.framework.parameters.ParameterException;
026: import org.apache.avalon.framework.parameters.Parameters;
027: import org.apache.avalon.framework.service.ServiceException;
028: import org.apache.avalon.framework.service.ServiceManager;
029: import org.apache.avalon.framework.service.Serviceable;
030: import org.apache.cocoon.ProcessingException;
031: import org.apache.cocoon.components.search.Index;
032: import org.apache.cocoon.components.search.IndexException;
033: import org.apache.cocoon.components.search.components.AnalyzerManager;
034: import org.apache.cocoon.components.search.components.IndexManager;
035: import org.apache.cocoon.components.search.components.Indexer;
036: import org.apache.cocoon.environment.ObjectModelHelper;
037: import org.apache.cocoon.environment.Request;
038: import org.apache.cocoon.environment.SourceResolver;
039: import org.apache.lenya.ac.Identifiable;
040: import org.apache.lenya.ac.User;
041: import org.apache.lenya.ac.UserManager;
042: import org.apache.lenya.cms.repository.RepositoryUtil;
043: import org.apache.lenya.cms.repository.Session;
044: import org.apache.lenya.notification.Message;
045: import org.apache.lenya.notification.NotificationUtil;
046: import org.apache.lucene.analysis.Analyzer;
047: import org.apache.lucene.document.Document;
048: import org.apache.lucene.document.Field;
049: import org.xml.sax.Attributes;
050: import org.xml.sax.SAXException;
051: import org.xml.sax.helpers.AttributesImpl;
052:
/**
 * Another Lucene index transformer. It allows:
 * <ul>
 * <li>an index function (update indexing, or add indexing if the clear
 * attribute is true)</li>
 * <li>Lucene field boosting</li>
 * <li>a delete function</li>
 * </ul>
 *
 * <p>
 * This transformer uses several Avalon components, but you can use them
 * separately:
 * </p>
 * <ul>
 * <li>AnalyzerManager: you can set up an analyzer (configurable) in the
 * analyzer_manager tag in the cocoon.xconf file</li>
 * <li>IndexManager: you can set up an index in /WEB-INF/index.xml (default
 * location, but you can specify the location in the IndexManager component
 * configuration in the cocoon.xconf file)</li>
 * <li>Indexer (2 implementations: default (with update optimization) and
 * parallel implementation for multiple CPUs)</li>
 * </ul>
 *
 * <p>
 * <strong>Example of input source:</strong>
 * </p>
 * <ul>
 * <li>To index:
 *
 * <pre>
 * &lt;lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
 *     indexid="myindex"
 *     clear="true"          (optional attribute: clear index)
 *     mergefactor="100"&gt;    (optional attribute: see Lucene doc)
 *
 *   &lt;lucene:document uid="http://myhost/myfile1.data"&gt;
 *     &lt;lucene:field name="title"&gt; sqdqsdq &lt;/lucene:field&gt;
 *     &lt;lucene:field name="description"&gt;a text bla bal blalael balbal&lt;/lucene:field&gt;
 *     &lt;lucene:field name="date"&gt;10/12/2002&lt;/lucene:field&gt;
 *   &lt;/lucene:document&gt;
 *
 *   &lt;lucene:document uid="http://myhost/myfile2.data"&gt;
 *     &lt;lucene:field name="author" boost="2"&gt;Mr Author&lt;/lucene:field&gt;
 *         (boost the field for the search (see Lucene documentation))
 *     &lt;lucene:field name="language"&gt;french&lt;/lucene:field&gt;
 *   &lt;/lucene:document&gt;
 * &lt;/lucene:index&gt;
 * </pre>
 *
 * NOTE(review): the transformer reads the <code>clear</code>,
 * <code>mergefactor</code> and <code>analyzer</code> attributes through the
 * namespace-aware SAX lookup using the lucene namespace URI, so they may have
 * to be written with the <code>lucene:</code> prefix to be picked up - please
 * confirm against the stylesheets that produce this markup.</li>
 *
 * <li>To delete:
 *
 * <pre>
 * &lt;lucene:delete indexid="myindex"&gt;
 *   &lt;lucene:document uid="http://myhost/myfile.data"/&gt;
 *   &lt;lucene:document uid="EODOED-EFE"/&gt;
 * &lt;/lucene:delete&gt;
 * </pre>
 *
 * </li>
 * </ul>
 *
 * <p>
 * <strong>Example of output source:</strong>
 * </p>
 *
 * <pre>
 * &lt;page xmlns:lucene="http://apache.org/cocoon/lucene/1.0"&gt;
 *   &lt;lucene:index&gt;
 *     &lt;lucene:document uid="http://myhost/myfile1.data"/&gt;
 *     &lt;lucene:document uid="http://myhost/myfile2.data"/&gt;
 *   &lt;/lucene:index&gt;
 *
 *   &lt;lucene:delete&gt;
 *     &lt;lucene:document uid="http://myhost/myfile1.data"/&gt;
 *     &lt;lucene:document uid="EODOED-EFE"/&gt;
 *   &lt;/lucene:delete&gt;
 * &lt;/page&gt;
 * </pre>
 *
 * @author Nicolas Maisonneuve
 */
128:
129: public class LuceneIndexTransformer2 extends AbstractTransformer
130: implements Recyclable, Serviceable, Configurable {
131:
132: public static final String DIRECTORY_DEFAULT = "index";
133:
134: public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
135:
136: public static final String LUCENE_PREXIF = "lucene";
137:
138: /**
139: * action element : index doc
140: */
141: public static final String LUCENE_INDEXING_ELEMENT = "index";
142:
143: /**
144: * action element: delete doc
145: */
146: public static final String LUCENE_DELETING_ELEMENT = "delete";
147:
148: /**
149: * index identity (see index definition file)
150: */
151: public static final String LUCENE_INDEXING_INDEXID_ATTRIBUTE = "indexid";
152:
153: /**
154: * Optional attribute: Clear index: true/false (default: false)
155: */
156: public static final String LUCENE_INDEXING_CREATE_ATTRIBUTE = "clear";
157:
158: /**
159: * Optional attribute: Analyzer identity: see analyzerManager Component
160: * (default: the analyer of the index declared in the index definition)
161: */
162: public static final String LUCENE_INDEXING_ANALYZER_ATTRIBUTE = "analyzer";
163:
164: /**
165: * Optional attribute: MergeFactor number (default 10): improve the indexing
166: * speed for large indexing (see Lucene docs)
167: */
168: public static final String LUCENE_INDEXING_MERGE_FACTOR_ATTRIBUTE = "mergefactor";
169:
170: /**
171: * Lucene document element
172: */
173: public static final String LUCENE_DOCUMENT_ELEMENT = "document";
174:
175: /**
176: * Lucene document uid field
177: */
178: public static final String LUCENE_DOCUMENT_UID_ATTRIBUTE = "uid";
179:
180: /**
181: * lucene field element
182: */
183: public static final String LUCENE_FIELD_ELEMENT = "field";
184:
185: /**
186: * lucene field name
187: */
188: public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
189:
190: /**
191: * Optional attribute: lucene field boost (see lucene docs)
192: */
193: public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";
194:
195: // The 6 states of the state machine
196: private int processing;
197:
198: public static final int NO_PROCESSING = 0;
199:
200: public static final int INDEX_PROCESS = 1;
201:
202: public static final int IN_DOCUMENT_PROCESS = 2;
203:
204: public static final int IN_FIELD_PROCESS = 4;
205:
206: public static final int DELETE_PROCESS = 5;
207:
208: public static final int DELETING_PROCESS = 6;
209:
210: // Runtime variables
211: private int mergeFactor;
212:
213: private AttributesImpl attrs = new AttributesImpl();
214:
215: private Index index;
216:
217: private Indexer indexer;
218:
219: private ServiceManager manager;
220:
221: private Document bodyDocument;
222:
223: private String uid;
224:
225: private String fieldname;
226:
227: private float fieldboost;
228:
229: private StringBuffer fieldvalue;
230:
231: private Request request;
232:
233: private String pubId;
234: private String area;
235: private String uuid;
236: private String language;
237:
238: /**
239: * Setup the transformer.
240: */
241: public void setup(SourceResolver resolver, Map objectModel,
242: String src, Parameters parameters)
243: throws ProcessingException, SAXException, IOException {
244: this .request = ObjectModelHelper.getRequest(objectModel);
245: try {
246: this .pubId = parameters.getParameter("publicationId");
247: this .area = parameters.getParameter("area");
248: this .uuid = parameters.getParameter("uuid");
249: this .language = parameters.getParameter("language");
250: } catch (ParameterException e) {
251: throw new ProcessingException(e);
252: }
253: }
254:
255: public void recycle() {
256: this .index = null;
257: this .indexer = null;
258: this .processing = NO_PROCESSING;
259: }
260:
261: public void service(ServiceManager manager) throws ServiceException {
262: this .manager = manager;
263: }
264:
265: public void startDocument() throws SAXException {
266: super .startDocument();
267: }
268:
269: public void endDocument() throws SAXException {
270: super .endDocument();
271: }
272:
273: /**
274: * Begin the scope of a prefix-URI Namespace mapping.
275: *
276: * @param prefix The Namespace prefix being declared.
277: * @param uri The Namespace URI the prefix is mapped to.
278: */
279: public void startPrefixMapping(String prefix, String uri)
280: throws SAXException {
281: if (processing == NO_PROCESSING) {
282: super .startPrefixMapping(prefix, uri);
283: }
284: }
285:
286: /**
287: * End the scope of a prefix-URI mapping.
288: *
289: * @param prefix The prefix that was being mapping.
290: */
291: public void endPrefixMapping(String prefix) throws SAXException {
292: if (processing == NO_PROCESSING) {
293: super .endPrefixMapping(prefix);
294: }
295: }
296:
297: public void startElement(String namespaceURI, String localName,
298: String qName, Attributes atts) throws SAXException {
299:
300: // getLogger().debug("START processing: "+processing+" "+localName);
301:
302: if (LUCENE_URI.equals(namespaceURI)) {
303: switch (processing) {
304:
305: case NO_PROCESSING:
306:
307: // index action
308: if (LUCENE_INDEXING_ELEMENT.equals(localName)) {
309: this .initIndexer(atts);
310: processing = INDEX_PROCESS;
311:
312: super .startElement(namespaceURI, localName, qName,
313: attrs);
314: }
315: // delete action
316: else if (LUCENE_DELETING_ELEMENT.equals(localName)) {
317: this .initIndexer(atts);
318: processing = DELETE_PROCESS;
319: super .startElement(namespaceURI, localName, qName,
320: attrs);
321: } else {
322: handleError("element " + localName + " unknown");
323: }
324: break;
325:
326: case INDEX_PROCESS:
327:
328: // new document to index
329: if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
330:
331: uid = atts.getValue(LUCENE_DOCUMENT_UID_ATTRIBUTE);
332: if (uid == null) {
333: handleError("<" + LUCENE_PREXIF + ":"
334: + LUCENE_DOCUMENT_ELEMENT
335: + "> element must contain "
336: + LUCENE_DOCUMENT_UID_ATTRIBUTE
337: + " attribute");
338: }
339: bodyDocument = index.createDocument(uid);
340: processing = IN_DOCUMENT_PROCESS;
341: } else {
342: handleError("element " + localName
343: + " is not allowed in <" + LUCENE_PREXIF
344: + ":" + LUCENE_DOCUMENT_ELEMENT
345: + "> element");
346: }
347: break;
348:
349: case DELETE_PROCESS:
350:
351: if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
352: uid = atts.getValue(LUCENE_DOCUMENT_UID_ATTRIBUTE);
353: if (uid == null) {
354: handleError("<" + LUCENE_PREXIF + ":"
355: + LUCENE_DOCUMENT_ELEMENT
356: + "> element must contain "
357: + LUCENE_DOCUMENT_UID_ATTRIBUTE
358: + " attribute");
359: }
360: processing = DELETING_PROCESS;
361: } else {
362: handleError("element " + localName
363: + " is not a <lucene:document> element");
364: }
365: break;
366:
367: case IN_DOCUMENT_PROCESS:
368: if (LUCENE_FIELD_ELEMENT.equals(localName)) {
369:
370: // set the field name
371: this .fieldname = atts
372: .getValue(LUCENE_FIELD_NAME_ATTRIBUTE);
373: if (this .fieldname == null
374: || this .fieldname.equals("")) {
375: handleError("<lucene:field> element must contain name attribut");
376: }
377:
378: // clear the text buffer
379: this .fieldvalue = new StringBuffer();
380:
381: // set boost value
382: String fieldboostS = atts
383: .getValue(LUCENE_FIELD_BOOST_ATTRIBUTE);
384: if (fieldboostS == null) {
385: fieldboost = 1.0f;
386: } else {
387: fieldboost = Float.parseFloat(fieldboostS);
388: }
389: processing = IN_FIELD_PROCESS;
390: } else {
391: handleError("<" + LUCENE_PREXIF + ":"
392: + LUCENE_FIELD_ELEMENT + " was expected!");
393: }
394: break;
395: }
396: } else {
397: // bypass
398: super .startElement(namespaceURI, localName, qName, atts);
399: }
400: }
401:
402: public void endElement(String namespaceURI, String localName,
403: String qName) throws SAXException {
404:
405: // getLogger().debug("END processing: "+processing+" "+localName);
406:
407: if (LUCENE_URI.equals(namespaceURI)) {
408: switch (processing) {
409:
410: case INDEX_PROCESS:
411: if (LUCENE_INDEXING_ELEMENT.equals(localName)) {
412: // end of the indexing -> close the indexer
413: this .closeIndexer();
414: this .processing = NO_PROCESSING;
415: super .endElement(namespaceURI, localName, qName);
416: } else {
417: handleError("</lucene:" + LUCENE_DELETING_ELEMENT
418: + " was expected!");
419: }
420: break;
421:
422: case DELETE_PROCESS:
423: if (LUCENE_DELETING_ELEMENT.equals(localName)) {
424: // end of the deleting -> close the indexer
425: this .closeIndexer();
426: this .processing = NO_PROCESSING;
427: super .endElement(namespaceURI, localName, qName);
428: } else {
429: handleError("</lucene:" + LUCENE_DELETING_ELEMENT
430: + " was expected!");
431: }
432: break;
433:
434: case IN_DOCUMENT_PROCESS:
435: if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
436: if (canIndex()) {
437: // index the document
438: try {
439: this .indexer.index(bodyDocument);
440: } catch (IndexException ex1) {
441: handleError(ex1);
442: }
443: }
444: if (this .getLogger().isDebugEnabled()) {
445: this .getLogger().debug(
446: " lucene document: "
447: + this .bodyDocument);
448: }
449: bodyDocument = null;
450: attrs.clear();
451: attrs.addAttribute(namespaceURI, "uid", "uid",
452: "CDATA", uid);
453: super .startElement(namespaceURI, localName, qName,
454: attrs);
455: super .endElement(namespaceURI, localName, qName);
456: this .processing = INDEX_PROCESS;
457: } else {
458: handleError("</lucene:" + LUCENE_DOCUMENT_ELEMENT
459: + " was expected!");
460: }
461: break;
462:
463: case DELETING_PROCESS:
464: if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
465: // delete a document
466: if (canIndex()) {
467: try {
468: indexer.del(uid);
469: } catch (IndexException ex2) {
470: handleError(ex2);
471: }
472: }
473: attrs.clear();
474: attrs.addAttribute(namespaceURI, "uid", "uid",
475: "CDATA", uid);
476: super .startElement(namespaceURI, localName, qName,
477: attrs);
478: super .endElement(namespaceURI, localName, qName);
479: this .processing = DELETE_PROCESS;
480: } else {
481: handleError("</lucene:" + LUCENE_DOCUMENT_ELEMENT
482: + " was expected!");
483: }
484: break;
485:
486: case IN_FIELD_PROCESS:
487: if (LUCENE_FIELD_ELEMENT.equals(localName)) {
488:
489: // create lucene field
490: Field f = null;
491: try {
492: f = index.createField(fieldname, fieldvalue
493: .toString());
494: } catch (IndexException ex) {
495: handleError(ex);
496: }
497: f.setBoost(fieldboost);
498:
499: // add field to the lucene document
500: bodyDocument.add(f);
501: processing = IN_DOCUMENT_PROCESS;
502: } else {
503: handleError("</lucene:" + LUCENE_FIELD_ELEMENT
504: + " was expected!");
505: }
506: break;
507:
508: default:
509: handleError("unknow element '" + LUCENE_FIELD_ELEMENT
510: + "'!");
511: }
512: } else {
513: super .endElement(namespaceURI, localName, qName);
514: }
515: }
516:
517: protected boolean canIndex() {
518: return this .indexer != null;
519: }
520:
521: public void characters(char[] ch, int start, int length)
522: throws SAXException {
523: if (processing == IN_FIELD_PROCESS) {
524: this .fieldvalue.append(ch, start, length);
525: } else {
526: super .characters(ch, start, length);
527: }
528:
529: }
530:
531: /**
532: * Configure the Indexer
533: *
534: * @param id the indexid
535: * @param analyzerid
536: * @param mergeF
537: * @param clear
538: * @throws SAXException
539: */
540: private void initIndexer(Attributes atts) throws SAXException {
541:
542: String id = atts.getValue(LUCENE_INDEXING_INDEXID_ATTRIBUTE);
543: String analyzerid = atts.getValue(LUCENE_URI,
544: LUCENE_INDEXING_ANALYZER_ATTRIBUTE);
545: String mergeF = atts.getValue(LUCENE_URI,
546: LUCENE_INDEXING_MERGE_FACTOR_ATTRIBUTE);
547: String clear = atts.getValue(LUCENE_URI,
548: LUCENE_INDEXING_CREATE_ATTRIBUTE);
549: attrs = new AttributesImpl(atts);
550:
551: // set the indexer
552: try {
553: IndexManager indexM = (IndexManager) manager
554: .lookup(IndexManager.ROLE);
555: index = indexM.getIndex(id);
556: if (index == null) {
557: handleError("index [" + id
558: + "] no found in the index definition");
559: }
560: indexer = index.getIndexer();
561: manager.release(indexM);
562: } catch (ServiceException ex1) {
563: handleError(ex1);
564:
565: } catch (IndexException ex3) {
566: handleError("get Indexer error for index [" + id + "]", ex3);
567: }
568:
569: // set a custum analyzer (default: the analyzer of the index)
570: if (analyzerid != null) {
571: Analyzer analyzer = null;
572: try {
573: AnalyzerManager analyzerM = (AnalyzerManager) manager
574: .lookup(IndexManager.ROLE);
575: analyzer = analyzerM.getAnalyzer(analyzerid);
576: indexer.setAnalyzer(analyzer);
577: manager.release(analyzerM);
578: } catch (ServiceException ex1) {
579: handleError(ex1);
580: } catch (ConfigurationException ex2) {
581: handleError("error setting analyzer for index [" + id
582: + "]", ex2);
583: }
584: } else {
585:
586: attrs.addAttribute(LUCENE_URI,
587: LUCENE_INDEXING_ANALYZER_ATTRIBUTE,
588: LUCENE_INDEXING_ANALYZER_ATTRIBUTE, "CDATA", index
589: .getDefaultAnalyzerID());
590: }
591:
592: if (canIndex()) {
593: // set clear mode
594: boolean new_index = (clear != null && clear.toLowerCase()
595: .equals("true")) ? true : false;
596: if (new_index) {
597: try {
598: indexer.clearIndex();
599: } catch (IndexException ex3) {
600: handleError("error clearing index", ex3);
601: }
602: }
603:
604: // set the mergeFactor
605: if (mergeF != null) {
606: int mergeFactor = Integer.parseInt(mergeF);
607: indexer.setMergeFactor(mergeFactor);
608: }
609:
610: if (this .getLogger().isDebugEnabled()) {
611: this .getLogger().debug(
612: "index " + id + " clear: " + new_index
613: + " analyzerid: " + analyzerid
614: + "mergefactor: " + mergeF);
615: }
616: }
617: }
618:
619: void handleError(String message, Exception ex) throws SAXException {
620: handleError(message + ": " + getExceptionMessage(ex));
621: }
622:
623: void handleError(Exception ex) throws SAXException {
624: handleError(getExceptionMessage(ex));
625: }
626:
627: protected String getExceptionMessage(Exception ex)
628: throws SAXException {
629: String exMsg = ex.getMessage();
630: String msg = exMsg == null ? "" : " (" + exMsg + ")";
631: return ex.getClass().getName() + msg;
632: }
633:
634: /**
635: * Handle Exception or Error
636: *
637: * @param msg
638: * @param ex
639: * @throws SAXException
640: */
641: void handleError(String msg) throws SAXException {
642: closeIndexer();
643:
644: try {
645: Session session = RepositoryUtil.getSession(this .manager,
646: this .request);
647: User sender = session.getIdentity().getUser();
648: UserManager userManager = (UserManager) sender
649: .getItemManager();
650: User recipient = userManager
651: .getUser(this .notificationRecipient);
652: Identifiable[] recipients = { recipient };
653:
654: String subject = "indexing-failed-subject";
655: String[] subjectParams = new String[0];
656: String body = "indexing-failed-body";
657: String[] bodyParams = { this .pubId, this .area, this .uuid,
658: this .language, msg };
659:
660: Message message = new Message(subject, subjectParams, body,
661: bodyParams, sender, recipients);
662: NotificationUtil.notify(this .manager, message);
663:
664: getLogger().error(
665: "Could not index document [" + this .pubId + ":"
666: + this .area + ":" + this .uuid + ":"
667: + this .language
668: + "], sent message to user ["
669: + this .notificationRecipient + "].");
670:
671: } catch (Exception e) {
672: throw new SAXException(e);
673: }
674:
675: /*
676: * if (ex == null) { // this.getLogger().error(msg); throw new
677: * SAXException(msg); } else { // this.getLogger().error(msg, ex); throw
678: * new SAXException(msg, ex); }
679: */
680: }
681:
682: /**
683: * Close the indexer
684: *
685: * @throws SAXException
686: */
687: void closeIndexer() throws SAXException {
688: if (index != null) {
689: index.releaseIndexer(indexer);
690: }
691: }
692:
693: private String notificationRecipient = null;
694:
695: public void configure(Configuration config)
696: throws ConfigurationException {
697: this .notificationRecipient = config.getChild("notify")
698: .getAttribute("user");
699: }
700:
701: }
|