001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.components.search;
018:
019: import org.apache.lucene.document.Document;
020: import org.apache.lucene.document.Field;
021: import org.xml.sax.Attributes;
022: import org.xml.sax.ContentHandler;
023: import org.xml.sax.Locator;
024: import org.xml.sax.helpers.AttributesImpl;
025:
026: import java.util.ArrayList;
027: import java.util.HashSet;
028: import java.util.Iterator;
029: import java.util.List;
030: import java.util.Stack;
031:
032: /**
033: * Parse XML and generate lucene document(s)
034: *
035: * can now be configured via SimpleLuceneXMLIndexerImpl
036: * to store specific tags in Lucene, so that you can
037: * display them with hits.
038: *
039: * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
040: * @author <a href="mailto:jeremy@apache.org">Jeremy Quinn</a>
041: * @version CVS $Id: LuceneIndexContentHandler.java 433543 2006-08-22 06:22:54Z crossley $
042: */
043: public class LuceneIndexContentHandler implements ContentHandler {
044: public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
045:
046: /**
047: * If this attribute is specified on element, values of all attributes
048: * are added to the text of the element, and to the document
049: * body text
050: */
051: public static final String LUCENE_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
052:
053: StringBuffer bodyText;
054: private List documents;
055: private Document bodyDocument;
056: private Stack elementStack;
057: private HashSet fieldTags;
058:
059: /**
060: * Constructor for the LuceneIndexContentHandler object
061: */
062: public LuceneIndexContentHandler() {
063: this .bodyText = new StringBuffer();
064: this .bodyDocument = new Document();
065: this .documents = new ArrayList();
066: this .documents.add(this .bodyDocument);
067: this .elementStack = new Stack();
068: this .fieldTags = new HashSet();
069: }
070:
071: /**
072: * Sets the fieldTags attribute of the LuceneIndexContentHandler object
073: *
074: * @param fieldTags The new fieldTags value
075: */
076: public void setFieldTags(HashSet fieldTags) {
077: this .fieldTags = fieldTags;
078: }
079:
080: /**
081: * Sets the documentLocator attribute of the LuceneIndexContentHandler object
082: *
083: * @param locator The new documentLocator value
084: */
085: public void setDocumentLocator(Locator locator) {
086: }
087:
088: public List allDocuments() {
089: return documents;
090: }
091:
092: public Iterator iterator() {
093: return documents.iterator();
094: }
095:
096: public void characters(char[] ch, int start, int length) {
097: if (ch.length > 0 && start >= 0 && length > 1) {
098: if (elementStack.size() > 0) {
099: IndexHelperField tos = (IndexHelperField) elementStack
100: .peek();
101: tos.appendText(ch, start, length);
102: }
103: bodyText.append(' ');
104: bodyText.append(ch, start, length);
105: }
106: }
107:
108: public void endDocument() {
109: bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD,
110: bodyText.toString()));
111: }
112:
113: public void endElement(String namespaceURI, String localName,
114: String qName) {
115: IndexHelperField tos = (IndexHelperField) elementStack.pop();
116: String lname = tos.getLocalFieldName();
117: StringBuffer text = tos.getText();
118:
119: // (VG): Atts are never null, see startElement
120: Attributes atts = tos.getAttributes();
121: boolean attributesToText = atts.getIndex(LUCENE_URI,
122: LUCENE_ATTR_TO_TEXT_ATTRIBUTE) != -1;
123: for (int i = 0; i < atts.getLength(); i++) {
124: if (LUCENE_URI.equals(atts.getURI(i)))
125: continue;
126:
127: String atts_lname = atts.getLocalName(i);
128: String atts_value = atts.getValue(i);
129: bodyDocument.add(Field.UnStored(lname + "@" + atts_lname,
130: atts_value));
131: if (attributesToText) {
132: text.append(' ');
133: text.append(atts_value);
134: bodyText.append(' ');
135: bodyText.append(atts_value);
136: }
137: }
138:
139: if (text != null && text.length() > 0) {
140: if (isFieldTag(lname)) {
141: bodyDocument.add(Field
142: .UnIndexed(lname, text.toString()));
143: }
144: bodyDocument.add(Field.UnStored(lname, text.toString()));
145: }
146: }
147:
148: public void endPrefixMapping(String prefix) {
149: }
150:
151: public void ignorableWhitespace(char[] ch, int start, int length) {
152: }
153:
154: public void processingInstruction(String target, String data) {
155: }
156:
157: public void skippedEntity(String name) {
158: }
159:
160: public void startDocument() {
161: }
162:
163: public void startElement(String namespaceURI, String localName,
164: String qName, Attributes atts) {
165: IndexHelperField ihf = new IndexHelperField(localName, qName,
166: new AttributesImpl(atts));
167: elementStack.push(ihf);
168: }
169:
170: public void startPrefixMapping(String prefix, String uri) {
171: }
172:
173: /**
174: * check if tag is a candidate for making into a Field
175: *
176: * @param tag local name of the tag we are processing
177: * @return boolean
178: */
179: private boolean isFieldTag(String tag) {
180: // by default do not make field
181: if (fieldTags == null) {
182: return false;
183: }
184: Iterator i = fieldTags.iterator();
185: while (i.hasNext()) {
186: if (tag.equals(i.next())) {
187: return true;
188: }
189: }
190: return false;
191: }
192: }
|