001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.components.search;
018:
019: import org.apache.avalon.framework.configuration.Configurable;
020: import org.apache.avalon.framework.configuration.Configuration;
021: import org.apache.avalon.framework.configuration.ConfigurationException;
022: import org.apache.avalon.framework.logger.AbstractLogEnabled;
023: import org.apache.avalon.framework.service.ServiceException;
024: import org.apache.avalon.framework.service.ServiceManager;
025: import org.apache.avalon.framework.service.Serviceable;
026: import org.apache.avalon.framework.thread.ThreadSafe;
027: import org.apache.cocoon.ProcessingException;
028: import org.apache.commons.lang.StringUtils;
029: import org.apache.excalibur.xml.sax.SAXParser;
030: import org.apache.lucene.document.DateField;
031: import org.apache.lucene.document.Document;
032: import org.apache.lucene.document.Field;
033: import org.xml.sax.InputSource;
034: import org.xml.sax.SAXException;
035:
036: import java.io.IOException;
037: import java.io.InputStream;
038: import java.net.URL;
039: import java.net.URLConnection;
040: import java.util.Collections;
041: import java.util.HashSet;
042: import java.util.Iterator;
043: import java.util.List;
044:
045: /**
046: * A simple class building lucene documents from xml content.
047: *
048: * <p>It has two parameters that effect the way it works:</p>
049: * <p>
050: * <tt><store-fields/></tt>
051: * Sets which tags in your content are stored in Lucene as fields,
052: * during the indexing process. Allows them to be output with search hits.
053: * </p><p>
054: * <tt><content-view-query/></tt>
055: * Sets the view the indexer will request for indexing content.
056: * </p><p>
057: * Example configuration (goes in cocoon.xconf)
058: * <pre><tt>
059: * <lucene-xml-indexer logger="core.search.lucene">
060: * <store-fields>title, summary</store-fields>
061: * <content-view-query>cocoon-view=search</content-view-query>
062: * </lucene-xml-indexer>
063: * </tt></pre></p>
064: *
065: * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
066: * @author <a href="mailto:jeremy@apache.org">Jeremy Quinn</a>
067: * @version CVS $Id: SimpleLuceneXMLIndexerImpl.java 433543 2006-08-22 06:22:54Z crossley $
068: */
069: public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled
070: implements LuceneXMLIndexer, Configurable, Serviceable,
071: ThreadSafe {
072:
073: /**
074: * The service manager instance
075: *
076: * @since
077: */
078: protected ServiceManager manager = null;
079:
080: /**
081: * Config element name specifying query-string appendend for requesting links
082: * of an URL.
083: * <p>
084: * Its value is <code>link-view-query</code>.
085: * </p>
086: *
087: * @since
088: */
089: public final static String CONTENT_VIEW_QUERY_CONFIG = "content-view-query";
090:
091: /**
092: * append this string to the url in order to get the
093: * content view of the url
094: *
095: * @since
096: */
097:
098: final static String CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content";
099:
100: /**
101: * Config element name specifying the tags to be added as Stored, Untokenised, Unindexed Fields.
102: * <p>
103: * Its value is <code>field-tags</code>.
104: * </p>
105: *
106: * @since
107: */
108: public final static String FIELDTAGS_CONFIG = "store-fields";
109:
110: /**
111: * set of allowed content types
112: *
113: * @since
114: */
115: final HashSet allowedContentType;
116:
117: /**
118: * @since
119: */
120: public SimpleLuceneXMLIndexerImpl() {
121: allowedContentType = new HashSet();
122: allowedContentType.add("text/xml");
123: allowedContentType.add("text/xhtml");
124: fieldTags = new HashSet();
125: }
126:
127: private String contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT;
128: private HashSet fieldTags;
129:
130: /**
131: * configure
132: *
133: * @param configuration
134: * @exception ConfigurationException
135: * @since
136: */
137: public void configure(Configuration configuration)
138: throws ConfigurationException {
139:
140: Configuration[] children;
141: children = configuration.getChildren(FIELDTAGS_CONFIG);
142: if (children != null && children.length > 0) {
143: fieldTags = new HashSet();
144: for (int i = 0; i < children.length; i++) {
145: String pattern = children[i].getValue();
146: String params[] = StringUtils.split(pattern, ", ");
147: for (int index = 0; index < params.length; index++) {
148: String tokenized_pattern = params[index];
149: if (!tokenized_pattern.equals("")) {
150: this .fieldTags.add(tokenized_pattern);
151: if (getLogger().isDebugEnabled()) {
152: getLogger().debug(
153: "add field: " + tokenized_pattern);
154: }
155: }
156: }
157: }
158: } else {
159: if (getLogger().isDebugEnabled()) {
160: getLogger().debug("Do not add any fields");
161: }
162: }
163: this .contentViewQuery = configuration.getChild(
164: CONTENT_VIEW_QUERY_CONFIG, true).getValue(
165: CONTENT_VIEW_QUERY_DEFAULT);
166: if (getLogger().isDebugEnabled()) {
167: getLogger().debug("content view: " + this .contentViewQuery);
168: }
169: }
170:
171: /**
172: * Set the current <code>ServiceManager</code> instance used by this
173: * <code>Serviceable</code>.
174: *
175: * @param manager Description of Parameter
176: * @exception ServiceException Description of Exception
177: * @since
178: */
179: public void service(ServiceManager manager) throws ServiceException {
180: this .manager = manager;
181: }
182:
183: /**
184: * Build lucenen documents from a URL
185: *
186: * @param url the content of this url gets indexed.
187: * @exception ProcessingException Description of Exception
188: * @since
189: */
190: public List build(URL url) throws ProcessingException {
191:
192: try {
193: URL contentURL = new URL(url, url.getFile()
194: + ((url.getFile().indexOf("?") == -1) ? "?" : "&")
195: + contentViewQuery);
196: URLConnection contentURLConnection = contentURL
197: .openConnection();
198: if (contentURLConnection == null) {
199: throw new ProcessingException(
200: "Can not open connection to URL " + contentURL
201: + " (null connection)");
202: }
203:
204: String contentType = contentURLConnection.getContentType();
205: if (contentType == null) {
206: if (getLogger().isDebugEnabled()) {
207: getLogger().debug(
208: "Ignoring " + contentURL
209: + " (no content type)");
210: }
211:
212: return Collections.EMPTY_LIST;
213: }
214:
215: int index = contentType.indexOf(';');
216: if (index != -1) {
217: contentType = contentType.substring(0, index);
218: }
219:
220: if (allowedContentType.contains(contentType)) {
221: if (getLogger().isDebugEnabled()) {
222: getLogger().debug(
223: "Indexing " + contentURL + " ("
224: + contentType + ")");
225: }
226:
227: LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
228: luceneIndexContentHandler.setFieldTags(fieldTags);
229: indexDocument(contentURLConnection,
230: luceneIndexContentHandler);
231: //
232: // document is parsed
233: //
234: Iterator it = luceneIndexContentHandler.iterator();
235: while (it.hasNext()) {
236: Document d = (Document) it.next();
237: d.add(Field.UnIndexed(URL_FIELD, url.toString()));
238: // store ... false, index ... true, token ... false
239: d.add(new Field(UID_FIELD,
240: uid(contentURLConnection), false, true,
241: false));
242: }
243:
244: return luceneIndexContentHandler.allDocuments();
245: } else {
246: if (getLogger().isDebugEnabled()) {
247: getLogger().debug(
248: "Ignoring " + contentURL + " ("
249: + contentType + ")");
250: }
251:
252: return Collections.EMPTY_LIST;
253: }
254: } catch (IOException ioe) {
255: throw new ProcessingException("Cannot read URL " + url, ioe);
256: }
257: }
258:
259: /**
260: * index input stream producing lucene Documents
261: *
262: * @param contentURLConnection the xml content which should get indexed.
263: * @param luceneIndexContentHandler ContentHandler for generating
264: * a lucene Document from XML content.
265: * @exception ProcessingException Description of Exception
266: * @since
267: */
268: private void indexDocument(URLConnection contentURLConnection,
269: LuceneIndexContentHandler luceneIndexContentHandler)
270: throws ProcessingException {
271:
272: InputStream is = null;
273: InputSource in = null;
274: SAXParser parser = null;
275:
276: try {
277: is = contentURLConnection.getInputStream();
278: in = new InputSource(is);
279:
280: // get an XML parser
281: parser = (SAXParser) this .manager.lookup(SAXParser.ROLE);
282: //reader.setErrorHandler(new CocoonErrorHandler());
283: parser.parse(in, luceneIndexContentHandler);
284: //
285: // document is parsed
286: //
287: } catch (IOException ioe) {
288: throw new ProcessingException("Cannot read!", ioe);
289: } catch (SAXException saxe) {
290: throw new ProcessingException("Cannot parse!", saxe);
291: } catch (ServiceException se) {
292: throw new ProcessingException("Cannot lookup xml parser!",
293: se);
294: } finally {
295: if (parser != null) {
296: this .manager.release(parser);
297: }
298: }
299: }
300:
301: /**
302: * return a unique uid of a url connection
303: *
304: * @param urlConnection Description of Parameter
305: * @return String unique uid of a urlConnection
306: * @since
307: */
308: private String uid(URLConnection urlConnection) {
309: // Append path and date into a string in such a way that lexicographic
310: // sorting gives the same results as a walk of the file hierarchy. Thus
311: // null (\u0000) is used both to separate directory components and to
312: // separate the path from the date.
313: return urlConnection.toString().replace('/', '\u0000')
314: + "\u0000"
315: + DateField.timeToString(urlConnection
316: .getLastModified());
317: }
318: }
|