001: package org.apache.lucene.ant;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Analyzer;
021: import org.apache.lucene.analysis.StopAnalyzer;
022: import org.apache.lucene.analysis.SimpleAnalyzer;
023: import org.apache.lucene.analysis.WhitespaceAnalyzer;
024: import org.apache.lucene.analysis.standard.StandardAnalyzer;
025: import org.apache.lucene.document.Document;
026: import org.apache.lucene.document.Field;
027: import org.apache.lucene.document.DateTools;
028: import org.apache.lucene.index.IndexWriter;
029: import org.apache.lucene.index.Term;
030: import org.apache.lucene.search.Hits;
031: import org.apache.lucene.search.IndexSearcher;
032: import org.apache.lucene.search.Searcher;
033: import org.apache.lucene.search.TermQuery;
034: import org.apache.tools.ant.BuildException;
035: import org.apache.tools.ant.DirectoryScanner;
036: import org.apache.tools.ant.DynamicConfigurator;
037: import org.apache.tools.ant.Project;
038: import org.apache.tools.ant.Task;
039: import org.apache.tools.ant.types.FileSet;
040: import org.apache.tools.ant.types.EnumeratedAttribute;
041:
042: import java.io.File;
043: import java.io.IOException;
044: import java.util.Date;
045: import java.util.Properties;
046: import java.util.Map;
047: import java.util.HashMap;
048: import java.util.Set;
049: import java.util.ArrayList;
050: import java.text.ParseException;
051:
052: /**
053: * Ant task to index files with Lucene
054: *
055: *@author Erik Hatcher
056: */
057: public class IndexTask extends Task {
058: /**
059: * file list
060: */
061: private ArrayList filesets = new ArrayList();
062:
063: /**
064: * overwrite index?
065: */
066: private boolean overwrite = false;
067:
068: /**
069: * index path
070: */
071: private File indexDir;
072:
073: /**
074: * document handler classname
075: */
076: private String handlerClassName = FileExtensionDocumentHandler.class
077: .getName();
078:
079: /**
080: * document handler instance
081: */
082: private DocumentHandler handler;
083:
084: /**
085: *
086: */
087: private String analyzerClassName = StandardAnalyzer.class.getName();
088:
089: /**
090: * analyzer instance
091: */
092: private Analyzer analyzer;
093:
094: /**
095: * Lucene merge factor
096: */
097: private int mergeFactor = 20;
098:
099: private HandlerConfig handlerConfig;
100:
101: private boolean useCompoundIndex = true;
102:
103: /**
104: * Creates new instance
105: */
106: public IndexTask() {
107: }
108:
109: /**
110: * Specifies the directory where the index will be stored
111: */
112: public void setIndex(File indexDir) {
113: this .indexDir = indexDir;
114: }
115:
116: /**
117: * Sets the mergeFactor attribute of the IndexTask object
118: *
119: *@param mergeFactor The new mergeFactor value
120: */
121: public void setMergeFactor(int mergeFactor) {
122: this .mergeFactor = mergeFactor;
123: }
124:
125: /**
126: * Sets the overwrite attribute of the IndexTask object
127: *
128: *@param overwrite The new overwrite value
129: */
130: public void setOverwrite(boolean overwrite) {
131: this .overwrite = overwrite;
132: }
133:
134: /**
135: * If creating a new index and this is set to true, the
136: * index will be created in compound format.
137: */
138: public void setUseCompoundIndex(boolean useCompoundIndex) {
139: this .useCompoundIndex = useCompoundIndex;
140: }
141:
142: /**
143: * Sets the documentHandler attribute of the IndexTask object
144: *
145: *@param classname The new documentHandler value
146: */
147: public void setDocumentHandler(String classname) {
148: handlerClassName = classname;
149: }
150:
151: /**
152: * Sets the analyzer based on the builtin Lucene analyzer types.
153: *
154: * @todo Enforce analyzer and analyzerClassName to be mutually exclusive
155: */
156: public void setAnalyzer(AnalyzerType type) {
157: analyzerClassName = type.getClassname();
158: }
159:
160: public void setAnalyzerClassName(String classname) {
161: analyzerClassName = classname;
162: }
163:
164: /**
165: * Adds a set of files (nested fileset attribute).
166: *
167: *@param set FileSet to be added
168: */
169: public void addFileset(FileSet set) {
170: filesets.add(set);
171: }
172:
173: /**
174: * Sets custom properties for a configurable document handler.
175: */
176: public void addConfig(HandlerConfig config) throws BuildException {
177: if (handlerConfig != null) {
178: throw new BuildException("Only one config element allowed");
179: }
180:
181: handlerConfig = config;
182: }
183:
184: /**
185: * Begins the indexing
186: *
187: *@exception BuildException If an error occurs indexing the
188: * fileset
189: */
190: public void execute() throws BuildException {
191:
192: // construct handler and analyzer dynamically
193: try {
194: Class clazz = Class.forName(handlerClassName);
195: handler = (DocumentHandler) clazz.newInstance();
196:
197: clazz = Class.forName(analyzerClassName);
198: analyzer = (Analyzer) clazz.newInstance();
199: } catch (ClassNotFoundException cnfe) {
200: throw new BuildException(cnfe);
201: } catch (InstantiationException ie) {
202: throw new BuildException(ie);
203: } catch (IllegalAccessException iae) {
204: throw new BuildException(iae);
205: }
206:
207: log("Document handler = " + handler.getClass(),
208: Project.MSG_VERBOSE);
209: log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);
210:
211: if (handler instanceof ConfigurableDocumentHandler) {
212: ((ConfigurableDocumentHandler) handler)
213: .configure(handlerConfig.getProperties());
214: }
215:
216: try {
217: indexDocs();
218: } catch (IOException e) {
219: throw new BuildException(e);
220: }
221: }
222:
223: /**
224: * Index the fileset.
225: *
226: *@exception IOException if Lucene I/O exception
227: *@todo refactor!!!!!
228: */
229: private void indexDocs() throws IOException {
230: Date start = new Date();
231:
232: boolean create = overwrite;
233: // If the index directory doesn't exist,
234: // create it and force create mode
235: if (indexDir.mkdirs() && !overwrite) {
236: create = true;
237: }
238:
239: Searcher searcher = null;
240: boolean checkLastModified = false;
241: if (!create) {
242: try {
243: searcher = new IndexSearcher(indexDir.getAbsolutePath());
244: checkLastModified = true;
245: } catch (IOException ioe) {
246: log("IOException: " + ioe.getMessage());
247: // Empty - ignore, which indicates to index all
248: // documents
249: }
250: }
251:
252: log("checkLastModified = " + checkLastModified,
253: Project.MSG_VERBOSE);
254:
255: IndexWriter writer = new IndexWriter(indexDir, analyzer, create);
256:
257: writer.setUseCompoundFile(useCompoundIndex);
258: int totalFiles = 0;
259: int totalIndexed = 0;
260: int totalIgnored = 0;
261: try {
262: writer.setMergeFactor(mergeFactor);
263:
264: for (int i = 0; i < filesets.size(); i++) {
265: FileSet fs = (FileSet) filesets.get(i);
266: if (fs != null) {
267: DirectoryScanner ds = fs
268: .getDirectoryScanner(getProject());
269: String[] dsfiles = ds.getIncludedFiles();
270: File baseDir = ds.getBasedir();
271:
272: for (int j = 0; j < dsfiles.length; j++) {
273: File file = new File(baseDir, dsfiles[j]);
274: totalFiles++;
275:
276: if (!file.exists() || !file.canRead()) {
277: throw new BuildException(
278: "File \""
279: + file.getAbsolutePath()
280: + "\" does not exist or is not readable.");
281: }
282:
283: boolean indexIt = true;
284:
285: if (checkLastModified) {
286: Term pathTerm = new Term("path", file
287: .getPath());
288: TermQuery query = new TermQuery(pathTerm);
289: Hits hits = searcher.search(query);
290:
291: // if document is found, compare the
292: // indexed last modified time with the
293: // current file
294: // - don't index if up to date
295: if (hits.length() > 0) {
296: Document doc = hits.doc(0);
297: String indexModified = doc.get(
298: "modified").trim();
299: if (indexModified != null) {
300: long lastModified = 0;
301: try {
302: lastModified = DateTools
303: .stringToTime(indexModified);
304: } catch (ParseException e) {
305: // if modified time is not parsable, skip
306: }
307: if (lastModified == file
308: .lastModified()) {
309: // TODO: remove existing document
310: indexIt = false;
311: }
312: }
313: }
314: }
315:
316: if (indexIt) {
317: try {
318: log("Indexing " + file.getPath(),
319: Project.MSG_VERBOSE);
320: Document doc = handler
321: .getDocument(file);
322:
323: if (doc == null) {
324: totalIgnored++;
325: } else {
326: // Add the path of the file as a field named "path". Use a Keyword field, so
327: // that the index stores the path, and so that the path is searchable
328: doc.add(new Field("path", file
329: .getPath(),
330: Field.Store.YES,
331: Field.Index.UN_TOKENIZED));
332:
333: // Add the last modified date of the file a field named "modified". Use a
334: // Keyword field, so that it's searchable, but so that no attempt is made
335: // to tokenize the field into words.
336: doc
337: .add(new Field(
338: "modified",
339: DateTools
340: .timeToString(
341: file
342: .lastModified(),
343: DateTools.Resolution.MILLISECOND),
344: Field.Store.YES,
345: Field.Index.UN_TOKENIZED));
346:
347: writer.addDocument(doc);
348: totalIndexed++;
349: }
350: } catch (DocumentHandlerException e) {
351: throw new BuildException(e);
352: }
353: }
354: }
355: // for j
356: }
357: // if (fs != null)
358: }
359: // for i
360:
361: writer.optimize();
362: }
363: //try
364: finally {
365: // always make sure everything gets closed,
366: // no matter how we exit.
367: writer.close();
368: if (searcher != null) {
369: searcher.close();
370: }
371: }
372:
373: Date end = new Date();
374:
375: log(totalIndexed + " out of " + totalFiles + " indexed ("
376: + totalIgnored + " ignored) in "
377: + (end.getTime() - start.getTime()) + " milliseconds");
378: }
379:
380: public static class HandlerConfig implements DynamicConfigurator {
381: Properties props = new Properties();
382:
383: public void setDynamicAttribute(String attributeName,
384: String value) throws BuildException {
385: props.setProperty(attributeName, value);
386: }
387:
388: public Object createDynamicElement(String elementName)
389: throws BuildException {
390: throw new BuildException("Sub elements not supported");
391: }
392:
393: public Properties getProperties() {
394: return props;
395: }
396: }
397:
398: public static class AnalyzerType extends EnumeratedAttribute {
399: private static Map analyzerLookup = new HashMap();
400:
401: static {
402: analyzerLookup
403: .put("simple", SimpleAnalyzer.class.getName());
404: analyzerLookup.put("standard", StandardAnalyzer.class
405: .getName());
406: analyzerLookup.put("stop", StopAnalyzer.class.getName());
407: analyzerLookup.put("whitespace", WhitespaceAnalyzer.class
408: .getName());
409: }
410:
411: /**
412: * @see EnumeratedAttribute#getValues
413: */
414: public String[] getValues() {
415: Set keys = analyzerLookup.keySet();
416: return (String[]) keys.toArray(new String[0]);
417: }
418:
419: public String getClassname() {
420: return (String) analyzerLookup.get(getValue());
421: }
422: }
423: }
|