001: /*
002: * $Header$
003: * $Revision: 7966 $
004: * $Date: 2007-08-23 05:23:06 -0700 $
005: *
006: * ====================================================================
007: *
008: * Copyright 1999-2004 The Apache Software Foundation
009: *
010: * Licensed under the Apache License, Version 2.0 (the "License");
011: * you may not use this file except in compliance with the License.
012: * You may obtain a copy of the License at
013: *
014: * http://www.apache.org/licenses/LICENSE-2.0
015: *
016: * Unless required by applicable law or agreed to in writing, software
017: * distributed under the License is distributed on an "AS IS" BASIS,
018: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
019: * See the License for the specific language governing permissions and
020: * limitations under the License.
021: *
022: */
023: package org.apache.slide.index.lucene;
024:
025: import java.io.ByteArrayInputStream;
026: import java.io.File;
027: import java.io.IOException;
028: import java.io.Reader;
029: import java.text.DecimalFormat;
030: import java.text.SimpleDateFormat;
031: import java.util.Date;
032: import java.util.Enumeration;
033: import java.util.Iterator;
034: import java.util.LinkedList;
035: import java.util.Locale;
036: import java.util.StringTokenizer;
037:
038: import nl.hippo.slide.extractor.LanguageSpecificContentExtractor;
039: import nl.hippo.slide.index.analysis.SimpleStandardAnalyzer;
040:
041: import org.apache.avalon.framework.logger.Logger;
042: import org.apache.lucene.document.Document;
043: import org.apache.lucene.document.Field;
044: import org.apache.lucene.index.IndexReader;
045: import org.apache.lucene.index.IndexWriter;
046: import org.apache.lucene.search.BooleanQuery;
047: import org.apache.lucene.search.IndexSearcher;
048: import org.apache.lucene.store.Directory;
049: import org.apache.lucene.store.FSDirectory;
050: import org.apache.slide.content.NodeProperty;
051: import org.apache.slide.content.NodeRevisionDescriptor;
052: import org.apache.slide.content.RevisionNotFoundException;
053: import org.apache.slide.extractor.ContentExtractor;
054: import org.apache.slide.extractor.ExtractorException;
055: import org.apache.slide.search.IndexException;
056:
057: /**
058: * Wrapper for Lucene index.
059: */
060: public class Index {
061: public static final String KEY_FIELD_NAME = "SLIDE_KEY";
062: public static final String URI_FIELD_NAME = "SLIDE_URI";
063: public static final String SCOPE_FIELD_NAME = "SLIDE_SCOPE";
064: public static final String DEPTH_FIELD_NAME = "SLIDE_DEPTH";
065: public static final String VERSION_FIELD_NAME = "SLIDE_VERSION";
066: public static final String IS_DEFINED_FIELD_NAME = "SLIDE_ISDEFINED";
067: public static final String CONTENT_FIELD_NAME = "SLIDE_CONTENT";
068: public static final String NULL_FIELD_NAME = "SLIDE_NULL";
069: public static final String STRING_INDEX_DATE_FORMATE = "yyyy-MM-dd HH:mm:ss";
070:
071: public static final SimpleDateFormat DATE_INDEX_FORMAT = new SimpleDateFormat(
072: STRING_INDEX_DATE_FORMATE, Locale.UK);
073:
074: public static final DecimalFormat INT_INDEX_FORMAT = new DecimalFormat(
075: "b0000000000000000000;a0000000000000000000");
076:
077: public static final String DATE_LOWER_BOUND = new SimpleDateFormat(
078: STRING_INDEX_DATE_FORMATE, Locale.UK).format(new Date(0));
079: public static final String DATE_UPPER_BOUND = new SimpleDateFormat(
080: STRING_INDEX_DATE_FORMATE, Locale.UK).format(new Date(
081: Long.MAX_VALUE));
082: public static final String INT_LOWER_BOUND = INT_INDEX_FORMAT
083: .format(Long.MIN_VALUE);
084: public static final String INT_UPPER_BOUND = INT_INDEX_FORMAT
085: .format(Long.MAX_VALUE);
086: public static final String STRING_UPPER_BOUND = "\uffff\uffff";
087: public static final String STRING_LOWER_BOUND = "";
088:
089: protected static final String LOG_CHANNEL = Index.class.getName();
090:
091: protected IndexConfiguration configuration;
092: protected String indexName;
093: protected Logger logger;
094:
095: protected LinkedList txnQueue = new LinkedList();
096:
097: /**
098: * Counter for recently executed index jobs (insertions, deletions).
099: * Will be reseted after optimization.
100: */
101: public Index(IndexConfiguration configuration, Logger logger,
102: String name) throws IndexException {
103: this .logger = logger;
104: this .configuration = configuration;
105: this .indexName = name;
106:
107: File file = new File(this .configuration.getIndexPath());
108: if (!file.exists() && !file.mkdirs()) {
109: throw new IndexException(
110: "Error can't find or create index directory: "
111: + this .configuration.getIndexPath());
112: }
113:
114: try {
115: Directory directory = getDirectory();
116: if (IndexReader.indexExists(directory)) {
117: if (IndexReader.isLocked(directory)) {
118: IndexReader.unlock(directory);
119: }
120: } else {
121: IndexWriter writer = new IndexWriter(directory,
122: configuration.getAnalyzer(), true);
123: writer.close();
124: }
125: } catch (IOException e) {
126: throw new IndexException("Error while creating index: "
127: + this .configuration.getIndexPath(), e);
128: }
129:
130: // TODO make configurable
131: BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
132: }
133:
134: public IndexConfiguration getConfiguration() {
135: return this .configuration;
136: }
137:
138: public Logger getLogger() {
139: return this .logger;
140: }
141:
142: public IndexSearcher getSearcher() throws IOException {
143: // TODO can this be reused?
144: return new IndexSearcher(this .configuration.getIndexPath());
145: }
146:
147: public void releaseSearcher(IndexSearcher s) throws IOException {
148: s.close();
149: }
150:
151: public IndexReader getReader() throws IOException {
152: return IndexReader.open(this .configuration.getIndexPath());
153: }
154:
155: public void releaseReader(IndexReader r) throws IOException {
156: r.close();
157: }
158:
159: private Directory getDirectory() throws IOException {
160: // file system based directory
161: return FSDirectory.getDirectory(this .configuration
162: .getIndexPath(), false);
163: }
164:
165: private Field indexString(String fieldName, String value,
166: boolean storeAll) {
167: // if storeAll : store field
168: if (storeAll) {
169: return storedString(fieldName, value);
170: } else {
171: return unstoredString(fieldName, value);
172: }
173: }
174:
175: private Field indexTextField(String fieldName, String value,
176: boolean storeAll) {
177: // if storeAll : store field
178: if (storeAll) {
179: return storedTextField(fieldName, value);
180: } else {
181: return textField(fieldName, value);
182: }
183: }
184:
185: private Field unstoredString(String fieldName, String value) {
186: if (!configuration.isCaseSensitive())
187: value = value.toLowerCase();
188: // don't store, index, don't tokenize
189: return new Field(fieldName, value, Field.Store.NO,
190: Field.Index.UN_TOKENIZED);
191: }
192:
193: private Field storedString(String fieldName, String value) {
194: if (!configuration.isCaseSensitive())
195: value = value.toLowerCase();
196: // store, index, don't tokenize
197: return new Field(fieldName, value, Field.Store.YES,
198: Field.Index.UN_TOKENIZED);
199: }
200:
201: private Field unstoredNonContentString(String fieldName,
202: String value) {
203: // store book keeping info in the original case
204: // don't store, index, don't tokenize
205: return new Field(fieldName, value, Field.Store.NO,
206: Field.Index.UN_TOKENIZED);
207: }
208:
209: private Field storedNonContentString(String fieldName, String value) {
210: // store book keeping info in the original case
211: // store, index, don't tokenize
212: return new Field(fieldName, value, Field.Store.YES,
213: Field.Index.UN_TOKENIZED);
214: }
215:
216: // dont need to lowercase the text fields cause the analyzers will take care of that
217: private Field textField(String fieldName, String value) {
218: // don't store, index, tokenize
219: return new Field(fieldName, value, Field.Store.NO,
220: Field.Index.TOKENIZED);
221: }
222:
223: private Field storedTextField(String fieldName, String value) {
224: // don't store, index, tokenize
225: return new Field(fieldName, value, Field.Store.YES,
226: Field.Index.TOKENIZED);
227: }
228:
229: private Field textField(String fieldName, Reader value) {
230: // default: don't store, index, tokenize
231: return new Field(fieldName, value);
232: }
233:
234: /**
235: * Creates a lucene index document for a properties indexer.
236: * @param uri resource
237: * @param descriptor properties to be indexed
238: */
239: public Document createLuceneDocument(String uri,
240: NodeRevisionDescriptor descriptor, byte[] contentBuffer,
241: ContentExtractor[] extractors)
242: throws RevisionNotFoundException, ExtractorException {
243:
244: this .logger.debug(uri + ": creating doc!");
245:
246: Document doc = new Document();
247:
248: doc.add(unstoredNonContentString(Index.KEY_FIELD_NAME,
249: configuration.generateKey(uri, descriptor
250: .getRevisionNumber())));
251: doc.add(storedNonContentString(Index.URI_FIELD_NAME, uri));
252:
253: // scopes
254: StringTokenizer tokenizer = new StringTokenizer(uri, "/");
255: StringBuffer buffer = new StringBuffer(uri.length());
256:
257: doc.add(unstoredNonContentString(Index.SCOPE_FIELD_NAME, "/"));
258: int depth = 0;
259: for (; tokenizer.hasMoreTokens();) {
260: buffer.append("/").append(tokenizer.nextToken());
261: doc.add(unstoredNonContentString(Index.SCOPE_FIELD_NAME,
262: buffer.toString()));
263: depth++;
264: }
265: doc.add(unstoredNonContentString(Index.DEPTH_FIELD_NAME,
266: configuration.intToIndexString(depth)));
267:
268: // resource type
269: String rtype = descriptor.getResourceType();
270: for (Iterator i = configuration.knownResourceTypes(); i
271: .hasNext();) {
272: String name = (String) i.next();
273: if (rtype.indexOf(name) != -1) {
274: doc.add(unstoredNonContentString(IndexConfiguration
275: .generateFieldName(
276: NodeProperty.DEFAULT_NAMESPACE,
277: "resourcetype"), name));
278: }
279: }
280:
281: // all other properties
282: for (Enumeration e = descriptor.enumerateProperties(); e
283: .hasMoreElements();) {
284: NodeProperty property = (NodeProperty) e.nextElement();
285:
286: String p_namespace = property.getNamespace();
287: String p_name = property.getName();
288:
289: Object value = property.getValue();
290:
291: if (value == null)
292: continue;
293: if (!configuration.isIndexedProperty(p_namespace, p_name))
294: continue;
295:
296: if (configuration.isDateProperty(p_namespace, p_name)) {
297: Date date = IndexConfiguration.getDateValue(value);
298: if (date != null) {
299: doc.add(indexString(IndexConfiguration
300: .generateFieldName(property.getNamespace(),
301: property.getName()), configuration
302: .dateToIndexString(date),
303: configuration.storeAll));
304: }
305: this .logger.debug(IndexConfiguration.generateFieldName(
306: property.getNamespace(), property.getName())
307: + " is a date type!");
308: } else if (configuration.isIntProperty(p_namespace, p_name)) {
309: try {
310: doc.add(indexString(IndexConfiguration
311: .generateFieldName(property.getNamespace(),
312: property.getName()), configuration
313: .intToIndexString(Long.parseLong(value
314: .toString())),
315: configuration.storeAll));
316: this .logger.debug(IndexConfiguration
317: .generateFieldName(property.getNamespace(),
318: property.getName())
319: + " is an int type!");
320: } catch (NumberFormatException ex) {
321: // TODO log warning
322: }
323: } else if (configuration
324: .isTextProperty(p_namespace, p_name)) {
325: doc.add(indexTextField(IndexConfiguration
326: .generateFieldName(property.getNamespace(),
327: property.getName()), value.toString(),
328: configuration.storeAll));
329: this .logger.debug(IndexConfiguration.generateFieldName(
330: property.getNamespace(), property.getName())
331: + " is a text type!");
332: } else if (configuration.isStringProperty(p_namespace,
333: p_name)) {
334: doc.add(indexString(IndexConfiguration
335: .generateFieldName(property.getNamespace(),
336: property.getName()), value.toString(),
337: configuration.storeAll));
338:
339: // also add default text tokenized property for String properties for seaching
340: String fieldName = IndexConfiguration
341: .generateFieldName(property.getNamespace(),
342: IndexConfiguration.TOKENIZED_PREFIX
343: + property.getName());
344: if (configuration.getAnalyzerForField(fieldName) == null) {
345: configuration.addTextProperty(property
346: .getNamespace(),
347: IndexConfiguration.TOKENIZED_PREFIX
348: + property.getName(),
349: new SimpleStandardAnalyzer());
350: }
351: doc.add(textField(fieldName, value.toString()));
352:
353: this .logger.debug(IndexConfiguration.generateFieldName(
354: property.getNamespace(), property.getName())
355: + " is a string type!");
356: }
357: if (configuration.supportsIsDefined(p_namespace, p_name)) {
358: doc.add(unstoredNonContentString(
359: Index.IS_DEFINED_FIELD_NAME, IndexConfiguration
360: .generateFieldName(property
361: .getNamespace(), property
362: .getName())));
363: this .logger.debug(IndexConfiguration.generateFieldName(
364: property.getNamespace(), property.getName())
365: + " supports is-defined!");
366: }
367: }
368:
369: if (extractors != null && extractors.length > 0
370: && contentBuffer != null) {
371: for (int i = 0; i < extractors.length; i++) {
372: ByteArrayInputStream content = new ByteArrayInputStream(
373: contentBuffer);
374: try {
375: if (extractors[i] instanceof LanguageSpecificContentExtractor) {
376: LanguageSpecificContentExtractor lsce = (LanguageSpecificContentExtractor) extractors[i];
377: String locale = lsce.getLocale();
378: if (locale != null && !locale.equals("")) {
379: doc.add(textField(Index.CONTENT_FIELD_NAME
380: .concat(locale), extractors[i]
381: .extract(content)));
382: } else {
383: doc.add(textField(Index.CONTENT_FIELD_NAME,
384: extractors[i].extract(content)));
385: }
386: } else {
387: doc.add(textField(Index.CONTENT_FIELD_NAME,
388: extractors[i].extract(content)));
389: }
390: } catch (Exception e) {
391: this .logger
392: .warn(
393: uri
394: + " error extracting content, skipping (extractor = '"
395: + extractors[i] + "'):", e);
396: }
397: }
398:
399: this .logger
400: .debug(uri + " has content which was extracted!");
401: }
402:
403: return doc;
404: }
405:
406: }
|