001: /*
002: * Copyright 2005 Hippo Webworks.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package nl.hippo.slide.index;
017:
018: import java.io.IOException;
019:
020: import org.apache.avalon.framework.configuration.Configuration;
021: import org.apache.avalon.framework.configuration.ConfigurationException;
022: import org.apache.lucene.document.Document;
023: import org.apache.lucene.index.Term;
024: import org.apache.slide.common.NamespaceAccessToken;
025: import org.apache.slide.common.SlideException;
026: import org.apache.slide.common.Uri;
027: import org.apache.slide.content.NodeRevisionContent;
028: import org.apache.slide.content.NodeRevisionDescriptor;
029: import org.apache.slide.content.RevisionNotFoundException;
030: import org.apache.slide.extractor.ContentExtractor;
031: import org.apache.slide.extractor.ExtractorException;
032: import org.apache.slide.extractor.ExtractorManager;
033: import org.apache.slide.index.lucene.Index;
034: import org.apache.slide.index.lucene.IndexConfiguration;
035: import org.apache.slide.search.IndexException;
036:
037: /**
038: * @author <a href="mailto:m.pfingsthorn@hippo.nl">Max Pfingsthorn</a>
039: */
040: public class SubLuceneIndexerDASL extends SubLuceneIndexer {
041:
042: // ---------------------------------------------------- Constants
043:
044: public static final String DEFAULTPROPERTYANALYZER_ELEMENT = "default-property-analyzer";
045: public static final String DEFAULTPROPERTYANALYZER_ATTR = "class";
046: public static final String DEFAULTPROPERTYANALYZER_ELEMENT_DEFAULT = "org.apache.lucene.analysis.SimpleAnalyzer";
047:
048: public static final String CASESENSITIVE_ELEMENT = "case-sensitive";
049: public static final boolean CASESENSITIVE_ELEMENT_DEFAULT = false;
050:
051: public static final String INDEXALL_ELEMENT = "index-all";
052: public static final String STORE_ELEMENT = "store-properties";
053:
054: public static final String PROPERTIES_ELEMENT = "properties";
055: private static final String PROPERTY_ANALYZER_INITIALIZE = "initialize";
056: public static final String PROPERTY_ELEMENT = "property";
057: public static final String PROPERTY_NAMESPACE_ATTR = "namespace";
058: public static final String PROPERTY_NAME_ATTR = "name";
059: public static final String PROPERTY_TYPE_ATTR = "type";
060: public static final String PROPERTY_ANALYZER_ATTR = "analyzer";
061: public static final String PROPERTY_DEFINED_ATTR = "support-defined";
062:
063: public static final String PROPERTY_TYPE_ATTR_DEFAULT = "text";
064: public static final String PROPERTY_NAMESPACE_ATTR_DEFAULT = "http://hippo.nl/cms/1.0";
065:
066: public static final String RESOURCETYPES_ELEMENT = "resource-types";
067: public static final String RESOURCETYPE_ELEMENT = "resource-type";
068: public static final String RESOURCETYPE_NAMESPACE_ATTR = "namespace";
069: public static final String RESOURCETYPE_NAME_ATTR = "name";
070:
071: // ---------------------------------------------------- Instance variables
072:
073: protected IndexConfiguration config = new IndexConfiguration();
074: protected Index index = null;
075:
076: // ---------------------------------------------------- Lifecycle
077:
078: public SubLuceneIndexerDASL(NamespaceAccessToken namespace) {
079: super (namespace);
080:
081: }
082:
083: public Index getIndex() {
084: return index;
085: }
086:
087: public void configure(Configuration configuration)
088: throws ConfigurationException {
089: super .configure(configuration);
090:
091: Configuration props = configuration
092: .getChild(PROPERTIES_ELEMENT);
093:
094: config.setContentAnalyzer(m_analyzer);
095: config.setIndexPath(m_indexpath);
096:
097: m_analyzer = config.getAnalyzer();
098:
099: if (null != configuration.getChild(INDEXALL_ELEMENT)) {
100: // indexAll will alays be set to true according the above, even when there is no INDEXALL_ELEMENT
101: // due to legacy I will keep it this way. It should be like below with STORE_ELEMENT
102: // [Ard Schrijvers]
103: config.setIndexAll(true);
104: } else {
105: config.setIndexAll(false);
106: }
107:
108: if (null != configuration.getChild(STORE_ELEMENT, false)) {
109: config.setStoreAll(true);
110: } else {
111: config.setStoreAll(false);
112: }
113:
114: String defaultPropAnalyzer = null;
115: if (null != configuration
116: .getChild(DEFAULTPROPERTYANALYZER_ELEMENT))
117: defaultPropAnalyzer = configuration.getChild(
118: DEFAULTPROPERTYANALYZER_ELEMENT).getAttribute(
119: DEFAULTPROPERTYANALYZER_ATTR,
120: DEFAULTPROPERTYANALYZER_ELEMENT_DEFAULT);
121:
122: config
123: .setDefaultPropertyAnalyzer(getAnalyzer(defaultPropAnalyzer));
124:
125: config.setCaseSensitive(configuration.getChild(
126: CASESENSITIVE_ELEMENT).getValueAsBoolean(
127: CASESENSITIVE_ELEMENT_DEFAULT));
128:
129: if (props != null) {
130: Configuration[] prop = props.getChildren(PROPERTY_ELEMENT);
131: for (int i = 0; i < prop.length; i++) {
132: String namespace = prop[i].getAttribute(
133: PROPERTY_NAMESPACE_ATTR,
134: PROPERTY_NAMESPACE_ATTR_DEFAULT);
135: String name = prop[i].getAttribute(PROPERTY_NAME_ATTR,
136: null);
137: String type = prop[i].getAttribute(PROPERTY_TYPE_ATTR,
138: PROPERTY_TYPE_ATTR_DEFAULT);
139: String analyzer = prop[i].getAttribute(
140: PROPERTY_ANALYZER_ATTR, defaultPropAnalyzer);
141: String defined = prop[i].getAttribute(
142: PROPERTY_DEFINED_ATTR, null);
143: boolean clsInitialize = prop[i].getAttributeAsBoolean(
144: PROPERTY_ANALYZER_INITIALIZE, false);
145:
146: if (namespace == null || namespace.equals("")
147: || name == null || name.equals("")) {
148: throw new ConfigurationException(
149: "Specify all attributes ('namespace' and 'name') in 'property'!",
150: configuration);
151: }
152:
153: if (getLogger().isDebugEnabled()) {
154: getLogger().debug(
155: "Added property '" + namespace + "':'"
156: + name + "' as '" + type + "'.");
157: }
158:
159: //propertiesToIndex.put(getNamespacedPropertyName(namespace,name),as);
160:
161: if (type == null || type.equals("")
162: || type.equals("string")) //default
163: {
164: config.addStringProperty(namespace, name);
165: /*
166: * beside adding the StringProperty, also add a tokenized text property for
167: * searching within a property
168: */
169: config.addTextProperty(namespace,
170: IndexConfiguration.TOKENIZED_PREFIX + name,
171: getAnalyzer(analyzer));
172:
173: if (getLogger().isDebugEnabled()) {
174: getLogger().debug(
175: "Added property '" + namespace + "':'"
176: + name + "' as String!");
177: getLogger()
178: .debug(
179: "Added property '"
180: + namespace
181: + "':'"
182: + IndexConfiguration.TOKENIZED_PREFIX
183: + name + "' as Text!");
184: }
185: } else if (type.equals("text")) {
186: //this.config.
187: if (clsInitialize) {
188: config.addTextProperty(namespace, name,
189: getAnalyzer(analyzer, this .config));
190: } else {
191: config.addTextProperty(namespace, name,
192: getAnalyzer(analyzer));
193: }
194: if (getLogger().isDebugEnabled()) {
195: getLogger().debug(
196: "Added property '" + namespace + "':'"
197: + name + "' as Text!");
198: }
199: } else if (type.equals("int")) {
200: config.addIntProperty(namespace, name);
201:
202: if (getLogger().isDebugEnabled()) {
203: getLogger().debug(
204: "Added property '" + namespace + "':'"
205: + name + "' as Integer!");
206: }
207: } else if (type.equals("date")) {
208: config.addDateProperty(namespace, name);
209:
210: if (getLogger().isDebugEnabled()) {
211: getLogger().debug(
212: "Added property '" + namespace + "':'"
213: + name + "' as Date!");
214: }
215: }
216:
217: if (defined != null && defined.equals("true")) {
218: config
219: .addSupportsIsdefinedProperty(namespace,
220: name);
221:
222: if (getLogger().isDebugEnabled()) {
223: getLogger().debug(
224: "Property '" + namespace + "':'" + name
225: + "' supports defined!");
226: }
227: }
228: }
229: }
230:
231: Configuration res = configuration
232: .getChild(RESOURCETYPES_ELEMENT);
233: if (res != null) {
234: Configuration[] restypes = res
235: .getChildren(RESOURCETYPE_ELEMENT);
236: for (int i = 0; i < restypes.length; i++) {
237: String namespace = restypes[i]
238: .getAttribute(RESOURCETYPE_NAMESPACE_ATTR);
239: String name = restypes[i]
240: .getAttribute(RESOURCETYPE_NAME_ATTR);
241:
242: if (namespace == null || namespace.equals("")) {
243: throw new ConfigurationException(
244: "Empty 'namespace' element!", configuration);
245: }
246:
247: if (getLogger().isDebugEnabled()) {
248: getLogger().debug(
249: "Added resourcetype '" + namespace + "':'"
250: + name + "'.");
251: }
252:
253: config.addKnownResourceType(namespace, name);
254: }
255: }
256:
257: config.setNamespaceName(m_namespace.getName());
258:
259: try {
260: index = new Index(config, getLogger().getChildLogger(
261: "index"), "propindex");
262:
263: } catch (IndexException e) {
264: if (getLogger().isErrorEnabled()) {
265: getLogger()
266: .error("Could not open configured index!", e);
267: }
268: throw new ConfigurationException(
269: "Could not open configured index!", configuration,
270: e);
271: }
272:
273: }
274:
275: public String getNamespacedPropertyName(String namespace,
276: String propertyName) {
277: String result;
278: if (namespace == null)
279: result = propertyName;
280: else
281: result = namespace + propertyName;
282: return result;
283: }
284:
285: public int removeIndex(String uri) throws IOException,
286: SlideException {
287: switchToReader();
288:
289: if (getLogger().isDebugEnabled()) {
290: getLogger().debug("Deleting '" + uri + "' from index.");
291: }
292:
293: return reader.deleteDocuments(new Term(Index.URI_FIELD_NAME,
294: uri));
295: }
296:
297: protected int mWriteIndex(Uri uri, NodeRevisionDescriptor nrd,
298: NodeRevisionContent nrc) throws IOException, SlideException {
299: switchToWriter();
300:
301: int count = 0;
302:
303: try {
304:
305: if (getLogger().isDebugEnabled()) {
306: getLogger().debug(
307: "About to write '" + uri + "' to the index.");
308: }
309:
310: final ContentExtractor[] extractors = ExtractorManager
311: .getInstance().getContentExtractors(
312: m_namespace.getName(), nrd, uri);
313:
314: byte[] buffer = null;
315: if (nrd != null
316: && extractors != null
317: && extractors.length > 0
318: && !nrd.propertyValueContains(
319: NodeRevisionDescriptor.RESOURCE_TYPE,
320: NodeRevisionDescriptor.COLLECTION_TYPE)) {
321: buffer = nrc.getContentBytes();
322: }
323:
324: final Document doc = index.createLuceneDocument(uri
325: .toString(), nrd, buffer, extractors);
326:
327: writer.addDocument(doc);
328: count++;
329:
330: if (getLogger().isDebugEnabled()) {
331: getLogger()
332: .debug("Written '" + uri + "' to the index.");
333: }
334: } catch (RevisionNotFoundException e) {
335: // ignore, may happen in case of collections
336: getLogger().error("Error indexing " + uri, e);
337: } catch (ExtractorException e) {
338: getLogger().error("Error indexing " + uri, e);
339: } catch (Throwable t) {
340: getLogger().error("Error indexing " + uri, t);
341: }
342:
343: return count;
344:
345: }
346:
347: }
|