001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/A_CmsVfsDocument.java,v $
003: * Date : $Date: 2008-02-27 12:05:21 $
004: * Version: $Revision: 1.18 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.search.documents;
033:
034: import org.opencms.file.CmsFile;
035: import org.opencms.file.CmsObject;
036: import org.opencms.file.CmsPropertyDefinition;
037: import org.opencms.file.CmsResource;
038: import org.opencms.file.types.I_CmsResourceType;
039: import org.opencms.main.CmsException;
040: import org.opencms.main.CmsLog;
041: import org.opencms.main.OpenCms;
042: import org.opencms.search.CmsIndexException;
043: import org.opencms.search.CmsSearchCategoryCollector;
044: import org.opencms.search.CmsSearchIndex;
045: import org.opencms.search.extractors.I_CmsExtractionResult;
046: import org.opencms.search.fields.CmsSearchField;
047: import org.opencms.search.fields.CmsSearchFieldConfiguration;
048: import org.opencms.search.fields.CmsSearchFieldMapping;
049: import org.opencms.util.CmsStringUtil;
050:
051: import java.util.ArrayList;
052: import java.util.Date;
053: import java.util.Iterator;
054: import java.util.List;
055:
056: import org.apache.commons.logging.Log;
057: import org.apache.lucene.document.DateTools;
058: import org.apache.lucene.document.Document;
059: import org.apache.lucene.document.Field;
060:
061: /**
062: * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
063: * just requires a specialized implementation of
064: * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, CmsSearchIndex)}</code>
065: * for text extraction from the binary document content.<p>
066: *
067: * @author Carsten Weinholz
068: * @author Alexander Kandzior
069: *
070: * @version $Revision: 1.18 $
071: *
072: * @since 6.0.0
073: */
074: public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {
075:
076: /** Value for "high" search priority. */
077: public static final String SEARCH_PRIORITY_HIGH_VALUE = "high";
078:
079: /** Value for "low" search priority. */
080: public static final String SEARCH_PRIORITY_LOW_VALUE = "low";
081:
082: /** Value for "maximum" search priority. */
083: public static final String SEARCH_PRIORITY_MAX_VALUE = "max";
084:
085: /** Value for "normal" search priority. */
086: public static final String SEARCH_PRIORITY_NORMAL_VALUE = "normal";
087:
088: /** The vfs prefix for document keys. */
089: public static final String VFS_DOCUMENT_KEY_PREFIX = "VFS";
090:
091: /** The log object for this class. */
092: private static final Log LOG = CmsLog
093: .getLog(A_CmsVfsDocument.class);
094:
095: /**
096: * Name of the documenttype.
097: */
098: protected String m_name;
099:
100: /** The cache used for storing extracted documents. */
101: private CmsExtractionResultCache m_cache;
102:
103: /**
104: * Creates a new instance of this lucene document factory.<p>
105: *
106: * @param name name of the documenttype
107: */
108: public A_CmsVfsDocument(String name) {
109:
110: m_name = name;
111: }
112:
113: /**
114: * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
115: *
116: * If the given <code>mimeType</code> is <code>null</code>, this indicates that the key should
117: * match all VFS resource of the given resource type regardless of the MIME type.<p>
118: *
119: * @param type the resource type name to use
120: * @param mimeType the MIME type to use
121: *
122: * @return a document factory lookup key for the given resource id / MIME type configuration
123: */
124: public static String getDocumentKey(String type, String mimeType) {
125:
126: StringBuffer result = new StringBuffer(16);
127: result.append(A_CmsVfsDocument.VFS_DOCUMENT_KEY_PREFIX);
128: result.append('_');
129: result.append(type);
130: if (mimeType != null) {
131: result.append(':');
132: result.append(mimeType);
133: }
134: return result.toString();
135: }
136:
137: /**
138: * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
139: *
140: * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, CmsSearchIndex)
141: */
142: public Document createDocument(CmsObject cms, CmsResource resource,
143: CmsSearchIndex index) throws CmsException {
144:
145: String path = cms.getRequestContext().removeSiteRoot(
146: resource.getRootPath());
147:
148: // extract the content from the resource
149: I_CmsExtractionResult content = null;
150:
151: // check if caching is enabled for this document type
152: String cacheName = null;
153: CmsExtractionResultCache cache = getCache();
154: if (cache != null) {
155: cacheName = cache.getCacheName(resource,
156: isLocaleDependend() ? index.getLocale() : null);
157: content = (I_CmsExtractionResult) cache
158: .getCacheObject(cacheName);
159: }
160:
161: if (content == null) {
162: // extraction result has not been found in the cache
163: try {
164: content = extractContent(cms, resource, index);
165: if (cache != null) {
166: // save extracted content to the cache
167: cache.saveCacheObject(cacheName, content);
168: }
169: } catch (Exception e) {
170: // text extraction failed for document - continue indexing meta information only
171: LOG.error(Messages.get().getBundle().key(
172: Messages.ERR_TEXT_EXTRACTION_1,
173: resource.getRootPath()), e);
174: }
175: }
176:
177: // create the Lucene document according to the index field configuration
178: Document document = new Document();
179: CmsSearchFieldConfiguration fieldConfiguration = index
180: .getFieldConfiguration();
181: Iterator fieldConfigs = fieldConfiguration.getFields()
182: .iterator();
183: while (fieldConfigs.hasNext()) {
184: // check all field configurations
185: CmsSearchField fieldConfig = (CmsSearchField) fieldConfigs
186: .next();
187: // generate the content for the field mappings
188: StringBuffer text = new StringBuffer();
189: Iterator mappings = fieldConfig.getMappings().iterator();
190: while (mappings.hasNext()) {
191: // walk through all mappings and check if content for this is available
192: CmsSearchFieldMapping mapping = (CmsSearchFieldMapping) mappings
193: .next();
194: String mapResult = mapping.getStringValue(cms,
195: resource, content);
196: if (mapResult != null) {
197: // content is available for the mapping
198: // append the result of the mapping to the main result
199: text.append(mapResult);
200: text.append('\n');
201: }
202: }
203: if (text.length() > 0) {
204: // content is available for this field
205: Field field = fieldConfig.createField(text.toString());
206: document.add(field);
207: }
208: }
209:
210: // now add the special OpenCms default search fields
211: String value;
212: Field field;
213: // add the category of the file (this is searched so the value can also be attached on a folder)
214: value = cms.readPropertyObject(path,
215: CmsPropertyDefinition.PROPERTY_SEARCH_CATEGORY, true)
216: .getValue();
217: if (CmsStringUtil.isNotEmpty(value)) {
218: // all categorys are internally stored lower case
219: value = value.trim().toLowerCase();
220: if (value.length() > 0) {
221: field = new Field(CmsSearchField.FIELD_CATEGORY, value,
222: Field.Store.YES, Field.Index.UN_TOKENIZED);
223: field.setBoost(0);
224: document.add(field);
225: }
226: } else {
227: // synthetic "unknown" category if no category property defined for resource
228: field = new Field(CmsSearchField.FIELD_CATEGORY,
229: CmsSearchCategoryCollector.UNKNOWN_CATEGORY,
230: Field.Store.YES, Field.Index.UN_TOKENIZED);
231: document.add(field);
232: }
233:
234: // add the document root path, optimized for use with a phrase query
235: String rootPath = CmsSearchIndex.rootPathRewrite(resource
236: .getRootPath());
237: field = new Field(CmsSearchField.FIELD_ROOT, rootPath,
238: Field.Store.YES, Field.Index.TOKENIZED);
239: // set boost of 0 to root path field, since root path should have no effect on search result score
240: field.setBoost(0);
241: document.add(field);
242: // root path is stored again in "plain" format, but not for indexing since I_CmsDocumentFactory.DOC_ROOT is used for that
243: // must be indexed as a keyword ONLY to be able to use this when deleting a resource from the index
244: document.add(new Field(CmsSearchField.FIELD_PATH, resource
245: .getRootPath(), Field.Store.YES,
246: Field.Index.UN_TOKENIZED));
247:
248: // add date of creation and last modification as keywords (for sorting)
249: field = new Field(CmsSearchField.FIELD_DATE_CREATED, DateTools
250: .dateToString(new Date(resource.getDateCreated()),
251: DateTools.Resolution.MILLISECOND),
252: Field.Store.YES, Field.Index.UN_TOKENIZED);
253: field.setBoost(0);
254: document.add(field);
255: field = new Field(CmsSearchField.FIELD_DATE_LASTMODIFIED,
256: DateTools.dateToString(new Date(resource
257: .getDateLastModified()),
258: DateTools.Resolution.MILLISECOND),
259: Field.Store.YES, Field.Index.UN_TOKENIZED);
260: field.setBoost(0);
261: document.add(field);
262:
263: // special field for VFS documents - add a marker so that the document can be identified as VFS resource
264: I_CmsResourceType type = OpenCms.getResourceManager()
265: .getResourceType(resource.getTypeId());
266: String typeName = VFS_DOCUMENT_KEY_PREFIX;
267: if (type != null) {
268: typeName = type.getTypeName();
269: }
270: document.add(new Field(CmsSearchField.FIELD_TYPE, typeName,
271: Field.Store.YES, Field.Index.UN_TOKENIZED));
272:
273: // set individual document boost factor for the search
274: float boost = CmsSearchField.BOOST_DEFAULT;
275: // note that the priority property IS searched, so you can easily flag whole folders as "high" or "low"
276: value = cms.readPropertyObject(path,
277: CmsPropertyDefinition.PROPERTY_SEARCH_PRIORITY, true)
278: .getValue();
279: if (value != null) {
280: value = value.trim().toLowerCase();
281: if (value.equals(SEARCH_PRIORITY_MAX_VALUE)) {
282: boost = 2.0f;
283: } else if (value.equals(SEARCH_PRIORITY_HIGH_VALUE)) {
284: boost = 1.5f;
285: } else if (value.equals(SEARCH_PRIORITY_LOW_VALUE)) {
286: boost = 0.5f;
287: }
288: }
289: if (boost != CmsSearchField.BOOST_DEFAULT) {
290: // set individual document boost factor if required
291: document.setBoost(boost);
292: }
293:
294: return document;
295: }
296:
297: /**
298: * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
299: */
300: public CmsExtractionResultCache getCache() {
301:
302: return m_cache;
303: }
304:
305: /**
306: * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
307: */
308: public List getDocumentKeys(List resourceTypes, List mimeTypes)
309: throws CmsException {
310:
311: ArrayList keys = new ArrayList();
312:
313: if (resourceTypes.contains("*")) {
314: ArrayList allTypes = new ArrayList();
315: for (Iterator i = OpenCms.getResourceManager()
316: .getResourceTypes().iterator(); i.hasNext();) {
317: I_CmsResourceType resourceType = (I_CmsResourceType) i
318: .next();
319: allTypes.add(resourceType.getTypeName());
320: }
321: resourceTypes = allTypes;
322: }
323:
324: try {
325: for (Iterator i = resourceTypes.iterator(); i.hasNext();) {
326:
327: String typeName = OpenCms.getResourceManager()
328: .getResourceType((String) i.next())
329: .getTypeName();
330: for (Iterator j = mimeTypes.iterator(); j.hasNext();) {
331: keys
332: .add(getDocumentKey(typeName, (String) j
333: .next()));
334: }
335: if (mimeTypes.isEmpty()) {
336: keys.add(getDocumentKey(typeName, null));
337: }
338: }
339: } catch (Exception exc) {
340: throw new CmsException(Messages.get().container(
341: Messages.ERR_CREATE_DOC_KEY_0), exc);
342: }
343:
344: return keys;
345: }
346:
347: /**
348: * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
349: */
350: public String getName() {
351:
352: return m_name;
353: }
354:
355: /**
356: * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
357: */
358: public void setCache(CmsExtractionResultCache cache) {
359:
360: m_cache = cache;
361: }
362:
363: /**
364: * Upgrades the given resource to a {@link CmsFile} with content.<p>
365: *
366: * @param cms the current users OpenCms context
367: * @param resource the resource to upgrade
368: *
369: * @return the given resource upgraded to a {@link CmsFile} with content
370: *
371: * @throws CmsException if the resource could not be read
372: * @throws CmsIndexException if the resource has no content
373: */
374: protected CmsFile readFile(CmsObject cms, CmsResource resource)
375: throws CmsException, CmsIndexException {
376:
377: CmsFile file = cms.readFile(resource);
378: if (file.getLength() <= 0) {
379: throw new CmsIndexException(Messages.get().container(
380: Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
381: }
382: return file;
383: }
384: }
|