0001: /*
0002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/CmsSearchIndex.java,v $
0003: * Date : $Date: 2008-02-27 12:05:38 $
0004: * Version: $Revision: 1.67 $
0005: *
0006: * This library is part of OpenCms -
0007: * the Open Source Content Management System
0008: *
0009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
0010: *
0011: * This library is free software; you can redistribute it and/or
0012: * modify it under the terms of the GNU Lesser General Public
0013: * License as published by the Free Software Foundation; either
0014: * version 2.1 of the License, or (at your option) any later version.
0015: *
0016: * This library is distributed in the hope that it will be useful,
0017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
0019: * Lesser General Public License for more details.
0020: *
0021: * For further information about Alkacon Software GmbH, please see the
0022: * company website: http://www.alkacon.com
0023: *
0024: * For further information about OpenCms, please see the
0025: * project website: http://www.opencms.org
0026: *
0027: * You should have received a copy of the GNU Lesser General Public
0028: * License along with this library; if not, write to the Free Software
0029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0030: */
0031:
0032: package org.opencms.search;
0033:
0034: import org.opencms.configuration.I_CmsConfigurationParameterHandler;
0035: import org.opencms.file.CmsObject;
0036: import org.opencms.file.CmsProject;
0037: import org.opencms.file.CmsRequestContext;
0038: import org.opencms.file.CmsResource;
0039: import org.opencms.i18n.CmsLocaleManager;
0040: import org.opencms.main.CmsException;
0041: import org.opencms.main.CmsIllegalArgumentException;
0042: import org.opencms.main.CmsLog;
0043: import org.opencms.main.OpenCms;
0044: import org.opencms.search.documents.A_CmsVfsDocument;
0045: import org.opencms.search.documents.I_CmsDocumentFactory;
0046: import org.opencms.search.documents.I_CmsTermHighlighter;
0047: import org.opencms.search.fields.CmsSearchField;
0048: import org.opencms.search.fields.CmsSearchFieldConfiguration;
0049: import org.opencms.util.CmsStringUtil;
0050:
0051: import java.io.File;
0052: import java.io.IOException;
0053: import java.text.ParseException;
0054: import java.util.ArrayList;
0055: import java.util.Date;
0056: import java.util.HashMap;
0057: import java.util.Iterator;
0058: import java.util.List;
0059: import java.util.Locale;
0060: import java.util.Map;
0061: import java.util.TreeMap;
0062:
0063: import org.apache.commons.logging.Log;
0064: import org.apache.lucene.analysis.Analyzer;
0065: import org.apache.lucene.document.DateTools;
0066: import org.apache.lucene.document.Document;
0067: import org.apache.lucene.document.Field;
0068: import org.apache.lucene.index.IndexWriter;
0069: import org.apache.lucene.index.Term;
0070: import org.apache.lucene.queryParser.QueryParser;
0071: import org.apache.lucene.search.BooleanClause;
0072: import org.apache.lucene.search.BooleanQuery;
0073: import org.apache.lucene.search.Hits;
0074: import org.apache.lucene.search.IndexSearcher;
0075: import org.apache.lucene.search.PhraseQuery;
0076: import org.apache.lucene.search.Query;
0077: import org.apache.lucene.search.TermQuery;
0078:
0079: /**
0080: * Implements the search within an index and the management of the index configuration.<p>
0081: *
0082: * @author Carsten Weinholz
0083: * @author Thomas Weckert
0084: * @author Alexander Kandzior
0085: *
0086: * @version $Revision: 1.67 $
0087: *
0088: * @since 6.0.0
0089: */
0090: public class CmsSearchIndex implements
0091: I_CmsConfigurationParameterHandler {
0092:
0093: /** Constant for additional param to enable excerpt creation (default: true). */
0094: public static final String EXCERPT = CmsSearchIndex.class.getName()
0095: + ".createExcerpt";
0096:
0097: /** Constant for additional param to enable permission checks (default: true). */
0098: public static final String PERMISSIONS = CmsSearchIndex.class
0099: .getName()
0100: + ".checkPermissions";
0101:
0102: /** Constant for additional param to set the thread priority during search. */
0103: public static final String PRIORITY = CmsSearchIndex.class
0104: .getName()
0105: + ".priority";
0106:
0107: /** Automatic ("auto") index rebuild mode. */
0108: public static final String REBUILD_MODE_AUTO = "auto";
0109:
0110: /** Manual ("manual") index rebuild mode. */
0111: public static final String REBUILD_MODE_MANUAL = "manual";
0112:
0113: /**
0114: * Special root path append token for optimized path queries.<p>
0115: *
0116: * @deprecated This is no longer required since OpenCms version 7.0.2, since the implementation
0117: * of {@link CmsSearchManager#getAnalyzer(Locale)} was modified to always use
0118: * a whitespace analyzer for the {@link CmsSearchField#FIELD_ROOT} field.
0119: *
0120: * @see #rootPathRewrite(String)
0121: */
0122: public static final String ROOT_PATH_SUFFIX = "";
0123:
0124: /** Special root path start token for optimized path queries. */
0125: public static final String ROOT_PATH_TOKEN = "root";
0126:
0127: /** Constant for a field list that contains the "meta" field as well as the "content" field. */
0128: static final String[] DOC_META_FIELDS = new String[] {
0129: CmsSearchField.FIELD_META, CmsSearchField.FIELD_CONTENT };
0130:
0131: /** The log object for this class. */
0132: private static final Log LOG = CmsLog.getLog(CmsSearchIndex.class);
0133:
0134: /** The list of configured index sources. */
0135: List m_sources;
0136:
0137: /** The excerpt mode for this index. */
0138: private boolean m_createExcerpt;
0139:
0140: /** Documenttypes of folders/channels. */
0141: private Map m_documenttypes;
0142:
0143: /** The permission check mode for this index. */
0144: private boolean m_dontCheckPermissions;
0145:
0146: /** An internal enabled flag, used to disable the index if for instance the configured project does not exist. */
0147: private boolean m_enabled;
0148:
0149: /** The search field configuration of this index. */
0150: private CmsSearchFieldConfiguration m_fieldConfiguration;
0151:
0152: /** The name of the search field configuration used by this index. */
0153: private String m_fieldConfigurationName;
0154:
0155: /** The locale of this index. */
0156: private Locale m_locale;
0157:
0158: /** The name of this index. */
0159: private String m_name;
0160:
0161: /** The path where this index stores its data in the "real" file system. */
0162: private String m_path;
0163:
0164: /** The thread priority for a search. */
0165: private int m_priority;
0166:
0167: /** The project of this index. */
0168: private String m_project;
0169:
0170: /** The rebuild mode for this index. */
0171: private String m_rebuild;
0172:
0173: /** The configured sources for this index. */
0174: private List m_sourceNames;
0175:
/**
 * Creates an unnamed search index with default settings; intended only for use
 * by the XML configuration.<p>
 *
 * Other code should prefer <code>{@link #CmsSearchIndex(String)}</code>,
 * because that constructor enforces the mandatory name argument.<p>
 *
 * Defaults applied here: the index is enabled, excerpt creation is switched on,
 * no search thread priority is configured (<code>-1</code>), and the source name
 * list as well as the document type map start out empty.<p>
 */
public CmsSearchIndex() {

    // simple settings
    m_enabled = true;
    m_createExcerpt = true;
    m_priority = -1;

    // containers that are filled later on
    m_documenttypes = new HashMap();
    m_sourceNames = new ArrayList();
}
0191:
/**
 * Creates a search index that carries the given name.<p>
 *
 * @param name the system-wide unique name for the search index
 *
 * @throws CmsIllegalArgumentException if the given name is null, empty or already
 *      taken by another search index
 */
public CmsSearchIndex(String name) throws CmsIllegalArgumentException {

    // apply the defaults first, then store the name through its setter
    this();
    setName(name);
}
0208:
0209: /**
0210: * Rewrites the a resource path for use in the {@link CmsSearchField#FIELD_ROOT} field.<p>
0211: *
0212: * This is required in order to use a Lucene "phrase query" on the resource path.
0213: * Using a phrase query is much, much better for the search performance then using a straightforward
0214: * "prefix query". With a "prefix query", Lucene would interally generate a huge list of boolean sub-queries,
0215: * exactly one for every document in the VFS subtree of the query. So if you query on "/sites/default/*" on
0216: * a large OpenCms installation, this means thousands of sub-queries.
0217: * Using the "phrase query", only one (or very few) queries are internally generated, and the result
0218: * is just the same.<p>
0219: *
0220: * Since OpenCms version 7.0.2, the {@link CmsSearchField#FIELD_ROOT} field always uses a whitespace analyzer.
0221: * This is ensured by the {@link CmsSearchManager#getAnalyzer(Locale)} implementation.
0222: * The Lucene whitespace analyzer uses all words as tokens, no lower case transformation or word stemming is done.
0223: * So the root path is now just split along the '/' chars, which are replaced by simple space chars.<p>
0224: *
0225: * <i>Historical implementation sidenote:</i>
0226: * Before 7.0.2, the {@link CmsSearchField#FIELD_ROOT} used the analyzer configured by the language.
0227: * This introduced a number of issues as the language analyzer might modify the directory names, leading to potential
0228: * duplicates (e.g. <code>members/</code> and <code>member/</code> may both be trimmed to <code>member</code>),
0229: * so that the prefix search returns more or different results then expected.
0230: * This was avoided by a workaround where this method basically replaced the "/" of a path with "@o.c ".
0231: * Using this trick most Lucene analyzers left the directory names untouched,
0232: * and treated them like literal email addresses. However, this trick did not work with all analyzers,
0233: * for example the Russian analyzer does not work as expected.
0234: * An additional workaround was required to avoid problems with folders that that are different
0235: * only by the upper / lower chars. Since 7.0.2, these workarounds are not longer required, since the
0236: * {@link CmsSearchField#FIELD_ROOT} field always uses a whitespace analyzer, which is a much better solution.<p>
0237: *
0238: * @param path the path to rewrite
0239: *
0240: * @return the re-written path
0241: */
0242: public static String rootPathRewrite(String path) {
0243:
0244: StringBuffer result = new StringBuffer(256);
0245: String[] elements = rootPathSplit(path);
0246: for (int i = 0; i < elements.length; i++) {
0247: result.append(elements[i]);
0248: if ((i + 1) < elements.length) {
0249: result.append(' ');
0250: }
0251: }
0252: return result.toString();
0253: }
0254:
0255: /**
0256: * Spits the a resource path into tokens for use in the <code>{@link CmsSearchField#FIELD_ROOT}</code> field
0257: * and with the <code>{@link #rootPathRewrite(String)}</code> method.<p>
0258: *
0259: * @param path the path to split
0260: *
0261: * @return the split path
0262: *
0263: * @see #rootPathRewrite(String)
0264: */
0265: public static String[] rootPathSplit(String path) {
0266:
0267: if (CmsStringUtil.isEmpty(path)) {
0268: return new String[] { ROOT_PATH_TOKEN };
0269: }
0270:
0271: // split the path
0272: String[] elements = CmsStringUtil.splitAsArray(path, '/');
0273: String[] result = new String[elements.length + 1];
0274: result[0] = ROOT_PATH_TOKEN;
0275: System.arraycopy(elements, 0, result, 1, elements.length);
0276: return result;
0277: }
0278:
0279: /**
0280: * Adds a parameter.<p>
0281: *
0282: * @param key the key/name of the parameter
0283: * @param value the value of the parameter
0284: */
0285: public void addConfigurationParameter(String key, String value) {
0286:
0287: if (PERMISSIONS.equals(key)) {
0288: m_dontCheckPermissions = !Boolean.valueOf(value)
0289: .booleanValue();
0290: } else if (EXCERPT.equals(key)) {
0291: m_createExcerpt = Boolean.valueOf(value).booleanValue();
0292: } else if (PRIORITY.equals(key)) {
0293: m_priority = Integer.parseInt(value);
0294: if (m_priority < Thread.MIN_PRIORITY) {
0295: m_priority = Thread.MIN_PRIORITY;
0296: LOG.error(Messages.get().getBundle().key(
0297: Messages.LOG_SEARCH_PRIORITY_TOO_LOW_2, value,
0298: new Integer(Thread.MIN_PRIORITY)));
0299:
0300: } else if (m_priority > Thread.MAX_PRIORITY) {
0301: m_priority = Thread.MAX_PRIORITY;
0302: LOG.debug(Messages.get().getBundle().key(
0303: Messages.LOG_SEARCH_PRIORITY_TOO_HIGH_2, value,
0304: new Integer(Thread.MAX_PRIORITY)));
0305:
0306: }
0307: }
0308: }
0309:
0310: /**
0311: * Adds am index source to this search index.<p>
0312: *
0313: * @param sourceName the index source name to add
0314: */
0315: public void addSourceName(String sourceName) {
0316:
0317: m_sourceNames.add(sourceName);
0318: }
0319:
0320: /**
0321: * Checks is this index has been configured correctly.<p>
0322: *
0323: * In case the check fails, the <code>enabled</code> property
0324: * is set to <code>false</code>
0325: *
0326: * @param cms a OpenCms user context to perform the checks with (should have "Administrator" permissions)
0327: *
0328: * @return <code>true</code> in case the index is correctly configured and enabled after the check
0329: *
0330: * @see #isEnabled()
0331: */
0332: public boolean checkConfiguration(CmsObject cms) {
0333:
0334: if (isEnabled()) {
0335: // check if the project for the index exists
0336: try {
0337: cms.readProject(getProject());
0338: setEnabled(true);
0339: } catch (CmsException e) {
0340: // the project does not exist, disable the index
0341: setEnabled(false);
0342: if (LOG.isErrorEnabled()) {
0343: LOG
0344: .error(Messages
0345: .get()
0346: .getBundle()
0347: .key(
0348: Messages.LOG_SEARCHINDEX_CREATE_BAD_PROJECT_2,
0349: getProject(), getName()));
0350: }
0351: }
0352: } else {
0353: if (LOG.isInfoEnabled()) {
0354: LOG
0355: .info(Messages.get().getBundle().key(
0356: Messages.LOG_SEARCHINDEX_DISABLED_1,
0357: getName()));
0358: }
0359: }
0360:
0361: return isEnabled();
0362: }
0363:
0364: /**
0365: * @see java.lang.Object#equals(java.lang.Object)
0366: */
0367: public boolean equals(Object obj) {
0368:
0369: if (obj == this ) {
0370: return true;
0371: }
0372: if (obj instanceof CmsSearchIndex) {
0373: return ((CmsSearchIndex) obj).m_name.equals(m_name);
0374: }
0375: return false;
0376: }
0377:
0378: /**
0379: * @see org.opencms.configuration.I_CmsConfigurationParameterHandler#getConfiguration()
0380: */
0381: public Map getConfiguration() {
0382:
0383: Map result = new TreeMap();
0384: if (m_priority > 0) {
0385: result.put(PRIORITY, new Integer(m_priority));
0386: }
0387: if (!m_createExcerpt) {
0388: result.put(EXCERPT, Boolean.valueOf(m_createExcerpt));
0389: }
0390: if (m_dontCheckPermissions) {
0391: result.put(PERMISSIONS, Boolean
0392: .valueOf(!m_dontCheckPermissions));
0393: }
0394: return result;
0395: }
0396:
0397: /**
0398: * Returns the document type factory used for the given resource in this index, or <code>null</code>
0399: * in case the resource is not indexed by this index.<p>
0400: *
0401: * A resource is indexed if the following is all true: <ol>
0402: * <li>The index contains at last one index source matching the root path of the given resource.
0403: * <li>For this matching index source, the document type factory needed by the resource is also configured.
0404: * </ol>
0405: *
0406: * @param res the resource to check
0407: *
0408: * @return he document type factory used for the given resource in this index, or <code>null</code>
0409: * in case the resource is not indexed by this index
0410: */
0411: public I_CmsDocumentFactory getDocumentFactory(CmsResource res) {
0412:
0413: if ((res != null) && (m_sources != null)) {
0414: // the result can only be null or the type configured for the resource
0415: I_CmsDocumentFactory result = OpenCms.getSearchManager()
0416: .getDocumentFactory(res);
0417: if (result != null) {
0418: // check the path of the resource if it matches with one (or more) of the configured index sources
0419: Iterator i = m_sources.iterator();
0420: while (i.hasNext()) {
0421: CmsSearchIndexSource source = (CmsSearchIndexSource) i
0422: .next();
0423: if (source.isIndexing(res.getRootPath(), result
0424: .getName())) {
0425: // we found an index source that indexes the resource
0426: return result;
0427: }
0428: }
0429: }
0430: }
0431: return null;
0432: }
0433:
0434: /**
0435: * Returns a list of names (Strings) of configured document type factorys for the given resource path.<p>
0436: *
0437: * @param path path of the folder
0438: *
0439: * @return a list of names (Strings) of configured document type factorys for the given resource path
0440: *
0441: * @deprecated use {@link #getDocumentFactory(CmsResource)} instead to find out if this index is 'interested' in a resource
0442: */
0443: public List getDocumenttypes(String path) {
0444:
0445: List documenttypes = null;
0446: if (m_documenttypes != null) {
0447: for (Iterator i = m_documenttypes.entrySet().iterator(); i
0448: .hasNext();) {
0449: Map.Entry e = (Map.Entry) i.next();
0450: String key = (String) e.getKey();
0451: // NOTE: assumed that configured resource paths do not overlap, otherwise result is undefined
0452: if (path.startsWith(key)) {
0453: documenttypes = (List) e.getValue();
0454: break;
0455: }
0456: }
0457: }
0458: if (documenttypes == null) {
0459: documenttypes = OpenCms.getSearchManager()
0460: .getDocumentTypes();
0461: }
0462: return documenttypes;
0463: }
0464:
0465: /**
0466: * Returns the search field configuration of this index.<p>
0467: *
0468: * @return the search field configuration of this index
0469: */
0470: public CmsSearchFieldConfiguration getFieldConfiguration() {
0471:
0472: return m_fieldConfiguration;
0473: }
0474:
0475: /**
0476: * Returns the name of the field configuration used for this index.<p>
0477: *
0478: * @return the name of the field configuration used for this index
0479: */
0480: public String getFieldConfigurationName() {
0481:
0482: return m_fieldConfigurationName;
0483: }
0484:
0485: /**
0486: * Returns a new index writer for this index.<p>
0487: *
0488: * @param create if <code>true</code> a whole new index is created, if <code>false</code> an existing index is updated
0489: *
0490: * @return a new instance of IndexWriter
0491: * @throws CmsIndexException if the index can not be opened
0492: */
0493: public IndexWriter getIndexWriter(boolean create)
0494: throws CmsIndexException {
0495:
0496: IndexWriter indexWriter;
0497: Analyzer analyzer = OpenCms.getSearchManager().getAnalyzer(
0498: m_locale);
0499:
0500: try {
0501: File f = new File(m_path);
0502: if (f.exists()) {
0503: // index already exists
0504: indexWriter = new IndexWriter(m_path, analyzer, create);
0505: } else {
0506: // index does not exist yet
0507: f = f.getParentFile();
0508: if ((f != null) && !f.exists()) {
0509: // create the parent folders if required
0510: f.mkdirs();
0511: }
0512: indexWriter = new IndexWriter(m_path, analyzer, true);
0513: }
0514:
0515: } catch (Exception e) {
0516: throw new CmsIndexException(Messages.get()
0517: .container(Messages.ERR_IO_INDEX_WRITER_OPEN_2,
0518: m_path, m_name), e);
0519: }
0520:
0521: return indexWriter;
0522: }
0523:
0524: /**
0525: * Gets the langauge of this index.<p>
0526: *
0527: * @return the language of the index, i.e. de
0528: */
0529: public Locale getLocale() {
0530:
0531: return m_locale;
0532: }
0533:
0534: /**
0535: * Returns the locale of the index as a String.<p>
0536: *
0537: * @return the locale of the index as a String
0538: *
0539: * @see #getLocale()
0540: */
0541: public String getLocaleString() {
0542:
0543: return getLocale().toString();
0544: }
0545:
0546: /**
0547: * Gets the name of this index.<p>
0548: *
0549: * @return the name of the index
0550: */
0551: public String getName() {
0552:
0553: return m_name;
0554: }
0555:
0556: /**
0557: * Returns the path where this index stores it's data in the "real" file system.<p>
0558: *
0559: * @return the path where this index stores it's data in the "real" file system
0560: */
0561: public String getPath() {
0562:
0563: return m_path;
0564: }
0565:
0566: /**
0567: * Gets the project of this index.<p>
0568: *
0569: * @return the project of the index, i.e. "online"
0570: */
0571: public String getProject() {
0572:
0573: return m_project;
0574: }
0575:
0576: /**
0577: * Get the rebuild mode of this index.<p>
0578: *
0579: * @return the current rebuild mode
0580: */
0581: public String getRebuildMode() {
0582:
0583: return m_rebuild;
0584: }
0585:
0586: /**
0587: * Returns all configured sources names of this search index.<p>
0588: *
0589: * @return a list with all configured sources names of this search index
0590: */
0591: public List getSourceNames() {
0592:
0593: return m_sourceNames;
0594: }
0595:
0596: /**
0597: * Returns all configured index sources of this search index.<p>
0598: *
0599: * @return all configured index sources of this search index
0600: */
0601: public List getSources() {
0602:
0603: return m_sources;
0604: }
0605:
0606: /**
0607: * @see java.lang.Object#hashCode()
0608: */
0609: public int hashCode() {
0610:
0611: return m_name != null ? m_name.hashCode() : 0;
0612: }
0613:
0614: /**
0615: * @see org.opencms.configuration.I_CmsConfigurationParameterHandler#initConfiguration()
0616: */
0617: public void initConfiguration() {
0618:
0619: // noting to do here
0620: }
0621:
0622: /**
0623: * Initializes the search index.<p>
0624: *
0625: * @throws CmsSearchException if the index source association failed
0626: */
0627: public void initialize() throws CmsSearchException {
0628:
0629: if (!isEnabled()) {
0630: // index is disabled, no initialization is required
0631: return;
0632: }
0633:
0634: String sourceName = null;
0635: CmsSearchIndexSource indexSource = null;
0636: List searchIndexSourceDocumentTypes = null;
0637: List resourceNames = null;
0638: String resourceName = null;
0639: m_sources = new ArrayList();
0640:
0641: m_path = OpenCms.getSystemInfo()
0642: .getAbsoluteRfsPathRelativeToWebInf(
0643: OpenCms.getSearchManager().getDirectory() + "/"
0644: + m_name);
0645:
0646: for (int i = 0, n = m_sourceNames.size(); i < n; i++) {
0647:
0648: try {
0649: sourceName = (String) m_sourceNames.get(i);
0650: indexSource = OpenCms.getSearchManager()
0651: .getIndexSource(sourceName);
0652: m_sources.add(indexSource);
0653:
0654: resourceNames = indexSource.getResourcesNames();
0655: searchIndexSourceDocumentTypes = indexSource
0656: .getDocumentTypes();
0657: for (int j = 0, m = resourceNames.size(); j < m; j++) {
0658:
0659: resourceName = (String) resourceNames.get(j);
0660: m_documenttypes.put(resourceName,
0661: searchIndexSourceDocumentTypes);
0662: }
0663: } catch (Exception e) {
0664: // mark this index as disabled
0665: setEnabled(false);
0666: throw new CmsSearchException(Messages.get().container(
0667: Messages.ERR_INDEX_SOURCE_ASSOCIATION_1,
0668: sourceName), e);
0669: }
0670: }
0671:
0672: // initialize the search field configuration
0673: if (m_fieldConfigurationName == null) {
0674: // if not set, use standard field configuration
0675: m_fieldConfigurationName = CmsSearchFieldConfiguration.STR_STANDARD;
0676: }
0677: m_fieldConfiguration = OpenCms.getSearchManager()
0678: .getFieldConfiguration(m_fieldConfigurationName);
0679: if (m_fieldConfiguration == null) {
0680: // we must have a valid field configuration to continue
0681: throw new CmsSearchException(Messages.get().container(
0682: Messages.ERR_FIELD_CONFIGURATION_UNKNOWN_2, m_name,
0683: m_fieldConfigurationName));
0684: }
0685: }
0686:
0687: /**
0688: * Returns <code>true</code> if this index is currently disabled.<p>
0689: *
0690: * @return <code>true</code> if this index is currently disabled
0691: */
0692: public boolean isEnabled() {
0693:
0694: return m_enabled;
0695: }
0696:
0697: /**
0698: * Removes an index source from this search index.<p>
0699: *
0700: * @param sourceName the index source name to remove
0701: */
0702: public void removeSourceName(String sourceName) {
0703:
0704: m_sourceNames.remove(sourceName);
0705: }
0706:
0707: /**
0708: * Performs a search on the index within the given fields.<p>
0709: *
0710: * The result is returned as List with entries of type I_CmsSearchResult.<p>
0711: * @param cms the current user's Cms object
0712: * @param params the parameters to use for the search
0713: * @return the List of results found or an empty list
0714: * @throws CmsSearchException if something goes wrong
0715: */
0716: public synchronized CmsSearchResultList search(CmsObject cms,
0717: CmsSearchParameters params) throws CmsSearchException {
0718:
0719: long timeTotal = -System.currentTimeMillis();
0720: long timeLucene;
0721: long timeResultProcessing;
0722:
0723: if (LOG.isDebugEnabled()) {
0724: LOG.debug(Messages.get().getBundle().key(
0725: Messages.LOG_SEARCH_PARAMS_2, params, m_name));
0726: }
0727:
0728: CmsRequestContext context = cms.getRequestContext();
0729: CmsProject currentProject = context.currentProject();
0730:
0731: // the searcher to perform the operation in
0732: IndexSearcher searcher = null;
0733:
0734: // the hits found during the search
0735: Hits hits;
0736:
0737: // storage for the results found
0738: CmsSearchResultList searchResults = new CmsSearchResultList();
0739:
0740: int previousPriority = Thread.currentThread().getPriority();
0741:
0742: try {
0743:
0744: if (m_priority > 0) {
0745: // change thread priority in order to reduce search impact on overall system performance
0746: Thread.currentThread().setPriority(m_priority);
0747: }
0748:
0749: // change the project
0750: context.setCurrentProject(cms.readProject(m_project));
0751:
0752: // complete the search root
0753: String[] roots;
0754: if ((params.getRoots() != null)
0755: && (params.getRoots().size() > 0)) {
0756: // add the site root to all the search root
0757: roots = new String[params.getRoots().size()];
0758: for (int i = 0; i < params.getRoots().size(); i++) {
0759: roots[i] = cms.getRequestContext().addSiteRoot(
0760: (String) params.getRoots().get(i));
0761: }
0762: } else {
0763: // just use the site root as the search root
0764: // this permits searching in indexes that contain content of other sites than the current selected one?!?!
0765: roots = new String[] { cms.getRequestContext()
0766: .getSiteRoot() };
0767: }
0768:
0769: timeLucene = -System.currentTimeMillis();
0770:
0771: // the language analyzer to use for creating the queries
0772: Analyzer languageAnalyzer = OpenCms.getSearchManager()
0773: .getAnalyzer(m_locale);
0774:
0775: // the main query to use, will be constructed in the next lines
0776: BooleanQuery query = new BooleanQuery();
0777:
0778: // implementation note:
0779: // initially this was a simple PrefixQuery based on the DOC_PATH
0780: // however, internally Lucene rewrote that to literally hundreds of BooleanQuery parts
0781: // the following implementation will lead to just one Lucene PhraseQuery per directory and is thus much better
0782: BooleanQuery pathQuery = new BooleanQuery();
0783: for (int i = 0; i < roots.length; i++) {
0784: String[] paths = rootPathSplit(roots[i]);
0785: PhraseQuery phrase = new PhraseQuery();
0786: for (int j = 0; j < paths.length; j++) {
0787: Term term = new Term(CmsSearchField.FIELD_ROOT,
0788: paths[j]);
0789: phrase.add(term);
0790: }
0791: pathQuery.add(phrase, BooleanClause.Occur.SHOULD);
0792: }
0793: // add the calculated phrase query for the root path
0794: query.add(pathQuery, BooleanClause.Occur.MUST);
0795:
0796: if ((params.getCategories() != null)
0797: && (params.getCategories().size() > 0)) {
0798: // add query categories (if required)
0799: BooleanQuery categoryQuery = new BooleanQuery();
0800: for (int i = 0; i < params.getCategories().size(); i++) {
0801: Term term = new Term(CmsSearchField.FIELD_CATEGORY,
0802: (String) params.getCategories().get(i));
0803: TermQuery termQuery = new TermQuery(term);
0804: categoryQuery.add(termQuery,
0805: BooleanClause.Occur.SHOULD);
0806: }
0807: query.add(categoryQuery, BooleanClause.Occur.MUST);
0808: }
0809:
0810: // create the index searcher
0811: searcher = new IndexSearcher(m_path);
0812:
0813: // store separate fields query for excerpt highlighting
0814: Query fieldsQuery;
0815: if ((params.getFields() != null)
0816: && (params.getFields().size() > 0)) {
0817: BooleanQuery booleanFieldsQuery = new BooleanQuery();
0818: // this is a "regular" query over one or more fields
0819: // add one sub-query for each of the selected fields, e.g. "content", "title" etc.
0820: for (int i = 0; i < params.getFields().size(); i++) {
0821: QueryParser p = new QueryParser((String) params
0822: .getFields().get(i), languageAnalyzer);
0823: booleanFieldsQuery.add(p.parse(params.getQuery()),
0824: BooleanClause.Occur.SHOULD);
0825: }
0826: fieldsQuery = searcher.rewrite(booleanFieldsQuery);
0827: } else {
0828: // if no fields are provided, just use the "content" field by default
0829: QueryParser p = new QueryParser(
0830: CmsSearchField.FIELD_CONTENT, languageAnalyzer);
0831: fieldsQuery = searcher.rewrite(p.parse(params
0832: .getQuery()));
0833: }
0834: // finally add the field queries to the main query
0835: query.add(fieldsQuery, BooleanClause.Occur.MUST);
0836:
0837: if (LOG.isDebugEnabled()) {
0838: LOG.debug(Messages.get().getBundle().key(
0839: Messages.LOG_BASE_QUERY_1, query));
0840: LOG.debug(Messages.get().getBundle().key(
0841: Messages.LOG_FIELDS_QUERY_1, fieldsQuery));
0842: }
0843:
0844: // collect the categories
0845: CmsSearchCategoryCollector categoryCollector;
0846: if (params.isCalculateCategories()) {
0847: // USE THIS OPTION WITH CAUTION
0848: // this may slow down searched by an order of magnitude
0849: categoryCollector = new CmsSearchCategoryCollector(
0850: searcher);
0851: // perform a first search to collect the categories
0852: searcher.search(query, categoryCollector);
0853: // store the result
0854: searchResults.setCategories(categoryCollector
0855: .getCategoryCountResult());
0856: }
0857:
0858: // perform the search operation
0859: hits = searcher.search(query, params.getSort());
0860:
0861: timeLucene += System.currentTimeMillis();
0862: timeResultProcessing = -System.currentTimeMillis();
0863:
0864: Document doc;
0865: CmsSearchResult searchResult;
0866:
0867: if (hits != null) {
0868: int hitCount = hits.length();
0869: int page = params.getSearchPage();
0870: int start = -1, end = -1;
0871: if ((params.getMatchesPerPage() > 0) && (page > 0)
0872: && (hitCount > 0)) {
0873: // calculate the final size of the search result
0874: start = params.getMatchesPerPage() * (page - 1);
0875: end = start + params.getMatchesPerPage();
0876: // ensure that both i and n are inside the range of foundDocuments.size()
0877: start = (start > hitCount) ? hitCount : start;
0878: end = (end > hitCount) ? hitCount : end;
0879: } else {
0880: // return all found documents in the search result
0881: start = 0;
0882: end = hitCount;
0883: }
0884:
0885: int visibleHitCount = hitCount;
0886: for (int i = 0, cnt = 0; (i < hitCount) && (cnt < end); i++) {
0887: try {
0888: doc = hits.doc(i);
0889: if ((isInTimeRange(doc, params))
0890: && (hasReadPermission(cms, doc))) {
0891: // user has read permission
0892: if (cnt >= start) {
0893: // do not use the resource to obtain the raw content, read it from the lucene document!
0894: // documents must not have content (i.e. images), so check if the content field exists
0895: String excerpt = null;
0896: if (m_createExcerpt) {
0897: I_CmsTermHighlighter highlighter = OpenCms
0898: .getSearchManager()
0899: .getHighlighter();
0900: excerpt = highlighter.getExcerpt(
0901: doc, this , params,
0902: fieldsQuery,
0903: languageAnalyzer);
0904: }
0905: searchResult = new CmsSearchResult(Math
0906: .round(hits.score(i) * 100f),
0907: doc, excerpt);
0908: searchResults.add(searchResult);
0909: }
0910: cnt++;
0911: } else {
0912: visibleHitCount--;
0913: }
0914: } catch (Exception e) {
0915: // should not happen, but if it does we want to go on with the next result nevertheless
0916: if (LOG.isWarnEnabled()) {
0917: LOG
0918: .warn(
0919: Messages
0920: .get()
0921: .getBundle()
0922: .key(
0923: Messages.LOG_RESULT_ITERATION_FAILED_0),
0924: e);
0925: }
0926: }
0927: }
0928:
0929: // save the total count of search results at the last index of the search result
0930: searchResults.setHitCount(visibleHitCount);
0931: } else {
0932: searchResults.setHitCount(0);
0933: }
0934:
0935: timeResultProcessing += System.currentTimeMillis();
0936: } catch (RuntimeException e) {
0937: throw new CmsSearchException(Messages.get().container(
0938: Messages.ERR_SEARCH_PARAMS_1, params), e);
0939: } catch (Exception e) {
0940: throw new CmsSearchException(Messages.get().container(
0941: Messages.ERR_SEARCH_PARAMS_1, params), e);
0942: } finally {
0943:
0944: // re-set thread to previous priority
0945: Thread.currentThread().setPriority(previousPriority);
0946:
0947: if (searcher != null) {
0948: try {
0949: searcher.close();
0950: } catch (IOException exc) {
0951: // noop
0952: }
0953: }
0954:
0955: // switch back to the original project
0956: context.setCurrentProject(currentProject);
0957: }
0958:
0959: timeTotal += System.currentTimeMillis();
0960:
0961: Object[] logParams = new Object[] {
0962: new Integer(hits == null ? 0 : hits.length()),
0963: new Long(timeTotal), new Long(timeLucene),
0964: new Long(timeResultProcessing) };
0965: if (LOG.isDebugEnabled()) {
0966: LOG.debug(Messages.get().getBundle().key(
0967: Messages.LOG_STAT_RESULTS_TIME_4, logParams));
0968: }
0969:
0970: return searchResults;
0971: }
0972:
0973: /**
0974: * Can be used to enable / disable this index.<p>
0975: *
0976: * @param enabled the state of the index to set
0977: */
0978: public void setEnabled(boolean enabled) {
0979:
0980: m_enabled = enabled;
0981: }
0982:
0983: /**
0984: * Sets the field configuration used for this index.<p>
0985: *
0986: * @param fieldConfiguration the field configuration to set
0987: */
0988: public void setFieldConfiguration(
0989: CmsSearchFieldConfiguration fieldConfiguration) {
0990:
0991: m_fieldConfiguration = fieldConfiguration;
0992: }
0993:
0994: /**
0995: * Sets the name of the field configuration used for this index.<p>
0996: *
0997: * @param fieldConfigurationName the name of the field configuration to set
0998: */
0999: public void setFieldConfigurationName(String fieldConfigurationName) {
1000:
1001: m_fieldConfigurationName = fieldConfigurationName;
1002: }
1003:
1004: /**
1005: * Sets the locale to index resources.<p>
1006: *
1007: * @param locale the locale to index resources
1008: */
1009: public void setLocale(Locale locale) {
1010:
1011: m_locale = locale;
1012: }
1013:
1014: /**
1015: * Sets the locale to index resources as a String.<p>
1016: *
1017: * @param locale the locale to index resources
1018: *
1019: * @see #setLocale(Locale)
1020: */
1021: public void setLocaleString(String locale) {
1022:
1023: setLocale(CmsLocaleManager.getLocale(locale));
1024: }
1025:
1026: /**
1027: * Sets the logical key/name of this search index.<p>
1028: *
1029: * @param name the logical key/name of this search index
1030: *
1031: * @throws org.opencms.main.CmsIllegalArgumentException
1032: * if the given name is null, empty or already taken
1033: * by another search index.
1034: */
1035: public void setName(String name) throws CmsIllegalArgumentException {
1036:
1037: if (CmsStringUtil.isEmptyOrWhitespaceOnly(name)) {
1038: throw new CmsIllegalArgumentException(
1039: Messages
1040: .get()
1041: .container(
1042: Messages.ERR_SEARCHINDEX_CREATE_MISSING_NAME_0));
1043: } else {
1044: // check if already used, but only if the name was modified:
1045: // this is important as unmodifiable DisplayWidgets will also invoke this...
1046: if (!name.equals(m_name)) {
1047: // don't mess with xml-configuration
1048: if (OpenCms.getRunLevel() > OpenCms.RUNLEVEL_2_INITIALIZING) {
1049: // not needed at startup and additionally getSearchManager may return null
1050: Iterator itIdxNames = OpenCms.getSearchManager()
1051: .getIndexNames().iterator();
1052: while (itIdxNames.hasNext()) {
1053: if (itIdxNames.next().equals(name)) {
1054: throw new CmsIllegalArgumentException(
1055: Messages
1056: .get()
1057: .container(
1058: Messages.ERR_SEARCHINDEX_CREATE_INVALID_NAME_1,
1059: name));
1060: }
1061: }
1062: }
1063: }
1064: }
1065: m_name = name;
1066: }
1067:
1068: /**
1069: * Sets the name of the project used to index resources.<p>
1070: *
1071: * A duplicate method of <code>{@link #setProjectName(String)}</code> that allows
1072: * to use instances of this class as a widget object (bean convention,
1073: * cp.: <code>{@link #getProject()}</code>.<p>
1074: *
1075: * @param projectName the name of the project used to index resources
1076: */
1077: public void setProject(String projectName) {
1078:
1079: setProjectName(projectName);
1080: }
1081:
1082: /**
1083: * Sets the name of the project used to index resources.<p>
1084: *
1085: * @param projectName the name of the project used to index resources
1086: */
1087: public void setProjectName(String projectName) {
1088:
1089: m_project = projectName;
1090: }
1091:
1092: /**
1093: * Sets the rebuild mode of this search index.<p>
1094: *
1095: * @param rebuildMode the rebuild mode of this search index {auto|manual}
1096: */
1097: public void setRebuildMode(String rebuildMode) {
1098:
1099: m_rebuild = rebuildMode;
1100: }
1101:
1102: /**
1103: * Returns the name (<code>{@link #getName()}</code>) of this search index.<p>
1104: *
1105: * @return the name (<code>{@link #getName()}</code>) of this search index
1106: *
1107: * @see java.lang.Object#toString()
1108: */
1109: public String toString() {
1110:
1111: return getName();
1112: }
1113:
1114: /**
1115: * Checks if the OpenCms resource referenced by the result document can be read
1116: * be the user of the given OpenCms context.<p>
1117: *
1118: * @param cms the OpenCms user context to use for permission testing
1119: * @param doc the search result document to check
1120: * @return <code>true</code> if the user has read permissions to the resource
1121: */
1122: protected boolean hasReadPermission(CmsObject cms, Document doc) {
1123:
1124: if (m_dontCheckPermissions) {
1125: // no permission check is performed at all
1126: return true;
1127: }
1128:
1129: Field typeField = doc.getField(CmsSearchField.FIELD_TYPE);
1130: Field pathField = doc.getField(CmsSearchField.FIELD_PATH);
1131: if ((typeField == null) || (pathField == null)) {
1132: // permission check needs only to be performed for VFS documents that contain both fields
1133: return true;
1134: }
1135:
1136: String type = typeField.stringValue();
1137: if (!A_CmsVfsDocument.VFS_DOCUMENT_KEY_PREFIX.equals(type)
1138: && !OpenCms.getResourceManager().hasResourceType(type)) {
1139: // this is not a known VFS resource type (also not the generic "VFS" type of OpenCms before 7.0)
1140: return true;
1141: }
1142:
1143: // check if the resource exits in the VFS,
1144: // this will implicitly check read permission and if the resource was deleted
1145: String contextPath = cms.getRequestContext().removeSiteRoot(
1146: pathField.stringValue());
1147: return cms.existsResource(contextPath);
1148: }
1149:
1150: /**
1151: * Checks wether the document is in the time range specified in the
1152: * search parameters.<p>
1153: *
1154: * The creation date and/or the last modification date are checked.<p>
1155: *
1156: * @param doc the document to check the dates against the given time range
1157: * @param params the search parameters where the time ranges are specified
1158: *
1159: * @return true if document is in time range or not time range set otherwise false
1160: */
1161: protected boolean isInTimeRange(Document doc,
1162: CmsSearchParameters params) {
1163:
1164: try {
1165: // check the creation date of the document against the given time range
1166: Date dateCreated = DateTools.stringToDate(doc.getField(
1167: CmsSearchField.FIELD_DATE_CREATED).stringValue());
1168: if ((params.getMinDateCreated() > Long.MIN_VALUE)
1169: && (dateCreated.getTime() < params
1170: .getMinDateCreated())) {
1171: return false;
1172: }
1173: if ((params.getMaxDateCreated() < Long.MAX_VALUE)
1174: && (dateCreated.getTime() > params
1175: .getMaxDateCreated())) {
1176: return false;
1177: }
1178:
1179: // check the last modification date of the document against the given time range
1180: Date dateLastModified = DateTools.stringToDate(doc
1181: .getField(CmsSearchField.FIELD_DATE_LASTMODIFIED)
1182: .stringValue());
1183: if ((params.getMinDateLastModified() > Long.MIN_VALUE)
1184: && (dateLastModified.getTime() < params
1185: .getMinDateLastModified())) {
1186: return false;
1187: }
1188: if ((params.getMaxDateLastModified() < Long.MAX_VALUE)
1189: && (dateLastModified.getTime() > params
1190: .getMaxDateLastModified())) {
1191: return false;
1192: }
1193:
1194: } catch (ParseException ex) {
1195: // date could not be parsed -> doc is in time range
1196: }
1197:
1198: return true;
1199: }
1200: }
|