001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/CmsVfsIndexer.java,v $
003: * Date : $Date: 2008-02-27 12:05:38 $
004: * Version: $Revision: 1.37 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.search;
033:
import org.opencms.db.CmsPublishedResource;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.report.I_CmsReport;
import org.opencms.search.fields.CmsSearchField;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
053:
054: /**
055: * Implementation for an indexer indexing VFS Cms resources.<p>
056: *
057: * @author Carsten Weinholz
058: * @author Thomas Weckert
059: *
060: * @version $Revision: 1.37 $
061: *
062: * @since 6.0.0
063: */
064: public class CmsVfsIndexer implements I_CmsIndexer {
065:
066: /** The log object for this class. */
067: private static final Log LOG = CmsLog.getLog(CmsVfsIndexer.class);
068:
069: /** The OpenCms user context to use when reading resources from the VFS during indexing. */
070: private CmsObject m_cms;
071:
072: /** The index. */
073: private CmsSearchIndex m_index;
074:
075: /** The report. */
076: private I_CmsReport m_report;
077:
078: /**
079: * @see org.opencms.search.I_CmsIndexer#deleteResources(org.apache.lucene.index.IndexReader, java.util.List)
080: */
081: public void deleteResources(IndexReader reader,
082: List resourcesToDelete) {
083:
084: if ((resourcesToDelete == null) || resourcesToDelete.isEmpty()) {
085: // nothing to délete
086: return;
087: }
088:
089: // contains all resources already deleted to avoid multiple deleting in case of siblings
090: List resourcesAlreadyDeleted = new ArrayList(resourcesToDelete
091: .size());
092:
093: Iterator i = resourcesToDelete.iterator();
094: while (i.hasNext()) {
095: // iterate all resources in the given list of resources to delete
096: CmsPublishedResource res = (CmsPublishedResource) i.next();
097: String rootPath = res.getRootPath();
098: if (!resourcesAlreadyDeleted.contains(rootPath)) {
099: // ensure siblings are only deleted once per update
100: resourcesAlreadyDeleted.add(rootPath);
101: // search for an exact match on the document root path
102: Term term = new Term(CmsSearchField.FIELD_PATH,
103: rootPath);
104: try {
105: // delete all documents with this term from the index
106: reader.deleteDocuments(term);
107: } catch (IOException e) {
108: if (LOG.isWarnEnabled()) {
109: LOG
110: .warn(
111: Messages
112: .get()
113: .getBundle()
114: .key(
115: Messages.LOG_IO_INDEX_DOCUMENT_DELETE_2,
116: rootPath,
117: m_index
118: .getName()),
119: e);
120: }
121: }
122: }
123: }
124: }
125:
126: /**
127: * @see org.opencms.search.I_CmsIndexer#getUpdateData(org.opencms.search.CmsSearchIndexSource, java.util.List)
128: */
129: public CmsSearchIndexUpdateData getUpdateData(
130: CmsSearchIndexSource source, List publishedResources) {
131:
132: // create a new update collection from this indexer and the given index source
133: CmsSearchIndexUpdateData result = new CmsSearchIndexUpdateData(
134: source, this );
135:
136: Iterator i = publishedResources.iterator();
137: while (i.hasNext()) {
138: // check all published resources if they match this indexer / source
139: CmsPublishedResource resource = (CmsPublishedResource) i
140: .next();
141: // VFS resources will always have a structure id
142: if (!resource.getStructureId().isNullUUID()) {
143: // use utility method from CmsProject to check if published resource is "inside" this index source
144: if (CmsProject.isInsideProject(source
145: .getResourcesNames(), resource.getRootPath())) {
146: // the resource is "inside" this index source
147: if (resource.getState().isNew()) {
148: // new resource just needs to be updated
149: if (isResourceInTimeWindow(resource)) {
150: // update only if resource is in time window
151: result.addResourceToUpdate(resource);
152: }
153: } else if (resource.getState().isDeleted()) {
154: // deleted resource just needs to be removed
155: result.addResourceToDelete(resource);
156: } else if (resource.getState().isChanged()
157: || resource.getState().isUnchanged()) {
158: // changed (or unchaged) resource must be removed first, and then updated
159: // note: unchanged resources can be siblings that have been added from the online project,
160: // these must be treated as if the resource had changed
161: result.addResourceToDelete(resource);
162: if (isResourceInTimeWindow(resource)) {
163: // update only if resource is in time window
164: result.addResourceToUpdate(resource);
165: }
166: }
167: }
168: }
169: }
170: return result;
171: }
172:
173: /**
174: * @see org.opencms.search.I_CmsIndexer#newInstance(org.opencms.file.CmsObject, org.opencms.report.I_CmsReport, org.opencms.search.CmsSearchIndex)
175: */
176: public I_CmsIndexer newInstance(CmsObject cms, I_CmsReport report,
177: CmsSearchIndex index) {
178:
179: CmsVfsIndexer indexer = new CmsVfsIndexer();
180:
181: indexer.m_cms = cms;
182: indexer.m_report = report;
183: indexer.m_index = index;
184:
185: return indexer;
186: }
187:
188: /**
189: * @see org.opencms.search.I_CmsIndexer#rebuildIndex(org.apache.lucene.index.IndexWriter, org.opencms.search.CmsIndexingThreadManager, org.opencms.search.CmsSearchIndexSource)
190: */
191: public void rebuildIndex(IndexWriter writer,
192: CmsIndexingThreadManager threadManager,
193: CmsSearchIndexSource source) throws CmsIndexException {
194:
195: List resourceNames = source.getResourcesNames();
196: Iterator i = resourceNames.iterator();
197: while (i.hasNext()) {
198: // read the resources from all configured source folders
199: String resourceName = (String) i.next();
200: List resources = null;
201: try {
202: // read all resources (only files) below the given path
203: resources = m_cms.readResources(resourceName,
204: CmsResourceFilter.DEFAULT.addRequireFile());
205: } catch (CmsException e) {
206: if (m_report != null) {
207: m_report.println(Messages.get().container(
208: Messages.RPT_UNABLE_TO_READ_SOURCE_2,
209: resourceName, e.getLocalizedMessage()),
210: I_CmsReport.FORMAT_WARNING);
211: }
212: if (LOG.isWarnEnabled()) {
213: LOG.warn(Messages.get().getBundle().key(
214: Messages.LOG_UNABLE_TO_READ_SOURCE_2,
215: resourceName, m_index.getName()), e);
216: }
217: }
218: if (resources != null) {
219: // iterate all resources found in the folder
220: Iterator j = resources.iterator();
221: while (j.hasNext()) {
222: // now update all the resources individually
223: CmsResource resource = (CmsResource) j.next();
224: updateResource(writer, threadManager, resource);
225: }
226: }
227: }
228: }
229:
230: /**
231: * @see org.opencms.search.I_CmsIndexer#updateResources(org.apache.lucene.index.IndexWriter, org.opencms.search.CmsIndexingThreadManager, java.util.List)
232: */
233: public void updateResources(IndexWriter writer,
234: CmsIndexingThreadManager threadManager,
235: List resourcesToUpdate) throws CmsIndexException {
236:
237: if ((resourcesToUpdate == null) || resourcesToUpdate.isEmpty()) {
238: // nothing to update
239: return;
240: }
241:
242: // contains all resources already updated to avoid multiple updates in case of siblings
243: List resourcesAlreadyUpdated = new ArrayList(resourcesToUpdate
244: .size());
245:
246: // index all resources that in the given list
247: Iterator i = resourcesToUpdate.iterator();
248: while (i.hasNext()) {
249: CmsPublishedResource res = (CmsPublishedResource) i.next();
250: CmsResource resource = null;
251: try {
252: resource = m_cms.readResource(res.getRootPath());
253: } catch (CmsException e) {
254: if (LOG.isWarnEnabled()) {
255: LOG.warn(Messages.get().getBundle().key(
256: Messages.LOG_UNABLE_TO_READ_RESOURCE_2,
257: res.getRootPath(), m_index.getName()), e);
258: }
259: }
260: if (resource != null) {
261: if (!resourcesAlreadyUpdated.contains(resource
262: .getRootPath())) {
263: // ensure resources are only indexed once per update
264: resourcesAlreadyUpdated.add(resource.getRootPath());
265: updateResource(writer, threadManager, resource);
266: }
267: }
268: }
269: }
270:
271: /**
272: * Checks if the published resource is inside the time window set with release and expiration date.<p>
273: *
274: * @param resource the published resource to check
275: * @return true if the published resource is inside the time window, otherwise false
276: */
277: protected boolean isResourceInTimeWindow(
278: CmsPublishedResource resource) {
279:
280: return m_cms.existsResource(m_cms.getRequestContext()
281: .removeSiteRoot(resource.getRootPath()),
282: CmsResourceFilter.DEFAULT);
283: }
284:
285: /**
286: * Updates (writes) a single resource in the index.<p>
287: *
288: * @param writer the index writer to use
289: * @param threadManager the thread manager to use when extracting the document text
290: * @param resource the resource to update
291: *
292: * @throws CmsIndexException if something goes wrong
293: */
294: protected void updateResource(IndexWriter writer,
295: CmsIndexingThreadManager threadManager, CmsResource resource)
296: throws CmsIndexException {
297:
298: if (resource.isInternal()) {
299: // don't index internal resources
300: return;
301: }
302: // no check for folder resources, this must be taken care of before calling this method
303:
304: try {
305:
306: if (m_report != null) {
307: m_report
308: .print(
309: org.opencms.report.Messages
310: .get()
311: .container(
312: org.opencms.report.Messages.RPT_SUCCESSION_1,
313: String
314: .valueOf(threadManager
315: .getCounter() + 1)),
316: I_CmsReport.FORMAT_NOTE);
317: m_report.print(Messages.get().container(
318: Messages.RPT_SEARCH_INDEXING_FILE_BEGIN_0),
319: I_CmsReport.FORMAT_NOTE);
320: m_report
321: .print(org.opencms.report.Messages
322: .get()
323: .container(
324: org.opencms.report.Messages.RPT_ARGUMENT_1,
325: m_report
326: .removeSiteRoot(resource
327: .getRootPath())));
328: m_report
329: .print(
330: org.opencms.report.Messages
331: .get()
332: .container(
333: org.opencms.report.Messages.RPT_DOTS_0),
334: I_CmsReport.FORMAT_DEFAULT);
335: }
336:
337: threadManager.createIndexingThread(m_cms, writer, resource,
338: m_index, m_report);
339:
340: } catch (Exception e) {
341:
342: if (m_report != null) {
343: m_report.println(Messages.get().container(
344: Messages.RPT_SEARCH_INDEXING_FAILED_0),
345: I_CmsReport.FORMAT_WARNING);
346: }
347: if (LOG.isWarnEnabled()) {
348: LOG.warn(Messages.get().getBundle().key(
349: Messages.ERR_INDEX_RESOURCE_FAILED_2,
350: resource.getRootPath(), m_index.getName()), e);
351: }
352: throw new CmsIndexException(Messages.get().container(
353: Messages.ERR_INDEX_RESOURCE_FAILED_2,
354: resource.getRootPath(), m_index.getName()));
355: }
356: }
357: }
|