001: /*
002: * Copyright 2001-2006 C:1 Financial Services GmbH
003: *
004: * This software is free software; you can redistribute it and/or
005: * modify it under the terms of the GNU Lesser General Public
006: * License Version 2.1, as published by the Free Software Foundation.
007: *
008: * This software is distributed in the hope that it will be useful,
009: * but WITHOUT ANY WARRANTY; without even the implied warranty of
010: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
011: * Lesser General Public License for more details.
012: *
013: * You should have received a copy of the GNU Lesser General Public
014: * License along with this library; if not, write to the Free Software
015: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
016: */
017:
018: package de.finix.contelligent.search.engine;
019:
020: import java.util.Collection;
021: import java.util.Collections;
022: import java.util.HashMap;
023: import java.util.HashSet;
024: import java.util.Iterator;
025: import java.util.Map;
026:
027: import org.apache.lucene.document.Document;
028:
029: import de.finix.contelligent.CallData;
030: import de.finix.contelligent.ComponentManager;
031: import de.finix.contelligent.ComponentPath;
032: import de.finix.contelligent.Session;
033: import de.finix.contelligent.core.ContelligentImpl;
034: import de.finix.contelligent.core.security.ContelligentSecurityManager;
035: import de.finix.contelligent.logging.LoggingService;
036: import de.finix.contelligent.xml.elements.IndexBuilderElement;
037: import de.finix.contelligent.xml.elements.IndexBuilderFilterElement;
038:
039: class Crawler {
040: final static org.apache.log4j.Logger log = LoggingService
041: .getLogger(Crawler.class);
042:
043: private ComponentPath root;
044:
045: private Map includes = new HashMap();
046:
047: private Map categoryValues;
048:
049: private Map renderParameters;
050:
051: private LuceneIndex index;
052:
053: Collection filters;
054:
055: public Crawler(LuceneIndex index, IndexBuilderElement element) {
056: this (new ComponentPath(element.getDir()),
057: element.getIncludes(), element.getCategoryValues(),
058: element.getRenderParameters(), index, element
059: .getFilters());
060: }
061:
062: protected Crawler(ComponentPath root, Map includes,
063: Map categoryValues, Map renderParameters,
064: LuceneIndex index, Collection filters) {
065: this .root = root;
066: this .index = index;
067: this .filters = filters;
068:
069: if (includes.isEmpty()) {
070: this .includes.put("contelligent.website.Page", "");
071: } else {
072: this .includes.putAll(includes);
073: }
074: this .categoryValues = categoryValues;
075: if (categoryValues == null) {
076: this .categoryValues = Collections.EMPTY_MAP;
077: }
078: this .renderParameters = renderParameters;
079: if (renderParameters == null) {
080: this .renderParameters = Collections.EMPTY_MAP;
081: }
082: }
083:
084: public void run() {
085: Session session = null;
086: try {
087: final ComponentManager cm = ContelligentImpl.getInstance()
088: .getRootComponentManager();
089: session = ContelligentImpl.getInstance().beginSession(
090: ContelligentSecurityManager.getIndexUser(), cm);
091: CallData callData = ContelligentImpl.getInstance()
092: .createCallData(session);
093:
094: Collection paths = cm.getComponentsInSubtreeFilteredByType(
095: root, includes.keySet());
096:
097: final Collection filteredPaths = filterPaths(paths,
098: callData);
099:
100: final LuceneDocumentFactory documentFactory;
101: if (index.isRender()) {
102: documentFactory = new RenderingDocumentFactory(
103: categoryValues, renderParameters, includes, cm,
104: session, callData, ContelligentImpl
105: .getInstance().getCategoryManager());
106: } else {
107: documentFactory = new RawDocumentFactory(cm);
108: }
109:
110: index.apply(new LuceneIndexAppender() {
111:
112: public void perform(LuceneIndexAppenderAdapter adapter) {
113:
114: Iterator iterator = filteredPaths.iterator();
115:
116: while (iterator.hasNext()) {
117: ComponentPath componentPath = (ComponentPath) iterator
118: .next();
119: try {
120: Iterator documents = documentFactory
121: .createDocuments(componentPath)
122: .iterator();
123:
124: while (documents.hasNext()) {
125: Document document = (Document) documents
126: .next();
127: adapter.add(document);
128: }
129: } catch (Exception e) {
130: log.warn("Failed to add component "
131: + componentPath + " to index ("
132: + e.getMessage() + ")");
133: if (log.isDebugEnabled()) {
134: log
135: .debug(
136: "Failed to add component to index",
137: e);
138: }
139: }
140: }
141: }
142: });
143: } catch (Exception e) {
144: log.error("run() failed", e);
145: } finally {
146: if (session != null) {
147: ContelligentImpl.getInstance().invalidateSession(
148: session);
149: }
150: }
151: }
152:
153: /**
154: * @param paths
155: * @param callData
156: * @return
157: */
158: private Collection filterPaths(Collection paths, CallData callData) {
159:
160: HashSet filteredPaths = new HashSet();
161:
162: if (filters.size() == 0)
163: return paths;
164:
165: Iterator f = filters.iterator();
166:
167: while (f.hasNext()) {
168: IndexBuilderFilterElement filterElement = (IndexBuilderFilterElement) f
169: .next();
170:
171: try {
172: CrawlerFilter filter = FilterEngine.getInstance()
173: .getFilterInstance(filterElement);
174:
175: filteredPaths.addAll(applyFilterToPaths(filter, paths,
176: filterElement, callData));
177:
178: } catch (FilterException e) {
179: log.warn(
180: "CrawlerFilter Implementation could not be resolved '"
181: + filterElement.getImpl() + "'", e);
182: continue;
183: }
184: }
185:
186: if (log.isDebugEnabled()) {
187: log.debug("filterPaths() - filtered paths: '" + paths
188: + "' => '" + filteredPaths + "'");
189: }
190:
191: return filteredPaths;
192: }
193:
194: /**
195: * @param filter
196: * @param filteredPaths
197: * @throws FilterException
198: */
199: private Collection applyFilterToPaths(CrawlerFilter filter,
200: Collection pathsToFilter,
201: IndexBuilderFilterElement filterConfig, CallData callData)
202: throws FilterException {
203: HashSet result = new HashSet();
204:
205: Iterator i = pathsToFilter.iterator();
206: while (i.hasNext()) {
207: ComponentPath path = (ComponentPath) i.next();
208: if (filter.filter(path, filterConfig, callData)) {
209: result.add(path);
210: }
211: }
212: return result;
213: }
214: }
|