001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/CmsDocumentXmlPage.java,v $
003: * Date : $Date: 2008-02-27 12:05:21 $
004: * Version: $Revision: 1.14 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.search.documents;
033:
034: import org.opencms.file.CmsFile;
035: import org.opencms.file.CmsObject;
036: import org.opencms.file.CmsResource;
037: import org.opencms.main.CmsException;
038: import org.opencms.main.OpenCms;
039: import org.opencms.search.CmsIndexException;
040: import org.opencms.search.CmsSearchIndex;
041: import org.opencms.search.extractors.CmsExtractionResult;
042: import org.opencms.search.extractors.I_CmsExtractionResult;
043: import org.opencms.util.CmsHtmlExtractor;
044: import org.opencms.util.CmsStringUtil;
045: import org.opencms.xml.page.CmsXmlPage;
046: import org.opencms.xml.page.CmsXmlPageFactory;
047:
048: import java.util.HashMap;
049: import java.util.Iterator;
050: import java.util.List;
051: import java.util.Locale;
052:
053: /**
054: * Lucene document factory class to extract index data from a cms resource
055: * of type <code>CmsResourceTypeXmlPage</code>.<p>
056: *
057: * @author Carsten Weinholz
058: *
059: * @version $Revision: 1.14 $
060: *
061: * @since 6.0.0
062: */
063: public class CmsDocumentXmlPage extends A_CmsVfsDocument {
064:
065: /**
066: * Creates a new instance of this lucene document factory.<p>
067: *
068: * @param name name of the documenttype
069: */
070: public CmsDocumentXmlPage(String name) {
071:
072: super (name);
073: }
074:
075: /**
076: * Returns the raw text content of a given vfs resource of type <code>CmsResourceTypeXmlPage</code>.<p>
077: *
078: * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
079: */
080: public I_CmsExtractionResult extractContent(CmsObject cms,
081: CmsResource resource, CmsSearchIndex index)
082: throws CmsException {
083:
084: try {
085: CmsFile file = readFile(cms, resource);
086: String absolutePath = cms.getSitePath(file);
087: CmsXmlPage page = CmsXmlPageFactory.unmarshal(cms, file);
088:
089: List pageLocales = page.getLocales();
090: if (pageLocales.size() == 0) {
091: pageLocales = OpenCms.getLocaleManager()
092: .getDefaultLocales(cms, absolutePath);
093: }
094: Locale locale = OpenCms.getLocaleManager()
095: .getBestMatchingLocale(
096: index.getLocale(),
097: OpenCms.getLocaleManager()
098: .getDefaultLocales(cms,
099: absolutePath), pageLocales);
100:
101: List elements = page.getNames(locale);
102: StringBuffer content = new StringBuffer();
103: HashMap items = new HashMap();
104: for (Iterator i = elements.iterator(); i.hasNext();) {
105: String elementName = (String) i.next();
106: String value = page.getStringValue(cms, elementName,
107: locale);
108: String extracted = CmsHtmlExtractor.extractText(value,
109: page.getEncoding());
110: if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) {
111: items.put(elementName, extracted);
112: content.append(extracted);
113: content.append('\n');
114: }
115: }
116:
117: return new CmsExtractionResult(content.toString(), items);
118:
119: } catch (Exception e) {
120: throw new CmsIndexException(Messages.get().container(
121: Messages.ERR_TEXT_EXTRACTION_1,
122: resource.getRootPath()), e);
123: }
124: }
125:
126: /**
127: * @see org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend()
128: */
129: public boolean isLocaleDependend() {
130:
131: return true;
132: }
133:
134: /**
135: * @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache()
136: */
137: public boolean isUsingCache() {
138:
139: return true;
140: }
141: }
|