001: /**
002: * Copyright (c) 2000-2008 Liferay, Inc. All rights reserved.
003: *
004: * Permission is hereby granted, free of charge, to any person obtaining a copy
005: * of this software and associated documentation files (the "Software"), to deal
006: * in the Software without restriction, including without limitation the rights
007: * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
008: * copies of the Software, and to permit persons to whom the Software is
009: * furnished to do so, subject to the following conditions:
010: *
011: * The above copyright notice and this permission notice shall be included in
012: * all copies or substantial portions of the Software.
013: *
014: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
017: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
019: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
020: * SOFTWARE.
021: */package com.liferay.portal.lucene;
022:
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.GetterUtil;
import com.liferay.portal.kernel.util.StringMaker;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jackrabbit.extractor.HTMLTextExtractor;
import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
import org.apache.jackrabbit.extractor.MsWordTextExtractor;
import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
import org.apache.jackrabbit.extractor.PdfTextExtractor;
import org.apache.jackrabbit.extractor.PlainTextExtractor;
import org.apache.jackrabbit.extractor.RTFTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.extractor.XMLTextExtractor;
import org.apache.lucene.document.Field;
051:
052: /**
053: * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
054: *
055: * @author Brian Wing Shun Chan
056: *
057: */
058: public class LuceneFileExtractor {
059:
060: public Field getFile(String field, InputStream is, String fileExt) {
061: String text = null;
062:
063: try {
064: fileExt = GetterUtil.getString(fileExt).toLowerCase();
065:
066: TextExtractor extractor = null;
067:
068: String contentType = null;
069: String encoding = System.getProperty("encoding");
070:
071: if (fileExt.equals(".doc")) {
072: extractor = new MsWordTextExtractor();
073:
074: contentType = "application/vnd.ms-word";
075: } else if (fileExt.equals(".htm")
076: || fileExt.equals(".html")) {
077: extractor = new HTMLTextExtractor();
078:
079: contentType = "text/html";
080: } else if (fileExt.equals(".odb") || fileExt.equals(".odf")
081: || fileExt.equals(".odg") || fileExt.equals(".odp")
082: || fileExt.equals(".ods") || fileExt.equals(".odt")) {
083:
084: extractor = new OpenOfficeTextExtractor();
085:
086: contentType = "application/vnd.oasis.opendocument.";
087:
088: if (fileExt.equals(".odb")) {
089: contentType += "database";
090: } else if (fileExt.equals(".odf")) {
091: contentType += "formula";
092: } else if (fileExt.equals(".odg")) {
093: contentType += "graphics";
094: } else if (fileExt.equals(".odp")) {
095: contentType += "presentation";
096: } else if (fileExt.equals(".ods")) {
097: contentType += "spreadsheet";
098: } else if (fileExt.equals(".odt")) {
099: contentType += "text";
100: }
101: } else if (fileExt.equals(".pdf")) {
102: extractor = new PdfTextExtractor();
103:
104: contentType = "application/pdf";
105: } else if (fileExt.equals(".ppt")) {
106: extractor = new MsPowerPointTextExtractor();
107:
108: contentType = "application/vnd.ms-powerpoint";
109: } else if (fileExt.equals(".rtf")) {
110: extractor = new RTFTextExtractor();
111:
112: contentType = "application/rtf";
113: } else if (fileExt.equals(".txt")) {
114: extractor = new PlainTextExtractor();
115:
116: contentType = "text/plain";
117: } else if (fileExt.equals(".xls")) {
118: extractor = new MsExcelTextExtractor();
119:
120: contentType = "application/vnd.ms-excel";
121: } else if (fileExt.equals(".xml")) {
122: extractor = new XMLTextExtractor();
123:
124: contentType = "text/xml";
125: }
126:
127: if (extractor != null) {
128: if (_log.isInfoEnabled()) {
129: _log.info("Using extractor "
130: + extractor.getClass().getName()
131: + " for extension " + fileExt);
132: }
133:
134: StringMaker sm = new StringMaker();
135:
136: BufferedReader reader = new BufferedReader(extractor
137: .extractText(is, contentType, encoding));
138:
139: int i;
140:
141: while ((i = reader.read()) != -1) {
142: sm.append((char) i);
143: }
144:
145: reader.close();
146:
147: text = sm.toString();
148:
149: if (Validator
150: .isNotNull(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
151:
152: text = regexpStrip(text);
153: }
154: } else {
155: if (_log.isInfoEnabled()) {
156: _log.info("No extractor found for extension "
157: + fileExt);
158: }
159: }
160: } catch (Exception e) {
161: _log.error(e);
162: }
163:
164: if (_log.isDebugEnabled()) {
165: _log.debug("Extractor returned text:\n\n" + text);
166: }
167:
168: if (text == null) {
169: text = StringPool.BLANK;
170: }
171:
172: return LuceneFields.getText(field, text);
173: }
174:
175: public Field getFile(String field, byte[] byteArray, String fileExt)
176: throws IOException {
177:
178: InputStream in = new BufferedInputStream(
179: new ByteArrayInputStream(byteArray));
180:
181: return getFile(field, in, fileExt);
182: }
183:
184: public Field getFile(String field, File file, String fileExt)
185: throws IOException {
186:
187: InputStream in = new FileInputStream(file);
188:
189: return getFile(field, in, fileExt);
190: }
191:
192: protected String regexpStrip(String text) {
193: char[] array = text.toCharArray();
194:
195: for (int i = 0; i < array.length; i++) {
196: String s = String.valueOf(array[i]);
197:
198: if (!s
199: .matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
200: array[i] = CharPool.SPACE;
201: }
202: }
203:
204: return new String(array);
205: }
206:
207: private static Log _log = LogFactory
208: .getLog(LuceneFileExtractor.class);
209:
210: }
|