001: // plasmaParserConfig.java
002: // -------------------------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2004
007: //
008: // This file ist contributed by Martin Thelian
009: //
010: // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
011: // $LastChangedRevision: 1715 $
012: // $LastChangedBy: theli $
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027: //
028: // Using this software in any meaning (reading, learning, copying, compiling,
029: // running) means that you agree that the Author(s) is (are) not responsible
030: // for cost, loss of data or any harm that may be caused directly or indirectly
031: // by usage of this softare or this documentation. The usage of this software
032: // is on your own risk. The installation and usage (starting/running) of this
033: // software may allow other people or application to access your computer and
034: // any attached devices and is highly dependent on the configuration of the
035: // software which must be done by the user of the software; the author(s) is
036: // (are) also not responsible for proper configuration and usage of the
037: // software, even if provoked by documentation provided together with
038: // the software.
039: //
040: // Any changes to this file according to the GPL as documented in the file
041: // gpl.txt aside this file in the shipment you received can be done to the
042: // lines that follows this copyright notice here, but changes must not be
043: // done inside the copyright notive above. A re-distribution must contain
044: // the intact and unchanged copyright notice.
045: // Contributions and changes to the program code must be marked as such.
046:
047: package de.anomic.plasma;
048:
049: import java.util.Arrays;
050: import java.util.Enumeration;
051: import java.util.HashSet;
052: import java.util.Hashtable;
053: import java.util.Iterator;
054: import java.util.Set;
055:
056: import de.anomic.plasma.parser.Parser;
057: import de.anomic.plasma.parser.ParserInfo;
058: import de.anomic.server.logging.serverLog;
059: import de.anomic.yacy.yacyURL;
060:
061: public class plasmaParserConfig {
062: /**
063: * A list containing all enabled parsers and the mimeType that they can handle
064: * @see #loadEnabledParserList()
065: * @see #setEnabledParserList(Enumeration)
066: */
067: final HashSet<String> enabledParserList = new HashSet<String>();
068:
069: /**
070: * A list of file extensions that are supported by all enabled parsers
071: */
072: final HashSet<String> supportedFileExt = new HashSet<String>();
073:
074: /**
075: * Parsermode this configuration belongs to
076: */
077: public String parserMode = null;
078:
079: public plasmaParserConfig(String theParserMode) {
080: if (!plasmaParser.PARSER_MODE.contains(theParserMode)) {
081: throw new IllegalArgumentException("Unknown parser mode "
082: + theParserMode);
083: }
084:
085: this .parserMode = theParserMode;
086: }
087:
088: public boolean supportedContent(yacyURL url, String mimeType) {
089: // TODO: we need some exceptions here to index URLs like this
090: // http://www.musicabona.com/respighi/12668/cd/index.html.fr
091: mimeType = plasmaParser.normalizeMimeType(mimeType);
092: if (mimeType.equals("text/html")
093: || mimeType.equals("application/xhtml+xml")
094: || mimeType.equals("text/plain")) {
095: return supportedMimeTypesContains(mimeType);
096: }
097: return supportedMimeTypesContains(mimeType)
098: && supportedFileExt(url);
099: }
100:
101: public boolean supportedMimeTypesContains(String mimeType) {
102: mimeType = plasmaParser.normalizeMimeType(mimeType);
103:
104: synchronized (plasmaParser.supportedHTMLMimeTypes) {
105: if (plasmaParser.supportedHTMLMimeTypes.contains(mimeType))
106: return true;
107: }
108:
109: synchronized (this .enabledParserList) {
110: return this .enabledParserList.contains(mimeType);
111: }
112: }
113:
114: public boolean supportedFileExt(yacyURL url) {
115: if (url == null)
116: throw new NullPointerException();
117:
118: // getting the file path
119: String name = plasmaParser.getFileExt(url);
120: return supportedFileExtContains(name);
121: }
122:
123: public boolean supportedFileExtContains(String fileExt) {
124: if (fileExt == null)
125: return false;
126: fileExt = fileExt.trim().toLowerCase();
127:
128: synchronized (plasmaParser.supportedHTMLFileExt) {
129: if (plasmaParser.supportedHTMLFileExt.contains(fileExt))
130: return true;
131: }
132:
133: synchronized (this .supportedFileExt) {
134: return this .supportedFileExt.contains(fileExt);
135: }
136: }
137:
138: public void initParseableMimeTypes(String enabledMimeTypes) {
139: HashSet<String> mimeTypes = null;
140: if ((enabledMimeTypes == null)
141: || (enabledMimeTypes.length() == 0)) {
142: mimeTypes = new HashSet<String>();
143: } else {
144: String[] enabledMimeTypeList = enabledMimeTypes.split(",");
145: mimeTypes = new HashSet<String>(enabledMimeTypeList.length);
146: for (int i = 0; i < enabledMimeTypeList.length; i++)
147: mimeTypes.add(enabledMimeTypeList[i].toLowerCase()
148: .trim());
149: }
150: setEnabledParserList(mimeTypes);
151: }
152:
153: public void enableAllParsers() {
154: Set<String> availableMimeTypes = plasmaParser.availableParserList
155: .keySet();
156: setEnabledParserList(availableMimeTypes);
157: }
158:
159: public String[] setEnabledParserList(Set<String> mimeTypeSet) {
160:
161: HashSet<String> newEnabledParsers = new HashSet<String>();
162: HashSet<String> newSupportedFileExt = new HashSet<String>();
163:
164: if (mimeTypeSet != null) {
165: Iterator<String> mimeTypes = mimeTypeSet.iterator();
166: while (mimeTypes.hasNext()) {
167: String mimeType = (String) mimeTypes.next();
168: if (plasmaParser.availableParserList
169: .containsKey(mimeType)) {
170: Parser theParser = null;
171: try {
172: // getting the parser
173: theParser = plasmaParser
174: .makeParser(((ParserInfo) plasmaParser.availableParserList
175: .get(mimeType)).parserClassName);
176:
177: // getting a list of mimeTypes that the parser supports
178: Hashtable<String, String> parserSupportsMimeTypes = theParser
179: .getSupportedMimeTypes();
180: if (parserSupportsMimeTypes != null) {
181: Object supportedExtensions = parserSupportsMimeTypes
182: .get(mimeType);
183: if ((supportedExtensions != null)
184: && (supportedExtensions instanceof String)
185: && (((String) supportedExtensions)
186: .length() > 0)) {
187: String[] extArray = ((String) supportedExtensions)
188: .split(",");
189: newSupportedFileExt.addAll(Arrays
190: .asList(extArray));
191: }
192: }
193: newEnabledParsers.add(mimeType);
194:
195: } catch (Exception e) {
196: serverLog.logSevere("PARSER",
197: "error in setEnabledParserList", e);
198: } finally {
199: if (theParser != null)
200: theParser = null; // destroy object
201: }
202: }
203: }
204: }
205:
206: synchronized (this .enabledParserList) {
207: this .enabledParserList.clear();
208: this .enabledParserList.addAll(newEnabledParsers);
209: }
210:
211: synchronized (this .supportedFileExt) {
212: this .supportedFileExt.clear();
213: this .supportedFileExt.addAll(newSupportedFileExt);
214: }
215:
216: return (String[]) newEnabledParsers
217: .toArray(new String[newEnabledParsers.size()]);
218: }
219:
220: @SuppressWarnings("unchecked")
221: public HashSet<String> getEnabledParserList() {
222: synchronized (this .enabledParserList) {
223: return (HashSet<String>) this.enabledParserList.clone();
224: }
225: }
226: }
|