001: // abstractURLPattern.java
002: // -----------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2005
007: // (C) 2007 by Bjoern Krombholz
008: // last major change: 12. August 2006 (theli) ?
009: //
010: // $LastChangedDate: 2008-01-23 23:08:32 +0000 (Mi, 23 Jan 2008) $
011: // $LastChangedRevision: 4382 $
012: // $LastChangedBy: orbiter $
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027: //
028: // Using this software in any meaning (reading, learning, copying, compiling,
029: // running) means that you agree that the Author(s) is (are) not responsible
030: // for cost, loss of data or any harm that may be caused directly or indirectly
031: // by usage of this softare or this documentation. The usage of this software
032: // is on your own risk. The installation and usage (starting/running) of this
033: // software may allow other people or application to access your computer and
034: // any attached devices and is highly dependent on the configuration of the
035: // software which must be done by the user of the software; the author(s) is
036: // (are) also not responsible for proper configuration and usage of the
037: // software, even if provoked by documentation provided together with
038: // the software.
039: //
040: // Any changes to this file according to the GPL as documented in the file
041: // gpl.txt aside this file in the shipment you received can be done to the
042: // lines that follows this copyright notice here, but changes must not be
043: // done inside the copyright notive above. A re-distribution must contain
044: // the intact and unchanged copyright notice.
045: // Contributions and changes to the program code must be marked as such.
046:
047: package de.anomic.plasma.urlPattern;
048:
049: import java.io.File;
050: import java.io.IOException;
051: import java.util.ArrayList;
052: import java.util.Arrays;
053: import java.util.Collections;
054: import java.util.HashMap;
055: import java.util.HashSet;
056: import java.util.Iterator;
057: import java.util.Map;
058: import java.util.Set;
059: import de.anomic.kelondro.kelondroMSetTools;
060: import de.anomic.yacy.yacyURL;
061:
062: public abstract class abstractURLPattern implements plasmaURLPattern {
063:
064: protected static final HashSet<String> BLACKLIST_TYPES = new HashSet<String>(
065: Arrays.asList(new String[] {
066: plasmaURLPattern.BLACKLIST_CRAWLER,
067: plasmaURLPattern.BLACKLIST_PROXY,
068: plasmaURLPattern.BLACKLIST_DHT,
069: plasmaURLPattern.BLACKLIST_SEARCH,
070: plasmaURLPattern.BLACKLIST_SURFTIPS,
071: plasmaURLPattern.BLACKLIST_NEWS }));
072: public static final String BLACKLIST_TYPES_STRING = "proxy,crawler,dht,search,surftips,news";
073:
074: protected File blacklistRootPath = null;
075: protected HashMap<String, Set<String>> cachedUrlHashs = null;
076: protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
077:
078: public abstractURLPattern(File rootPath) {
079: this .setRootPath(rootPath);
080:
081: this .blacklistRootPath = rootPath;
082:
083: // prepare the data structure
084: this .hostpaths = new HashMap<String, HashMap<String, ArrayList<String>>>();
085: this .cachedUrlHashs = new HashMap<String, Set<String>>();
086:
087: Iterator<String> iter = BLACKLIST_TYPES.iterator();
088: while (iter.hasNext()) {
089: String blacklistType = (String) iter.next();
090: this .hostpaths.put(blacklistType,
091: new HashMap<String, ArrayList<String>>());
092: this .cachedUrlHashs.put(blacklistType, Collections
093: .synchronizedSet(new HashSet<String>()));
094: }
095: }
096:
097: public void setRootPath(File rootPath) {
098: if (rootPath == null)
099: throw new NullPointerException(
100: "The blacklist root path must not be null.");
101: if (!rootPath.isDirectory())
102: throw new IllegalArgumentException(
103: "The blacklist root path is not a directory.");
104: if (!rootPath.canRead())
105: throw new IllegalArgumentException(
106: "The blacklist root path is not readable.");
107:
108: this .blacklistRootPath = rootPath;
109: }
110:
111: protected HashMap<String, ArrayList<String>> getBlacklistMap(
112: String blacklistType) {
113: if (blacklistType == null)
114: throw new IllegalArgumentException();
115: if (!BLACKLIST_TYPES.contains(blacklistType))
116: throw new IllegalArgumentException(
117: "Unknown blacklist type: " + blacklistType + ".");
118:
119: return this .hostpaths.get(blacklistType);
120: }
121:
122: protected Set<String> getCacheUrlHashsSet(String blacklistType) {
123: if (blacklistType == null)
124: throw new IllegalArgumentException();
125: if (!BLACKLIST_TYPES.contains(blacklistType))
126: throw new IllegalArgumentException("Unknown backlist type.");
127:
128: return this .cachedUrlHashs.get(blacklistType);
129: }
130:
131: public void clear() {
132: Iterator<HashMap<String, ArrayList<String>>> iter = this .hostpaths
133: .values().iterator();
134: Iterator<Set<String>> cIter = this .cachedUrlHashs.values()
135: .iterator();
136: while (iter.hasNext()) {
137: iter.next().clear();
138: }
139: while (cIter.hasNext()) {
140: // clear caches as well to avoid wrong/outdated matches after changing lists
141: cIter.next().clear();
142: }
143: }
144:
145: public int size() {
146: int size = 0;
147: Iterator<String> iter = this .hostpaths.keySet().iterator();
148: while (iter.hasNext()) {
149: Iterator<ArrayList<String>> blIter = this .hostpaths.get(
150: iter.next()).values().iterator();
151: while (blIter.hasNext())
152: size += blIter.next().size();
153: }
154: return size;
155: }
156:
157: public void loadList(blacklistFile[] blFiles, String sep) {
158: for (int j = 0; j < blFiles.length; j++) {
159: blacklistFile blf = blFiles[j];
160: loadList(blf.getType(), blf.getFileName(), sep);
161: }
162: }
163:
164: public void loadList(blacklistFile blFile, String sep) {
165: HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blFile
166: .getType());
167: Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist;
168: Map.Entry<String, ArrayList<String>> loadedEntry;
169: ArrayList<String> paths;
170: ArrayList<String> loadedPaths;
171:
172: String[] fileNames = blFile.getFileNamesUnified();
173: if (fileNames.length > 0) {
174: for (int i = 0; i < fileNames.length; i++) {
175: // make sure all requested blacklist files exist
176: File file = new File(this .blacklistRootPath,
177: fileNames[i]);
178: try {
179: file.createNewFile();
180: } catch (IOException e) { /* */
181: }
182:
183: // join all blacklists from files into one internal blacklist map
184: loadedBlacklist = kelondroMSetTools
185: .loadMapMultiValsPerKey(file.toString(), sep)
186: .entrySet();
187: for (Iterator<Map.Entry<String, ArrayList<String>>> mi = loadedBlacklist
188: .iterator(); mi.hasNext();) {
189: loadedEntry = mi.next();
190: loadedPaths = loadedEntry.getValue();
191:
192: // create new entry if host mask unknown, otherwise merge
193: // existing one with path patterns from blacklist file
194: paths = blacklistMap.get(loadedEntry.getKey());
195: if (paths == null) {
196: blacklistMap.put(loadedEntry.getKey(),
197: loadedPaths);
198: } else {
199: // TODO check for duplicates? (refactor List -> Set)
200: paths.addAll(loadedPaths);
201: }
202: }
203: }
204: }
205: }
206:
207: public void loadList(String blacklistType, String fileNames,
208: String sep) {
209: // method for not breaking older plasmaURLPattern interface
210: blacklistFile blFile = new blacklistFile(fileNames,
211: blacklistType);
212:
213: loadList(blFile, sep);
214: }
215:
216: public void removeAll(String blacklistType, String host) {
217: HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType);
218: blacklistMap.remove(host);
219: }
220:
221: public void remove(String blacklistType, String host, String path) {
222: HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType);
223: ArrayList<String> hostList = blacklistMap.get(host);
224: hostList.remove(path);
225: if (hostList.size() == 0)
226: blacklistMap.remove(host);
227: }
228:
229: public void add(String blacklistType, String host, String path) {
230: if (host == null)
231: throw new NullPointerException();
232: if (path == null)
233: throw new NullPointerException();
234:
235: if (path.length() > 0 && path.charAt(0) == '/')
236: path = path.substring(1);
237:
238: HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType);
239: ArrayList<String> hostList = blacklistMap.get(host
240: .toLowerCase());
241: if (hostList == null)
242: blacklistMap.put(host.toLowerCase(),
243: (hostList = new ArrayList<String>()));
244: hostList.add(path);
245: }
246:
247: public int blacklistCacheSize() {
248: int size = 0;
249: Iterator<String> iter = this .cachedUrlHashs.keySet().iterator();
250: while (iter.hasNext()) {
251: Set<String> blacklistMap = this .cachedUrlHashs.get(iter
252: .next());
253: size += blacklistMap.size();
254: }
255: return size;
256: }
257:
258: public boolean hashInBlacklistedCache(String blacklistType,
259: String urlHash) {
260: Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
261: return urlHashCache.contains(urlHash);
262: }
263:
264: public boolean isListed(String blacklistType, yacyURL url) {
265:
266: Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
267: if (!urlHashCache.contains(url.hash())) {
268: boolean temp = isListed(blacklistType, url.getHost()
269: .toLowerCase(), url.getFile());
270: if (temp) {
271: urlHashCache.add(url.hash());
272: }
273: return temp;
274: }
275: return true;
276: }
277:
278: }
|