001: // plasmaCrawlProfile.java
002: // ------------------------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2004
007: // last major change: 25.02.2004
008: //
009: // This program is free software; you can redistribute it and/or modify
010: // it under the terms of the GNU General Public License as published by
011: // the Free Software Foundation; either version 2 of the License, or
012: // (at your option) any later version.
013: //
014: // This program is distributed in the hope that it will be useful,
015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: // GNU General Public License for more details.
018: //
019: // You should have received a copy of the GNU General Public License
020: // along with this program; if not, write to the Free Software
021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: //
023: // Using this software in any meaning (reading, learning, copying, compiling,
024: // running) means that you agree that the Author(s) is (are) not responsible
025: // for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
027: // is on your own risk. The installation and usage (starting/running) of this
028: // software may allow other people or application to access your computer and
029: // any attached devices and is highly dependent on the configuration of the
030: // software which must be done by the user of the software; the author(s) is
031: // (are) also not responsible for proper configuration and usage of the
032: // software, even if provoked by documentation provided together with
033: // the software.
034: //
035: // Any changes to this file according to the GPL as documented in the file
036: // gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
039: // the intact and unchanged copyright notice.
040: // Contributions and changes to the program code must be marked as such.
041:
042: package de.anomic.plasma;
043:
044: import java.io.File;
045: import java.io.IOException;
046: import java.util.HashMap;
047: import java.util.HashSet;
048: import java.util.Iterator;
049: import java.util.Map;
050:
051: import de.anomic.kelondro.kelondroBase64Order;
052: import de.anomic.kelondro.kelondroCloneableIterator;
053: import de.anomic.kelondro.kelondroDyn;
054: import de.anomic.kelondro.kelondroException;
055: import de.anomic.kelondro.kelondroMapObjects;
056: import de.anomic.kelondro.kelondroNaturalOrder;
057: import de.anomic.server.serverCodings;
058: import de.anomic.yacy.yacySeedDB;
059: import de.anomic.yacy.yacyURL;
060:
061: public class plasmaCrawlProfile {
062:
063: private static HashMap<String, Map<String, DomProfile>> domsCache = new HashMap<String, Map<String, DomProfile>>();
064:
065: private kelondroMapObjects profileTable;
066: private File profileTableFile;
067: private long preloadTime;
068:
069: public plasmaCrawlProfile(File file, long preloadTime) {
070: this .profileTableFile = file;
071: this .preloadTime = preloadTime;
072: profileTableFile.getParentFile().mkdirs();
073: kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true,
074: preloadTime, yacySeedDB.commonHashLength, 2000, '#',
075: kelondroNaturalOrder.naturalOrder, false, false, true);
076: profileTable = new kelondroMapObjects(dyn, 500);
077: }
078:
079: private void resetDatabase() {
080: // deletes the profile database and creates a new one
081: if (profileTable != null)
082: profileTable.close();
083: if (!(profileTableFile.delete()))
084: throw new RuntimeException(
085: "cannot delete crawl profile database");
086: profileTableFile.getParentFile().mkdirs();
087: kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true,
088: preloadTime, yacySeedDB.commonHashLength, 2000, '#',
089: kelondroNaturalOrder.naturalOrder, false, false, true);
090: profileTable = new kelondroMapObjects(dyn, 500);
091: }
092:
093: public void close() {
094: profileTable.close();
095: }
096:
097: public int size() {
098: return profileTable.size();
099: }
100:
101: public Iterator<entry> profiles(boolean up) {
102: // enumerates profile entries
103: try {
104: return new profileIterator(up);
105: } catch (IOException e) {
106: return new HashSet<entry>().iterator();
107: }
108: }
109:
110: public class profileIterator implements Iterator<entry> {
111: // the iterator iterates all keys, which are byte[] objects
112: kelondroCloneableIterator<String> handleIterator;
113: String lastkey;
114:
115: public profileIterator(boolean up) throws IOException {
116: handleIterator = profileTable.keys(up, false);
117: lastkey = null;
118: }
119:
120: public boolean hasNext() {
121: try {
122: return handleIterator.hasNext();
123: } catch (kelondroException e) {
124: resetDatabase();
125: return false;
126: }
127: }
128:
129: public entry next() {
130: try {
131: lastkey = (String) handleIterator.next();
132: return getEntry(lastkey);
133: } catch (kelondroException e) {
134: resetDatabase();
135: return null;
136: }
137: }
138:
139: public void remove() {
140: if (lastkey != null)
141: try {
142: removeEntry(lastkey);
143: } catch (kelondroException e) {
144: resetDatabase();
145: }
146: }
147: }
148:
149: public void removeEntry(String handle) {
150: try {
151: profileTable.remove(handle);
152: } catch (IOException e) {
153: }
154: }
155:
156: public entry newEntry(HashMap<String, String> mem) {
157: entry ne = new entry(mem);
158: try {
159: profileTable.set(ne.handle(), ne.map());
160: } catch (kelondroException e) {
161: resetDatabase();
162: try {
163: profileTable.set(ne.handle(), ne.map());
164: } catch (IOException ee) {
165: e.printStackTrace();
166: System.exit(0);
167: }
168: } catch (IOException e) {
169: resetDatabase();
170: try {
171: profileTable.set(ne.handle(), ne.map());
172: } catch (IOException ee) {
173: e.printStackTrace();
174: System.exit(0);
175: }
176: }
177: return ne;
178: }
179:
180: public entry newEntry(String name, yacyURL startURL,
181: String generalFilter, String specificFilter,
182: int generalDepth, int specificDepth,
183: int recrawlIfOlder /*minutes*/, int domFilterDepth,
184: int domMaxPages, boolean crawlingQ, boolean indexText,
185: boolean indexMedia, boolean storeHTCache,
186: boolean storeTXCache, boolean remoteIndexing,
187: boolean xsstopw, boolean xdstopw, boolean xpstopw) {
188:
189: entry ne = new entry(name, startURL, generalFilter,
190: specificFilter, generalDepth, specificDepth,
191: recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ,
192: indexText, indexMedia, storeHTCache, storeTXCache,
193: remoteIndexing, xsstopw, xdstopw, xpstopw);
194: try {
195: profileTable.set(ne.handle(), ne.map());
196: } catch (kelondroException e) {
197: resetDatabase();
198: try {
199: profileTable.set(ne.handle(), ne.map());
200: } catch (IOException ee) {
201: e.printStackTrace();
202: System.exit(0);
203: }
204: } catch (IOException e) {
205: resetDatabase();
206: try {
207: profileTable.set(ne.handle(), ne.map());
208: } catch (IOException ee) {
209: e.printStackTrace();
210: System.exit(0);
211: }
212: }
213: return ne;
214: }
215:
216: public entry getEntry(String handle) {
217: HashMap<String, String> m = profileTable.getMap(handle);
218: if (m == null)
219: return null;
220: return new entry(m);
221: }
222:
223: public void changeEntry(entry e, String propName, String newValue)
224: throws IOException {
225: e.mem.put(propName, newValue);
226: profileTable.set(e.handle(), e.mem);
227: }
228:
229: public static class DomProfile {
230:
231: public String referrer;
232: public int depth, count;
233:
234: public DomProfile(String ref, int d) {
235: this .referrer = ref;
236: this .depth = d;
237: this .count = 1;
238: }
239:
240: public void inc() {
241: this .count++;
242: }
243:
244: }
245:
    /**
     * One crawl profile: a simple record structure that holds all properties
     * of a single crawl start. Values are kept as strings in a map so the
     * record can be persisted directly in the kelondro table; the typed
     * accessors parse the strings on demand and return defensive defaults
     * when a key is missing or unparsable. Per-domain statistics (doms) live
     * only in memory and are shared between entry instances with the same
     * handle via the enclosing class's static domsCache.
     */
    public static class entry {
        // this is a simple record structure that hold all properties of a single crawl start

        // keys of the properties inside the backing map
        public static final String HANDLE = "handle";
        public static final String NAME = "name";
        public static final String START_URL = "startURL";
        public static final String GENERAL_FILTER = "generalFilter";
        public static final String SPECIFIC_FILTER = "specificFilter";
        public static final String GENERAL_DEPTH = "generalDepth";
        public static final String SPECIFIC_DEPTH = "specificDepth";
        public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
        public static final String DOM_FILTER_DEPTH = "domFilterDepth";
        public static final String DOM_MAX_PAGES = "domMaxPages";
        public static final String CRAWLING_Q = "crawlingQ";
        public static final String INDEX_TEXT = "indexText";
        public static final String INDEX_MEDIA = "indexMedia";
        public static final String STORE_HTCACHE = "storeHTCache";
        public static final String STORE_TXCACHE = "storeTXCache";
        public static final String REMOTE_INDEXING = "remoteIndexing";
        public static final String XSSTOPW = "xsstopw";
        public static final String XDSTOPW = "xdstopw";
        public static final String XPSTOPW = "xpstopw";

        private HashMap<String, String> mem; // the persisted profile properties
        private Map<String, DomProfile> doms; // in-memory per-domain statistics

        /**
         * Creates a new profile from explicit crawl parameters. The handle
         * (table key) is the start URL's hash, or an MD5 of the current time
         * when no start URL is given.
         */
        public entry(String name, yacyURL startURL,
                String generalFilter, String specificFilter,
                int generalDepth, int specificDepth,
                int recrawlIfOlder /*minutes*/, int domFilterDepth,
                int domMaxPages, boolean crawlingQ, boolean indexText,
                boolean indexMedia, boolean storeHTCache,
                boolean storeTXCache, boolean remoteIndexing,
                boolean xsstopw, boolean xdstopw, boolean xpstopw) {
            if (name == null || name.length() == 0)
                throw new NullPointerException("name must not be null");
            // derive a unique handle when there is no start URL to hash
            String handle = (startURL == null) ? kelondroBase64Order.enhancedCoder
                    .encode(
                            serverCodings.encodeMD5Raw(Long
                                    .toString(System
                                            .currentTimeMillis())))
                    .substring(0, yacySeedDB.commonHashLength)
                    : startURL.hash();
            mem = new HashMap<String, String>();
            mem.put(HANDLE, handle);
            mem.put(NAME, name);
            mem.put(START_URL, (startURL == null) ? "" : startURL
                    .toNormalform(true, false));
            // null filters default to the match-everything pattern
            mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*"
                    : generalFilter);
            mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*"
                    : specificFilter);
            mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
            mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
            mem.put(RECRAWL_IF_OLDER, Integer.toString(recrawlIfOlder));
            mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
            mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
            mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
            mem.put(INDEX_TEXT, Boolean.toString(indexText));
            mem.put(INDEX_MEDIA, Boolean.toString(indexMedia));
            mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache));
            mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache));
            mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing));
            mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
            mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
            mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words

            doms = new HashMap<String, DomProfile>();
        }

        /** @return the string form of the property map (for debugging) */
        public String toString() {
            StringBuffer str = new StringBuffer();

            if (this .mem != null) {
                str.append(this .mem.toString());
            }

            return str.toString();
        }

        /**
         * Re-creates a profile from its persisted property map and re-attaches
         * the in-memory domain statistics cached under its handle, if any.
         */
        public entry(HashMap<String, String> mem) {
            this .mem = mem;
            // unchecked cast: domsCache values are the HashMaps put there by domInc
            this .doms = (HashMap<String, DomProfile>) domsCache
                    .get(this .mem.get(HANDLE));
            if (this .doms == null)
                this .doms = new HashMap<String, DomProfile>();
        }

        /** @return the raw property map as stored in the profile table */
        public HashMap<String, String> map() {
            return mem;
        }

        /** @return the profile handle (table key), or null if absent */
        public String handle() {
            String r = (String) mem.get(HANDLE);
            if (r == null)
                return null;
            else
                return r;
        }

        /** @return the profile name, or "" if absent */
        public String name() {
            String r = (String) mem.get(NAME);
            if (r == null)
                return "";
            else
                return r;
        }

        /** @return the normalized start URL string, or null if absent */
        public String startURL() {
            String r = (String) mem.get(START_URL);
            if (r == null)
                return null;
            else
                return r;
        }

        /** @return the general URL filter regex, ".*" if absent */
        public String generalFilter() {
            String r = (String) mem.get(GENERAL_FILTER);
            if (r == null)
                return ".*";
            else
                return r;
        }

        /** @return the specific URL filter regex, ".*" if absent */
        public String specificFilter() {
            String r = (String) mem.get(SPECIFIC_FILTER);
            if (r == null)
                return ".*";
            else
                return r;
        }

        /** @return the general crawl depth; 0 when missing or unparsable */
        public int generalDepth() {
            String r = (String) mem.get(GENERAL_DEPTH);
            if (r == null)
                return 0;
            else
                try {
                    return Integer.parseInt(r);
                } catch (NumberFormatException e) {
                    return 0;
                }
        }

        /** @return the specific crawl depth; 0 when missing or unparsable */
        public int specificDepth() {
            String r = (String) mem.get(SPECIFIC_DEPTH);
            if (r == null)
                return 0;
            else
                try {
                    return Integer.parseInt(r);
                } catch (NumberFormatException e) {
                    return 0;
                }
        }

        public long recrawlIfOlder() {
            // returns a long (millis) that is the minimum age that
            // an entry must have to be re-crawled
            // stored value is in minutes; negative or bad values mean "never"
            String r = (String) mem.get(RECRAWL_IF_OLDER);
            if (r == null)
                return Long.MAX_VALUE;
            else
                try {
                    long l = Long.parseLong(r) * 60000L;
                    return (l < 0) ? Long.MAX_VALUE : l;
                } catch (NumberFormatException e) {
                    return Long.MAX_VALUE;
                }
        }

        public int domFilterDepth() {
            // if the depth is equal or less to this depth,
            // then the current url feeds with its domain the crawl filter
            // if this is -1, all domains are feeded
            // negative or missing values are normalized to MAX_VALUE = unlimited
            String r = (String) mem.get(DOM_FILTER_DEPTH);
            if (r == null)
                return Integer.MAX_VALUE;
            else
                try {
                    int i = Integer.parseInt(r);
                    if (i < 0)
                        return Integer.MAX_VALUE;
                    return i;
                } catch (NumberFormatException e) {
                    return Integer.MAX_VALUE;
                }
        }

        public int domMaxPages() {
            // this is the maximum number of pages that are crawled for a single domain
            // if -1, this means no limit
            // negative or missing values are normalized to MAX_VALUE = unlimited
            String r = (String) mem.get(DOM_MAX_PAGES);
            if (r == null)
                return Integer.MAX_VALUE;
            else
                try {
                    int i = Integer.parseInt(r);
                    if (i < 0)
                        return Integer.MAX_VALUE;
                    return i;
                } catch (NumberFormatException e) {
                    return Integer.MAX_VALUE;
                }
        }

        /** @return whether URLs containing '?' are crawled; false if absent */
        public boolean crawlingQ() {
            String r = (String) mem.get(CRAWLING_Q);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether page text is indexed; true if absent */
        public boolean indexText() {
            String r = (String) mem.get(INDEX_TEXT);
            if (r == null)
                return true;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether media content is indexed; true if absent */
        public boolean indexMedia() {
            String r = (String) mem.get(INDEX_MEDIA);
            if (r == null)
                return true;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether pages are stored in the HT cache; false if absent */
        public boolean storeHTCache() {
            String r = (String) mem.get(STORE_HTCACHE);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether pages are stored in the TX cache; false if absent */
        public boolean storeTXCache() {
            String r = (String) mem.get(STORE_TXCACHE);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether remote peers may index for this crawl; false if absent */
        public boolean remoteIndexing() {
            String r = (String) mem.get(REMOTE_INDEXING);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether static stop-words are excluded; false if absent */
        public boolean excludeStaticStopwords() {
            String r = (String) mem.get(XSSTOPW);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether dynamic stop-words are excluded; false if absent */
        public boolean excludeDynamicStopwords() {
            String r = (String) mem.get(XDSTOPW);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /** @return whether parent stop-words are excluded; false if absent */
        public boolean excludeParentStopwords() {
            String r = (String) mem.get(XPSTOPW);
            if (r == null)
                return false;
            else
                return (r.equals(Boolean.TRUE.toString()));
        }

        /**
         * Records one appearance of a domain: creates a DomProfile on first
         * sight, increments its counter otherwise, and publishes the doms map
         * in the static domsCache under this profile's handle.
         * NOTE(review): synchronizes on the interned domain string; interned
         * strings are JVM-global, so unrelated code locking the same literal
         * would contend here — confirm this is intended.
         */
        public void domInc(String domain, String referrer, int depth) {
            synchronized (domain.intern()) {
                DomProfile dp = (DomProfile) doms.get(domain);
                if (dp == null) {
                    // new domain
                    doms.put(domain, new DomProfile(referrer, depth));
                } else {
                    // increase counter
                    dp.inc();
                }
            }
            domsCache.put(this .mem.get(HANDLE), doms);
        }

        /**
         * @return true when the domain may (still) appear in the crawl
         *         according to domFilterDepth; unknown domains pass when the
         *         filter depth is positive
         */
        public boolean grantedDomAppearance(String domain) {
            int max = domFilterDepth();
            if (max == Integer.MAX_VALUE)
                return true; // unlimited: no filtering at all
            synchronized (domain.intern()) {
                DomProfile dp = (DomProfile) doms.get(domain);
                if (dp == null) {
                    return 0 < max;
                } else {
                    return dp.depth <= max;
                }
            }
        }

        /**
         * @return true when the per-domain page limit (domMaxPages) has not
         *         yet been reached for the given domain
         */
        public boolean grantedDomCount(String domain) {
            int max = domMaxPages();
            if (max == Integer.MAX_VALUE)
                return true; // unlimited: no counting needed
            synchronized (domain.intern()) {
                DomProfile dp = (DomProfile) doms.get(domain);
                if (dp == null) {
                    return 0 < max;
                } else {
                    return dp.count <= max;
                }
            }
        }

        /** @return the number of distinct domains seen so far */
        public int domSize() {
            return doms.size();
        }

        /**
         * @return true when the domain was already seen, or trivially true
         *         when domain filtering is disabled (unlimited filter depth)
         */
        public boolean domExists(String domain) {
            if (domFilterDepth() == Integer.MAX_VALUE)
                return true;
            return doms.containsKey(domain);
        }

        /**
         * Returns the name of the index-th domain (in map iteration order,
         * so the order is not stable across calls if doms changes).
         *
         * @param attr  when true, append "/r=…, d=…, c=…" statistics
         * @param index zero-based position in the iteration
         * @return the domain name (possibly with statistics), or "" when
         *         index is out of range
         */
        public String domName(boolean attr, int index) {
            Iterator<Map.Entry<String, DomProfile>> domnamesi = doms
                    .entrySet().iterator();
            String domname = "";
            Map.Entry<String, DomProfile> ey;
            DomProfile dp;
            int i = 0;
            // skip the first 'index' entries
            while ((domnamesi.hasNext()) && (i < index)) {
                ey = domnamesi.next();
                i++;
            }
            if (domnamesi.hasNext()) {
                ey = domnamesi.next();
                dp = ey.getValue();
                domname = ey.getKey()
                        + ((attr) ? ("/r=" + dp.referrer + ", d="
                                + dp.depth + ", c=" + dp.count) : " ");
            }
            return domname;
        }
    }
598: }
|