// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-02-03 21:47:27 +0000 (So, 03 Feb 2008) $
// $LastChangedRevision: 4439 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexCollectionRI;
import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexRAMRI;
import de.anomic.index.indexRI;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroByteOrder;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRotateIterator;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverMemory;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyDHTAction;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

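/**
 * The central reverse word index (RWI) of a YaCy peer. References are first
 * buffered in two RAM caches: dhtInCache holds entries for word hashes that
 * this peer is responsible for (see yacyDHTAction.shallBeOwnWord), dhtOutCache
 * holds all other entries. Both caches are flushed in chunks into the
 * on-disk collection index.
 */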
public final class plasmaWordIndex implements indexRI {

    // environment constants
    public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash
    public static final int lowcachedivisor = 320;
    public static final int maxCollectionPartition = 7; // should be 7

    private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder;
    private final indexRAMRI dhtOutCache, dhtInCache;
    private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
    public boolean busyCacheFlush; // indicates whether a cache flush is currently in progress
    private int flushsize;
    public final plasmaCrawlLURL loadedURL;

    public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) {
        File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE");
        if (!(textindexcache.exists())) textindexcache.mkdirs();
        this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIRowEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log);
        this.dhtInCache = new indexRAMRI(textindexcache, indexRWIRowEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log);

        // create collections storage path
        File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
        this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIRowEntry.urlEntryRow);

        // create LURL-db
        loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime);

        // performance settings
        busyCacheFlush = false;
        this.flushsize = 2000;
    }

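    /**
     * @return a rough estimate of the minimum heap memory (in bytes) needed
     *         to operate the two RAM caches and the collection index,
     *         including a fixed allowance for indexing overhead
     */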
    public int minMem() {
        return 1024 * 1024 /* indexing overhead */ + dhtOutCache.minMem() + dhtInCache.minMem() + collections.minMem();
    }

    public int maxURLinDHTOutCache() {
        return dhtOutCache.maxURLinCache();
    }

    public long minAgeOfDHTOutCache() {
        return dhtOutCache.minAgeOfCache();
    }

    public long maxAgeOfDHTOutCache() {
        return dhtOutCache.maxAgeOfCache();
    }

    public int maxURLinDHTInCache() {
        return dhtInCache.maxURLinCache();
    }

    public long minAgeOfDHTInCache() {
        return dhtInCache.minAgeOfCache();
    }

    public long maxAgeOfDHTInCache() {
        return dhtInCache.maxAgeOfCache();
    }

    public int dhtOutCacheSize() {
        return dhtOutCache.size();
    }

    public int dhtInCacheSize() {
        return dhtInCache.size();
    }

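    /**
     * Computes the size in bytes of one of the two RAM caches by counting all
     * cached references and multiplying by the fixed row size of an index entry.
     *
     * @param in true for the DHT-In cache, false for the DHT-Out cache
     */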
    public long dhtCacheSizeBytes(boolean in) {
        // calculate the real size in bytes of DHT-In/Out-Cache
        long cacheBytes = 0;
        long entryBytes = indexRWIRowEntry.urlEntryRow.objectsize;
        indexRAMRI cache = (in ? dhtInCache : dhtOutCache);
        synchronized (cache) {
            Iterator<indexContainer> it = cache.wordContainers(null, false);
            while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
        }
        return cacheBytes;
    }

    public void setMaxWordCount(int maxWords) {
        dhtOutCache.setMaxWordCount(maxWords);
        dhtInCache.setMaxWordCount(maxWords);
    }

    public void setWordFlushSize(int flushsize) {
        this.flushsize = flushsize;
    }

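    /**
     * Checks whether the given cache must be flushed. A flush is forced when a
     * single word has collected more than wCacheMaxChunk references, when the
     * cache exceeds its configured maximum word count, or when available
     * memory falls below the minimum required by the collection index.
     */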
    public void dhtFlushControl(indexRAMRI theCache) {
        // check for forced flush
        int count = -1;
        synchronized (theCache) {
            if ((theCache.maxURLinCache() > wCacheMaxChunk) ||
                (theCache.size() > theCache.getMaxWordCount()) ||
                (serverMemory.available() < collections.minMem())) {
                count = theCache.size() + flushsize - theCache.getMaxWordCount();
            }
        }
        if (count >= 0) flushCache(theCache, (count > 0) ? count : 1);
    }

    public long getUpdateTime(String wordHash) {
        indexContainer entries = getContainer(wordHash, null);
        if (entries == null) return 0;
        return entries.updated();
    }

    public static indexContainer emptyContainer(String wordHash, int elementCount) {
        return new indexContainer(wordHash, indexRWIRowEntry.urlEntryRow, elementCount);
    }

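    /**
     * Adds a single index reference. The entry is routed to the DHT-In cache
     * if dhtInCase is set or this peer is responsible for the word hash,
     * otherwise to the DHT-Out cache; the receiving cache is then checked
     * for a flush.
     */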
    public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
        // set dhtInCase depending on wordHash
        if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true;

        // add the entry
        if (dhtInCase) {
            dhtInCache.addEntry(wordHash, entry, updateTime, true);
            dhtFlushControl(this.dhtInCache);
        } else {
            dhtOutCache.addEntry(wordHash, entry, updateTime, false);
            dhtFlushControl(this.dhtOutCache);
        }
    }

    public void addEntries(indexContainer entries) {
        addEntries(entries, false);
    }

    public void addEntries(indexContainer entries, boolean dhtInCase) {
        assert (entries.row().objectsize == indexRWIRowEntry.urlEntryRow.objectsize);

        // set dhtInCase depending on wordHash
        if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true;

        // add the entries
        if (dhtInCase) {
            dhtInCache.addEntries(entries);
            dhtFlushControl(this.dhtInCache);
        } else {
            dhtOutCache.addEntries(entries);
            dhtFlushControl(this.dhtOutCache);
        }
    }

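    /**
     * Flushes a moderate amount of data from both RAM caches: the full
     * flushsize if a cache holds more than three times that many words,
     * otherwise a small fraction of the cache (but at least one container).
     */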
    public void flushCacheSome() {
        flushCache(dhtOutCache, (dhtOutCache.size() > 3 * flushsize) ? flushsize : Math.min(flushsize, Math.max(1, dhtOutCache.size() / lowcachedivisor)));
        flushCache(dhtInCache, (dhtInCache.size() > 3 * flushsize) ? flushsize : Math.min(flushsize, Math.max(1, dhtInCache.size() / lowcachedivisor)));
    }

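    /**
     * Moves up to count word containers (capped at 5000) from the given RAM
     * cache into the on-disk collection index. Containers that grew beyond
     * wCacheMaxChunk references are flushed first; the transfer stops early
     * when available memory runs low.
     */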
    private void flushCache(indexRAMRI ram, int count) {
        if (count <= 0) return;

        busyCacheFlush = true;
        String wordHash;
        ArrayList<indexContainer> containerList = new ArrayList<indexContainer>();
        count = Math.min(5000, Math.min(count, ram.size()));
        try {
            boolean collectMax = true;
            indexContainer c;
            while (collectMax) {
                synchronized (ram) {
                    wordHash = ram.maxScoreWordHash();
                    c = ram.getContainer(wordHash, null);
                    if ((c != null) && (c.size() > wCacheMaxChunk)) {
                        containerList.add(ram.deleteContainer(wordHash));
                        if (serverMemory.available() < collections.minMem()) break; // protect memory during flush
                    } else {
                        collectMax = false;
                    }
                }
            }
            count = count - containerList.size();
            for (int i = 0; i < count; i++) { // possible position of outOfMemoryError ?
                synchronized (ram) {
                    if (ram.size() == 0) break;
                    if (serverMemory.available() < collections.minMem()) break; // protect memory during flush

                    // select one word to flush
                    wordHash = ram.bestFlushWordHash();

                    // move one container from ram to the flush list
                    c = ram.deleteContainer(wordHash);
                }
                if (c != null) containerList.add(c);
            }
            // flush the containers
            collections.addMultipleEntries(containerList);
            //System.out.println("DEBUG-Finished flush of " + count + " entries from RAM to DB in " + (System.currentTimeMillis() - start) + " milliseconds");
        } finally {
            // reset the flag even if the flush fails; otherwise it would stay set forever
            busyCacheFlush = false;
        }
    }

    private static final int hour = 3600000;
    private static final int day = 86400000;

    public static int microDateDays(Date modified) {
        return microDateDays(modified.getTime());
    }

    public static int microDateDays(long modified) {
        // this calculates a virtual age from a given date
        // the purpose is to have an age in days of a given modified date
        // from a fixed standpoint in the past
        // one day has 60*60*24 seconds = 86400 seconds
        // we take the value mod 64**3 = 262144, which is the capacity of the three-character base64 storage
        return (int) ((modified / day) % 262144);
    }
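
    // Example: 2008-02-03 lies 13912 days after the Unix epoch, so
    // microDateDays() yields 13912 for any timestamp of that day; since
    // 13912 < 262144 = 64**3, the value fits into three base64 characters.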

    public static String microDateHoursStr(long time) {
        return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
    }

    public static int microDateHoursInt(long time) {
        return (int) ((time / hour) % 262144);
    }

    public static int microDateHoursAge(String mdhs) {
        return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
    }

    public static long reverseMicroDateDays(int microDateDays) {
        return ((long) microDateDays) * ((long) day);
    }

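    /**
     * Indexes all words of one parsed document; called by the switchboard
     * when a new page is added to the index. One reference entry is created
     * for every distinct word produced by the condenser.
     *
     * @return the number of words that were put into the index
     */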
    public int addPageIndex(yacyURL url, Date urlModified, int size, plasmaParserDocument document, plasmaCondenser condenser, String language, char doctype, int outlinksSame, int outlinksOther) {
        // this is called by the switchboard to put a new page into the index
        // use all the words in one condenser object to simultaneously create index entries

        int wordCount = 0;
        int urlLength = url.toNormalform(true, true).length();
        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;

        // iterate over all words of the context text
        Iterator<Map.Entry<String, plasmaCondenser.wordStatProp>> i = condenser.words().entrySet().iterator();
        Map.Entry<String, plasmaCondenser.wordStatProp> wentry;
        String word;
        indexRWIEntry ientry;
        plasmaCondenser.wordStatProp wprop;
        while (i.hasNext()) {
            wentry = i.next();
            word = wentry.getKey();
            wprop = wentry.getValue();
            assert (wprop.flags != null);
            ientry = new indexRWIRowEntry(url.hash(), urlLength, urlComps,
                    (document == null) ? urlLength : document.dc_title().length(),
                    wprop.count, condenser.words().size(), condenser.sentences().size(),
                    wprop.posInText, wprop.posInPhrase, wprop.numOfPhrase,
                    0, size, urlModified.getTime(), System.currentTimeMillis(),
                    language, doctype, outlinksSame, outlinksOther, wprop.flags);
            addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
            wordCount++;
        }

        return wordCount;
    }

    public boolean hasContainer(String wordHash) {
        if (dhtOutCache.hasContainer(wordHash)) return true;
        if (dhtInCache.hasContainer(wordHash)) return true;
        if (collections.hasContainer(wordHash)) return true;
        return false;
    }

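    /**
     * Collects all references for one word hash from both RAM caches and the
     * collection index, optionally restricted to a set of URL hashes. If the
     * merged result references the same URL more than once, only the most
     * recently modified entry is kept.
     *
     * @return the merged container, or null if the hash is malformed or unknown
     */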
    public indexContainer getContainer(String wordHash, Set<String> urlselection) {
        if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) {
            // wrong input
            return null;
        }

        // get from cache
        indexContainer container;
        synchronized (dhtOutCache) {
            container = dhtOutCache.getContainer(wordHash, urlselection);
        }
        synchronized (dhtInCache) {
            if (container == null) {
                container = dhtInCache.getContainer(wordHash, urlselection);
            } else {
                container.addAllUnique(dhtInCache.getContainer(wordHash, urlselection));
            }
        }

        // get from collection index
        synchronized (this) {
            if (container == null) {
                container = collections.getContainer(wordHash, urlselection);
            } else {
                container.addAllUnique(collections.getContainer(wordHash, urlselection));
            }
        }

        if (container == null) return null;

        // check doubles
        int beforeDouble = container.size();
        ArrayList<kelondroRowSet> d = container.removeDoubles();
        kelondroRowSet set;
        for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take the one that is the most recent
            set = d.get(i);
            indexRWIRowEntry e, elm = null;
            long lm = 0;
            for (int j = 0; j < set.size(); j++) {
                e = new indexRWIRowEntry(set.get(j));
                if ((elm == null) || (e.lastModified() > lm)) {
                    elm = e;
                    lm = e.lastModified();
                }
            }
            container.addUnique(elm.toKelondroEntry());
        }
        if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());

        return container;
    }

    public Map<String, indexContainer> getContainers(Set<String> wordHashes, Set<String> urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty) {
        // return a map of wordhash:indexContainer

        // retrieve entities that belong to the hashes
        HashMap<String, indexContainer> containers = new HashMap<String, indexContainer>();
        String singleHash;
        indexContainer singleContainer;
        Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {

            // get next word hash
            singleHash = i.next();

            // retrieve index
            singleContainer = getContainer(singleHash, urlselection);

            // check result
            if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, indexContainer>();

            containers.put(singleHash, singleContainer);
        }
        return containers;
    }

    @SuppressWarnings("unchecked")
    public Map<String, indexContainer>[] localSearchContainers(plasmaSearchQuery query, Set<String> urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result

        // retrieve entities that belong to the hashes
        Map<String, indexContainer> inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap<String, indexContainer>() : getContainers(query.queryHashes, urlselection, true, true);
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap<String, indexContainer>(); // prevent that only a subset is returned
        Map<String, indexContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, indexContainer>() : getContainers(query.excludeHashes, urlselection, true, true);
        return new Map[] { inclusionContainers, exclusionContainers };
    }

    public int size() {
        return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
    }

    public int indexSize(String wordHash) {
        int size = 0;
        size += dhtInCache.indexSize(wordHash);
        size += dhtOutCache.indexSize(wordHash);
        size += collections.indexSize(wordHash);
        return size;
    }

    public void close() {
        dhtInCache.close();
        dhtOutCache.close();
        synchronized (this) {
            collections.close();
            loadedURL.close();
        }
    }

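    /**
     * Removes all references for the given word hash from both RAM caches and
     * the collection index and returns them merged into one container.
     */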
    public indexContainer deleteContainer(String wordHash) {
        indexContainer c = new indexContainer(wordHash, indexRWIRowEntry.urlEntryRow,
                dhtInCache.sizeContainer(wordHash) + dhtOutCache.sizeContainer(wordHash) + collections.indexSize(wordHash));
        synchronized (dhtInCache) {
            c.addAllUnique(dhtInCache.deleteContainer(wordHash));
        }
        synchronized (dhtOutCache) {
            c.addAllUnique(dhtOutCache.deleteContainer(wordHash));
        }
        synchronized (this) {
            c.addAllUnique(collections.deleteContainer(wordHash));
        }
        return c;
    }

    public boolean removeEntry(String wordHash, String urlHash) {
        boolean removed = false;
        synchronized (dhtInCache) {
            removed = removed | (dhtInCache.removeEntry(wordHash, urlHash));
        }
        synchronized (dhtOutCache) {
            removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash));
        }
        synchronized (this) {
            removed = removed | (collections.removeEntry(wordHash, urlHash));
        }
        return removed;
    }

    public int removeEntryMultiple(Set<String> wordHashes, String urlHash) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
        Iterator<String> i = wordHashes.iterator();
        int count = 0;
        while (i.hasNext()) {
            if (removeEntry(i.next(), urlHash)) count++;
        }
        return count;
    }

    public int removeEntries(String wordHash, Set<String> urlHashes) {
        int removed = 0;
        synchronized (dhtInCache) {
            removed += dhtInCache.removeEntries(wordHash, urlHashes);
        }
        synchronized (dhtOutCache) {
            removed += dhtOutCache.removeEntries(wordHash, urlHashes);
        }
        synchronized (this) {
            removed += collections.removeEntries(wordHash, urlHashes);
        }
        return removed;
    }

    public String removeEntriesExpl(String wordHash, Set<String> urlHashes) {
        String removed = "";
        synchronized (dhtInCache) {
            removed += dhtInCache.removeEntries(wordHash, urlHashes) + ", ";
        }
        synchronized (dhtOutCache) {
            removed += dhtOutCache.removeEntries(wordHash, urlHashes) + ", ";
        }
        synchronized (this) {
            removed += collections.removeEntries(wordHash, urlHashes);
        }
        return removed;
    }

    public void removeEntriesMultiple(Set<String> wordHashes, Set<String> urlHashes) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
        Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {
            removeEntries(i.next(), urlHashes);
        }
    }

    public int removeWordReferences(Set<String> words, String urlhash) {
        // sequentially delete all word references
        // returns the number of deletions
        Iterator<String> iter = words.iterator();
        int count = 0;
        while (iter.hasNext()) {
            // delete the URL reference in this word index
            if (removeEntry(plasmaCondenser.word2hash(iter.next()), urlhash)) count++;
        }
        return count;
    }

    public int tryRemoveURLs(String urlHash) {
        // this tries to delete an index from the cache that has this
        // urlHash assigned. This can only work if the entry is really fresh
        // and can be found in the RAM cache
        // this returns the number of deletions that were possible
        int d = dhtInCache.tryRemoveURLs(urlHash);
        if (d > 0) return d;
        return dhtOutCache.tryRemoveURLs(urlHash);
    }

    public synchronized TreeSet<indexContainer> indexContainerSet(String startHash, boolean ram, boolean rot, int count) {
        // creates a set of indexContainers
        // this does not use the dhtInCache
        kelondroOrder<indexContainer> containerOrder = new indexContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startHash, 0));
        TreeSet<indexContainer> containers = new TreeSet<indexContainer>(containerOrder);
        Iterator<indexContainer> i = wordContainers(startHash, ram, rot);
        if (ram) count = Math.min(dhtOutCache.size(), count);
        indexContainer container;
        // this loop cannot terminate via the i.hasNext() predicate when rot == true,
        // because then the underlying iterator is a rotating iterator without termination;
        // in this case termination must be ensured with a counter,
        // and the counter must be decreased in every loop iteration
        while ((count > 0) && (i.hasNext())) {
            container = i.next();
            if ((container != null) && (container.size() > 0)) {
                containers.add(container);
            }
            count--; // decrease the counter even if the container was null or empty to ensure termination
        }
        return containers; // this may return fewer containers than requested
    }

    public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startHash, boolean ram, boolean rot) {
        kelondroCloneableIterator<indexContainer> i = wordContainers(startHash, ram);
        if (rot) {
            return new kelondroRotateIterator<indexContainer>(i, new String(kelondroBase64Order.zero(startHash.length())), dhtOutCache.size() + ((ram) ? 0 : collections.size()));
        }
        return i;
    }

    public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startWordHash, boolean ram) {
        kelondroOrder<indexContainer> containerOrder = new indexContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startWordHash, 0));
        if (ram) {
            return dhtOutCache.wordContainers(startWordHash, false);
        }
        return new kelondroMergeIterator<indexContainer>(
                dhtOutCache.wordContainers(startWordHash, false),
                collections.wordContainers(startWordHash, false),
                containerOrder,
                indexContainer.containerMergeMethod,
                true);
    }

    // The Cleaner class was provided as "UrldbCleaner" by Hydrox
    public synchronized Cleaner makeCleaner(plasmaCrawlLURL lurl, String startHash) {
        return new Cleaner(lurl, startHash);
    }

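    // The Cleaner thread iterates over all word containers, looks up every
    // referenced URL in the loaded-URL database and collects the hashes of
    // references whose URL is missing or blacklisted; these stale references
    // are then removed from the index word by word.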
    public class Cleaner extends Thread {

        private String startHash;
        private boolean run = true;
        private boolean pause = false;
        public int rwiCountAtStart = 0;
        public String wordHashNow = "";
        public String lastWordHash = "";
        public int lastDeletionCounter = 0;
        private plasmaCrawlLURL lurl;

        public Cleaner(plasmaCrawlLURL lurl, String startHash) {
            this.lurl = lurl;
            this.startHash = startHash;
            this.rwiCountAtStart = size();
        }

        public void run() {
            serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
            indexContainer container = null;
            indexRWIRowEntry entry = null;
            yacyURL url = null;
            HashSet<String> urlHashs = new HashSet<String>();
            Iterator<indexContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
            while (indexContainerIterator.hasNext() && run) {
                waiter();
                container = indexContainerIterator.next();
                Iterator<indexRWIRowEntry> containerIterator = container.entries();
                wordHashNow = container.getWordHash();
                while (containerIterator.hasNext() && run) {
                    waiter();
                    entry = containerIterator.next();
                    // System.out.println("Wordhash: " + container.getWordHash() + " UrlHash: " + entry.urlHash());
                    indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);
                    if (ue == null) {
                        urlHashs.add(entry.urlHash());
                    } else {
                        url = ue.comp().url();
                        if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url))) {
                            urlHashs.add(entry.urlHash());
                        }
                    }
                }
                if (urlHashs.size() > 0) {
                    int removed = removeEntries(container.getWordHash(), urlHashs);
                    serverLog.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
                    lastWordHash = container.getWordHash();
                    lastDeletionCounter = urlHashs.size();
                    urlHashs.clear();
                }
                if (!containerIterator.hasNext()) {
                    // We may not be finished yet, try to get the next chunk of wordHashes
                    TreeSet<indexContainer> containers = indexContainerSet(container.getWordHash(), false, false, 100);
                    indexContainerIterator = containers.iterator();
                    // Make sure we don't get the same wordhash twice, but don't skip a word
                    if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
                        indexContainerIterator = containers.iterator();
                    }
                }
            }
            serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
        }

        public void abort() {
            synchronized (this) {
                run = false;
                this.notifyAll();
            }
        }

        public void pause() {
            synchronized (this) {
                if (!pause) {
                    pause = true;
                    serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
                }
            }
        }

        public void endPause() {
            synchronized (this) {
                if (pause) {
                    pause = false;
                    this.notifyAll();
                    serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
                }
            }
        }

        public void waiter() {
            synchronized (this) {
                if (this.pause) {
                    try {
                        this.wait();
                    } catch (InterruptedException e) {
                        this.run = false;
                        return;
                    }
                }
            }
        }
    }

}