001: // indexRAMRI.java
002: // (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 2005 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2008-02-02 23:53:39 +0000 (Sa, 02 Feb 2008) $
008: // $LastChangedRevision: 4430 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.index;
028:
029: import java.io.File;
030: import java.io.IOException;
031: import java.util.Collections;
032: import java.util.Iterator;
033: import java.util.Map;
034: import java.util.Set;
035: import java.util.SortedMap;
036: import java.util.TreeMap;
037:
038: import de.anomic.kelondro.kelondroBase64Order;
039: import de.anomic.kelondro.kelondroBufferedRA;
040: import de.anomic.kelondro.kelondroCloneableIterator;
041: import de.anomic.kelondro.kelondroException;
042: import de.anomic.kelondro.kelondroFixedWidthArray;
043: import de.anomic.kelondro.kelondroMScoreCluster;
044: import de.anomic.kelondro.kelondroNaturalOrder;
045: import de.anomic.kelondro.kelondroRow;
046: import de.anomic.kelondro.kelondroRow.EntryIndex;
047: import de.anomic.server.serverByteBuffer;
048: import de.anomic.server.serverFileUtils;
049: import de.anomic.server.serverMemory;
050: import de.anomic.server.logging.serverLog;
051: import de.anomic.yacy.yacySeedDB;
052:
// RAM-resident reverse word index: maps word hashes to indexContainers and
// keeps per-word scores so the best flush candidate can be selected quickly.
public final class indexRAMRI implements indexRI {

    // class variables
    private final File databaseRoot; // base directory that holds the dump file
    protected final SortedMap<String, indexContainer> cache; // wordhash-container
    private final kelondroMScoreCluster<String> hashScore; // wordhash -> reference count (used by bestFlushWordHash)
    private final kelondroMScoreCluster<String> hashDate; // wordhash -> last update time, encoded via intTime()
    private long initTime; // construction time; base offset for the intTime/longEmit encoding
    private int cacheMaxCount; // maximum number of words to hold in cache
    public int cacheReferenceCountLimit; // words with at least this many references are flushed first
    public long cacheReferenceAgeLimit; // entries older than this (milliseconds) are flushed
    private final serverLog log;
    private String indexArrayFileName; // file name of the session dump inside databaseRoot
    private kelondroRow payloadrow; // row layout of the stored index entries
    private kelondroRow bufferStructureBasis; // row layout of the dump file (see constructor)
068:
069: public indexRAMRI(File databaseRoot, kelondroRow payloadrow,
070: int wCacheReferenceCountLimitInit,
071: long wCacheReferenceAgeLimitInit, String dumpname,
072: serverLog log) {
073:
074: // creates a new index cache
075: // the cache has a back-end where indexes that do not fit in the cache are flushed
076: this .databaseRoot = databaseRoot;
077: this .cache = Collections
078: .synchronizedSortedMap(new TreeMap<String, indexContainer>());
079: this .hashScore = new kelondroMScoreCluster<String>();
080: this .hashDate = new kelondroMScoreCluster<String>();
081: this .initTime = System.currentTimeMillis();
082: this .cacheMaxCount = 10000;
083: this .cacheReferenceCountLimit = wCacheReferenceCountLimitInit;
084: this .cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit;
085: this .log = log;
086: this .indexArrayFileName = dumpname;
087: this .payloadrow = payloadrow;
088: this .bufferStructureBasis = new kelondroRow("byte[] wordhash-"
089: + yacySeedDB.commonHashLength + ", "
090: + "Cardinal occ-4 {b256}, "
091: + "Cardinal time-8 {b256}, " + "byte[] urlprops-"
092: + payloadrow.objectsize,
093: kelondroBase64Order.enhancedCoder, 0);
094:
095: // read in dump of last session
096: try {
097: restore();
098: } catch (IOException e) {
099: log.logSevere("unable to restore cache dump: "
100: + e.getMessage(), e);
101: }
102: }
103:
104: public int minMem() {
105: // there is no specific large array that needs to be maintained
106: // this value is just a guess of the possible overhead
107: return 100 * 1024; // 100 kb
108: }
109:
110: public synchronized long getUpdateTime(String wordHash) {
111: indexContainer entries = getContainer(wordHash, null);
112: if (entries == null)
113: return 0;
114: return entries.updated();
115: }
116:
    /**
     * Writes the complete in-memory cache to the dump file so it can be
     * restored by restore() in the next session. Entries are removed from the
     * cache while they are written, so the cache is empty afterwards.
     *
     * @throws IOException if the dump file cannot be created or written
     */
    private void dump() throws IOException {
        log.logConfig("creating dump for index cache '"
                + indexArrayFileName + "', " + cache.size()
                + " words (and much more urls)");
        File indexDumpFile = new File(databaseRoot, indexArrayFileName);
        // always start with a fresh dump file
        if (indexDumpFile.exists())
            indexDumpFile.delete();
        kelondroFixedWidthArray dumpArray = null;
        kelondroBufferedRA writeBuffer = null;
        // NOTE(review): the memory-enhanced write path is disabled by the
        // hard-coded 'false'; the commented expression shows the intended condition
        if (false /*serverMemory.available() > 50 * bufferStructureBasis.objectsize() * cache.size()*/) {
            writeBuffer = new kelondroBufferedRA();
            dumpArray = new kelondroFixedWidthArray(writeBuffer,
                    indexDumpFile.getCanonicalPath(),
                    bufferStructureBasis, 0);
            log.logInfo("started dump of ram cache: " + cache.size()
                    + " words; memory-enhanced write");
        } else {
            dumpArray = new kelondroFixedWidthArray(indexDumpFile,
                    bufferStructureBasis, 0);
            log.logInfo("started dump of ram cache: " + cache.size()
                    + " words; low-memory write");
        }
        long startTime = System.currentTimeMillis();
        long messageTime = System.currentTimeMillis() + 5000; // next progress-log deadline
        long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
        Map.Entry<String, indexContainer> entry;
        String wordHash;
        indexContainer container;
        long updateTime;
        indexRWIEntry iEntry;
        kelondroRow.Entry row = dumpArray.row().newEntry();
        byte[] occ, time;

        // write wCache
        synchronized (cache) {
            Iterator<Map.Entry<String, indexContainer>> i = cache
                    .entrySet().iterator();
            while (i.hasNext()) {
                // get entries
                entry = i.next();
                wordHash = entry.getKey();
                updateTime = getUpdateTime(wordHash);
                container = entry.getValue();

                // put entries on stack
                if (container != null) {
                    Iterator<indexRWIRowEntry> ci = container.entries();
                    // occurrence count and update time are repeated in every row of this word
                    occ = kelondroNaturalOrder.encodeLong(container
                            .size(), 4);
                    time = kelondroNaturalOrder.encodeLong(updateTime,
                            8);
                    while (ci.hasNext()) {
                        iEntry = ci.next();
                        row.setCol(0, wordHash.getBytes());
                        row.setCol(1, occ);
                        row.setCol(2, time);
                        row.setCol(3, iEntry.toKelondroEntry().bytes());
                        dumpArray.set((int) urlcount++, row);
                    }
                }
                wordcount++;
                i.remove(); // free some mem

                // write a log
                if (System.currentTimeMillis() > messageTime) {
                    serverMemory.gc(1000,
                            "indexRAMRI, for better statistic-1"); // for better statistic - thq
                    wordsPerSecond = wordcount
                            * 1000
                            / (1 + System.currentTimeMillis() - startTime);
                    log
                            .logInfo("dump status: "
                                    + wordcount
                                    + " words done, "
                                    + (cache.size() / (wordsPerSecond + 1))
                                    + " seconds remaining, free mem = "
                                    + (Runtime.getRuntime()
                                            .freeMemory() / 1024 / 1024)
                                    + "MB");
                    messageTime = System.currentTimeMillis() + 5000;
                }
            }
        }
        // the buffered path (currently disabled above) writes its bytes out at the end
        if (writeBuffer != null) {
            serverByteBuffer bb = writeBuffer.getBuffer();
            //System.out.println("*** byteBuffer size = " + bb.length());
            serverFileUtils.write(bb.getBytes(), indexDumpFile);
            writeBuffer.close();
        }
        dumpArray.close();
        dumpArray = null;
        log.logInfo("finished dump of ram cache: " + urlcount
                + " word/URL relations in "
                + ((System.currentTimeMillis() - startTime) / 1000)
                + " seconds");
    }
213:
    /**
     * Restores a cache dump written by dump() back into the RAM cache.
     *
     * @return the number of word/URL relations that were restored
     * @throws IOException if the dump file cannot be opened or read
     */
    private long restore() throws IOException {
        File indexDumpFile = new File(databaseRoot, indexArrayFileName);
        // nothing to restore if no dump exists from a previous session
        if (!(indexDumpFile.exists()))
            return 0;
        kelondroFixedWidthArray dumpArray;
        kelondroBufferedRA readBuffer = null;
        // NOTE(review): the memory-enhanced read path is disabled by the
        // hard-coded 'false'; the commented expression shows the intended condition
        if (false /*serverMemory.available() > indexDumpFile.length() * 2*/) {
            readBuffer = new kelondroBufferedRA(new serverByteBuffer(
                    serverFileUtils.read(indexDumpFile)));
            dumpArray = new kelondroFixedWidthArray(readBuffer,
                    indexDumpFile.getCanonicalPath(),
                    bufferStructureBasis, 0);
            log.logInfo("started restore of ram cache '"
                    + indexArrayFileName + "', " + dumpArray.size()
                    + " word/URL relations; memory-enhanced read");
        } else {
            dumpArray = new kelondroFixedWidthArray(indexDumpFile,
                    bufferStructureBasis, 0);
            log.logInfo("started restore of ram cache '"
                    + indexArrayFileName + "', " + dumpArray.size()
                    + " word/URL relations; low-memory read");
        }
        long startTime = System.currentTimeMillis();
        long messageTime = System.currentTimeMillis() + 5000; // next progress-log deadline
        long urlCount = 0, urlsPerSecond = 0;
        try {
            synchronized (cache) {
                Iterator<EntryIndex> i = dumpArray.contentRows(-1);
                String wordHash;
                //long creationTime;
                indexRWIEntry wordEntry;
                kelondroRow.EntryIndex row;
                //Runtime rt = Runtime.getRuntime();
                while (i.hasNext()) {
                    // get out one entry
                    row = i.next();
                    // skip damaged rows (missing word hash or payload column)
                    if ((row == null) || (row.empty(0))
                            || (row.empty(3)))
                        continue;
                    wordHash = row.getColString(0, "UTF-8");
                    //creationTime = kelondroRecords.bytes2long(row[2]);
                    wordEntry = new indexRWIRowEntry(row.getColBytes(3));
                    // store to cache; note: startTime is used as update time,
                    // the dumped per-word time stamp (column 2) is not restored
                    addEntry(wordHash, wordEntry, startTime, false);
                    urlCount++;
                    // protect against memory shortage
                    //while (rt.freeMemory() < 1000000) {flushFromMem(); java.lang.System.gc();}
                    // write a log
                    if (System.currentTimeMillis() > messageTime) {
                        serverMemory.gc(1000,
                                "indexRAMRI, for better statistic-2"); // for better statistic - thq
                        urlsPerSecond = 1
                                + urlCount
                                * 1000
                                / (1 + System.currentTimeMillis() - startTime);
                        log
                                .logInfo("restoring status: "
                                        + urlCount
                                        + " urls done, "
                                        + ((dumpArray.size() - urlCount) / urlsPerSecond)
                                        + " seconds remaining, free mem = "
                                        + (Runtime.getRuntime()
                                                .freeMemory() / 1024 / 1024)
                                        + "MB");
                        messageTime = System.currentTimeMillis() + 5000;
                    }
                }
            }
            // NOTE(review): readBuffer is not closed if a kelondroException is
            // thrown above; consider moving this close into the finally block
            if (readBuffer != null)
                readBuffer.close();
            dumpArray.close();
            dumpArray = null;
            log.logConfig("finished restore: " + cache.size()
                    + " words in "
                    + ((System.currentTimeMillis() - startTime) / 1000)
                    + " seconds");
        } catch (kelondroException e) {
            // restore failed
            log.logSevere("failed restore of indexCache array dump: "
                    + e.getMessage(), e);
        } finally {
            // ensure the array is closed even when restore fails mid-way
            if (dumpArray != null)
                try {
                    dumpArray.close();
                } catch (Exception e) {
                }
        }
        return urlCount;
    }
303:
304: // cache settings
305:
306: public int maxURLinCache() {
307: if (hashScore.size() == 0)
308: return 0;
309: return hashScore.getMaxScore();
310: }
311:
312: public long minAgeOfCache() {
313: if (hashDate.size() == 0)
314: return 0;
315: return System.currentTimeMillis()
316: - longEmit(hashDate.getMaxScore());
317: }
318:
319: public long maxAgeOfCache() {
320: if (hashDate.size() == 0)
321: return 0;
322: return System.currentTimeMillis()
323: - longEmit(hashDate.getMinScore());
324: }
325:
326: public void setMaxWordCount(int maxWords) {
327: this .cacheMaxCount = maxWords;
328: }
329:
330: public int getMaxWordCount() {
331: return this .cacheMaxCount;
332: }
333:
334: public int size() {
335: return cache.size();
336: }
337:
338: public synchronized int indexSize(String wordHash) {
339: indexContainer cacheIndex = (indexContainer) cache
340: .get(wordHash);
341: if (cacheIndex == null)
342: return 0;
343: return cacheIndex.size();
344: }
345:
346: public synchronized kelondroCloneableIterator<indexContainer> wordContainers(
347: String startWordHash, boolean rot) {
348: // we return an iterator object that creates top-level-clones of the indexContainers
349: // in the cache, so that manipulations of the iterated objects do not change
350: // objects in the cache.
351: return new wordContainerIterator(startWordHash, rot);
352: }
353:
354: public class wordContainerIterator implements
355: kelondroCloneableIterator<indexContainer> {
356:
357: // this class exists, because the wCache cannot be iterated with rotation
358: // and because every indexContainer Object that is iterated must be returned as top-level-clone
359: // so this class simulates wCache.tailMap(startWordHash).values().iterator()
360: // plus the mentioned features
361:
362: private boolean rot;
363: private Iterator<indexContainer> iterator;
364:
365: public wordContainerIterator(String startWordHash, boolean rot) {
366: this .rot = rot;
367: this .iterator = (startWordHash == null) ? cache.values()
368: .iterator() : cache.tailMap(startWordHash).values()
369: .iterator();
370: // The collection's iterator will return the values in the order that their corresponding keys appear in the tree.
371: }
372:
373: public wordContainerIterator clone(Object secondWordHash) {
374: return new wordContainerIterator((String) secondWordHash,
375: rot);
376: }
377:
378: public boolean hasNext() {
379: if (rot)
380: return true;
381: return iterator.hasNext();
382: }
383:
384: public indexContainer next() {
385: if (iterator.hasNext()) {
386: return ((indexContainer) iterator.next())
387: .topLevelClone();
388: } else {
389: // rotation iteration
390: if (rot) {
391: iterator = cache.values().iterator();
392: return ((indexContainer) iterator.next())
393: .topLevelClone();
394: } else {
395: return null;
396: }
397: }
398: }
399:
400: public void remove() {
401: iterator.remove();
402: }
403:
404: }
405:
406: public synchronized String maxScoreWordHash() {
407: if (cache.size() == 0)
408: return null;
409: try {
410: return (String) hashScore.getMaxObject();
411: } catch (Exception e) {
412: log.logSevere("flushFromMem: " + e.getMessage(), e);
413: }
414: return null;
415: }
416:
    /**
     * Selects the word hash whose container should be flushed from the cache
     * next. The policy, in priority order:
     * 1. a word whose reference count reaches cacheReferenceCountLimit
     * 2. a word whose last update is older than cacheReferenceAgeLimit
     * 3. under memory pressure the highest-scored word (frees the most RAM),
     *    otherwise the oldest word (cleans up stale slots)
     *
     * @return the selected word hash, or null when the cache is empty or on error
     */
    public synchronized String bestFlushWordHash() {
        // select appropriate hash
        // we have 2 different methods to find a good hash:
        // - the oldest entry in the cache
        // - the entry with maximum count
        if (cache.size() == 0)
            return null;
        try {
            String hash = null;
            int count = hashScore.getMaxScore();
            if ((count >= cacheReferenceCountLimit)
                    && ((hash = (String) hashScore.getMaxObject()) != null)) {
                // we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails
                // in this cache we MUST NOT check wCacheMinAge
                return hash;
            }
            // minimum encoded score == oldest update time
            long oldestTime = longEmit(hashDate.getMinScore());
            if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit)
                    && ((hash = (String) hashDate.getMinObject()) != null)) {
                // flush out-dated entries
                return hash;
            }
            // cases with respect to memory situation
            if (Runtime.getRuntime().freeMemory() < 100000) {
                // urgent low-memory case
                hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM)
            } else {
                // not-efficient-so-far case. cleans up unnecessary cache slots
                hash = (String) hashDate.getMinObject(); // flush oldest entries
            }
            return hash;
        } catch (Exception e) {
            log.logSevere("flushFromMem: " + e.getMessage(), e);
        }
        return null;
    }
453:
454: private int intTime(long longTime) {
455: return (int) Math.max(0, ((longTime - initTime) / 1000));
456: }
457:
458: private long longEmit(int intTime) {
459: return (((long) intTime) * (long) 1000) + initTime;
460: }
461:
462: public boolean hasContainer(String wordHash) {
463: return cache.containsKey(wordHash);
464: }
465:
466: public int sizeContainer(String wordHash) {
467: indexContainer c = (indexContainer) cache.get(wordHash);
468: return (c == null) ? 0 : c.size();
469: }
470:
471: public synchronized indexContainer getContainer(String wordHash,
472: Set<String> urlselection) {
473: if (wordHash == null)
474: return null;
475:
476: // retrieve container
477: indexContainer container = (indexContainer) cache.get(wordHash);
478:
479: // We must not use the container from cache to store everything we find,
480: // as that container remains linked to in the cache and might be changed later
481: // while the returned container is still in use.
482: // create a clone from the container
483: if (container != null)
484: container = container.topLevelClone();
485:
486: // select the urlselection
487: if ((urlselection != null) && (container != null))
488: container.select(urlselection);
489:
490: return container;
491: }
492:
493: public synchronized indexContainer deleteContainer(String wordHash) {
494: // returns the index that had been deleted
495: indexContainer container = (indexContainer) cache
496: .remove(wordHash);
497: hashScore.deleteScore(wordHash);
498: hashDate.deleteScore(wordHash);
499: return container;
500: }
501:
502: public synchronized boolean removeEntry(String wordHash,
503: String urlHash) {
504: indexContainer c = (indexContainer) cache.get(wordHash);
505: if ((c != null) && (c.remove(urlHash) != null)) {
506: // removal successful
507: if (c.size() == 0) {
508: deleteContainer(wordHash);
509: } else {
510: cache.put(wordHash, c);
511: hashScore.decScore(wordHash);
512: hashDate.setScore(wordHash, intTime(System
513: .currentTimeMillis()));
514: }
515: return true;
516: }
517: return false;
518: }
519:
520: public synchronized int removeEntries(String wordHash,
521: Set<String> urlHashes) {
522: if (urlHashes.size() == 0)
523: return 0;
524: indexContainer c = (indexContainer) cache.get(wordHash);
525: int count;
526: if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
527: // removal successful
528: if (c.size() == 0) {
529: deleteContainer(wordHash);
530: } else {
531: cache.put(wordHash, c);
532: hashScore.setScore(wordHash, c.size());
533: hashDate.setScore(wordHash, intTime(System
534: .currentTimeMillis()));
535: }
536: return count;
537: }
538: return 0;
539: }
540:
541: public synchronized int tryRemoveURLs(String urlHash) {
542: // this tries to delete an index from the cache that has this
543: // urlHash assigned. This can only work if the entry is really fresh
544: // Such entries must be searched in the latest entries
545: int delCount = 0;
546: Iterator<Map.Entry<String, indexContainer>> i = cache
547: .entrySet().iterator();
548: Map.Entry<String, indexContainer> entry;
549: String wordhash;
550: indexContainer c;
551: while (i.hasNext()) {
552: entry = i.next();
553: wordhash = entry.getKey();
554:
555: // get container
556: c = entry.getValue();
557: if (c.remove(urlHash) != null) {
558: if (c.size() == 0) {
559: i.remove();
560: } else {
561: cache.put(wordhash, c); // superfluous?
562: }
563: delCount++;
564: }
565: }
566: return delCount;
567: }
568:
569: public synchronized void addEntries(indexContainer container) {
570: // this puts the entries into the cache, not into the assortment directly
571: int added = 0;
572: if ((container == null) || (container.size() == 0))
573: return;
574:
575: // put new words into cache
576: String wordHash = container.getWordHash();
577: indexContainer entries = (indexContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
578: if (entries == null) {
579: entries = container.topLevelClone();
580: added = entries.size();
581: } else {
582: added = entries.putAllRecent(container);
583: }
584: if (added > 0) {
585: cache.put(wordHash, entries);
586: hashScore.addScore(wordHash, added);
587: hashDate.setScore(wordHash, intTime(System
588: .currentTimeMillis()));
589: }
590: entries = null;
591: }
592:
593: public synchronized void addEntry(String wordHash,
594: indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
595: indexContainer container = (indexContainer) cache.get(wordHash);
596: if (container == null)
597: container = new indexContainer(wordHash, this .payloadrow, 1);
598: container.put(newEntry);
599: cache.put(wordHash, container);
600: hashScore.incScore(wordHash);
601: hashDate.setScore(wordHash, intTime(updateTime));
602: }
603:
604: public synchronized void close() {
605: // dump cache
606: try {
607: dump();
608: } catch (IOException e) {
609: log.logSevere("unable to dump cache: " + e.getMessage(), e);
610: }
611: }
612: }
|