// indexContainer.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 04.07.2006 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.index;

import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverByteBuffer;

public class indexContainer extends kelondroRowSet {

    private String wordHash;

    public indexContainer(String wordHash, kelondroRowSet collection) {
        super(collection);
        this.wordHash = wordHash;
    }

    public indexContainer(String wordHash, kelondroRow rowdef, int objectCount) {
        super(rowdef, objectCount);
        this.wordHash = wordHash;
        this.lastTimeWrote = 0;
    }

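    /**
     * Create a working copy of this container: a new container with the same
     * word hash and row definition, filled with all entries of this container.
     */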
    public indexContainer topLevelClone() {
        indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.size());
        newContainer.addAllUnique(this);
        return newContainer;
    }

    public void setWordHash(String newWordHash) {
        this.wordHash = newWordHash;
    }

    public long updated() {
        return super.lastWrote();
    }

    public String getWordHash() {
        return wordHash;
    }

    public void add(indexRWIEntry entry) {
        // add without double-occurrence test
        assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
        this.addUnique(entry.toKelondroEntry());
    }

    public void add(indexRWIEntry entry, long updateTime) {
        // add without double-occurrence test
        assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
        this.add(entry);
        this.lastTimeWrote = updateTime;
    }

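    /**
     * Merge two containers without double-occurrence tests. A clone is preferred
     * as the merge target because it may be modified destructively; when both or
     * neither are clones, the smaller container is merged into the larger one.
     */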
    public static final indexContainer mergeUnique(indexContainer a, boolean aIsClone, indexContainer b, boolean bIsClone) {
        if ((aIsClone) && (bIsClone)) {
            if (a.size() > b.size())
                return (indexContainer) mergeUnique(a, b);
            else
                return (indexContainer) mergeUnique(b, a);
        }
        if (aIsClone)
            return (indexContainer) mergeUnique(a, b);
        if (bIsClone)
            return (indexContainer) mergeUnique(b, a);
        if (a.size() > b.size())
            return (indexContainer) mergeUnique(a, b);
        else
            return (indexContainer) mergeUnique(b, a);
    }

    public static Object mergeUnique(Object a, Object b) {
        indexContainer c = (indexContainer) a;
        c.addAllUnique((indexContainer) b);
        return c;
    }

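    /**
     * Insert an entry with a double-occurrence test. If an entry with the same
     * URL hash was already present, the replaced entry is returned, otherwise null.
     */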
    public indexRWIEntry put(indexRWIEntry entry) {
        assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
        kelondroRow.Entry r = super.put(entry.toKelondroEntry());
        if (r == null)
            return null;
        return new indexRWIRowEntry(r);
    }

    public boolean putRecent(indexRWIEntry entry) {
        assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
        // returns true if the entry is now stored in this container,
        // false if a more recent entry with the same URL hash was already present and was kept
        kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry());
        if (oldEntryRow == null) {
            return true;
        } else {
            indexRWIEntry oldEntry = new indexRWIRowEntry(oldEntryRow);
            if (entry.isOlder(oldEntry)) { // a more recent entry is already in this container
                this.put(oldEntry.toKelondroEntry()); // put it back
                return false;
            } else {
                return true;
            }
        }
    }

    public int putAllRecent(indexContainer c) {
        // adds all entries in c and checks every entry for double-occurrence
        // returns the number of new elements
        if (c == null)
            return 0;
        int x = 0;
        synchronized (c) {
            Iterator<indexRWIRowEntry> i = c.entries();
            while (i.hasNext()) {
                try {
                    if (putRecent((indexRWIEntry) i.next()))
                        x++;
                } catch (ConcurrentModificationException e) {
                    e.printStackTrace();
                }
            }
        }
        this.lastTimeWrote = java.lang.Math.max(this.lastTimeWrote, c.updated());
        return x;
    }

    public indexRWIEntry get(String urlHash) {
        kelondroRow.Entry entry = this.get(urlHash.getBytes());
        if (entry == null)
            return null;
        return new indexRWIRowEntry(entry);
    }

    public indexRWIEntry remove(String urlHash) {
        kelondroRow.Entry entry = remove(urlHash.getBytes(), true);
        if (entry == null)
            return null;
        return new indexRWIRowEntry(entry);
    }

    public int removeEntries(Set<String> urlHashes) {
        int count = 0;
        Iterator<String> i = urlHashes.iterator();
        while (i.hasNext())
            count += (remove(i.next()) == null) ? 0 : 1;
        return count;
    }

    public Iterator<indexRWIRowEntry> entries() {
        // returns an iterator of indexRWIEntry objects
        return new entryIterator();
    }

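    /**
     * Iterator over the rows of this container, wrapping each raw
     * kelondroRow.Entry into an indexRWIRowEntry.
     */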
    public class entryIterator implements Iterator<indexRWIRowEntry> {

        Iterator<kelondroRow.Entry> rowEntryIterator;

        public entryIterator() {
            rowEntryIterator = rows();
        }

        public boolean hasNext() {
            return rowEntryIterator.hasNext();
        }

        public indexRWIRowEntry next() {
            kelondroRow.Entry rentry = rowEntryIterator.next();
            if (rentry == null)
                return null;
            return new indexRWIRowEntry(rentry);
        }

        public void remove() {
            rowEntryIterator.remove();
        }

    }

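    // Reflection handle on mergeUnique(Object, Object); presumably kept so that
    // generic merge utilities can invoke the merge without a compile-time
    // dependency on this class (an assumption, based on the use of reflection here).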
    public static Method containerMergeMethod = null;
    static {
        try {
            Class<?> c = Class.forName("de.anomic.index.indexContainer");
            containerMergeMethod = c.getMethod("mergeUnique", new Class[] { Object.class, Object.class });
        } catch (SecurityException e) {
            System.out.println("Error while initializing containerMerge.SecurityException: " + e.getMessage());
            containerMergeMethod = null;
        } catch (ClassNotFoundException e) {
            System.out.println("Error while initializing containerMerge.ClassNotFoundException: " + e.getMessage());
            containerMergeMethod = null;
        } catch (NoSuchMethodException e) {
            System.out.println("Error while initializing containerMerge.NoSuchMethodException: " + e.getMessage());
            containerMergeMethod = null;
        }
    }

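    /**
     * Conjunction of all include containers, minus all entries that appear in any
     * of the exclude containers. Returns an empty container if the include set is
     * null or if any of the included words is not known.
     */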
    public static indexContainer joinExcludeContainers(
            Collection<indexContainer> includeContainers,
            Collection<indexContainer> excludeContainers,
            int maxDistance) {
        // join a search result; since this is a conjunction, we return an empty
        // container if any of the included words is not known
        if (includeContainers == null)
            return plasmaWordIndex.emptyContainer(null, 0);

        // join the result
        indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
        if (rcLocal == null)
            return plasmaWordIndex.emptyContainer(null, 0);

        // then remove the exclude containers from the joined result
        excludeContainers(rcLocal, excludeContainers);

        return rcLocal;
    }

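    /**
     * Pairwise conjunction of all given containers. The containers are ordered by
     * size and joined smallest-first, so intermediate results stay as small as
     * possible. Returns null if any container is empty or unknown.
     */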
    public static indexContainer joinContainers(Collection<indexContainer> containers, int maxDistance) {

        // order entities by their size
        TreeMap<Long, indexContainer> map = new TreeMap<Long, indexContainer>();
        indexContainer singleContainer;
        Iterator<indexContainer> i = containers.iterator();
        int count = 0;
        while (i.hasNext()) {
            // get next entity:
            singleContainer = (indexContainer) i.next();

            // check result
            if ((singleContainer == null) || (singleContainer.size() == 0))
                return null; // as this is a conjunction of searches, we have no result if any word is not known

            // store result in order of result size
            map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
            count++;
        }

        // check if there is any result
        if (map.size() == 0)
            return null; // no result, nothing found

        // the map now holds the search results in order of number of hits per word
        // we now must pairwise build up a conjunction of these sets
        Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
        indexContainer searchA, searchB, searchResult = (indexContainer) map.remove(k);
        while ((map.size() > 0) && (searchResult.size() > 0)) {
            // take the first element of map which is a result and combine it with result
            k = (Long) map.firstKey(); // the next smallest...
            searchA = searchResult;
            searchB = (indexContainer) map.remove(k);
            searchResult = indexContainer.joinConstructive(searchA, searchB, maxDistance);
            // free resources
            searchA = null;
            searchB = null;
        }

        // 'searchResult' now holds the combined search result
        if (searchResult.size() == 0)
            return null;
        return searchResult;
    }

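    /**
     * Remove from the pivot all entries that appear in any of the given containers.
     * The pivot is modified destructively; returns null if nothing is left.
     */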
    public static indexContainer excludeContainers(indexContainer pivot, Collection<indexContainer> containers) {

        // check if there is any result
        if ((containers == null) || (containers.size() == 0))
            return pivot; // no result, nothing found

        Iterator<indexContainer> i = containers.iterator();
        while (i.hasNext()) {
            pivot = excludeDestructive(pivot, (indexContainer) i.next());
            if ((pivot == null) || (pivot.size() == 0))
                return null;
        }

        return pivot;
    }

    // join methods

    // number of bits needed to represent x (~ log2(x) + 1); used below to estimate join costs
    private static int log2(int x) {
        int l = 0;
        while (x > 0) {
            x = x >> 1;
            l++;
        }
        return l;
    }

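    /**
     * Conjunction of two containers. Chooses between two strategies based on an
     * estimated cost: a parallel enumeration of both (sorted) containers, roughly
     * proportional to high + low, or a lookup test of each entry of the smaller
     * container against the larger one, roughly proportional to low * log2(high).
     */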
    public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, int maxDistance) {
        if ((i1 == null) || (i2 == null))
            return null;
        if ((i1.size() == 0) || (i2.size() == 0))
            return null;

        // decide which method to use
        int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
        int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
        int stepsEnum = 10 * (high + low - 1);
        int stepsTest = 12 * log2(high) * low;

        // start most efficient method
        if (stepsEnum > stepsTest) {
            if (i1.size() < i2.size())
                return joinConstructiveByTest(i1, i2, maxDistance);
            else
                return joinConstructiveByTest(i2, i1, maxDistance);
        } else {
            return joinConstructiveByEnumeration(i1, i2, maxDistance);
        }
    }

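    // join by lookup: iterate over the smaller container and probe the larger one for each URL hash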
    private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, int maxDistance) {
        System.out.println("DEBUG: JOIN METHOD BY TEST");
        assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
        int keylength = small.rowdef.width(0);
        assert (keylength == large.rowdef.width(0));
        indexContainer conj = new indexContainer(null, small.rowdef, 0); // start with empty search result
        Iterator<indexRWIRowEntry> se = small.entries();
        indexRWIEntry ie0, ie1;
        while (se.hasNext()) {
            ie0 = (indexRWIEntry) se.next();
            ie1 = large.get(ie0.urlHash());
            if (ie1 != null) { // the URL hash occurs in both containers
                assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
                assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
                // this is a hit. Calculate word distance:
                ie0.join(ie1);
                if (ie0.worddistance() <= maxDistance)
                    conj.add(ie0);
            }
        }
        return conj;
    }

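    // join by enumeration: walk both containers in parallel in their common ordering (merge join)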
    private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, int maxDistance) {
        System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
        assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString();
        int keylength = i1.rowdef.width(0);
        assert (keylength == i2.rowdef.width(0));
        indexContainer conj = new indexContainer(null, i1.rowdef, 0); // start with empty search result
        if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())) &&
              (i1.rowdef.primaryKeyIndex == i2.rowdef.primaryKeyIndex)))
            return conj; // ordering must be equal
        Iterator<indexRWIRowEntry> e1 = i1.entries();
        Iterator<indexRWIRowEntry> e2 = i2.entries();
        int c;
        if ((e1.hasNext()) && (e2.hasNext())) {
            indexRWIEntry ie1;
            indexRWIEntry ie2;
            ie1 = (indexRWIEntry) e1.next();
            ie2 = (indexRWIEntry) e2.next();

            while (true) {
                assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
                assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
                c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
                //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')=" + c);
                if (c < 0) {
                    if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
                } else if (c > 0) {
                    if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
                } else {
                    // we have found the same url in both searches!
                    ie1.join(ie2);
                    if (ie1.worddistance() <= maxDistance)
                        conj.add(ie1);
                    if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
                    if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
                }
            }
        }
        return conj;
    }

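    /**
     * Remove from the pivot all entries whose URL hash also appears in excl.
     * The pivot is modified in place; the strategy with the lower estimated cost is chosen.
     */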
    public static indexContainer excludeDestructive(indexContainer pivot, indexContainer excl) {
        if (pivot == null)
            return null;
        if (excl == null)
            return pivot;
        if (pivot.size() == 0)
            return null;
        if (excl.size() == 0)
            return pivot;

        // decide which method to use
        int high = ((pivot.size() > excl.size()) ? pivot.size() : excl.size());
        int low = ((pivot.size() > excl.size()) ? excl.size() : pivot.size());
        int stepsEnum = 10 * (high + low - 1);
        int stepsTest = 12 * log2(high) * low;

        // start most efficient method
        if (stepsEnum > stepsTest) {
            return excludeDestructiveByTest(pivot, excl);
        } else {
            return excludeDestructiveByEnumeration(pivot, excl);
        }
    }

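    // exclusion by lookup: iterate over the smaller of the two containers and remove
    // every URL hash that occurs in both from the pivot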
    private static indexContainer excludeDestructiveByTest(indexContainer pivot, indexContainer excl) {
        assert pivot.rowdef.equals(excl.rowdef) : "pivot = " + pivot.rowdef.toString() + "; excl = " + excl.rowdef.toString();
        int keylength = pivot.rowdef.width(0);
        assert (keylength == excl.rowdef.width(0));
        boolean iterate_pivot = pivot.size() < excl.size();
        Iterator<indexRWIRowEntry> se = (iterate_pivot) ? pivot.entries() : excl.entries();
        indexRWIEntry ie0, ie1;
        while (se.hasNext()) {
            ie0 = (indexRWIEntry) se.next();
            ie1 = excl.get(ie0.urlHash());
            if (ie1 != null) { // the URL hash must be excluded
                assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
                assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
                if (iterate_pivot)
                    se.remove();
                pivot.remove(ie0.urlHash().getBytes(), true);
            }
        }
        return pivot;
    }

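    // exclusion by enumeration: walk pivot and excl in parallel and remove common URL hashes from the pivot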
    private static indexContainer excludeDestructiveByEnumeration(indexContainer pivot, indexContainer excl) {
        assert pivot.rowdef.equals(excl.rowdef) : "pivot = " + pivot.rowdef.toString() + "; excl = " + excl.rowdef.toString();
        int keylength = pivot.rowdef.width(0);
        assert (keylength == excl.rowdef.width(0));
        if (!((pivot.rowdef.getOrdering().signature().equals(excl.rowdef.getOrdering().signature())) &&
              (pivot.rowdef.primaryKeyIndex == excl.rowdef.primaryKeyIndex)))
            return pivot; // ordering must be equal
        Iterator<indexRWIRowEntry> e1 = pivot.entries();
        Iterator<indexRWIRowEntry> e2 = excl.entries();
        int c;
        if ((e1.hasNext()) && (e2.hasNext())) {
            indexRWIEntry ie1;
            indexRWIEntry ie2;
            ie1 = (indexRWIEntry) e1.next();
            ie2 = (indexRWIEntry) e2.next();

            while (true) {
                assert (ie1.urlHash().length() == keylength) : "ie1.urlHash() = " + ie1.urlHash();
                assert (ie2.urlHash().length() == keylength) : "ie2.urlHash() = " + ie2.urlHash();
                c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
                //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')=" + c);
                if (c < 0) {
                    if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
                } else if (c > 0) {
                    if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
                } else {
                    // we have found the same url in both containers; remove it from the pivot
                    ie1.join(ie2);
                    e1.remove();
                    if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
                    if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
                }
            }
        }
        return pivot;
    }

    public String toString() {
        return "C[" + wordHash + "] has " + this.size() + " entries";
    }

    public int hashCode() {
        return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
    }

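    /**
     * Compress the references of a container into a string of the form
     * {<domain-hash>:<path-hashes>,<domain-hash>:<path-hashes>,...}, where each URL hash
     * is split into its path part (first 6 characters) and its domain part (the remaining
     * characters), and all path hashes that share a domain are concatenated.
     */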
    public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
        // collect references according to domains
        long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
        TreeMap<String, String> doms = new TreeMap<String, String>();
        synchronized (inputContainer) {
            Iterator<indexRWIRowEntry> i = inputContainer.entries();
            indexRWIEntry iEntry;
            String dom, paths;
            while (i.hasNext()) {
                iEntry = i.next();
                if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null))
                    continue; // do not include urls that are in excludeContainer
                dom = iEntry.urlHash().substring(6);
                if ((paths = (String) doms.get(dom)) == null) {
                    doms.put(dom, iEntry.urlHash().substring(0, 6));
                } else {
                    doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
                }
                if (System.currentTimeMillis() > timeout)
                    break;
            }
        }
        // construct a result string
        serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
        bb.append('{');
        Iterator<Map.Entry<String, String>> i = doms.entrySet().iterator();
        Map.Entry<String, String> entry;
        while (i.hasNext()) {
            entry = i.next();
            bb.append((String) entry.getKey());
            bb.append(':');
            bb.append((String) entry.getValue());
            if (System.currentTimeMillis() > timeout)
                break;
            if (i.hasNext())
                bb.append(',');
        }
        bb.append('}');
        return bb;
    }

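    /**
     * Inverse of compressIndex: parse a compressed reference string and, for each
     * reconstructed URL hash, add the given peer hash to the target map, appending
     * to any peer hashes already registered for that URL.
     */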
    public static final void decompressIndex(TreeMap<String, String> target, serverByteBuffer ci, String peerhash) {
        // target is a mapping from url-hashes to a string of peer-hashes
        if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
            //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
            ci = ci.trim(1, ci.length() - 2);
            String dom, url, peers;
            while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
                assert ci.length() >= 6 : "ci.length() = " + ci.length();
                dom = ci.toString(0, 6);
                ci.trim(7);
                while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
                    assert ci.length() >= 6 : "ci.length() = " + ci.length();
                    url = ci.toString(0, 6) + dom;
                    ci.trim(6);
                    peers = target.get(url);
                    if (peers == null) {
                        target.put(url, peerhash);
                    } else {
                        target.put(url, peers + peerhash);
                    }
                    //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
                }
                if (ci.byteAt(0) == ',')
                    ci.trim(1);
            }
        }
    }
}