001: // plasmaSearchRankingProcess.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 07.11.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.plasma;
028:
029: import java.io.File;
030: import java.io.IOException;
031: import java.util.HashMap;
032: import java.util.Iterator;
033: import java.util.Map;
034: import java.util.Set;
035: import java.util.TreeMap;
036: import java.util.TreeSet;
037:
038: import de.anomic.htmlFilter.htmlFilterContentScraper;
039: import de.anomic.index.indexContainer;
040: import de.anomic.index.indexRWIEntry;
041: import de.anomic.index.indexRWIEntryOrder;
042: import de.anomic.index.indexRWIRowEntry;
043: import de.anomic.index.indexURLEntry;
044: import de.anomic.kelondro.kelondroBinSearch;
045: import de.anomic.kelondro.kelondroMScoreCluster;
046: import de.anomic.server.serverCodings;
047: import de.anomic.server.serverFileUtils;
048: import de.anomic.server.serverProfiling;
049:
public final class plasmaSearchRankingProcess {

    // block-rank (YBR) tables; loaded by loadYBR(), null when unavailable
    public static kelondroBinSearch[] ybrTables = null; // block-rank tables
    private static boolean useYBR = true; // global on/off switch for the YBR ranking contribution

    private TreeMap<Object, indexRWIRowEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
    private HashMap<String, TreeMap<Object, indexRWIRowEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
    private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
    private plasmaSearchQuery query; // the query this process ranks results for
    private int sortorder; // 0 = by url hash, 1 = by url string, 2 = by ranking
    private int maxentries; // capacity limit of sortedRWIEntries; negative = unbounded
    private int remote_peerCount, remote_indexCount,
            remote_resourceSize, local_resourceSize; // statistics counters, see the getters below
    private indexRWIEntryOrder order; // ranking order; created lazily on first insertRanked()
    private HashMap<String, Object> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic
    private int[] flagcount; // flag counter, one slot per flag bit (32 bits)
    private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
    private plasmaWordIndex wordIndex; // the index backend used for lookups
    private Map<String, indexContainer>[] localSearchContainerMaps; // set by execQuery(); presumably [0] = inclusion, [1] = exclusion containers -- verify against localSearchContainers()
070:
071: public plasmaSearchRankingProcess(plasmaWordIndex wordIndex,
072: plasmaSearchQuery query, int sortorder, int maxentries) {
073: // we collect the urlhashes and construct a list with urlEntry objects
074: // attention: if minEntries is too high, this method will not terminate within the maxTime
075: // sortorder: 0 = hash, 1 = url, 2 = ranking
076: this .localSearchContainerMaps = null;
077: this .sortedRWIEntries = new TreeMap<Object, indexRWIRowEntry>();
078: this .doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIRowEntry>>();
079: this .handover = new HashMap<String, String>();
080: this .order = null;
081: this .query = query;
082: this .maxentries = maxentries;
083: this .remote_peerCount = 0;
084: this .remote_indexCount = 0;
085: this .remote_resourceSize = 0;
086: this .local_resourceSize = 0;
087: this .urlhashes = new HashMap<String, Object>();
088: this .ref = new kelondroMScoreCluster<String>();
089: this .misses = new TreeSet<String>();
090: this .wordIndex = wordIndex;
091: this .sortorder = sortorder;
092: this .flagcount = new int[32];
093: for (int i = 0; i < 32; i++) {
094: this .flagcount[i] = 0;
095: }
096: }
097:
098: public void execQuery() {
099:
100: long timer = System.currentTimeMillis();
101: this .localSearchContainerMaps = wordIndex
102: .localSearchContainers(query, null);
103: serverProfiling.update("SEARCH",
104: new plasmaProfiling.searchEvent(query.id(true),
105: plasmaSearchEvent.COLLECTION,
106: this .localSearchContainerMaps[0].size(), System
107: .currentTimeMillis()
108: - timer));
109:
110: // join and exclude the local result
111: timer = System.currentTimeMillis();
112: indexContainer index = (this .localSearchContainerMaps == null) ? plasmaWordIndex
113: .emptyContainer(null, 0)
114: : indexContainer.joinExcludeContainers(
115: this .localSearchContainerMaps[0].values(),
116: this .localSearchContainerMaps[1].values(),
117: query.maxDistance);
118: serverProfiling.update("SEARCH",
119: new plasmaProfiling.searchEvent(query.id(true),
120: plasmaSearchEvent.JOIN, index.size(), System
121: .currentTimeMillis()
122: - timer));
123: int joincount = index.size();
124:
125: if ((index == null) || (joincount == 0)) {
126: return;
127: }
128:
129: if (sortorder == 2) {
130: insertRanked(index, true, index.size());
131: } else {
132: insertNoOrder(index, true, index.size());
133: }
134: }
135:
136: private void insertNoOrder(indexContainer index, boolean local,
137: int fullResource) {
138: final Iterator<indexRWIRowEntry> en = index.entries();
139: // generate a new map where the urls are sorted (not by hash but by the url text)
140:
141: if (local) {
142: this .local_resourceSize += fullResource;
143: } else {
144: this .remote_resourceSize += fullResource;
145: this .remote_peerCount++;
146: this .remote_indexCount += index.size();
147: }
148:
149: indexRWIRowEntry ientry;
150: indexURLEntry uentry;
151: String u;
152: loop: while (en.hasNext()) {
153: ientry = en.next();
154:
155: // check constraints
156: if (!testFlags(ientry))
157: continue loop;
158:
159: // increase flag counts
160: for (int i = 0; i < 32; i++) {
161: if (ientry.flags().get(i)) {
162: flagcount[i]++;
163: }
164: }
165:
166: // load url
167: if (sortorder == 0) {
168: this .sortedRWIEntries.put(ientry.urlHash(), ientry);
169: this .urlhashes.put(ientry.urlHash(), ientry.urlHash());
170: } else {
171: uentry = wordIndex.loadedURL.load(ientry.urlHash(),
172: ientry, 0);
173: if (uentry == null) {
174: this .misses.add(ientry.urlHash());
175: } else {
176: u = uentry.comp().url().toNormalform(false, true);
177: this .sortedRWIEntries.put(u, ientry);
178: this .urlhashes.put(ientry.urlHash(), u);
179: }
180: }
181:
182: // interrupt if we have enough
183: if ((query.neededResults() > 0)
184: && (this .misses.size()
185: + this .sortedRWIEntries.size() > query
186: .neededResults()))
187: break loop;
188: } // end loop
189: }
190:
    // Inserts the container's entries into the ranking queue ordered by the
    // ranking cardinal (sortorder 2). Keeps at most maxentries entries,
    // evicting the worst-ranked entry when a better one arrives.
    // we collect the urlhashes and construct a list with urlEntry objects
    // attention: if minEntries is too high, this method will not terminate within the maxTime
    // local: true if the container stems from the local index, false for a remote peer
    // fullResource: total hit count of the contributing resource (statistics only)
    public void insertRanked(indexContainer index, boolean local,
            int fullResource) {

        assert (index != null);
        if (index.size() == 0)
            return;
        // statistics bookkeeping for the local/remote origin of this container
        if (local) {
            this.local_resourceSize += fullResource;
        } else {
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
        }

        // lazily create the ranking order, then normalize it with this container
        long timer = System.currentTimeMillis();
        if (this.order == null) {
            this.order = new indexRWIEntryOrder(query.ranking);
        }
        this.order.normalizeWith(index);
        serverProfiling.update("SEARCH",
                new plasmaProfiling.searchEvent(query.id(true),
                        plasmaSearchEvent.NORMALIZING, index.size(),
                        System.currentTimeMillis() - timer));

        // normalize entries and get ranking
        timer = System.currentTimeMillis();
        Iterator<indexRWIRowEntry> i = index.entries();
        indexRWIRowEntry iEntry, l;
        long biggestEntry = 0; // cardinal of the worst kept entry; 0 until the first eviction
        //long s0 = System.currentTimeMillis();
        Long r;
        while (i.hasNext()) {
            iEntry = i.next();
            // skip entries with a wrongly sized url hash
            if (iEntry.urlHash().length() != index.row().primaryKeyLength)
                continue;

            // increase flag counts
            for (int j = 0; j < 32; j++) {
                if (iEntry.flags().get(j)) {
                    flagcount[j]++;
                }
            }

            // kick out entries that are too bad according to current findings
            // NOTE(review): biggestEntry starts at 0 and is only updated in the
            // eviction branch below, so once the queue is full any entry with a
            // positive cardinal is skipped until the first eviction happened --
            // verify this is intended.
            r = new Long(order.cardinal(iEntry));
            if ((maxentries >= 0)
                    && (sortedRWIEntries.size() >= maxentries)
                    && (r.longValue() > biggestEntry))
                continue;

            // check constraints
            if (!testFlags(iEntry))
                continue;

            // for non-text searches the matching content-category flag must be set
            if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO)
                        && (!(iEntry.flags()
                                .get(plasmaCondenser.flag_cat_hasaudio))))
                    continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO)
                        && (!(iEntry.flags()
                                .get(plasmaCondenser.flag_cat_hasvideo))))
                    continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE)
                        && (!(iEntry.flags()
                                .get(plasmaCondenser.flag_cat_hasimage))))
                    continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP)
                        && (!(iEntry.flags()
                                .get(plasmaCondenser.flag_cat_hasapp))))
                    continue;
            }
            // NOTE(review): entries inserted here are never added to urlhashes,
            // so the containsKey double-checks below and remove(String) cannot
            // see them -- possibly a missing urlhashes.put after insertion.
            if ((maxentries < 0)
                    || (sortedRWIEntries.size() < maxentries)) {
                // queue not yet full: insert; equal ranking keys are resolved
                // by incrementing the key until a free slot is found
                if (urlhashes.containsKey(iEntry.urlHash()))
                    continue;
                while (sortedRWIEntries.containsKey(r))
                    r = new Long(r.longValue() + 1);
                sortedRWIEntries.put(r, iEntry);
            } else {
                // queue full: replace the worst entry if the new one is better
                if (r.longValue() > biggestEntry) {
                    continue;
                } else {
                    if (urlhashes.containsKey(iEntry.urlHash()))
                        continue;
                    l = sortedRWIEntries.remove((Long) sortedRWIEntries
                            .lastKey());
                    urlhashes.remove(l.urlHash());
                    while (sortedRWIEntries.containsKey(r))
                        r = new Long(r.longValue() + 1);
                    sortedRWIEntries.put(r, iEntry);
                    biggestEntry = order.cardinal(sortedRWIEntries
                            .get(sortedRWIEntries.lastKey()));
                }
            }

            // increase counter for statistics
            if (!local)
                this.remote_indexCount++;
        }

        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        serverProfiling.update("SEARCH",
                new plasmaProfiling.searchEvent(query.id(true),
                        plasmaSearchEvent.PRESORT, index.size(), System
                                .currentTimeMillis()
                                - timer));
    }
300:
301: private boolean testFlags(indexRWIEntry ientry) {
302: if (query.constraint == null)
303: return true;
304: // test if ientry matches with filter
305: // if all = true: let only entries pass that has all matching bits
306: // if all = false: let all entries pass that has at least one matching bit
307: if (query.allofconstraint) {
308: for (int i = 0; i < 32; i++) {
309: if ((query.constraint.get(i))
310: && (!ientry.flags().get(i)))
311: return false;
312: }
313: return true;
314: }
315: for (int i = 0; i < 32; i++) {
316: if ((query.constraint.get(i)) && (ientry.flags().get(i)))
317: return true;
318: }
319: return false;
320: }
321:
    public synchronized Map<String, indexContainer>[] searchContainerMaps() {
        // direct access to the result maps is needed for abstract generation
        // this is only available if execQuery() was called before; returns
        // null (or a stale value) otherwise
        return localSearchContainerMaps;
    }
327:
328: // todo:
329: // - remove redundant urls (sub-path occurred before)
330: // - move up shorter urls
331: // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
332:
333: private synchronized Object[] /*{Object, indexRWIEntry}*/bestRWI(
334: boolean skipDoubleDom) {
335: // returns from the current RWI list the best entry and removed this entry from the list
336: Object bestEntry;
337: TreeMap<Object, indexRWIRowEntry> m;
338: indexRWIRowEntry rwi;
339: while (sortedRWIEntries.size() > 0) {
340: bestEntry = sortedRWIEntries.firstKey();
341: rwi = sortedRWIEntries.remove(bestEntry);
342: if (!skipDoubleDom)
343: return new Object[] { bestEntry, rwi };
344: // check doubledom
345: String domhash = rwi.urlHash().substring(6);
346: m = this .doubleDomCache.get(domhash);
347: if (m == null) {
348: // first appearance of dom
349: m = new TreeMap<Object, indexRWIRowEntry>();
350: this .doubleDomCache.put(domhash, m);
351: return new Object[] { bestEntry, rwi };
352: }
353: // second appearances of dom
354: m.put(bestEntry, rwi);
355: }
356: // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
357: // find best entry from all caches
358: Iterator<TreeMap<Object, indexRWIRowEntry>> i = this .doubleDomCache
359: .values().iterator();
360: bestEntry = null;
361: Object o;
362: indexRWIRowEntry bestrwi = null;
363: while (i.hasNext()) {
364: m = i.next();
365: if (m.size() == 0)
366: continue;
367: if (bestEntry == null) {
368: bestEntry = m.firstKey();
369: bestrwi = m.remove(bestEntry);
370: continue;
371: }
372: o = m.firstKey();
373: rwi = m.remove(o);
374: if (o instanceof Long) {
375: if (((Long) o).longValue() < ((Long) bestEntry)
376: .longValue()) {
377: bestEntry = o;
378: bestrwi = rwi;
379: }
380: }
381: if (o instanceof String) {
382: if (((String) o).compareTo((String) bestEntry) < 0) {
383: bestEntry = o;
384: bestrwi = rwi;
385: }
386: }
387: }
388: if (bestrwi == null)
389: return null;
390: // finally remove the best entry from the doubledom cache
391: m = this .doubleDomCache.get(bestrwi.urlHash().substring(6));
392: m.remove(bestEntry);
393: return new Object[] { bestEntry, bestrwi };
394: }
395:
396: public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
397: // returns from the current RWI list the best URL entry and removed this entry from the list
398: while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
399: Object[] obrwi = bestRWI(skipDoubleDom);
400: Object bestEntry = obrwi[0];
401: indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1];
402: long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry)
403: .longValue()
404: : 0;
405: indexURLEntry u = wordIndex.loadedURL.load(
406: ientry.urlHash(), ientry, ranking);
407: if (u != null) {
408: indexURLEntry.Components comp = u.comp();
409: if (comp.url() != null)
410: this .handover.put(u.hash(), comp.url()
411: .toNormalform(true, false)); // remember that we handed over this url
412: return u;
413: }
414: misses.add(ientry.urlHash());
415: }
416: return null;
417: }
418:
419: public synchronized int size() {
420: //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
421: int c = sortedRWIEntries.size();
422: Iterator<TreeMap<Object, indexRWIRowEntry>> i = this .doubleDomCache
423: .values().iterator();
424: while (i.hasNext())
425: c += i.next().size();
426: return c;
427: }
428:
    public int[] flagCount() {
        // per-bit counters of the flags seen on all inserted entries
        return flagcount;
    }

    // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."

    public int filteredCount() {
        // the number of index entries that are considered as result set
        return this.sortedRWIEntries.size();
    }

    public int getRemoteIndexCount() {
        // the number of result contributions from all the remote peers
        return this.remote_indexCount;
    }

    public int getRemotePeerCount() {
        // the number of remote peers that have contributed
        return this.remote_peerCount;
    }

    public int getRemoteResourceSize() {
        // the number of all hits in all the remote peers
        return this.remote_resourceSize;
    }

    public int getLocalResourceSize() {
        // the number of hits in the local peer (index size, size of the collection in the own index)
        return this.local_resourceSize;
    }
459:
460: public indexRWIEntry remove(String urlHash) {
461: Object r = (Long) urlhashes.get(urlHash);
462: if (r == null)
463: return null;
464: assert sortedRWIEntries.containsKey(r);
465: indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries
466: .remove(r);
467: urlhashes.remove(urlHash);
468: return iEntry;
469: }
470:
    public Iterator<String> miss() {
        // iterator over url hashes that could not be resolved in the LURL-DB
        return this.misses.iterator();
    }
474:
475: public Set<String> getReferences(int count) {
476: // create a list of words that had been computed by statistics over all
477: // words that appeared in the url or the description of all urls
478: Object[] refs = ref.getScores(count, false, 2,
479: Integer.MAX_VALUE);
480: TreeSet<String> s = new TreeSet<String>(
481: String.CASE_INSENSITIVE_ORDER);
482: for (int i = 0; i < refs.length; i++) {
483: s.add((String) refs[i]);
484: }
485: return s;
486: }
487:
488: public void addReferences(String[] words) {
489: String word;
490: for (int i = 0; i < words.length; i++) {
491: word = words[i].toLowerCase();
492: if ((word.length() > 2)
493: && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_"
494: .indexOf(word) < 0)
495: && (!(query.queryHashes.contains(plasmaCondenser
496: .word2hash(word)))))
497: ref.incScore(word);
498: }
499: }
500:
501: protected void addReferences(
502: plasmaSearchEvent.ResultEntry resultEntry) {
503: // take out relevant information for reference computation
504: if ((resultEntry.url() == null)
505: || (resultEntry.title() == null))
506: return;
507: String[] urlcomps = htmlFilterContentScraper
508: .urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
509: String[] descrcomps = resultEntry.title().toLowerCase().split(
510: htmlFilterContentScraper.splitrex); // words in the description
511:
512: // add references
513: addReferences(urlcomps);
514: addReferences(descrcomps);
515: }
516:
    public indexRWIEntryOrder getOrder() {
        // the ranking order used by insertRanked(); null until the first ranked insert
        return this.order;
    }
520:
521: public static void loadYBR(File rankingPath, int count) {
522: // load ranking tables
523: if (rankingPath.exists()) {
524: ybrTables = new kelondroBinSearch[count];
525: String ybrName;
526: File f;
527: try {
528: for (int i = 0; i < count; i++) {
529: ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2)
530: + ".idx";
531: f = new File(rankingPath, ybrName);
532: if (f.exists()) {
533: ybrTables[i] = new kelondroBinSearch(
534: serverFileUtils.read(f), 6);
535: } else {
536: ybrTables[i] = null;
537: }
538: }
539: } catch (IOException e) {
540: ybrTables = null;
541: }
542: } else {
543: ybrTables = null;
544: }
545: }
546:
    public static boolean canUseYBR() {
        // true if the YBR tables were loaded successfully by loadYBR()
        return ybrTables != null;
    }

    public static boolean isUsingYBR() {
        // current state of the global YBR switch
        return useYBR;
    }

    public static void switchYBR(boolean usage) {
        // enables or disables the YBR ranking contribution globally
        useYBR = usage;
    }
558:
559: public static int ybr(String urlHash) {
560: // returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
561: if (ybrTables == null)
562: return 15;
563: if (!(useYBR))
564: return 15;
565: final String domHash = urlHash.substring(6);
566: for (int i = 0; i < ybrTables.length; i++) {
567: if ((ybrTables[i] != null)
568: && (ybrTables[i].contains(domHash.getBytes()))) {
569: //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
570: return i;
571: }
572: }
573: //System.out.println("NOT FOUND: " + urlHash);
574: return 15;
575: }
576:
577: public long postRanking(Set<String> topwords,
578: plasmaSearchEvent.ResultEntry rentry, int position) {
579:
580: long r = (255 - position) << 8;
581:
582: // for media search: prefer pages with many links
583: if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE)
584: r += rentry.limage() << query.ranking.coeff_cathasimage;
585: if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO)
586: r += rentry.laudio() << query.ranking.coeff_cathasaudio;
587: if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO)
588: r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
589: if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP)
590: r += rentry.lapp() << query.ranking.coeff_cathasapp;
591:
592: // prefer hit with 'prefer' pattern
593: if (rentry.url().toNormalform(true, true).matches(query.prefer))
594: r += 256 << query.ranking.coeff_prefer;
595: if (rentry.title().matches(query.prefer))
596: r += 256 << query.ranking.coeff_prefer;
597:
598: // apply 'common-sense' heuristic using references
599: String urlstring = rentry.url().toNormalform(true, true);
600: String[] urlcomps = htmlFilterContentScraper
601: .urlComps(urlstring);
602: String[] descrcomps = rentry.title().toLowerCase().split(
603: htmlFilterContentScraper.splitrex);
604: for (int j = 0; j < urlcomps.length; j++) {
605: if (topwords.contains(urlcomps[j]))
606: r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
607: }
608: for (int j = 0; j < descrcomps.length; j++) {
609: if (topwords.contains(descrcomps[j]))
610: r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
611: }
612:
613: // apply query-in-result matching
614: Set<String> urlcomph = plasmaCondenser.words2hashSet(urlcomps);
615: Set<String> descrcomph = plasmaCondenser
616: .words2hashSet(descrcomps);
617: Iterator<String> shi = query.queryHashes.iterator();
618: String queryhash;
619: while (shi.hasNext()) {
620: queryhash = shi.next();
621: if (urlcomph.contains(queryhash))
622: r += 256 << query.ranking.coeff_appurl;
623: if (descrcomph.contains(queryhash))
624: r += 256 << query.ranking.coeff_app_dc_title;
625: }
626:
627: return r;
628: }
629: }
|