001: // indexRWIEntryOrder.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 07.11.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.index;
028:
029: import java.util.HashMap;
030: import java.util.Iterator;
031: import java.util.Map;
032:
033: import de.anomic.kelondro.kelondroAbstractOrder;
034: import de.anomic.kelondro.kelondroBitfield;
035: import de.anomic.kelondro.kelondroMScoreCluster;
036: import de.anomic.kelondro.kelondroOrder;
037: import de.anomic.plasma.plasmaCondenser;
038: import de.anomic.plasma.plasmaSearchRankingProcess;
039: import de.anomic.plasma.plasmaSearchRankingProfile;
040: import de.anomic.yacy.yacyURL;
041:
042: public class indexRWIEntryOrder extends
043: kelondroAbstractOrder<indexRWIVarEntry> implements
044: kelondroOrder<indexRWIVarEntry> {
045: private indexRWIVarEntry min, max;
046: private plasmaSearchRankingProfile ranking;
047: private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
048: private int maxdomcount;
049:
050: private static final int processors = Runtime.getRuntime()
051: .availableProcessors(); // for multiprocessor support, used during normalization
052:
053: public indexRWIEntryOrder(plasmaSearchRankingProfile profile) {
054: this .min = null;
055: this .max = null;
056: this .ranking = profile;
057: this .doms = new kelondroMScoreCluster<String>();
058: this .maxdomcount = 0;
059: }
060:
061: public void normalizeWith(indexContainer container) {
062: // normalize ranking: find minimum and maxiumum of separate ranking criteria
063: assert (container != null);
064:
065: //long s0 = System.currentTimeMillis();
066: if ((processors > 1) && (container.size() > 10000)) {
067: // run minmax with two threads
068: int middle = container.size() / 2;
069: minmaxfinder mmf0 = new minmaxfinder(container, 0, middle);
070: mmf0.start(); // fork here
071: minmaxfinder mmf1 = new minmaxfinder(container, middle,
072: container.size());
073: mmf1.run(); // execute other fork in this thread
074: if (this .min == null)
075: this .min = mmf1.entryMin;
076: else
077: indexRWIVarEntry.min(this .min, mmf1.entryMin);
078: if (this .max == null)
079: this .max = mmf1.entryMax;
080: else
081: indexRWIVarEntry.max(this .max, mmf1.entryMax);
082: Map.Entry<String, Integer> entry;
083: Iterator<Map.Entry<String, Integer>> di = mmf1.domcount()
084: .entrySet().iterator();
085: while (di.hasNext()) {
086: entry = di.next();
087: this .doms.addScore(entry.getKey(), ((Integer) entry
088: .getValue()).intValue());
089: }
090: try {
091: mmf0.join();
092: } catch (InterruptedException e) {
093: } // wait for fork thread to finish
094: if (this .min == null)
095: this .min = mmf0.entryMin;
096: else
097: indexRWIVarEntry.min(this .min, mmf0.entryMin);
098: if (this .max == null)
099: this .max = mmf0.entryMax;
100: else
101: indexRWIVarEntry.max(this .max, mmf0.entryMax);
102: di = mmf0.domcount().entrySet().iterator();
103: while (di.hasNext()) {
104: entry = di.next();
105: this .doms.addScore(entry.getKey(), ((Integer) entry
106: .getValue()).intValue());
107: }
108: //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
109: //System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
110: } else if (container.size() > 0) {
111: // run minmax in one thread
112: minmaxfinder mmf = new minmaxfinder(container, 0, container
113: .size());
114: mmf.run(); // execute without multi-threading
115: if (this .min == null)
116: this .min = mmf.entryMin;
117: else
118: indexRWIVarEntry.min(this .min, mmf.entryMin);
119: if (this .max == null)
120: this .max = mmf.entryMax;
121: else
122: indexRWIVarEntry.max(this .max, mmf.entryMax);
123: Map.Entry<String, Integer> entry;
124: Iterator<Map.Entry<String, Integer>> di = mmf.domcount()
125: .entrySet().iterator();
126: while (di.hasNext()) {
127: entry = di.next();
128: this .doms.addScore(entry.getKey(), ((Integer) entry
129: .getValue()).intValue());
130: }
131: //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
132: //System.out.println("***DEBUG*** indexRWIEntry.Order (ONETHREAD): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
133: }
134: if (this .doms.size() > 0)
135: this .maxdomcount = this .doms.getMaxScore();
136: }
137:
138: public kelondroOrder<indexRWIVarEntry> clone() {
139: return null;
140: }
141:
142: public int authority(String urlHash) {
143: return (doms.getScore(urlHash.substring(6)) << 8)
144: / (1 + this .maxdomcount);
145: }
146:
147: public long cardinal(byte[] key) {
148: return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
149: }
150:
151: public long cardinal(indexRWIRowEntry t) {
152: return cardinal(new indexRWIVarEntry(t));
153: }
154:
155: public long cardinal(indexRWIVarEntry t) {
156: //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
157: // the normalizedEntry must be a normalized indexEntry
158: kelondroBitfield flags = t.flags();
159: long r = ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
160: + ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
161: + ((t.urlcomps() == 0) ? 0
162: : ((256 - (((t.urlcomps() - min.urlcomps()) << 8) / (1 + max
163: .urlcomps() - min.urlcomps()))) << ranking.coeff_urlcomps))
164: + ((t.urllength() == 0) ? 0
165: : ((256 - (((t.urllength() - min.urllength()) << 8) / (1 + max
166: .urllength() - min.urllength()))) << ranking.coeff_urllength))
167: + ((t.posintext() == 0) ? 0
168: : ((256 - (((t.posintext() - min.posintext()) << 8) / (1 + max
169: .posintext() - min.posintext()))) << ranking.coeff_posintext))
170: + ((t.posofphrase() == 0) ? 0
171: : ((256 - (((t.posofphrase() - min
172: .posofphrase()) << 8) / (1 + max
173: .posofphrase() - min.posofphrase()))) << ranking.coeff_posofphrase))
174: + ((t.posinphrase() == 0) ? 0
175: : ((256 - (((t.posinphrase() - min
176: .posinphrase()) << 8) / (1 + max
177: .posinphrase() - min.posinphrase()))) << ranking.coeff_posinphrase))
178: + ((256 - (((t.worddistance() - min.worddistance()) << 8) / (1 + max
179: .worddistance() - min.worddistance()))) << ranking.coeff_worddistance)
180: + ((((t.virtualAge() - min.virtualAge()) << 8) / (1 + max
181: .virtualAge() - min.virtualAge())) << ranking.coeff_date)
182: + ((((t.wordsintitle() - min.wordsintitle()) << 8) / (1 + max
183: .wordsintitle() - min.wordsintitle())) << ranking.coeff_wordsintitle)
184: + ((((t.wordsintext() - min.wordsintext()) << 8) / (1 + max
185: .wordsintext() - min.wordsintext())) << ranking.coeff_wordsintext)
186: + ((((t.phrasesintext() - min.phrasesintext()) << 8) / (1 + max
187: .phrasesintext() - min.phrasesintext())) << ranking.coeff_phrasesintext)
188: + ((((t.llocal() - min.llocal()) << 8) / (1 + max
189: .llocal() - min.llocal())) << ranking.coeff_llocal)
190: + ((((t.lother() - min.lother()) << 8) / (1 + max
191: .lother() - min.lother())) << ranking.coeff_lother)
192: + ((((t.hitcount() - min.hitcount()) << 8) / (1 + max
193: .hitcount() - min.hitcount())) << ranking.coeff_hitcount)
194: + (((int) ((((t.termFrequency() - min.termFrequency()) * 256.0) / (1 + max
195: .termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency)
196: + (authority(t.urlHash()) << ranking.coeff_authority)
197: + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl
198: : 0))
199: + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title
200: : 0))
201: + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator
202: : 0))
203: + (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject
204: : 0))
205: + (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description
206: : 0))
207: + (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph
208: : 0))
209: + (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof
210: : 0))
211: + (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage
212: : 0))
213: + (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio
214: : 0))
215: + (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo
216: : 0))
217: + (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp
218: : 0))
219: + (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength
220: : 0));
221: //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
222:
223: return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
224: }
225:
226: public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
227: long ca = cardinal(a);
228: long cb = cardinal(b);
229: return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
230: }
231:
232: public String signature() {
233: return "rx";
234: }
235:
236: public boolean wellformed(indexRWIVarEntry a) {
237: return true;
238: }
239:
240: public static class minmaxfinder extends Thread {
241:
242: private indexRWIVarEntry entryMin, entryMax;
243: private indexContainer container;
244: private int start, end;
245: private HashMap<String, Integer> doms;
246: private Integer int1;
247:
248: public minmaxfinder(indexContainer container,
249: int start /*including*/, int end /*excluding*/) {
250: this .container = container;
251: this .start = start;
252: this .end = end;
253: this .doms = new HashMap<String, Integer>();
254: this .int1 = new Integer(1);
255: }
256:
257: public void run() {
258: // find min/max to obtain limits for normalization
259: this .entryMin = null;
260: this .entryMax = null;
261: indexRWIRowEntry iEntry;
262: int p = this .start;
263: String dom;
264: Integer count;
265: while (p < this .end) {
266: iEntry = new indexRWIRowEntry(container.get(p++));
267: // find min/max
268: if (this .entryMin == null)
269: this .entryMin = new indexRWIVarEntry(iEntry);
270: else
271: indexRWIVarEntry.min(this .entryMin, iEntry);
272: if (this .entryMax == null)
273: this .entryMax = new indexRWIVarEntry(iEntry);
274: else
275: indexRWIVarEntry.max(this .entryMax, iEntry);
276: // update domcount
277: dom = iEntry.urlHash().substring(6);
278: count = (Integer) doms.get(dom);
279: if (count == null) {
280: doms.put(dom, int1);
281: } else {
282: doms.put(dom, new Integer(count.intValue() + 1));
283: }
284: }
285: }
286:
287: public HashMap<String, Integer> domcount() {
288: return this.doms;
289: }
290: }
291:
292: }
|