Source Code Cross Referenced for plasmaSearchRankingProcess.java in  » Search-Engine » yacy » de » anomic » plasma » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » yacy » de.anomic.plasma 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        // plasmaSearchRankingProcess.java
002:        // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003:        // first published 07.11.2007 on http://yacy.net
004:        //
005:        // This is a part of YaCy, a peer-to-peer based web search engine
006:        //
007:        // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008:        // $LastChangedRevision: 1986 $
009:        // $LastChangedBy: orbiter $
010:        //
011:        // LICENSE
012:        // 
013:        // This program is free software; you can redistribute it and/or modify
014:        // it under the terms of the GNU General Public License as published by
015:        // the Free Software Foundation; either version 2 of the License, or
016:        // (at your option) any later version.
017:        //
018:        // This program is distributed in the hope that it will be useful,
019:        // but WITHOUT ANY WARRANTY; without even the implied warranty of
020:        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
021:        // GNU General Public License for more details.
022:        //
023:        // You should have received a copy of the GNU General Public License
024:        // along with this program; if not, write to the Free Software
025:        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026:
027:        package de.anomic.plasma;
028:
029:        import java.io.File;
030:        import java.io.IOException;
031:        import java.util.HashMap;
032:        import java.util.Iterator;
033:        import java.util.Map;
034:        import java.util.Set;
035:        import java.util.TreeMap;
036:        import java.util.TreeSet;
037:
038:        import de.anomic.htmlFilter.htmlFilterContentScraper;
039:        import de.anomic.index.indexContainer;
040:        import de.anomic.index.indexRWIEntry;
041:        import de.anomic.index.indexRWIEntryOrder;
042:        import de.anomic.index.indexRWIRowEntry;
043:        import de.anomic.index.indexURLEntry;
044:        import de.anomic.kelondro.kelondroBinSearch;
045:        import de.anomic.kelondro.kelondroMScoreCluster;
046:        import de.anomic.server.serverCodings;
047:        import de.anomic.server.serverFileUtils;
048:        import de.anomic.server.serverProfiling;
049:
public final class plasmaSearchRankingProcess {

    public static kelondroBinSearch[] ybrTables = null; // block-rank tables, loaded via loadYBR(); null disables YBR
    private static boolean useYBR = true; // runtime switch for the YBR ranking contribution

    private TreeMap<Object, indexRWIRowEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
    private HashMap<String, TreeMap<Object, indexRWIRowEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
    private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
    private plasmaSearchQuery query;
    private int sortorder; // 0 = by url hash, 1 = by url string, 2 = by ranking (see constructor)
    private int maxentries; // capacity limit for sortedRWIEntries in insertRanked(); negative = unbounded
    private int remote_peerCount, remote_indexCount,
            remote_resourceSize, local_resourceSize; // statistics about local/remote result contributions
    private indexRWIEntryOrder order; // ranking order; lazily created on first insertRanked() call
    private HashMap<String, Object> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic
    private int[] flagcount; // flag counter, one slot per flag bit (32 bits)
    private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
    private plasmaWordIndex wordIndex;
    private Map<String, indexContainer>[] localSearchContainerMaps; // set by execQuery(); presumably [0] = inclusion, [1] = exclusion containers — verify against localSearchContainers()
070:
/**
 * Creates an empty ranking process for the given query.
 * We collect the urlhashes and construct a list with urlEntry objects.
 * Attention: if minEntries is too high, this method will not terminate within the maxTime.
 *
 * @param wordIndex  the word index used to resolve url hashes
 * @param query      the search query this process ranks results for
 * @param sortorder  0 = sort by url hash, 1 = by url string, 2 = by ranking
 * @param maxentries capacity limit for the ranked entry list; negative = unbounded
 */
public plasmaSearchRankingProcess(plasmaWordIndex wordIndex,
        plasmaSearchQuery query, int sortorder, int maxentries) {
    this.localSearchContainerMaps = null;
    this.sortedRWIEntries = new TreeMap<Object, indexRWIRowEntry>();
    this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIRowEntry>>();
    this.handover = new HashMap<String, String>();
    this.order = null;
    this.query = query;
    this.maxentries = maxentries;
    this.remote_peerCount = 0;
    this.remote_indexCount = 0;
    this.remote_resourceSize = 0;
    this.local_resourceSize = 0;
    this.urlhashes = new HashMap<String, Object>();
    this.ref = new kelondroMScoreCluster<String>();
    this.misses = new TreeSet<String>();
    this.wordIndex = wordIndex;
    this.sortorder = sortorder;
    // int[] elements are zero-initialized by the JVM; the original explicit
    // zeroing loop was redundant and has been removed
    this.flagcount = new int[32];
}
097:
098:            public void execQuery() {
099:
100:                long timer = System.currentTimeMillis();
101:                this .localSearchContainerMaps = wordIndex
102:                        .localSearchContainers(query, null);
103:                serverProfiling.update("SEARCH",
104:                        new plasmaProfiling.searchEvent(query.id(true),
105:                                plasmaSearchEvent.COLLECTION,
106:                                this .localSearchContainerMaps[0].size(), System
107:                                        .currentTimeMillis()
108:                                        - timer));
109:
110:                // join and exclude the local result
111:                timer = System.currentTimeMillis();
112:                indexContainer index = (this .localSearchContainerMaps == null) ? plasmaWordIndex
113:                        .emptyContainer(null, 0)
114:                        : indexContainer.joinExcludeContainers(
115:                                this .localSearchContainerMaps[0].values(),
116:                                this .localSearchContainerMaps[1].values(),
117:                                query.maxDistance);
118:                serverProfiling.update("SEARCH",
119:                        new plasmaProfiling.searchEvent(query.id(true),
120:                                plasmaSearchEvent.JOIN, index.size(), System
121:                                        .currentTimeMillis()
122:                                        - timer));
123:                int joincount = index.size();
124:
125:                if ((index == null) || (joincount == 0)) {
126:                    return;
127:                }
128:
129:                if (sortorder == 2) {
130:                    insertRanked(index, true, index.size());
131:                } else {
132:                    insertNoOrder(index, true, index.size());
133:                }
134:            }
135:
136:            private void insertNoOrder(indexContainer index, boolean local,
137:                    int fullResource) {
138:                final Iterator<indexRWIRowEntry> en = index.entries();
139:                // generate a new map where the urls are sorted (not by hash but by the url text)
140:
141:                if (local) {
142:                    this .local_resourceSize += fullResource;
143:                } else {
144:                    this .remote_resourceSize += fullResource;
145:                    this .remote_peerCount++;
146:                    this .remote_indexCount += index.size();
147:                }
148:
149:                indexRWIRowEntry ientry;
150:                indexURLEntry uentry;
151:                String u;
152:                loop: while (en.hasNext()) {
153:                    ientry = en.next();
154:
155:                    // check constraints
156:                    if (!testFlags(ientry))
157:                        continue loop;
158:
159:                    // increase flag counts
160:                    for (int i = 0; i < 32; i++) {
161:                        if (ientry.flags().get(i)) {
162:                            flagcount[i]++;
163:                        }
164:                    }
165:
166:                    // load url
167:                    if (sortorder == 0) {
168:                        this .sortedRWIEntries.put(ientry.urlHash(), ientry);
169:                        this .urlhashes.put(ientry.urlHash(), ientry.urlHash());
170:                    } else {
171:                        uentry = wordIndex.loadedURL.load(ientry.urlHash(),
172:                                ientry, 0);
173:                        if (uentry == null) {
174:                            this .misses.add(ientry.urlHash());
175:                        } else {
176:                            u = uentry.comp().url().toNormalform(false, true);
177:                            this .sortedRWIEntries.put(u, ientry);
178:                            this .urlhashes.put(ientry.urlHash(), u);
179:                        }
180:                    }
181:
182:                    // interrupt if we have enough
183:                    if ((query.neededResults() > 0)
184:                            && (this .misses.size()
185:                                    + this .sortedRWIEntries.size() > query
186:                                    .neededResults()))
187:                        break loop;
188:                } // end loop
189:            }
190:
191:            public void insertRanked(indexContainer index, boolean local,
192:                    int fullResource) {
193:                // we collect the urlhashes and construct a list with urlEntry objects
194:                // attention: if minEntries is too high, this method will not terminate within the maxTime
195:
196:                assert (index != null);
197:                if (index.size() == 0)
198:                    return;
199:                if (local) {
200:                    this .local_resourceSize += fullResource;
201:                } else {
202:                    this .remote_resourceSize += fullResource;
203:                    this .remote_peerCount++;
204:                }
205:
206:                long timer = System.currentTimeMillis();
207:                if (this .order == null) {
208:                    this .order = new indexRWIEntryOrder(query.ranking);
209:                }
210:                this .order.normalizeWith(index);
211:                serverProfiling.update("SEARCH",
212:                        new plasmaProfiling.searchEvent(query.id(true),
213:                                plasmaSearchEvent.NORMALIZING, index.size(),
214:                                System.currentTimeMillis() - timer));
215:
216:                // normalize entries and get ranking
217:                timer = System.currentTimeMillis();
218:                Iterator<indexRWIRowEntry> i = index.entries();
219:                indexRWIRowEntry iEntry, l;
220:                long biggestEntry = 0;
221:                //long s0 = System.currentTimeMillis();
222:                Long r;
223:                while (i.hasNext()) {
224:                    iEntry = i.next();
225:                    if (iEntry.urlHash().length() != index.row().primaryKeyLength)
226:                        continue;
227:
228:                    // increase flag counts
229:                    for (int j = 0; j < 32; j++) {
230:                        if (iEntry.flags().get(j)) {
231:                            flagcount[j]++;
232:                        }
233:                    }
234:
235:                    // kick out entries that are too bad according to current findings
236:                    r = new Long(order.cardinal(iEntry));
237:                    if ((maxentries >= 0)
238:                            && (sortedRWIEntries.size() >= maxentries)
239:                            && (r.longValue() > biggestEntry))
240:                        continue;
241:
242:                    // check constraints
243:                    if (!testFlags(iEntry))
244:                        continue;
245:
246:                    if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
247:                        if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO)
248:                                && (!(iEntry.flags()
249:                                        .get(plasmaCondenser.flag_cat_hasaudio))))
250:                            continue;
251:                        if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO)
252:                                && (!(iEntry.flags()
253:                                        .get(plasmaCondenser.flag_cat_hasvideo))))
254:                            continue;
255:                        if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE)
256:                                && (!(iEntry.flags()
257:                                        .get(plasmaCondenser.flag_cat_hasimage))))
258:                            continue;
259:                        if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP)
260:                                && (!(iEntry.flags()
261:                                        .get(plasmaCondenser.flag_cat_hasapp))))
262:                            continue;
263:                    }
264:                    if ((maxentries < 0)
265:                            || (sortedRWIEntries.size() < maxentries)) {
266:                        if (urlhashes.containsKey(iEntry.urlHash()))
267:                            continue;
268:                        while (sortedRWIEntries.containsKey(r))
269:                            r = new Long(r.longValue() + 1);
270:                        sortedRWIEntries.put(r, iEntry);
271:                    } else {
272:                        if (r.longValue() > biggestEntry) {
273:                            continue;
274:                        } else {
275:                            if (urlhashes.containsKey(iEntry.urlHash()))
276:                                continue;
277:                            l = sortedRWIEntries.remove((Long) sortedRWIEntries
278:                                    .lastKey());
279:                            urlhashes.remove(l.urlHash());
280:                            while (sortedRWIEntries.containsKey(r))
281:                                r = new Long(r.longValue() + 1);
282:                            sortedRWIEntries.put(r, iEntry);
283:                            biggestEntry = order.cardinal(sortedRWIEntries
284:                                    .get(sortedRWIEntries.lastKey()));
285:                        }
286:                    }
287:
288:                    // increase counter for statistics
289:                    if (!local)
290:                        this .remote_indexCount++;
291:                }
292:
293:                //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
294:                serverProfiling.update("SEARCH",
295:                        new plasmaProfiling.searchEvent(query.id(true),
296:                                plasmaSearchEvent.PRESORT, index.size(), System
297:                                        .currentTimeMillis()
298:                                        - timer));
299:            }
300:
301:            private boolean testFlags(indexRWIEntry ientry) {
302:                if (query.constraint == null)
303:                    return true;
304:                // test if ientry matches with filter
305:                // if all = true: let only entries pass that has all matching bits
306:                // if all = false: let all entries pass that has at least one matching bit
307:                if (query.allofconstraint) {
308:                    for (int i = 0; i < 32; i++) {
309:                        if ((query.constraint.get(i))
310:                                && (!ientry.flags().get(i)))
311:                            return false;
312:                    }
313:                    return true;
314:                }
315:                for (int i = 0; i < 32; i++) {
316:                    if ((query.constraint.get(i)) && (ientry.flags().get(i)))
317:                        return true;
318:                }
319:                return false;
320:            }
321:
public synchronized Map<String, indexContainer>[] searchContainerMaps() {
    // direct access to the result maps is needed for abstract generation;
    // this is only available if execQuery() was called before — null otherwise
    return localSearchContainerMaps;
}
327:
328:            // todo:
329:            // - remove redundant urls (sub-path occurred before)
330:            // - move up shorter urls
331:            // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
332:
/**
 * Returns the best entry from the current RWI list as {key, indexRWIRowEntry}
 * and removes that entry from the list. With skipDoubleDom, repeated
 * appearances of a domain are deferred into per-domain queues (doubleDomCache)
 * and only served once the main list is exhausted.
 * May return null when both the sorted list and all per-domain queues are empty.
 */
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(
        boolean skipDoubleDom) {
    Object bestEntry;
    TreeMap<Object, indexRWIRowEntry> m;
    indexRWIRowEntry rwi;
    while (sortedRWIEntries.size() > 0) {
        bestEntry = sortedRWIEntries.firstKey();
        rwi = sortedRWIEntries.remove(bestEntry);
        if (!skipDoubleDom)
            return new Object[] { bestEntry, rwi };
        // check doubledom: the domain hash is the tail of the url hash
        String domhash = rwi.urlHash().substring(6);
        m = this.doubleDomCache.get(domhash);
        if (m == null) {
            // first appearance of this domain: hand the entry out directly
            m = new TreeMap<Object, indexRWIRowEntry>();
            this.doubleDomCache.put(domhash, m);
            return new Object[] { bestEntry, rwi };
        }
        // repeated appearance of the domain: defer into the per-domain queue
        m.put(bestEntry, rwi);
    }
    // no more entries in sorted RWI entries. Now take elements from the
    // doubleDomCache: scan every per-domain queue and pick the smallest key.
    // NOTE(review): candidates polled from the losing queues (m.remove) are not
    // re-inserted, so they appear to be dropped — verify this is intended.
    Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache
            .values().iterator();
    bestEntry = null;
    Object o;
    indexRWIRowEntry bestrwi = null;
    while (i.hasNext()) {
        m = i.next();
        if (m.size() == 0)
            continue;
        if (bestEntry == null) {
            bestEntry = m.firstKey();
            bestrwi = m.remove(bestEntry);
            continue;
        }
        o = m.firstKey();
        rwi = m.remove(o);
        // keys are Long rankings (sortorder 2) or Strings (sortorder 0/1);
        // smaller key = better entry in both cases
        if (o instanceof Long) {
            if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
                bestEntry = o;
                bestrwi = rwi;
            }
        }
        if (o instanceof String) {
            if (((String) o).compareTo((String) bestEntry) < 0) {
                bestEntry = o;
                bestrwi = rwi;
            }
        }
    }
    if (bestrwi == null)
        return null;
    // finally remove the best entry from the doubledom cache
    m = this.doubleDomCache.get(bestrwi.urlHash().substring(6));
    m.remove(bestEntry);
    return new Object[] { bestEntry, bestrwi };
}
395:
396:            public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
397:                // returns from the current RWI list the best URL entry and removed this entry from the list
398:                while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
399:                    Object[] obrwi = bestRWI(skipDoubleDom);
400:                    Object bestEntry = obrwi[0];
401:                    indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1];
402:                    long ranking = (bestEntry instanceof  Long) ? ((Long) bestEntry)
403:                            .longValue()
404:                            : 0;
405:                    indexURLEntry u = wordIndex.loadedURL.load(
406:                            ientry.urlHash(), ientry, ranking);
407:                    if (u != null) {
408:                        indexURLEntry.Components comp = u.comp();
409:                        if (comp.url() != null)
410:                            this .handover.put(u.hash(), comp.url()
411:                                    .toNormalform(true, false)); // remember that we handed over this url
412:                        return u;
413:                    }
414:                    misses.add(ientry.urlHash());
415:                }
416:                return null;
417:            }
418:
419:            public synchronized int size() {
420:                //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
421:                int c = sortedRWIEntries.size();
422:                Iterator<TreeMap<Object, indexRWIRowEntry>> i = this .doubleDomCache
423:                        .values().iterator();
424:                while (i.hasNext())
425:                    c += i.next().size();
426:                return c;
427:            }
428:
public int[] flagCount() {
    // per-bit counters of entry flags seen during insert;
    // returns the live internal array, not a copy
    return flagcount;
}
432:
433:            // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
434:
public int filteredCount() {
    // the number of index entries that are considered as result set
    // (entries deferred into the doubleDomCache are not included)
    return this.sortedRWIEntries.size();
}
439:
public int getRemoteIndexCount() {
    // the number of result contributions from all the remote peers
    return this.remote_indexCount;
}
444:
public int getRemotePeerCount() {
    // the number of remote peers that have contributed
    return this.remote_peerCount;
}
449:
public int getRemoteResourceSize() {
    // the number of all hits in all the remote peers
    return this.remote_resourceSize;
}
454:
public int getLocalResourceSize() {
    // the number of hits in the local peer (index size, size of the collection in the own index)
    return this.local_resourceSize;
}
459:
460:            public indexRWIEntry remove(String urlHash) {
461:                Object r = (Long) urlhashes.get(urlHash);
462:                if (r == null)
463:                    return null;
464:                assert sortedRWIEntries.containsKey(r);
465:                indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries
466:                        .remove(r);
467:                urlhashes.remove(urlHash);
468:                return iEntry;
469:            }
470:
public Iterator<String> miss() {
    // iterator over url-hashes that could not be resolved in the LURL-DB
    return this.misses.iterator();
}
474:
475:            public Set<String> getReferences(int count) {
476:                // create a list of words that had been computed by statistics over all
477:                // words that appeared in the url or the description of all urls
478:                Object[] refs = ref.getScores(count, false, 2,
479:                        Integer.MAX_VALUE);
480:                TreeSet<String> s = new TreeSet<String>(
481:                        String.CASE_INSENSITIVE_ORDER);
482:                for (int i = 0; i < refs.length; i++) {
483:                    s.add((String) refs[i]);
484:                }
485:                return s;
486:            }
487:
488:            public void addReferences(String[] words) {
489:                String word;
490:                for (int i = 0; i < words.length; i++) {
491:                    word = words[i].toLowerCase();
492:                    if ((word.length() > 2)
493:                            && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_"
494:                                    .indexOf(word) < 0)
495:                            && (!(query.queryHashes.contains(plasmaCondenser
496:                                    .word2hash(word)))))
497:                        ref.incScore(word);
498:                }
499:            }
500:
501:            protected void addReferences(
502:                    plasmaSearchEvent.ResultEntry resultEntry) {
503:                // take out relevant information for reference computation
504:                if ((resultEntry.url() == null)
505:                        || (resultEntry.title() == null))
506:                    return;
507:                String[] urlcomps = htmlFilterContentScraper
508:                        .urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
509:                String[] descrcomps = resultEntry.title().toLowerCase().split(
510:                        htmlFilterContentScraper.splitrex); // words in the description
511:
512:                // add references
513:                addReferences(urlcomps);
514:                addReferences(descrcomps);
515:            }
516:
public indexRWIEntryOrder getOrder() {
    // the ranking order in use; null until insertRanked() was called
    return this.order;
}
520:
521:            public static void loadYBR(File rankingPath, int count) {
522:                // load ranking tables
523:                if (rankingPath.exists()) {
524:                    ybrTables = new kelondroBinSearch[count];
525:                    String ybrName;
526:                    File f;
527:                    try {
528:                        for (int i = 0; i < count; i++) {
529:                            ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2)
530:                                    + ".idx";
531:                            f = new File(rankingPath, ybrName);
532:                            if (f.exists()) {
533:                                ybrTables[i] = new kelondroBinSearch(
534:                                        serverFileUtils.read(f), 6);
535:                            } else {
536:                                ybrTables[i] = null;
537:                            }
538:                        }
539:                    } catch (IOException e) {
540:                        ybrTables = null;
541:                    }
542:                } else {
543:                    ybrTables = null;
544:                }
545:            }
546:
public static boolean canUseYBR() {
    // true when block-rank tables were loaded via loadYBR()
    return ybrTables != null;
}
550:
public static boolean isUsingYBR() {
    // whether the YBR ranking contribution is currently switched on
    return useYBR;
}
554:
public static void switchYBR(boolean usage) {
    // enable or disable the YBR ranking contribution at runtime
    useYBR = usage;
}
558:
559:            public static int ybr(String urlHash) {
560:                // returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
561:                if (ybrTables == null)
562:                    return 15;
563:                if (!(useYBR))
564:                    return 15;
565:                final String domHash = urlHash.substring(6);
566:                for (int i = 0; i < ybrTables.length; i++) {
567:                    if ((ybrTables[i] != null)
568:                            && (ybrTables[i].contains(domHash.getBytes()))) {
569:                        //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
570:                        return i;
571:                    }
572:                }
573:                //System.out.println("NOT FOUND: " + urlHash);
574:                return 15;
575:            }
576:
577:            public long postRanking(Set<String> topwords,
578:                    plasmaSearchEvent.ResultEntry rentry, int position) {
579:
580:                long r = (255 - position) << 8;
581:
582:                // for media search: prefer pages with many links
583:                if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE)
584:                    r += rentry.limage() << query.ranking.coeff_cathasimage;
585:                if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO)
586:                    r += rentry.laudio() << query.ranking.coeff_cathasaudio;
587:                if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO)
588:                    r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
589:                if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP)
590:                    r += rentry.lapp() << query.ranking.coeff_cathasapp;
591:
592:                // prefer hit with 'prefer' pattern
593:                if (rentry.url().toNormalform(true, true).matches(query.prefer))
594:                    r += 256 << query.ranking.coeff_prefer;
595:                if (rentry.title().matches(query.prefer))
596:                    r += 256 << query.ranking.coeff_prefer;
597:
598:                // apply 'common-sense' heuristic using references
599:                String urlstring = rentry.url().toNormalform(true, true);
600:                String[] urlcomps = htmlFilterContentScraper
601:                        .urlComps(urlstring);
602:                String[] descrcomps = rentry.title().toLowerCase().split(
603:                        htmlFilterContentScraper.splitrex);
604:                for (int j = 0; j < urlcomps.length; j++) {
605:                    if (topwords.contains(urlcomps[j]))
606:                        r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
607:                }
608:                for (int j = 0; j < descrcomps.length; j++) {
609:                    if (topwords.contains(descrcomps[j]))
610:                        r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
611:                }
612:
613:                // apply query-in-result matching
614:                Set<String> urlcomph = plasmaCondenser.words2hashSet(urlcomps);
615:                Set<String> descrcomph = plasmaCondenser
616:                        .words2hashSet(descrcomps);
617:                Iterator<String> shi = query.queryHashes.iterator();
618:                String queryhash;
619:                while (shi.hasNext()) {
620:                    queryhash = shi.next();
621:                    if (urlcomph.contains(queryhash))
622:                        r += 256 << query.ranking.coeff_appurl;
623:                    if (descrcomph.contains(queryhash))
624:                        r += 256 << query.ranking.coeff_app_dc_title;
625:                }
626:
627:                return r;
628:            }
629:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.