001: // plasmaSearchAPI.java
002: // -----------------------
003: // (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
004: // first published 2008 on http://yacy.net
005: //
006: // This is a part of YaCy, a peer-to-peer based web search engine
007: //
008: // $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
009: // $LastChangedRevision: 4216 $
010: // $LastChangedBy: orbiter $
011: //
012: // LICENSE
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027:
028: package de.anomic.plasma;
029:
030: import java.util.Date;
031: import java.util.Iterator;
032:
033: import de.anomic.data.listManager;
034: import de.anomic.index.indexRWIEntry;
035: import de.anomic.index.indexURLEntry;
036: import de.anomic.kelondro.kelondroBitfield;
037: import de.anomic.plasma.urlPattern.plasmaURLPattern;
038: import de.anomic.server.serverDate;
039: import de.anomic.server.serverObjects;
040: import de.anomic.yacy.yacyCore;
041: import de.anomic.yacy.yacySeed;
042: import de.anomic.yacy.yacyURL;
043:
044: public class plasmaSearchAPI {
045: // collection of static methods for a search servlet. Exists only to prevent that the same processes are defined more than once.
046:
047: public static kelondroBitfield compileFlags(serverObjects post) {
048: kelondroBitfield b = new kelondroBitfield(4);
049: if (post.get("allurl", "").equals("on"))
050: return null;
051: if (post.get("flags") != null) {
052: if (post.get("flags", "").length() == 0)
053: return null;
054: return new kelondroBitfield(4, (String) post.get("flags"));
055: }
056: if (post.get("description", "").equals("on"))
057: b.set(indexRWIEntry.flag_app_dc_description, true);
058: if (post.get("title", "").equals("on"))
059: b.set(indexRWIEntry.flag_app_dc_title, true);
060: if (post.get("creator", "").equals("on"))
061: b.set(indexRWIEntry.flag_app_dc_creator, true);
062: if (post.get("subject", "").equals("on"))
063: b.set(indexRWIEntry.flag_app_dc_subject, true);
064: if (post.get("url", "").equals("on"))
065: b.set(indexRWIEntry.flag_app_dc_identifier, true);
066: if (post.get("emphasized", "").equals("on"))
067: b.set(indexRWIEntry.flag_app_emphasized, true);
068: if (post.get("image", "").equals("on"))
069: b.set(plasmaCondenser.flag_cat_hasimage, true);
070: if (post.get("audio", "").equals("on"))
071: b.set(plasmaCondenser.flag_cat_hasaudio, true);
072: if (post.get("video", "").equals("on"))
073: b.set(plasmaCondenser.flag_cat_hasvideo, true);
074: if (post.get("app", "").equals("on"))
075: b.set(plasmaCondenser.flag_cat_hasapp, true);
076: if (post.get("indexof", "").equals("on"))
077: b.set(plasmaCondenser.flag_cat_indexof, true);
078: return b;
079: }
080:
081: public static void listHosts(serverObjects prop, String startHash) {
082: // list known hosts
083: yacySeed seed;
084: int hc = 0;
085: prop.put("searchresult_keyhash", startHash);
086: if (yacyCore.seedDB != null
087: && yacyCore.seedDB.sizeConnected() > 0) {
088: Iterator<yacySeed> e = yacyCore.dhtAgent
089: .getAcceptRemoteIndexSeeds(startHash);
090: while (e.hasNext()) {
091: seed = (yacySeed) e.next();
092: if (seed != null) {
093: prop.put("searchresult_hosts_" + hc + "_hosthash",
094: seed.hash);
095: prop.putHTML("searchresult_hosts_" + hc
096: + "_hostname", seed.hash + " "
097: + seed.get(yacySeed.NAME, "nameless"));
098: hc++;
099: }
100: }
101: prop.put("searchresult_hosts", hc);
102: } else {
103: prop.put("searchresult_hosts", "0");
104: }
105: }
106:
107: public static plasmaSearchRankingProcess genSearchresult(
108: serverObjects prop, plasmaSwitchboard sb, String keyhash,
109: kelondroBitfield filter, int sortorder) {
110: plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb
111: .getRanking(), filter);
112: plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(
113: sb.wordIndex, query, sortorder, Integer.MAX_VALUE);
114: ranked.execQuery();
115:
116: if (ranked.filteredCount() == 0) {
117: prop.put("searchresult", 2);
118: prop.put("searchresult_wordhash", keyhash);
119: } else {
120: prop.put("searchresult", 3);
121: prop.put("searchresult_allurl", ranked.filteredCount());
122: prop
123: .put(
124: "searchresult_description",
125: ranked.flagCount()[indexRWIEntry.flag_app_dc_description]);
126: prop
127: .put(
128: "searchresult_title",
129: ranked.flagCount()[indexRWIEntry.flag_app_dc_title]);
130: prop
131: .put(
132: "searchresult_creator",
133: ranked.flagCount()[indexRWIEntry.flag_app_dc_creator]);
134: prop
135: .put(
136: "searchresult_subject",
137: ranked.flagCount()[indexRWIEntry.flag_app_dc_subject]);
138: prop
139: .put(
140: "searchresult_url",
141: ranked.flagCount()[indexRWIEntry.flag_app_dc_identifier]);
142: prop
143: .put(
144: "searchresult_emphasized",
145: ranked.flagCount()[indexRWIEntry.flag_app_emphasized]);
146: prop
147: .put(
148: "searchresult_image",
149: ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
150: prop
151: .put(
152: "searchresult_audio",
153: ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
154: prop
155: .put(
156: "searchresult_video",
157: ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
158: prop
159: .put(
160: "searchresult_app",
161: ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
162: prop
163: .put(
164: "searchresult_indexof",
165: ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);
166: }
167: return ranked;
168: }
169:
170: public static void genURLList(serverObjects prop, String keyhash,
171: String keystring, plasmaSearchRankingProcess ranked,
172: kelondroBitfield flags, int maxlines, int ordering) {
173: // search for a word hash and generate a list of url links
174: prop.put("genUrlList_keyHash", keyhash);
175:
176: if (ranked.filteredCount() == 0) {
177: prop.put("genUrlList", 1);
178: prop.put("genUrlList_count", 0);
179: prop.put("searchresult", 2);
180: } else {
181: prop.put("genUrlList", 2);
182: prop.put("searchresult", 3);
183: prop.put("genUrlList_flags", (flags == null) ? "" : flags
184: .exportB64());
185: prop.put("genUrlList_lines", maxlines);
186: prop.put("genUrlList_ordering", ordering);
187: int i = 0;
188: yacyURL url;
189: indexURLEntry entry;
190: String us;
191: long rn = -1;
192: while ((ranked.size() > 0)
193: && ((entry = ranked.bestURL(false)) != null)) {
194: if ((entry == null) || (entry.comp() == null))
195: continue;
196: url = entry.comp().url();
197: if (url == null)
198: continue;
199: us = url.toNormalform(false, false);
200: if (rn == -1)
201: rn = entry.ranking();
202: prop.put("genUrlList_urlList_" + i + "_urlExists", "1");
203: prop.put("genUrlList_urlList_" + i
204: + "_urlExists_urlhxCount", i);
205: prop.putHTML("genUrlList_urlList_" + i
206: + "_urlExists_urlhxValue", entry.word()
207: .urlHash());
208: prop.putHTML("genUrlList_urlList_" + i
209: + "_urlExists_keyString", keystring);
210: prop.put("genUrlList_urlList_" + i
211: + "_urlExists_keyHash", keyhash);
212: prop.putHTML("genUrlList_urlList_" + i
213: + "_urlExists_urlString", us);
214: prop
215: .put(
216: "genUrlList_urlList_" + i
217: + "_urlExists_urlStringShort",
218: (us.length() > 40) ? (us.substring(0,
219: 20)
220: + "<br>" + us.substring(20, 40) + "...")
221: : ((us.length() > 30) ? (us
222: .substring(0, 20)
223: + "<br>" + us
224: .substring(20)) : us));
225: prop.putNum("genUrlList_urlList_" + i
226: + "_urlExists_ranking", (entry.ranking() - rn));
227: prop.putNum("genUrlList_urlList_" + i
228: + "_urlExists_domlength", yacyURL
229: .domLengthEstimation(entry.hash()));
230: prop.putNum("genUrlList_urlList_" + i
231: + "_urlExists_ybr", plasmaSearchRankingProcess
232: .ybr(entry.hash()));
233: prop.putNum(
234: "genUrlList_urlList_" + i + "_urlExists_tf",
235: 1000.0 * entry.word().termFrequency());
236: prop.putNum("genUrlList_urlList_" + i
237: + "_urlExists_authority",
238: (ranked.getOrder() == null) ? -1 : ranked
239: .getOrder().authority(entry.hash()));
240: prop.put("genUrlList_urlList_" + i + "_urlExists_date",
241: serverDate.formatShortDay(new Date(entry.word()
242: .lastModified())));
243: prop.putNum("genUrlList_urlList_" + i
244: + "_urlExists_wordsintitle", entry.word()
245: .wordsintitle());
246: prop.putNum("genUrlList_urlList_" + i
247: + "_urlExists_wordsintext", entry.word()
248: .wordsintext());
249: prop.putNum("genUrlList_urlList_" + i
250: + "_urlExists_phrasesintext", entry.word()
251: .phrasesintext());
252: prop.putNum("genUrlList_urlList_" + i
253: + "_urlExists_llocal", entry.word().llocal());
254: prop.putNum("genUrlList_urlList_" + i
255: + "_urlExists_lother", entry.word().lother());
256: prop.putNum("genUrlList_urlList_" + i
257: + "_urlExists_hitcount", entry.word()
258: .hitcount());
259: prop.putNum("genUrlList_urlList_" + i
260: + "_urlExists_worddistance", entry.word()
261: .worddistance());
262: prop.putNum("genUrlList_urlList_" + i
263: + "_urlExists_pos", entry.word().posintext());
264: prop.putNum("genUrlList_urlList_" + i
265: + "_urlExists_phrase", entry.word()
266: .posofphrase());
267: prop.putNum("genUrlList_urlList_" + i
268: + "_urlExists_posinphrase", entry.word()
269: .posinphrase());
270: prop.putNum("genUrlList_urlList_" + i
271: + "_urlExists_urlcomps", entry.word()
272: .urlcomps());
273: prop.putNum("genUrlList_urlList_" + i
274: + "_urlExists_urllength", entry.word()
275: .urllength());
276: prop
277: .put(
278: "genUrlList_urlList_" + i
279: + "_urlExists_props",
280: ((entry.word().flags()
281: .get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, "
282: : "")
283: + ((entry.word().flags()
284: .get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, "
285: : "")
286: + ((entry.word().flags()
287: .get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, "
288: : "")
289: + ((entry.word().flags()
290: .get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, "
291: : "")
292: + ((entry.word().flags()
293: .get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, "
294: : "")
295: + ((entry.word().flags()
296: .get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, "
297: : "")
298: + ((entry.word().flags()
299: .get(indexRWIEntry.flag_app_dc_title)) ? "appears in title, "
300: : "")
301: + ((entry.word().flags()
302: .get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, "
303: : "")
304: + ((entry.word().flags()
305: .get(indexRWIEntry.flag_app_dc_subject)) ? "appears in subject, "
306: : "")
307: + ((entry.word().flags()
308: .get(indexRWIEntry.flag_app_dc_description)) ? "appears in description, "
309: : "")
310: + ((entry.word().flags()
311: .get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, "
312: : "")
313: + ((yacyURL
314: .probablyRootURL(entry
315: .word()
316: .urlHash())) ? "probably root url"
317: : ""));
318: if (plasmaSwitchboard.urlBlacklist.isListed(
319: plasmaURLPattern.BLACKLIST_DHT, url)) {
320: prop.put("genUrlList_urlList_" + i
321: + "_urlExists_urlhxChecked", "1");
322: }
323: i++;
324: if ((maxlines >= 0) && (i >= maxlines))
325: break;
326: }
327: Iterator<String> iter = ranked.miss(); // iterates url hash strings
328: while (iter.hasNext()) {
329: us = (String) iter.next();
330: prop.put("genUrlList_urlList_" + i + "_urlExists", "0");
331: prop.put("genUrlList_urlList_" + i
332: + "_urlExists_urlhxCount", i);
333: prop.putHTML("genUrlList_urlList_" + i
334: + "_urlExists_urlhxValue", us);
335: i++;
336: }
337: prop.put("genUrlList_urlList", i);
338: prop.putHTML("genUrlList_keyString", keystring);
339: prop.put("genUrlList_count", i);
340: putBlacklists(prop, listManager
341: .getDirListing(listManager.listsPath));
342: }
343: }
344:
345: public static void putBlacklists(serverObjects prop, String[] lists) {
346: prop.put("genUrlList_blacklists", lists.length);
347: for (int i = 0; i < lists.length; i++)
348: prop.put("genUrlList_blacklists_" + i + "_name", lists[i]);
349: }
350: }
|