001: // plasmaSearchRankingProfile.java
002: // -------------------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2006
007: // Created: 05.02.2006
008: //
009: // This program is free software; you can redistribute it and/or modify
010: // it under the terms of the GNU General Public License as published by
011: // the Free Software Foundation; either version 2 of the License, or
012: // (at your option) any later version.
013: //
014: // This program is distributed in the hope that it will be useful,
015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: // GNU General Public License for more details.
018: //
019: // You should have received a copy of the GNU General Public License
020: // along with this program; if not, write to the Free Software
021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: //
023: // Using this software in any meaning (reading, learning, copying, compiling,
024: // running) means that you agree that the Author(s) is (are) not responsible
025: // for cost, loss of data or any harm that may be caused directly or indirectly
026: // by usage of this softare or this documentation. The usage of this software
027: // is on your own risk. The installation and usage (starting/running) of this
028: // software may allow other people or application to access your computer and
029: // any attached devices and is highly dependent on the configuration of the
030: // software which must be done by the user of the software; the author(s) is
031: // (are) also not responsible for proper configuration and usage of the
032: // software, even if provoked by documentation provided together with
033: // the software.
034: //
035: // Any changes to this file according to the GPL as documented in the file
036: // gpl.txt aside this file in the shipment you received can be done to the
037: // lines that follows this copyright notice here, but changes must not be
038: // done inside the copyright notive above. A re-distribution must contain
039: // the intact and unchanged copyright notice.
040: // Contributions and changes to the program code must be marked as such.
041:
042: package de.anomic.plasma;
043:
044: import java.util.HashMap;
045: import java.util.Iterator;
046: import java.util.Map;
047:
048: public class plasmaSearchRankingProfile {
049:
050: // pre-sort attributes
051: public static final String DOMLENGTH = "domlength";
052: public static final String YBR = "ybr";
053: public static final String DATE = "date";
054: public static final String WORDSINTITLE = "wordsintitle";
055: public static final String WORDSINTEXT = "wordsintext";
056: public static final String PHRASESINTEXT = "phrasesintext";
057: public static final String LLOCAL = "llocal";
058: public static final String LOTHER = "lother";
059: public static final String URLLENGTH = "urllength";
060: public static final String URLCOMPS = "urlcomps";
061: public static final String HITCOUNT = "hitcount";
062: public static final String POSINTEXT = "posintext";
063: public static final String POSOFPHRASE = "posofphrase";
064: public static final String POSINPHRASE = "posinphrase";
065: public static final String AUTHORITY = "authority";
066: public static final String WORDDISTANCE = "worddistance";
067: public static final String APPURL = "appurl";
068: public static final String APP_DC_TITLE = "appdescr"; // title of page
069: public static final String APP_DC_CREATOR = "appauthor"; // the author field
070: public static final String APP_DC_SUBJECT = "apptags"; // tags
071: public static final String APP_DC_DESCRIPTION = "appref"; // references to the source (content of <a> tag)
072: public static final String APPEMPH = "appemph";
073: public static final String CATINDEXOF = "catindexof";
074: public static final String CATHASIMAGE = "cathasimage";
075: public static final String CATHASAUDIO = "cathasaudio";
076: public static final String CATHASVIDEO = "cathasvideo";
077: public static final String CATHASAPP = "cathasapp";
078: public static final String TERMFREQUENCY = "tf";
079:
080: // post-sort predicates
081: public static final String URLCOMPINTOPLIST = "urlcompintoplist";
082: public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
083: public static final String PREFER = "prefer";
084:
085: // coefficient max/min values
086: public static final int COEFF_MIN = 0;
087: public static final int COEFF_MAX = 15;
088:
089: public int coeff_domlength, coeff_ybr, coeff_date,
090: coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext,
091: coeff_llocal, coeff_lother, coeff_urllength,
092: coeff_urlcomps, coeff_hitcount, coeff_posintext,
093: coeff_posofphrase, coeff_posinphrase, coeff_authority,
094: coeff_worddistance, coeff_appurl, coeff_app_dc_title,
095: coeff_app_dc_creator, coeff_app_dc_subject,
096: coeff_app_dc_description, coeff_appemph, coeff_catindexof,
097: coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo,
098: coeff_cathasapp, coeff_urlcompintoplist,
099: coeff_descrcompintoplist, coeff_prefer,
100: coeff_termfrequency;
101:
102: public plasmaSearchRankingProfile(int mediatype) {
103: // set default-values
104: coeff_domlength = 11;
105: coeff_ybr = 9;
106: coeff_date = 4;
107: coeff_wordsintitle = 4;
108: coeff_wordsintext = 2;
109: coeff_phrasesintext = 3;
110: coeff_llocal = 2;
111: coeff_lother = 3;
112: coeff_urllength = 12;
113: coeff_urlcomps = 12;
114: coeff_hitcount = 9;
115: coeff_posintext = 10;
116: coeff_posofphrase = 8;
117: coeff_posinphrase = 1;
118: coeff_authority = 11;
119: coeff_worddistance = 12;
120: coeff_appurl = 13;
121: coeff_app_dc_title = 13;
122: coeff_app_dc_creator = 12;
123: coeff_app_dc_subject = 9;
124: coeff_app_dc_description = 8;
125: coeff_appemph = 10;
126: coeff_catindexof = (mediatype == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0
127: : 15;
128: coeff_cathasimage = (mediatype == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 15
129: : 0;
130: coeff_cathasaudio = (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 15
131: : 0;
132: coeff_cathasvideo = (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 15
133: : 0;
134: coeff_cathasapp = (mediatype == plasmaSearchQuery.CONTENTDOM_APP) ? 15
135: : 0;
136: coeff_termfrequency = 14;
137: coeff_urlcompintoplist = 3;
138: coeff_descrcompintoplist = 2;
139: coeff_prefer = 14;
140: }
141:
142: public plasmaSearchRankingProfile(String prefix, String profile) {
143: this (plasmaSearchQuery.CONTENTDOM_TEXT); // set defaults
144: if ((profile != null) && (profile.length() > 0)) {
145: //parse external form
146: HashMap<String, Integer> coeff = new HashMap<String, Integer>();
147: String[] elts = ((profile.startsWith("{") && (profile
148: .endsWith("}"))) ? profile.substring(1, profile
149: .length() - 1) : profile).split(",");
150: int p;
151: int s = (prefix == null) ? 0 : prefix.length();
152: String e;
153:
154: for (int i = 0; i < elts.length; i++) {
155: e = elts[i].trim();
156: if ((s == 0) || (e.startsWith(prefix))) {
157: p = e.indexOf("=");
158: if (p < 0)
159: System.out
160: .println("DEBUG: bug in plasmaSearchRankingProfile: e = "
161: + e);
162: if ((p > 0) && (e.length() > p + 1))
163: coeff.put(e.substring(s, p), new Integer(
164: Integer.parseInt(e.substring(p + 1))));
165: }
166: }
167: coeff_domlength = parseMap(coeff, DOMLENGTH,
168: coeff_domlength);
169: coeff_ybr = parseMap(coeff, YBR, coeff_ybr);
170: coeff_date = parseMap(coeff, DATE, coeff_date);
171: coeff_wordsintitle = parseMap(coeff, WORDSINTITLE,
172: coeff_wordsintitle);
173: coeff_wordsintext = parseMap(coeff, WORDSINTEXT,
174: coeff_wordsintext);
175: coeff_phrasesintext = parseMap(coeff, PHRASESINTEXT,
176: coeff_phrasesintext);
177: coeff_llocal = parseMap(coeff, LLOCAL, coeff_llocal);
178: coeff_lother = parseMap(coeff, LOTHER, coeff_lother);
179: coeff_urllength = parseMap(coeff, URLLENGTH,
180: coeff_urllength);
181: coeff_urlcomps = parseMap(coeff, URLCOMPS, coeff_urlcomps);
182: coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount);
183: coeff_posintext = parseMap(coeff, POSINTEXT,
184: coeff_posintext);
185: coeff_posofphrase = parseMap(coeff, POSOFPHRASE,
186: coeff_posofphrase);
187: coeff_posinphrase = parseMap(coeff, POSINPHRASE,
188: coeff_posinphrase);
189: coeff_authority = parseMap(coeff, AUTHORITY,
190: coeff_authority);
191: coeff_worddistance = parseMap(coeff, WORDDISTANCE,
192: coeff_worddistance);
193: coeff_appurl = parseMap(coeff, APPURL, coeff_appurl);
194: coeff_app_dc_title = parseMap(coeff, APP_DC_TITLE,
195: coeff_app_dc_title);
196: coeff_app_dc_creator = parseMap(coeff, APP_DC_CREATOR,
197: coeff_app_dc_creator);
198: coeff_app_dc_subject = parseMap(coeff, APP_DC_SUBJECT,
199: coeff_app_dc_subject);
200: coeff_app_dc_description = parseMap(coeff,
201: APP_DC_DESCRIPTION, coeff_app_dc_description);
202: coeff_appemph = parseMap(coeff, APPEMPH, coeff_appemph);
203: coeff_catindexof = parseMap(coeff, CATINDEXOF,
204: coeff_catindexof);
205: coeff_cathasimage = parseMap(coeff, CATHASIMAGE,
206: coeff_cathasimage);
207: coeff_cathasaudio = parseMap(coeff, CATHASAUDIO,
208: coeff_cathasaudio);
209: coeff_cathasvideo = parseMap(coeff, CATHASVIDEO,
210: coeff_cathasvideo);
211: coeff_cathasapp = parseMap(coeff, CATHASAPP,
212: coeff_cathasapp);
213: coeff_termfrequency = parseMap(coeff, TERMFREQUENCY,
214: coeff_termfrequency);
215: coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST,
216: coeff_urlcompintoplist);
217: coeff_descrcompintoplist = parseMap(coeff,
218: DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
219: coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
220: }
221: }
222:
223: private static int parseMap(HashMap<String, Integer> coeff,
224: String attr, int dflt) {
225: if (coeff.containsKey(attr))
226: try {
227: return ((Integer) coeff.get(attr)).intValue();
228: } catch (NumberFormatException e) {
229: return dflt;
230: }
231: else {
232: return dflt;
233: }
234: }
235:
236: public String toExternalString() {
237: return toExternalMap("").toString();
238: }
239:
240: public Map<String, String> toExternalMap(String prefix) {
241: Map<String, String> ext = preToExternalMap(prefix);
242: ext.putAll(postToExternalMap(prefix));
243: return ext;
244: }
245:
246: public Map<String, String> preToExternalMap(String prefix) {
247: Map<String, String> ext = new HashMap<String, String>();
248: ext.put(prefix + DOMLENGTH, Integer.toString(coeff_domlength));
249: ext.put(prefix + YBR, Integer.toString(coeff_ybr));
250: ext.put(prefix + DATE, Integer.toString(coeff_date));
251: ext.put(prefix + WORDSINTITLE, Integer
252: .toString(coeff_wordsintitle));
253: ext.put(prefix + WORDSINTEXT, Integer
254: .toString(coeff_wordsintext));
255: ext.put(prefix + PHRASESINTEXT, Integer
256: .toString(coeff_phrasesintext));
257: ext.put(prefix + LLOCAL, Integer.toString(coeff_llocal));
258: ext.put(prefix + LOTHER, Integer.toString(coeff_lother));
259: ext.put(prefix + URLLENGTH, Integer.toString(coeff_urllength));
260: ext.put(prefix + URLCOMPS, Integer.toString(coeff_urlcomps));
261: ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount));
262: ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext));
263: ext.put(prefix + POSOFPHRASE, Integer
264: .toString(coeff_posofphrase));
265: ext.put(prefix + POSINPHRASE, Integer
266: .toString(coeff_posinphrase));
267: ext.put(prefix + AUTHORITY, Integer.toString(coeff_authority));
268: ext.put(prefix + WORDDISTANCE, Integer
269: .toString(coeff_worddistance));
270: ext.put(prefix + APPURL, Integer.toString(coeff_appurl));
271: ext.put(prefix + APP_DC_TITLE, Integer
272: .toString(coeff_app_dc_title));
273: ext.put(prefix + APP_DC_CREATOR, Integer
274: .toString(coeff_app_dc_creator));
275: ext.put(prefix + APP_DC_SUBJECT, Integer
276: .toString(coeff_app_dc_subject));
277: ext.put(prefix + APP_DC_DESCRIPTION, Integer
278: .toString(coeff_app_dc_description));
279: ext.put(prefix + APPEMPH, Integer.toString(coeff_appemph));
280: ext
281: .put(prefix + CATINDEXOF, Integer
282: .toString(coeff_catindexof));
283: ext.put(prefix + CATHASIMAGE, Integer
284: .toString(coeff_cathasimage));
285: ext.put(prefix + CATHASAUDIO, Integer
286: .toString(coeff_cathasaudio));
287: ext.put(prefix + CATHASVIDEO, Integer
288: .toString(coeff_cathasvideo));
289: ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
290: ext.put(prefix + TERMFREQUENCY, Integer
291: .toString(coeff_termfrequency));
292: return ext;
293: }
294:
295: public Map<String, String> postToExternalMap(String prefix) {
296: Map<String, String> ext = new HashMap<String, String>();
297: ext.put(prefix + URLCOMPINTOPLIST, Integer
298: .toString(coeff_urlcompintoplist));
299: ext.put(prefix + DESCRCOMPINTOPLIST, Integer
300: .toString(coeff_descrcompintoplist));
301: ext.put(prefix + PREFER, Integer.toString(coeff_prefer));
302: return ext;
303: }
304:
305: public String toExternalURLGet(String prefix) {
306: Iterator<Map.Entry<String, String>> i = toExternalMap("")
307: .entrySet().iterator();
308: Map.Entry<String, String> entry;
309: StringBuffer ext = new StringBuffer();
310: while (i.hasNext()) {
311: entry = i.next();
312: ext.append("&");
313: ext.append(prefix);
314: ext.append(entry.getKey());
315: ext.append("=");
316: ext.append(entry.getValue());
317: }
318: return new String(ext);
319: }
320:
321: }
|