001: // plasmaSearchQuery.java
002: // -----------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2005
007: // Created: 10.10.2005
008: //
009: // This program is free software; you can redistribute it and/or modify
010: // it under the terms of the GNU General Public License as published by
011: // the Free Software Foundation; either version 2 of the License, or
012: // (at your option) any later version.
013: //
014: // This program is distributed in the hope that it will be useful,
015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: // GNU General Public License for more details.
018: //
019: // You should have received a copy of the GNU General Public License
020: // along with this program; if not, write to the Free Software
021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: //
023: // Using this software in any meaning (reading, learning, copying, compiling,
024: // running) means that you agree that the Author(s) is (are) not responsible
025: // for cost, loss of data or any harm that may be caused directly or indirectly
026: // by usage of this softare or this documentation. The usage of this software
027: // is on your own risk. The installation and usage (starting/running) of this
028: // software may allow other people or application to access your computer and
029: // any attached devices and is highly dependent on the configuration of the
030: // software which must be done by the user of the software; the author(s) is
031: // (are) also not responsible for proper configuration and usage of the
032: // software, even if provoked by documentation provided together with
033: // the software.
034: //
035: // Any changes to this file according to the GPL as documented in the file
036: // gpl.txt aside this file in the shipment you received can be done to the
037: // lines that follows this copyright notice here, but changes must not be
038: // done inside the copyright notive above. A re-distribution must contain
039: // the intact and unchanged copyright notice.
040: // Contributions and changes to the program code must be marked as such.
041:
042: package de.anomic.plasma;
043:
044: import java.util.HashMap;
045: import java.util.Iterator;
046: import java.util.Set;
047: import java.util.TreeSet;
048:
049: import de.anomic.htmlFilter.htmlFilterAbstractScraper;
050: import de.anomic.kelondro.kelondroBase64Order;
051: import de.anomic.kelondro.kelondroBitfield;
052: import de.anomic.kelondro.kelondroMSetTools;
053: import de.anomic.kelondro.kelondroNaturalOrder;
054: import de.anomic.server.serverCharBuffer;
055: import de.anomic.yacy.yacySeedDB;
056:
057: public final class plasmaSearchQuery {
058:
059: public static final int SEARCHDOM_LOCAL = 0;
060: public static final int SEARCHDOM_CLUSTERDHT = 1;
061: public static final int SEARCHDOM_CLUSTERALL = 2;
062: public static final int SEARCHDOM_GLOBALDHT = 3;
063: public static final int SEARCHDOM_GLOBALALL = 4;
064:
065: public static final int CONTENTDOM_ALL = -1;
066: public static final int CONTENTDOM_TEXT = 0;
067: public static final int CONTENTDOM_IMAGE = 1;
068: public static final int CONTENTDOM_AUDIO = 2;
069: public static final int CONTENTDOM_VIDEO = 3;
070: public static final int CONTENTDOM_APP = 4;
071:
072: public static final kelondroBitfield empty_constraint = new kelondroBitfield(
073: 4, "AAAAAA");
074: public static final kelondroBitfield catchall_constraint = new kelondroBitfield(
075: 4, "______");
076:
077: public String queryString;
078: public TreeSet<String> queryHashes, excludeHashes;
079: private int linesPerPage, offset;
080: public String prefer;
081: public int contentdom;
082: public String urlMask;
083: public int domType;
084: public String domGroupName;
085: public int domMaxTargets;
086: public int maxDistance;
087: public kelondroBitfield constraint;
088: public boolean allofconstraint;
089: public boolean onlineSnippetFetch;
090: public plasmaSearchRankingProfile ranking;
091:
092: public plasmaSearchQuery(String queryString, int lines,
093: plasmaSearchRankingProfile ranking,
094: kelondroBitfield constraint) {
095: if ((queryString.length() == 12)
096: && (kelondroBase64Order.enhancedCoder
097: .wellformed(queryString.getBytes()))) {
098: this .queryString = null;
099: this .queryHashes = new TreeSet<String>();
100: this .excludeHashes = new TreeSet<String>();
101: this .queryHashes.add(queryString);
102: } else {
103: this .queryString = queryString;
104: TreeSet<String>[] cq = cleanQuery(queryString);
105: this .queryHashes = plasmaCondenser.words2hashes(cq[0]);
106: this .excludeHashes = plasmaCondenser.words2hashes(cq[1]);
107: }
108: this .ranking = ranking;
109: this .maxDistance = Integer.MAX_VALUE;
110: this .prefer = "";
111: this .contentdom = CONTENTDOM_ALL;
112: this .linesPerPage = lines;
113: this .offset = 0;
114: this .urlMask = ".*";
115: this .domType = SEARCHDOM_LOCAL;
116: this .domGroupName = "";
117: this .domMaxTargets = 0;
118: this .constraint = constraint;
119: this .allofconstraint = false;
120: this .onlineSnippetFetch = false;
121: }
122:
123: public plasmaSearchQuery(String queryString,
124: TreeSet<String> queryHashes, TreeSet<String> excludeHashes,
125: plasmaSearchRankingProfile ranking, int maxDistance,
126: String prefer, int contentdom, boolean onlineSnippetFetch,
127: int lines, int offset, String urlMask, int domType,
128: String domGroupName, int domMaxTargets,
129: kelondroBitfield constraint, boolean allofconstraint) {
130: this .queryString = queryString;
131: this .queryHashes = queryHashes;
132: this .excludeHashes = excludeHashes;
133: this .ranking = ranking;
134: this .maxDistance = maxDistance;
135: this .prefer = prefer;
136: this .contentdom = contentdom;
137: this .linesPerPage = lines;
138: this .offset = offset;
139: //this.maximumTime = Math.min(6000, maximumTime);
140: this .urlMask = urlMask;
141: this .domType = domType;
142: this .domGroupName = domGroupName;
143: this .domMaxTargets = domMaxTargets;
144: this .constraint = constraint;
145: this .allofconstraint = allofconstraint;
146: this .onlineSnippetFetch = onlineSnippetFetch;
147: }
148:
149: public int neededResults() {
150: // the number of result lines that must be computed
151: return this .offset + this .linesPerPage;
152: }
153:
154: public int displayResults() {
155: // the number of result lines that are displayed at once (size of result page)
156: return this .linesPerPage;
157: }
158:
159: public void setOffset(int newOffset) {
160: this .offset = newOffset;
161: }
162:
163: public static int contentdomParser(String dom) {
164: if (dom.equals("text"))
165: return CONTENTDOM_TEXT;
166: else if (dom.equals("image"))
167: return CONTENTDOM_IMAGE;
168: else if (dom.equals("audio"))
169: return CONTENTDOM_AUDIO;
170: else if (dom.equals("video"))
171: return CONTENTDOM_VIDEO;
172: else if (dom.equals("app"))
173: return CONTENTDOM_APP;
174: return CONTENTDOM_TEXT;
175: }
176:
177: public String contentdom() {
178: if (this .contentdom == CONTENTDOM_TEXT)
179: return "text";
180: else if (this .contentdom == CONTENTDOM_IMAGE)
181: return "image";
182: else if (this .contentdom == CONTENTDOM_AUDIO)
183: return "audio";
184: else if (this .contentdom == CONTENTDOM_VIDEO)
185: return "video";
186: else if (this .contentdom == CONTENTDOM_APP)
187: return "app";
188: return "text";
189: }
190:
191: public String searchdom() {
192: return (this .domType == SEARCHDOM_LOCAL) ? "local" : "global";
193: }
194:
195: public static TreeSet<String> hashes2Set(String query) {
196: if (query == null)
197: return new TreeSet<String>(
198: kelondroBase64Order.enhancedComparator);
199: final TreeSet<String> keyhashes = new TreeSet<String>(
200: kelondroBase64Order.enhancedComparator);
201: for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) {
202: keyhashes.add(query.substring(i
203: * yacySeedDB.commonHashLength, (i + 1)
204: * yacySeedDB.commonHashLength));
205: }
206: return keyhashes;
207: }
208:
209: public static String hashSet2hashString(Set<String> hashes) {
210: Iterator<String> i = hashes.iterator();
211: StringBuffer sb = new StringBuffer(hashes.size()
212: * yacySeedDB.commonHashLength);
213: while (i.hasNext())
214: sb.append(i.next());
215: return new String(sb);
216: }
217:
218: public static String anonymizedQueryHashes(Set<String> hashes) {
219: // create a more anonymized representation of euqery hashes for logging
220: Iterator<String> i = hashes.iterator();
221: StringBuffer sb = new StringBuffer(hashes.size()
222: * (yacySeedDB.commonHashLength + 2) + 2);
223: sb.append("[");
224: String hash;
225: if (i.hasNext()) {
226: hash = i.next();
227: sb.append(hash.substring(0, 3)).append(".........");
228: }
229: while (i.hasNext()) {
230: hash = i.next();
231: sb.append(", ").append(hash.substring(0, 3)).append(
232: ".........");
233: }
234: sb.append("]");
235: return new String(sb);
236: }
237:
238: public static final boolean matches(String text,
239: TreeSet<String> keyhashes) {
240: // returns true if any of the word hashes in keyhashes appear in the String text
241: // to do this, all words in the string must be recognized and transcoded to word hashes
242: TreeSet<String> wordhashes = plasmaCondenser
243: .words2hashes(plasmaCondenser.getWords(text).keySet());
244: return kelondroMSetTools.anymatch(wordhashes, keyhashes);
245: }
246:
247: @SuppressWarnings("unchecked")
248: public static TreeSet<String>[] cleanQuery(String querystring) {
249: // returns two sets: a query set and a exclude set
250: if ((querystring == null) || (querystring.length() == 0))
251: return new TreeSet[] {
252: new TreeSet<String>(
253: kelondroNaturalOrder.naturalComparator),
254: new TreeSet<String>(
255: kelondroNaturalOrder.naturalComparator) };
256:
257: // convert Umlaute
258: querystring = htmlFilterAbstractScraper.convertUmlaute(
259: new serverCharBuffer(querystring.toCharArray()))
260: .toString();
261:
262: // remove funny symbols
263: final String seps = "'.,:/&";
264: querystring = querystring.toLowerCase().trim();
265: int c;
266: for (int i = 0; i < seps.length(); i++) {
267: while ((c = querystring.indexOf(seps.charAt(i))) >= 0) {
268: querystring = querystring.substring(0, c)
269: + (((c + 1) < querystring.length()) ? (" " + querystring
270: .substring(c + 1))
271: : "");
272: }
273: }
274:
275: // the string is clean now, but we must generate a set out of it
276: final TreeSet<String> query = new TreeSet<String>(
277: kelondroNaturalOrder.naturalComparator);
278: final TreeSet<String> exclude = new TreeSet<String>(
279: kelondroNaturalOrder.naturalComparator);
280: final String[] a = querystring.split(" ");
281: for (int i = 0; i < a.length; i++) {
282: if (a[i].startsWith("-")) {
283: exclude.add(a[i].substring(1));
284: } else {
285: while ((c = a[i].indexOf('-')) >= 0) {
286: query.add(a[i].substring(0, c));
287: a[i] = a[i].substring(c + 1);
288: }
289: if (a[i].length() > 0)
290: query.add(a[i]);
291: }
292: }
293: return new TreeSet[] { query, exclude };
294: }
295:
296: public String queryString() {
297: return this .queryString;
298: }
299:
300: public TreeSet<String>[] queryWords() {
301: return cleanQuery(this .queryString);
302: }
303:
304: public void filterOut(Set<String> blueList) {
305: // filter out words that appear in this set
306: // this is applied to the queryHashes
307: TreeSet<String> blues = plasmaCondenser.words2hashes(blueList);
308: kelondroMSetTools.excludeDestructive(queryHashes, blues);
309: }
310:
311: public String id(boolean anonymized) {
312: // generate a string that identifies a search so results can be re-used in a cache
313: if (anonymized) {
314: return anonymizedQueryHashes(this .queryHashes) + "-"
315: + anonymizedQueryHashes(this .excludeHashes) + ":"
316: + this .contentdom + "*"
317: + this .ranking.toExternalString();
318: } else {
319: return hashSet2hashString(this .queryHashes) + "-"
320: + hashSet2hashString(this .excludeHashes) + ":"
321: + this .contentdom + this .ranking.toExternalString();
322: }
323: }
324:
325: public HashMap<String, Object> resultProfile(int searchcount,
326: long searchtime, long urlretrieval, long snippetcomputation) {
327: // generate statistics about search: query, time, etc
328: HashMap<String, Object> r = new HashMap<String, Object>();
329: r.put("queryhashes", queryHashes);
330: r.put("querystring", queryString);
331: r.put("querycount", new Integer(linesPerPage));
332: //r.put("querytime", new Long(maximumTime));
333: r.put("resultcount", new Integer(searchcount));
334: r.put("resulttime", new Long(searchtime));
335: r.put("resulturltime", new Long(urlretrieval));
336: r.put("resultsnippettime", new Long(snippetcomputation));
337: return r;
338: }
339: }
|