0001: // plasmaSnippetCache.java
0002: // -----------------------
0003: // part of YaCy
0004: // (C) by Michael Peter Christen; mc@anomic.de
0005: // first published on http://www.anomic.de
0006: // Frankfurt, Germany, 2005
0007: // last major change: 09.10.2006
0008: //
0009: // contributions by Marc Nause [MN]
0010: //
0011: // This program is free software; you can redistribute it and/or modify
0012: // it under the terms of the GNU General Public License as published by
0013: // the Free Software Foundation; either version 2 of the License, or
0014: // (at your option) any later version.
0015: //
0016: // This program is distributed in the hope that it will be useful,
0017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0019: // GNU General Public License for more details.
0020: //
0021: // You should have received a copy of the GNU General Public License
0022: // along with this program; if not, write to the Free Software
0023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0024: //
0025: // Using this software in any meaning (reading, learning, copying, compiling,
0026: // running) means that you agree that the Author(s) is (are) not responsible
0027: // for cost, loss of data or any harm that may be caused directly or indirectly
0028: // by usage of this softare or this documentation. The usage of this software
0029: // is on your own risk. The installation and usage (starting/running) of this
0030: // software may allow other people or application to access your computer and
0031: // any attached devices and is highly dependent on the configuration of the
0032: // software which must be done by the user of the software; the author(s) is
0033: // (are) also not responsible for proper configuration and usage of the
0034: // software, even if provoked by documentation provided together with
0035: // the software.
0036: //
0037: // Any changes to this file according to the GPL as documented in the file
0038: // gpl.txt aside this file in the shipment you received can be done to the
0039: // lines that follows this copyright notice here, but changes must not be
0040: // done inside the copyright notive above. A re-distribution must contain
0041: // the intact and unchanged copyright notice.
0042: // Contributions and changes to the program code must be marked as such.
0043:
0044: package de.anomic.plasma;
0045:
0046: import java.io.ByteArrayInputStream;
0047: import java.io.InputStream;
0048: import java.util.ArrayList;
0049: import java.util.Enumeration;
0050: import java.util.HashMap;
0051: import java.util.HashSet;
0052: import java.util.Iterator;
0053: import java.util.Map;
0054: import java.util.Set;
0055: import java.util.TreeMap;
0056: import java.util.TreeSet;
0057:
0058: import de.anomic.htmlFilter.htmlFilterImageEntry;
0059: import de.anomic.http.httpHeader;
0060: import de.anomic.http.httpc;
0061: import de.anomic.index.indexURLEntry;
0062: import de.anomic.kelondro.kelondroMScoreCluster;
0063: import de.anomic.kelondro.kelondroMSetTools;
0064: import de.anomic.plasma.cache.IResourceInfo;
0065: import de.anomic.plasma.parser.ParserException;
0066: import de.anomic.server.logging.serverLog;
0067: import de.anomic.yacy.yacyCore;
0068: import de.anomic.yacy.yacySearch;
0069: import de.anomic.yacy.yacyURL;
0070:
0071: public class plasmaSnippetCache {
0072:
    /** maximum number of entries kept in the in-memory snippet cache */
    private static final int maxCache = 500;

    // snippet source codes: where the snippet text was obtained from
    public static final int SOURCE_CACHE = 0;    // snippet was found in the local snippet cache
    public static final int SOURCE_FILE = 1;     // snippet was computed from a local file resource
    public static final int SOURCE_WEB = 2;      // snippet was computed from a freshly downloaded resource
    public static final int SOURCE_METADATA = 3; // snippet was taken from URL metadata (title/creator/subject/url)

    // snippet error codes (stored in TextSnippet.errorCode when no snippet could be computed)
    public static final int ERROR_NO_HASH_GIVEN = 11;    // the query contained no word hashes
    public static final int ERROR_SOURCE_LOADING = 12;   // the resource could not be loaded (or loading not allowed)
    public static final int ERROR_RESOURCE_LOADING = 13; // the resource was fetched but its content is unavailable
    public static final int ERROR_PARSER_FAILED = 14;    // the parser could not process the resource
    public static final int ERROR_PARSER_NO_LINES = 15;  // the parser produced no sentences
    public static final int ERROR_NO_MATCH = 16;         // no sentence containing all query words was found

    // monotonically increasing counter used as recency score for cache eviction
    private static int snippetsScoreCounter;
    // score structure: maps cache key -> insertion counter; minimum score = oldest entry
    private static kelondroMScoreCluster<String> snippetsScore;
    // the snippet cache itself: maps (urlhash + wordhashes) -> snippet line
    private static HashMap<String, String> snippetsCache;

    /**
     * a cache holding URLs to favicons specified by the page content, e.g. by using the html link-tag. e.g.
     * <pre>
     *   &lt;link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico"&gt;
     * </pre>
     * maps url hash -> favicon URL
     */
    private static HashMap<String, yacyURL> faviconCache;
    // parser used to extract text and media links from loaded resources
    private static plasmaParser parser;
    // logger for snippet computation problems
    private static serverLog log;
0100:
0101: public static void init(plasmaParser parserx, serverLog logx) {
0102: parser = parserx;
0103: log = logx;
0104: snippetsScoreCounter = 0;
0105: snippetsScore = new kelondroMScoreCluster<String>();
0106: snippetsCache = new HashMap<String, String>();
0107: faviconCache = new HashMap<String, yacyURL>();
0108: }
0109:
    /**
     * Result object of a text snippet computation. Either holds a snippet
     * line (success), or a null line together with an ERROR_* code and an
     * error text (failure). Optionally carries the page favicon URL.
     */
    public static class TextSnippet {
        private yacyURL url;               // the URL the snippet belongs to
        private String line;               // the snippet text; null if computation failed
        private String error;              // error description text; null on success
        private int errorCode;             // on success: one of the SOURCE_* codes; on failure: an ERROR_* code
        private Set<String> remaingHashes; // query hashes that could not be found in the resource
        private yacyURL favicon;           // favicon URL of the page, if known

        /** Convenience constructor without a favicon. */
        public TextSnippet(yacyURL url, String line, int errorCode,
                Set<String> remaingHashes, String errortext) {
            this (url, line, errorCode, remaingHashes, errortext, null);
        }

        /**
         * @param url           the URL the snippet belongs to
         * @param line          snippet text, or null if no snippet could be computed
         * @param errorCode     SOURCE_* code on success, ERROR_* code on failure
         * @param remaingHashes query hashes not found in the resource (may be null)
         * @param errortext     human-readable failure description (may be null)
         * @param favicon       favicon URL of the page (may be null)
         */
        public TextSnippet(yacyURL url, String line, int errorCode,
                Set<String> remaingHashes, String errortext,
                yacyURL favicon) {
            this .url = url;
            this .line = line;
            this .errorCode = errorCode;
            this .error = errortext;
            this .remaingHashes = remaingHashes;
            this .favicon = favicon;
        }

        /** @return the URL this snippet was computed for */
        public yacyURL getUrl() {
            return this .url;
        }

        /** @return true if a snippet line exists (i.e. computation succeeded) */
        public boolean exists() {
            return line != null;
        }

        /** @return the snippet line, or an empty string if none exists */
        public String toString() {
            return (line == null) ? "" : line;
        }

        /** @return the unmarked snippet line, or an empty string if none exists */
        public String getLineRaw() {
            return (line == null) ? "" : line;
        }

        /** @return the trimmed error text, or an empty string if there is none */
        public String getError() {
            return (error == null) ? "" : error.trim();
        }

        /** @return SOURCE_* code on success, ERROR_* code on failure */
        public int getErrorCode() {
            return errorCode;
        }

        /** @return query hashes that were not found in the resource (may be null) */
        public Set<String> getRemainingHashes() {
            return this .remaingHashes;
        }

        /**
         * Returns the snippet line with every word whose hash is contained in
         * queryHashes wrapped in &lt;b&gt;...&lt;/b&gt; markers. Punctuation
         * around words is preserved but excluded from the hash comparison.
         *
         * NOTE(review): this getter mutates the 'line' field (a trailing '.'
         * is stripped permanently), so later calls to getLineRaw()/toString()
         * see the modified line — looks unintended, confirm before relying on it.
         *
         * @param queryHashes word hashes to highlight; null or empty yields the plain line
         * @return the marked-up snippet line, or "" if no line exists
         */
        public String getLineMarked(Set<String> queryHashes) {
            if (line == null)
                return "";
            if ((queryHashes == null) || (queryHashes.size() == 0))
                return line.trim();
            if (line.endsWith("."))
                line = line.substring(0, line.length() - 1);
            Iterator<String> i = queryHashes.iterator();
            String h;
            String[] w = line.split(" ");
            String prefix = "";
            String postfix = "";
            int len = 0;
            // for every query hash, scan all words and mark the matching ones
            while (i.hasNext()) {
                h = i.next();
                for (int j = 0; j < w.length; j++) {
                    //ignore punctuation marks (contrib [MN])
                    //note to myself:
                    //For details on regex see "Mastering regular expressions" by J.E.F. Friedl
                    //especially p. 123 and p. 390/391 (in the German version of the 2nd edition)

                    prefix = "";
                    postfix = "";

                    // cut off prefix if it contains of non-characters or non-numbers
                    // (\p{L} = any letter, \p{N} = any number, Unicode-aware)
                    while (w[j].matches("\\A[^\\p{L}\\p{N}].+")) {
                        prefix = prefix + w[j].substring(0, 1);
                        w[j] = w[j].substring(1);
                    }

                    // cut off postfix if it contains of non-characters or non-numbers
                    while (w[j].matches(".+[^\\p{L}\\p{N}]\\Z")) {
                        len = w[j].length();
                        postfix = w[j].substring(len - 1, len)
                                + postfix;
                        w[j] = w[j].substring(0, len - 1);
                    }

                    //special treatment if there is a special character in the word:
                    //split the word at its special characters and hash/mark each fragment separately
                    if (w[j]
                            .matches("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z")) {
                        String out = "";
                        String temp = "";
                        for (int k = 0; k < w[j].length(); k++) {
                            //is character a special character?
                            if (w[j].substring(k, k + 1).matches(
                                    "[^\\p{L}\\p{N}]")) {
                                if (plasmaCondenser.word2hash(temp)
                                        .equals(h))
                                    temp = "<b>" + temp + "</b>";
                                out = out + temp
                                        + w[j].substring(k, k + 1);
                                temp = "";
                            }
                            //last character
                            else if (k == (w[j].length() - 1)) {
                                temp = temp + w[j].substring(k, k + 1);
                                if (plasmaCondenser.word2hash(temp)
                                        .equals(h))
                                    temp = "<b>" + temp + "</b>";
                                out = out + temp;
                                temp = "";
                            } else
                                temp = temp + w[j].substring(k, k + 1);
                        }
                        w[j] = out;
                    }

                    //end contrib [MN]
                    else if (plasmaCondenser.word2hash(w[j]).equals(h))
                        w[j] = "<b>" + w[j] + "</b>";

                    // re-attach the stripped punctuation
                    w[j] = prefix + w[j] + postfix;
                }
            }
            // re-join the (possibly marked) words into one line
            StringBuffer l = new StringBuffer(line.length()
                    + queryHashes.size() * 8);
            for (int j = 0; j < w.length; j++) {
                l.append(w[j]);
                l.append(' ');
            }
            return l.toString().trim();
        }

        /** @return the favicon URL of the page, or null if unknown */
        public yacyURL getFavicon() {
            return this .favicon;
        }
    }
0250:
0251: public static class MediaSnippet {
0252: public int type;
0253: public yacyURL href;
0254: public String name, attr;
0255:
0256: public MediaSnippet(int type, yacyURL href, String name,
0257: String attr) {
0258: this .type = type;
0259: this .href = href;
0260: this .name = name;
0261: this .attr = attr;
0262: if ((this .name == null) || (this .name.length() == 0))
0263: this .name = "_";
0264: if ((this .attr == null) || (this .attr.length() == 0))
0265: this .attr = "_";
0266: }
0267: }
0268:
0269: public static boolean existsInCache(yacyURL url,
0270: Set<String> queryhashes) {
0271: String hashes = yacySearch.set2string(queryhashes);
0272: return retrieveFromCache(hashes, url.hash()) != null;
0273: }
0274:
0275: @SuppressWarnings("unchecked")
0276: public static TextSnippet retrieveTextSnippet(
0277: indexURLEntry.Components comp, Set<String> queryhashes,
0278: boolean fetchOnline, boolean pre, int snippetMaxLength,
0279: int timeout, int maxDocLen) {
0280: // heise = "0OQUNU3JSs05"
0281: yacyURL url = comp.url();
0282: if (queryhashes.size() == 0) {
0283: //System.out.println("found no queryhashes for URL retrieve " + url);
0284: return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN,
0285: queryhashes, "no query hashes given");
0286: }
0287:
0288: // try to get snippet from snippetCache
0289: int source = SOURCE_CACHE;
0290: String wordhashes = yacySearch.set2string(queryhashes);
0291: String line = retrieveFromCache(wordhashes, url.hash());
0292: if (line != null) {
0293: // found the snippet
0294: return new TextSnippet(url, line, source, null, null,
0295: faviconCache.get(url.hash()));
0296: }
0297:
0298: /* ===========================================================================
0299: * LOADING RESOURCE DATA
0300: * =========================================================================== */
0301: // if the snippet is not in the cache, we can try to get it from the htcache
0302: long resContentLength = 0;
0303: InputStream resContent = null;
0304: IResourceInfo resInfo = null;
0305: try {
0306: // trying to load the resource from the cache
0307: resContent = plasmaHTCache.getResourceContentStream(url);
0308: if (resContent != null) {
0309: // if the content was found
0310: resContentLength = plasmaHTCache
0311: .getResourceContentLength(url);
0312: if ((resContentLength > maxDocLen) && (!fetchOnline)) {
0313: // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
0314: return new TextSnippet(url, null,
0315: ERROR_SOURCE_LOADING, queryhashes,
0316: "resource available, but too large: "
0317: + resContentLength + " bytes");
0318: }
0319: } else if (containsAllHashes(comp.dc_title(), queryhashes)) {
0320: // try to create the snippet from information given in the url itself
0321: return new TextSnippet(url,
0322: (comp.dc_subject().length() > 0) ? comp
0323: .dc_creator() : comp.dc_subject(),
0324: SOURCE_METADATA, null, null, faviconCache
0325: .get(url.hash()));
0326: } else if (containsAllHashes(comp.dc_creator(), queryhashes)) {
0327: // try to create the snippet from information given in the creator metadata
0328: return new TextSnippet(url, comp.dc_creator(),
0329: SOURCE_METADATA, null, null, faviconCache
0330: .get(url.hash()));
0331: } else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
0332: // try to create the snippet from information given in the subject metadata
0333: return new TextSnippet(url,
0334: (comp.dc_creator().length() > 0) ? comp
0335: .dc_creator() : comp.dc_subject(),
0336: SOURCE_METADATA, null, null, faviconCache
0337: .get(url.hash()));
0338: } else if (containsAllHashes(comp.url().toNormalform(true,
0339: true), queryhashes)) {
0340: // try to create the snippet from information given in the subject metadata
0341: return new TextSnippet(url,
0342: (comp.dc_creator().length() > 0) ? comp
0343: .dc_creator() : comp.dc_subject(),
0344: SOURCE_METADATA, null, null, faviconCache
0345: .get(url.hash()));
0346: } else if (fetchOnline) {
0347: // if not found try to download it
0348:
0349: // download resource using the crawler and keep resource in memory if possible
0350: plasmaHTCache.Entry entry = plasmaSwitchboard
0351: .getSwitchboard().crawlQueues
0352: .loadResourceFromWeb(url, timeout, true, true);
0353:
0354: // place entry on crawl queue
0355: plasmaHTCache.push(entry);
0356:
0357: // getting resource metadata (e.g. the http headers for http resources)
0358: if (entry != null) {
0359: resInfo = entry.getDocumentInfo();
0360:
0361: // read resource body (if it is there)
0362: byte[] resourceArray = entry.cacheArray();
0363: if (resourceArray != null) {
0364: resContent = new ByteArrayInputStream(
0365: resourceArray);
0366: resContentLength = resourceArray.length;
0367: } else {
0368: resContent = plasmaHTCache
0369: .getResourceContentStream(url);
0370: resContentLength = plasmaHTCache
0371: .getResourceContentLength(url);
0372: }
0373: }
0374:
0375: // if it is still not available, report an error
0376: if (resContent == null)
0377: return new TextSnippet(url, null,
0378: ERROR_RESOURCE_LOADING, queryhashes,
0379: "error loading resource, plasmaHTCache.Entry cache is NULL");
0380:
0381: source = SOURCE_WEB;
0382: } else {
0383: return new TextSnippet(url, null, ERROR_SOURCE_LOADING,
0384: queryhashes, "no resource available");
0385: }
0386: } catch (Exception e) {
0387: e.printStackTrace();
0388: return new TextSnippet(url, null, ERROR_SOURCE_LOADING,
0389: queryhashes, "error loading resource: "
0390: + e.getMessage());
0391: }
0392:
0393: /* ===========================================================================
0394: * PARSING RESOURCE
0395: * =========================================================================== */
0396: plasmaParserDocument document = null;
0397: try {
0398: document = parseDocument(url, resContentLength, resContent,
0399: resInfo);
0400: } catch (ParserException e) {
0401: return new TextSnippet(url, null, ERROR_PARSER_FAILED,
0402: queryhashes, e.getMessage()); // cannot be parsed
0403: } finally {
0404: try {
0405: resContent.close();
0406: } catch (Exception e) {/* ignore this */
0407: }
0408: }
0409: if (document == null)
0410: return new TextSnippet(url, null, ERROR_PARSER_FAILED,
0411: queryhashes, "parser error/failed"); // cannot be parsed
0412:
0413: /* ===========================================================================
0414: * COMPUTE SNIPPET
0415: * =========================================================================== */
0416: yacyURL resFavicon = document.getFavicon();
0417: if (resFavicon != null)
0418: faviconCache.put(url.hash(), resFavicon);
0419: // we have found a parseable non-empty file: use the lines
0420:
0421: // compute snippet from text
0422: final Iterator<StringBuffer> sentences = document
0423: .getSentences(pre);
0424: if (sentences == null)
0425: return new TextSnippet(url, null, ERROR_PARSER_NO_LINES,
0426: queryhashes, "parser returned no sentences",
0427: resFavicon);
0428: Object[] tsr = computeTextSnippet(sentences, queryhashes,
0429: snippetMaxLength);
0430: String textline = (tsr == null) ? null : (String) tsr[0];
0431: Set<String> remainingHashes = (tsr == null) ? queryhashes
0432: : (Set<String>) tsr[1];
0433:
0434: // compute snippet from media
0435: String audioline = computeMediaSnippet(
0436: document.getAudiolinks(), queryhashes);
0437: String videoline = computeMediaSnippet(
0438: document.getVideolinks(), queryhashes);
0439: String appline = computeMediaSnippet(document.getApplinks(),
0440: queryhashes);
0441: //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
0442: //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
0443:
0444: line = "";
0445: if (audioline != null)
0446: line += (line.length() == 0) ? audioline : "<br />"
0447: + audioline;
0448: if (videoline != null)
0449: line += (line.length() == 0) ? videoline : "<br />"
0450: + videoline;
0451: if (appline != null)
0452: line += (line.length() == 0) ? appline : "<br />" + appline;
0453: //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
0454: if (textline != null)
0455: line += (line.length() == 0) ? textline : "<br />"
0456: + textline;
0457:
0458: if ((line == null) || (remainingHashes.size() > 0))
0459: return new TextSnippet(url, null, ERROR_NO_MATCH,
0460: remainingHashes, "no matching snippet found",
0461: resFavicon);
0462: if (line.length() > snippetMaxLength)
0463: line = line.substring(0, snippetMaxLength);
0464:
0465: // finally store this snippet in our own cache
0466: storeToCache(wordhashes, url.hash(), line);
0467:
0468: document.close();
0469: return new TextSnippet(url, line, source, null, null,
0470: resFavicon);
0471: }
0472:
0473: /**
0474: * Tries to load and parse a resource specified by it's URL.
0475: * If the resource is not stored in cache and if fetchOnline is set the
0476: * this function tries to download the resource from web.
0477: *
0478: * @param url the URL of the resource
0479: * @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache
0480: * @return the parsed document as {@link plasmaParserDocument}
0481: */
0482: public static plasmaParserDocument retrieveDocument(yacyURL url,
0483: boolean fetchOnline, int timeout, boolean forText) {
0484:
0485: // load resource
0486: long resContentLength = 0;
0487: InputStream resContent = null;
0488: IResourceInfo resInfo = null;
0489: try {
0490: // trying to load the resource from the cache
0491: resContent = plasmaHTCache.getResourceContentStream(url);
0492: if (resContent != null) {
0493: // if the content was found
0494: resContentLength = plasmaHTCache
0495: .getResourceContentLength(url);
0496: } else if (fetchOnline) {
0497: // if not found try to download it
0498:
0499: // download resource using the crawler and keep resource in memory if possible
0500: plasmaHTCache.Entry entry = plasmaSwitchboard
0501: .getSwitchboard().crawlQueues
0502: .loadResourceFromWeb(url, timeout, true,
0503: forText);
0504:
0505: // getting resource metadata (e.g. the http headers for http resources)
0506: if (entry != null) {
0507: resInfo = entry.getDocumentInfo();
0508:
0509: // read resource body (if it is there)
0510: byte[] resourceArray = entry.cacheArray();
0511: if (resourceArray != null) {
0512: resContent = new ByteArrayInputStream(
0513: resourceArray);
0514: resContentLength = resourceArray.length;
0515: } else {
0516: resContent = plasmaHTCache
0517: .getResourceContentStream(url);
0518: resContentLength = plasmaHTCache
0519: .getResourceContentLength(url);
0520: }
0521: }
0522:
0523: // if it is still not available, report an error
0524: if (resContent == null) {
0525: serverLog.logFine("snippet fetch",
0526: "plasmaHTCache.Entry cache is NULL for url "
0527: + url);
0528: return null;
0529: }
0530: } else {
0531: serverLog.logFine("snippet fetch",
0532: "no resource available for url " + url);
0533: return null;
0534: }
0535: } catch (Exception e) {
0536: serverLog.logFine("snippet fetch",
0537: "error loading resource: " + e.getMessage()
0538: + " for url " + url);
0539: return null;
0540: }
0541:
0542: // parse resource
0543: plasmaParserDocument document = null;
0544: try {
0545: document = parseDocument(url, resContentLength, resContent,
0546: resInfo);
0547: } catch (ParserException e) {
0548: serverLog.logFine("snippet fetch", "parser error "
0549: + e.getMessage() + " for url " + url);
0550: return null;
0551: } finally {
0552: try {
0553: resContent.close();
0554: } catch (Exception e) {
0555: }
0556: }
0557: return document;
0558: }
0559:
0560: public static void storeToCache(String wordhashes, String urlhash,
0561: String snippet) {
0562: // generate key
0563: String key = urlhash + wordhashes;
0564:
0565: // do nothing if snippet is known
0566: if (snippetsCache.containsKey(key))
0567: return;
0568:
0569: // learn new snippet
0570: snippetsScore.addScore(key, snippetsScoreCounter++);
0571: snippetsCache.put(key, snippet);
0572:
0573: // care for counter
0574: if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
0575: snippetsScoreCounter = 0;
0576: snippetsScore = new kelondroMScoreCluster<String>();
0577: snippetsCache = new HashMap<String, String>();
0578: }
0579:
0580: // flush cache if cache is full
0581: while (snippetsCache.size() > maxCache) {
0582: key = (String) snippetsScore.getMinObject();
0583: snippetsScore.deleteScore(key);
0584: snippetsCache.remove(key);
0585: }
0586: }
0587:
0588: private static String retrieveFromCache(String wordhashes,
0589: String urlhash) {
0590: // generate key
0591: String key = urlhash + wordhashes;
0592: return snippetsCache.get(key);
0593: }
0594:
0595: private static String computeMediaSnippet(
0596: Map<yacyURL, String> media, Set<String> queryhashes) {
0597: Iterator<Map.Entry<yacyURL, String>> i = media.entrySet()
0598: .iterator();
0599: Map.Entry<yacyURL, String> entry;
0600: yacyURL url;
0601: String desc;
0602: Set<String> s;
0603: String result = "";
0604: while (i.hasNext()) {
0605: entry = i.next();
0606: url = entry.getKey();
0607: desc = entry.getValue();
0608: s = removeAppearanceHashes(url.toNormalform(false, false),
0609: queryhashes);
0610: if (s.size() == 0) {
0611: result += "<br /><a href=\"" + url + "\">"
0612: + ((desc.length() == 0) ? url : desc) + "</a>";
0613: continue;
0614: }
0615: s = removeAppearanceHashes(desc, s);
0616: if (s.size() == 0) {
0617: result += "<br /><a href=\"" + url + "\">"
0618: + ((desc.length() == 0) ? url : desc) + "</a>";
0619: continue;
0620: }
0621: }
0622: if (result.length() == 0)
0623: return null;
0624: return result.substring(6);
0625: }
0626:
    /**
     * Selects sentences from the given iterator that, combined, contain as
     * many of the query word hashes as possible. Sentences are scored by
     * number of matching hashes (descending) and length (shorter preferred);
     * if the best sentence misses some hashes, further sentences are added
     * recursively for the remaining hashes.
     *
     * @param sentences   iterator over the document's sentences
     * @param queryhashes word hashes to cover
     * @param maxLength   maximum length of the assembled snippet
     * @return {String snippet, Set remaining hashes}, or null if nothing matched
     */
    @SuppressWarnings("unchecked")
    private static Object[] /*{String - the snippet, Set - remaining hashes}*/
    computeTextSnippet(Iterator<StringBuffer> sentences,
            Set<String> queryhashes, int maxLength) {
        try {
            if (sentences == null)
                return null;
            if ((queryhashes == null) || (queryhashes.size() == 0))
                return null;
            Iterator<String> j;
            HashMap<String, Integer> hs;
            StringBuffer sentence;
            // ordered map: composite score -> sentence; lastKey() is the best sentence
            TreeMap<Integer, StringBuffer> os = new TreeMap<Integer, StringBuffer>();
            int uniqCounter = 9999;
            int score;
            while (sentences.hasNext()) {
                sentence = sentences.next();
                hs = hashSentence(sentence.toString());
                // count how many query hashes appear in this sentence
                j = queryhashes.iterator();
                score = 0;
                while (j.hasNext()) {
                    if (hs.containsKey(j.next()))
                        score++;
                }
                if (score > 0) {
                    // composite key: match count dominates, shorter sentences rank higher,
                    // uniqCounter makes keys unique so equal-scoring sentences don't collide
                    os.put(
                            new Integer(1000000 * score
                                    - sentence.length() * 10000
                                    + uniqCounter--), sentence);
                }
            }

            String result;
            Set<String> remaininghashes;
            while (os.size() > 0) {
                sentence = os.remove(os.lastKey()); // sentence with the biggest score
                Object[] tsr = computeTextSnippet(sentence.toString(),
                        queryhashes, maxLength);
                if (tsr == null)
                    continue;
                result = (String) tsr[0];
                if ((result != null) && (result.length() > 0)) {
                    remaininghashes = (Set<String>) tsr[1];
                    if (remaininghashes.size() == 0) {
                        // we have found the snippet
                        return new Object[] { result, remaininghashes };
                    } else if (remaininghashes.size() < queryhashes
                            .size()) {
                        // the result has not all words in it.
                        // find another sentence that represents the missing other words
                        // and find recursively more sentences
                        maxLength = maxLength - result.length();
                        if (maxLength < 20)
                            maxLength = 20;
                        tsr = computeTextSnippet(
                                os.values().iterator(),
                                remaininghashes, maxLength);
                        if (tsr == null)
                            return null;
                        String nextSnippet = (String) tsr[0];
                        if (nextSnippet == null)
                            return tsr;
                        return new Object[] {
                                result + (" / " + nextSnippet), tsr[1] };
                    } else {
                        // error: the single-sentence computation did not reduce the hash set
                        //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
                        continue;
                    }
                }
            }
            return null;
        } catch (IndexOutOfBoundsException e) {
            log.logSevere(
                    "computeSnippet: error with string generation", e);
            return new Object[] { null, queryhashes };
        }
    }
0705:
    /**
     * Computes a snippet from a single sentence: finds the positions of all
     * query word hashes in the sentence and cuts the sentence down (at the
     * ends and/or in the middle, inserting "[..]") until it fits maxLength
     * while still showing the region where the query words appear.
     *
     * @param sentence    the sentence to extract the snippet from
     * @param queryhashes word hashes to locate in the sentence
     * @param maxLength   maximum length of the returned snippet
     * @return {String snippet, Set remaining (unfound) hashes}, or null on bad input/error
     */
    private static Object[] /*{String - the snippet, Set - remaining hashes}*/
    computeTextSnippet(String sentence, Set<String> queryhashes,
            int maxLength) {
        try {
            if (sentence == null)
                return null;
            if ((queryhashes == null) || (queryhashes.size() == 0))
                return null;
            String hash;

            // find all hashes that appear in the sentence
            HashMap<String, Integer> hs = hashSentence(sentence);
            Iterator<String> j = queryhashes.iterator();
            Integer pos;
            // minpos/maxpos span the region of the sentence containing all found query words
            int p, minpos = sentence.length(), maxpos = -1;
            HashSet<String> remainingHashes = new HashSet<String>();
            while (j.hasNext()) {
                hash = j.next();
                pos = hs.get(hash);
                if (pos == null) {
                    // hash not present in this sentence; caller may look for it elsewhere
                    remainingHashes.add(hash);
                } else {
                    p = pos.intValue();
                    if (p > maxpos)
                        maxpos = p;
                    if (p < minpos)
                        minpos = p;
                }
            }
            // check result size: pad the region a little and clamp to sentence bounds
            maxpos = maxpos + 10;
            if (maxpos > sentence.length())
                maxpos = sentence.length();
            if (minpos < 0)
                minpos = 0;
            // we have a result, but is it short enough?
            if (maxpos - minpos + 10 > maxLength) {
                // the string is too long, even if we cut at both ends
                // so cut here in the middle of the string
                int lenb = sentence.length();
                sentence = sentence.substring(
                        0,
                        (minpos + 20 > sentence.length()) ? sentence
                                .length() : minpos + 20).trim()
                        + " [..] "
                        + sentence
                                .substring(
                                        (maxpos + 26 > sentence
                                                .length()) ? sentence
                                                .length() : maxpos + 26)
                                .trim();
                // adjust maxpos for the characters removed by the middle cut
                maxpos = maxpos + lenb - sentence.length() + 6;
            }
            if (maxpos > maxLength) {
                // the string is too long, even if we cut it at the end
                // so cut it here at both ends at once
                assert maxpos >= minpos;
                int newlen = Math.max(10, maxpos - minpos + 10);
                int around = (maxLength - newlen) / 2;
                assert minpos - around < sentence.length() : "maxpos = "
                        + maxpos
                        + ", minpos = "
                        + minpos
                        + ", around = "
                        + around
                        + ", sentence.length() = " + sentence.length();
                // NOTE(review): both sides of this assert test the same condition
                // (maxpos + around); one side was presumably meant to test
                // minpos - around >= 0 — confirm against intent before changing
                assert ((maxpos + around) <= sentence.length())
                        && ((maxpos + around) <= sentence.length()) : "maxpos = "
                        + maxpos
                        + ", minpos = "
                        + minpos
                        + ", around = "
                        + around
                        + ", sentence.length() = " + sentence.length();
                sentence = "[..] "
                        + sentence
                                .substring(
                                        minpos - around,
                                        ((maxpos + around) > sentence
                                                .length()) ? sentence
                                                .length()
                                                : (maxpos + around))
                                .trim() + " [..]";
                minpos = around;
                maxpos = sentence.length() - around - 5;
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 1st step (cut at right side)
                sentence = sentence.substring(0, maxpos).trim()
                        + " [..]";
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 2nd step (cut at left side)
                sentence = "[..] " + sentence.substring(minpos).trim();
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 3rd step (cut in the middle)
                sentence = sentence.substring(6, 20).trim()
                        + " [..] "
                        + sentence.substring(sentence.length() - 26,
                                sentence.length() - 6).trim();
            }
            return new Object[] { sentence, remainingHashes };
        } catch (IndexOutOfBoundsException e) {
            log.logSevere(
                    "computeSnippet: error with string generation", e);
            return null;
        }
    }
0815:
0816: public static ArrayList<MediaSnippet> retrieveMediaSnippets(
0817: yacyURL url, Set<String> queryhashes, int mediatype,
0818: boolean fetchOnline, int timeout) {
0819: if (queryhashes.size() == 0) {
0820: serverLog.logFine("snippet fetch",
0821: "no query hashes given for url " + url);
0822: return new ArrayList<MediaSnippet>();
0823: }
0824:
0825: plasmaParserDocument document = retrieveDocument(url,
0826: fetchOnline, timeout, false);
0827: ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
0828: if (document != null) {
0829: if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL)
0830: || (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO))
0831: a.addAll(computeMediaSnippets(document, queryhashes,
0832: plasmaSearchQuery.CONTENTDOM_AUDIO));
0833: if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL)
0834: || (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO))
0835: a.addAll(computeMediaSnippets(document, queryhashes,
0836: plasmaSearchQuery.CONTENTDOM_VIDEO));
0837: if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL)
0838: || (mediatype == plasmaSearchQuery.CONTENTDOM_APP))
0839: a.addAll(computeMediaSnippets(document, queryhashes,
0840: plasmaSearchQuery.CONTENTDOM_APP));
0841: if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL)
0842: || (mediatype == plasmaSearchQuery.CONTENTDOM_IMAGE))
0843: a.addAll(computeImageSnippets(document, queryhashes));
0844: }
0845: return a;
0846: }
0847:
0848: public static ArrayList<MediaSnippet> computeMediaSnippets(
0849: plasmaParserDocument document, Set<String> queryhashes,
0850: int mediatype) {
0851:
0852: if (document == null)
0853: return new ArrayList<MediaSnippet>();
0854: Map<yacyURL, String> media = null;
0855: if (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO)
0856: media = document.getAudiolinks();
0857: else if (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO)
0858: media = document.getVideolinks();
0859: else if (mediatype == plasmaSearchQuery.CONTENTDOM_APP)
0860: media = document.getApplinks();
0861: if (media == null)
0862: return null;
0863:
0864: Iterator<Map.Entry<yacyURL, String>> i = media.entrySet()
0865: .iterator();
0866: Map.Entry<yacyURL, String> entry;
0867: yacyURL url;
0868: String desc;
0869: Set<String> s;
0870: ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
0871: while (i.hasNext()) {
0872: entry = i.next();
0873: url = entry.getKey();
0874: desc = entry.getValue();
0875: s = removeAppearanceHashes(url.toNormalform(false, false),
0876: queryhashes);
0877: if (s.size() == 0) {
0878: result
0879: .add(new MediaSnippet(mediatype, url, desc,
0880: null));
0881: continue;
0882: }
0883: s = removeAppearanceHashes(desc, s);
0884: if (s.size() == 0) {
0885: result
0886: .add(new MediaSnippet(mediatype, url, desc,
0887: null));
0888: continue;
0889: }
0890: }
0891: return result;
0892: }
0893:
0894: public static ArrayList<MediaSnippet> computeImageSnippets(
0895: plasmaParserDocument document, Set<String> queryhashes) {
0896:
0897: TreeSet<htmlFilterImageEntry> images = document.getImages();
0898:
0899: Iterator<htmlFilterImageEntry> i = images.iterator();
0900: htmlFilterImageEntry ientry;
0901: yacyURL url;
0902: String desc;
0903: Set<String> s;
0904: ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
0905: while (i.hasNext()) {
0906: ientry = i.next();
0907: url = ientry.url();
0908: desc = ientry.alt();
0909: s = removeAppearanceHashes(url.toNormalform(false, false),
0910: queryhashes);
0911: if (s.size() == 0) {
0912: result.add(new MediaSnippet(
0913: plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc,
0914: ientry.width() + " x " + ientry.height()));
0915: continue;
0916: }
0917: s = removeAppearanceHashes(desc, s);
0918: if (s.size() == 0) {
0919: result.add(new MediaSnippet(
0920: plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc,
0921: ientry.width() + " x " + ientry.height()));
0922: continue;
0923: }
0924: }
0925: return result;
0926: }
0927:
0928: private static Set<String> removeAppearanceHashes(String sentence,
0929: Set<String> queryhashes) {
0930: // remove all hashes that appear in the sentence
0931: if (sentence == null)
0932: return queryhashes;
0933: HashMap<String, Integer> hs = hashSentence(sentence);
0934: Iterator<String> j = queryhashes.iterator();
0935: String hash;
0936: Integer pos;
0937: Set<String> remaininghashes = new HashSet<String>();
0938: while (j.hasNext()) {
0939: hash = j.next();
0940: pos = hs.get(hash);
0941: if (pos == null) {
0942: remaininghashes.add(new String(hash));
0943: }
0944: }
0945: return remaininghashes;
0946: }
0947:
0948: private static HashMap<String, Integer> hashSentence(String sentence) {
0949: // generates a word-wordPos mapping
0950: HashMap<String, Integer> map = new HashMap<String, Integer>();
0951: Enumeration<StringBuffer> words = plasmaCondenser
0952: .wordTokenizer(sentence, "UTF-8", 0);
0953: int pos = 0;
0954: StringBuffer word;
0955: String hash;
0956: while (words.hasMoreElements()) {
0957: word = words.nextElement();
0958: hash = plasmaCondenser.word2hash(new String(word));
0959: if (!map.containsKey(hash))
0960: map.put(hash, new Integer(pos)); // dont overwrite old values, that leads to too far word distances
0961: pos += word.length() + 1;
0962: }
0963: return map;
0964: }
0965:
0966: private static boolean containsAllHashes(String sentence,
0967: Set<String> queryhashes) {
0968: HashMap<String, Integer> m = hashSentence(sentence);
0969: Iterator<String> i = queryhashes.iterator();
0970: while (i.hasNext()) {
0971: if (!(m.containsKey(i.next())))
0972: return false;
0973: }
0974: return true;
0975: }
0976:
/**
 * Parse the resource without pre-loaded metadata. Convenience overload that
 * delegates to the four-argument parseDocument with a null docInfo, so the
 * metadata is looked up in the cache or fetched from the web there.
 *
 * @param url the URL of the resource
 * @param contentLength the contentLength of the resource
 * @param resourceStream the resource body as stream
 * @return the extracted data, or null if the resource could not be parsed
 * @throws ParserException
 */
public static plasmaParserDocument parseDocument(yacyURL url,
long contentLength, InputStream resourceStream)
throws ParserException {
return parseDocument(url, contentLength, resourceStream, null);
}
0982:
0983: /**
0984: * Parse the resource
0985: * @param url the URL of the resource
0986: * @param contentLength the contentLength of the resource
0987: * @param resourceStream the resource body as stream
0988: * @param docInfo metadata about the resource
0989: * @return the extracted data
0990: * @throws ParserException
0991: */
0992: public static plasmaParserDocument parseDocument(yacyURL url,
0993: long contentLength, InputStream resourceStream,
0994: IResourceInfo docInfo) throws ParserException {
0995: try {
0996: if (resourceStream == null)
0997: return null;
0998:
0999: // STEP 1: if no resource metadata is available, try to load it from cache
1000: if (docInfo == null) {
1001: // try to get the header from the htcache directory
1002: try {
1003: docInfo = plasmaHTCache.loadResourceInfo(url);
1004: } catch (Exception e) {
1005: // ignore this. resource info loading failed
1006: }
1007: }
1008:
1009: // STEP 2: if the metadata is still null try to download it from web
1010: if ((docInfo == null)
1011: && (url.getProtocol().startsWith("http"))) {
1012: // TODO: we need a better solution here
1013: // e.g. encapsulate this in the crawlLoader class
1014:
1015: // getting URL mimeType
1016: try {
1017: httpHeader header = httpc
1018: .whead(
1019: url,
1020: url.getHost(),
1021: 10000,
1022: null,
1023: null,
1024: plasmaSwitchboard.getSwitchboard().remoteProxyConfig);
1025: docInfo = plasmaHTCache.getResourceInfoFactory()
1026: .buildResourceInfoObj(url, header);
1027: } catch (Exception e) {
1028: // ingore this. http header download failed
1029: }
1030: }
1031:
1032: // STEP 3: if the metadata is still null try to guess the mimeType of the resource
1033: if (docInfo == null) {
1034: String filename = plasmaHTCache.getCachePath(url)
1035: .getName();
1036: int p = filename.lastIndexOf('.');
1037: if ( // if no extension is available
1038: (p < 0) ||
1039: // or the extension is supported by one of the parsers
1040: ((p >= 0) && (plasmaParser
1041: .supportedFileExtContains(filename
1042: .substring(p + 1))))) {
1043: String supposedMime = "text/html";
1044:
1045: // if the mimeType Parser is installed we can set the mimeType to null to force
1046: // a mimetype detection
1047: if (plasmaParser
1048: .supportedMimeTypesContains("application/octet-stream")) {
1049: supposedMime = null;
1050: } else if (p != -1) {
1051: // otherwise we try to determine the mimeType per file Extension
1052: supposedMime = plasmaParser
1053: .getMimeTypeByFileExt(filename
1054: .substring(p + 1));
1055: }
1056:
1057: return parser.parseSource(url, supposedMime, null,
1058: contentLength, resourceStream);
1059: }
1060: return null;
1061: }
1062: if (plasmaParser.supportedMimeTypesContains(docInfo
1063: .getMimeType())) {
1064: return parser.parseSource(url, docInfo.getMimeType(),
1065: docInfo.getCharacterEncoding(), contentLength,
1066: resourceStream);
1067: }
1068: return null;
1069: } catch (InterruptedException e) {
1070: // interruption of thread detected
1071: return null;
1072: }
1073: }
1074:
1075: /**
1076: *
1077: * @param url
1078: * @param fetchOnline
1079: * @param socketTimeout
1080: * @return an Object array containing
1081: * <table>
1082: * <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
1083: * <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
1084: * </table>
1085: */
1086: public static Object[] getResource(yacyURL url,
1087: boolean fetchOnline, int socketTimeout, boolean forText) {
1088: // load the url as resource from the web
1089: long contentLength = -1;
1090:
1091: // trying to load the resource body from cache
1092: InputStream resource = plasmaHTCache
1093: .getResourceContentStream(url);
1094: if (resource != null) {
1095: contentLength = plasmaHTCache.getResourceContentLength(url);
1096: } else if (fetchOnline) {
1097: // if the content is not available in cache try to download it from web
1098:
1099: // try to download the resource using a crawler
1100: plasmaHTCache.Entry entry = plasmaSwitchboard
1101: .getSwitchboard().crawlQueues.loadResourceFromWeb(
1102: url, (socketTimeout < 0) ? -1 : socketTimeout,
1103: true, forText);
1104: if (entry == null)
1105: return null; // not found in web
1106:
1107: // read resource body (if it is there)
1108: byte[] resourceArray = entry.cacheArray();
1109:
1110: // in case that the resource was not in ram, read it from disk
1111: if (resourceArray == null) {
1112: resource = plasmaHTCache.getResourceContentStream(url);
1113: contentLength = plasmaHTCache
1114: .getResourceContentLength(url);
1115: } else {
1116: resource = new ByteArrayInputStream(resourceArray);
1117: contentLength = resourceArray.length;
1118: }
1119: } else {
1120: return null;
1121: }
1122: return new Object[] { resource, new Long(contentLength) };
1123: }
1124:
1125: public static String failConsequences(TextSnippet snippet,
1126: String eventID) {
1127: // problems with snippet fetch
1128: if (yacyCore.seedDB.mySeed().isVirgin())
1129: return snippet.getError()
1130: + " (no consequences, no network connection)"; // no consequences if we do not have a network connection
1131: String urlHash = snippet.getUrl().hash();
1132: String querystring = kelondroMSetTools.setToString(snippet
1133: .getRemainingHashes(), ' ');
1134: if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING)
1135: || (snippet.getErrorCode() == ERROR_RESOURCE_LOADING)
1136: || (snippet.getErrorCode() == ERROR_PARSER_FAILED)
1137: || (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
1138: log.logInfo("error: '" + snippet.getError()
1139: + "', remove url = "
1140: + snippet.getUrl().toNormalform(false, true)
1141: + ", cause: " + snippet.getError());
1142: plasmaSwitchboard.getSwitchboard().wordIndex.loadedURL
1143: .remove(urlHash);
1144: plasmaSearchEvent event = plasmaSearchEvent
1145: .getEvent(eventID);
1146: assert plasmaSwitchboard.getSwitchboard() != null;
1147: assert plasmaSwitchboard.getSwitchboard().wordIndex != null;
1148: assert event != null : "eventID = " + eventID;
1149: assert event.getQuery() != null;
1150: plasmaSwitchboard.getSwitchboard().wordIndex
1151: .removeEntryMultiple(event.getQuery().queryHashes,
1152: urlHash);
1153: event.remove(urlHash);
1154: }
1155: if (snippet.getErrorCode() == ERROR_NO_MATCH) {
1156: log.logInfo("error: '" + snippet.getError()
1157: + "', remove words '" + querystring
1158: + "' for url = "
1159: + snippet.getUrl().toNormalform(false, true)
1160: + ", cause: " + snippet.getError());
1161: plasmaSwitchboard.getSwitchboard().wordIndex
1162: .removeEntryMultiple(snippet.remaingHashes, urlHash);
1163: plasmaSearchEvent.getEvent(eventID).remove(urlHash);
1164: }
1165: return snippet.getError();
1166: }
1167:
1168: }
|