001: // plasmaWebStructure.java
002: // -----------------------------
003: // (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
004: // first published 15.05.2007 on http://yacy.net
005: //
006: // This is a part of YaCy, a peer-to-peer based web search engine
007: //
008: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
009: // $LastChangedRevision: 1986 $
010: // $LastChangedBy: orbiter $
011: //
012: // LICENSE
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027:
028: package de.anomic.plasma;
029:
030: import java.io.File;
031: import java.io.IOException;
032: import java.util.ConcurrentModificationException;
033: import java.util.Date;
034: import java.util.HashMap;
035: import java.util.Iterator;
036: import java.util.Map;
037: import java.util.SortedMap;
038: import java.util.TreeMap;
039: import java.util.TreeSet;
040:
041: import de.anomic.kelondro.kelondroBase64Order;
042: import de.anomic.server.serverDate;
043: import de.anomic.server.serverFileUtils;
044: import de.anomic.server.logging.serverLog;
045: import de.anomic.yacy.yacyURL;
046:
047: public class plasmaWebStructure {
048:
049: public static int maxCRLDump = 500000;
050: public static int maxCRGDump = 200000;
051: public static int maxref = 200; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
052: public static int maxhosts = 4000; // maximum number of hosts in web structure map
053:
054: private StringBuffer crg; // global citation references
055: private serverLog log;
056: private File rankingPath, structureFile;
057: private String crlFile, crgFile;
058: private TreeMap<String, String> structure; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
059:
060: public plasmaWebStructure(serverLog log, File rankingPath,
061: String crlFile, String crgFile, File structureFile) {
062: this .log = log;
063: this .rankingPath = rankingPath;
064: this .crlFile = crlFile;
065: this .crgFile = crgFile;
066: this .crg = new StringBuffer(maxCRGDump);
067: this .structure = new TreeMap<String, String>();
068: this .structureFile = structureFile;
069:
070: // load web structure
071: Map<String, String> loadedStructure = (this .structureFile
072: .exists()) ? serverFileUtils
073: .loadHashMap(this .structureFile)
074: : new TreeMap<String, String>();
075: if (loadedStructure != null)
076: this .structure.putAll(loadedStructure);
077:
078: // delete outdated entries in case the structure is too big
079: if (this .structure.size() > maxhosts) {
080: // fill a set with last-modified - dates of the structure
081: TreeSet<String> delset = new TreeSet<String>();
082: Map.Entry<String, String> entry;
083: Iterator<Map.Entry<String, String>> i = this .structure
084: .entrySet().iterator();
085: String key, value;
086: while (i.hasNext()) {
087: entry = i.next();
088: key = entry.getKey();
089: value = entry.getValue();
090: delset.add(value.substring(0, 8) + key);
091: }
092: int delcount = this .structure.size() - (maxhosts * 9 / 10);
093: Iterator<String> j = delset.iterator();
094: while ((delcount > 0) && (j.hasNext())) {
095: this .structure.remove(j.next().substring(8));
096: delcount--;
097: }
098: }
099: }
100:
101: public Integer[] /*(outlinksSame, outlinksOther)*/generateCitationReference(
102: yacyURL url, String baseurlhash, Date docDate,
103: plasmaParserDocument document, plasmaCondenser condenser) {
104: assert url.hash().equals(baseurlhash);
105:
106: // generate citation reference
107: Map<yacyURL, String> hl = document.getHyperlinks();
108: Iterator<yacyURL> it = hl.keySet().iterator();
109: String nexturlhash;
110: StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
111: StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
112: String lhp = baseurlhash.substring(6); // local hash part
113: int GCount = 0;
114: int LCount = 0;
115: while (it.hasNext()) {
116: nexturlhash = it.next().hash();
117: if (nexturlhash != null) {
118: if (nexturlhash.substring(6).equals(lhp)) {
119: // this is a inbound link
120: cpl.append(nexturlhash.substring(0, 6)); // store only local part
121: LCount++;
122: } else {
123: // this is a outbound link
124: cpg.append(nexturlhash); // store complete hash
125: GCount++;
126: }
127: }
128: }
129:
130: // append this reference to buffer
131: // generate header info
132: String head = baseurlhash
133: + "="
134: + plasmaWordIndex.microDateHoursStr(docDate.getTime())
135: + // latest update timestamp of the URL
136: plasmaWordIndex.microDateHoursStr(System
137: .currentTimeMillis())
138: + // last visit timestamp of the URL
139: kelondroBase64Order.enhancedCoder.encodeLongSmart(
140: LCount, 2)
141: + // count of links to local resources
142: kelondroBase64Order.enhancedCoder.encodeLongSmart(
143: GCount, 2)
144: + // count of links to global resources
145: kelondroBase64Order.enhancedCoder.encodeLongSmart(
146: document.getImages().size(), 2)
147: + // count of Images in document
148: kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2)
149: + // count of links to other documents
150: kelondroBase64Order.enhancedCoder.encodeLongSmart(
151: document.getTextLength(), 3) + // length of plain text in bytes
152: kelondroBase64Order.enhancedCoder.encodeLongSmart(
153: condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
154: kelondroBase64Order.enhancedCoder.encodeLongSmart(
155: condenser.words().size(), 3) + // count of all unique words
156: kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
157:
158: //crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
159: crg.append(head);
160: crg.append('|');
161: crg.append(cpg);
162: crg.append((char) 13);
163: crg.append((char) 10);
164:
165: learn(url, cpg);
166:
167: // if buffer is full, flush it.
168: /*
169: if (crl.length() > maxCRLDump) {
170: flushCitationReference(crl, "crl");
171: crl = new StringBuffer(maxCRLDump);
172: }
173: **/
174: if (crg.length() > maxCRGDump) {
175: flushCitationReference("crg");
176: crg = new StringBuffer(maxCRGDump);
177: }
178:
179: return new Integer[] { new Integer(LCount), new Integer(GCount) };
180: }
181:
182: public void flushCitationReference(String type) {
183: if (crg.length() < 12)
184: return;
185: String filename = type.toUpperCase() + "-A-"
186: + new serverDate().toShortString(true) + "."
187: + crg.substring(0, 12) + ".cr.gz";
188: File path = new File(rankingPath,
189: (type.equals("crl")) ? crlFile : crgFile);
190: path.mkdirs();
191: File file = new File(path, filename);
192:
193: // generate header
194: StringBuffer header = new StringBuffer(200);
195: header.append("# Name=YaCy "
196: + ((type.equals("crl")) ? "Local" : "Global")
197: + " Citation Reference Ticket");
198: header.append((char) 13);
199: header.append((char) 10);
200: header.append("# Created=" + System.currentTimeMillis());
201: header.append((char) 13);
202: header.append((char) 10);
203: header
204: .append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-"
205: + ((type.equals("crl")) ? "6" : "12") + ">");
206: header.append((char) 13);
207: header.append((char) 10);
208: header.append("# ---");
209: header.append((char) 13);
210: header.append((char) 10);
211: crg.insert(0, header.toString());
212: try {
213: serverFileUtils.writeAndGZip(crg.toString().getBytes(),
214: file);
215: log.logFine("wrote citation reference dump "
216: + file.toString());
217: } catch (IOException e) {
218: e.printStackTrace();
219: }
220: }
221:
222: private static int refstr2count(String refs) {
223: if ((refs == null) || (refs.length() <= 8))
224: return 0;
225: assert (refs.length() - 8) % 10 == 0;
226: return (refs.length() - 8) / 10;
227: }
228:
229: private static Map<String, Integer> refstr2map(String refs) {
230: if ((refs == null) || (refs.length() <= 8))
231: return new HashMap<String, Integer>();
232: Map<String, Integer> map = new HashMap<String, Integer>();
233: String c;
234: int refsc = refstr2count(refs);
235: for (int i = 0; i < refsc; i++) {
236: c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
237: map.put(c.substring(0, 6), new Integer(Integer.parseInt(c
238: .substring(6), 16)));
239: }
240: return map;
241: }
242:
243: private static String map2refstr(Map<String, Integer> map) {
244: StringBuffer s = new StringBuffer(map.size() * 10);
245: s.append(serverDate.formatShortDay(new Date()));
246: Iterator<Map.Entry<String, Integer>> i = map.entrySet()
247: .iterator();
248: Map.Entry<String, Integer> entry;
249: String h;
250: while (i.hasNext()) {
251: entry = i.next();
252: s.append(entry.getKey());
253: h = Integer.toHexString(entry.getValue().intValue());
254: if (h.length() == 0) {
255: s.append("0000");
256: } else if (h.length() == 1) {
257: s.append("000").append(h);
258: } else if (h.length() == 2) {
259: s.append("00").append(h);
260: } else if (h.length() == 3) {
261: s.append('0').append(h);
262: } else if (h.length() == 4) {
263: s.append(h);
264: } else {
265: s.append("FFFF");
266: }
267: }
268: return s.toString();
269: }
270:
271: public Map<String, Integer> references(String domhash) {
272: // returns a map with a domhash(String):refcount(Integer) relation
273: assert domhash.length() == 6;
274: SortedMap<String, String> tailMap = structure.tailMap(domhash);
275: if ((tailMap == null) || (tailMap.size() == 0))
276: return new HashMap<String, Integer>();
277: String key = tailMap.firstKey();
278: if (key.startsWith(domhash)) {
279: return refstr2map(tailMap.get(key));
280: } else {
281: return new HashMap<String, Integer>();
282: }
283: }
284:
285: public int referencesCount(String domhash) {
286: // returns the number of domains that are referenced by this domhash
287: assert domhash.length() == 6 : "domhash = " + domhash;
288: try {
289: SortedMap<String, String> tailMap = structure
290: .tailMap(domhash);
291: if ((tailMap == null) || (tailMap.size() == 0))
292: return 0;
293: String key = tailMap.firstKey();
294: if (key.startsWith(domhash)) {
295: return refstr2count(tailMap.get(key));
296: } else {
297: return 0;
298: }
299: } catch (ConcurrentModificationException e) {
300: return 0;
301: }
302: }
303:
304: public String resolveDomHash2DomString(String domhash) {
305: // returns the domain as string, null if unknown
306: assert domhash.length() == 6;
307: try {
308: SortedMap<String, String> tailMap = structure
309: .tailMap(domhash);
310: if ((tailMap == null) || (tailMap.size() == 0))
311: return null;
312: String key = tailMap.firstKey();
313: if (key.startsWith(domhash)) {
314: return key.substring(7);
315: } else {
316: return null;
317: }
318: } catch (ConcurrentModificationException e) {
319: // we don't want to implement a synchronization here,
320: // because this is 'only' used for a graphics application
321: // just return null
322: return null;
323: }
324: }
325:
326: private void learn(yacyURL url, StringBuffer reference /*string of b64(12digits)-hashes*/) {
327: String domhash = url.hash().substring(6);
328:
329: // parse the new reference string and join it with the stored references
330: Map<String, Integer> refs = references(domhash);
331: assert reference.length() % 12 == 0;
332: String dom;
333: int c;
334: for (int i = 0; i < reference.length() / 12; i++) {
335: dom = reference.substring(i * 12 + 6, (i + 1) * 12);
336: c = 0;
337: if (refs.containsKey(dom)) {
338: c = ((Integer) refs.get(dom)).intValue();
339: }
340: refs.put(dom, new Integer(++c));
341: }
342:
343: // check if the maxref is exceeded
344: if (refs.size() > maxref) {
345: int shrink = refs.size() - (maxref * 9 / 10);
346: delloop: while (shrink > 0) {
347: // shrink the references: the entry with the smallest number of references is removed
348: int minrefcount = Integer.MAX_VALUE;
349: String minrefkey = null;
350: Iterator<Map.Entry<String, Integer>> i = refs
351: .entrySet().iterator();
352: Map.Entry<String, Integer> entry;
353: findloop: while (i.hasNext()) {
354: entry = i.next();
355: if (entry.getValue().intValue() < minrefcount) {
356: minrefcount = entry.getValue().intValue();
357: minrefkey = entry.getKey();
358: }
359: if (minrefcount == 1)
360: break findloop;
361: }
362: // remove the smallest
363: if (minrefkey == null)
364: break delloop;
365: refs.remove(minrefkey);
366: shrink--;
367: }
368: }
369:
370: // store the map back to the structure
371: structure.put(domhash + "," + url.getHost(), map2refstr(refs));
372: }
373:
374: public void saveWebStructure() {
375: try {
376: serverFileUtils
377: .saveMap(
378: this .structureFile,
379: this .structure,
380: "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
381: } catch (IOException e) {
382: e.printStackTrace();
383: }
384: }
385:
386: public String hostWithMaxReferences() {
387: // find domain with most references
388: Iterator<Map.Entry<String, String>> i = structure.entrySet()
389: .iterator();
390: int refsize, maxref = 0;
391: String maxhost = null;
392: Map.Entry<String, String> entry;
393: while (i.hasNext()) {
394: entry = i.next();
395: refsize = entry.getValue().length();
396: if (refsize > maxref) {
397: maxref = refsize;
398: maxhost = entry.getKey().substring(7);
399: }
400: }
401: return maxhost;
402: }
403:
404: public Iterator<structureEntry> structureEntryIterator() {
405: // iterates objects of type structureEntry
406: return new structureIterator();
407: }
408:
409: public class structureIterator implements Iterator<structureEntry> {
410:
411: private Iterator<Map.Entry<String, String>> i;
412: private structureEntry nextentry;
413:
414: public structureIterator() {
415: i = structure.entrySet().iterator();
416: next0();
417: }
418:
419: public boolean hasNext() {
420: return nextentry != null;
421: }
422:
423: private void next0() {
424: Map.Entry<String, String> entry = null;
425: String dom = null, ref;
426: while (i.hasNext()) {
427: entry = i.next();
428: dom = entry.getKey();
429: if (dom.length() >= 8)
430: break;
431: if (!i.hasNext()) {
432: nextentry = null;
433: return;
434: }
435: }
436: if ((entry == null) || (dom == null)) {
437: nextentry = null;
438: return;
439: }
440: ref = entry.getValue();
441: nextentry = new structureEntry(dom.substring(0, 6), dom
442: .substring(7), ref.substring(0, 8), refstr2map(ref));
443: }
444:
445: public structureEntry next() {
446: structureEntry r = nextentry;
447: next0();
448: return r;
449: }
450:
451: public void remove() {
452: throw new UnsupportedOperationException("not implemented");
453: }
454:
455: }
456:
457: public class structureEntry {
458: public String domhash, domain, date;
459: public Map<String, Integer> references;
460:
461: public structureEntry(String domhash, String domain,
462: String date, Map<String, Integer> references) {
463: this .domhash = domhash;
464: this .domain = domain;
465: this .date = date;
466: this .references = references;
467: }
468: }
469:
470: public void close() {
471: log.logInfo("Saving Web Structure File");
472: saveWebStructure();
473: }
474: }
|