Source Code Cross Referenced for plasmaWebStructure.java in  » Search-Engine » yacy » de » anomic » plasma » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » yacy » de.anomic.plasma 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        // plasmaWebStructure.java
002:        // -----------------------------
003:        // (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
004:        // first published 15.05.2007 on http://yacy.net
005:        //
006:        // This is a part of YaCy, a peer-to-peer based web search engine
007:        //
008:        // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
009:        // $LastChangedRevision: 1986 $
010:        // $LastChangedBy: orbiter $
011:        //
012:        // LICENSE
013:        // 
014:        // This program is free software; you can redistribute it and/or modify
015:        // it under the terms of the GNU General Public License as published by
016:        // the Free Software Foundation; either version 2 of the License, or
017:        // (at your option) any later version.
018:        //
019:        // This program is distributed in the hope that it will be useful,
020:        // but WITHOUT ANY WARRANTY; without even the implied warranty of
021:        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
022:        // GNU General Public License for more details.
023:        //
024:        // You should have received a copy of the GNU General Public License
025:        // along with this program; if not, write to the Free Software
026:        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
027:
028:        package de.anomic.plasma;
029:
030:        import java.io.File;
031:        import java.io.IOException;
032:        import java.util.ConcurrentModificationException;
033:        import java.util.Date;
034:        import java.util.HashMap;
035:        import java.util.Iterator;
036:        import java.util.Map;
037:        import java.util.SortedMap;
038:        import java.util.TreeMap;
039:        import java.util.TreeSet;
040:
041:        import de.anomic.kelondro.kelondroBase64Order;
042:        import de.anomic.server.serverDate;
043:        import de.anomic.server.serverFileUtils;
044:        import de.anomic.server.logging.serverLog;
045:        import de.anomic.yacy.yacyURL;
046:
047:        public class plasmaWebStructure {
048:
049:            public static int maxCRLDump = 500000;
050:            public static int maxCRGDump = 200000;
051:            public static int maxref = 200; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
052:            public static int maxhosts = 4000; // maximum number of hosts in web structure map
053:
054:            private StringBuffer crg; // global citation references
055:            private serverLog log;
056:            private File rankingPath, structureFile;
057:            private String crlFile, crgFile;
058:            private TreeMap<String, String> structure; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
059:
060:            public plasmaWebStructure(serverLog log, File rankingPath,
061:                    String crlFile, String crgFile, File structureFile) {
062:                this .log = log;
063:                this .rankingPath = rankingPath;
064:                this .crlFile = crlFile;
065:                this .crgFile = crgFile;
066:                this .crg = new StringBuffer(maxCRGDump);
067:                this .structure = new TreeMap<String, String>();
068:                this .structureFile = structureFile;
069:
070:                // load web structure
071:                Map<String, String> loadedStructure = (this .structureFile
072:                        .exists()) ? serverFileUtils
073:                        .loadHashMap(this .structureFile)
074:                        : new TreeMap<String, String>();
075:                if (loadedStructure != null)
076:                    this .structure.putAll(loadedStructure);
077:
078:                // delete outdated entries in case the structure is too big
079:                if (this .structure.size() > maxhosts) {
080:                    // fill a set with last-modified - dates of the structure
081:                    TreeSet<String> delset = new TreeSet<String>();
082:                    Map.Entry<String, String> entry;
083:                    Iterator<Map.Entry<String, String>> i = this .structure
084:                            .entrySet().iterator();
085:                    String key, value;
086:                    while (i.hasNext()) {
087:                        entry = i.next();
088:                        key = entry.getKey();
089:                        value = entry.getValue();
090:                        delset.add(value.substring(0, 8) + key);
091:                    }
092:                    int delcount = this .structure.size() - (maxhosts * 9 / 10);
093:                    Iterator<String> j = delset.iterator();
094:                    while ((delcount > 0) && (j.hasNext())) {
095:                        this .structure.remove(j.next().substring(8));
096:                        delcount--;
097:                    }
098:                }
099:            }
100:
101:            public Integer[] /*(outlinksSame, outlinksOther)*/generateCitationReference(
102:                    yacyURL url, String baseurlhash, Date docDate,
103:                    plasmaParserDocument document, plasmaCondenser condenser) {
104:                assert url.hash().equals(baseurlhash);
105:
106:                // generate citation reference
107:                Map<yacyURL, String> hl = document.getHyperlinks();
108:                Iterator<yacyURL> it = hl.keySet().iterator();
109:                String nexturlhash;
110:                StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
111:                StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
112:                String lhp = baseurlhash.substring(6); // local hash part
113:                int GCount = 0;
114:                int LCount = 0;
115:                while (it.hasNext()) {
116:                    nexturlhash = it.next().hash();
117:                    if (nexturlhash != null) {
118:                        if (nexturlhash.substring(6).equals(lhp)) {
119:                            // this is a inbound link
120:                            cpl.append(nexturlhash.substring(0, 6)); // store only local part
121:                            LCount++;
122:                        } else {
123:                            // this is a outbound link
124:                            cpg.append(nexturlhash); // store complete hash
125:                            GCount++;
126:                        }
127:                    }
128:                }
129:
130:                // append this reference to buffer
131:                // generate header info
132:                String head = baseurlhash
133:                        + "="
134:                        + plasmaWordIndex.microDateHoursStr(docDate.getTime())
135:                        + // latest update timestamp of the URL
136:                        plasmaWordIndex.microDateHoursStr(System
137:                                .currentTimeMillis())
138:                        + // last visit timestamp of the URL
139:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
140:                                LCount, 2)
141:                        + // count of links to local resources
142:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
143:                                GCount, 2)
144:                        + // count of links to global resources
145:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
146:                                document.getImages().size(), 2)
147:                        + // count of Images in document
148:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2)
149:                        + // count of links to other documents
150:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
151:                                document.getTextLength(), 3) + // length of plain text in bytes
152:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
153:                                condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
154:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(
155:                                condenser.words().size(), 3) + // count of all unique words
156:                        kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
157:
158:                //crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
159:                crg.append(head);
160:                crg.append('|');
161:                crg.append(cpg);
162:                crg.append((char) 13);
163:                crg.append((char) 10);
164:
165:                learn(url, cpg);
166:
167:                // if buffer is full, flush it.
168:                /*
169:                if (crl.length() > maxCRLDump) {
170:                    flushCitationReference(crl, "crl");
171:                    crl = new StringBuffer(maxCRLDump);
172:                }
173:                 **/
174:                if (crg.length() > maxCRGDump) {
175:                    flushCitationReference("crg");
176:                    crg = new StringBuffer(maxCRGDump);
177:                }
178:
179:                return new Integer[] { new Integer(LCount), new Integer(GCount) };
180:            }
181:
182:            public void flushCitationReference(String type) {
183:                if (crg.length() < 12)
184:                    return;
185:                String filename = type.toUpperCase() + "-A-"
186:                        + new serverDate().toShortString(true) + "."
187:                        + crg.substring(0, 12) + ".cr.gz";
188:                File path = new File(rankingPath,
189:                        (type.equals("crl")) ? crlFile : crgFile);
190:                path.mkdirs();
191:                File file = new File(path, filename);
192:
193:                // generate header
194:                StringBuffer header = new StringBuffer(200);
195:                header.append("# Name=YaCy "
196:                        + ((type.equals("crl")) ? "Local" : "Global")
197:                        + " Citation Reference Ticket");
198:                header.append((char) 13);
199:                header.append((char) 10);
200:                header.append("# Created=" + System.currentTimeMillis());
201:                header.append((char) 13);
202:                header.append((char) 10);
203:                header
204:                        .append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-"
205:                                + ((type.equals("crl")) ? "6" : "12") + ">");
206:                header.append((char) 13);
207:                header.append((char) 10);
208:                header.append("# ---");
209:                header.append((char) 13);
210:                header.append((char) 10);
211:                crg.insert(0, header.toString());
212:                try {
213:                    serverFileUtils.writeAndGZip(crg.toString().getBytes(),
214:                            file);
215:                    log.logFine("wrote citation reference dump "
216:                            + file.toString());
217:                } catch (IOException e) {
218:                    e.printStackTrace();
219:                }
220:            }
221:
222:            private static int refstr2count(String refs) {
223:                if ((refs == null) || (refs.length() <= 8))
224:                    return 0;
225:                assert (refs.length() - 8) % 10 == 0;
226:                return (refs.length() - 8) / 10;
227:            }
228:
229:            private static Map<String, Integer> refstr2map(String refs) {
230:                if ((refs == null) || (refs.length() <= 8))
231:                    return new HashMap<String, Integer>();
232:                Map<String, Integer> map = new HashMap<String, Integer>();
233:                String c;
234:                int refsc = refstr2count(refs);
235:                for (int i = 0; i < refsc; i++) {
236:                    c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
237:                    map.put(c.substring(0, 6), new Integer(Integer.parseInt(c
238:                            .substring(6), 16)));
239:                }
240:                return map;
241:            }
242:
243:            private static String map2refstr(Map<String, Integer> map) {
244:                StringBuffer s = new StringBuffer(map.size() * 10);
245:                s.append(serverDate.formatShortDay(new Date()));
246:                Iterator<Map.Entry<String, Integer>> i = map.entrySet()
247:                        .iterator();
248:                Map.Entry<String, Integer> entry;
249:                String h;
250:                while (i.hasNext()) {
251:                    entry = i.next();
252:                    s.append(entry.getKey());
253:                    h = Integer.toHexString(entry.getValue().intValue());
254:                    if (h.length() == 0) {
255:                        s.append("0000");
256:                    } else if (h.length() == 1) {
257:                        s.append("000").append(h);
258:                    } else if (h.length() == 2) {
259:                        s.append("00").append(h);
260:                    } else if (h.length() == 3) {
261:                        s.append('0').append(h);
262:                    } else if (h.length() == 4) {
263:                        s.append(h);
264:                    } else {
265:                        s.append("FFFF");
266:                    }
267:                }
268:                return s.toString();
269:            }
270:
271:            public Map<String, Integer> references(String domhash) {
272:                // returns a map with a domhash(String):refcount(Integer) relation
273:                assert domhash.length() == 6;
274:                SortedMap<String, String> tailMap = structure.tailMap(domhash);
275:                if ((tailMap == null) || (tailMap.size() == 0))
276:                    return new HashMap<String, Integer>();
277:                String key = tailMap.firstKey();
278:                if (key.startsWith(domhash)) {
279:                    return refstr2map(tailMap.get(key));
280:                } else {
281:                    return new HashMap<String, Integer>();
282:                }
283:            }
284:
285:            public int referencesCount(String domhash) {
286:                // returns the number of domains that are referenced by this domhash
287:                assert domhash.length() == 6 : "domhash = " + domhash;
288:                try {
289:                    SortedMap<String, String> tailMap = structure
290:                            .tailMap(domhash);
291:                    if ((tailMap == null) || (tailMap.size() == 0))
292:                        return 0;
293:                    String key = tailMap.firstKey();
294:                    if (key.startsWith(domhash)) {
295:                        return refstr2count(tailMap.get(key));
296:                    } else {
297:                        return 0;
298:                    }
299:                } catch (ConcurrentModificationException e) {
300:                    return 0;
301:                }
302:            }
303:
304:            public String resolveDomHash2DomString(String domhash) {
305:                // returns the domain as string, null if unknown
306:                assert domhash.length() == 6;
307:                try {
308:                    SortedMap<String, String> tailMap = structure
309:                            .tailMap(domhash);
310:                    if ((tailMap == null) || (tailMap.size() == 0))
311:                        return null;
312:                    String key = tailMap.firstKey();
313:                    if (key.startsWith(domhash)) {
314:                        return key.substring(7);
315:                    } else {
316:                        return null;
317:                    }
318:                } catch (ConcurrentModificationException e) {
319:                    // we don't want to implement a synchronization here,
320:                    // because this is 'only' used for a graphics application
321:                    // just return null
322:                    return null;
323:                }
324:            }
325:
326:            private void learn(yacyURL url, StringBuffer reference /*string of b64(12digits)-hashes*/) {
327:                String domhash = url.hash().substring(6);
328:
329:                // parse the new reference string and join it with the stored references
330:                Map<String, Integer> refs = references(domhash);
331:                assert reference.length() % 12 == 0;
332:                String dom;
333:                int c;
334:                for (int i = 0; i < reference.length() / 12; i++) {
335:                    dom = reference.substring(i * 12 + 6, (i + 1) * 12);
336:                    c = 0;
337:                    if (refs.containsKey(dom)) {
338:                        c = ((Integer) refs.get(dom)).intValue();
339:                    }
340:                    refs.put(dom, new Integer(++c));
341:                }
342:
343:                // check if the maxref is exceeded
344:                if (refs.size() > maxref) {
345:                    int shrink = refs.size() - (maxref * 9 / 10);
346:                    delloop: while (shrink > 0) {
347:                        // shrink the references: the entry with the smallest number of references is removed
348:                        int minrefcount = Integer.MAX_VALUE;
349:                        String minrefkey = null;
350:                        Iterator<Map.Entry<String, Integer>> i = refs
351:                                .entrySet().iterator();
352:                        Map.Entry<String, Integer> entry;
353:                        findloop: while (i.hasNext()) {
354:                            entry = i.next();
355:                            if (entry.getValue().intValue() < minrefcount) {
356:                                minrefcount = entry.getValue().intValue();
357:                                minrefkey = entry.getKey();
358:                            }
359:                            if (minrefcount == 1)
360:                                break findloop;
361:                        }
362:                        // remove the smallest
363:                        if (minrefkey == null)
364:                            break delloop;
365:                        refs.remove(minrefkey);
366:                        shrink--;
367:                    }
368:                }
369:
370:                // store the map back to the structure
371:                structure.put(domhash + "," + url.getHost(), map2refstr(refs));
372:            }
373:
374:            public void saveWebStructure() {
375:                try {
376:                    serverFileUtils
377:                            .saveMap(
378:                                    this .structureFile,
379:                                    this .structure,
380:                                    "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
381:                } catch (IOException e) {
382:                    e.printStackTrace();
383:                }
384:            }
385:
386:            public String hostWithMaxReferences() {
387:                // find domain with most references
388:                Iterator<Map.Entry<String, String>> i = structure.entrySet()
389:                        .iterator();
390:                int refsize, maxref = 0;
391:                String maxhost = null;
392:                Map.Entry<String, String> entry;
393:                while (i.hasNext()) {
394:                    entry = i.next();
395:                    refsize = entry.getValue().length();
396:                    if (refsize > maxref) {
397:                        maxref = refsize;
398:                        maxhost = entry.getKey().substring(7);
399:                    }
400:                }
401:                return maxhost;
402:            }
403:
404:            public Iterator<structureEntry> structureEntryIterator() {
405:                // iterates objects of type structureEntry
406:                return new structureIterator();
407:            }
408:
409:            public class structureIterator implements  Iterator<structureEntry> {
410:
411:                private Iterator<Map.Entry<String, String>> i;
412:                private structureEntry nextentry;
413:
414:                public structureIterator() {
415:                    i = structure.entrySet().iterator();
416:                    next0();
417:                }
418:
419:                public boolean hasNext() {
420:                    return nextentry != null;
421:                }
422:
423:                private void next0() {
424:                    Map.Entry<String, String> entry = null;
425:                    String dom = null, ref;
426:                    while (i.hasNext()) {
427:                        entry = i.next();
428:                        dom = entry.getKey();
429:                        if (dom.length() >= 8)
430:                            break;
431:                        if (!i.hasNext()) {
432:                            nextentry = null;
433:                            return;
434:                        }
435:                    }
436:                    if ((entry == null) || (dom == null)) {
437:                        nextentry = null;
438:                        return;
439:                    }
440:                    ref = entry.getValue();
441:                    nextentry = new structureEntry(dom.substring(0, 6), dom
442:                            .substring(7), ref.substring(0, 8), refstr2map(ref));
443:                }
444:
445:                public structureEntry next() {
446:                    structureEntry r = nextentry;
447:                    next0();
448:                    return r;
449:                }
450:
451:                public void remove() {
452:                    throw new UnsupportedOperationException("not implemented");
453:                }
454:
455:            }
456:
457:            public class structureEntry {
458:                public String domhash, domain, date;
459:                public Map<String, Integer> references;
460:
461:                public structureEntry(String domhash, String domain,
462:                        String date, Map<String, Integer> references) {
463:                    this .domhash = domhash;
464:                    this .domain = domain;
465:                    this .date = date;
466:                    this .references = references;
467:                }
468:            }
469:
470:            public void close() {
471:                log.logInfo("Saving Web Structure File");
472:                saveWebStructure();
473:            }
474:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.