// plasmaCrawlLURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate: 2008-01-30 21:58:30 +0000 (Mi, 30 Jan 2008) $
// $LastChangedRevision: 4420 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software, which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

/*
  This class provides storage functions for the plasma search engine:
  - the url-specific properties, including condenser results
  - the text content of the url
  Both entities are accessed with a hash based on the MD5 algorithm;
  the MD5 digest is not encoded as a hex value, but as a b64 value.
  (A hedged sketch of such a hash derivation follows the field
  declarations below.)
*/

package de.anomic.plasma;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;

import de.anomic.data.htmlTools;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroSplitTable;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

public final class plasmaCrawlLURL {

    // result stacks;
    // these have all entries of form
    // strings: urlHash + initiatorHash + executorHash
    private final LinkedList<String> externResultStack; // 1 - remote index: retrieved by other peer
    private final LinkedList<String> searchResultStack; // 2 - partly remote/local index: result of search queries
    private final LinkedList<String> transfResultStack; // 3 - partly remote/local index: result of index transfer
    private final LinkedList<String> proxyResultStack;  // 4 - local index: result of proxy fetch/prefetch
    private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
    private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external

    // the class object
    private kelondroIndex urlIndexFile;
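
    // The class comment at the top notes that index keys are MD5-based hashes
    // encoded in base64. A minimal sketch of how such a key could be derived;
    // this is an illustration only, not YaCy's actual hash computation (which
    // lives in yacyURL.hash()), and the encode(byte[]) call on the base64
    // coder is an assumption:
    private static String exampleUrlHash(String url) {
        try {
            // MD5 digest of the url string ...
            java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5");
            byte[] digest = md.digest(url.getBytes("UTF-8"));
            // ... encoded with the b64 coder instead of hex
            return kelondroBase64Order.enhancedCoder.encode(digest);
        } catch (Exception e) {
            return null;
        }
    }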

    public plasmaCrawlLURL(File indexPath, long preloadTime) {
        super();

        urlIndexFile = new kelondroSplitTable(
                new File(indexPath, "PUBLIC/TEXT"), "urls",
                preloadTime, indexURLEntry.rowdef, false);

        // init result stacks
        externResultStack = new LinkedList<String>();
        searchResultStack = new LinkedList<String>();
        transfResultStack = new LinkedList<String>();
        proxyResultStack = new LinkedList<String>();
        lcrawlResultStack = new LinkedList<String>();
        gcrawlResultStack = new LinkedList<String>();
    }

    public int size() {
        return urlIndexFile.size();
    }

    public void close() {
        if (urlIndexFile != null) {
            urlIndexFile.close();
            urlIndexFile = null;
        }
    }

    public synchronized void stack(indexURLEntry e, String initiatorHash, String executorHash, int stackType) {
        if (e == null) return;
        try {
            if (initiatorHash == null) initiatorHash = yacyURL.dummyHash;
            if (executorHash == null) executorHash = yacyURL.dummyHash;
            final String record = e.hash() + initiatorHash + executorHash;
            switch (stackType) {
                case 0: break;
                case 1: externResultStack.add(record); break;
                case 2: searchResultStack.add(record); break;
                case 3: transfResultStack.add(record); break;
                case 4: proxyResultStack.add(record); break;
                case 5: lcrawlResultStack.add(record); break;
                case 6: gcrawlResultStack.add(record); break;
            }
            return;
        } catch (Exception ex) {
            System.out.println("INTERNAL ERROR in stack(): " + ex.toString());
            return;
        }
    }
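
    // Usage sketch for stack() (hypothetical names: "lurl" is an instance of
    // this class, "e" a loaded indexURLEntry, and the peer hashes are
    // yacySeedDB.commonHashLength characters long). stackType 5 files the
    // entry as a local-crawl result, following the numbering documented at
    // the result stack declarations above:
    //
    //   lurl.stack(e, initiatorPeerHash, executorPeerHash, 5);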

    public synchronized void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) {
        gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
    }

    public synchronized int writeCacheSize() {
        if (urlIndexFile instanceof kelondroSplitTable)
            return ((kelondroSplitTable) urlIndexFile).writeBufferSize();
        if (urlIndexFile instanceof kelondroCache)
            return ((kelondroCache) urlIndexFile).writeBufferSize();
        return 0;
    }

    public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) {
        // generates an indexURLEntry using the url hash;
        // to speed up the access, the url hashes are buffered
        // in the hash cache.
        // we have two options to find the url:
        // - look into the hash cache
        // - look into the filed properties
        // if the url cannot be found, this returns null
        if (urlHash == null) return null;
        try {
            kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
            if (entry == null) return null;
            return new indexURLEntry(entry, searchedWord, ranking);
        } catch (IOException e) {
            return null;
        }
    }
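
    // Usage sketch for load() (assumption: the literal stands in for a real
    // url hash of yacySeedDB.commonHashLength characters; null/0 skip the
    // searched-word and ranking context):
    //
    //   indexURLEntry entry = lurl.load("AAAAAAAAAAAA", null, 0);
    //   if (entry == null) { /* hash is not present in the index */ }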

    public synchronized void store(indexURLEntry entry) throws IOException {
        // check if there is a more recent entry already in the DB
        indexURLEntry oldEntry;
        try {
            if (exists(entry.hash())) {
                oldEntry = load(entry.hash(), null, 0);
            } else {
                oldEntry = null;
            }
        } catch (Exception e) {
            e.printStackTrace();
            oldEntry = null;
        }
        if ((oldEntry != null) && (entry.isOlder(oldEntry))) {
            // the stored oldEntry is more recent, so nothing needs to be
            // written; urlHash and url of both entries are the same anyway
            return;
        }

        urlIndexFile.put(entry.toRowEntry(), new Date() /*entry.loaddate()*/);
    }

    public synchronized indexURLEntry newEntry(String propStr) {
        if (propStr != null && propStr.startsWith("{") && propStr.endsWith("}")) {
            try {
                return new indexURLEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
            } catch (kelondroException e) {
                // wrong format
                return null;
            }
        } else {
            return null;
        }
    }
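
    // Input sketch for newEntry() (assumption: serverCodings.s2p() parses the
    // brace-wrapped string into key=value properties; the actual property keys
    // are defined by the indexURLEntry constructor, so the keys below are
    // placeholders):
    //
    //   indexURLEntry e = lurl.newEntry("{hash=...,url=...}");
    //   // anything not wrapped in '{' and '}' yields null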

    public synchronized int getStackSize(int stack) {
        switch (stack) {
            case 1: return externResultStack.size();
            case 2: return searchResultStack.size();
            case 3: return transfResultStack.size();
            case 4: return proxyResultStack.size();
            case 5: return lcrawlResultStack.size();
            case 6: return gcrawlResultStack.size();
        }
        return -1;
    }

    public synchronized String getUrlHash(int stack, int pos) {
        final int w = yacySeedDB.commonHashLength;
        switch (stack) {
            case 1: return externResultStack.get(pos).substring(0, w);
            case 2: return searchResultStack.get(pos).substring(0, w);
            case 3: return transfResultStack.get(pos).substring(0, w);
            case 4: return proxyResultStack.get(pos).substring(0, w);
            case 5: return lcrawlResultStack.get(pos).substring(0, w);
            case 6: return gcrawlResultStack.get(pos).substring(0, w);
        }
        return null;
    }

    public synchronized String getInitiatorHash(int stack, int pos) {
        final int w = yacySeedDB.commonHashLength;
        switch (stack) {
            case 1: return externResultStack.get(pos).substring(w, w * 2);
            case 2: return searchResultStack.get(pos).substring(w, w * 2);
            case 3: return transfResultStack.get(pos).substring(w, w * 2);
            case 4: return proxyResultStack.get(pos).substring(w, w * 2);
            case 5: return lcrawlResultStack.get(pos).substring(w, w * 2);
            case 6: return gcrawlResultStack.get(pos).substring(w, w * 2);
        }
        return null;
    }

    public synchronized String getExecutorHash(int stack, int pos) {
        final int w = yacySeedDB.commonHashLength;
        switch (stack) {
            case 1: return externResultStack.get(pos).substring(w * 2, w * 3);
            case 2: return searchResultStack.get(pos).substring(w * 2, w * 3);
            case 3: return transfResultStack.get(pos).substring(w * 2, w * 3);
            case 4: return proxyResultStack.get(pos).substring(w * 2, w * 3);
            case 5: return lcrawlResultStack.get(pos).substring(w * 2, w * 3);
            case 6: return gcrawlResultStack.get(pos).substring(w * 2, w * 3);
        }
        return null;
    }
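
    // The three getters above all slice the same fixed-width record of three
    // concatenated hashes (urlHash + initiatorHash + executorHash). The layout
    // can be expressed as a single helper; this is an illustrative sketch, the
    // per-stack getters above remain the actual API (field 0 = url,
    // 1 = initiator, 2 = executor):
    private static String sliceHashField(String record, int field) {
        final int w = yacySeedDB.commonHashLength;
        return record.substring(field * w, (field + 1) * w);
    }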

    public synchronized boolean removeStack(int stack, int pos) {
        Object prevElement = null;
        switch (stack) {
            case 1: prevElement = externResultStack.remove(pos); break;
            case 2: prevElement = searchResultStack.remove(pos); break;
            case 3: prevElement = transfResultStack.remove(pos); break;
            case 4: prevElement = proxyResultStack.remove(pos); break;
            case 5: prevElement = lcrawlResultStack.remove(pos); break;
            case 6: prevElement = gcrawlResultStack.remove(pos); break;
        }
        return prevElement != null;
    }

    public synchronized void clearStack(int stack) {
        switch (stack) {
            case 1: externResultStack.clear(); break;
            case 2: searchResultStack.clear(); break;
            case 3: transfResultStack.clear(); break;
            case 4: proxyResultStack.clear(); break;
            case 5: lcrawlResultStack.clear(); break;
            case 6: gcrawlResultStack.clear(); break;
        }
    }

    public synchronized boolean remove(String urlHash) {
        if (urlHash == null) return false;
        try {
            kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes(), false);
            if (r == null) return false;
            for (int stack = 1; stack <= 6; stack++) {
                for (int i = getStackSize(stack) - 1; i >= 0; i--) {
                    if (getUrlHash(stack, i).equals(urlHash)) {
                        removeStack(stack, i);
                        return true;
                    }
                }
            }
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    public synchronized boolean exists(String urlHash) {
        if (urlIndexFile == null) return false; // case may happen during shutdown
        try {
            return urlIndexFile.has(urlHash.getBytes());
        } catch (IOException e) {
            return false;
        }
    }

    public kelondroCloneableIterator<indexURLEntry> entries(boolean up, String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, firstHash);
    }
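
    // Usage sketch for entries(): walk the whole URL index in ascending hash
    // order (a null firstHash starts at the beginning; "lurl" is a
    // hypothetical instance):
    //
    //   Iterator<indexURLEntry> it = lurl.entries(true, null);
    //   while (it.hasNext()) System.out.println(it.next().comp().url());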

    public class kiter implements kelondroCloneableIterator<indexURLEntry> {
        // enumerates entry elements
        private Iterator<kelondroRow.Entry> iter;
        private boolean error;
        boolean up;

        public kiter(boolean up, String firstHash) throws IOException {
            this.up = up;
            this.iter = plasmaCrawlLURL.this.urlIndexFile.rows(up,
                    (firstHash == null) ? null : firstHash.getBytes());
            this.error = false;
        }

        public kiter clone(Object secondHash) {
            try {
                return new kiter(up, (String) secondHash);
            } catch (IOException e) {
                return null;
            }
        }

        public final boolean hasNext() {
            if (this.error) return false;
            if (this.iter == null) return false;
            return this.iter.hasNext();
        }

        public final indexURLEntry next() {
            kelondroRow.Entry e = null;
            if (this.iter == null) return null;
            if (this.iter.hasNext()) e = this.iter.next();
            if (e == null) return null;
            return new indexURLEntry(e, null, 0);
        }

        public final void remove() {
            this.iter.remove();
        }
    }

    /**
     * Uses an iteration over the url hash index to detect malformed URL entries.
     * Damaged entries are collected in a HashSet; each is then either repaired
     * (via an HTTP HEAD check) or removed at the end of the function.
     */
    public void urldbcleanup() {
        serverLog log = new serverLog("URLDBCLEANUP");
        HashSet<String> damagedURLS = new HashSet<String>();
        try {
            Iterator<indexURLEntry> eiter = entries(true, null);
            int iteratorCount = 0;
            while (eiter.hasNext()) try {
                eiter.next();
                iteratorCount++;
            } catch (RuntimeException e) {
                if (e.getMessage() != null) {
                    String m = e.getMessage();
                    damagedURLS.add(m.substring(m.length() - 12));
                } else {
                    log.logSevere("RuntimeException:", e);
                }
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
            log.logInfo("URLs before: " + size()
                    + " entries loaded during iterator loop: " + iteratorCount
                    + " damaged URLs: " + damagedURLS.size());

            Iterator<String> eiter2 = damagedURLS.iterator();
            String urlHash;
            while (eiter2.hasNext()) {
                urlHash = eiter2.next();

                // trying to fix the invalid URL
                httpc theHttpc = null;
                String oldUrlStr = null;
                try {
                    // getting the url data as byte array
                    kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());

                    // getting the wrong url string
                    oldUrlStr = entry.getColString(1, null).trim();

                    int pos = -1;
                    if ((pos = oldUrlStr.indexOf("://")) != -1) {
                        // trying to correct the url
                        String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
                        yacyURL newUrl = new yacyURL(newUrlStr, null);

                        // doing a http head request to test if the url is correct
                        theHttpc = new httpc(
                                newUrl.getHost(), newUrl.getHost(), newUrl.getPort(),
                                30000, false,
                                plasmaSwitchboard.getSwitchboard().remoteProxyConfig,
                                null, null);
                        response res = theHttpc.HEAD(newUrl.getPath(), null);

                        if (res.statusCode == 200) {
                            entry.setCol(1, newUrl.toString().getBytes());
                            urlIndexFile.put(entry);
                            log.logInfo("UrlDB-Entry with urlHash '" + urlHash
                                    + "' corrected\n\tURL: " + oldUrlStr
                                    + " -> " + newUrlStr);
                        } else {
                            remove(urlHash);
                            log.logInfo("UrlDB-Entry with urlHash '" + urlHash
                                    + "' removed\n\tURL: " + oldUrlStr
                                    + "\n\tConnection Status: " + res.status);
                        }
                        theHttpc.close();
                    }
                } catch (Exception e) {
                    remove(urlHash);
                    log.logInfo("UrlDB-Entry with urlHash '" + urlHash
                            + "' removed\n\tURL: " + oldUrlStr
                            + "\n\tException: " + e.getMessage());
                } finally {
                    if (theHttpc != null) try {
                        theHttpc.close();
                    } catch (Exception e) {
                    }
                }
            }

            log.logInfo("URLs after: " + size()
                    + " damaged URLs: " + damagedURLS.size());
        } catch (IOException e) {
            log.logSevere("IOException", e);
        }
    }

    // The Cleaner class was provided as "UrldbCleaner" by Hydrox
    public Cleaner makeCleaner() {
        return new Cleaner();
    }

    public class Cleaner extends Thread {

        private boolean run = true;
        private boolean pause;
        public int blacklistedUrls = 0;
        public int totalSearchedUrls = 1;
        public String lastBlacklistedUrl = "";
        public String lastBlacklistedHash = "";
        public String lastUrl = "";
        public String lastHash = "";

        public Cleaner() {
        }

        public void run() {
            try {
                serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread started");
                final Iterator<indexURLEntry> eiter = entries(true, null);
                while (eiter.hasNext() && run) {
                    synchronized (this) {
                        if (this.pause) {
                            try {
                                this.wait();
                            } catch (InterruptedException e) {
                                serverLog.logWarning("URLDBCLEANER", "InterruptedException", e);
                                this.run = false;
                                return;
                            }
                        }
                    }
                    final indexURLEntry entry = eiter.next();
                    if (entry == null) {
                        serverLog.logFine("URLDBCLEANER", "entry == null");
                    } else if (entry.hash() == null) {
                        serverLog.logFine("URLDBCLEANER", ++blacklistedUrls
                                + " blacklisted ("
                                + ((double) blacklistedUrls / totalSearchedUrls) * 100
                                + "%): hash == null");
                    } else {
                        final indexURLEntry.Components comp = entry.comp();
                        totalSearchedUrls++;
                        if (comp.url() == null) {
                            serverLog.logFine("URLDBCLEANER", ++blacklistedUrls
                                    + " blacklisted ("
                                    + ((double) blacklistedUrls / totalSearchedUrls) * 100
                                    + "%): " + entry.hash() + " URL == null");
                            remove(entry.hash());
                        } else if (plasmaSwitchboard.urlBlacklist.isListed(
                                        plasmaURLPattern.BLACKLIST_CRAWLER, comp.url())
                                || plasmaSwitchboard.urlBlacklist.isListed(
                                        plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
                            lastBlacklistedUrl = comp.url().toNormalform(true, true);
                            lastBlacklistedHash = entry.hash();
                            serverLog.logFine("URLDBCLEANER", ++blacklistedUrls
                                    + " blacklisted ("
                                    + ((double) blacklistedUrls / totalSearchedUrls) * 100
                                    + "%): " + entry.hash() + " "
                                    + comp.url().toNormalform(false, true));
                            remove(entry.hash());
                            if (blacklistedUrls % 100 == 0) {
                                serverLog.logInfo("URLDBCLEANER", "Deleted "
                                        + blacklistedUrls
                                        + " URLs until now. Last deleted URL: "
                                        + lastBlacklistedUrl);
                            }
                        }
                        lastUrl = comp.url().toNormalform(true, true);
                        lastHash = entry.hash();
                    }
                }
            } catch (RuntimeException e) {
                if (e.getMessage() != null
                        && e.getMessage().indexOf("not found in LURL") != -1) {
                    serverLog.logWarning("URLDBCLEANER", "urlHash not found in LURL", e);
                } else {
                    serverLog.logWarning("URLDBCLEANER", "RuntimeException", e);
                    run = false;
                }
            } catch (IOException e) {
                e.printStackTrace();
                run = false;
            }
            serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped");
        }

        public void abort() {
            synchronized (this) {
                run = false;
                this.notifyAll();
            }
        }

        public void pause() {
            synchronized (this) {
                if (!pause) {
                    pause = true;
                    serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused");
                }
            }
        }

        public void endPause() {
            synchronized (this) {
                if (pause) {
                    pause = false;
                    this.notifyAll();
                    serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed");
                }
            }
        }
    }
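
    // Usage sketch for the Cleaner thread (hypothetical instance name "lurl"):
    //
    //   plasmaCrawlLURL.Cleaner cleaner = lurl.makeCleaner();
    //   cleaner.start();     // scans the index against the blacklist
    //   cleaner.pause();     // suspends between entries
    //   cleaner.endPause();  // resumes the scan
    //   cleaner.abort();     // stops the thread for good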

    private exportc exportthread = null;

    public boolean export(File f, String filter, int format, boolean dom) {
        if ((exportthread != null) && (exportthread.isAlive())) {
            serverLog.logWarning("LURL-EXPORT",
                    "cannot start another export thread, already one running");
            return false;
        }
        this.exportthread = new exportc(f, filter, format, dom);
        this.exportthread.start();
        return this.exportthread.isAlive();
    }
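
    // Usage sketch for export() (hypothetical target path; format 0 = plain
    // text, 1 = html, 2 = rss/xml; dom=true reduces the output to host names):
    //
    //   boolean started = lurl.export(new File("DATA/EXPORT/urls.txt"), ".*", 0, false);
    //   while (lurl.export_running()) Thread.yield();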

    public String export_failed() {
        if (exportthread == null) return null;
        return exportthread.failure;
    }

    public int export_count() {
        if (exportthread == null) return 0;
        return exportthread.count();
    }

    public boolean export_running() {
        if (exportthread == null) return false;
        return exportthread.isAlive();
    }

    public File export_file() {
        if (exportthread == null) return null;
        return exportthread.file();
    }

    public class exportc extends Thread {
        File f;
        String filter;
        int count;
        String failure;
        int format;
        boolean dom;
        kelondroRowSet doms;

        public exportc(File f, String filter, int format, boolean dom) {
            // format: 0=text, 1=html, 2=rss/xml
            this.f = f;
            this.filter = filter;
            this.count = 0;
            this.failure = null;
            this.format = format;
            // the dom reduction is not applicable to the rss/xml format;
            // override the flag before it is stored
            if (dom && format == 2) dom = false;
            this.dom = dom;
            this.doms = new kelondroRowSet(new kelondroRow("String hash-6",
                    kelondroBase64Order.enhancedCoder, 0), 0);
        }

        public void run() {
            try {
                f.getParentFile().mkdirs();
                PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
                if (format == 1) {
                    pw.println("<html><head></head><body>");
                }
                if (format == 2) {
                    pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                    pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
                    pw.println("<rss version=\"2.0\">");
                    pw.println("<channel>");
                    pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
                    pw.println("<description></description>");
                    pw.println("<link>http://yacy.net</link>");
                }

                Iterator<indexURLEntry> i = entries(true, null); // iterates indexURLEntry objects
                indexURLEntry entry;
                indexURLEntry.Components comp;
                String url;
                loop: while (i.hasNext()) {
                    entry = i.next();
                    comp = entry.comp();
                    url = comp.url().toNormalform(true, false);
                    if (!url.matches(filter)) continue;
                    if (dom) {
                        if (doms.has(entry.hash().substring(6).getBytes())) continue loop;
                        doms.add(entry.hash().substring(6).getBytes());
                        url = comp.url().getHost();
                        if (format == 0) {
                            pw.println(url);
                        }
                        if (format == 1) {
                            pw.println("<a href=\"http://" + url + "\">" + url + "</a><br>");
                        }
                    } else {
                        if (format == 0) {
                            pw.println(url);
                        }
                        if (format == 1) {
                            pw.println("<a href=\"" + url + "\">"
                                    + htmlTools.encodeUnicode2html(comp.dc_title(), true, true)
                                    + "</a><br>");
                        }
                        if (format == 2) {
                            pw.println("<item>");
                            pw.println("<title>"
                                    + htmlTools.encodeUnicode2html(comp.dc_title(), true, true)
                                    + "</title>");
                            pw.println("<link>" + yacyURL.escape(url) + "</link>");
                            if (comp.dc_creator().length() > 0) pw.println("<author>"
                                    + htmlTools.encodeUnicode2html(comp.dc_creator(), true, true)
                                    + "</author>");
                            if (comp.dc_subject().length() > 0) pw.println("<description>"
                                    + htmlTools.encodeUnicode2html(comp.dc_subject(), true, true)
                                    + "</description>");
                            pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
                            pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
                            pw.println("</item>");
                        }
                    }
                    count++;
                }
                if (format == 1) {
                    pw.println("</body></html>");
                }
                if (format == 2) {
                    pw.println("</channel>");
                    pw.println("</rss>");
                }
                pw.close();
            } catch (IOException e) {
                e.printStackTrace();
                this.failure = e.getMessage();
            }
            // terminate process
        }

        public File file() {
            return this.f;
        }

        public String failed() {
            return this.failure;
        }

        public int count() {
            return this.count;
        }
    }

    public static void main(String[] args) {
        // test-generation of url hashes for debugging
        // one argument required, will be treated as url
        // returns url-hash
        if (args[0].equals("-h")) try {
            // arg 1 is url
            System.out.println("HASH: " + (new yacyURL(args[1], null)).hash());
        } catch (MalformedURLException e) {
        }
        if (args[0].equals("-l")) try {
            // arg 1 is path to URLCache
            final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 0);
            final Iterator<indexURLEntry> enu = urls.entries(true, null);
            while (enu.hasNext()) {
                System.out.println(enu.next().toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
|