/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004 Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *  $Source$
 *  $Date: 2006-08-11 11:22:53 +0200 (Fr, 11 Aug 2006) $
 *  $Author: til132 $
 *  $Revision: 226 $
 */
package net.sf.regain.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.StartUrl;
import net.sf.regain.crawler.config.UrlPattern;
import net.sf.regain.crawler.config.WhiteListEntry;
import net.sf.regain.crawler.document.RawDocument;

import org.apache.log4j.Logger;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

/**
 * Crawls all configured start pages for URLs. Depending on the configuration,
 * the found pages are only loaded, added to the search index or in turn
 * crawled for further URLs.
 * <p>
 * For every URL the black list and the white list decide whether it is
 * ignored or processed. If <CODE>loadUnparsedUrls</CODE> was set to
 * <CODE>false</CODE>, URLs that are neither parsed nor indexed are ignored
 * as well.
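 * <p>
 * A minimal usage sketch (how the CrawlerConfig is obtained is up to the
 * caller and only assumed here):
 * <pre>
 *   CrawlerConfig config = ...;  // load the crawler configuration somehow
 *   Crawler crawler = new Crawler(config);
 *   // Update an existing index, retry previously failed documents and
 *   // process all white list entries (null = no restriction)
 *   crawler.run(true, true, null);
 *   System.out.println("Errors: " + crawler.getErrorCount());
 * </pre>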
 *
 * @author Til Schneider, www.murfman.de
 */
public class Crawler implements ErrorLogger {

  /** The logger for this class. */
  private static Logger mLog = Logger.getLogger(Crawler.class);

  /** The configuration with the preferences. */
  private CrawlerConfig mConfiguration;

  /** The URL checker. */
  private UrlChecker mUrlChecker;

  /** The list of jobs that still have to be processed. */
  private LinkedList mJobList;

  /** The number of errors that occurred. */
  private int mErrorCount;

  /**
   * The number of fatal errors that occurred.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   */
  private int mFatalErrorCount;

  /** The current crawler job. May be null. */
  private CrawlerJob mCurrentJob;

  /**
   * Contains all found dead links.
   * <p>
   * Contains Object[]s with two elements: The first is the URL that couldn't
   * be found (a String), the second is the URL of the document where the dead
   * link was found (a String).
   */
  private LinkedList mDeadlinkList;

  /** The UrlPatterns the HTML parser should use to identify URLs. */
  private UrlPattern[] mHtmlParserUrlPatternArr;
  /**
   * The regular expressions that belong to the respective UrlPattern for the
   * HTML parser.
   *
   * @see #mHtmlParserUrlPatternArr
   */
  private RE[] mHtmlParserPatternReArr;

  /** The profiler that measures the whole crawler jobs. */
  private Profiler mCrawlerJobProfiler;
  /** The profiler that measures the HTML parser. */
  private Profiler mHtmlParsingProfiler;

  /** The IndexWriterManager to use for adding documents to the index. */
  private IndexWriterManager mIndexWriterManager;

  /** Specifies whether the crawler should pause as soon as possible. */
  private boolean mShouldPause;

  /**
   * Creates a new instance of Crawler.
   *
   * @param config The Configuration
   *
   * @throws RegainException If the regular expressions have errors.
   */
  public Crawler(CrawlerConfig config) throws RegainException {
    Profiler.clearRegisteredProfilers();

    mCrawlerJobProfiler = new Profiler("Whole crawler jobs", "jobs");
    mHtmlParsingProfiler = new Profiler("Parsed HTML documents", "docs");

    mConfiguration = config;

    mJobList = new LinkedList();
    mDeadlinkList = new LinkedList();

    mFatalErrorCount = 0;

    RawDocument.setHttpTimeoutSecs(config.getHttpTimeoutSecs());

    mHtmlParserUrlPatternArr = config.getHtmlParserUrlPatterns();
    mHtmlParserPatternReArr = new RE[mHtmlParserUrlPatternArr.length];
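    // Compile the regular expression of every HTML parser URL pattern up
    // front, so a broken pattern is detected before crawling starts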
    for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
      String regex = mHtmlParserUrlPatternArr[i].getRegexPattern();
      try {
        mHtmlParserPatternReArr[i] = new RE(regex);
      } catch (RESyntaxException exc) {
        throw new RegainException("Regular expression of HTML parser pattern #"
            + (i + 1) + " has a wrong syntax: '" + regex + "'", exc);
      }
    }
  }

  /**
   * Gets the number of processed documents.
   *
   * @return The number of processed documents.
   */
  public int getFinishedJobCount() {
    return mCrawlerJobProfiler.getMeasureCount();
  }

  /**
   * Gets the number of documents that were in the (old) index when the
   * IndexWriterManager was created.
   *
   * @return The initial number of documents in the index.
   */
  public int getInitialDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getInitialDocCount();
  }

  /**
   * Gets the number of documents that were added to the index.
   *
   * @return The number of documents added to the index.
   */
  public int getAddedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getAddedDocCount();
  }

  /**
   * Gets the number of documents that will be removed from the index.
   *
   * @return The number of documents removed from the index.
   */
  public int getRemovedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getRemovedDocCount();
  }

  /**
   * Gets the URL of the current job. Returns null if the crawler currently
   * has no job.
   *
   * @return The URL of the current job.
   */
  public String getCurrentJobUrl() {
    // NOTE: We put the current job in a local variable to avoid that it is
    //       set to null while this method is executed.
    CrawlerJob job = mCurrentJob;
    if (job == null) {
      return null;
    } else {
      return job.getUrl();
    }
  }

  /**
   * Gets the time the crawler has already been working on the current job.
   *
   * @return The current working time in milliseconds. Returns -1 if the
   *         crawler currently has no job.
   */
  public long getCurrentJobTime() {
    return mCrawlerJobProfiler.getCurrentMeasuringTime();
  }

  /**
   * Sets whether the crawler should pause.
   *
   * @param shouldPause Whether the crawler should pause.
   */
  public void setShouldPause(boolean shouldPause) {
    mShouldPause = shouldPause;
  }

  /**
   * Gets whether the crawler is currently pausing or will pause soon.
   *
   * @return Whether the crawler is currently pausing.
   */
  public boolean getShouldPause() {
    return mShouldPause;
  }

  /**
   * Analyzes the URL and decides whether it should be processed or not.
   * <p>
   * If it should, a new job is created and added to the job list.
   *
   * @param url The URL of the job to check.
   * @param sourceUrl The URL of the document in which the URL of the job to
   *        check was found.
   * @param shouldBeParsed Specifies whether the URL should be parsed.
   * @param shouldBeIndexed Specifies whether the URL should be indexed.
   * @param sourceLinkText The text of the link in which the URL was found.
   *        Is <code>null</code> if the URL was not found in a link (i.e. an
   *        a tag) or if there is no link text for some other reason.
   */
  private void addJob(String url, String sourceUrl, boolean shouldBeParsed,
      boolean shouldBeIndexed, String sourceLinkText) {
    if (!mConfiguration.getBuildIndex()) {
      // Indexing is disabled
      shouldBeIndexed = false;
    }

    // Change all blanks to %20, since blanks are not allowed in URLs
    url = RegainToolkit.replace(url, " ", "%20");

    boolean alreadyAccepted = mUrlChecker.wasAlreadyAccepted(url);
    boolean alreadyIgnored = mUrlChecker.wasAlreadyIgnored(url);

    if ((!alreadyAccepted) && (!alreadyIgnored)) {
      boolean accepted = mUrlChecker.isUrlAccepted(url);

      // Check whether this page has to be loaded at all
      if (!mConfiguration.getLoadUnparsedUrls()) {
        // Pages that are neither parsed nor indexed can be skipped
        if ((!shouldBeParsed) && (!shouldBeIndexed)) {
          accepted = false;
        }
      }

      if (accepted) {
        mUrlChecker.setAccepted(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Found new URL: " + url);
        }

        CrawlerJob job = new CrawlerJob(url, sourceUrl, sourceLinkText,
            shouldBeParsed, shouldBeIndexed);

        // NOTE: This is a little trick: We put documents that aren't parsed at
        //       the beginning of the job list and documents that are parsed at
        //       the end. This keeps the job list small, because all documents
        //       are processed first, before new documents are added.
        if (shouldBeParsed) {
          mJobList.addLast(job);
        } else {
          mJobList.addFirst(job);
        }
      } else {
        mUrlChecker.setIgnored(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Ignoring URL: " + url);
        }
      }
    }
  }

  /**
   * Executes the crawler process and prints out statistics, the dead link list
   * and the error list at the end.
   *
   * @param updateIndex Specifies whether an already existing index should be
   *        updated.
   * @param retryFailedDocs Specifies whether a document that couldn't be
   *        prepared the last time should be retried.
   * @param onlyEntriesArr The names of the white list entries that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   */
  public void run(boolean updateIndex, boolean retryFailedDocs,
      String[] onlyEntriesArr) {
    mLog.info("Starting crawling...");
    mShouldPause = false;

    // Init the HTTP client
    CrawlerToolkit.initHttpClient(mConfiguration);

    // Initialize the IndexWriterManager if building the index is wanted
    mIndexWriterManager = null;
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Preparing the index");
      try {
        mIndexWriterManager = new IndexWriterManager(mConfiguration,
            updateIndex, retryFailedDocs);
        updateIndex = mIndexWriterManager.getUpdateIndex();
      } catch (RegainException exc) {
        logError("Preparing the index failed!", exc, true);
        return;
      }
    }

    // Get the white list and set the "should be updated"-flags
    WhiteListEntry[] whiteList = mConfiguration.getWhiteList();
    whiteList = useOnlyWhiteListEntries(whiteList, onlyEntriesArr, updateIndex);

    // Create the UrlChecker
    mUrlChecker = new UrlChecker(whiteList, mConfiguration.getBlackList());

    // Add the start URLs
    addStartUrls();

    // Remember the last time when a breakpoint was created
    long lastBreakpointTime = System.currentTimeMillis();

    // Work through the job list
    while (!mJobList.isEmpty()) {
      mCrawlerJobProfiler.startMeasuring();

      mCurrentJob = (CrawlerJob) mJobList.removeFirst();
      String url = mCurrentJob.getUrl();

      boolean shouldBeParsed = mCurrentJob.shouldBeParsed();
      boolean shouldBeIndexed = mCurrentJob.shouldBeIndexed();

      // Check whether this is a directory
      if (url.startsWith("file://")) {
        try {
          File file = RegainToolkit.urlToFile(url);
          // Check whether the file is readable
          if (!file.canRead()) {
            mCrawlerJobProfiler.abortMeasuring();
            logError("File is not readable: '" + url + "'", null, false);
            continue;
          } else if (file.isDirectory()) {
            // This IS a directory -> Add all child files as jobs
            if (shouldBeParsed) {
              parseDirectory(file);
            }

            // A directory can't be parsed or indexed -> continue
            mCrawlerJobProfiler.stopMeasuring(0);
            continue;
          }
        } catch (Throwable thr) {
          logError("Invalid URL: '" + url + "'", thr, false);
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
            mCurrentJob.getSourceLinkText());
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url + "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
            shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      } catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      // Parse the content
      if (shouldBeParsed) {
        mLog.info("Parsing " + rawDocument.getUrl());
        mHtmlParsingProfiler.startMeasuring();
        try {
          parseHtmlDocument(rawDocument);
          mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
        } catch (RegainException exc) {
          logError("Parsing HTML failed: " + rawDocument.getUrl(), exc, false);
        }
      }

      // Index the content
      if (shouldBeIndexed) {
        if (mLog.isDebugEnabled()) {
          mLog.debug("Indexing " + rawDocument.getUrl());
        }
        try {
          mIndexWriterManager.addToIndex(rawDocument, this);
        } catch (RegainException exc) {
          logError("Indexing failed: " + rawDocument.getUrl(), exc, false);
        }
      }

      // Release the system resources of the RawDocument
      rawDocument.dispose();

      // Stop the time measurement
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());
      mCurrentJob = null;

      // Check whether to create a breakpoint
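      // NOTE: The breakpoint interval from the configuration is given in
      //       minutes, hence the conversion to milliseconds below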
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
          && (System.currentTimeMillis() > lastBreakpointTime
              + breakpointInterval * 60 * 1000);
      if (mShouldPause || breakpointIntervalIsOver) {
        try {
          mIndexWriterManager.createBreakpoint();
        } catch (RegainException exc) {
          logError("Creating breakpoint failed", exc, false);
        }

        // Pause
        while (mShouldPause) {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException exc) {}
        }

        lastBreakpointTime = System.currentTimeMillis();
      }
    } // while (!mJobList.isEmpty())

    // Remove documents from the index that no longer exist
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Removing index entries of documents that do not exist any more...");
      try {
        mIndexWriterManager.removeObsoleteEntries(mUrlChecker);
      } catch (Throwable thr) {
        logError("Removing non-existing documents from index failed", thr, true);
      }
    }

    // Check whether the index is empty
    int entryCount = 0;
    try {
      entryCount = mIndexWriterManager.getIndexEntryCount();
      // NOTE: We've got to subtract the errors, because for each failed
      //       document a substitute document is added to the index
      //       (which should not be counted).
      entryCount -= mErrorCount;
      if (entryCount < 0) {
        entryCount = 0;
      }
    } catch (Throwable thr) {
      logError("Counting index entries failed", thr, true);
    }
    double failedPercent;
    if (entryCount == 0) {
      logError("The index is empty.", null, true);
      failedPercent = 1;
    } else {
      // Check whether the number of failed documents exceeds the tolerance limit
      double failedDocCount = mDeadlinkList.size() + mErrorCount;
      double totalDocCount = failedDocCount + entryCount;
      failedPercent = failedDocCount / totalDocCount;
      double maxAbortedPercent = mConfiguration.getMaxFailedDocuments();
      if (failedPercent > maxAbortedPercent) {
        logError("There are more failed documents than allowed (Failed: "
            + RegainToolkit.toPercentString(failedPercent) + ", allowed: "
            + RegainToolkit.toPercentString(maxAbortedPercent) + ").",
            null, true);
      }
    }

    // Write the error list and the dead link list
    writeDeadlinkAndErrorList();

    // Finish the index
    if (mIndexWriterManager != null) {
      boolean thereWereFatalErrors = (mFatalErrorCount > 0);
      if (thereWereFatalErrors) {
        mLog.warn("There were " + mFatalErrorCount + " fatal errors. "
            + "The index will be finished but put into quarantine.");
      } else {
        mLog.info("Finishing the index and providing it to the search mask");
      }
      try {
        mIndexWriterManager.close(thereWereFatalErrors);
      } catch (RegainException exc) {
        logError("Finishing index failed!", exc, true);
      }
      mIndexWriterManager = null;
    }

    mLog.info("... Finished crawling\n");

    mLog.info(Profiler.getProfilerResults());

    // Get the system specific line separator
    String lineSeparator = RegainToolkit.getLineSeparator();

    mLog.info("Statistics:" + lineSeparator
        + "  Ignored URLs:       " + mUrlChecker.getIgnoredCount() + lineSeparator
        + "  Documents in index: " + entryCount + lineSeparator
        + "  Dead links:         " + mDeadlinkList.size() + lineSeparator
        + "  Errors:             " + mErrorCount + lineSeparator
        + "  Error ratio:        " + RegainToolkit.toPercentString(failedPercent));
  }

  /**
   * Handles an exception caused by a failed document loading. Checks whether
   * the exception was caused by a dead link and adds it to the dead link list
   * if necessary.
   *
   * @param exc The exception to check.
   * @param job The job of the document.
   */
  private void handleDocumentLoadingException(RegainException exc, CrawlerJob job) {
    if (isExceptionFromDeadLink(exc)) {
      // Don't put this exception in the error list, because it's already in
      // the dead link list. (Use mLog.error() directly)
      mLog.error("Dead link: '" + job.getUrl() + "'. Found in '"
          + job.getSourceUrl() + "'", exc);
      mDeadlinkList.add(new Object[] { job.getUrl(), job.getSourceUrl() });
    } else {
      logError("Loading " + job.getUrl() + " failed!", exc, false);
    }
  }

  /**
   * Adds all start URLs to the job list.
   */
  private void addStartUrls() {
    // Get the start URLs from the config
    StartUrl[] startUrlArr = mConfiguration.getStartUrls();

    // Normalize the start URLs
    startUrlArr = mUrlChecker.normalizeStartUrls(startUrlArr);

    // Add the start URLs as jobs
    for (int i = 0; i < startUrlArr.length; i++) {
      String url = startUrlArr[i].getUrl();
      boolean shouldBeParsed = startUrlArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = startUrlArr[i].getShouldBeIndexed();

      addJob(url, "Start URL from configuration", shouldBeParsed,
          shouldBeIndexed, null);
    }
  }

  /**
   * Sets the "should be updated"-flag for each entry in the white list.
   *
   * @param whiteList The white list to process.
   * @param onlyEntriesArr The names of the white list entries that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   * @param updateIndex Specifies whether an already existing index will be
   *        updated in this crawler run.
   * @return The processed white list.
   */
  private WhiteListEntry[] useOnlyWhiteListEntries(WhiteListEntry[] whiteList,
      String[] onlyEntriesArr, boolean updateIndex) {
    // NOTE: At this point all white list entries are set to "should be updated"

    if ((onlyEntriesArr != null) && (onlyEntriesArr.length != 0)) {
      if (updateIndex) {
        // First set all white list entries to "should NOT be updated".
        for (int i = 0; i < whiteList.length; i++) {
          whiteList[i].setShouldBeUpdated(false);
        }

        // Now set those entries to "should be updated" that are in the list
        for (int i = 0; i < onlyEntriesArr.length; i++) {
          // Find the matching white list entry
          WhiteListEntry entry = null;
          for (int j = 0; j < whiteList.length; j++) {
            if (onlyEntriesArr[i].equals(whiteList[j].getName())) {
              entry = whiteList[j];
              break;
            }
          }

          if (entry == null) {
            // No matching white list entry found
            logError("There is no white list entry named '"
                + onlyEntriesArr[i] + "'", null, true);
          } else {
            entry.setShouldBeUpdated(true);
          }
        }

        // Log all ignored entries
        for (int i = 0; i < whiteList.length; i++) {
          if (!whiteList[i].shouldBeUpdated()) {
            mLog.info("Ignoring white list entry: " + whiteList[i].getUrlMatcher());
          }
        }
      } else {
        mLog.warn("Unable to ignore white list entries, because a new index "
            + "will be created");
      }
    }

    return whiteList;
  }

  /**
   * Writes the dead link list and the error list to the log file and
   * additionally to separate files. These files are put into a sub directory
   * named 'log'. If indexing is enabled, this sub directory is located in the
   * index directory; if indexing is disabled, it is located in the current
   * working directory.
   */
  private void writeDeadlinkAndErrorList() {
    if (mDeadlinkList.isEmpty() && (mErrorCount == 0)) {
      // Nothing to do
      return;
    }

    // Get the directory where the files should be put in
    File listDir;
    if (mConfiguration.getBuildIndex()) {
      listDir = new File(mConfiguration.getIndexDir() + File.separator + "temp"
          + File.separator + "log");
    } else {
      listDir = new File("log");
    }

    String msg;
    FileOutputStream stream = null;
    PrintStream printer = null;
    try {
      // Create the directory if it doesn't exist
      if (!listDir.exists()) {
        if (!listDir.mkdir()) {
          throw new IOException("Creating directory failed: "
              + listDir.getAbsolutePath());
        }
      }

      // Write the dead link list
      if (!mDeadlinkList.isEmpty()) {
        stream = new FileOutputStream(new File(listDir, "deadlinks.txt"));
        printer = new PrintStream(stream);

        msg = "There were " + mDeadlinkList.size() + " dead links:";
        System.out.println(msg);
        printer.println(msg);

        Iterator iter = mDeadlinkList.iterator();
        for (int i = 0; iter.hasNext(); i++) {
          Object[] tupel = (Object[]) iter.next();
          String url = (String) tupel[0];
          String sourceUrl = (String) tupel[1];

          msg = "  Dead link #" + (i + 1) + ": '" + url + "' found in '"
              + sourceUrl + "'";
          System.out.println(msg);
          printer.println(msg);
        }

        printer.close();
        stream.close();
      }

      // Write the error list
      if (mErrorCount > 0) {
        mLog.warn("There were " + mErrorCount + " errors");
      }
    } catch (IOException exc) {
      logError("Writing dead link list and error list failed", exc, false);
    } finally {
      if (printer != null) {
        printer.close();
      }
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException exc) {}
      }
    }
  }

  /**
   * Checks whether the exception was caused by a dead link.
   *
   * @param thr The exception to check.
   * @return Whether the exception was caused by a dead link.
   */
  private boolean isExceptionFromDeadLink(Throwable thr) {
    if (thr instanceof HttpStreamException) {
      HttpStreamException exc = (HttpStreamException) thr;
      return exc.isHttpReturnCodeFromDeadLink();
    } else if (thr instanceof RegainException) {
      RegainException exc = (RegainException) thr;
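      // The dead link may be reported by a nested exception, so check the
      // cause chain recursively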
      return isExceptionFromDeadLink(exc.getCause());
    } else {
      return false;
    }
  }

  /**
   * Scans a directory for URLs, i.e. files and sub directories, and creates
   * a new job for each hit.
   *
   * @param dir The directory to scan.
   * @throws RegainException If encoding the found URLs failed.
   */
  private void parseDirectory(File dir) throws RegainException {
    // Get the URL for the directory
    String sourceUrl = RegainToolkit.fileToUrl(dir);

    // Parse the directory
    File[] childArr = dir.listFiles();
    for (int childIdx = 0; childIdx < childArr.length; childIdx++) {
      // Get the URL for the current child file
      String url = RegainToolkit.fileToUrl(childArr[childIdx]);

      // Check whether this is a directory
      if (childArr[childIdx].isDirectory()) {
        // It's a directory -> Add a parse job
        addJob(url, sourceUrl, true, false, null);
      } else {
        // It's a file -> Add an index job
        addJob(url, sourceUrl, false, true, null);
      }
    }
  }

  /**
   * Scans the content of an HTML document for URLs and creates a new job for
   * each hit.
   *
   * @param rawDocument The document to scan.
   * @throws RegainException If the document could not be read.
   */
  private void parseHtmlDocument(RawDocument rawDocument) throws RegainException {
    for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
      RE re = mHtmlParserPatternReArr[i];
      int urlGroup = mHtmlParserUrlPatternArr[i].getRegexUrlGroup();
      boolean shouldBeParsed = mHtmlParserUrlPatternArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = mHtmlParserUrlPatternArr[i].getShouldBeIndexed();

      int offset = 0;
      String contentAsString = rawDocument.getContentAsString();
      while (re.match(contentAsString, offset)) {
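        // Continue the next search behind the end of this match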
        offset = re.getParenEnd(0);

        String parentUrl = rawDocument.getUrl();
        String url = re.getParen(urlGroup);

        if (url != null) {
          // Convert the URL to an absolute URL
          url = CrawlerToolkit.toAbsoluteUrl(url, parentUrl);

          // Try to get a link text
          String linkText = getLinkText(contentAsString, offset);

          // Add the job
          addJob(url, parentUrl, shouldBeParsed, shouldBeIndexed, linkText);
        }
      }
    }
  }

  /**
   * Tries to extract a link text from a position where a URL was found.
   *
   * @param content The content to extract the link text from.
   * @param offset The offset where to start looking.
   * @return A link text or <code>null</code> if there was no link text found.
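   * <p>
   * Example (with hypothetical content): if the content contains
   * <code>&lt;a href="doc.pdf" target="_blank"&gt;My document&lt;/a&gt;</code>
   * and the offset points right behind the found URL, this method returns
   * "My document".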
   */
  private String getLinkText(String content, int offset) {
    // NOTE: If there is a link text, the content following the URL must be
    //       something like: ' someParam="someValue">The link text</a>',
    //       assuming the tag started with '<a href="aDocument.doc"'

    // Find the end of the current tag
    int tagEnd = content.indexOf('>', offset);
    if (tagEnd == -1) {
      // No tag end found
      return null;
    }

    // If there is a link text the next part must be: 'The link text</a>'
    // -> Find the start of the next tag
    int tagStart = content.indexOf('<', tagEnd);
    if (tagStart == -1) {
      // No starting tag found
      return null;
    }

    // Check whether the starting tag is a '</a>' tag
    if ((content.length() > tagStart + 3)
        && (content.charAt(tagStart + 1) == '/')
        && (content.charAt(tagStart + 2) == 'a')
        && (content.charAt(tagStart + 3) == '>')) {
      // We have a link text
      String linkText = content.substring(tagEnd + 1, tagStart);
      linkText = linkText.trim();
      if (linkText.length() == 0) {
        linkText = null;
      }

      return linkText;
    } else {
      // The tag was not a </a> tag, so the text was not a link text
      return null;
    }
  }

  /**
   * Gets the number of errors (this includes fatal and non-fatal errors).
   *
   * @return The number of errors.
   * @see #getFatalErrorCount()
   */
  public int getErrorCount() {
    return mErrorCount;
  }

  /**
   * Gets the number of fatal errors.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   *
   * @return The number of fatal errors.
   * @see #getErrorCount()
   */
  public int getFatalErrorCount() {
    return mFatalErrorCount;
  }

  /**
   * Logs an error.
   *
   * @param msg The error message.
   * @param thr The error. May be <code>null</code>.
   * @param fatal Specifies whether the error was fatal. An error is fatal if
   *        it prevented the index from being created.
   */
  public void logError(String msg, Throwable thr, boolean fatal) {
    if (fatal) {
      msg = "Fatal: " + msg;
    }
    mLog.error(msg, thr);
    try {
      if (mIndexWriterManager != null) {
        mIndexWriterManager.logError(msg, thr);
      }
    } catch (RegainException exc) {
      mLog.error("Logging error in error log of index failed", exc);
    }

    mErrorCount++;
    if (fatal) {
      mFatalErrorCount++;
    }
  }

}