/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004 Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *  $Source$
 *  $Date: 2006-08-11 11:22:53 +0200 (Fr, 11 Aug 2006) $
 *  $Author: til132 $
 *  $Revision: 226 $
 */
package net.sf.regain.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.StartUrl;
import net.sf.regain.crawler.config.UrlPattern;
import net.sf.regain.crawler.config.WhiteListEntry;
import net.sf.regain.crawler.document.RawDocument;

import org.apache.log4j.Logger;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

/**
 * Crawls all configured start pages for URLs. Depending on the configuration,
 * the found pages are only loaded, added to the search index or in turn
 * crawled for further URLs.
 * <p>
 * For every URL the black list and the white list decide whether it is
 * ignored or processed. If <CODE>loadUnparsedUrls</CODE> was set to
 * <CODE>false</CODE>, URLs that are neither parsed nor indexed are ignored
 * as well.
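 * <p>
 * A minimal usage sketch (how the CrawlerConfig is obtained is up to the
 * caller and only assumed here):
 * <pre>
 *   CrawlerConfig config = ...;  // load the crawler configuration somehow
 *   Crawler crawler = new Crawler(config);
 *   // Update an existing index, retry previously failed documents and
 *   // process all white list entries (null = no restriction)
 *   crawler.run(true, true, null);
 *   System.out.println("Errors: " + crawler.getErrorCount());
 * </pre>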
 *
 * @author Til Schneider, www.murfman.de
 */
public class Crawler implements ErrorLogger {

  /** The logger for this class. */
  private static Logger mLog = Logger.getLogger(Crawler.class);

  /** The configuration with the preferences. */
  private CrawlerConfig mConfiguration;

  /** The URL checker. */
  private UrlChecker mUrlChecker;

  /** The list of jobs that still have to be processed. */
  private LinkedList mJobList;

  /** The number of errors that occurred. */
  private int mErrorCount;

  /**
   * The number of fatal errors that occurred.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   */
  private int mFatalErrorCount;

  /** The current crawler job. May be null. */
  private CrawlerJob mCurrentJob;

  /**
   * Contains all found dead links.
   * <p>
   * Contains Object[]s with two elements: The first is the URL that couldn't
   * be found (a String), the second is the URL of the document where the dead
   * link was found (a String).
   */
  private LinkedList mDeadlinkList;

  /** The UrlPatterns the HTML parser should use to identify URLs. */
  private UrlPattern[] mHtmlParserUrlPatternArr;
  /**
   * The regular expressions that belong to the respective UrlPattern for the
   * HTML parser.
   *
   * @see #mHtmlParserUrlPatternArr
   */
  private RE[] mHtmlParserPatternReArr;

  /** The profiler that measures the whole crawler jobs. */
  private Profiler mCrawlerJobProfiler;
  /** The profiler that measures the HTML parser. */
  private Profiler mHtmlParsingProfiler;

  /** The IndexWriterManager to use for adding documents to the index. */
  private IndexWriterManager mIndexWriterManager;

  /** Specifies whether the crawler should pause as soon as possible. */
  private boolean mShouldPause;

  /**
   * Creates a new instance of Crawler.
   *
   * @param config The Configuration
   *
   * @throws RegainException If the regular expressions have errors.
   */
  public Crawler(CrawlerConfig config) throws RegainException {
    Profiler.clearRegisteredProfilers();

    mCrawlerJobProfiler = new Profiler("Whole crawler jobs", "jobs");
    mHtmlParsingProfiler = new Profiler("Parsed HTML documents", "docs");

    mConfiguration = config;

    mJobList = new LinkedList();
    mDeadlinkList = new LinkedList();

    mFatalErrorCount = 0;

    RawDocument.setHttpTimeoutSecs(config.getHttpTimeoutSecs());

    mHtmlParserUrlPatternArr = config.getHtmlParserUrlPatterns();
    mHtmlParserPatternReArr = new RE[mHtmlParserUrlPatternArr.length];
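    // Compile the regular expression of every HTML parser URL pattern up
    // front, so a broken pattern is detected before crawling starts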
    for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
      String regex = mHtmlParserUrlPatternArr[i].getRegexPattern();
      try {
        mHtmlParserPatternReArr[i] = new RE(regex);
      } catch (RESyntaxException exc) {
        throw new RegainException("Regular expression of HTML parser pattern #"
            + (i + 1) + " has a wrong syntax: '" + regex + "'", exc);
      }
    }
  }

  /**
   * Gets the number of processed documents.
   *
   * @return The number of processed documents.
   */
  public int getFinishedJobCount() {
    return mCrawlerJobProfiler.getMeasureCount();
  }

  /**
   * Gets the number of documents that were in the (old) index when the
   * IndexWriterManager was created.
   *
   * @return The initial number of documents in the index.
   */
  public int getInitialDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getInitialDocCount();
  }

  /**
   * Gets the number of documents that were added to the index.
   *
   * @return The number of documents added to the index.
   */
  public int getAddedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getAddedDocCount();
  }

  /**
   * Gets the number of documents that will be removed from the index.
   *
   * @return The number of documents removed from the index.
   */
  public int getRemovedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getRemovedDocCount();
  }

  /**
   * Gets the URL of the current job. Returns null if the crawler currently
   * has no job.
   *
   * @return The URL of the current job.
   */
  public String getCurrentJobUrl() {
    // NOTE: We put the current job in a local variable to avoid that it is
    //       set to null while this method is executed.
    CrawlerJob job = mCurrentJob;
    if (job == null) {
      return null;
    } else {
      return job.getUrl();
    }
  }

  /**
   * Gets the time the crawler has already been working on the current job.
   *
   * @return The current working time in milliseconds. Returns -1 if the
   *         crawler currently has no job.
   */
  public long getCurrentJobTime() {
    return mCrawlerJobProfiler.getCurrentMeasuringTime();
  }

  /**
   * Sets whether the crawler should pause.
   *
   * @param shouldPause Whether the crawler should pause.
   */
  public void setShouldPause(boolean shouldPause) {
    mShouldPause = shouldPause;
  }

  /**
   * Gets whether the crawler is currently pausing or will pause soon.
   *
   * @return Whether the crawler is currently pausing.
   */
  public boolean getShouldPause() {
    return mShouldPause;
  }

  /**
   * Analyzes the URL and decides whether it should be processed or not.
   * <p>
   * If it should, a new job is created and added to the job list.
   *
   * @param url The URL of the job to check.
   * @param sourceUrl The URL of the document in which the URL of the job to
   *        check was found.
   * @param shouldBeParsed Specifies whether the URL should be parsed.
   * @param shouldBeIndexed Specifies whether the URL should be indexed.
   * @param sourceLinkText The text of the link in which the URL was found.
   *        Is <code>null</code> if the URL was not found in a link (i.e. an
   *        a tag) or if there is no link text for some other reason.
   */
  private void addJob(String url, String sourceUrl, boolean shouldBeParsed,
      boolean shouldBeIndexed, String sourceLinkText) {
    if (!mConfiguration.getBuildIndex()) {
      // Indexing is disabled
      shouldBeIndexed = false;
    }

    // Change all blanks to %20, since blanks are not allowed in URLs
    url = RegainToolkit.replace(url, " ", "%20");

    boolean alreadyAccepted = mUrlChecker.wasAlreadyAccepted(url);
    boolean alreadyIgnored = mUrlChecker.wasAlreadyIgnored(url);

    if ((!alreadyAccepted) && (!alreadyIgnored)) {
      boolean accepted = mUrlChecker.isUrlAccepted(url);

      // Check whether this page has to be loaded at all
      if (!mConfiguration.getLoadUnparsedUrls()) {
        // Pages that are neither parsed nor indexed can be skipped
        if ((!shouldBeParsed) && (!shouldBeIndexed)) {
          accepted = false;
        }
      }

      if (accepted) {
        mUrlChecker.setAccepted(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Found new URL: " + url);
        }

        CrawlerJob job = new CrawlerJob(url, sourceUrl, sourceLinkText,
            shouldBeParsed, shouldBeIndexed);

        // NOTE: This is a little trick: We put documents that aren't parsed at
        //       the beginning of the job list and documents that are parsed at
        //       the end. This keeps the job list small, because all documents
        //       are processed first, before new documents are added.
        if (shouldBeParsed) {
          mJobList.addLast(job);
        } else {
          mJobList.addFirst(job);
        }
      } else {
        mUrlChecker.setIgnored(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Ignoring URL: " + url);
        }
      }
    }
  }

  /**
   * Executes the crawler process and prints out statistics, the dead link list
   * and the error list at the end.
   *
   * @param updateIndex Specifies whether an already existing index should be
   *        updated.
   * @param retryFailedDocs Specifies whether a document that couldn't be
   *        prepared the last time should be retried.
   * @param onlyEntriesArr The names of the white list entries that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   */
  public void run(boolean updateIndex, boolean retryFailedDocs,
      String[] onlyEntriesArr) {
    mLog.info("Starting crawling...");
    mShouldPause = false;

    // Init the HTTP client
    CrawlerToolkit.initHttpClient(mConfiguration);

    // Initialize the IndexWriterManager if building the index is wanted
    mIndexWriterManager = null;
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Preparing the index");
      try {
        mIndexWriterManager = new IndexWriterManager(mConfiguration,
            updateIndex, retryFailedDocs);
        updateIndex = mIndexWriterManager.getUpdateIndex();
      } catch (RegainException exc) {
        logError("Preparing the index failed!", exc, true);
        return;
      }
    }

    // Get the white list and set the "should be updated"-flags
    WhiteListEntry[] whiteList = mConfiguration.getWhiteList();
    whiteList = useOnlyWhiteListEntries(whiteList, onlyEntriesArr, updateIndex);

    // Create the UrlChecker
    mUrlChecker = new UrlChecker(whiteList, mConfiguration.getBlackList());

    // Add the start URLs
    addStartUrls();

    // Remember the last time when a breakpoint was created
    long lastBreakpointTime = System.currentTimeMillis();

    // Work through the job list
    while (!mJobList.isEmpty()) {
      mCrawlerJobProfiler.startMeasuring();

      mCurrentJob = (CrawlerJob) mJobList.removeFirst();
      String url = mCurrentJob.getUrl();

      boolean shouldBeParsed = mCurrentJob.shouldBeParsed();
      boolean shouldBeIndexed = mCurrentJob.shouldBeIndexed();

      // Check whether this is a directory
      if (url.startsWith("file://")) {
        try {
          File file = RegainToolkit.urlToFile(url);
          // Check whether the file is readable
          if (!file.canRead()) {
            mCrawlerJobProfiler.abortMeasuring();
            logError("File is not readable: '" + url + "'", null, false);
            continue;
          } else if (file.isDirectory()) {
            // This IS a directory -> Add all child files as jobs
            if (shouldBeParsed) {
              parseDirectory(file);
            }

            // A directory can't be parsed or indexed -> continue
            mCrawlerJobProfiler.stopMeasuring(0);
            continue;
          }
        } catch (Throwable thr) {
          logError("Invalid URL: '" + url + "'", thr, false);
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
            mCurrentJob.getSourceLinkText());
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url + "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
            shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      } catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      // Parse the content
      if (shouldBeParsed) {
        mLog.info("Parsing " + rawDocument.getUrl());
        mHtmlParsingProfiler.startMeasuring();
        try {
          parseHtmlDocument(rawDocument);
          mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
        } catch (RegainException exc) {
          logError("Parsing HTML failed: " + rawDocument.getUrl(), exc, false);
        }
      }

      // Index the content
      if (shouldBeIndexed) {
        if (mLog.isDebugEnabled()) {
          mLog.debug("Indexing " + rawDocument.getUrl());
        }
        try {
          mIndexWriterManager.addToIndex(rawDocument, this);
        } catch (RegainException exc) {
          logError("Indexing failed: " + rawDocument.getUrl(), exc, false);
        }
      }

      // Release the system resources of the RawDocument
      rawDocument.dispose();

      // Stop the time measurement
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());
      mCurrentJob = null;

      // Check whether to create a breakpoint
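      // NOTE: The breakpoint interval from the configuration is given in
      //       minutes, hence the conversion to milliseconds below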
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
          && (System.currentTimeMillis() > lastBreakpointTime
              + breakpointInterval * 60 * 1000);
      if (mShouldPause || breakpointIntervalIsOver) {
        try {
          mIndexWriterManager.createBreakpoint();
        } catch (RegainException exc) {
          logError("Creating breakpoint failed", exc, false);
        }

        // Pause
        while (mShouldPause) {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException exc) {}
        }

        lastBreakpointTime = System.currentTimeMillis();
      }
    } // while (!mJobList.isEmpty())

    // Remove documents from the index that no longer exist
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Removing index entries of documents that do not exist any more...");
      try {
        mIndexWriterManager.removeObsoleteEntries(mUrlChecker);
      } catch (Throwable thr) {
        logError("Removing non-existing documents from index failed", thr, true);
      }
    }

    // Check whether the index is empty
    int entryCount = 0;
    try {
      entryCount = mIndexWriterManager.getIndexEntryCount();
      // NOTE: We've got to subtract the errors, because for each failed
      //       document a substitute document is added to the index
      //       (which should not be counted).
      entryCount -= mErrorCount;
      if (entryCount < 0) {
        entryCount = 0;
      }
    } catch (Throwable thr) {
      logError("Counting index entries failed", thr, true);
    }
    double failedPercent;
    if (entryCount == 0) {
      logError("The index is empty.", null, true);
      failedPercent = 1;
    } else {
      // Check whether the number of failed documents exceeds the tolerance limit
      double failedDocCount = mDeadlinkList.size() + mErrorCount;
      double totalDocCount = failedDocCount + entryCount;
      failedPercent = failedDocCount / totalDocCount;
      double maxAbortedPercent = mConfiguration.getMaxFailedDocuments();
      if (failedPercent > maxAbortedPercent) {
        logError("There are more failed documents than allowed (Failed: "
            + RegainToolkit.toPercentString(failedPercent) + ", allowed: "
            + RegainToolkit.toPercentString(maxAbortedPercent) + ").",
            null, true);
      }
    }

    // Write the error list and the dead link list
    writeDeadlinkAndErrorList();

    // Finish the index
    if (mIndexWriterManager != null) {
      boolean thereWereFatalErrors = (mFatalErrorCount > 0);
      if (thereWereFatalErrors) {
        mLog.warn("There were " + mFatalErrorCount + " fatal errors. "
            + "The index will be finished but put into quarantine.");
      } else {
        mLog.info("Finishing the index and providing it to the search mask");
      }
      try {
        mIndexWriterManager.close(thereWereFatalErrors);
      } catch (RegainException exc) {
        logError("Finishing index failed!", exc, true);
      }
      mIndexWriterManager = null;
    }

    mLog.info("... Finished crawling\n");

    mLog.info(Profiler.getProfilerResults());

    // Get the system specific line separator
    String lineSeparator = RegainToolkit.getLineSeparator();

    mLog.info("Statistics:" + lineSeparator
        + "  Ignored URLs:       " + mUrlChecker.getIgnoredCount() + lineSeparator
        + "  Documents in index: " + entryCount + lineSeparator
        + "  Dead links:         " + mDeadlinkList.size() + lineSeparator
        + "  Errors:             " + mErrorCount + lineSeparator
        + "  Error ratio:        " + RegainToolkit.toPercentString(failedPercent));
  }

  /**
   * Handles an exception caused by a failed document loading. Checks whether
   * the exception was caused by a dead link and adds it to the dead link list
   * if necessary.
   *
   * @param exc The exception to check.
   * @param job The job of the document.
   */
  private void handleDocumentLoadingException(RegainException exc, CrawlerJob job) {
    if (isExceptionFromDeadLink(exc)) {
      // Don't put this exception in the error list, because it's already in
      // the dead link list. (Use mLog.error() directly)
      mLog.error("Dead link: '" + job.getUrl() + "'. Found in '"
          + job.getSourceUrl() + "'", exc);
      mDeadlinkList.add(new Object[] { job.getUrl(), job.getSourceUrl() });
    } else {
      logError("Loading " + job.getUrl() + " failed!", exc, false);
    }
  }

  /**
   * Adds all start URLs to the job list.
   */
  private void addStartUrls() {
    // Get the start URLs from the config
    StartUrl[] startUrlArr = mConfiguration.getStartUrls();

    // Normalize the start URLs
    startUrlArr = mUrlChecker.normalizeStartUrls(startUrlArr);

    // Add the start URLs as jobs
    for (int i = 0; i < startUrlArr.length; i++) {
      String url = startUrlArr[i].getUrl();
      boolean shouldBeParsed = startUrlArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = startUrlArr[i].getShouldBeIndexed();

      addJob(url, "Start URL from configuration", shouldBeParsed,
          shouldBeIndexed, null);
    }
  }

  /**
   * Sets the "should be updated"-flag for each entry in the white list.
   *
   * @param whiteList The white list to process.
   * @param onlyEntriesArr The names of the white list entries that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   * @param updateIndex Specifies whether an already existing index will be
   *        updated in this crawler run.
   * @return The processed white list.
   */
  private WhiteListEntry[] useOnlyWhiteListEntries(WhiteListEntry[] whiteList,
      String[] onlyEntriesArr, boolean updateIndex) {
    // NOTE: At this point all white list entries are set to "should be updated"

    if ((onlyEntriesArr != null) && (onlyEntriesArr.length != 0)) {
      if (updateIndex) {
        // First set all white list entries to "should NOT be updated".
        for (int i = 0; i < whiteList.length; i++) {
          whiteList[i].setShouldBeUpdated(false);
        }

        // Now set those entries to "should be updated" that are in the list
        for (int i = 0; i < onlyEntriesArr.length; i++) {
          // Find the matching white list entry
          WhiteListEntry entry = null;
          for (int j = 0; j < whiteList.length; j++) {
            if (onlyEntriesArr[i].equals(whiteList[j].getName())) {
              entry = whiteList[j];
              break;
            }
          }

          if (entry == null) {
            // No matching white list entry found
            logError("There is no white list entry named '"
                + onlyEntriesArr[i] + "'", null, true);
          } else {
            entry.setShouldBeUpdated(true);
          }
        }

        // Log all ignored entries
        for (int i = 0; i < whiteList.length; i++) {
          if (!whiteList[i].shouldBeUpdated()) {
            mLog.info("Ignoring white list entry: " + whiteList[i].getUrlMatcher());
          }
        }
      } else {
        mLog.warn("Unable to ignore white list entries, because a new index "
            + "will be created");
      }
    }

    return whiteList;
  }

  /**
   * Writes the dead link list and the error list to the log file and
   * additionally to separate files. These files are put into a sub directory
   * named 'log'. If indexing is enabled, this sub directory is located in the
   * index directory; if indexing is disabled, it is located in the current
   * working directory.
   */
  private void writeDeadlinkAndErrorList() {
    if (mDeadlinkList.isEmpty() && (mErrorCount == 0)) {
      // Nothing to do
      return;
    }

    // Get the directory where the files should be put in
    File listDir;
    if (mConfiguration.getBuildIndex()) {
      listDir = new File(mConfiguration.getIndexDir() + File.separator + "temp"
          + File.separator + "log");
    } else {
      listDir = new File("log");
    }

    String msg;
    FileOutputStream stream = null;
    PrintStream printer = null;
    try {
      // Create the directory if it doesn't exist
      if (!listDir.exists()) {
        if (!listDir.mkdir()) {
          throw new IOException("Creating directory failed: "
              + listDir.getAbsolutePath());
        }
      }

      // Write the dead link list
      if (!mDeadlinkList.isEmpty()) {
        stream = new FileOutputStream(new File(listDir, "deadlinks.txt"));
        printer = new PrintStream(stream);

        msg = "There were " + mDeadlinkList.size() + " dead links:";
        System.out.println(msg);
        printer.println(msg);

        Iterator iter = mDeadlinkList.iterator();
        for (int i = 0; iter.hasNext(); i++) {
          Object[] tupel = (Object[]) iter.next();
          String url = (String) tupel[0];
          String sourceUrl = (String) tupel[1];

          msg = "  Dead link #" + (i + 1) + ": '" + url + "' found in '"
              + sourceUrl + "'";
          System.out.println(msg);
          printer.println(msg);
        }

        printer.close();
        stream.close();
      }

      // Write the error list
      if (mErrorCount > 0) {
        mLog.warn("There were " + mErrorCount + " errors");
      }
    } catch (IOException exc) {
      logError("Writing dead link list and error list failed", exc, false);
    } finally {
      if (printer != null) {
        printer.close();
      }
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException exc) {}
      }
    }
  }

  /**
   * Checks whether the exception was caused by a dead link.
   *
   * @param thr The exception to check.
   * @return Whether the exception was caused by a dead link.
   */
  private boolean isExceptionFromDeadLink(Throwable thr) {
    if (thr instanceof HttpStreamException) {
      HttpStreamException exc = (HttpStreamException) thr;
      return exc.isHttpReturnCodeFromDeadLink();
    } else if (thr instanceof RegainException) {
      RegainException exc = (RegainException) thr;
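      // The dead link may be reported by a nested exception, so check the
      // cause chain recursively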
      return isExceptionFromDeadLink(exc.getCause());
    } else {
      return false;
    }
  }

  /**
   * Scans a directory for URLs, i.e. files and sub directories, and creates
   * a new job for each hit.
   *
   * @param dir The directory to scan.
   * @throws RegainException If encoding the found URLs failed.
   */
  private void parseDirectory(File dir) throws RegainException {
    // Get the URL for the directory
    String sourceUrl = RegainToolkit.fileToUrl(dir);

    // Parse the directory
    File[] childArr = dir.listFiles();
    for (int childIdx = 0; childIdx < childArr.length; childIdx++) {
      // Get the URL for the current child file
      String url = RegainToolkit.fileToUrl(childArr[childIdx]);

      // Check whether this is a directory
      if (childArr[childIdx].isDirectory()) {
        // It's a directory -> Add a parse job
        addJob(url, sourceUrl, true, false, null);
      } else {
        // It's a file -> Add an index job
        addJob(url, sourceUrl, false, true, null);
      }
    }
  }

  /**
   * Scans the content of an HTML document for URLs and creates a new job for
   * each hit.
   *
   * @param rawDocument The document to scan.
   * @throws RegainException If the document could not be read.
   */
  private void parseHtmlDocument(RawDocument rawDocument) throws RegainException {
    for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
      RE re = mHtmlParserPatternReArr[i];
      int urlGroup = mHtmlParserUrlPatternArr[i].getRegexUrlGroup();
      boolean shouldBeParsed = mHtmlParserUrlPatternArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = mHtmlParserUrlPatternArr[i].getShouldBeIndexed();

      int offset = 0;
      String contentAsString = rawDocument.getContentAsString();
      while (re.match(contentAsString, offset)) {
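        // Continue the next search behind the end of this match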
        offset = re.getParenEnd(0);

        String parentUrl = rawDocument.getUrl();
        String url = re.getParen(urlGroup);

        if (url != null) {
          // Convert the URL to an absolute URL
          url = CrawlerToolkit.toAbsoluteUrl(url, parentUrl);

          // Try to get a link text
          String linkText = getLinkText(contentAsString, offset);

          // Add the job
          addJob(url, parentUrl, shouldBeParsed, shouldBeIndexed, linkText);
        }
      }
    }
  }

  /**
   * Tries to extract a link text from a position where a URL was found.
   *
   * @param content The content to extract the link text from.
   * @param offset The offset where to start looking.
   * @return A link text or <code>null</code> if there was no link text found.
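   * <p>
   * Example (with hypothetical content): if the content contains
   * <code>&lt;a href="doc.pdf" target="_blank"&gt;My document&lt;/a&gt;</code>
   * and the offset points right behind the found URL, this method returns
   * "My document".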
   */
  private String getLinkText(String content, int offset) {
    // NOTE: If there is a link text, the content following the URL must be
    //       something like: ' someParam="someValue">The link text</a>',
    //       assuming the tag started with '<a href="aDocument.doc"'

    // Find the end of the current tag
    int tagEnd = content.indexOf('>', offset);
    if (tagEnd == -1) {
      // No tag end found
      return null;
    }

    // If there is a link text the next part must be: 'The link text</a>'
    // -> Find the start of the next tag
    int tagStart = content.indexOf('<', tagEnd);
    if (tagStart == -1) {
      // No starting tag found
      return null;
    }

    // Check whether the starting tag is a '</a>' tag
    if ((content.length() > tagStart + 3)
        && (content.charAt(tagStart + 1) == '/')
        && (content.charAt(tagStart + 2) == 'a')
        && (content.charAt(tagStart + 3) == '>')) {
      // We have a link text
      String linkText = content.substring(tagEnd + 1, tagStart);
      linkText = linkText.trim();
      if (linkText.length() == 0) {
        linkText = null;
      }

      return linkText;
    } else {
      // The tag was not a </a> tag, so the text was not a link text
      return null;
    }
  }

  /**
   * Gets the number of errors (this includes fatal and non-fatal errors).
   *
   * @return The number of errors.
   * @see #getFatalErrorCount()
   */
  public int getErrorCount() {
    return mErrorCount;
  }

  /**
   * Gets the number of fatal errors.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   *
   * @return The number of fatal errors.
   * @see #getErrorCount()
   */
  public int getFatalErrorCount() {
    return mFatalErrorCount;
  }

  /**
   * Logs an error.
   *
   * @param msg The error message.
   * @param thr The error. May be <code>null</code>.
   * @param fatal Specifies whether the error was fatal. An error is fatal if
   *        it prevented the index from being created.
   */
  public void logError(String msg, Throwable thr, boolean fatal) {
    if (fatal) {
      msg = "Fatal: " + msg;
    }
    mLog.error(msg, thr);
    try {
      if (mIndexWriterManager != null) {
        mIndexWriterManager.logError(msg, thr);
      }
    } catch (RegainException exc) {
      mLog.error("Logging error in error log of index failed", exc);
    }

    mErrorCount++;
    if (fatal) {
      mFatalErrorCount++;
    }
  }

}