0001: /*
0002: * Copyright 2007 Hippo.
0003: *
0004: * Licensed under the Apache License, Version 2.0 (the "License");
0005: * you may not use this file except in compliance with the License.
0006: * You may obtain a copy of the License at
0007: *
0008: * http://www.apache.org/licenses/LICENSE-2.0
0009: *
0010: * Unless required by applicable law or agreed to in writing, software
0011: * distributed under the License is distributed on an "AS IS" BASIS,
0012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013: * See the License for the specific language governing permissions and
0014: * limitations under the License.
0015: */
0016: package nl.hippo.cms.brokenlinkchecker;
0017:
0018: import java.io.BufferedReader;
0019: import java.io.ByteArrayOutputStream;
0020: import java.io.IOException;
0021: import java.io.InputStream;
0022: import java.io.InputStreamReader;
0023: import java.io.OutputStream;
0024: import java.io.OutputStreamWriter;
0025: import java.io.UnsupportedEncodingException;
0026: import java.net.MalformedURLException;
0027: import java.net.URL;
0028: import java.util.Collection;
0029: import java.util.Enumeration;
0030: import java.util.HashMap;
0031: import java.util.HashSet;
0032: import java.util.Iterator;
0033: import java.util.Map;
0034: import java.util.Set;
0035: import java.util.StringTokenizer;
0036: import nl.hippo.cms.brokenlinkchecker.log.BrokenLinkCheckerLog;
0037: import nl.hippo.cms.brokenlinkchecker.util.MethodCleanup;
0038: import nl.hippo.cms.brokenlinkchecker.util.ReaderCleanup;
0039: import nl.hippo.cms.brokenlinkchecker.util.StreamCleanup;
0040: import nl.hippo.cms.brokenlinkchecker.util.Validation;
0041: import nl.hippo.cms.brokenlinkchecker.util.WriterCleanup;
0042: import org.apache.commons.httpclient.Credentials;
0043: import org.apache.commons.httpclient.HttpClient;
0044: import org.apache.commons.httpclient.HttpState;
0045: import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
0046: import org.apache.commons.httpclient.UsernamePasswordCredentials;
0047: import org.apache.webdav.lib.Property;
0048: import org.apache.webdav.lib.methods.SearchMethod;
0049:
0050: /**
0051: * <p>
0052: * This class searches a tree in a WebDAV repository for broken links. It does
0053: * so by executing a DASL that finds documents that have property
0054: * <code>links</code> in namespace <code>http://hippo.nl/cms/1.0</code>
0055: * defined. The result of the check is stored in a document in the repository
0056: * with the following structure:
0057: * </p>
0058: *
0059: * <pre>
0060: * <broken-links date="...">
0061: * <page url="...">
0062: * <link url="...">[error message]</link>
0063: * </page>
0064: * </broken-links>
0065: * </pre>
0066: *
0067: * <p>
0068: * The error message can be a simple text, or an element with namespace
0069: * <code>http://apache.org/cocoon/i18n/2.1</code> and name <code>text</code>
0070: * which contains a localization key.
0071: * </p>
0072: *
0073: * <p>
0074: * If a link exists in multiple documents it is checked only once.
0075: * </p>
0076: *
0077: * <p>
0078: * Checking links is a time-consuming task. It does not require a lot of
0079: * resources though. That is why the checking of links is performed using
0080: * multiple threads.
0081: * </p>
0082: *
0083: * <p>
0084: * NOTE: this class assumes that the result document will be written to the
0085: * WebDAV repository that is checked for broken links. Therefore only one set of
0086: * credentials has to be given.
0087: * </p>
0088: */
0089: public class BrokenLinkCheckerRun {
0090: /**
0091: * <p>
0092: * The name of the resource containing the template for the DASL to find
0093: * documents containing links.
0094: * </p>
0095: */
0096: private static final String FIND_LINKS_DASL_TEMPLATE_RESOURCE_NAME = "findLinks.xml";
0097:
0098: /**
0099: * <p>
0100: * The tag in the find links DASL template which marks the location at
0101: * which to set the results offset.
0102: * </p>
0103: */
0104: private static final String RESULTS_OFFSET_TAG = "@results-offset@";
0105:
0106: /**
0107: * <p>
0108: * The tag in the find links DASL template which marks the location at
0109: * which to set the number of results limit.
0110: * </p>
0111: */
0112: private static final String NUMBER_OF_RESULTS_LIMIT_TAG = "@number-of-results-limit@";
0113:
0114: /**
0115: * <p>
0116: * The URI of the namespace for Hippo CMS specific WebDAV properties.
0117: * </p>
0118: */
0119: private static final String HIPPO_CMS_NAMESPACE_URI = "http://hippo.nl/cms/1.0";
0120:
0121: /**
0122: * <p>
0123: * The name of the WebDAV property that contains a space-separated list
0124: * of the links of the document.
0125: * </p>
0126: */
0127: private static final String LINKS_PROPERTY_NAME = "links";
0128:
0129: /**
0130: * <p>
0131: * The separator, a space, used to separate links in the WebDAV property
0132: * that contains the links of the document.
0133: * </p>
0134: */
0135: private static final String LINK_SEPARATORS = " ";
0136:
0137: /**
0138: * <p>
0139: * The number of milliseconds in a second.
0140: * </p>
0141: */
0142: private static final int NUMBER_OF_MILLIS_IN_A_SECOND = 1000;
0143:
0144: /**
0145: * <p>
0146: * The minimum number of documents to retrieve in one request.
0147: * </p>
0148: */
0149: public static final int MINIMUM_DOCUMENT_BATCH_SIZE = 1;
0150:
0151: /**
0152: * <p>
0153: * The maximum number of documents to retrieve in one request.
0154: * </p>
0155: */
0156: public static final int MAXIMUM_DOCUMENT_BATCH_SIZE = 1000;
0157:
0158: /**
0159: * <p>
0160: * The minimum number of threads to use for checking for broken links.
0161: * </p>
0162: */
0163: public static final int MINIMUM_NUMBER_OF_LINK_CHECKING_THREADS = 1;
0164:
0165: /**
0166: * <p>
0167: * The minimum timeout for requests used to check for broken links.
0168: * </p>
0169: */
0170: public static final int MINIMUM_LINK_CHECK_TIMEOUT_SECONDS = 1;
0171:
0172: /**
0173: * <p>
0174: * The HTTP client timeout is specified in milliseconds as an integer.
0175: * This limits the maximum value for the number of seconds for the
0176: * timeout that can be specified in the configuration to this value.
0177: * </p>
0178: */
0179: public static final int MAXIMUM_LINK_CHECK_TIMEOUT_SECONDS = Integer.MAX_VALUE
0180: / NUMBER_OF_MILLIS_IN_A_SECOND;
0181:
0182: /**
0183: * <p>
0184: * The name of the UTF-8 character encoding.
0185: * </p>
0186: */
0187: private static final String UTF_8_ENCODING_NAME = "UTF-8";
0188:
0189: /**
0190: * <p>
0191: * The object holding the information and resources needed by this
0192: * object.
0193: * </p>
0194: */
0195: private BrokenLinkCheckerRunConfiguration configuration;
0196:
0197: /**
0198: * <p>
0199: * This flag is used to check if an instance of this class is not used
0200: * twice.
0201: * </p>
0202: */
0203: private boolean hasBeenStarted;
0204:
0205: /**
0206: * <p>
0207: * The set of links that should be checked by background checks.
0208: * </p>
0209: *
0210: * <p>
0211: * <code>Set&lt;String&gt;</code>
0212: * </p>
0213: */
0214: private Set linksToCheck = new HashSet();
0215:
0216: /**
0217: * <p>
0218: * The set of links that have already been checked.
0219: * </p>
0220: *
0221: * <p>
0222: * <code>Set&lt;String&gt;</code>
0223: * </p>
0224: */
0225: private Set checkedLinks = new HashSet();
0226:
0227: /**
0228: * <p>
0229: * The map containing the broken links. A broken link maps to the error
0230: * message returned during the brokenness check.
0231: * </p>
0232: *
0233: * <p>
0234: * <code>Map<String, BrokenLinkErrorMessage></code>
0235: * </p>
0236: */
0237: private Map brokenLinks = new HashMap();
0238:
0239: /**
0240: * <p>
0241: * The map containing the URLs of the documents that have broken links.
0242: * Each document maps to the set of links that are broken.
0243: * </p>
0244: *
0245: * <p>
0246: * <code>Map<String, Set<String>></code>
0247: * </p>
0248: */
0249: private Map documentsWithBrokenLinks = new HashMap();
0250:
0251: /**
0252: * <p>
0253: * The HTTP client that is used to communicate with the WebDAV
0254: * repository.
0255: * </p>
0256: */
0257: private HttpClient httpClient;
0258:
0259: /**
0260: * <p>
0261: * The configuration to use for the tasks that check the links. This is
0262: * created once before the actual checking of links is started because
0263: * the same configuration can be used by all tasks.
0264: * </p>
0265: */
0266: private BrokenLinkCheckerTaskConfiguration taskConfiguration;
0267:
0268: /**
0269: * <p>
0270: * The currently active tasks that check if links are broken.
0271: * </p>
0272: */
0273: private Set activeTasks = new HashSet();
0274:
0275: /**
0276: * <p>
0277: * Check that the configuration is valid. If it is not an
0278: * {@link IllegalArgumentException} is thrown.
0279: * </p>
0280: * <p>
0281: * A configuration is valid if the following rules are followed:
0282: * </p>
0283: * <table>
0284: * <tr>
0285: * <th>Attribute</th>
0286: * <th>Rules</th>
0287: * </tr>
0288: * <tr>
0289: * <td><code>documentTreeToCheckRootUrl</code></td>
0290: * <td>Cannot be <code>null</code>.</td>
0291: * </tr>
0292: * <tr>
0293: * <td><code>documentsBaseUrl</code></td>
0294: * <td>Cannot be <code>null</code>.</td>
0295: * </tr>
0296: * <tr>
0297: * <td><code>internalLinksBaseUrl</code></td>
0298: * <td>Cannot be <code>null</code>.</td>
0299: * </tr>
0300: * <tr>
0301: * <td><code>repositoryUsername</code></td>
0302: * <td>Cannot be <code>null</code>.</td>
0303: * </tr>
0304: * <tr>
0305: * <td><code>repositoryPassword</code></td>
0306: * <td>Cannot be <code>null</code>.</td>
0307: * </tr>
0308: * <tr>
0309: * <td><code>resultDocumentUrl</code></td>
0310: * <td>Cannot be <code>null</code>.</td>
0311: * </tr>
0312: * <tr>
0313: * <td><code>documentBatchSize</code></td>
0314: * <td>Greater than or equal to 1 and less than or equal to 1000.</td>
0315: * </tr>
0316: * <tr>
0317: * <td><code>numberOfLinkCheckingThreads</code></td>
0318: * <td>Greater than or equal to 1.</td>
0319: * </tr>
0320: * <tr>
0321: * <td><code>linkCheckTimeoutSeconds</code></td>
0322: * <td>Greater than or equal to 1 and less than or equal to 2147483.</td>
0323: * </tr>
0324: * <tr>
0325: * <td><code>log</code></td>
0326: * <td>Cannot be <code>null</code>.</td>
0327: * </tr>
0328: * </table>
0329: *
0330: * @param configuration
0331: * the configuration to check.
0332: */
0333: public static void assertConfigurationIsValid(
0334: BrokenLinkCheckerRunConfiguration configuration) {
0335: boolean isValid = true;
0336: StringBuffer validationErrorMessage = new StringBuffer(1000);
0337:
0338: isValid &= Validation
0339: .assertTrue(configuration
0340: .getDocumentTreeToCheckRootUrl() != null,
0341: validationErrorMessage,
0342: "The document tree to check root URL cannot be 'null'.");
0343:
0344: isValid &= Validation.assertTrue(configuration
0345: .getDocumentsBaseUrl() != null, validationErrorMessage,
0346: "The documents base URL cannot be 'null'.");
0347:
0348: isValid &= Validation.assertTrue(configuration
0349: .getInternalLinksBaseUrl() != null,
0350: validationErrorMessage,
0351: "The internal links base URL cannot be 'null'.");
0352:
0353: isValid &= Validation.assertTrue(configuration
0354: .getRepositoryUsername() != null,
0355: validationErrorMessage,
0356: "The username cannot be 'null'.");
0357:
0358: isValid &= Validation.assertTrue(configuration
0359: .getRepositoryPassword() != null,
0360: validationErrorMessage,
0361: "The password cannot be 'null'.");
0362:
0363: isValid &= Validation.assertTrue(configuration
0364: .getResultDocumentUrl() != null,
0365: validationErrorMessage,
0366: "The result document URL cannot be 'null'.");
0367:
0368: int documentBatchSize = configuration.getDocumentBatchSize();
0369: isValid &= Validation
0370: .assertTrue(
0371: MINIMUM_DOCUMENT_BATCH_SIZE <= documentBatchSize
0372: && documentBatchSize <= MAXIMUM_DOCUMENT_BATCH_SIZE,
0373: validationErrorMessage,
0374: "The number of documents in a batch must be greater than or equal to "
0375: + MINIMUM_DOCUMENT_BATCH_SIZE
0376: + " and less than or equal to "
0377: + MAXIMUM_DOCUMENT_BATCH_SIZE + ".");
0378:
0379: isValid &= Validation
0380: .assertTrue(
0381: MINIMUM_NUMBER_OF_LINK_CHECKING_THREADS <= configuration
0382: .getNumberOfLinkCheckingThreads(),
0383: validationErrorMessage,
0384: "The number of link checking threads must be greater than or equal to "
0385: + MINIMUM_NUMBER_OF_LINK_CHECKING_THREADS
0386: + ".");
0387:
0388: int linkCheckTimeoutSeconds = configuration
0389: .getLinkCheckTimeoutSeconds();
0390: isValid &= Validation
0391: .assertTrue(
0392: MINIMUM_LINK_CHECK_TIMEOUT_SECONDS <= linkCheckTimeoutSeconds
0393: && linkCheckTimeoutSeconds <= MAXIMUM_LINK_CHECK_TIMEOUT_SECONDS,
0394: validationErrorMessage,
0395: "The number of seconds to wait for a response must be greater than or equal to "
0396: + MINIMUM_LINK_CHECK_TIMEOUT_SECONDS
0397: + " and less than or equal to "
0398: + MAXIMUM_LINK_CHECK_TIMEOUT_SECONDS
0399: + ".");
0400:
0401: isValid &= Validation.assertTrue(
0402: configuration.getLog() != null, validationErrorMessage,
0403: "The log cannot be 'null'.");
0404:
0405: if (!isValid) {
0406: throw new IllegalArgumentException(validationErrorMessage
0407: .toString());
0408: }
0409: }
0410:
/**
 * <p>
 * Construct a run around the given configuration. The configuration is
 * validated first, so an invalid one is rejected before it is stored.
 * </p>
 *
 * <p>
 * Constructing the object does not start any checking; call
 * {@link #execute()} for that.
 * </p>
 *
 * @param configuration
 *            the object holding the information and resources.
 */
public BrokenLinkCheckerRun(
        BrokenLinkCheckerRunConfiguration configuration) {
    // Fail fast: an invalid configuration never reaches the field.
    assertConfigurationIsValid(configuration);
    this.configuration = configuration;
}
0433:
/**
 * <p>
 * Start the broken link checking process. This method can only be
 * invoked once per instance; a second invocation fails with the
 * {@link IllegalStateException} raised by
 * {@link #assertHasNotBeenStarted()}.
 * </p>
 *
 * <p>
 * The run consists of three steps executed in order on the calling
 * thread: create the HTTP client, find the broken links, and store the
 * result.
 * </p>
 */
public void execute() {
    // Check-and-set under the monitor so that two concurrent callers
    // cannot both pass the "not started yet" check.
    synchronized (this) {
        assertHasNotBeenStarted();

        hasBeenStarted = true;
    }

    createHttpClient();

    findBrokenLinks();

    storeResult();
}
0453:
0454: /**
0455: * <p>
0456: * Throw an {@link IllegalStateException} if this run has already been
0457: * started.
0458: * </p>
0459: */
0460: private void assertHasNotBeenStarted() {
0461: if (hasBeenStarted) {
0462: throw new IllegalStateException(
0463: "This instance has already been started. Use a new instance for each run.");
0464: }
0465: }
0466:
0467: /**
0468: * <p>
0469: * Find broken links by querying the repository for all documents with
0470: * links, discarding links that should not be checked, and checking the
0471: * remaining links.
0472: * </p>
0473: */
0474: private void findBrokenLinks() {
0475: taskConfiguration = createBrokenLinkCheckerTaskConfiguration();
0476:
0477: int numberOfResults;
0478: int offset = 0;
0479: do {
0480: byte[] findLinksDasl = generateFindLinksDasl(offset);
0481:
0482: offset += getDocumentBatchSize();
0483:
0484: Map documentsWithLinks = new HashMap();
0485: numberOfResults = executeFindLinksDasl(findLinksDasl,
0486: documentsWithLinks);
0487:
0488: addLinksToLinksToCheck(documentsWithLinks.values());
0489:
0490: checkLinks();
0491:
0492: determineBrokenLinksOfDocuments(documentsWithLinks);
0493: } while (!shouldStop(numberOfResults));
0494: }
0495:
0496: /**
0497: * <p>
0498: * Determine the broken links of documents and add the broken ones to
0499: * the set of broken links of the document.
0500: * </p>
0501: *
0502: * @param documentsWithLinks
0503: * a set of URLs of documents mapping to the links
0504: * contained in a document.
0505: */
0506: private void determineBrokenLinksOfDocuments(Map documentsWithLinks) {
0507: Iterator documentUrlsIterator = documentsWithLinks.keySet()
0508: .iterator();
0509: while (documentUrlsIterator.hasNext()) {
0510: String documentUrl = (String) documentUrlsIterator.next();
0511:
0512: Set links = (Set) documentsWithLinks.get(documentUrl);
0513: determineBrokenLinksOfDocument(documentUrl, links);
0514: }
0515: }
0516:
0517: /**
0518: * <p>
0519: * Determine which links of a document are broken and add the broken
0520: * ones to the set of broken links of a document.
0521: * </p>
0522: *
0523: * @param documentUrl
0524: * the URL of the document containing the links.
0525: * @param links
0526: * the links contained in the document.
0527: */
0528: private void determineBrokenLinksOfDocument(String documentUrl,
0529: Set links) {
0530: Iterator linksIterator = links.iterator();
0531: while (linksIterator.hasNext()) {
0532: String link = (String) linksIterator.next();
0533:
0534: if (isLinkBroken(link)) {
0535: addBrokenLinkToDocument(documentUrl, link);
0536: }
0537: }
0538: }
0539:
0540: /**
0541: * <p>
0542: * Determine if a link was found to be broken.
0543: * </p>
0544: *
0545: * @param link
0546: * the link for which to determine whether or not it is
0547: * broken.
0548: * @return <code>true</code> if the link is broken, <code>false</code>
0549: * otherwise.
0550: */
0551: private boolean isLinkBroken(String link) {
0552: return brokenLinks.containsKey(link);
0553: }
0554:
0555: /**
0556: * <p>
0557: * Add a link to the set of broken links of a document.
0558: * </p>
0559: *
0560: * @param documentUrl
0561: * the URL of the document to which to add the broken
0562: * link.
0563: * @param link
0564: * the link to add.
0565: */
0566: private void addBrokenLinkToDocument(String documentUrl, String link) {
0567: Set brokenLinks = getBrokenLinksForDocument(documentUrl);
0568:
0569: brokenLinks.add(link);
0570: }
0571:
0572: /**
0573: * <p>
0574: * Get the set of broken links for a specific document.
0575: * </p>
0576: *
0577: * @param documentUrl
0578: * the URL of the document for which to get the set of
0579: * broken links.
0580: * @return the set of broken links. If there are no broken links for a
0581: * document an empty set is returned.
0582: */
0583: private Set getBrokenLinksForDocument(String documentUrl) {
0584: Set result = (Set) documentsWithBrokenLinks.get(documentUrl);
0585: if (result == null) {
0586: result = new HashSet();
0587: documentsWithBrokenLinks.put(documentUrl, result);
0588: }
0589:
0590: return result;
0591: }
0592:
0593: /**
0594: * <p>
0595: * Determine whether or not the finding of broken links should be
0596: * stopped.
0597: * </p>
0598: *
0599: * @param numberOfResults
0600: * the number of results returned on the current page of
0601: * the find links DASL.
0602: * @return <code>true</code> if the finding of broken links should be
0603: * stopped, <code>false</code> otherwise.
0604: */
0605: private boolean shouldStop(int numberOfResults) {
0606: return isLastPageOfDaslResults(numberOfResults)
0607: || Thread.currentThread().isInterrupted();
0608: }
0609:
0610: /**
0611: * <p>
0612: * Determine whether the last page of results was retrieved using the
0613: * find links DASL.
0614: * </p>
0615: *
0616: * @param numberOfResults
0617: * the number of results returned on the current page of
0618: * the find links DASL.
0619: * @return <code>true</code> if the last page of the search results
0620: * has been reached, <code>false</code> otherwise.
0621: */
0622: private boolean isLastPageOfDaslResults(int numberOfResults) {
0623: return numberOfResults != getDocumentBatchSize();
0624: }
0625:
0626: /**
0627: * <p>
0628: * Add the links to the links to check if the link has not been checked
0629: * before.
0630: * </p>
0631: *
0632: * @param links
0633: * the link to add to the links to check (<code>Collection<Set<String>></code>).
0634: */
0635: private void addLinksToLinksToCheck(Collection links) {
0636: Iterator linksSetIterator = links.iterator();
0637: while (linksSetIterator.hasNext()) {
0638: Set linksSet = (Set) linksSetIterator.next();
0639:
0640: Iterator linksIterator = linksSet.iterator();
0641: while (linksIterator.hasNext()) {
0642: String link = (String) linksIterator.next();
0643:
0644: if (!checkedLinks.contains(link)) {
0645: checkedLinks.add(link);
0646: linksToCheck.add(link);
0647: }
0648: }
0649: }
0650: }
0651:
/**
 * <p>
 * Check if the links that are currently in {@link #linksToCheck} are
 * broken: start the checking tasks and block until waiting for them
 * ends.
 * </p>
 *
 * <p>
 * Whatever happens while starting or waiting, tasks that are still
 * registered as active afterwards are told to stop.
 * </p>
 */
private void checkLinks() {
    try {
        startTasksToCheckLinks();

        waitForLinkCheckingToFinish();
    } finally {
        // Also reached when waiting was cut short by an interrupt or an
        // exception; tasks still registered as active are stopped.
        stopActiveTasks();
    }
}
0667:
0668: /**
0669: * <p>
0670: * Start the tasks that will do the actual link checking.
0671: * </p>
0672: */
0673: private synchronized void startTasksToCheckLinks() {
0674: /*
0675: * Implementation note: a new thread is created for each task for each
0676: * DASL. This means that during one run lots of threads will be created.
0677: * Using a thread pool was considered, but the idea was rejected because
0678: * the broken links check runs infrequently (typically daily) and at a
0679: * time when the load on the system is low. Therefore the current
0680: * implementation should not have a noticeable effect on the system.
0681: */
0682:
0683: int numberOfTasksToStart = getNumberOfLinkCheckingThreads();
0684: for (int taskIndex = 0; taskIndex < numberOfTasksToStart; taskIndex++) {
0685: BrokenLinkCheckerTask task = new BrokenLinkCheckerTask(
0686: taskConfiguration);
0687:
0688: Thread thread = new Thread(task);
0689: thread.start();
0690:
0691: activeTasks.add(task);
0692: }
0693: }
0694:
0695: /**
0696: * <p>
0697: * Create the configuration for a {@link BrokenLinkCheckerTask}.
0698: * </p>
0699: */
0700: private BrokenLinkCheckerTaskConfiguration createBrokenLinkCheckerTaskConfiguration() {
0701: BrokenLinkCheckerTaskConfigurationBean result = new BrokenLinkCheckerTaskConfigurationBean();
0702:
0703: result.setBrokenLinkCheckerRun(this );
0704: HttpClient httpClient = createHttpClientToUseForCheckingLinks();
0705: result.setHttpClient(httpClient);
0706: result.setInternalLinksBaseUrl(getInternalLinksBaseUrl());
0707: result.setLog(getLog());
0708:
0709: return result;
0710: }
0711:
0712: /**
0713: * <p>
0714: * Wait for all the tasks that do the actual checking of links to
0715: * finish. When a task ends it will let this object know, and this
0716: * object will notify itself once all tasks indicated that they
0717: * finished. When this object notifies itself, this method returns.
0718: * </p>
0719: */
0720: private void waitForLinkCheckingToFinish() {
0721: synchronized (this ) {
0722: try {
0723: this .wait();
0724: } catch (InterruptedException e) {
0725: /*
0726: * The thread was interrupted before all broken links checking
0727: * tasks finished. Stop waiting for those tasks.
0728: *
0729: * Reinterrupt the thread so methods down the stack get notified
0730: * so they can stop too.
0731: */
0732: Thread.currentThread().interrupt();
0733: }
0734: }
0735: }
0736:
0737: /**
0738: * <p>
0739: * Stop tasks that are still active. This method will not wait for the
0740: * tasks to stop because the current thread could be interrupted.
0741: * </p>
0742: */
0743: private synchronized void stopActiveTasks() {
0744: while (!activeTasks.isEmpty()) {
0745: BrokenLinkCheckerTask activeTask = (BrokenLinkCheckerTask) activeTasks
0746: .iterator().next();
0747: activeTasks.remove(activeTask);
0748:
0749: activeTask.stop();
0750: }
0751: }
0752:
0753: /**
0754: * <p>
0755: * Get the URL of a link to check.
0756: * </p>
0757: *
0758: * @return a link to check, or <code>null</code> if there are no more
0759: * links to check.
0760: */
0761: synchronized String getLinkToCheck() {
0762: String result = null;
0763:
0764: if (!linksToCheck.isEmpty()) {
0765: result = (String) linksToCheck.iterator().next();
0766: linksToCheck.remove(result);
0767: }
0768:
0769: return result;
0770: }
0771:
/**
 * <p>
 * Determine whether or not the links to check have been exhausted,
 * i.e. whether the set of links that still have to be checked is
 * empty. If the links to check have been exhausted the task checking
 * the links can stop.
 * </p>
 *
 * <p>
 * The method is synchronized on this object, like the other methods
 * that read or change the set of links to check on behalf of the
 * tasks.
 * </p>
 *
 * @return <code>true</code> if the links to check have been
 *         exhausted, <code>false</code> otherwise.
 */
synchronized boolean haveLinksToCheckBeenExhausted() {
    return linksToCheck.isEmpty();
}
0785:
/**
 * <p>
 * Add a link to the map of broken links, keyed by the link itself. If
 * the link was already present its error message is replaced.
 * </p>
 *
 * <p>
 * The method is synchronized on this object.
 * </p>
 *
 * @param link
 *            the link that is broken.
 * @param errorMessage
 *            the message of the error causing the brokenness.
 */
synchronized void addBrokenLink(String link,
        BrokenLinkErrorMessage errorMessage) {
    brokenLinks.put(link, errorMessage);
}
0800:
/**
 * <p>
 * Handle the event that a broken links checking task has finished: the
 * task is removed from the set of active tasks. If that leaves no
 * active tasks, this object is notified so that a thread waiting on it
 * for the tasks to finish can continue.
 * </p>
 *
 * @param task
 *            the task that has finished.
 */
synchronized void handleBrokenLinksCheckingTaskFinished(
        BrokenLinkCheckerTask task) {
    activeTasks.remove(task);
    if (activeTasks.isEmpty()) {
        // notify() wakes a single waiter on this object's monitor.
        notify();
    }
}
0815:
0816: /**
0817: * <p>
0818: * Execute the DASL to find links and get the documents with their
0819: * links. The number of documents added to
0820: * <code>documentWithLinks</code> can be less than the total number of
0821: * documents returned by the DASL, because only documents with at least
0822: * one link eligible for checking for brokenness are added.
0823: * </p>
0824: *
0825: * @param dasl
0826: * the DASL to execute.
0827: * @param documentsWithLinks
0828: * the map in which to store the documents found by the
0829: * DASL and their links (<code>Map<String, Set<String>></code>).
0830: * Only documents with at least one link eligble for
0831: * checking for brokenness will be added.
0832: * @return the total number of documents returned by the DASL.
0833: * @throws IOException
0834: * if an I/O error occurs.
0835: */
0836: private int executeFindLinksDasl(byte[] dasl, Map documentsWithLinks) {
0837: int result;
0838:
0839: SearchMethod searchMethod = new SearchMethod(
0840: getDocumentTreeToCheckRootUrl());
0841: try {
0842: searchMethod.setDoAuthentication(true);
0843: searchMethod.setRequestBody(dasl);
0844:
0845: int searchResultCode;
0846: try {
0847: searchResultCode = httpClient
0848: .executeMethod(searchMethod);
0849: } catch (IOException e) {
0850: throw new IllegalStateException(
0851: "I/O error occurred during execution of find links DASL: "
0852: + e);
0853: }
0854:
0855: if (searchResultCode != 207) {
0856: throw new IllegalStateException(
0857: "The execution of the find links DASL returned an unexpected result code: "
0858: + searchResultCode + ", with message: "
0859: + searchMethod.getStatusText());
0860: }
0861:
0862: result = retrieveFindLinksDaslResults(searchMethod,
0863: documentsWithLinks);
0864: } finally {
0865: MethodCleanup.releaseConnection(searchMethod,
0866: "find links DASL", getLog());
0867: }
0868:
0869: return result;
0870: }
0871:
0872: /**
0873: * <p>
0874: * Retrieve the results from a (successful) find links DASL.
0875: * </p>
0876: *
0877: * @param searchMethod
0878: * the search method used to execute the DASL.
0879: * @param documentsWithLinks
0880: * the map in which to store the documents and their
0881: * links (<code>Map<String, Set<String>></code>).
0882: * Only documents with at least one link eligble for
0883: * checking for brokenness will be added.
0884: * @return the total number of documents returned by the DASL.
0885: */
0886: private int retrieveFindLinksDaslResults(SearchMethod searchMethod,
0887: Map documentsWithLinks) {
0888: int result = 0;
0889:
0890: Enumeration responseUrlsEnum = searchMethod
0891: .getAllResponseURLs();
0892: while (responseUrlsEnum.hasMoreElements()) {
0893: String documentUrl = (String) responseUrlsEnum
0894: .nextElement();
0895:
0896: result += 1;
0897:
0898: Enumeration propertiesEnum = searchMethod
0899: .getResponseProperties(documentUrl);
0900: while (propertiesEnum.hasMoreElements()) {
0901: Property property = (Property) propertiesEnum
0902: .nextElement();
0903:
0904: if (isUsableLinksProperty(property)) {
0905: String documentRelativeUrl = getDocumentResultUrl(documentUrl);
0906:
0907: Set allLinks = parseLinks(property
0908: .getPropertyAsString());
0909: Set linksEligibleForCheckingForBrokenness = getLinksEligibleForCheckingForBrokenness(allLinks);
0910:
0911: if (!linksEligibleForCheckingForBrokenness
0912: .isEmpty()) {
0913: documentsWithLinks.put(documentRelativeUrl,
0914: linksEligibleForCheckingForBrokenness);
0915: }
0916: }
0917: }
0918: }
0919:
0920: return result;
0921: }
0922:
0923: /**
0924: * <p>
0925: * Get the URL of a document which should be used in the result. This is
0926: * done by making the document URL relative to the base URL. If the
0927: * document is not part of the tree to which the base URL points, then
0928: * the unmodified document URL is returned.
0929: * </p>
0930: *
0931: * @param documentUrl
0932: * the document URL from which to derive the result URL.
0933: * @return the URL that should be used for the document in the result.
0934: */
0935: private String getDocumentResultUrl(String documentUrl) {
0936: String result;
0937:
0938: if (documentUrl.startsWith(getDocumentsBaseUrl())) {
0939: int baseUrlLength = getDocumentsBaseUrl().length();
0940: result = documentUrl.substring(baseUrlLength);
0941: } else {
0942: result = documentUrl;
0943: }
0944:
0945: return result;
0946: }
0947:
0948: /**
0949: * <p>
0950: * Parse the string containg a space-separated list of links and return
0951: * a set containing the links in the string.
0952: * </p>
0953: *
0954: * @param spaceSeparatedLinks
0955: * the space-separated list of links to parse.
0956: * @return the set containing the links in the string.
0957: */
0958: private Set parseLinks(String spaceSeparatedLinks) {
0959: Set result = new HashSet();
0960:
0961: StringTokenizer linksTokenizer = new StringTokenizer(
0962: spaceSeparatedLinks, LINK_SEPARATORS);
0963: while (linksTokenizer.hasMoreTokens()) {
0964: String link = linksTokenizer.nextToken();
0965:
0966: result.add(link);
0967: }
0968:
0969: return result;
0970: }
0971:
0972: /**
0973: * <p>
0974: * Determine which links are eligible for checking for brokenness.
0975: * </p>
0976: *
0977: * @param the
0978: * set of links to check for eligibility.
0979: * @return the set of links eligible for checking for brokenness. If no
0980: * links are eligible and empty set is returned.
0981: */
0982: private Set getLinksEligibleForCheckingForBrokenness(Set links) {
0983: Set result = new HashSet();
0984:
0985: Iterator linksIterator = links.iterator();
0986: while (linksIterator.hasNext()) {
0987: String link = (String) linksIterator.next();
0988:
0989: if (isLinkEligibleForCheckingForBrokenness(link)) {
0990: result.add(link);
0991: }
0992: }
0993:
0994: return result;
0995: }
0996:
0997: /**
0998: * <p>
0999: * Determine whether or not a link is eligible for checking for
1000: * brokenness.
1001: * </p>
1002: *
1003: * @param link
1004: * the link to check.
1005: * @return <code>true</code> if the link should be checked for
1006: * brokenness, <code>false</code> otherwise.
1007: */
1008: private boolean isLinkEligibleForCheckingForBrokenness(String link) {
1009: boolean result;
1010:
1011: result = LinkClassifier.isHttpLink(link)
1012: || LinkClassifier.isHttpsLink(link)
1013: || (LinkClassifier.isInternalLink(link) && !shouldInternalLinkBeIgnored(link));
1014:
1015: return result;
1016: }
1017:
1018: /**
1019: * <p>
1020: * Determine if an internal link should be ignored.
1021: * </p>
1022: *
1023: * @param link
1024: * the link of which to determine whether it should be
1025: * ignored.
1026: * @return <code>true</code> if the link should be ignored,
1027: * <code>false</code> otherwise.
1028: */
1029: private boolean shouldInternalLinkBeIgnored(String link) {
1030: boolean result = false;
1031:
1032: Iterator prefixesIterator = internalUrlPrefixesToIgnoreIterator();
1033: while (!result && prefixesIterator.hasNext()) {
1034: String prefix = (String) prefixesIterator.next();
1035:
1036: result = link.startsWith(prefix);
1037: }
1038:
1039: return result;
1040: }
1041:
1042: /**
1043: * <p>
1044: * Return whether or not a WebDAV property is a property containing
1045: * links and if it is usable.
1046: * </p>
1047: *
1048: * @param property
1049: * the property to check.
1050: * @return <code>true</code> if the property is the property
1051: * containing links and it is usable, <code>false</code>.
1052: */
1053: private boolean isUsableLinksProperty(Property property) {
1054: return property != null
1055: && property.getNamespaceURI().equals(
1056: HIPPO_CMS_NAMESPACE_URI)
1057: && property.getLocalName().equals(LINKS_PROPERTY_NAME)
1058: && 0 < property.getPropertyAsString().length();
1059: }
1060:
1061: /**
1062: * <p>
1063: * Store the result of the broken link checks in a document in the
1064: * WebDAV repository.
1065: * </p>
1066: */
1067: private void storeResult() {
1068: BrokenLinksToXmlDocumentInRepositoryWriter resultWriter = new BrokenLinksToXmlDocumentInRepositoryWriter(
1069: documentsWithBrokenLinks, brokenLinks,
1070: getResultDocumentUrl(), httpClient, getLog());
1071: resultWriter.writeResult();
1072: }
1073:
1074: /**
1075: * <p>
1076: * Create the HTTP client to use for executing the DASLs based on
1077: * information passed in through the configuration.
1078: * </p>
1079: */
1080: private void createHttpClient() {
1081: HttpState httpState = createHttpState();
1082:
1083: MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
1084:
1085: httpClient = new HttpClient(connectionManager);
1086: httpClient.setState(httpState);
1087: }
1088:
1089: /**
1090: * <p>
1091: * Create the HTTP client to use for checking the links based on
1092: * information passed in through the configuration.
1093: * </p>
1094: *
1095: * @return the HTTP client to use for checking the links.
1096: */
1097: private HttpClient createHttpClientToUseForCheckingLinks() {
1098: HttpClient result;
1099:
1100: HttpState httpState = createHttpState();
1101:
1102: MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
1103:
1104: result = new HttpClient(connectionManager);
1105: int timeoutMillis = getLinkCheckTimeoutSeconds()
1106: * NUMBER_OF_MILLIS_IN_A_SECOND;
1107: result.setConnectionTimeout(timeoutMillis);
1108: result.setTimeout(timeoutMillis);
1109: result.setState(httpState);
1110:
1111: return result;
1112: }
1113:
1114: /**
1115: * <p>
1116: * Create the HTTP state based on information passed in through the
1117: * configuration.
1118: * </p>
1119: *
1120: * @return the HTTP state.
1121: */
1122: private HttpState createHttpState() {
1123: HttpState httpState = new HttpState();
1124:
1125: httpState.setAuthenticationPreemptive(true);
1126:
1127: String hostname = determineHostnameOfRepository();
1128: Credentials credentials = new UsernamePasswordCredentials(
1129: getRepositoryUsername(), getRepositoryPassword());
1130: httpState.setCredentials(null, hostname, credentials);
1131:
1132: return httpState;
1133: }
1134:
1135: /**
1136: * <p>
1137: * Determine the hostname of the repository. The hostname will be
1138: * determined from the result document URL.
1139: * </p>
1140: *
1141: * @return the hostname of the repository.
1142: */
1143: private String determineHostnameOfRepository() {
1144: URL resultDocumentUrl;
1145: try {
1146: resultDocumentUrl = new URL(getResultDocumentUrl());
1147: } catch (MalformedURLException e) {
1148: throw new IllegalStateException(
1149: "The result document URL is not valid: "
1150: + getResultDocumentUrl());
1151: }
1152: return resultDocumentUrl.getHost();
1153: }
1154:
1155: /**
1156: * <p>
1157: * Generate a DASL to retrieve a set of documents containing links. The
1158: * DASL is returned as a byte array containing the UTF-8 representation
1159: * of the XML of the DASL.
1160: * </p>
1161: *
1162: * @return a byte array representation of the XML of the DASL.
1163: * @throws IOException
1164: * if an I/O error occurs.
1165: */
1166: private byte[] generateFindLinksDasl(int offset) {
1167: byte[] result;
1168:
1169: try {
1170: String findLinksDasl = generateFindLinksDaslAsString(offset);
1171:
1172: ByteArrayOutputStream findLinksDaslOutput = new ByteArrayOutputStream();
1173: try {
1174: OutputStreamWriter findLinksDaslWriter = createUtf8Writer(findLinksDaslOutput);
1175: try {
1176: findLinksDaslWriter.write(findLinksDasl);
1177: } finally {
1178: WriterCleanup.close(findLinksDaslWriter,
1179: "find links DASL", getLog());
1180: }
1181: } finally {
1182: StreamCleanup.close(findLinksDaslOutput,
1183: "find links DASL", getLog());
1184: }
1185:
1186: result = findLinksDaslOutput.toByteArray();
1187: } catch (IOException e) {
1188: throw new IllegalStateException(
1189: "Unable to generate the find links DASL: " + e);
1190: }
1191:
1192: return result;
1193: }
1194:
1195: /**
1196: * <p>
1197: * Generate a DASL to retrieve a set of documents containing links. The
1198: * DASL is returned as String representation of the XML of the DASL.
1199: * </p>
1200: *
1201: * @return a String representation of the XML of the DASL.
1202: * @throws IOException
1203: * if an I/O error occurs.
1204: */
1205: private String generateFindLinksDaslAsString(int offset)
1206: throws IOException {
1207: String result = loadFindLinksDaslTemplate();
1208:
1209: result = result.replaceAll(RESULTS_OFFSET_TAG, String
1210: .valueOf(offset));
1211: result = result.replaceAll(NUMBER_OF_RESULTS_LIMIT_TAG, String
1212: .valueOf(getDocumentBatchSize()));
1213:
1214: return result;
1215: }
1216:
1217: /**
1218: * <p>
1219: * Load the find links DASL template.
1220: * </p>
1221: *
1222: * @return the find links DASL template.
1223: * @throws IOException
1224: * if an I/O error occurs.
1225: */
1226: private String loadFindLinksDaslTemplate() throws IOException {
1227: String result;
1228:
1229: InputStream findLinksDaslTemplateInput = getClass()
1230: .getResourceAsStream(
1231: FIND_LINKS_DASL_TEMPLATE_RESOURCE_NAME);
1232: try {
1233: InputStreamReader findLinksDaslTemplateReader = createUtf8Reader(findLinksDaslTemplateInput);
1234: try {
1235: BufferedReader bufferedFindLinksDaslTemplateReader = new BufferedReader(
1236: findLinksDaslTemplateReader);
1237: try {
1238: StringBuffer findLinksDaslTemplateBuffer = new StringBuffer(
1239: 2000);
1240: String line = bufferedFindLinksDaslTemplateReader
1241: .readLine();
1242: while (line != null) {
1243: findLinksDaslTemplateBuffer.append(line);
1244:
1245: line = bufferedFindLinksDaslTemplateReader
1246: .readLine();
1247: }
1248:
1249: result = findLinksDaslTemplateBuffer.toString();
1250: } finally {
1251: ReaderCleanup.close(
1252: bufferedFindLinksDaslTemplateReader,
1253: "buffered find links DASL template",
1254: getLog());
1255: }
1256: } finally {
1257: ReaderCleanup.close(findLinksDaslTemplateReader,
1258: "find links DASL template", getLog());
1259: }
1260: } finally {
1261: StreamCleanup.close(findLinksDaslTemplateInput,
1262: "find links DASL template", getLog());
1263: }
1264:
1265: return result;
1266: }
1267:
1268: /**
1269: * <p>
1270: * Create an input stream reader that uses the UTF-8 encoding to convert
1271: * bytes to characters.
1272: * </p>
1273: *
1274: * @param input
1275: * the input stream to wrap.
1276: * @return a reader reading the input stream as a UTF-8 encoded string.
1277: */
1278: private InputStreamReader createUtf8Reader(InputStream input) {
1279: InputStreamReader findLinksDaslTemplateReader;
1280: try {
1281: findLinksDaslTemplateReader = new InputStreamReader(input,
1282: UTF_8_ENCODING_NAME);
1283: } catch (UnsupportedEncodingException e) {
1284: throw new IllegalStateException("JVM must support UTF-8: "
1285: + e);
1286: }
1287: return findLinksDaslTemplateReader;
1288: }
1289:
1290: /**
1291: * <p>
1292: * Create an output stream writer that uses the UTF-8 encoding to
1293: * convert characters to bytes.
1294: * </p>
1295: *
1296: * @param output
1297: * the output stream to wrap.
1298: * @return a writer writing characters to the output stream as UTF-8
1299: * encoded bytes.
1300: */
1301: private OutputStreamWriter createUtf8Writer(OutputStream output) {
1302: OutputStreamWriter result;
1303:
1304: try {
1305: result = new OutputStreamWriter(output, UTF_8_ENCODING_NAME);
1306: } catch (UnsupportedEncodingException e) {
1307: throw new IllegalStateException("JVM must support UTF-8: "
1308: + e);
1309: }
1310:
1311: return result;
1312: }
1313:
1314: /**
1315: * <p>
1316: * Get the document tree to check root URL from the configuration.
1317: * </p>
1318: *
1319: * @return the repository username.
1320: */
1321: private String getDocumentTreeToCheckRootUrl() {
1322: return configuration.getDocumentTreeToCheckRootUrl();
1323: }
1324:
1325: /**
1326: * <p>
1327: * Get the base URL to which the URLs of the documents must be relative
1328: * from the configuration.
1329: * </p>
1330: *
1331: * @return the base URL to which the URLs of the documents must be
1332: * relative.
1333: */
1334: private String getDocumentsBaseUrl() {
1335: return configuration.getDocumentsBaseUrl();
1336: }
1337:
1338: /**
1339: * <p>
1340: * Get the iterator over the prefixes of internal links to ignore from
1341: * the configuration.
1342: * </p>
1343: *
1344: * @return an iterator over the prefixes to ignore.
1345: */
1346: private Iterator internalUrlPrefixesToIgnoreIterator() {
1347: return configuration.internalUrlPrefixesToIgnoreIterator();
1348: }
1349:
1350: /**
1351: * <p>
1352: * Get the base URL for internal links from the configuration.
1353: * </p>
1354: *
1355: * @return the base URL for internal links.
1356: */
1357: private String getInternalLinksBaseUrl() {
1358: return configuration.getInternalLinksBaseUrl();
1359: }
1360:
1361: /**
1362: * <p>
1363: * Get the repository username from the configuration.
1364: * </p>
1365: *
1366: * @return the repository username.
1367: */
1368: private String getRepositoryUsername() {
1369: return configuration.getRepositoryUsername();
1370: }
1371:
1372: /**
1373: * <p>
1374: * Get the repository password from the configuration.
1375: * </p>
1376: *
1377: * @return the repository password.
1378: */
1379: private String getRepositoryPassword() {
1380: return configuration.getRepositoryPassword();
1381: }
1382:
1383: /**
1384: * <p>
1385: * Get the result document URL from the configuration.
1386: * </p>
1387: *
1388: * @return the result document URL.
1389: */
1390: private String getResultDocumentUrl() {
1391: return configuration.getResultDocumentUrl();
1392: }
1393:
1394: /**
1395: * <p>
1396: * Get the document batch size from the configuration.
1397: * </p>
1398: *
1399: * @return the document batch size.
1400: */
1401: private int getDocumentBatchSize() {
1402: return configuration.getDocumentBatchSize();
1403: }
1404:
1405: /**
1406: * <p>
1407: * Get the number of threads to use for checking links from the
1408: * configuration.
1409: * </p>
1410: *
1411: * @return the number of threads to use for checking links.
1412: */
1413: private int getNumberOfLinkCheckingThreads() {
1414: return configuration.getNumberOfLinkCheckingThreads();
1415: }
1416:
1417: /**
1418: * <p>
1419: * Get the maximum number of seconds to wait for a response when
1420: * checking a link from the configuration.
1421: * </p>
1422: *
1423: * @return the maximum number of seconds to wait for a response.
1424: */
1425: private int getLinkCheckTimeoutSeconds() {
1426: return configuration.getLinkCheckTimeoutSeconds();
1427: }
1428:
1429: /**
1430: * <p>
1431: * Get the log from the configuration.
1432: * </p>
1433: *
1434: * @return the log.
1435: */
1436: private BrokenLinkCheckerLog getLog() {
1437: return configuration.getLog();
1438: }
1439: }
|