0001: /* Copyright (C) 2003 Internet Archive.
0002: *
0003: * This file is part of the Heritrix web crawler (crawler.archive.org).
0004: *
0005: * Heritrix is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Lesser Public License as published by
0007: * the Free Software Foundation; either version 2.1 of the License, or
0008: * any later version.
0009: *
0010: * Heritrix is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Lesser Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Lesser Public License
0016: * along with Heritrix; if not, write to the Free Software
0017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0018: *
0019: * CrawlURI.java
0020: * Created on Apr 16, 2003
0021: *
0022: * $Header$
0023: */
0024: package org.archive.crawler.datamodel;
0025:
0026: import java.io.IOException;
0027: import java.io.ObjectInputStream;
0028: import java.io.ObjectOutputStream;
0029: import java.util.ArrayList;
0030: import java.util.Collection;
0031: import java.util.HashSet;
0032: import java.util.Iterator;
0033: import java.util.List;
0034: import java.util.Set;
0035: import java.util.concurrent.CopyOnWriteArrayList;
0036:
0037: import org.apache.commons.httpclient.HttpStatus;
0038: import org.apache.commons.httpclient.URIException;
0039: import org.archive.crawler.datamodel.credential.CredentialAvatar;
0040: import org.archive.crawler.datamodel.credential.Rfc2617Credential;
0041: import org.archive.crawler.extractor.Link;
0042: import org.archive.crawler.framework.Processor;
0043: import org.archive.crawler.framework.ProcessorChain;
0044: import org.archive.crawler.util.Transform;
0045: import org.archive.net.UURI;
0046: import org.archive.net.UURIFactory;
0047: import org.archive.util.Base32;
0048: import org.archive.util.HttpRecorder;
0049:
0050: import st.ata.util.AList;
0051: import st.ata.util.HashtableAList;
0052:
/**
 * Represents a candidate URI and the associated state it
 * collects as it is crawled.
 *
 * <p>Core state is in instance variables but a flexible
 * attribute list is also available. Use this 'bucket' to carry
 * custom processing extracted data and state across CrawlURI
 * processing. See the {@link #putString(String, String)},
 * {@link #getString(String)}, etc.
 *
 * @author Gordon Mohr
 */
public class CrawlURI extends CandidateURI implements FetchStatusCodes {

    private static final long serialVersionUID = 7874096757350100472L;

    /** Sentinel for numeric fields whose value has not yet been computed. */
    public static final int UNCALCULATED = -1;

    // INHERITED FROM CANDIDATEURI
    // uuri: core identity: the "usable URI" to be crawled
    // isSeed
    // inScopeVersion
    // pathFromSeed
    // via

    // Processing progress.
    // NOTE: transient fields below are per-processing-pass state and are
    // deliberately not serialized with the URI.
    transient private Processor nextProcessor;
    transient private ProcessorChain nextProcessorChain;
    private int fetchStatus = 0;    // default to unattempted
    private int deferrals = 0;     // count of postponements for prerequisites
    private int fetchAttempts = 0; // the number of fetch attempts that have been made
    transient private int threadNumber;

    // dynamic context
    /** @deprecated */
    private int linkHopCount = UNCALCULATED; // from seeds
    /** @deprecated */
    private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal

    // User agent to masquerade as when crawling this URI. If null, globals should be used
    private String userAgent = null;

    // Once a link extractor has finished processing this curi this will be
    // set as true
    transient private boolean linkExtractorFinished = false;

    /**
     * Protection against outlink overflow.
     * Change value by setting alternate maximum in heritrix.properties.
     */
    public static final int MAX_OUTLINKS = Integer.parseInt(System
        .getProperty(CrawlURI.class.getName() + ".maxOutLinks",
            "6000"));

    // Count of outlinks discarded because MAX_OUTLINKS was exceeded;
    // reported via annotation when link extraction finishes.
    transient private int discardedOutlinks = 0;

    ////////////////////////////////////////////////////////////////////
    // Both sizes default to UNCALCULATED (-1) until a fetcher sets them.
    private long contentSize = UNCALCULATED;
    private long contentLength = UNCALCULATED;

    /**
     * Current http recorder.
     *
     * Gets set upon successful request.  Reset at start of processing chain.
     */
    private transient HttpRecorder httpRecorder = null;

    /**
     * Content type of a successfully fetched URI.
     *
     * May be null even on successfully fetched URI.
     */
    private String contentType = null;

    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside in the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * its a prerequisite needed by an earlier prerequisite tests (e.g. If
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because its not a login curi).
     */
    private boolean prerequisite = false;

    /**
     * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
     */
    private boolean post = false;

    /**
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be
     * buggy
     */
    protected long ordinal;

    /**
     * Cache of this candidate uuri as a string.
     *
     * Profiling shows us spending about 1-2% of total elapsed time in
     * toString.
     */
    private String cachedCrawlURIString = null;

    /**
     * Array to hold keys of alist members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final List<Object> alistPersistentMember = new CopyOnWriteArrayList<Object>(
        new String[] { A_CREDENTIAL_AVATARS_KEY });

    /**
     * A digest (hash, usually SHA1) of retrieved content-body.
     *
     */
    private byte[] contentDigest = null;
    private String contentDigestScheme = null;
0175:
    /**
     * Create a new instance of CrawlURI from a {@link UURI}.
     *
     * @param uuri the UURI to base this CrawlURI on.
     */
    public CrawlURI(UURI uuri) {
        super (uuri);
    }

    /**
     * Create a new instance of CrawlURI from a {@link CandidateURI}
     *
     * <p>Copies the candidate's seed flag, scheduling directive and
     * attribute list onto the new instance.
     *
     * @param caUri the CandidateURI to base this CrawlURI on.
     * @param o Monotonically increasing number within a crawl.
     */
    @SuppressWarnings("deprecation")
    public CrawlURI(CandidateURI caUri, long o) {
        super (caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
            caUri.getViaContext());
        ordinal = o;
        setIsSeed(caUri.isSeed());
        setSchedulingDirective(caUri.getSchedulingDirective());
        // NOTE(review): shares the candidate's AList rather than copying it —
        // presumably intentional handoff; confirm callers do not reuse caUri.
        setAList(caUri.getAList());
    }
0200:
0201: /**
0202: * Takes a status code and converts it into a human readable string.
0203: *
0204: * @param code the status code
0205: * @return a human readable string declaring what the status code is.
0206: */
0207: public static String fetchStatusCodesToString(int code) {
0208: switch (code) {
0209: // DNS
0210: case S_DNS_SUCCESS:
0211: return "DNS-1-OK";
0212: // HTTP Informational 1xx
0213: case 100:
0214: return "HTTP-100-Info-Continue";
0215: case 101:
0216: return "HTTP-101-Info-Switching Protocols";
0217: // HTTP Successful 2xx
0218: case 200:
0219: return "HTTP-200-Success-OK";
0220: case 201:
0221: return "HTTP-201-Success-Created";
0222: case 202:
0223: return "HTTP-202-Success-Accepted";
0224: case 203:
0225: return "HTTP-203-Success-Non-Authoritative";
0226: case 204:
0227: return "HTTP-204-Success-No Content ";
0228: case 205:
0229: return "HTTP-205-Success-Reset Content";
0230: case 206:
0231: return "HTTP-206-Success-Partial Content";
0232: // HTTP Redirection 3xx
0233: case 300:
0234: return "HTTP-300-Redirect-Multiple Choices";
0235: case 301:
0236: return "HTTP-301-Redirect-Moved Permanently";
0237: case 302:
0238: return "HTTP-302-Redirect-Found";
0239: case 303:
0240: return "HTTP-303-Redirect-See Other";
0241: case 304:
0242: return "HTTP-304-Redirect-Not Modified";
0243: case 305:
0244: return "HTTP-305-Redirect-Use Proxy";
0245: case 307:
0246: return "HTTP-307-Redirect-Temporary Redirect";
0247: // HTTP Client Error 4xx
0248: case 400:
0249: return "HTTP-400-ClientErr-Bad Request";
0250: case 401:
0251: return "HTTP-401-ClientErr-Unauthorized";
0252: case 402:
0253: return "HTTP-402-ClientErr-Payment Required";
0254: case 403:
0255: return "HTTP-403-ClientErr-Forbidden";
0256: case 404:
0257: return "HTTP-404-ClientErr-Not Found";
0258: case 405:
0259: return "HTTP-405-ClientErr-Method Not Allowed";
0260: case 407:
0261: return "HTTP-406-ClientErr-Not Acceptable";
0262: case 408:
0263: return "HTTP-407-ClientErr-Proxy Authentication Required";
0264: case 409:
0265: return "HTTP-408-ClientErr-Request Timeout";
0266: case 410:
0267: return "HTTP-409-ClientErr-Conflict";
0268: case 406:
0269: return "HTTP-410-ClientErr-Gone";
0270: case 411:
0271: return "HTTP-411-ClientErr-Length Required";
0272: case 412:
0273: return "HTTP-412-ClientErr-Precondition Failed";
0274: case 413:
0275: return "HTTP-413-ClientErr-Request Entity Too Large";
0276: case 414:
0277: return "HTTP-414-ClientErr-Request-URI Too Long";
0278: case 415:
0279: return "HTTP-415-ClientErr-Unsupported Media Type";
0280: case 416:
0281: return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
0282: case 417:
0283: return "HTTP-417-ClientErr-Expectation Failed";
0284: // HTTP Server Error 5xx
0285: case 500:
0286: return "HTTP-500-ServerErr-Internal Server Error";
0287: case 501:
0288: return "HTTP-501-ServerErr-Not Implemented";
0289: case 502:
0290: return "HTTP-502-ServerErr-Bad Gateway";
0291: case 503:
0292: return "HTTP-503-ServerErr-Service Unavailable";
0293: case 504:
0294: return "HTTP-504-ServerErr-Gateway Timeout";
0295: case 505:
0296: return "HTTP-505-ServerErr-HTTP Version Not Supported";
0297: // Heritrix internal codes (all negative numbers
0298: case S_BLOCKED_BY_USER:
0299: return "Heritrix(" + S_BLOCKED_BY_USER
0300: + ")-Blocked by user";
0301: case S_BLOCKED_BY_CUSTOM_PROCESSOR:
0302: return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR
0303: + ")-Blocked by custom prefetch processor";
0304: case S_DELETED_BY_USER:
0305: return "Heritrix(" + S_DELETED_BY_USER
0306: + ")-Deleted by user";
0307: case S_CONNECT_FAILED:
0308: return "Heritrix(" + S_CONNECT_FAILED
0309: + ")-Connection failed";
0310: case S_CONNECT_LOST:
0311: return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
0312: case S_DEEMED_CHAFF:
0313: return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
0314: case S_DEFERRED:
0315: return "Heritrix(" + S_DEFERRED + ")-Deferred";
0316: case S_DOMAIN_UNRESOLVABLE:
0317: return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
0318: + ")-Domain unresolvable";
0319: case S_OUT_OF_SCOPE:
0320: return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
0321: case S_DOMAIN_PREREQUISITE_FAILURE:
0322: return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
0323: + ")-Domain prerequisite failure";
0324: case S_ROBOTS_PREREQUISITE_FAILURE:
0325: return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
0326: + ")-Robots prerequisite failure";
0327: case S_OTHER_PREREQUISITE_FAILURE:
0328: return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
0329: + ")-Other prerequisite failure";
0330: case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
0331: return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
0332: + ")-Prerequisite unschedulable failure";
0333: case S_ROBOTS_PRECLUDED:
0334: return "Heritrix(" + S_ROBOTS_PRECLUDED
0335: + ")-Robots precluded";
0336: case S_RUNTIME_EXCEPTION:
0337: return "Heritrix(" + S_RUNTIME_EXCEPTION
0338: + ")-Runtime exception";
0339: case S_SERIOUS_ERROR:
0340: return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
0341: case S_TIMEOUT:
0342: return "Heritrix(" + S_TIMEOUT + ")-Timeout";
0343: case S_TOO_MANY_EMBED_HOPS:
0344: return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
0345: + ")-Too many embed hops";
0346: case S_TOO_MANY_LINK_HOPS:
0347: return "Heritrix(" + S_TOO_MANY_LINK_HOPS
0348: + ")-Too many link hops";
0349: case S_TOO_MANY_RETRIES:
0350: return "Heritrix(" + S_TOO_MANY_RETRIES
0351: + ")-Too many retries";
0352: case S_UNATTEMPTED:
0353: return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
0354: case S_UNFETCHABLE_URI:
0355: return "Heritrix(" + S_UNFETCHABLE_URI
0356: + ")-Unfetchable URI";
0357: case S_PROCESSING_THREAD_KILLED:
0358: return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-"
0359: + "Processing thread killed";
0360: // Unknown return code
0361: default:
0362: return Integer.toString(code);
0363: }
0364: }
0365:
    /**
     * Return the overall/fetch status of this CrawlURI for its
     * current trip through the processing loop.
     *
     * @return a value from FetchStatusCodes
     */
    public int getFetchStatus() {
        return fetchStatus;
    }

    /**
     * Set the overall/fetch status of this CrawlURI for
     * its current trip through the processing loop.
     *
     * @param newstatus a value from FetchStatusCodes
     */
    public void setFetchStatus(int newstatus) {
        fetchStatus = newstatus;
    }

    /**
     * Get the number of attempts at getting the document referenced by this
     * URI.
     *
     * @return the number of attempts at getting the document referenced by this
     * URI.
     */
    public int getFetchAttempts() {
        return fetchAttempts;
    }

    /**
     * Increment the number of attempts at getting the document referenced by
     * this URI.
     *
     * @return the attempt count as it was BEFORE this increment
     * (post-increment semantics).
     */
    public int incrementFetchAttempts() {
        // TODO: rename, this is actually processing-loop-attempts
        return fetchAttempts++;
    }

    /**
     * Reset fetchAttempts counter.
     */
    public void resetFetchAttempts() {
        this .fetchAttempts = 0;
    }

    /**
     * Reset deferrals counter.
     */
    public void resetDeferrals() {
        this .deferrals = 0;
    }
0422:
0423: /**
0424: * Get the next processor to process this URI.
0425: *
0426: * @return the processor that should process this URI next.
0427: */
0428: public Processor nextProcessor() {
0429: return nextProcessor;
0430: }
0431:
0432: /**
0433: * Get the processor chain that should be processing this URI after the
0434: * current chain is finished with it.
0435: *
0436: * @return the next processor chain to process this URI.
0437: */
0438: public ProcessorChain nextProcessorChain() {
0439: return nextProcessorChain;
0440: }
0441:
0442: /**
0443: * Set the next processor to process this URI.
0444: *
0445: * @param processor the next processor to process this URI.
0446: */
0447: public void setNextProcessor(Processor processor) {
0448: nextProcessor = processor;
0449: }
0450:
0451: /**
0452: * Set the next processor chain to process this URI.
0453: *
0454: * @param nextProcessorChain the next processor chain to process this URI.
0455: */
0456: public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
0457: this .nextProcessorChain = nextProcessorChain;
0458: }
0459:
    /**
     * Do all actions associated with setting a <code>CrawlURI</code> as
     * requiring a prerequisite.
     *
     * <p>Creates a prerequisite link, records it as this URI's prereq,
     * bumps the deferral count, marks the fetch deferred, and redirects
     * processing to the given (last) chain.
     *
     * @param lastProcessorChain Last processor chain reference.  This chain is
     * where this <code>CrawlURI</code> goes next.
     * @param preq Object to set a prerequisite.
     * @throws URIException if the prerequisite string cannot be made a link
     */
    public void markPrerequisite(String preq,
            ProcessorChain lastProcessorChain) throws URIException {
        Link link = createLink(preq, Link.PREREQ_MISC, Link.PREREQ_HOP);
        setPrerequisiteUri(link);
        incrementDeferrals();
        setFetchStatus(S_DEFERRED);
        skipToProcessorChain(lastProcessorChain);
    }
0477:
0478: /**
0479: * Set a prerequisite for this URI.
0480: * <p>
0481: * A prerequisite is a URI that must be crawled before this URI can be
0482: * crawled.
0483: *
0484: * @param link Link to set as prereq.
0485: */
0486: public void setPrerequisiteUri(Object link) {
0487: putObject(A_PREREQUISITE_URI, link);
0488: }
0489:
0490: /**
0491: * Get the prerequisite for this URI.
0492: * <p>
0493: * A prerequisite is a URI that must be crawled before this URI can be
0494: * crawled.
0495: *
0496: * @return the prerequisite for this URI or null if no prerequisite.
0497: */
0498: public Object getPrerequisiteUri() {
0499: return getObject(A_PREREQUISITE_URI);
0500: }
0501:
0502: /**
0503: * @return True if this CrawlURI has a prerequisite.
0504: */
0505: public boolean hasPrerequisiteUri() {
0506: return containsKey(A_PREREQUISITE_URI);
0507: }
0508:
0509: /**
0510: * Returns true if this CrawlURI is a prerequisite.
0511: *
0512: * @return true if this CrawlURI is a prerequisite.
0513: */
0514: public boolean isPrerequisite() {
0515: return this .prerequisite;
0516: }
0517:
0518: /**
0519: * Set if this CrawlURI is itself a prerequisite URI.
0520: *
0521: * @param prerequisite True if this CrawlURI is itself a prerequiste uri.
0522: */
0523: public void setPrerequisite(boolean prerequisite) {
0524: this .prerequisite = prerequisite;
0525: }
0526:
    /**
     * @return This crawl URI as a string wrapped with 'CrawlURI(' +
     * ')'.
     */
    public String getCrawlURIString() {
        // Double-checked locking over a non-volatile field.
        // NOTE(review): technically not the canonical DCL idiom (field is
        // not volatile); worst observable case here is a redundant
        // recompute of an identical immutable String, which is benign —
        // confirm if strict JMM compliance is desired.
        if (this .cachedCrawlURIString == null) {
            synchronized (this ) {
                if (this .cachedCrawlURIString == null) {
                    this .cachedCrawlURIString = "CrawlURI("
                        + toString() + ")";
                }
            }
        }
        return this .cachedCrawlURIString;
    }
0542:
0543: /**
0544: * Get the content type of this URI.
0545: *
0546: * @return Fetched URIs content type. May be null.
0547: */
0548: public String getContentType() {
0549: return this .contentType;
0550: }
0551:
0552: /**
0553: * Set a fetched uri's content type.
0554: *
0555: * @param ct Contenttype. May be null.
0556: */
0557: public void setContentType(String ct) {
0558: this .contentType = ct;
0559: }
0560:
0561: /**
0562: * Set the number of the ToeThread responsible for processing this uri.
0563: *
0564: * @param i the ToeThread number.
0565: */
0566: public void setThreadNumber(int i) {
0567: threadNumber = i;
0568: }
0569:
0570: /**
0571: * Get the number of the ToeThread responsible for processing this uri.
0572: *
0573: * @return the ToeThread number.
0574: */
0575: public int getThreadNumber() {
0576: return threadNumber;
0577: }
0578:
0579: /**
0580: * Increment the deferral count.
0581: *
0582: */
0583: public void incrementDeferrals() {
0584: deferrals++;
0585: }
0586:
0587: /**
0588: * Get the deferral count.
0589: *
0590: * @return the deferral count.
0591: */
0592: public int getDeferrals() {
0593: return deferrals;
0594: }
0595:
0596: /**
0597: * Remove all attributes set on this uri.
0598: * <p>
0599: * This methods removes the attribute list.
0600: */
0601: public void stripToMinimal() {
0602: clearAList();
0603: }
0604:
0605: /**
0606: * Get the size in bytes of this URI's recorded content, inclusive
0607: * of things like protocol headers. It is the responsibility of the
0608: * classes which fetch the URI to set this value accordingly -- it is
0609: * not calculated/verified within CrawlURI.
0610: *
0611: * This value is consulted in reporting/logging/writing-decisions.
0612: *
0613: * @see #setContentSize()
0614: * @return contentSize
0615: */
0616: public long getContentSize() {
0617: return contentSize;
0618: }
0619:
0620: /**
0621: * Make note of a non-fatal error, local to a particular Processor,
0622: * which should be logged somewhere, but allows processing to continue.
0623: *
0624: * This is how you add to the local-error log (the 'localized' in
0625: * the below is making an error local rather than global, not
0626: * making a swiss-french version of the error.).
0627: *
0628: * @param processorName Name of processor the exception was thrown
0629: * in.
0630: * @param ex Throwable to log.
0631: * @param message Extra message to log beyond exception message.
0632: */
0633: public void addLocalizedError(final String processorName,
0634: final Throwable ex, final String message) {
0635: List<LocalizedError> localizedErrors;
0636: if (containsKey(A_LOCALIZED_ERRORS)) {
0637: @SuppressWarnings("unchecked")
0638: List<LocalizedError> temp // to prevent warning on cast
0639: = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
0640: localizedErrors = temp;
0641: } else {
0642: localizedErrors = new ArrayList<LocalizedError>();
0643: putObject(A_LOCALIZED_ERRORS, localizedErrors);
0644: }
0645:
0646: localizedErrors.add(new LocalizedError(processorName, ex,
0647: message));
0648: addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@"
0649: + processorName);
0650: }
0651:
0652: // TODO: Move to utils.
0653: protected String getClassSimpleName(final Class c) {
0654: String classname = c.getName();
0655: int index = classname.lastIndexOf('.');
0656: return ((index > 0 && (index + 1) < classname.length()) ? classname
0657: .substring(index + 1)
0658: : classname);
0659: }
0660:
0661: /**
0662: * Add an annotation: an abbrieviated indication of something special
0663: * about this URI that need not be present in every crawl.log line,
0664: * but should be noted for future reference.
0665: *
0666: * @param annotation the annotation to add; should not contain
0667: * whitespace or a comma
0668: */
0669: public void addAnnotation(String annotation) {
0670: String annotations;
0671: if (containsKey(A_ANNOTATIONS)) {
0672: annotations = getString(A_ANNOTATIONS);
0673: annotations += "," + annotation;
0674: } else {
0675: annotations = annotation;
0676: }
0677:
0678: putString(A_ANNOTATIONS, annotations);
0679: }
0680:
0681: /**
0682: * TODO: Implement truncation using booleans rather than as this
0683: * ugly String parse.
0684: * @return True if fetch was truncated.
0685: */
0686: public boolean isTruncatedFetch() {
0687: return annotationContains(TRUNC_SUFFIX);
0688: }
0689:
0690: public boolean isLengthTruncatedFetch() {
0691: return annotationContains(LENGTH_TRUNC);
0692: }
0693:
0694: public boolean isTimeTruncatedFetch() {
0695: return annotationContains(TIMER_TRUNC);
0696: }
0697:
0698: public boolean isHeaderTruncatedFetch() {
0699: return annotationContains(HEADER_TRUNC);
0700: }
0701:
0702: protected boolean annotationContains(final String str2Find) {
0703: boolean result = false;
0704: if (!containsKey(A_ANNOTATIONS)) {
0705: return result;
0706: }
0707: String annotations = getString(A_ANNOTATIONS);
0708: if (annotations != null && annotations.length() > 0) {
0709: result = annotations.indexOf(str2Find) >= 0;
0710: }
0711: return result;
0712: }
0713:
0714: /**
0715: * Get the annotations set for this uri.
0716: *
0717: * @return the annotations set for this uri.
0718: */
0719: public String getAnnotations() {
0720: return (containsKey(A_ANNOTATIONS)) ? getString(A_ANNOTATIONS)
0721: : null;
0722: }
0723:
    /**
     * Get the embeded hop count.
     *
     * @return the embeded hop count.
     * @deprecated
     */
    public int getEmbedHopCount() {
        return embedHopCount;
    }

    /**
     * Get the link hop count.
     *
     * @return the link hop count.
     * @deprecated
     */
    public int getLinkHopCount() {
        return linkHopCount;
    }

    /**
     * Mark this uri as being a seed.
     *
     * <p>Zeroes both deprecated hop counters.
     *
     * @deprecated
     */
    public void markAsSeed() {
        linkHopCount = 0;
        embedHopCount = 0;
    }
0754:
0755: /**
0756: * Get the user agent to use for crawling this URI.
0757: *
0758: * If null the global setting should be used.
0759: *
0760: * @return user agent or null
0761: */
0762: public String getUserAgent() {
0763: return userAgent;
0764: }
0765:
0766: /**
0767: * Set the user agent to use when crawling this URI.
0768: *
0769: * If not set the global settings should be used.
0770: *
0771: * @param string user agent to use
0772: */
0773: public void setUserAgent(String string) {
0774: userAgent = string;
0775: }
0776:
0777: /**
0778: * Set which processor should be the next processor to process this uri
0779: * instead of using the default next processor.
0780: *
0781: * @param processorChain the processor chain to skip to.
0782: * @param processor the processor in the processor chain to skip to.
0783: */
0784: public void skipToProcessor(ProcessorChain processorChain,
0785: Processor processor) {
0786: setNextProcessorChain(processorChain);
0787: setNextProcessor(processor);
0788: }
0789:
0790: /**
0791: * Set which processor chain should be processing this uri next.
0792: *
0793: * @param processorChain the processor chain to skip to.
0794: */
0795: public void skipToProcessorChain(ProcessorChain processorChain) {
0796: setNextProcessorChain(processorChain);
0797: setNextProcessor(null);
0798: }
0799:
0800: /**
0801: * For completed HTTP transactions, the length of the content-body.
0802: *
0803: * @return For completed HTTP transactions, the length of the content-body.
0804: */
0805: public long getContentLength() {
0806: if (this .contentLength < 0) {
0807: this .contentLength = (getHttpRecorder() != null) ? getHttpRecorder()
0808: .getResponseContentLength()
0809: : 0;
0810: }
0811: return this .contentLength;
0812: }
0813:
0814: /**
0815: * Get size of data recorded (transferred)
0816: *
0817: * @return recorded data size
0818: */
0819: public long getRecordedSize() {
0820: return (getHttpRecorder() != null) ? getHttpRecorder()
0821: .getRecordedInput().getSize()
0822: // if unavailable fall back on content-size
0823: : getContentSize();
0824: }
0825:
0826: /**
0827: * Sets the 'content size' for the URI, which is considered inclusive
0828: * of all recorded material (such as protocol headers) or even material
0829: * 'virtually' considered (as in material from a previous fetch
0830: * confirmed unchanged with a server). (In contrast, content-length
0831: * matches the HTTP definition, that of the enclosed content-body.)
0832: *
0833: * Should be set by a fetcher or other processor as soon as the final
0834: * size of recorded content is known. Setting to an artificial/incorrect
0835: * value may affect other reporting/processing.
0836: *
0837: * @param l Content size.
0838: */
0839: public void setContentSize(long l) {
0840: contentSize = l;
0841: }
0842:
0843: /**
0844: * If true then a link extractor has already claimed this CrawlURI and
0845: * performed link extraction on the document content. This does not
0846: * preclude other link extractors that may have an interest in this
0847: * CrawlURI from also doing link extraction but default behavior should
0848: * be to not run if link extraction has already been done.
0849: *
0850: * <p>There is an onus on link extractors to set this flag if they have
0851: * run.
0852: *
0853: * <p>The only extractor of the default Heritrix set that does not
0854: * respect this flag is
0855: * {@link org.archive.crawler.extractor.ExtractorHTTP}.
0856: * It runs against HTTP headers, not the document content.
0857: *
0858: * @return True if a processor has performed link extraction on this
0859: * CrawlURI
0860: *
0861: * @see #linkExtractorFinished()
0862: */
0863: public boolean hasBeenLinkExtracted() {
0864: return linkExtractorFinished;
0865: }
0866:
0867: /**
0868: * Note that link extraction has been performed on this CrawlURI. A processor
0869: * doing link extraction should invoke this method once it has finished it's
0870: * work. It should invoke it even if no links are extracted. It should only
0871: * invoke this method if the link extraction was performed on the document
0872: * body (not the HTTP headers etc.).
0873: *
0874: * @see #hasBeenLinkExtracted()
0875: */
0876: public void linkExtractorFinished() {
0877: linkExtractorFinished = true;
0878: if (discardedOutlinks > 0) {
0879: addAnnotation("dol:" + discardedOutlinks);
0880: }
0881: }
0882:
0883: /**
0884: * Notify CrawlURI it is about to be logged; opportunity
0885: * for self-annotation
0886: */
0887: public void aboutToLog() {
0888: if (fetchAttempts > 1) {
0889: addAnnotation(fetchAttempts + "t");
0890: }
0891: }
0892:
0893: /**
0894: * Get the http recorder associated with this uri.
0895: *
0896: * @return Returns the httpRecorder. May be null but its set early in
0897: * FetchHttp so there is an issue if its null.
0898: */
0899: public HttpRecorder getHttpRecorder() {
0900: return httpRecorder;
0901: }
0902:
0903: /**
0904: * Set the http recorder to be associated with this uri.
0905: *
0906: * @param httpRecorder The httpRecorder to set.
0907: */
0908: public void setHttpRecorder(HttpRecorder httpRecorder) {
0909: this .httpRecorder = httpRecorder;
0910: }
0911:
0912: /**
0913: * Return true if this is a http transaction.
0914: *
0915: * TODO: Compound this and {@link #isPost()} method so that there is one
0916: * place to go to find out if get http, post http, ftp, dns.
0917: *
0918: * @return True if this is a http transaction.
0919: */
0920: public boolean isHttpTransaction() {
0921: return containsKey(A_HTTP_TRANSACTION);
0922: }
0923:
    /**
     * Clean up after a run through the processing chain.
     *
     * Called on the end of processing chain by Frontier#finish.  Null out any
     * state gathered during processing.
     */
    public void processingCleanup() {
        this .httpRecorder = null;
        this .fetchStatus = S_UNATTEMPTED;
        this .setPrerequisite(false);
        this .contentSize = UNCALCULATED;
        this .contentLength = UNCALCULATED;
        // Clear 'links extracted' flag.
        this .linkExtractorFinished = false;
        // Clean the alist of all but registered permanent members.
        setAList(getPersistentAList());
    }

    /**
     * Build a fresh AList containing only the keys that should survive a
     * processing pass: the statically declared persistent members plus any
     * keys listed under A_HERITABLE_KEYS.
     *
     * @return new AList holding only persistent/heritable entries
     */
    public AList getPersistentAList() {
        AList newAList = new HashtableAList();
        // copy declared persistent keys
        if (alistPersistentMember != null
                && alistPersistentMember.size() > 0) {
            newAList.copyKeysFrom(alistPersistentMember.iterator(),
                getAList());
        }
        // also copy declared 'heritable' keys
        // NOTE(review): raw List — presumably a List of String keys stored
        // by whoever set A_HERITABLE_KEYS; confirm element type at callers.
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys != null) {
            newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
        }
        return newAList;
    }
0957:
0958: /**
0959: * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
0960: *
0961: * Its safe to pass a CrawlURI instance. In this case we just return it
0962: * as a result. Otherwise, we create new CrawlURI instance.
0963: *
0964: * @param caUri Candidate URI.
0965: * @param ordinal
0966: * @return A crawlURI made from the passed CandidateURI.
0967: */
0968: public static CrawlURI from(CandidateURI caUri, long ordinal) {
0969: return (caUri instanceof CrawlURI) ? (CrawlURI) caUri
0970: : new CrawlURI(caUri, ordinal);
0971: }
0972:
0973: /**
0974: * @param avatars Credential avatars to save off.
0975: */
0976: private void setCredentialAvatars(Set avatars) {
0977: putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
0978: }
0979:
0980: /**
0981: * @return Credential avatars. Null if none set.
0982: */
0983: @SuppressWarnings("unchecked")
0984: public Set<CredentialAvatar> getCredentialAvatars() {
0985: return (Set) getObject(A_CREDENTIAL_AVATARS_KEY);
0986: }
0987:
0988: /**
0989: * @return True if there are avatars attached to this instance.
0990: */
0991: public boolean hasCredentialAvatars() {
0992: return getCredentialAvatars() != null
0993: && getCredentialAvatars().size() > 0;
0994: }
0995:
0996: /**
0997: * Add an avatar.
0998: *
0999: * We do lazy instantiation.
1000: *
1001: * @param ca Credential avatar to add to set of avatars.
1002: */
1003: public void addCredentialAvatar(CredentialAvatar ca) {
1004: Set<CredentialAvatar> avatars = getCredentialAvatars();
1005: if (avatars == null) {
1006: avatars = new HashSet<CredentialAvatar>();
1007: setCredentialAvatars(avatars);
1008: }
1009: avatars.add(ca);
1010: }
1011:
1012: /**
1013: * Remove all credential avatars from this crawl uri.
1014: */
1015: public void removeCredentialAvatars() {
1016: if (hasCredentialAvatars()) {
1017: remove(A_CREDENTIAL_AVATARS_KEY);
1018: }
1019: }
1020:
1021: /**
1022: * Remove all credential avatars from this crawl uri.
1023: * @param ca Avatar to remove.
1024: * @return True if we removed passed parameter. False if no operation
1025: * performed.
1026: */
1027: public boolean removeCredentialAvatar(CredentialAvatar ca) {
1028: boolean result = false;
1029: Set avatars = getCredentialAvatars();
1030: if (avatars != null && avatars.size() > 0) {
1031: result = avatars.remove(ca);
1032: }
1033: return result;
1034: }
1035:
1036: /**
1037: * Ask this URI if it was a success or not.
1038: *
1039: * Only makes sense to call this method after execution of
1040: * HttpMethod#execute. Regard any status larger then 0 as success
1041: * except for below caveat regarding 401s. Use {@link #is2XXSuccess()} if
1042: * looking for a status code in the 200 range.
1043: *
1044: * <p>401s caveat: If any rfc2617 credential data present and we got a 401
1045: * assume it got loaded in FetchHTTP on expectation that we're to go around
1046: * the processing chain again. Report this condition as a failure so we
1047: * get another crack at the processing chain only this time we'll be making
1048: * use of the loaded credential data.
1049: *
1050: * @return True if ths URI has been successfully processed.
1051: * @see #is2XXSuccess()
1052: */
1053: public boolean isSuccess() {
1054: boolean result = false;
1055: int statusCode = this .fetchStatus;
1056: if (statusCode == HttpStatus.SC_UNAUTHORIZED
1057: && hasRfc2617CredentialAvatar()) {
1058: result = false;
1059: } else {
1060: result = (statusCode > 0);
1061: }
1062: return result;
1063: }
1064:
1065: /**
1066: * @return True if status code is in the 2xx range.
1067: * @see #isSuccess()
1068: */
1069: public boolean is2XXSuccess() {
1070: return this .fetchStatus >= 200 && this .fetchStatus < 300;
1071: }
1072:
1073: /**
1074: * @return True if we have an rfc2617 payload.
1075: */
1076: public boolean hasRfc2617CredentialAvatar() {
1077: boolean result = false;
1078: Set avatars = getCredentialAvatars();
1079: if (avatars != null && avatars.size() > 0) {
1080: for (Iterator i = avatars.iterator(); i.hasNext();) {
1081: if (((CredentialAvatar) i.next())
1082: .match(Rfc2617Credential.class)) {
1083: result = true;
1084: break;
1085: }
1086: }
1087: }
1088: return result;
1089: }
1090:
1091: /**
1092: * Set whether this URI should be fetched by sending a HTTP POST request.
1093: * Else a HTTP GET request will be used.
1094: *
1095: * @param b Set whether this curi is to be POST'd. Else its to be GET'd.
1096: */
1097: public void setPost(boolean b) {
1098: this .post = b;
1099: }
1100:
1101: /**
1102: * Returns true if this URI should be fetched by sending a HTTP POST request.
1103: *
1104: *
1105: * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1106: * is one place to go to find out if get http, post http, ftp, dns.
1107: *
1108: * @return Returns is this CrawlURI instance is to be posted.
1109: */
1110: public boolean isPost() {
1111: return this .post;
1112: }
1113:
1114: /**
1115: * Set the retained content-digest value (usu. SHA1).
1116: *
1117: * @param digestValue
1118: * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1119: */
1120: public void setContentDigest(byte[] digestValue) {
1121: setContentDigest("SHA1", digestValue);
1122: }
1123:
/**
 * Set the retained content-digest value and the scheme (e.g. "SHA1")
 * that was used to compute it.
 *
 * @param scheme Name of the digest scheme used.
 * @param digestValue Raw digest bytes.
 */
public void setContentDigest(final String scheme,
        final byte[] digestValue) {
    this .contentDigest = digestValue;
    this .contentDigestScheme = scheme;
}
1129:
1130: public String getContentDigestSchemeString() {
1131: if (this .contentDigest == null) {
1132: return null;
1133: }
1134: return this .contentDigestScheme + ":"
1135: + getContentDigestString();
1136: }
1137:
1138: /**
1139: * Return the retained content-digest value, if any.
1140: *
1141: * @return Digest value.
1142: */
1143: public Object getContentDigest() {
1144: return contentDigest;
1145: }
1146:
1147: public String getContentDigestString() {
1148: if (this .contentDigest == null) {
1149: return null;
1150: }
1151: return Base32.encode(this .contentDigest);
1152: }
1153:
// 'Holder' object and key assigned by an enclosing/queueing facility
// (e.g. the Frontier); transient, so not serialized with the URI.
transient Object holder;
transient Object holderKey;
1156:
1157: /**
1158: * Remember a 'holder' to which some enclosing/queueing
1159: * facility has assigned this CrawlURI
1160: * .
1161: * @param obj
1162: */
1163: public void setHolder(Object obj) {
1164: holder = obj;
1165: }
1166:
1167: /**
1168: * Return the 'holder' for the convenience of
1169: * an external facility.
1170: *
1171: * @return holder
1172: */
1173: public Object getHolder() {
1174: return holder;
1175: }
1176:
1177: /**
1178: * Remember a 'holderKey' which some enclosing/queueing
1179: * facility has assigned this CrawlURI
1180: * .
1181: * @param obj
1182: */
1183: public void setHolderKey(Object obj) {
1184: holderKey = obj;
1185: }
1186:
1187: /**
1188: * Return the 'holderKey' for convenience of
1189: * an external facility (Frontier).
1190: *
1191: * @return holderKey
1192: */
1193: public Object getHolderKey() {
1194: return holderKey;
1195: }
1196:
1197: /**
1198: * Get the ordinal (serial number) assigned at creation.
1199: *
1200: * @return ordinal
1201: */
1202: public long getOrdinal() {
1203: return ordinal;
1204: }
1205:
/** Spot for an integer cost to be placed by external facility (frontier).
 * Cost is truncated to 8 bits at times, so should not exceed 255.
 * Initialized to UNCALCULATED until the frontier assigns a value. */
int holderCost = UNCALCULATED;
1209:
1210: /**
1211: * Return the 'holderCost' for convenience of external facility (frontier)
1212: * @return value of holderCost
1213: */
1214: public int getHolderCost() {
1215: return holderCost;
1216: }
1217:
1218: /**
1219: * Remember a 'holderCost' which some enclosing/queueing
1220: * facility has assigned this CrawlURI
1221: * @param cost value to remember
1222: */
1223: public void setHolderCost(int cost) {
1224: holderCost = cost;
1225: }
1226:
1227: /**
1228: * All discovered outbound Links (navlinks, embeds, etc.)
1229: * Can either contain Link instances or CandidateURI instances, or both.
1230: * The LinksScoper processor converts Link instances in this collection
1231: * to CandidateURI instances.
1232: */
1233: transient Collection<Object> outLinks = new HashSet<Object>();
1234:
1235: /**
1236: * Returns discovered links. The returned collection might be empty if
1237: * no links were discovered, or if something like LinksScoper promoted
1238: * the links to CandidateURIs.
1239: *
1240: * Elements can be removed from the returned collection, but not added.
1241: * To add a discovered link, use one of the createAndAdd methods or
1242: * {@link #getOutObjects()}.
1243: *
1244: * @return Collection of all discovered outbound Links
1245: */
1246: public Collection<Link> getOutLinks() {
1247: return Transform.subclasses(outLinks, Link.class);
1248: }
1249:
1250: /**
1251: * Returns discovered candidate URIs. The returned collection will be
1252: * emtpy until something like LinksScoper promotes discovered Links
1253: * into CandidateURIs.
1254: *
1255: * Elements can be removed from the returned collection, but not added.
1256: * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1257: * {@link #getOutObjects}.
1258: *
1259: * @return Collection of candidate URIs
1260: */
1261: public Collection<CandidateURI> getOutCandidates() {
1262: return Transform.subclasses(outLinks, CandidateURI.class);
1263: }
1264:
1265: /**
1266: * Returns all of the outbound objects. The returned Collection will
1267: * contain Link instances, or CandidateURI instances, or both.
1268: *
1269: * @return the collection of Links and/or CandidateURIs
1270: */
1271: public Collection<Object> getOutObjects() {
1272: return outLinks;
1273: }
1274:
1275: /**
1276: * Add a discovered Link, unless it would exceed the max number
1277: * to accept. (If so, increment discarded link counter.)
1278: *
1279: * @param link the Link to add
1280: */
1281: public void addOutLink(Link link) {
1282: if (outLinks.size() < MAX_OUTLINKS) {
1283: outLinks.add(link);
1284: } else {
1285: // note & discard
1286: discardedOutlinks++;
1287: }
1288: }
1289:
/**
 * Discard all discovered outbound Links/CandidateURIs held so far.
 */
public void clearOutlinks() {
    this .outLinks.clear();
}
1293:
1294: /**
1295: * Replace current collection of links w/ passed list.
1296: * Used by Scopers adjusting the list of links (removing those
1297: * not in scope and promoting Links to CandidateURIs).
1298: *
1299: * @param a collection of CandidateURIs replacing any previously
1300: * existing outLinks or outCandidates
1301: */
1302: public void replaceOutlinks(Collection<CandidateURI> links) {
1303: clearOutlinks();
1304: this .outLinks.addAll(links);
1305: }
1306:
1307: /**
1308: * @return Count of outlinks.
1309: */
1310: public int outlinksSize() {
1311: return this .outLinks.size();
1312: }
1313:
1314: /**
1315: * Convenience method for creating a Link discovered at this URI
1316: * with the given string and context
1317: *
1318: * @param url
1319: * String to use to create Link
1320: * @param context
1321: * CharSequence context to use
1322: * @param hopType
1323: * @return Link.
1324: * @throws URIException
1325: * if Link UURI cannot be constructed
1326: */
1327: public Link createLink(String url, CharSequence context,
1328: char hopType) throws URIException {
1329: return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1330: url), context, hopType);
1331: }
1332:
1333: /**
1334: * Convenience method for creating a Link with the given string and
1335: * context
1336: *
1337: * @param url
1338: * String to use to create Link
1339: * @param context
1340: * CharSequence context to use
1341: * @param hopType
1342: * @throws URIException
1343: * if Link UURI cannot be constructed
1344: */
1345: public void createAndAddLink(String url, CharSequence context,
1346: char hopType) throws URIException {
1347: addOutLink(createLink(url, context, hopType));
1348: }
1349:
1350: /**
1351: * Convenience method for creating a Link with the given string and
1352: * context, relative to a previously set base HREF if available (or
1353: * relative to the current CrawlURI if no other base has been set)
1354: *
1355: * @param url String URL to add as destination of link
1356: * @param context String context where link was discovered
1357: * @param hopType char hop-type indicator
1358: * @throws URIException
1359: */
1360: public void createAndAddLinkRelativeToBase(String url,
1361: CharSequence context, char hopType) throws URIException {
1362: addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1363: getBaseURI(), url), context, hopType));
1364: }
1365:
1366: /**
1367: * Convenience method for creating a Link with the given string and
1368: * context, relative to this CrawlURI's via UURI if available. (If
1369: * a via is not available, falls back to using
1370: * #createAndAddLinkRelativeToBase.)
1371: *
1372: * @param url String URL to add as destination of link
1373: * @param context String context where link was discovered
1374: * @param hopType char hop-type indicator
1375: * @throws URIException
1376: */
1377: public void createAndAddLinkRelativeToVia(String url,
1378: CharSequence context, char hopType) throws URIException {
1379: if (getVia() != null) {
1380: addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1381: getVia(), url), context, hopType));
1382: } else {
1383: // if no 'via', fall back to base/self
1384: createAndAddLinkRelativeToBase(url, context, hopType);
1385: }
1386: }
1387:
1388: /**
1389: * Set the (HTML) Base URI used for derelativizing internal URIs.
1390: *
1391: * @param baseHref String base href to use
1392: * @throws URIException if supplied string cannot be interpreted as URI
1393: */
1394: public void setBaseURI(String baseHref) throws URIException {
1395: putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1396: }
1397:
1398: /**
1399: * Get the (HTML) Base URI used for derelativizing internal URIs.
1400: *
1401: * @return UURI base URI previously set
1402: */
1403: public UURI getBaseURI() {
1404: if (!containsKey(A_HTML_BASE)) {
1405: return getUURI();
1406: }
1407: return (UURI) getObject(A_HTML_BASE);
1408: }
1409:
1410: /**
1411: * Add the key of alist items you want to persist across
1412: * processings.
1413: * @param key Key to add.
1414: */
1415: public static void addAlistPersistentMember(Object key) {
1416: alistPersistentMember.add(key);
1417: }
1418:
1419: /**
1420: * @param key Key to remove.
1421: * @return True if list contained the element.
1422: */
1423: public static boolean removeAlistPersistentMember(Object key) {
1424: return alistPersistentMember.remove(key);
1425: }
1426:
1427: /**
1428: * Custom serialization writing an empty 'outLinks' as null. Estimated
1429: * to save ~20 bytes in serialized form.
1430: *
1431: * @param stream
1432: * @throws IOException
1433: */
1434: private void writeObject(ObjectOutputStream stream)
1435: throws IOException {
1436: stream.defaultWriteObject();
1437: stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1438: }
1439:
1440: /**
1441: * Custom deserialization recreating empty HashSet from null in 'outLinks'
1442: * slot.
1443: *
1444: * @param stream
1445: * @throws IOException
1446: * @throws ClassNotFoundException
1447: */
1448: private void readObject(ObjectInputStream stream)
1449: throws IOException, ClassNotFoundException {
1450: stream.defaultReadObject();
1451: @SuppressWarnings("unchecked")
1452: HashSet<Object> ol = (HashSet<Object>) stream.readObject();
1453: outLinks = (ol == null) ? new HashSet<Object>() : ol;
1454: }
1455:
1456: }
|