| java.lang.Object org.archive.crawler.datamodel.CandidateURI
All known Subclasses: org.archive.crawler.datamodel.CrawlURI,
Field Summary | |
final public static int | HIGH High scheduling priority. | final public static int | HIGHEST Highest scheduling priority. | final public static int | MEDIUM Medium priority. | final public static int | NORMAL Normal/low priority. |
Method Summary | |
protected void | clearAList() | public boolean | containsKey(String key) | public CandidateURI | createCandidateURI(UURI baseUURI, Link link) Utility method for creation of CandidateURIs found extracting
links from this CrawlURI.
Parameters: baseUURI - BaseUURI for link . Parameters: link - Link to wrap CandidateURI in. | public CandidateURI | createCandidateURI(UURI baseUURI, Link link, int scheduling, boolean seed) Utility method for creation of CandidateURIs found extracting
links from this CrawlURI.
Parameters: baseUURI - BaseUURI for link . Parameters: link - Link to wrap CandidateURI in. Parameters: scheduling - How new CandidateURI should be scheduled. Parameters: seed - True if this CandidateURI is a seed. | public static CandidateURI | createSeedCandidateURI(UURI uuri) | public String | flattenVia() Method returns string version of this URI's referral URI. | public boolean | forceFetch() If this method returns true, this URI should be fetched even though
it already has been crawled. | public static CandidateURI | fromString(String uriHopsViaString) Given a string containing a URI, then optional whitespace
delimited hops-path and via info, create a CandidateURI
instance.
Parameters: uriHopsViaString - String with a URI. | public AList | getAList() Assumption is that only one thread at a time will ever be accessing
a particular CandidateURI. | public synchronized String | getCandidateURIString() | public String | getClassKey() Get the token (usually the hostname + port) which indicates
what "class" this CrawlURI should be grouped with,
for the purposes of ensuring only one item of the
class is processed at once, all items of the class
are held for a politeness period, etc. | public int | getInt(String key) | public long | getLong(String key) | public Object | getObject(String key) | public String | getPathFromSeed() | public String[] | getReports() | public int | getSchedulingDirective() | public String | getString(String key) | public int | getTransHops() Tally up the number of transitive (non-simple-link) hops at
the end of this CandidateURI's pathFromSeed.
In some cases, URIs with greater than zero but less than some
threshold such hops are treated specially. | public String | getURIString() | public UURI | getUURI() | public UURI | getVia() | public CharSequence | getViaContext() | protected void | inheritFrom(CandidateURI ancestor) Inherit (copy) the relevant keys-values from the ancestor. | public boolean | isLocation() True if this CandidateURI was result of a redirect:i.e. | public boolean | isSeed() | public Iterator | keys() | public void | makeHeritable(String key) Make the given key 'heritable', meaning its value will be
added to descendant CandidateURIs. | public void | makeNonHeritable(String key) Make the given key non-'heritable', meaning its value will
not be added to descendant CandidateURIs. | public boolean | needsImmediateScheduling() | public boolean | needsSoonScheduling() | public void | putInt(String key, int value) | public void | putLong(String key, long value) | public void | putObject(String key, Object value) | public void | putString(String key, String value) | protected UURI | readUuri(String u) | public void | remove(String key) | public void | reportTo(String name, PrintWriter writer) | public void | reportTo(PrintWriter writer) | public boolean | sameDomainAs(CandidateURI other) | protected void | setAList(AList alist) Called when making a copy of another CandidateURI. | public void | setClassKey(String key) | public void | setForceFetch(boolean b) Method to signal that this URI should be fetched even though
it already has been crawled. | public void | setIsSeed(boolean b) Set the isSeed attribute of this URI. | protected void | setPathFromSeed(String string) | public void | setSchedulingDirective(int schedulingDirective) | public void | setVia(UURI via) | public String | singleLineLegend() | public String | singleLineReport() | public void | singleLineReportTo(PrintWriter w) | public String | toString() |
HIGHEST | final public static int HIGHEST(Code) | | Highest scheduling priority.
Before any others of its class.
|
NORMAL | final public static int NORMAL(Code) | | Normal/low priority.
Whenever/end of queue.
|
CandidateURI | protected CandidateURI()(Code) | | Constructor.
Protected access to block access to default constructor.
|
CandidateURI | public CandidateURI(UURI u)(Code) | | Parameters: u - uuri instance this CandidateURI wraps. |
CandidateURI | public CandidateURI(UURI u, String pathFromSeed, UURI via, CharSequence viaContext)(Code) | | Parameters: u - uuri instance this CandidateURI wraps. Parameters: pathFromSeed - Parameters: via - Parameters: viaContext - |
clearAList | protected void clearAList()(Code) | | |
createCandidateURI | public CandidateURI createCandidateURI(UURI baseUURI, Link link) throws URIException(Code) | | Utility method for creation of CandidateURIs found extracting
links from this CrawlURI.
Parameters: baseUURI - BaseUURI for link. Parameters: link - Link to wrap CandidateURI in. Returns: New CandidateURI wrapper around link. throws: URIException - |
createCandidateURI | public CandidateURI createCandidateURI(UURI baseUURI, Link link, int scheduling, boolean seed) throws URIException(Code) | | Utility method for creation of CandidateURIs found extracting
links from this CrawlURI.
Parameters: baseUURI - BaseUURI for link. Parameters: link - Link to wrap CandidateURI in. Parameters: scheduling - How new CandidateURI should be scheduled. Parameters: seed - True if this CandidateURI is a seed. Returns: New CandidateURI wrapper around link. throws: URIException - |
flattenVia | public String flattenVia()(Code) | | Method returns string version of this URI's referral URI.
String version of referral URI |
forceFetch | public boolean forceFetch()(Code) | | If this method returns true, this URI should be fetched even though
it already has been crawled. This also implies
that this URI will be scheduled for crawl before any other waiting
URIs for the same host.
This value is used to refetch any expired robots.txt or dns-lookups.
true if crawling of this URI should be forced |
fromString | public static CandidateURI fromString(String uriHopsViaString) throws URIException(Code) | | Given a string containing a URI, then optional whitespace
delimited hops-path and via info, create a CandidateURI
instance.
Parameters: uriHopsViaString - String with a URI. Returns: A CandidateURI made from passed uriHopsViaString. throws: URIException - |
getAList | public AList getAList()(Code) | | Assumption is that only one thread at a time will ever be accessing
a particular CandidateURI.
the attribute list. |
getCandidateURIString | public synchronized String getCandidateURIString()(Code) | | This candidate URI as a string wrapped with 'CandidateURI(' +')'. |
getClassKey | public String getClassKey()(Code) | | Get the token (usually the hostname + port) which indicates
what "class" this CrawlURI should be grouped with,
for the purposes of ensuring only one item of the
class is processed at once, all items of the class
are held for a politeness period, etc.
Token (usually the hostname) which indicates what "class" this CrawlURI should be grouped with. |
getPathFromSeed | public String getPathFromSeed()(Code) | | path (hop-types) from seed |
getSchedulingDirective | public int getSchedulingDirective()(Code) | | Returns the schedulingDirective. |
getTransHops | public int getTransHops()(Code) | | Tally up the number of transitive (non-simple-link) hops at
the end of this CandidateURI's pathFromSeed.
In some cases, URIs with greater than zero but less than some
threshold such hops are treated specially.
TODO: consider moving link-count in here as well, caching
calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
Transhop count. |
getVia | public UURI getVia()(Code) | | URI via which this one was discovered |
getViaContext | public CharSequence getViaContext()(Code) | | CharSequence context in which this one was discovered |
inheritFrom | protected void inheritFrom(CandidateURI ancestor)(Code) | | Inherit (copy) the relevant keys-values from the ancestor.
Parameters: ancestor - |
isLocation | public boolean isLocation()(Code) | | True if this CandidateURI was the result of a redirect, i.e. its parent URI redirected to here and this URI was what was in the 'Location:' or 'Content-Location:' HTTP header. |
isSeed | public boolean isSeed()(Code) | | Whether seeded. |
makeHeritable | public void makeHeritable(String key)(Code) | | Make the given key 'heritable', meaning its value will be
added to descendant CandidateURIs. Only keys with immutable
values should be made heritable -- the value instance may
be shared until the AList is serialized/deserialized.
Parameters: key - to make heritable |
makeNonHeritable | public void makeNonHeritable(String key)(Code) | | Make the given key non-'heritable', meaning its value will
not be added to descendant CandidateURIs. Only meaningful if
key was previously made heritable.
Parameters: key - to make non-heritable |
needsImmediateScheduling | public boolean needsImmediateScheduling()(Code) | | True if needs immediate scheduling. |
needsSoonScheduling | public boolean needsSoonScheduling()(Code) | | True if needs soon but not top scheduling. |
readUuri | protected UURI readUuri(String u)(Code) | | Read a UURI from a String, handling a null or URIException.
Parameters: u - String or null from which to create UURI. Returns: the best UURI instance creatable. |
sameDomainAs | public boolean sameDomainAs(CandidateURI other) throws URIException(Code) | | Compares the domain of this CandidateURI with that of another
CandidateURI
Parameters: other - The other CandidateURI. Returns: True if both are in the same domain, false otherwise. throws: URIException - |
setAList | protected void setAList(AList alist)(Code) | | Called when making a copy of another CandidateURI.
Parameters: alist - AList to use. |
setForceFetch | public void setForceFetch(boolean b)(Code) | | Method to signal that this URI should be fetched even though
it already has been crawled. Setting this to true also implies
that this URI will be scheduled for crawl before any other waiting
URIs for the same host.
This value is used to refetch any expired robots.txt or dns-lookups.
Parameters: b - set to true to enforce the crawling of this URI |
setIsSeed | public void setIsSeed(boolean b)(Code) | | Set the isSeed attribute of this URI.
Parameters: b - Is this URI a seed, true or false. |
setPathFromSeed | protected void setPathFromSeed(String string)(Code) | | Parameters: string - |
setSchedulingDirective | public void setSchedulingDirective(int schedulingDirective)(Code) | | Parameters: schedulingDirective - The schedulingDirective to set. |
|
|