| org.archive.crawler.extractor.Extractor org.archive.crawler.extractor.ExtractorHTML org.archive.crawler.extractor.AggressiveExtractorHTML
AggressiveExtractorHTML | public class AggressiveExtractorHTML extends ExtractorHTML (Code) | | Extended version of ExtractorHTML with more aggressive javascript link
extraction where javascript code is parsed first with general HTML tags
regexp, and then by javascript speculative link regexp.
author: Igor Ranitovic |
AggressiveExtractorHTML | public AggressiveExtractorHTML(String name)(Code) | | |
Methods inherited from org.archive.crawler.extractor.ExtractorHTML | public void extract(CrawlURI curi)(Code)(Java Doc) void extract(CrawlURI curi, CharSequence cs)(Code)(Java Doc) protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException(Code)(Java Doc) final protected void processEmbed(CrawlURI curi, CharSequence value, CharSequence context)(Code)(Java Doc) protected void processEmbed(CrawlURI curi, CharSequence value, CharSequence context, char hopType)(Code)(Java Doc) protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs)(Code)(Java Doc) protected void processLink(CrawlURI curi, CharSequence value, CharSequence context)(Code)(Java Doc) protected boolean processMeta(CrawlURI curi, CharSequence cs)(Code)(Java Doc) protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag)(Code)(Java Doc) protected void processScriptCode(CrawlURI curi, CharSequence cs)(Code)(Java Doc) protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag)(Code)(Java Doc) public String report()(Code)(Java Doc)
|
|
|