001: /* JerichoExtractorHTMLTest
002: *
003: * Copyright (C) 2006 Olaf Freyer
004: *
005: * This file is part of the Heritrix web crawler (crawler.archive.org).
006: *
007: * Heritrix is free software; you can redistribute it and/or modify
008: * it under the terms of the GNU Lesser Public License as published by
009: * the Free Software Foundation; either version 2.1 of the License, or
010: * any later version.
011: *
012: * Heritrix is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
015: * GNU Lesser Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser Public License
018: * along with Heritrix; if not, write to the Free Software
019: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
020: *
021: *
022: */
023: package org.archive.crawler.extractor;
024:
025: import java.io.File;
026: import java.io.FileOutputStream;
027: import java.io.IOException;
028: import java.net.URL;
029: import java.util.Collection;
030: import java.util.Iterator;
031:
032: import javax.management.AttributeNotFoundException;
033: import javax.management.InvalidAttributeValueException;
034: import javax.management.MBeanException;
035: import javax.management.ReflectionException;
036:
037: import org.apache.commons.collections.CollectionUtils;
038: import org.apache.commons.collections.Predicate;
039: import org.apache.commons.httpclient.URIException;
040: import org.archive.crawler.datamodel.CoreAttributeConstants;
041: import org.archive.crawler.datamodel.CrawlOrder;
042: import org.archive.crawler.datamodel.CrawlURI;
043: import org.archive.crawler.settings.MapType;
044: import org.archive.crawler.settings.SettingsHandler;
045: import org.archive.crawler.settings.XMLSettingsHandler;
046: import org.archive.net.UURI;
047: import org.archive.net.UURIFactory;
048: import org.archive.util.HttpRecorder;
049:
050: /**
051: * Test html extractor.
052: *
053: * @author stack
054: * @version $Revision: 4703 $, $Date: 2006-10-18 15:26:56 +0000 (Wed, 18 Oct 2006) $
055: */
056: public class JerichoExtractorHTMLTest extends ExtractorHTMLTest
057: implements CoreAttributeConstants {
058: private final String ARCHIVE_DOT_ORG = "archive.org";
059: private final String LINK_TO_FIND = "http://www.hewlett.org/";
060: private HttpRecorder recorder = null;
061: private JerichoExtractorHTML extractor = null;
062:
063: protected JerichoExtractorHTML createExtractor()
064: throws InvalidAttributeValueException,
065: AttributeNotFoundException, MBeanException,
066: ReflectionException {
067: // Hack in a settings handler. Do this by adding this extractor
068: // to the order file (I'm adding it to a random MapType; seemingly
069: // can only add to MapTypes post-construction). This takes care
070: // of setting a valid SettingsHandler into the ExtractorHTML (This
071: // shouldn't be so difficult). Of note, the order file below is
072: // not written to disk.
073: final String name = this .getClass().getName();
074: SettingsHandler handler = new XMLSettingsHandler(new File(
075: getTmpDir(), name + ".order.xml"));
076: handler.initialize();
077: return (JerichoExtractorHTML) ((MapType) handler.getOrder()
078: .getAttribute(CrawlOrder.ATTR_RULES)).addElement(
079: handler.getSettingsObject(null),
080: new JerichoExtractorHTML(name));
081: }
082:
083: protected void setUp() throws Exception {
084: super .setUp();
085: this .extractor = createExtractor();
086: final boolean USE_NET = false;
087: URL url = null;
088: if (USE_NET) {
089: url = new URL("http://" + this .ARCHIVE_DOT_ORG);
090: } else {
091: File f = new File(getTmpDir(), this .ARCHIVE_DOT_ORG
092: + ".html");
093: url = new URL("file://" + f.getAbsolutePath());
094: FileOutputStream fos = new FileOutputStream(f);
095: fos.write(("<html><head><title>test</title><body>"
096: + "<a href=" + this .LINK_TO_FIND
097: + ">Hewlett Foundation</a>" + "</body></html>")
098: .getBytes());
099: fos.flush();
100: fos.close();
101: }
102: this .recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
103: getTmpDir(), this .getClass().getName(), url
104: .openStream(), null);
105: }
106:
107: public void testInnerProcess() throws IOException {
108: UURI uuri = UURIFactory.getInstance("http://"
109: + this .ARCHIVE_DOT_ORG);
110: CrawlURI curi = setupCrawlURI(this .recorder, uuri.toString());
111: this .extractor.innerProcess(curi);
112: Collection links = curi.getOutLinks();
113: boolean foundLinkToHewlettFoundation = false;
114: for (Iterator i = links.iterator(); i.hasNext();) {
115: Link link = (Link) i.next();
116: if (link.getDestination().toString().equals(
117: this .LINK_TO_FIND)) {
118: foundLinkToHewlettFoundation = true;
119: break;
120: }
121: }
122: assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
123: }
124:
125: private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
126: throws URIException {
127: CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
128: curi.setContentSize(this .recorder.getRecordedInput().getSize());
129: curi.setContentType("text/html");
130: curi.setFetchStatus(200);
131: curi.setHttpRecorder(rec);
132: // Fake out the extractor that this is a HTTP transaction.
133: curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
134: new Object());
135: return curi;
136: }
137:
138: /**
139: * Test a forms link extraction
140: *
141: * @throws URIException
142: */
143: public void testFormsLink() throws URIException {
144: CrawlURI curi = new CrawlURI(UURIFactory
145: .getInstance("http://www.example.org"));
146: CharSequence cs = "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> "
147: + " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "
148: + " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> "
149: + " <select name=\"selectBox\">"
150: + " <option value=\"selectedOption\" selected>option1</option>"
151: + " <option value=\"nonselectedOption\">option2</option>"
152: + " </select>"
153: + " <input type=\"submit\" name=\"test\" value=\"Go\">"
154: + "</form>";
155: this .extractor.extract(curi, cs);
156: curi.getOutLinks();
157: assertTrue(CollectionUtils.exists(curi.getOutLinks(),
158: new Predicate() {
159: public boolean evaluate(Object object) {
160: return ((Link) object)
161: .getDestination()
162: .toString()
163: .indexOf(
164: "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go") >= 0;
165: }
166: }));
167: }
168: }
|