/* ExtractorHTMLTest
 *
 * Created on May 19, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.extractor;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Iterator;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.TmpDirTestCase;

/**
 * Test the HTML extractor.
 *
 * @author stack
 * @version $Revision: 3842 $, $Date: 2005-09-22 23:03:13 +0000 (Thu, 22 Sep 2005) $
 */
public class ExtractorHTMLTest extends TmpDirTestCase implements
        CoreAttributeConstants {
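    // Test fixture: ARCHIVE_DOT_ORG names the fake host under test and the
    // local HTML file written by setUp(); LINK_TO_FIND is the single anchor
    // that file contains and that extraction is expected to recover.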
    private final String ARCHIVE_DOT_ORG = "archive.org";
    private final String LINK_TO_FIND = "http://www.hewlett.org/";
    private HttpRecorder recorder = null;
    private ExtractorHTML extractor = null;

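    /**
     * Create an ExtractorHTML backed by a throwaway settings handler.
     * See the comment in the method body for why the extractor is
     * registered on the order file rather than constructed standalone.
     */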
    protected ExtractorHTML createExtractor()
            throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException {
        // Hack in a settings handler. Do this by adding this extractor
        // to the order file (I'm adding it to a random MapType; seemingly
        // can only add to MapTypes post-construction). This takes care
        // of setting a valid SettingsHandler into the ExtractorHTML (This
        // shouldn't be so difficult). Of note, the order file below is
        // not written to disk.
        final String name = this.getClass().getName();
        SettingsHandler handler = new XMLSettingsHandler(new File(
                getTmpDir(), name + ".order.xml"));
        handler.initialize();
        return (ExtractorHTML) ((MapType) handler.getOrder()
                .getAttribute(CrawlOrder.ATTR_RULES)).addElement(
                handler.getSettingsObject(null),
                new ExtractorHTML(name));
    }

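    /**
     * Set up the fixture: either fetch http://archive.org live (when
     * USE_NET is flipped to true) or, by default, write a minimal local
     * page containing a single link to LINK_TO_FIND, then wrap the page's
     * stream in an HttpRecorder for the extractor to read.
     */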
    protected void setUp() throws Exception {
        super.setUp();
        this.extractor = createExtractor();
        final boolean USE_NET = false;
        URL url = null;
        if (USE_NET) {
            url = new URL("http://" + this.ARCHIVE_DOT_ORG);
        } else {
            File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG
                    + ".html");
            url = new URL("file://" + f.getAbsolutePath());
            FileOutputStream fos = new FileOutputStream(f);
            fos.write(("<html><head><title>test</title></head><body>"
                    + "<a href=" + this.LINK_TO_FIND
                    + ">Hewlett Foundation</a>" + "</body></html>")
                    .getBytes());
            fos.flush();
            fos.close();
        }
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
                getTmpDir(), this.getClass().getName(), url
                        .openStream(), null);
    }

    /*
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }

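    /**
     * Run the extractor over the recorded page and verify that the one
     * link it contains, LINK_TO_FIND, shows up among the out-links.
     * @throws IOException
     */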
    public void testInnerProcess() throws IOException {
        UURI uuri = UURIFactory.getInstance("http://"
                + this.ARCHIVE_DOT_ORG);
        CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
        this.extractor.innerProcess(curi);
        Collection links = curi.getOutLinks();
        boolean foundLinkToHewlettFoundation = false;
        for (Iterator i = links.iterator(); i.hasNext();) {
            Link link = (Link) i.next();
            if (link.getDestination().toString().equals(
                    this.LINK_TO_FIND)) {
                foundLinkToHewlettFoundation = true;
                break;
            }
        }
        assertTrue("Did not find link to Hewlett Foundation",
                foundLinkToHewlettFoundation);
    }

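    /**
     * Build a CrawlURI that looks like a successful HTML fetch, backed by
     * the passed recorder, so the extractor will agree to process it.
     */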
    private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
            throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
        curi.setContentSize(rec.getRecordedInput().getSize());
        curi.setContentType("text/html");
        curi.setFetchStatus(200);
        curi.setHttpRecorder(rec);
        // Fake out the extractor that this is a HTTP transaction.
        curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
                new Object());
        return curi;
    }

    /**
     * Test parse of a single page fetched from the net or from the local
     * filesystem. Set the uuri below to a net URL, or instead put in place
     * a file named for this class under the unit test directory.
     * @throws IOException
     * @throws ReflectionException
     * @throws MBeanException
     * @throws AttributeNotFoundException
     * @throws InvalidAttributeValueException
     */
    public void testPageParse() throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException, IOException {
        UURI uuri = null;

        // DO
        // uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
        // OR
        // File f = new File(getTmpDir(), this.getClass().getName() +
        //     ".html");
        // if (f.exists()) {
        //     uuri = UURIFactory.getInstance("file://" +
        //         f.getAbsolutePath());
        // }
        // OR
        // uuri = getUURI(URL or PATH)
        //
        // OR
        // Use the main method below and pass this class an argument.
        //
        if (uuri != null) {
            runExtractor(uuri);
        }
    }

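    /**
     * Turn the passed string into a UURI, treating anything without a
     * scheme ("://") as a local filesystem path.
     */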
    protected UURI getUURI(String url) throws URIException {
        url = (url.indexOf("://") > 0) ? url : "file://" + url;
        return UURIFactory.getInstance(url);
    }

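    /**
     * Convenience overload that runs the extractor with the platform
     * default encoding.
     */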
    protected void runExtractor(UURI baseUURI)
            throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException, IOException {
        runExtractor(baseUURI, null);
    }

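    /**
     * Fetch the passed URI, run the extractor over the recorded content,
     * and print the extracted links to stdout grouped by hop type
     * (navigation links, embeds, speculative embeds, everything else),
     * followed by a total count.
     */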
    protected void runExtractor(UURI baseUURI, String encoding)
            throws IOException, InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException {
        if (baseUURI == null) {
            return;
        }
        this.extractor = createExtractor();
        URL url = new URL(baseUURI.toString());
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
                getTmpDir(), this.getClass().getName(), url
                        .openStream(), encoding);
        CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
        this.extractor.innerProcess(curi);

        System.out.println("+" + this.extractor.report());
        int count = 0;
        Collection links = curi.getOutLinks();
        System.out.println("+HTML Links (hopType=" + Link.NAVLINK_HOP
                + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.NAVLINK_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Embeds (hopType=" + Link.EMBED_HOP
                + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.EMBED_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Speculative Embeds (hopType="
                + Link.SPECULATIVE_HOP + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.SPECULATIVE_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Other (all other hopTypes):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() != Link.SPECULATIVE_HOP
                        && link.getHopType() != Link.NAVLINK_HOP
                        && link.getHopType() != Link.EMBED_HOP) {
                    count++;
                    System.out.println(link.getHopType() + " "
                            + link.getDestination());
                }
            }
        }
        System.out.println("TOTAL URIS EXTRACTED: " + count);
    }

    /**
     * Test a particular &lt;embed src=...&gt; construct that was suspicious
     * in the No10GovUk crawl.
     *
     * @throws URIException
     */
    public void testEmbedSrc() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.org"));
        // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
        CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/"
                + "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" "
                + "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/"
                + "quicktime/download/\" /> ";
        this.extractor.extract(curi, cs);
        assertTrue(CollectionUtils.exists(curi.getOutLinks(),
                new Predicate() {
                    public boolean evaluate(Object object) {
                        return ((Link) object)
                                .getDestination()
                                .toString()
                                .indexOf(
                                        "/documents/prem/18/1/graphics/qtvr/hall.mov") >= 0;
                    }
                }));
    }

    /**
     * Test a whitespace issue found in href.
     *
     * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
     * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
     *
     * @throws URIException
     */
    public void testHrefWhitespace() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.carsound.dk"));
        CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n"
                + "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
        this.extractor.extract(curi, cs);
        assertTrue("Not stripping new lines", CollectionUtils.exists(
                curi.getOutLinks(), new Predicate() {
                    public boolean evaluate(Object object) {
                        return ((Link) object).getDestination()
                                .toString().indexOf(
                                        "http://www.carsound.dk/") >= 0;
                    }
                }));
    }

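    /**
     * Command-line entry point: run the extractor over a single URL or
     * local file and dump the extracted links, e.g. (hypothetical path):
     * <pre>
     * java org.archive.crawler.extractor.ExtractorHTMLTest /tmp/page.html UTF-8
     * </pre>
     */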
    public static void main(String[] args) throws Exception {
        if (args.length != 1 && args.length != 2) {
            System.err.println("Usage: "
                    + ExtractorHTMLTest.class.getName()
                    + " URL|PATH [ENCODING]");
            System.exit(1);
        }
        ExtractorHTMLTest testCase = new ExtractorHTMLTest();
        testCase.setUp();
        try {
            testCase.runExtractor(testCase.getUURI(args[0]),
                    (args.length == 2) ? args[1] : null);
        } finally {
            testCase.tearDown();
        }
    }
}
|