/* ExtractorHTMLTest
 *
 * Created on May 19, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.extractor;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Iterator;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.TmpDirTestCase;

/**
 * Test the HTML extractor.
 *
 * @author stack
 * @version $Revision: 3842 $, $Date: 2005-09-22 23:03:13 +0000 (Thu, 22 Sep 2005) $
 */
public class ExtractorHTMLTest extends TmpDirTestCase implements
        CoreAttributeConstants {
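    // Test fixture: ARCHIVE_DOT_ORG names the fake host under test and the
    // local HTML file written by setUp(); LINK_TO_FIND is the single anchor
    // that file contains and that extraction is expected to recover.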
    private final String ARCHIVE_DOT_ORG = "archive.org";
    private final String LINK_TO_FIND = "http://www.hewlett.org/";
    private HttpRecorder recorder = null;
    private ExtractorHTML extractor = null;

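    /**
     * Create an ExtractorHTML backed by a throwaway settings handler.
     * See the comment in the method body for why the extractor is
     * registered on the order file rather than constructed standalone.
     */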
    protected ExtractorHTML createExtractor()
            throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException {
        // Hack in a settings handler. Do this by adding this extractor
        // to the order file (I'm adding it to a random MapType; seemingly
        // can only add to MapTypes post-construction). This takes care
        // of setting a valid SettingsHandler into the ExtractorHTML (This
        // shouldn't be so difficult). Of note, the order file below is
        // not written to disk.
        final String name = this.getClass().getName();
        SettingsHandler handler = new XMLSettingsHandler(new File(
                getTmpDir(), name + ".order.xml"));
        handler.initialize();
        return (ExtractorHTML) ((MapType) handler.getOrder()
                .getAttribute(CrawlOrder.ATTR_RULES)).addElement(
                handler.getSettingsObject(null),
                new ExtractorHTML(name));
    }

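    /**
     * Set up the fixture: either fetch http://archive.org live (when
     * USE_NET is flipped to true) or, by default, write a minimal local
     * page containing a single link to LINK_TO_FIND, then wrap the page's
     * stream in an HttpRecorder for the extractor to read.
     */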
    protected void setUp() throws Exception {
        super.setUp();
        this.extractor = createExtractor();
        final boolean USE_NET = false;
        URL url = null;
        if (USE_NET) {
            url = new URL("http://" + this.ARCHIVE_DOT_ORG);
        } else {
            File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG
                    + ".html");
            url = new URL("file://" + f.getAbsolutePath());
            FileOutputStream fos = new FileOutputStream(f);
            fos.write(("<html><head><title>test</title></head><body>"
                    + "<a href=" + this.LINK_TO_FIND
                    + ">Hewlett Foundation</a>" + "</body></html>")
                    .getBytes());
            fos.flush();
            fos.close();
        }
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
                getTmpDir(), this.getClass().getName(), url
                        .openStream(), null);
    }

    /*
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }

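    /**
     * Run the extractor over the recorded page and verify that the one
     * link it contains, LINK_TO_FIND, shows up among the out-links.
     * @throws IOException
     */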
    public void testInnerProcess() throws IOException {
        UURI uuri = UURIFactory.getInstance("http://"
                + this.ARCHIVE_DOT_ORG);
        CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
        this.extractor.innerProcess(curi);
        Collection links = curi.getOutLinks();
        boolean foundLinkToHewlettFoundation = false;
        for (Iterator i = links.iterator(); i.hasNext();) {
            Link link = (Link) i.next();
            if (link.getDestination().toString().equals(
                    this.LINK_TO_FIND)) {
                foundLinkToHewlettFoundation = true;
                break;
            }
        }
        assertTrue("Did not find link to Hewlett Foundation",
                foundLinkToHewlettFoundation);
    }

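    /**
     * Build a CrawlURI that looks like a successful HTML fetch, backed by
     * the passed recorder, so the extractor will agree to process it.
     */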
    private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
            throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
        curi.setContentSize(rec.getRecordedInput().getSize());
        curi.setContentType("text/html");
        curi.setFetchStatus(200);
        curi.setHttpRecorder(rec);
        // Fake out the extractor that this is a HTTP transaction.
        curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
                new Object());
        return curi;
    }

    /**
     * Test parse of a single page fetched from the net or from the local
     * filesystem. Set the uuri below to a net URL, or instead put in place
     * a file named for this class under the unit test directory.
     * @throws IOException
     * @throws ReflectionException
     * @throws MBeanException
     * @throws AttributeNotFoundException
     * @throws InvalidAttributeValueException
     */
    public void testPageParse() throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException, IOException {
        UURI uuri = null;

        // DO
        // uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
        // OR
        // File f = new File(getTmpDir(), this.getClass().getName() +
        //     ".html");
        // if (f.exists()) {
        //     uuri = UURIFactory.getInstance("file://" +
        //         f.getAbsolutePath());
        // }
        // OR
        // uuri = getUURI(URL or PATH)
        //
        // OR
        // Use the main method below and pass this class an argument.
        //
        if (uuri != null) {
            runExtractor(uuri);
        }
    }

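    /**
     * Turn the passed string into a UURI, treating anything without a
     * scheme ("://") as a local filesystem path.
     */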
    protected UURI getUURI(String url) throws URIException {
        url = (url.indexOf("://") > 0) ? url : "file://" + url;
        return UURIFactory.getInstance(url);
    }

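    /**
     * Convenience overload that runs the extractor with the platform
     * default encoding.
     */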
    protected void runExtractor(UURI baseUURI)
            throws InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException, IOException {
        runExtractor(baseUURI, null);
    }

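    /**
     * Fetch the passed URI, run the extractor over the recorded content,
     * and print the extracted links to stdout grouped by hop type
     * (navigation links, embeds, speculative embeds, everything else),
     * followed by a total count.
     */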
    protected void runExtractor(UURI baseUURI, String encoding)
            throws IOException, InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException,
            ReflectionException {
        if (baseUURI == null) {
            return;
        }
        this.extractor = createExtractor();
        URL url = new URL(baseUURI.toString());
        this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
                getTmpDir(), this.getClass().getName(), url
                        .openStream(), encoding);
        CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
        this.extractor.innerProcess(curi);

        System.out.println("+" + this.extractor.report());
        int count = 0;
        Collection links = curi.getOutLinks();
        System.out.println("+HTML Links (hopType=" + Link.NAVLINK_HOP
                + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.NAVLINK_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Embeds (hopType=" + Link.EMBED_HOP
                + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.EMBED_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Speculative Embeds (hopType="
                + Link.SPECULATIVE_HOP + "):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() == Link.SPECULATIVE_HOP) {
                    count++;
                    System.out.println(link.getDestination());
                }
            }
        }
        System.out.println("+HTML Other (all other hopTypes):");
        if (links != null) {
            for (Iterator i = links.iterator(); i.hasNext();) {
                Link link = (Link) i.next();
                if (link.getHopType() != Link.SPECULATIVE_HOP
                        && link.getHopType() != Link.NAVLINK_HOP
                        && link.getHopType() != Link.EMBED_HOP) {
                    count++;
                    System.out.println(link.getHopType() + " "
                            + link.getDestination());
                }
            }
        }
        System.out.println("TOTAL URIS EXTRACTED: " + count);
    }

    /**
     * Test a particular &lt;embed src=...&gt; construct that was suspicious
     * in the No10GovUk crawl.
     *
     * @throws URIException
     */
    public void testEmbedSrc() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.org"));
        // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
        CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/"
                + "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" "
                + "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/"
                + "quicktime/download/\" /> ";
        this.extractor.extract(curi, cs);
        assertTrue(CollectionUtils.exists(curi.getOutLinks(),
                new Predicate() {
                    public boolean evaluate(Object object) {
                        return ((Link) object)
                                .getDestination()
                                .toString()
                                .indexOf(
                                        "/documents/prem/18/1/graphics/qtvr/hall.mov") >= 0;
                    }
                }));
    }

    /**
     * Test a whitespace issue found in href.
     *
     * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
     * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
     *
     * @throws URIException
     */
    public void testHrefWhitespace() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.carsound.dk"));
        CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n"
                + "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
        this.extractor.extract(curi, cs);
        assertTrue("Not stripping new lines", CollectionUtils.exists(
                curi.getOutLinks(), new Predicate() {
                    public boolean evaluate(Object object) {
                        return ((Link) object).getDestination()
                                .toString().indexOf(
                                        "http://www.carsound.dk/") >= 0;
                    }
                }));
    }

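    /**
     * Command-line entry point: run the extractor over a single URL or
     * local file and dump the extracted links, e.g. (hypothetical path):
     * <pre>
     * java org.archive.crawler.extractor.ExtractorHTMLTest /tmp/page.html UTF-8
     * </pre>
     */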
    public static void main(String[] args) throws Exception {
        if (args.length != 1 && args.length != 2) {
            System.err.println("Usage: "
                    + ExtractorHTMLTest.class.getName()
                    + " URL|PATH [ENCODING]");
            System.exit(1);
        }
        ExtractorHTMLTest testCase = new ExtractorHTMLTest();
        testCase.setUp();
        try {
            testCase.runExtractor(testCase.getUURI(args[0]),
                    (args.length == 2) ? args[1] : null);
        } finally {
            testCase.tearDown();
        }
    }
}
|