001: /*
002: * Copyright (c) 2005 Canoo Engineering. All Rights Reserved.
003: */
004: package com.canoo.webtest.extension.spider;
005:
006: import java.io.File;
007: import java.io.FileWriter;
008: import java.io.IOException;
009: import java.io.OutputStreamWriter;
010: import java.io.Writer;
011: import java.util.HashMap;
012: import java.util.Iterator;
013: import java.util.Map;
014: import java.util.Properties;
015:
016: import org.apache.commons.io.IOUtils;
017: import org.apache.log4j.Logger;
018:
019: import com.canoo.webtest.engine.Context;
020: import com.canoo.webtest.engine.StepFailedException;
021: import com.canoo.webtest.steps.StepUtil;
022: import com.gargoylesoftware.htmlunit.Page;
023: import com.gargoylesoftware.htmlunit.WebResponse;
024: import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
025: import com.gargoylesoftware.htmlunit.html.HtmlPage;
026:
027: /**
028: * @author Denis N. Antonioli
029: */
030: public class Spider {
031: private static final Logger LOG = Logger.getLogger(Spider.class);
032: public static final IVisitorStrategy ALWAYS_ACCEPT_VISITOR_STRATEGY = new AlwaysAcceptVisitorStrategy();
033: public static final IReporter NO_OP_REPORTER = new NoOpReporter();
034: public static final IValidator NO_OP_VALIDATOR = new NoOpValidator();
035:
036: private final Map fVisitedLinks = new HashMap();
037:
038: private IReporter fReporter;
039: private IVisitorStrategy fVisitorStrategy;
040: private IValidator fValidator;
041:
042: private String fFileName;
043: private int fDepth;
044: private boolean fFailOnError;
045: private Context fContext;
046:
047: public void setFailOnError(boolean failOnError) {
048: fFailOnError = failOnError;
049: }
050:
051: public void setFileName(final String filename) {
052: fFileName = filename;
053: }
054:
055: public String getFileName() {
056: return fFileName;
057: }
058:
059: public void setDepth(final int depth) {
060: fDepth = depth;
061: }
062:
063: public void setReporter(final IReporter reporter) {
064: fReporter = reporter;
065: }
066:
067: public IReporter getReporter() {
068: return fReporter;
069: }
070:
071: public void setVisitorStrategy(
072: final IVisitorStrategy visitorStrategy) {
073: fVisitorStrategy = visitorStrategy;
074: }
075:
076: public IVisitorStrategy getVisitorStrategy() {
077: return fVisitorStrategy;
078: }
079:
080: public void setValidator(final IValidator validator) {
081: fValidator = validator;
082: }
083:
084: public IValidator getValidator() {
085: return fValidator;
086: }
087:
088: Writer getWriter() throws IOException {
089: final Writer writer;
090: if (fFileName != null) {
091: final File file = new File(fContext.getConfig()
092: .getWebTestResultDir(), fFileName);
093: LOG.info("Writing in " + file);
094: writer = new FileWriter(file);
095: } else {
096: LOG.info("Writing in standard output");
097: writer = new OutputStreamWriter(System.out);
098: }
099: return writer;
100: }
101:
102: void setContext(Context context) {
103: fContext = context;
104: }
105:
106: public void execute(final Context context) {
107: validate();
108: fVisitedLinks.clear();
109: setContext(context);
110: doExecute();
111: }
112:
113: boolean doExecute() {
114: Writer writer = null;
115: boolean success = false;
116: try {
117: writer = getWriter();
118: fReporter.setWriter(writer);
119: fReporter.writeHeader();
120: visit((HtmlPage) fContext.getCurrentResponse(), fDepth);
121: fReporter.writeFooter();
122: success = true;
123: } catch (final Throwable e) {
124: LOG.error("Problems during write: " + e.getMessage(), e);
125: } finally {
126: // doing this to Stdout will cause interuption
127: IOUtils.closeQuietly(writer);
128: }
129: return success;
130: }
131:
132: void validate() {
133: if (fDepth < 0) {
134: throw new IllegalArgumentException("depth must be >= 0");
135: }
136: if (fFileName == null) {
137: LOG.info("No file name defined, will output to console");
138: }
139: if (fReporter == null) {
140: LOG.info("No reporter defined, using noop reporter");
141: fReporter = NO_OP_REPORTER;
142: }
143: if (fValidator == null) {
144: LOG.info("No validator defined, using noop validator");
145: fValidator = NO_OP_VALIDATOR;
146: }
147: if (fVisitorStrategy == null) {
148: LOG.info("No visitor strategy set, using noop strategy");
149: fVisitorStrategy = ALWAYS_ACCEPT_VISITOR_STRATEGY;
150: }
151: }
152:
153: void visit(final HtmlPage currentResponse, final int depth)
154: throws IOException {
155: LOG.debug("report depth " + depth);
156: for (final Iterator iter = currentResponse.getAnchors()
157: .iterator(); iter.hasNext();) {
158: final HtmlAnchor link = (HtmlAnchor) iter.next();
159:
160: final Properties linkInfo = fValidator.validate(fDepth
161: - depth, currentResponse, link);
162: fReporter.write(linkInfo);
163: if (depth > 0 && needsReport(link)) {
164: processLink(link, depth);
165: }
166: }
167: }
168:
169: void processLink(final HtmlAnchor link, final int depth)
170: throws IOException {
171: try {
172: follow(link);
173: final Page page = fContext.getCurrentResponse();
174: if (page instanceof HtmlPage) {
175: visit((HtmlPage) page, depth - 1);
176: } else {
177: final WebResponse response = page.getWebResponse();
178: LOG.info("Don't going deeper in response for "
179: + response.getUrl()
180: + " as it isn't an html page (content type: "
181: + response.getContentType() + ", page" + page
182: + ")");
183: }
184: } catch (final StepFailedException e) {
185: LOG.error(e.getMessage(), e);
186: if (fFailOnError) {
187: throw e;
188: }
189: }
190: }
191:
192: void follow(final HtmlAnchor link) {
193: LOG.debug("Clicking on link with href: "
194: + link.getHrefAttribute());
195: try {
196: link.click();
197: } catch (final Exception ex) {
198: StepUtil.handleException(ex);
199: }
200: }
201:
202: boolean needsReport(final HtmlAnchor link) {
203: if (fVisitedLinks.containsKey(link.getHrefAttribute())) {
204: LOG.info(link.getHrefAttribute()
205: + " skipped: already visited");
206: return false;
207: }
208: if (!fVisitorStrategy.accept(link)) {
209: LOG.info(link.getHrefAttribute()
210: + " skipped: rejected by visitor");
211: return false;
212: }
213: fVisitedLinks.put(link.getHrefAttribute(), Boolean.TRUE);
214: return true;
215: }
216:
217: private static class AlwaysAcceptVisitorStrategy implements
218: IVisitorStrategy {
219: public boolean accept(HtmlAnchor link) {
220: return true;
221: }
222: }
223:
224: private static class NoOpReporter implements IReporter {
225: public void writeHeader() {
226: }
227:
228: public void write(Properties linkInfo) {
229: }
230:
231: public void setWriter(Writer writer) {
232: }
233:
234: public void writeFooter() {
235: }
236: }
237:
238: private static class NoOpValidator implements IValidator {
239: private static final Properties EMPTY_PROPERTIES = new Properties();
240:
241: public Properties validate(final int depth,
242: final HtmlPage webResponse, final HtmlAnchor link) {
243: return EMPTY_PROPERTIES;
244: }
245: }
246: }
|