/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.components.crawler;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * A simple cocoon crawler.
 *
 * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
 * @version CVS $Id: SimpleCocoonCrawlerImpl.java 433543 2006-08-22 06:22:54Z crossley $
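 *
 * <p>
 * A minimal usage sketch. It assumes the crawler is obtained from an Avalon
 * <code>ComponentManager</code> named <code>manager</code>; the exact lookup
 * role and release handling depend on your deployment, and the URL is only an
 * example:
 * </p>
 * <pre><tt>
 * CocoonCrawler crawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE);
 * try {
 *     crawler.crawl(new URL("http://localhost:8888/"));
 *     for (Iterator i = crawler.iterator(); i.hasNext();) {
 *         URL crawledURL = (URL) i.next();
 *         if (crawledURL == null) {
 *             continue; // next() may return null, see CocoonCrawlerIterator
 *         }
 *         // process crawledURL ...
 *     }
 * } finally {
 *     manager.release(crawler);
 * }
 * </tt></pre>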
 */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
    implements CocoonCrawler, Configurable, Disposable, Recyclable {

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     * Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of <code>link-content-type</code> configuration option.
     * <p>
     * Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;

    /**
     * Config element name specifying the query-string appended when requesting
     * the links of a URL.
     * <p>
     * Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of <code>link-view-query</code> configuration option.
     * <p>
     * Its value is <code>cocoon-view=links</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying the excluding regular expression pattern.
     * <p>
     * Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying the including regular expression pattern.
     * <p>
     * Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP User-Agent header value.
     * <p>
     * Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of <code>user-agent</code> configuration option.
     * @see Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying the HTTP Accept header value.
     * <p>
     * Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of <code>accept</code> configuration option.
     * <p>
     * Its value is <code>*&#47;*</code>.
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;
    private HashSet crawled;

    protected int depth;
    protected HashSet urlsToProcess;
    protected HashSet urlsNextDepth;

    /**
     * Constructor for the SimpleCocoonCrawlerImpl object
     */
    public SimpleCocoonCrawlerImpl() {
        // by default include everything
        includeCrawlingURL = null;
        // the default excludes are installed in configure() if none are given
        excludeCrawlingURL = null;
    }

    /**
     * Configure the crawler component.
     * <p>
     * The configuration can specify which URIs to include in, and which URIs
     * to exclude from, crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     * Moreover you can configure
     * the content-type expected from the link view request, and the
     * query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
     * </tt></pre>
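     * <p>
     * A fuller sample configuration, shown only as a sketch: the element names
     * come from this class's configuration constants, the enclosing component
     * declaration depends on your cocoon.xconf, and the user-agent and accept
     * values are merely illustrative:
     * </p>
     * <pre><tt>
     * &lt;exclude&gt;.*\.gif(\?.*)?$, .*\.jpe?g(\?.*)?$&lt;/exclude&gt;
     * &lt;link-content-type&gt;application/x-cocoon-links&lt;/link-content-type&gt;
     * &lt;link-view-query&gt;cocoon-view=links&lt;/link-view-query&gt;
     * &lt;user-agent&gt;Apache Cocoon&lt;/user-agent&gt;
     * &lt;accept&gt;text/xml&lt;/accept&gt;
     * </tt></pre>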
     *
     * @param configuration XML configuration of this Avalon component.
     * @exception ConfigurationException thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
    throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create including regular-expression for "
                                      + pattern, rese);
                }
            }
        } else {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Include all URLs");
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create excluding regular-expression for "
                                      + pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }

        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }

        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }

    /**
     * Dispose at end of life cycle, releasing all resources.
     */
    public void dispose() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        excludeCrawlingURL = null;
        includeCrawlingURL = null;
    }

    /**
     * Recycle this object, releasing resources.
     */
    public void recycle() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        depth = -1;
    }

    /**
     * The same as calling <code>crawl(url, -1)</code>.
     *
     * @param url Crawl this URL, getting all links from this URL.
     */
    public void crawl(URL url) {
        crawl(url, -1);
    }

    /**
     * Start crawling a URL.
     *
     * <p>
     * Use this method to start crawling.
     * Get this URL, and all its children, by using <code>iterator()</code>.
     * The Iterator object will return URL objects.
     * </p>
     * <p>
     * You may use the crawl() and iterator() methods the following way:
     * </p>
     * <pre><tt>
     * SimpleCocoonCrawlerImpl scci = ....;
     * scci.crawl(new URL("http://foo/bar"));
     * Iterator i = scci.iterator();
     * while (i.hasNext()) {
     *     URL url = (URL) i.next();
     *     ...
     * }
     * </tt></pre>
     * <p>
     * The i.next() method returns a URL, and calculates the links of the
     * URL before returning it.
     * </p>
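     * <p>
     * Note that, as implemented in {@link CocoonCrawlerIterator#next()},
     * <code>next()</code> may return <code>null</code> for a URL whose links
     * cannot be retrieved, or once the depth limit is reached, even though
     * <code>hasNext()</code> returned <code>true</code>. A defensive loop
     * (a suggested pattern, not required by the API) therefore looks like:
     * </p>
     * <pre><tt>
     * while (i.hasNext()) {
     *     URL url = (URL) i.next();
     *     if (url == null) {
     *         continue;
     *     }
     *     ...
     * }
     * </tt></pre>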
     *
     * @param url Crawl this URL, getting all links from this URL.
     * @param maxDepth maximum depth to crawl to. -1 for no maximum.
     */
    public void crawl(URL url, int maxDepth) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();
        urlsNextDepth = new HashSet();
        depth = maxDepth;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
        }

        urlsToProcess.add(url);
    }

    /**
     * Return an iterator over all links of the currently crawled URL.
     * <p>
     * The Iterator object will return URL objects from its <code>next()</code>
     * method.
     * </p>
     *
     * @return Iterator iterator of all links from the crawl URL.
     */
    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    /**
     * Default exclude patterns.
     * <p>
     * By default, URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
     * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
     * <li>.*\\.js(\\?.*)?$ - exclude javascript</li>
     * <li>.*\\.css(\\?.*)?$ - exclude cascading stylesheets</li>
     * </ul>
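     * <p>
     * As an illustration (the URLs below are hypothetical examples), the
     * patterns are anchored at the end of the URL, optionally after a query
     * string:
     * </p>
     * <pre><tt>
     * new RE(".*\\.gif(\\?.*)?$").match("http://localhost:8888/images/logo.gif?v=2")  // true, excluded
     * new RE(".*\\.gif(\\?.*)?$").match("http://localhost:8888/logo.gif.html")        // false, not excluded
     * </tt></pre>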
     *
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for "
                                  + pattern, rese);
            }
        }
    }

    /**
     * Compute the list of links from a URL.
     * <p>
     * Check for include and exclude patterns, content-type, and whether the URL
     * has been crawled already.
     * </p>
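     * <p>
     * For illustration (hypothetical URL, default configuration): for
     * <code>http://localhost:8888/index.html</code> the crawler requests
     * <code>http://localhost:8888/index.html?cocoon-view=links</code>, sending the
     * configured <code>Accept</code> and <code>User-Agent</code> headers; if the
     * URL already contains a query string, the link view query is appended with
     * <code>&amp;</code> instead of <code>?</code>.
     * </p>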
     *
     * @param url Crawl this URL
     * @return List of URLs which are links from url, satisfying the conditions
     *         above, or <code>null</code>.
     */
    private List getLinks(URL url) {
        ArrayList url_links = null;
        String sURL = url.toString();

        if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
            return null;
        }

        // don't try to get links for url which has been crawled already
        if (crawled.contains(sURL)) {
            return null;
        }

        // mark it as crawled
        crawled.add(sURL);

        // get links of url
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Getting links of URL " + sURL);
        }
        BufferedReader br = null;
        try {
            sURL = url.getFile();
            URL links = new URL(url, sURL
                                + ((sURL.indexOf("?") == -1) ? "?" : "&")
                                + linkViewQuery);
            URLConnection links_url_connection = links.openConnection();
            links_url_connection.setRequestProperty("Accept", accept);
            links_url_connection.setRequestProperty("User-Agent", userAgent);
            links_url_connection.connect();
            InputStream is = links_url_connection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            String contentType = links_url_connection.getContentType();
            if (contentType == null) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring " + sURL + " (no content type)");
                }
                // there is a check on null in the calling method
                return null;
            }

            int index = contentType.indexOf(';');
            if (index != -1) {
                contentType = contentType.substring(0, index);
            }

            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Content-type: " + contentType);
            }

            if (contentType.equals(linkContentType)) {
                url_links = new ArrayList();

                // content is supposed to be a list of links,
                // relative to current URL
                String line;
                while ((line = br.readLine()) != null) {
                    final URL newUrl = new URL(url, line);
                    final String sNewUrl = newUrl.toString();

                    boolean add_url = true;
                    // don't add new_url twice
                    if (add_url) {
                        add_url &= !url_links.contains(newUrl);
                    }

                    // don't add new_url if it has been crawled already
                    if (add_url) {
                        add_url &= !crawled.contains(sNewUrl);
                    }

                    // don't add if it is not matched by an existing include definition
                    if (add_url) {
                        add_url &= isIncludedURL(sNewUrl);
                    }

                    // don't add if it is matched by an existing exclude definition
                    if (add_url) {
                        add_url &= !isExcludedURL(sNewUrl);
                    }
                    if (add_url) {
                        if (getLogger().isDebugEnabled()) {
                            getLogger().debug("Add URL: " + sNewUrl);
                        }
                        url_links.add(newUrl);
                    }
                }
                // now we have a list of URLs which should be examined
            }
        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url, ioe);
        } finally {
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return url_links;
    }

    /**
     * Check if a URL is excluded from crawling, i.e. if it matches one of the
     * exclude patterns.
     *
     * @param url the URL to check
     * @return true if the URL is excluded from crawling, false otherwise
     */
    private boolean isExcludedURL(String url) {
        // by default do not exclude URL for crawling
        if (excludeCrawlingURL == null) {
            return false;
        }

        final String s = url;
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Excluded URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not excluded URL " + url);
        }
        return false;
    }

    /**
     * Check if a URL is included for crawling, i.e. if it matches one of the
     * include patterns, or if no include patterns are configured.
     *
     * @param url the URL to check
     * @return true if the URL is included for crawling, false otherwise
     */
    private boolean isIncludedURL(String url) {
        // by default include URL for crawling
        if (includeCrawlingURL == null) {
            return true;
        }

        final String s = url;
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Included URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not included URL " + url);
        }
        return false;
    }

    /**
     * Helper class implementing an Iterator.
     * <p>
     * This Iterator implementation calculates the links of a URL
     * before returning it from the next() method.
     * </p>
     *
     * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
     * @version $Id: SimpleCocoonCrawlerImpl.java 433543 2006-08-22 06:22:54Z crossley $
     */
    public static class CocoonCrawlerIterator implements Iterator {
        private SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * Constructor for the CocoonCrawlerIterator object
         *
         * @param cocoonCrawler the containing CocoonCrawler instance.
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        /**
         * Check if crawling is finished.
         *
         * @return <code>true</code> if crawling has finished,
         *         else <code>false</code>.
         */
        public boolean hasNext() {
            return cocoonCrawler.urlsToProcess.size() > 0
                   || cocoonCrawler.urlsNextDepth.size() > 0;
        }

        /**
         * @return the next URL, or <code>null</code> if no links could be
         *         retrieved for any of the URLs remaining at the current depth.
         */
        public Object next() {
            if (cocoonCrawler.urlsToProcess.size() == 0
                    && cocoonCrawler.urlsNextDepth.size() > 0) {
                // process queued urls belonging to the next depth level
                cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
                cocoonCrawler.urlsNextDepth = new HashSet();
                // fix Bugzilla Bug 25270
                // only decrease if depth > 0, excluding decreasing
                // if depth is already equal to -1
                if (cocoonCrawler.depth > 0) {
                    cocoonCrawler.depth--;
                }
            }
            URL theNextUrl = null;
            // fix Bugzilla Bug 25270
            // return theNextUrl != null only if getLinks() returns a non-null list
            for (Iterator i = cocoonCrawler.urlsToProcess.iterator();
                    i.hasNext() && theNextUrl == null;) {
                // fetch a URL
                URL url = (URL) i.next();

                // remove it from the to-do list
                i.remove();

                if (cocoonCrawler.depth == -1
                        || cocoonCrawler.depth > 0) {
                    // calc all links from this url
                    List url_links = cocoonCrawler.getLinks(url);
                    if (url_links != null) {
                        // add links of this url to the to-do list
                        cocoonCrawler.urlsNextDepth.addAll(url_links);
                        theNextUrl = url;
                    }
                }
            }
            // finally return url
            return theNextUrl;
        }

        /**
         * remove is not implemented
         */
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}
|