001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.generation;
018:
019: import org.apache.avalon.framework.parameters.Parameters;
020: import org.apache.avalon.framework.configuration.Configurable;
021: import org.apache.avalon.framework.configuration.Configuration;
022: import org.apache.avalon.framework.configuration.ConfigurationException;
023: import org.apache.cocoon.ProcessingException;
024: import org.apache.cocoon.ResourceNotFoundException;
025: import org.apache.cocoon.environment.SourceResolver;
026: import org.apache.cocoon.Constants;
027: import org.apache.commons.lang.StringUtils;
028: import org.apache.regexp.RE;
029: import org.apache.regexp.RESyntaxException;
030:
031: import org.xml.sax.SAXException;
032: import org.xml.sax.helpers.AttributesImpl;
033:
034: import java.io.IOException;
035: import java.io.InputStream;
036: import java.io.BufferedReader;
037: import java.io.InputStreamReader;
038: import java.net.URLConnection;
039: import java.net.HttpURLConnection;
040: import java.net.URL;
041: import java.util.Map;
042: import java.util.HashSet;
043: import java.util.Iterator;
044: import java.util.List;
045: import java.util.ArrayList;
046:
047: /**
048: * @cocoon.sitemap.component.documentation
049: * Generates a list of links that are reachable from the src and their status.
050: *
051: * @cocoon.sitemap.component.name linkstatus
052: * @cocoon.sitemap.component.label content
053: * @cocoon.sitemap.component.logger sitemap.generator.linkstatus
054: *
055: * @author Michael Homeijer
056: * @author Nicola Ken Barozzi (nicolaken@apache.org)
057: * @author Bernhard Huber (huber@apache.org)
058: * @version $Id: LinkStatusGenerator.java 433543 2006-08-22 06:22:54Z crossley $
059: */
060: public class LinkStatusGenerator extends ServiceableGenerator implements
061: Configurable {
062:
063: /** The URI of the namespace of this generator. */
064: protected static final String URI = "http://apache.org/cocoon/linkstatus/2.0";
065:
066: /** The namespace prefix for this namespace. */
067: protected static final String PREFIX = "linkstatus";
068:
069: /* Node and attribute names */
070: protected static final String TOP_NODE_NAME = "linkstatus";
071: protected static final String LINK_NODE_NAME = "link";
072:
073: protected static final String HREF_ATTR_NAME = "href";
074: protected static final String REFERRER_ATTR_NAME = "referrer";
075: protected static final String CONTENT_ATTR_NAME = "content";
076: protected static final String STATUS_ATTR_NAME = "status";
077: protected static final String MESSAGE_ATTR_NAME = "message";
078:
079: protected AttributesImpl attributes;
080:
081: /**
082: * Config element name specifying expected link content-typ.
083: * <p>
084: * Its value is <code>link-content-type</code>.
085: * </p>
086: *
087: * @since
088: */
089: public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
090:
091: /**
092: * Default value of <code>link-content-type</code> configuration value.
093: * <p>
094: * Its value is <code>application/x-cocoon-links</code>.
095: * </p>
096: *
097: * @since
098: */
099: public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
100:
101: /**
102: * Config element name specifying query-string appendend for requesting links
103: * of an URL.
104: * <p>
105: * Its value is <code>link-view-query</code>.
106: * </p>
107: *
108: * @since
109: */
110: public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
111: /**
112: * Default value of <code>link-view-query</code> configuration value.
113: * <p>
114: * Its value is <code>?cocoon-view=links</code>.
115: * </p>
116: *
117: * @since
118: */
119: public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
120:
121: /**
122: * Config element name specifying excluding regular expression pattern.
123: * <p>
124: * Its value is <code>exclude</code>.
125: * </p>
126: *
127: * @since
128: */
129: public final static String EXCLUDE_CONFIG = "exclude";
130:
131: /**
132: * Config element name specifying including regular expression pattern.
133: * <p>
134: * Its value is <code>include</code>.
135: * </p>
136: *
137: * @since
138: */
139: public final static String INCLUDE_CONFIG = "include";
140:
141: /**
142: * Config element name specifying http header value for user-Agent.
143: * <p>
144: * Its value is <code>user-agent</code>.
145: * </p>
146: *
147: * @since
148: */
149: public final static String USER_AGENT_CONFIG = "user-agent";
150: /**
151: * Default value of <code>user-agent</code> configuration value.
152: *
153: * @see org.apache.cocoon.Constants#COMPLETE_NAME
154: * @since
155: */
156: public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
157:
158: /**
159: * Config element name specifying http header value for accept.
160: * <p>
161: * Its value is <code>accept</code>.
162: * </p>
163: *
164: * @since
165: */
166: public final static String ACCEPT_CONFIG = "accept";
167: /**
168: * Default value of <code>accept</code> configuration value.
169: * <p>
170: * Its value is <code>* / *</code>
171: * </p>
172: *
173: * @since
174: */
175: public final static String ACCEPT_DEFAULT = "*/*";
176:
177: private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
178: private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
179: private HashSet excludeCrawlingURL;
180: private HashSet includeCrawlingURL;
181: // FIXME - The following two are never read, can we delete them?
182: //private String userAgent = USER_AGENT_DEFAULT;
183: //private String accept = ACCEPT_DEFAULT;
184:
185: private HashSet crawled;
186: private HashSet linksToProcess;
187:
188: /**
189: * Stores links to process and the referrer links
190: */
191: private static class Link {
192: private URL url;
193: private String referrer;
194:
195: public Link(URL url, String referrer) {
196: this .url = url;
197: this .referrer = referrer;
198: }
199:
200: public URL getURL() {
201: return url;
202: }
203:
204: public String getReferrer() {
205: return referrer;
206: }
207:
208: public boolean equals(Link l) {
209: return url.equals(l.getURL());
210: }
211: }
212:
213: /**
214: * Configure the crawler component.
215: * <p>
216: * Configure can specify which URI to include, and which URI to exclude
217: * from crawling. You specify the patterns as regular expressions.
218: * </p>
219: * <p>
220: * Morover you can configure
221: * the required content-type of crawling request, and the
222: * query-string appended to each crawling request.
223: * </p>
224: * <pre><tt>
225: * <include>.*\.html?</include> or <include>.*\.html?, .*\.xsp</include>
226: * <exclude>.*\.gif</exclude> or <exclude>.*\.gif, .*\.jpe?g</exclude>
227: * <link-content-type> application/x-cocoon-links </link-content-type>
228: * <link-view-query> ?cocoon-view=links </link-view-query>
229: * <user-agent> Cocoon </user-agent>
230: * <accept> text/xml </accept>
231: * </tt></pre>
232: *
233: * @param configuration XML configuration of this avalon component.
234: * @exception ConfigurationException is throwing if configuration is invalid.
235: * @since
236: */
237: public void configure(Configuration configuration)
238: throws ConfigurationException {
239:
240: Configuration[] children;
241: children = configuration.getChildren(INCLUDE_CONFIG);
242: if (children.length > 0) {
243: includeCrawlingURL = new HashSet();
244: for (int i = 0; i < children.length; i++) {
245: String pattern = children[i].getValue();
246: try {
247: String params[] = StringUtils.split(pattern, ", ");
248: for (int index = 0; index < params.length; index++) {
249: String tokenized_pattern = params[index];
250: this .includeCrawlingURL.add(new RE(
251: tokenized_pattern));
252: }
253: } catch (RESyntaxException rese) {
254: getLogger().error(
255: "Cannot create including regular-expression for "
256: + pattern, rese);
257: }
258: }
259: }
260:
261: children = configuration.getChildren(EXCLUDE_CONFIG);
262: if (children.length > 0) {
263: excludeCrawlingURL = new HashSet();
264: for (int i = 0; i < children.length; i++) {
265: String pattern = children[i].getValue();
266: try {
267: String params[] = StringUtils.split(pattern, ", ");
268: for (int index = 0; index < params.length; index++) {
269: String tokenized_pattern = params[index];
270: this .excludeCrawlingURL.add(new RE(
271: tokenized_pattern));
272: }
273: } catch (RESyntaxException rese) {
274: getLogger().error(
275: "Cannot create excluding regular-expression for "
276: + pattern, rese);
277: }
278: }
279: } else {
280: excludeCrawlingURL = new HashSet();
281: setDefaultExcludeFromCrawling();
282: }
283:
284: Configuration child;
285: String value;
286: child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
287: if (child != null) {
288: value = child.getValue();
289: if (value != null && value.length() > 0) {
290: this .linkContentType = value.trim();
291: }
292: }
293: child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
294: if (child != null) {
295: value = child.getValue();
296: if (value != null && value.length() > 0) {
297: this .linkViewQuery = value.trim();
298: }
299: }
300: /* FIXME: Also delete this if you delete the fields above.
301: child = configuration.getChild(USER_AGENT_CONFIG, false);
302: if (child != null) {
303: value = child.getValue();
304: if (value != null && value.length() > 0) {
305: this.userAgent = value;
306: }
307: }
308:
309: child = configuration.getChild(ACCEPT_CONFIG, false);
310: if (child != null) {
311: value = child.getValue();
312: if (value != null && value.length() > 0) {
313: this.accept = value;
314: }
315: }
316: */
317: }
318:
319: public void setup(SourceResolver resolver, Map objectModel,
320: String src, Parameters par) throws ProcessingException,
321: SAXException, IOException {
322:
323: super .setup(resolver, objectModel, src, par);
324:
325: /* Create a reusable attributes for creating nodes */
326: this .attributes = new AttributesImpl();
327:
328: // already done in configure...
329: //excludeCrawlingURL = new HashSet();
330: //this.setDefaultExcludeFromCrawling();
331: }
332:
333: /**
334: * Generate XML data.
335: *
336: * @throws SAXException
337: * if an error occurs while outputting the document
338: * @throws ProcessingException
339: * if the requsted URI wasn't found
340: */
341: public void generate() throws SAXException, ProcessingException {
342: try {
343:
344: crawled = new HashSet();
345: linksToProcess = new HashSet();
346:
347: URL root = new URL(source);
348: linksToProcess.add(new Link(root, ""));
349:
350: if (getLogger().isDebugEnabled()) {
351: getLogger().debug("crawl URL " + root);
352: }
353:
354: this .contentHandler.startDocument();
355: this .contentHandler.startPrefixMapping(PREFIX, URI);
356:
357: attributes.clear();
358: super .contentHandler.startElement(URI, TOP_NODE_NAME,
359: PREFIX + ':' + TOP_NODE_NAME, attributes);
360:
361: while (linksToProcess.size() > 0) {
362: Iterator i = linksToProcess.iterator();
363:
364: if (i.hasNext()) {
365: // fetch a URL
366: Link link = (Link) i.next();
367: URL url = link.getURL();
368:
369: // remove it from the to-do list
370: linksToProcess.remove(link);
371:
372: String new_url_link = processURL(url, link
373: .getReferrer());
374:
375: // calc all links from this url
376: if (new_url_link != null) {
377:
378: List url_links = getLinksFromConnection(
379: new_url_link, url);
380: if (url_links != null) {
381: // add links of this url to the to-do list
382: linksToProcess.addAll(url_links);
383: }
384: }
385: }
386: }
387:
388: super .contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX
389: + ':' + TOP_NODE_NAME);
390: this .contentHandler.endPrefixMapping(PREFIX);
391: this .contentHandler.endDocument();
392: } catch (IOException ioe) {
393: getLogger().warn("Could not read source ", ioe);
394: throw new ResourceNotFoundException(
395: "Could not read source ", ioe);
396: }
397: }
398:
399: /**
400: * Default exclude patterns.
401: * <p>
402: * By default URLs matching following patterns are excluded:
403: * </p>
404: * <ul>
405: * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
406: * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
407: * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
408: * <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
409: * <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
410: * </ul>
411: *
412: * @since
413: */
414: private void setDefaultExcludeFromCrawling() {
415: String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { ".*\\.gif(\\?.*)?$",
416: ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$",
417: ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$" };
418:
419: for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
420: String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
421: try {
422: excludeCrawlingURL.add(new RE(pattern));
423: } catch (RESyntaxException rese) {
424: getLogger().error(
425: "Cannot create excluding regular-expression for "
426: + pattern, rese);
427: }
428: }
429: }
430:
431: /**
432: * Retrieve a list of links of a url
433: *
434: * @param url_link_string url for requesting links, it is assumed that
435: * url_link_string queries the cocoon view links, ie of the form
436: * <code>http://host/foo/bar?cocoon-view=links</code>
437: * @param url_of_referrer base url of which links are requested, ie of the form
438: * <code>http://host/foo/bar</code>
439: * @return List of links from url_of_referrer, as result of requesting url
440: * url_link_string
441: */
442: protected List getLinksFromConnection(String url_link_string,
443: URL url_of_referrer) {
444: List url_links = null;
445: BufferedReader br = null;
446: try {
447: URL url_link = new URL(url_link_string);
448: URLConnection conn = url_link.openConnection();
449: String content_type = conn.getContentType();
450:
451: if (content_type == null) {
452: getLogger().warn(
453: "No content type available for "
454: + String.valueOf(url_link_string));
455: // caller checks if null
456: return url_links;
457: }
458:
459: if (getLogger().isDebugEnabled()) {
460: getLogger().debug("Content-type: " + content_type);
461: }
462:
463: if (content_type.equals(linkContentType)
464: || content_type.startsWith(linkContentType + ";")) {
465: url_links = new ArrayList();
466:
467: InputStream is = conn.getInputStream();
468: br = new BufferedReader(new InputStreamReader(is));
469:
470: // content is supposed to be a list of links,
471: // relative to current URL
472: String line;
473: String referrer = url_of_referrer.toString();
474:
475: while ((line = br.readLine()) != null) {
476: URL new_url = new URL(url_link, line);
477: boolean add_url = true;
478: // don't add new_url twice
479: if (add_url) {
480: add_url &= !url_links.contains(new_url);
481: }
482:
483: // don't add new_url if it has been crawled already
484: if (add_url) {
485: add_url &= !crawled
486: .contains(new_url.toString());
487: }
488:
489: Link new_link = new Link(new_url, referrer);
490: if (add_url) {
491: add_url &= !linksToProcess.contains(new_link);
492: }
493:
494: // don't add if is not matched by existing include definition
495: if (add_url) {
496: add_url &= isIncludedURL(new_url.toString());
497: }
498:
499: if (add_url) {
500: if (getLogger().isDebugEnabled()) {
501: getLogger().debug(
502: "Add URL: " + new_url.toString());
503: }
504: url_links.add(new_link);
505: }
506: }
507: // now we have a list of URL which should be examined
508: }
509: } catch (IOException ioe) {
510: getLogger().warn(
511: "Problems get links of " + url_link_string, ioe);
512: } finally {
513: // explictly close the stream
514: if (br != null) {
515: try {
516: br.close();
517: br = null;
518: } catch (IOException ignored) {
519: }
520: }
521: }
522: return url_links;
523: }
524:
525: /**
526: * Generate xml attributes of a url, calculate url for retrieving links
527: *
528: * @param url to process
529: * @param referrer of the url
530: * @return String url for retrieving links, or null if url is an excluded-url,
531: * and not an included-url.
532: */
533: protected String processURL(URL url, String referrer)
534: throws SAXException {
535:
536: if (getLogger().isDebugEnabled()) {
537: getLogger().debug("getLinks URL " + url);
538: }
539:
540: String result = null;
541:
542: // don't try to investigate a url which has been crawled already
543: if (crawled.contains(url.toString())) {
544: return null;
545: }
546:
547: // mark it as crawled
548: crawled.add(url.toString());
549:
550: attributes.clear();
551: attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME,
552: "CDATA", url.toString());
553: attributes.addAttribute("", REFERRER_ATTR_NAME,
554: REFERRER_ATTR_NAME, "CDATA", referrer);
555:
556: // Output url, referrer, content-type, status, message for traversable url's
557: HttpURLConnection h = null;
558: try {
559:
560: URLConnection links_url_connection = url.openConnection();
561: h = (HttpURLConnection) links_url_connection;
562: String content_type = links_url_connection.getContentType();
563:
564: attributes.addAttribute("", CONTENT_ATTR_NAME,
565: CONTENT_ATTR_NAME, "CDATA", content_type);
566:
567: attributes.addAttribute("", MESSAGE_ATTR_NAME,
568: MESSAGE_ATTR_NAME, "CDATA", h.getResponseMessage());
569:
570: attributes.addAttribute("", STATUS_ATTR_NAME,
571: STATUS_ATTR_NAME, "CDATA", String.valueOf(h
572: .getResponseCode()));
573: } catch (IOException ioe) {
574: attributes.addAttribute("", MESSAGE_ATTR_NAME,
575: MESSAGE_ATTR_NAME, "CDATA", ioe.getMessage());
576: } finally {
577: if (h != null) {
578: h.disconnect();
579: }
580: }
581:
582: // don't try to get links of a url which is excluded from crawling
583: // try to get links of a url which is included for crawling
584: if (!isExcludedURL(url.toString())
585: && isIncludedURL(url.toString())) {
586: // add prefix and query to get data from the linkserializer.
587: result = url.toExternalForm()
588: + ((url.toExternalForm().indexOf("?") == -1) ? "?"
589: : "&") + linkViewQuery;
590: }
591:
592: super .contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX
593: + ':' + LINK_NODE_NAME, attributes);
594: super .contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX
595: + ':' + LINK_NODE_NAME);
596:
597: return result;
598: }
599:
600: /**
601: * check if URL is a candidate for indexing
602: *
603: * @param url Description of Parameter
604: * @return The excludedURL value
605: * @since
606: */
607: private boolean isExcludedURL(String url) {
608: // by default include URL for crawling
609: if (excludeCrawlingURL == null) {
610: if (getLogger().isDebugEnabled()) {
611: getLogger().debug("exclude no URL " + url);
612: }
613: return false;
614: }
615:
616: final String s = url;
617: Iterator i = excludeCrawlingURL.iterator();
618: while (i.hasNext()) {
619: RE pattern = (RE) i.next();
620: if (pattern.match(s)) {
621: if (getLogger().isDebugEnabled()) {
622: getLogger().debug("exclude URL " + url);
623: }
624: return true;
625: }
626: }
627: if (getLogger().isDebugEnabled()) {
628: getLogger().debug("exclude not URL " + url);
629: }
630: return false;
631: }
632:
633: /**
634: * check if URL is a candidate for indexing
635: *
636: * @param url Description of Parameter
637: * @return The includedURL value
638: * @since
639: */
640: private boolean isIncludedURL(String url) {
641: // by default include URL for crawling
642: if (includeCrawlingURL == null) {
643: if (getLogger().isDebugEnabled()) {
644: getLogger().debug("include all URL " + url);
645: }
646: return true;
647: }
648:
649: final String s = url;
650: Iterator i = includeCrawlingURL.iterator();
651: while (i.hasNext()) {
652: RE pattern = (RE) i.next();
653: if (pattern.match(s)) {
654: if (getLogger().isDebugEnabled()) {
655: getLogger().debug("include URL " + url);
656: }
657: return true;
658: }
659: }
660: if (getLogger().isDebugEnabled()) {
661: getLogger().debug("include not URL " + url);
662: }
663: return false;
664: }
665:
666: public void recycle() {
667: super.recycle();
668:
669: this.attributes = null;
670: }
671: }
|